Files
scylladb/alternator/executor.cc
Nadav Har'El 751da00692 alternator: split alternator/executor.cc
Already six years ago, in #5783, we noticed that alternator/executor.cc
has grown too large. The previous patches added hundreds of more lines
to it to implement vector search, and it reached a whopping 7,000 lines
of code. This is too much.

This patch splits from executor.cc two major chunks:

1. The implementation of **read** requests - GetItem, BatchGetItem,
   Query (base table, GSI/LSI, and vector-search), and Scan - was
   moved to a new source file alternator/executor_read.cc.
   The new file has 2,000 lines.

2. Moved 250 lines of template functions dealing with attribute paths
   and maps of them to a new header file, attribute_path.hh.
   These utilities are used for many different operations - various
   read operations use them for ProjectionExpression, and UpdateItem
   uses them for modifications to nested attributes, so we need the
   new header file from both executor.cc and executor_read.cc

The remaining executor.cc is still pretty big, 5,000 lines, and
contains write operations (PutItem, UpdateItem, DeleteItem,
BatchWriteItem) as well as various table and other operations, and
also many utility functions used by many types of operations, so
we can later continue this refactoring effort.

Refs #5783

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2026-04-16 14:30:10 +03:00

5111 lines
270 KiB
C++

/*
* Copyright 2019-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
*/
#include <fmt/ranges.h>
#include <seastar/core/on_internal_error.hh>
#include "alternator/executor.hh"
#include "alternator/consumed_capacity.hh"
#include "auth/permission.hh"
#include "auth/resource.hh"
#include "cdc/log.hh"
#include "cdc/cdc_options.hh"
#include "auth/service.hh"
#include "cql3/cql3_type.hh"
#include "db/config.hh"
#include "db/view/view_build_status.hh"
#include "locator/tablets.hh"
#include "mutation/tombstone.hh"
#include "locator/abstract_replication_strategy.hh"
#include "utils/log.hh"
#include "schema/schema_builder.hh"
#include "exceptions/exceptions.hh"
#include "service/client_state.hh"
#include "mutation/timestamp.hh"
#include "types/map.hh"
#include "schema/schema.hh"
#include "query/query-request.hh"
#include "query/query-result-reader.hh"
#include "cql3/selection/selection.hh"
#include "cql3/result_set.hh"
#include "bytes.hh"
#include "service/pager/query_pagers.hh"
#include <functional>
#include "error.hh"
#include "serialization.hh"
#include "expressions.hh"
#include "conditions.hh"
#include <optional>
#include "utils/assert.hh"
#include "utils/overloaded_functor.hh"
#include "mutation/collection_mutation.hh"
#include "schema/schema.hh"
#include "db/tags/extension.hh"
#include "db/tags/utils.hh"
#include "replica/database.hh"
#include "alternator/rmw_operation.hh"
#include <seastar/core/coroutine.hh>
#include <seastar/core/sleep.hh>
#include <seastar/core/loop.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <boost/range/algorithm/find_end.hpp>
#include <unordered_set>
#include "service/storage_proxy.hh"
#include "gms/feature_service.hh"
#include "gms/gossiper.hh"
#include "utils/error_injection.hh"
#include "db/schema_tables.hh"
#include "utils/rjson.hh"
#include "alternator/extract_from_attrs.hh"
#include "types/types.hh"
#include "db/system_keyspace.hh"
#include "cql3/statements/ks_prop_defs.hh"
#include "cql3/statements/index_target.hh"
#include "index/secondary_index.hh"
#include "alternator/ttl_tag.hh"
#include "vector_search/vector_store_client.hh"
using namespace std::chrono_literals;
namespace std {
template <> struct hash<std::pair<sstring, sstring>> {
size_t operator () (const std::pair<sstring, sstring>& p) const {
return std::hash<sstring>()(p.first) * 1009 + std::hash<sstring>()(p.second) * 3;
}
};
}
namespace alternator {
logging::logger elogger("alternator-executor");
// Alternator-specific table properties stored as hidden table tags:
//
// Alternator doesn't keep its own records of which Alternator tables exist
// or how each one was configured. Instead, an Alternator table is created
// as a CQL table - and that CQL table stores its own name, schema, views,
// and so on. However, there are some Alternator-specific properties of a
// table which are not part of a CQL schema but which we need to remember.
// We store those extra properties as hidden "tags" on the CQL table, using
// the schema's "tags extension". This extension provides a map<string,string>
// for the table that is stored persistently on disk, but also readable
// quickly from memory.
// Alternator also uses the tags extension to store user-defined tags on
// tables (the TagResource, UntagResource and ListTagsOfResource requests).
// So the internal tags are kept hidden from the user by using the prefix
// "system:" in their name (see tag_key_is_internal()).
// The following is the list of these Alternator-specific hidden tags that
// Alternator adds to tables:
//
// Tags storing the "ReadCapacityUnits" and "WriteCapacityUnits"
// configured for a table with BillingMode=PROVISIONED.
const sstring RCU_TAG_KEY("system:provisioned_rcu");
const sstring WCU_TAG_KEY("system:provisioned_wcu");
// Tag storing the table's original creation time, in milliseconds since the
// Unix epoch. All tables get this tag when they are created, but it may be
// missing in old tables created before this tag was introduced.
const sstring TABLE_CREATION_TIME_TAG_KEY("system:table_creation_time");
// If this tag is present, it stores the name of the attribute that was
// configured by UpdateTimeToLive to be the expiration-time attribute for
// this table.
extern const sstring TTL_TAG_KEY("system:ttl_attribute");
// This will be set to 1 in a case, where user DID NOT specify a range key.
// The way GSI / LSI is implemented by Alternator assumes user specified keys will come first
// in materialized view's key list. Then, if needed missing keys are added (current implementation
// of materialized views requires that all base hash / range keys were added to the view as well).
// Alternator allows only a single range key attribute to be specified by the user. So if
// the SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY is set the user didn't specify any key and
// base table's keys were added as range keys. In all other cases either the first key is the user specified key,
// following ones are base table's keys added as needed or range key list will be empty.
static const sstring SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY("system:spurious_range_key_added_to_gsi_and_user_didnt_specify_range_key");
// The following tags also have the "system:" prefix but are NOT used
// by Alternator to store table properties - only the user ever writes to
// them, as a way to configure the table. As such, these tags are writable
// (and readable) by the user, and not hidden by tag_key_is_internal().
// The reason why both hidden (internal) and user-configurable tags share the
// same "system:" prefix is historic.
// Setting the tag with a numeric value will enable a specific initial number
// of tablets (setting the value to 0 means enabling tablets with
// an automatic selection of the best number of tablets).
// Setting this tag to any non-numeric value (e.g., an empty string or the
// word "none") will ask to disable tablets.
static constexpr auto INITIAL_TABLETS_TAG_KEY = "system:initial_tablets";
enum class table_status {
active = 0,
creating,
updating,
deleting
};
static std::string_view table_status_to_sstring(table_status tbl_status) {
switch (tbl_status) {
case table_status::active:
return "ACTIVE";
case table_status::creating:
return "CREATING";
case table_status::updating:
return "UPDATING";
case table_status::deleting:
return "DELETING";
}
return "UNKNOWN";
}
void executor::maybe_audit(
std::unique_ptr<audit::audit_info_alternator>& audit_info,
audit::statement_category category,
std::string_view ks_name,
std::string_view table_name,
std::string_view operation_name,
const rjson::value& request,
std::optional<db::consistency_level> cl)
{
if (_audit.local_is_initialized() && _audit.local().will_log(category, ks_name, table_name)) {
audit_info = std::make_unique<audit::audit_info_alternator>(
category, sstring(ks_name), sstring(table_name), cl);
// FIXME: rjson::print(request) serializes the entire JSON request body, which
// can be up to 16 MB for BatchWriteItem.
audit_info->set_query_string(sstring(rjson::print(request)), sstring(operation_name));
}
}
static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type,
const std::map<sstring, sstring>& tags_map, const gms::feature_service& feat, const db::tablets_mode_t::mode tablets_mode);
map_type attrs_type() {
static thread_local auto t = map_type_impl::get_instance(utf8_type, bytes_type, true);
return t;
}
static const column_definition& attrs_column(const schema& schema) {
const column_definition* cdef = schema.get_column_definition(bytes(executor::ATTRS_COLUMN_NAME));
throwing_assert(cdef);
return *cdef;
}
lw_shared_ptr<stats> get_stats_from_schema(service::storage_proxy& sp, const schema& schema) {
try {
replica::table& table = sp.local_db().find_column_family(schema.id());
if (!table.get_stats().alternator_stats) {
table.get_stats().alternator_stats = seastar::make_shared<table_stats>(schema.ks_name(), schema.cf_name());
}
return table.get_stats().alternator_stats->_stats;
} catch (std::runtime_error&) {
// If we're here it means that a table we are currently working on was deleted before the
// operation completed, returning a temporary object is fine, if the table get deleted so will its metrics
return make_lw_shared<stats>();
}
}
executor::body_writer make_streamed(rjson::value&& value) {
return [value = std::move(value)](output_stream<char>&& _out) mutable -> future<> {
auto out = std::move(_out);
std::exception_ptr ex;
try {
co_await rjson::print(value, out);
} catch (...) {
ex = std::current_exception();
}
co_await out.close();
co_await rjson::destroy_gently(std::move(value));
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
};
}
// This function throws api_error::validation if input value is not an object.
static void validate_is_object(const rjson::value& value, const char* caller) {
if (!value.IsObject()) {
throw api_error::validation(fmt::format("{} must be an object", caller));
}
}
// This function assumes the given value is an object and returns requested member value.
// If it is not possible, an api_error::validation is thrown.
static const rjson::value& get_member(const rjson::value& obj, const char* member_name, const char* caller) {
validate_is_object(obj, caller);
const rjson::value* ret = rjson::find(obj, member_name);
if (!ret) {
throw api_error::validation(fmt::format("{} is missing a mandatory member {}", caller, member_name));
}
return *ret;
}
// This function assumes the given value is an object with a single member, and returns this member.
// In case the requirements are not met, an api_error::validation is thrown.
static const rjson::value::Member& get_single_member(const rjson::value& v, const char* caller) {
if (!v.IsObject() || v.MemberCount() != 1) {
throw api_error::validation(format("{}: expected an object with a single member.", caller));
}
return *(v.MemberBegin());
}
class executor::describe_table_info_manager : public service::migration_listener::empty_listener {
executor &_executor;
struct table_info {
utils::simple_value_with_expiry<std::uint64_t> size_in_bytes;
};
std::unordered_map<std::pair<sstring, sstring>, table_info> info_for_tables;
bool active = false;
public:
describe_table_info_manager(executor& executor) : _executor(executor) {
_executor._proxy.data_dictionary().real_database_ptr()->get_notifier().register_listener(this);
active = true;
}
describe_table_info_manager(const describe_table_info_manager &) = delete;
describe_table_info_manager(describe_table_info_manager&&) = delete;
~describe_table_info_manager() {
if (active) {
on_fatal_internal_error(elogger, "describe_table_info_manager was not stopped before destruction");
}
}
describe_table_info_manager &operator = (const describe_table_info_manager &) = delete;
describe_table_info_manager &operator = (describe_table_info_manager&&) = delete;
static std::chrono::high_resolution_clock::time_point now() {
return std::chrono::high_resolution_clock::now();
}
std::optional<std::uint64_t> get_cached_size_in_bytes(const sstring &ks_name, const sstring &cf_name) const {
auto it = info_for_tables.find({ks_name, cf_name});
if (it != info_for_tables.end()) {
return it->second.size_in_bytes.get();
}
return std::nullopt;
}
void cache_size_in_bytes(sstring ks_name, sstring cf_name, std::uint64_t size_in_bytes, std::chrono::high_resolution_clock::time_point expiry) {
info_for_tables[{std::move(ks_name), std::move(cf_name)}].size_in_bytes.set_if_longer_expiry(size_in_bytes, expiry);
}
future<> stop() {
co_await _executor._proxy.data_dictionary().real_database_ptr()->get_notifier().unregister_listener(this);
active = false;
co_return;
}
void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
if (!ks_name.starts_with(executor::KEYSPACE_NAME_PREFIX)) return;
info_for_tables.erase({ks_name, cf_name});
}
};
executor::executor(gms::gossiper& gossiper,
service::storage_proxy& proxy,
service::storage_service& ss,
service::migration_manager& mm,
db::system_distributed_keyspace& sdks,
cdc::metadata& cdc_metadata,
vector_search::vector_store_client& vsc,
smp_service_group ssg,
utils::updateable_value<uint32_t> default_timeout_in_ms)
: _gossiper(gossiper),
_ss(ss),
_proxy(proxy),
_mm(mm),
_sdks(sdks),
_cdc_metadata(cdc_metadata),
_vsc(vsc),
_enforce_authorization(_proxy.data_dictionary().get_config().alternator_enforce_authorization),
_warn_authorization(_proxy.data_dictionary().get_config().alternator_warn_authorization),
_audit(audit::audit::audit_instance()),
_ssg(ssg),
_parsed_expression_cache(std::make_unique<parsed::expression_cache>(
parsed::expression_cache::config{_proxy.data_dictionary().get_config().alternator_max_expression_cache_entries_per_shard},
_stats))
{
s_default_timeout_in_ms = std::move(default_timeout_in_ms);
_describe_table_info_manager = std::make_unique<describe_table_info_manager>(*this);
register_metrics(_metrics, _stats);
}
executor::~executor() = default;
static void set_table_creation_time(std::map<sstring, sstring>& tags_map, db_clock::time_point creation_time) {
auto tm = std::chrono::duration_cast<std::chrono::milliseconds>(creation_time.time_since_epoch()).count();
tags_map[TABLE_CREATION_TIME_TAG_KEY] = std::to_string(tm);
}
double get_table_creation_time(const schema &schema) {
auto time = db::find_tag(schema, TABLE_CREATION_TIME_TAG_KEY);
if (time) {
try {
auto val = std::stoll(*time);
if (val > 0) {
return val / 1000.0;
}
}
catch(...) {}
}
return 0.0;
}
void executor::supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp) {
auto creation_time = get_table_creation_time(schema);
rjson::add(descr, "CreationDateTime", rjson::value(creation_time));
rjson::add(descr, "TableStatus", "ACTIVE");
rjson::add(descr, "TableId", rjson::from_string(schema.id().to_sstring()));
executor::supplement_table_stream_info(descr, schema, sp);
}
// We would have liked to support table names up to 255 bytes, like DynamoDB.
// But Scylla creates a directory whose name is the table's name plus 33
// bytes (dash and UUID), and since directory names are limited to 255 bytes,
// we need to limit table names to 222 bytes, instead of 255.
// See https://github.com/scylladb/scylla/issues/4480
// We actually have two limits here,
// * max_table_name_length is the limit that Alternator will impose on names
// of new Alternator tables.
// * max_auxiliary_table_name_length is the potentially higher absolute limit
// that Scylla imposes on the names of auxiliary tables that Alternator
// wants to create internally - i.e. materialized views or CDC log tables.
// The second limit might mean that it is not possible to add a GSI to an
// existing table, because the name of the new auxiliary table may go over
// the limit. The second limit is also one of the reasons why the first limit
// is set lower than 222 - to have room to enable streams which add the extra
// suffix "_scylla_cdc_log" to the table name.
static constexpr int max_table_name_length = 192;
static constexpr int max_auxiliary_table_name_length = 222;
static bool valid_table_name_chars(std::string_view name) {
for (auto c : name) {
if ((c < 'a' || c > 'z') &&
(c < 'A' || c > 'Z') &&
(c < '0' || c > '9') &&
c != '_' &&
c != '-' &&
c != '.') {
return false;
}
}
return true;
}
// validate_table_name() validates the TableName parameter in a request - it
// should only be called in CreateTable or when a request looking for an
// existing table failed to find it. validate_table_name() throws the
// appropriate api_error if this validation fails.
// The DynamoDB developer guide, https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.NamingRulesDataTypes.html#HowItWorks.NamingRules
// specifies that table "names must be between 3 and 255 characters long and
// can contain only the following characters: a-z, A-Z, 0-9, _ (underscore),
// - (dash), . (dot)". However, Alternator only allows max_table_name_length
// characters (see above) - not 255.
static void validate_table_name(std::string_view name, const char* source="TableName") {
if (name.length() < 3 || name.length() > max_table_name_length) {
throw api_error::validation(
format("{} must be at least 3 characters long and at most {} characters long", source, max_table_name_length));
}
if (!valid_table_name_chars(name)) {
throw api_error::validation(
format("{} must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", source));
}
}
// Validate that a CDC log table could be created for the base table with a
// given table_name, and if not, throw a user-visible api_error::validation.
// It is not possible to create a CDC log table if the table name is so long
// that adding the 15-character suffix "_scylla_cdc_log" (cdc_log_suffix)
// makes it go over max_auxiliary_table_name_length.
// Note that if max_table_name_length is set to less than 207 (which is
// max_auxiliary_table_name_length-15), then this function will never
// fail. However, it's still important to call it in UpdateTable, in case
// we have pre-existing tables with names longer than this to avoid #24598.
static void validate_cdc_log_name_length(std::string_view table_name) {
if (cdc::log_name(table_name).length() > max_auxiliary_table_name_length) {
// CDC will add cdc_log_suffix ("_scylla_cdc_log") to the table name
// to create its log table, and this will exceed the maximum allowed
// length. To provide a more helpful error message, we assume that
// cdc::log_name() always adds a suffix of the same length.
int suffix_len = cdc::log_name(table_name).length() - table_name.length();
throw api_error::validation(fmt::format("Streams or vector search cannot be enabled on a table whose name is longer than {} characters: {}",
max_auxiliary_table_name_length - suffix_len, table_name));
}
}
// In DynamoDB index names are local to a table, while in Scylla, materialized
// view names are global (in a keyspace). So we need to compose a unique name
// for the view taking into account both the table's name and the index name.
// We concatenate the table and index name separated by a delim character
// (a character not allowed by DynamoDB in ordinary table names, default: ":").
// The downside of this approach is that it limits the sum of the lengths,
// instead of each component individually as DynamoDB does.
// The view_name() function assumes the table_name has already been validated
// but validates the legality of index_name and the combination of both.
std::string view_name(std::string_view table_name, std::string_view index_name, const std::string& delim, bool validate_len) {
if (index_name.length() < 3) {
throw api_error::validation("IndexName must be at least 3 characters long");
}
if (!valid_table_name_chars(index_name)) {
throw api_error::validation(
fmt::format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
}
std::string ret = std::string(table_name) + delim + std::string(index_name);
if (ret.length() > max_auxiliary_table_name_length && validate_len) {
throw api_error::validation(
fmt::format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
table_name, index_name, max_auxiliary_table_name_length - delim.size()));
}
return ret;
}
std::string gsi_name(std::string_view table_name, std::string_view index_name, bool validate_len) {
return view_name(table_name, index_name, ":", validate_len);
}
std::string lsi_name(std::string_view table_name, std::string_view index_name, bool validate_len) {
return view_name(table_name, index_name, "!:", validate_len);
}
/** Extract table name from a request.
* Most requests expect the table's name to be listed in a "TableName" field.
* This convenience function returns the name or api_error in case the
* table name is missing or not a string.
*/
static std::optional<std::string> find_table_name(const rjson::value& request) {
const rjson::value* table_name_value = rjson::find(request, "TableName");
if (!table_name_value) {
return std::nullopt;
}
if (!table_name_value->IsString()) {
throw api_error::validation("Non-string TableName field in request");
}
std::string table_name = rjson::to_string(*table_name_value);
return table_name;
}
std::string get_table_name(const rjson::value& request) {
auto name = find_table_name(request);
if (!name) {
throw api_error::validation("Missing TableName field in request");
}
return *name;
}
/** Extract table schema from a request.
* Many requests expect the table's name to be listed in a "TableName" field
* and need to look it up as an existing table. This convenience function
* does this, with the appropriate validation and api_error in case the table
* name is missing, invalid or the table doesn't exist. If everything is
* successful, it returns the table's schema.
*/
schema_ptr executor::find_table(service::storage_proxy& proxy, const rjson::value& request) {
auto table_name = find_table_name(request);
if (!table_name) {
return nullptr;
}
return find_table(proxy, *table_name);
}
schema_ptr executor::find_table(service::storage_proxy& proxy, std::string_view table_name) {
try {
return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + sstring(table_name), table_name);
} catch(data_dictionary::no_such_column_family&) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
throw api_error::resource_not_found(
fmt::format("Requested resource not found: Table: {} not found", table_name));
}
}
schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request) {
auto schema = executor::find_table(proxy, request);
if (!schema) {
// if we get here then the name was missing, since syntax or missing actual CF
// checks throw. Slow path, but just call get_table_name to generate exception.
get_table_name(request);
}
return schema;
}
// try_get_internal_table() handles the special case that the given table_name
// begins with INTERNAL_TABLE_PREFIX (".scylla.alternator."). In that case,
// this function assumes that the rest of the name refers to an internal
// Scylla table (e.g., system table) and returns the schema of that table -
// or an exception if it doesn't exist. Otherwise, if table_name does not
// start with INTERNAL_TABLE_PREFIX, this function returns an empty schema_ptr
// and the caller should look for a normal Alternator table with that name.
schema_ptr try_get_internal_table(data_dictionary::database db, std::string_view table_name) {
size_t it = table_name.find(executor::INTERNAL_TABLE_PREFIX);
if (it != 0) {
return schema_ptr{};
}
table_name.remove_prefix(executor::INTERNAL_TABLE_PREFIX.size());
size_t delim = table_name.find_first_of('.');
if (delim == std::string_view::npos) {
return schema_ptr{};
}
std::string_view ks_name = table_name.substr(0, delim);
table_name.remove_prefix(ks_name.size() + 1);
// Only internal keyspaces can be accessed to avoid leakage
auto ks = db.try_find_keyspace(ks_name);
if (!ks || !ks->is_internal()) {
return schema_ptr{};
}
try {
return db.find_schema(ks_name, table_name);
} catch (data_dictionary::no_such_column_family&) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
throw api_error::resource_not_found(
fmt::format("Requested resource not found: Internal table: {}.{} not found", ks_name, table_name));
}
}
// get_table_for_write() is similar to get_table(), but additionally, if the
// configuration allows this, may also allow writing to system table with
// prefix INTERNAL_TABLE_PREFIX. See also get_table_or_view() in
// executor_read.cc which allows *reading* internal tables by the Query
// operation.
static schema_ptr get_table_for_write(service::storage_proxy& proxy, const rjson::value& request) {
std::string table_name = get_table_name(request);
if (schema_ptr s = try_get_internal_table(proxy.data_dictionary(), table_name)) {
if (!proxy.data_dictionary().get_config().alternator_allow_system_table_write()) {
throw api_error::resource_not_found(fmt::format(
"Table {} is an internal table, and writing to it is forbidden"
" by the alternator_allow_system_table_write configuration",
table_name));
}
return s;
}
return executor::find_table(proxy, table_name);
}
// Convenience function for getting the value of a string attribute, or a
// default value if it is missing. If the attribute exists, but is not a
// string, a descriptive api_error is thrown.
static std::string get_string_attribute(const rjson::value& value, std::string_view attribute_name, const char* default_return) {
const rjson::value* attribute_value = rjson::find(value, attribute_name);
if (!attribute_value)
return default_return;
if (!attribute_value->IsString()) {
throw api_error::validation(fmt::format("Expected string value for attribute {}, got: {}",
attribute_name, value));
}
return rjson::to_string(*attribute_value);
}
// Convenience function for getting the value of a boolean attribute, or a
// default value if it is missing. If the attribute exists, but is not a
// bool, a descriptive api_error is thrown.
bool get_bool_attribute(const rjson::value& value, std::string_view attribute_name, bool default_return) {
const rjson::value* attribute_value = rjson::find(value, attribute_name);
if (!attribute_value) {
return default_return;
}
if (!attribute_value->IsBool()) {
throw api_error::validation(fmt::format("Expected boolean value for attribute {}, got: {}",
attribute_name, value));
}
return attribute_value->GetBool();
}
// Convenience function for getting the value of an integer attribute, or
// an empty optional if it is missing. If the attribute exists, but is not
// an integer, a descriptive api_error is thrown.
std::optional<int> get_int_attribute(const rjson::value& value, std::string_view attribute_name) {
const rjson::value* attribute_value = rjson::find(value, attribute_name);
if (!attribute_value)
return {};
if (!attribute_value->IsInt()) {
throw api_error::validation(fmt::format("Expected integer value for attribute {}, got: {}",
attribute_name, value));
}
return attribute_value->GetInt();
}
// Sets a KeySchema object inside the given JSON parent describing the key
// attributes of the given schema as being either HASH or RANGE keys.
// Additionally, adds to a given map mappings between the key attribute
// names and their type (as a DynamoDB type string).
void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>* attribute_types, const std::map<sstring, sstring> *tags) {
rjson::value key_schema = rjson::empty_array();
const bool ignore_range_keys_as_spurious = tags != nullptr && tags->contains(SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY);
for (const column_definition& cdef : schema.partition_key_columns()) {
rjson::value key = rjson::empty_object();
rjson::add(key, "AttributeName", rjson::from_string(cdef.name_as_text()));
rjson::add(key, "KeyType", "HASH");
rjson::push_back(key_schema, std::move(key));
if (attribute_types) {
(*attribute_types)[cdef.name_as_text()] = type_to_string(cdef.type);
}
}
if (!ignore_range_keys_as_spurious) {
// NOTE: user requested key (there can be at most one) will always come first.
// There might be more keys following it, which were added, but those were
// not requested by the user, so we ignore them.
for (const column_definition& cdef : schema.clustering_key_columns()) {
rjson::value key = rjson::empty_object();
rjson::add(key, "AttributeName", rjson::from_string(cdef.name_as_text()));
rjson::add(key, "KeyType", "RANGE");
rjson::push_back(key_schema, std::move(key));
if (attribute_types) {
(*attribute_types)[cdef.name_as_text()] = type_to_string(cdef.type);
}
break;
}
}
rjson::add(parent, "KeySchema", std::move(key_schema));
}
void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>& attribute_types, const std::map<sstring, sstring> *tags) {
describe_key_schema(parent, schema, &attribute_types, tags);
}
static rjson::value generate_arn_for_table(const schema& schema) {
return rjson::from_string(format("arn:scylla:alternator:{}:scylla:table/{}", schema.ks_name(), schema.cf_name()));
}
static rjson::value generate_arn_for_index(const schema& schema, std::string_view index_name) {
return rjson::from_string(fmt::format(
"arn:scylla:alternator:{}:scylla:table/{}/index/{}",
schema.ks_name(), schema.cf_name(), index_name));
}
// The following function checks if a given view has finished building.
// We need this for describe_table() to know if a view is still backfilling,
// or active.
//
// Currently we don't have in view_ptr the knowledge whether a view finished
// building long ago - so checking this involves a somewhat inefficient, but
// still node-local, process:
// We need a table that can accurately tell that all nodes have finished
// building this view. system.built_views is not good enough because it only
// knows the view building status in the current node. In recent versions,
// after PR #19745, we have a local table system.view_build_status_v2 with
// global information, replacing the old system_distributed.view_build_status.
// In theory, there can be a period during upgrading an old cluster when this
// table is not yet available. However, since the IndexStatus is a new feature
// too, it is acceptable that it doesn't yet work in the middle of the update.
static future<bool> is_view_built(
view_ptr view,
service::storage_proxy& proxy,
service::client_state& client_state,
tracing::trace_state_ptr trace_state,
service_permit permit) {
auto schema = proxy.data_dictionary().find_table(
"system", db::system_keyspace::VIEW_BUILD_STATUS_V2).schema();
// The table system.view_build_status_v2 has "keyspace_name" and
// "view_name" as the partition key, and each clustering row has
// "host_id" as clustering key and a string "status". We need to
// read a single partition:
partition_key pk = partition_key::from_exploded(*schema,
{utf8_type->decompose(view->ks_name()),
utf8_type->decompose(view->cf_name())});
dht::partition_range_vector partition_ranges{
dht::partition_range(dht::decorate_key(*schema, pk))};
auto selection = cql3::selection::selection::wildcard(schema); // only for get_query_options()!
auto partition_slice = query::partition_slice(
{query::clustering_range::make_open_ended_both_sides()},
{}, // static columns
{schema->get_column_definition("status")->id}, // regular columns
selection->get_query_options());
auto command = ::make_lw_shared<query::read_command>(
schema->id(), schema->version(), partition_slice,
proxy.get_max_result_size(partition_slice),
query::tombstone_limit(proxy.get_tombstone_limit()));
service::storage_proxy::coordinator_query_result qr =
co_await proxy.query(
schema, std::move(command), std::move(partition_ranges),
db::consistency_level::LOCAL_ONE,
service::storage_proxy::coordinator_query_options(
executor::default_timeout(), std::move(permit), client_state, trace_state));
query::result_set rs = query::result_set::from_raw_result(
schema, partition_slice, *qr.query_result);
std::unordered_map<locator::host_id, sstring> statuses;
for (auto&& r : rs.rows()) {
auto host_id = r.get<utils::UUID>("host_id");
auto status = r.get<sstring>("status");
if (host_id && status) {
statuses.emplace(locator::host_id(*host_id), *status);
}
}
// A view is considered "built" if all nodes reported SUCCESS in having
// built this view. Note that we need this "SUCCESS" for all nodes in the
// cluster - even those that are temporarily down (their success is known
// by this node, even if they are down). Conversely, we don't care what is
// the recorded status for any node which is no longer in the cluster - it
// is possible we forgot to erase the status of nodes that left the
// cluster, but here we just ignore them and look at the nodes actually
// in the topology.
bool all_built = true;
auto token_metadata = proxy.get_token_metadata_ptr();
token_metadata->get_topology().for_each_node(
[&] (const locator::node& node) {
// Note: we could skip nodes in DCs which have no replication of
// this view. However, in practice even those nodes would run
// the view building (and just see empty content) so we don't
// need to bother with this skipping.
auto it = statuses.find(node.host_id());
if (it == statuses.end() || it->second != "SUCCESS") {
all_built = false;
}
});
co_return all_built;
}
future<> executor::cache_newly_calculated_size_on_all_shards(schema_ptr schema, std::uint64_t size_in_bytes, std::chrono::nanoseconds ttl) {
auto expiry = describe_table_info_manager::now() + ttl;
return container().invoke_on_all(
[schema, size_in_bytes, expiry] (executor& exec) {
exec._describe_table_info_manager->cache_size_in_bytes(schema->ks_name(), schema->cf_name(), size_in_bytes, expiry);
});
}
future<> executor::fill_table_size(rjson::value &table_description, schema_ptr schema, bool deleting) {
auto cached_size = _describe_table_info_manager->get_cached_size_in_bytes(schema->ks_name(), schema->cf_name());
std::uint64_t total_size = 0;
if (cached_size) {
total_size = *cached_size;
} else {
// there's no point in trying to estimate value of table that is being deleted, as other nodes more often than not might
// move forward with deletion faster than we calculate the size
if (!deleting) {
total_size = co_await _ss.estimate_total_sstable_volume(schema->id(), service::storage_service::ignore_errors::yes);
const auto expiry = std::chrono::seconds{ _proxy.data_dictionary().get_config().alternator_describe_table_info_cache_validity_in_seconds() };
// Note: we don't care when the notification of other shards will finish, as long as it will be done
// it's possible to get into race condition (next DescribeTable comes to other shard, that new shard doesn't have
// the size yet, so it will calculate it again) - this is not a problem, because it will call cache_newly_calculated_size_on_all_shards
// with expiry, which is extremely unlikely to be exactly the same as the previous one, all shards will keep the size coming with expiry that is further into the future.
// In case of the same expiry, some shards will have different size, which means DescribeTable will return different values depending on the shard
// which is also fine, as the specification doesn't give precision guarantees of any kind.
co_await cache_newly_calculated_size_on_all_shards(schema, total_size, expiry);
}
}
rjson::add(table_description, "TableSizeBytes", total_size);
}
future<rjson::value> executor::fill_table_description(schema_ptr schema, table_status tbl_status, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit)
{
rjson::value table_description = rjson::empty_object();
auto tags_ptr = db::get_tags_of_table(schema);
rjson::add(table_description, "TableName", rjson::from_string(schema->cf_name()));
co_await fill_table_size(table_description, schema, tbl_status == table_status::deleting);
auto creation_timestamp = get_table_creation_time(*schema);
// FIXME: In DynamoDB the CreateTable implementation is asynchronous, and
// the table may be in "Creating" state until creating is finished.
// We don't currently do this in Alternator - instead CreateTable waits
// until the table is really available. So/ DescribeTable returns either
// ACTIVE or doesn't exist at all (and DescribeTable returns an error).
// The states CREATING and UPDATING are not currently returned.
rjson::add(table_description, "TableStatus", rjson::from_string(table_status_to_sstring(tbl_status)));
rjson::add(table_description, "TableArn", generate_arn_for_table(*schema));
rjson::add(table_description, "TableId", rjson::from_string(schema->id().to_sstring()));
rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_timestamp));
// In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
int rcu = 0;
int wcu = 0;
bool is_pay_per_request = true;
if (tags_ptr) {
auto rcu_tag = tags_ptr->find(RCU_TAG_KEY);
auto wcu_tag = tags_ptr->find(WCU_TAG_KEY);
if (rcu_tag != tags_ptr->end() && wcu_tag != tags_ptr->end()) {
try {
rcu = std::stoi(rcu_tag->second);
wcu = std::stoi(wcu_tag->second);
is_pay_per_request = false;
} catch (...) {
rcu = 0;
wcu = 0;
}
}
}
if (is_pay_per_request) {
rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
} else {
rjson::add(table_description["BillingModeSummary"], "BillingMode", "PROVISIONED");
}
rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", rcu);
rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", wcu);
rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);
data_dictionary::table t = _proxy.data_dictionary().find_column_family(schema);
if (tbl_status != table_status::deleting) {
rjson::add(table_description, "CreationDateTime", rjson::value(creation_timestamp));
std::unordered_map<std::string,std::string> key_attribute_types;
// Add base table's KeySchema and collect types for AttributeDefinitions:
executor::describe_key_schema(table_description, *schema, key_attribute_types, tags_ptr);
if (!t.views().empty()) {
rjson::value gsi_array = rjson::empty_array();
rjson::value lsi_array = rjson::empty_array();
for (const view_ptr& vptr : t.views()) {
rjson::value view_entry = rjson::empty_object();
const sstring& cf_name = vptr->cf_name();
size_t delim_it = cf_name.find(':');
if (delim_it == sstring::npos) {
elogger.error("Invalid internal index table name: {}", cf_name);
continue;
}
sstring index_name = cf_name.substr(delim_it + 1);
rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
// Add index's KeySchema and collect types for AttributeDefinitions:
executor::describe_key_schema(view_entry, *vptr, key_attribute_types, db::get_tags_of_table(vptr));
// Add projection type
rjson::value projection = rjson::empty_object();
rjson::add(projection, "ProjectionType", "ALL");
// FIXME: we have to get ProjectionType from the schema when it is added
rjson::add(view_entry, "Projection", std::move(projection));
// Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
bool is_lsi = (delim_it > 1 && cf_name[delim_it-1] == '!');
// Add IndexStatus and Backfilling flags, but only for GSIs -
// LSIs can only be created with the table itself and do not
// have a status. Alternator schema operations are synchronous
// so only two combinations of these flags are possible: ACTIVE
// (for a built view) or CREATING+Backfilling (if view building
// is in progress).
if (!is_lsi) {
if (co_await is_view_built(vptr, _proxy, client_state, trace_state, permit)) {
rjson::add(view_entry, "IndexStatus", "ACTIVE");
} else {
rjson::add(view_entry, "IndexStatus", "CREATING");
rjson::add(view_entry, "Backfilling", rjson::value(true));
}
}
rjson::value& index_array = is_lsi ? lsi_array : gsi_array;
rjson::push_back(index_array, std::move(view_entry));
}
if (!lsi_array.Empty()) {
rjson::add(table_description, "LocalSecondaryIndexes", std::move(lsi_array));
}
if (!gsi_array.Empty()) {
rjson::add(table_description, "GlobalSecondaryIndexes", std::move(gsi_array));
}
}
// List vector indexes, if this table has any:
rjson::value vector_index_array = rjson::empty_array();
abort_on_expiry vector_index_status_aoe(executor::default_timeout());
for (const index_metadata& im : schema->indices()) {
const auto& opts = im.options();
auto class_it = opts.find(db::index::secondary_index::custom_class_option_name);
if (class_it == opts.end() || class_it->second != "vector_index") {
continue;
}
rjson::value entry = rjson::empty_object();
rjson::add(entry, "IndexName", rjson::from_string(im.name()));
rjson::value vector_attribute = rjson::empty_object();
auto target_it = opts.find(cql3::statements::index_target::target_option_name);
if (target_it != opts.end()) {
rjson::add(vector_attribute, "AttributeName", rjson::from_string(target_it->second));
}
auto dims_it = opts.find("dimensions");
if (dims_it != opts.end()) {
try {
rjson::add(vector_attribute, "Dimensions", std::stoi(dims_it->second));
} catch (const std::logic_error&) {
// This should never happen, because the dimensions option
// is validated on index creation
on_internal_error(elogger, fmt::format("Unexpected non-integer dimensions value '{}' for vector index '{}'", dims_it->second, im.name()));
}
}
rjson::add(entry, "VectorAttribute", std::move(vector_attribute));
// Always return a Projection. Currently only KEYS_ONLY is
// supported, so we always return that.
rjson::value projection = rjson::empty_object();
rjson::add(projection, "ProjectionType", "KEYS_ONLY");
rjson::add(entry, "Projection", std::move(projection));
// Report IndexStatus and Backfilling based on the vector store's
// reported state: SERVING -> ACTIVE, BOOTSTRAPPING -> CREATING+Backfilling,
// anything else (INITIALIZING, unreachable, etc.) -> CREATING.
auto vstatus = co_await _vsc.get_index_status(
schema->ks_name(), im.name(), vector_index_status_aoe.abort_source());
using index_status = vector_search::vector_store_client::index_status;
if (vstatus == index_status::serving) {
rjson::add(entry, "IndexStatus", "ACTIVE");
} else {
rjson::add(entry, "IndexStatus", "CREATING");
if (vstatus == index_status::backfilling) {
rjson::add(entry, "Backfilling", rjson::value(true));
}
}
rjson::push_back(vector_index_array, std::move(entry));
}
if (!vector_index_array.Empty()) {
rjson::add(table_description, "VectorIndexes", std::move(vector_index_array));
}
// Use map built by describe_key_schema() for base and indexes to produce
// AttributeDefinitions for all key columns:
rjson::value attribute_definitions = rjson::empty_array();
for (auto& type : key_attribute_types) {
rjson::value key = rjson::empty_object();
rjson::add(key, "AttributeName", rjson::from_string(type.first));
rjson::add(key, "AttributeType", rjson::from_string(type.second));
rjson::push_back(attribute_definitions, std::move(key));
}
rjson::add(table_description, "AttributeDefinitions", std::move(attribute_definitions));
}
executor::supplement_table_stream_info(table_description, *schema, _proxy);
co_return table_description;
}
bool is_alternator_keyspace(const sstring& ks_name) {
return ks_name.find(executor::KEYSPACE_NAME_PREFIX) == 0;
}
sstring executor::table_name(const schema& s) {
return s.cf_name();
}
future<executor::request_return_type> executor::describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.describe_table++;
elogger.trace("Describing table {}", request);
schema_ptr schema = get_table(_proxy, request);
maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "DescribeTable", request);
get_stats_from_schema(_proxy, *schema)->api_operations.describe_table++;
tracing::add_alternator_table_name(trace_state, schema->cf_name());
rjson::value table_description = co_await fill_table_description(schema, table_status::active, client_state, trace_state, permit);
rjson::value response = rjson::empty_object();
rjson::add(response, "Table", std::move(table_description));
elogger.trace("returning {}", response);
co_return rjson::print(std::move(response));
}
// This function increments the authorization_failures counter, and may also
// log a warn-level message and/or throw an access_denied exception, depending
// on what enforce_authorization and warn_authorization are set to.
// Note that if enforce_authorization is false, this function will return
// without throwing. So a caller that doesn't want to continue after an
// authorization_error must explicitly return after calling this function.
static void authorization_error(alternator::stats& stats, bool enforce_authorization, bool warn_authorization, std::string msg) {
stats.authorization_failures++;
if (enforce_authorization) {
if (warn_authorization) {
elogger.warn("alternator_warn_authorization=true: {}", msg);
}
throw api_error::access_denied(std::move(msg));
} else {
if (warn_authorization) {
elogger.warn("If you set alternator_enforce_authorization=true the following will be enforced: {}", msg);
}
}
}
// Check CQL's Role-Based Access Control (RBAC) permission_to_check (MODIFY,
// SELECT, DROP, etc.) on the given table. When permission is denied an
// appropriate user-readable api_error::access_denied is thrown.
future<> verify_permission(
bool enforce_authorization,
bool warn_authorization,
const service::client_state& client_state,
const schema_ptr& schema,
auth::permission permission_to_check,
alternator::stats& stats) {
if (!enforce_authorization && !warn_authorization) {
co_return;
}
// Unfortunately, the fix for issue #23218 did not modify the function
// that we use here - check_has_permissions(). So if we want to allow
// writes to internal tables (from try_get_internal_table()) only to a
// superuser, we need to explicitly check it here.
if (permission_to_check == auth::permission::MODIFY && is_internal_keyspace(schema->ks_name())) {
if (!client_state.user() ||
!client_state.user()->name ||
!co_await client_state.get_auth_service()->underlying_role_manager().is_superuser(*client_state.user()->name)) {
sstring username = "<anonymous>";
if (client_state.user() && client_state.user()->name) {
username = client_state.user()->name.value();
}
authorization_error(stats, enforce_authorization, warn_authorization, fmt::format(
"Write access denied on internal table {}.{} to role {} because it is not a superuser",
schema->ks_name(), schema->cf_name(), username));
co_return;
}
}
auto resource = auth::make_data_resource(schema->ks_name(), schema->cf_name());
if (!client_state.user() || !client_state.user()->name ||
!co_await client_state.check_has_permission(auth::command_desc(permission_to_check, resource))) {
sstring username = "<anonymous>";
if (client_state.user() && client_state.user()->name) {
username = client_state.user()->name.value();
}
// Using exceptions for errors makes this function faster in the
// success path (when the operation is allowed).
authorization_error(stats, enforce_authorization, warn_authorization, fmt::format(
"{} access on table {}.{} is denied to role {}, client address {}",
auth::permissions::to_string(permission_to_check),
schema->ks_name(), schema->cf_name(), username, client_state.get_client_address()));
}
}
// Similar to verify_permission() above, but just for CREATE operations.
// Those do not operate on any specific table, so require permissions on
// ALL KEYSPACES instead of any specific table.
static future<> verify_create_permission(bool enforce_authorization, bool warn_authorization, const service::client_state& client_state, alternator::stats& stats) {
if (!enforce_authorization && !warn_authorization) {
co_return;
}
auto resource = auth::resource(auth::resource_kind::data);
if (!co_await client_state.check_has_permission(auth::command_desc(auth::permission::CREATE, resource))) {
sstring username = "<anonymous>";
if (client_state.user() && client_state.user()->name) {
username = client_state.user()->name.value();
}
authorization_error(stats, enforce_authorization, warn_authorization, fmt::format(
"CREATE access on ALL KEYSPACES is denied to role {}", username));
}
}
future<executor::request_return_type> executor::delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.delete_table++;
elogger.trace("Deleting table {}", request);
std::string table_name = get_table_name(request);
std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
maybe_audit(audit_info, audit::statement_category::DDL, keyspace_name, table_name, "DeleteTable", request);
tracing::add_alternator_table_name(trace_state, table_name);
auto& p = _proxy.container();
schema_ptr schema = get_table(_proxy, request);
rjson::value table_description = co_await fill_table_description(schema, table_status::deleting, client_state, trace_state, permit);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::DROP, _stats);
co_await _mm.container().invoke_on(0, [&, cs = client_state.move_to_other_shard()] (service::migration_manager& mm) -> future<> {
size_t retries = mm.get_concurrent_ddl_retries();
for (;;) {
auto group0_guard = co_await mm.start_group0_operation();
std::optional<data_dictionary::table> tbl = p.local().data_dictionary().try_find_table(keyspace_name, table_name);
if (!tbl) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
throw api_error::resource_not_found(fmt::format("Requested resource not found: Table: {} not found", table_name));
}
auto m = co_await service::prepare_column_family_drop_announcement(p.local(), keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
auto m2 = co_await service::prepare_keyspace_drop_announcement(p.local(), keyspace_name, group0_guard.write_timestamp());
std::move(m2.begin(), m2.end(), std::back_inserter(m));
// When deleting a table and its views, we need to remove this role's
// special permissions in those tables (undoing the "auto-grant" done
// by CreateTable). If we didn't do this, if a second role later
// recreates a table with the same name, the first role would still
// have permissions over the new table.
// To make things more robust we just remove *all* permissions for
// the deleted table (CQL's drop_table_statement also does this).
// Unfortunately, there is an API mismatch between this code (which
// uses separate group0_guard and vector<mutation>) and the function
// revoke_all() which uses a combined "group0_batch" structure - so
// we need to do some ugly back-and-forth conversions between the pair
// to the group0_batch and back to the pair :-(
service::group0_batch mc(std::move(group0_guard));
mc.add_mutations(std::move(m));
auto resource = auth::make_data_resource(schema->ks_name(), schema->cf_name());
co_await auth::revoke_all(*cs.get().get_auth_service(), resource, mc);
for (const view_ptr& v : tbl->views()) {
resource = auth::make_data_resource(v->ks_name(), v->cf_name());
co_await auth::revoke_all(*cs.get().get_auth_service(), resource, mc);
}
std::tie(m, group0_guard) = co_await std::move(mc).extract();
try {
co_await mm.announce(std::move(m), std::move(group0_guard), fmt::format("alternator-executor: delete {} table", table_name));
break;
} catch (const service::group0_concurrent_modification& ex) {
elogger.info("Failed to execute DeleteTable {} due to concurrent schema modifications. {}.",
table_name, retries ? "Retrying" : "Number of retries exceeded, giving up");
if (retries--) {
continue;
}
throw;
}
}
});
rjson::value response = rjson::empty_object();
rjson::add(response, "TableDescription", std::move(table_description));
elogger.trace("returning {}", response);
co_return rjson::print(std::move(response));
}
static data_type parse_key_type(std::string_view type) {
// Note that keys are only allowed to be string, blob or number (S/B/N).
// The other types: boolean and various lists or sets - are not allowed.
if (type.length() == 1) {
switch (type[0]) {
case 'S': return utf8_type;
case 'B': return bytes_type;
case 'N': return decimal_type; // FIXME: use a specialized Alternator type, not the general "decimal_type".
}
}
throw api_error::validation(
fmt::format("Invalid key type '{}', can only be S, B or N.", type));
}
static void add_column(schema_builder& builder, const std::string& name, const rjson::value& attribute_definitions, column_kind kind, bool computed_column=false) {
// FIXME: Currently, the column name ATTRS_COLUMN_NAME is not allowed
// because we use it for our untyped attribute map, and we can't have a
// second column with the same name. We should fix this, by renaming
// some column names which we want to reserve.
if (name == executor::ATTRS_COLUMN_NAME) {
throw api_error::validation(fmt::format("Column name '{}' is currently reserved. FIXME.", name));
}
for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) {
const rjson::value& attribute_info = *it;
if (rjson::to_string_view(attribute_info["AttributeName"]) == name) {
std::string_view type = rjson::to_string_view(attribute_info["AttributeType"]);
data_type dt = parse_key_type(type);
if (computed_column) {
// Computed column for GSI (doesn't choose a real column as-is
// but rather extracts a single value from the ":attrs" map)
alternator_type at = type_info_from_string(type).atype;
builder.with_computed_column(to_bytes(name), dt, kind,
std::make_unique<extract_from_attrs_column_computation>(to_bytes(name), at));
} else {
builder.with_column(to_bytes(name), dt, kind);
}
return;
}
}
throw api_error::validation(
fmt::format("KeySchema key '{}' missing in AttributeDefinitions", name));
}
// Parse the KeySchema request attribute, which specifies the column names
// for a key. A KeySchema must include up to two elements, the first must be
// the HASH key name, and the second one, if exists, must be a RANGE key name.
// The function returns the two column names - the first is the hash key
// and always present, the second is the range key and may be an empty string.
static std::pair<std::string, std::string> parse_key_schema(const rjson::value& obj, std::string_view supplementary_context) {
const rjson::value *key_schema;
if (!obj.IsObject() || !(key_schema = rjson::find(obj, "KeySchema"))) {
throw api_error::validation("Missing KeySchema member");
}
if (!key_schema->IsArray() || key_schema->Size() < 1 || key_schema->Size() > 2) {
throw api_error::validation("KeySchema must list exactly one or two key columns");
}
if (!(*key_schema)[0].IsObject()) {
throw api_error::validation("First element of KeySchema must be an object");
}
const rjson::value *v = rjson::find((*key_schema)[0], "KeyType");
if (!v || !v->IsString() || rjson::to_string_view(*v) != "HASH") {
throw api_error::validation("First key in KeySchema must be a HASH key");
}
v = rjson::find((*key_schema)[0], "AttributeName");
if (!v || !v->IsString()) {
throw api_error::validation("First key in KeySchema must have string AttributeName");
}
validate_attr_name_length(supplementary_context, v->GetStringLength(), true, "HASH key in KeySchema - ");
std::string hash_key = rjson::to_string(*v);
std::string range_key;
if (key_schema->Size() == 2) {
if (!(*key_schema)[1].IsObject()) {
throw api_error::validation("Second element of KeySchema must be an object");
}
v = rjson::find((*key_schema)[1], "KeyType");
if (!v || !v->IsString() || rjson::to_string_view(*v) != "RANGE") {
throw api_error::validation("Second key in KeySchema must be a RANGE key");
}
v = rjson::find((*key_schema)[1], "AttributeName");
if (!v || !v->IsString()) {
throw api_error::validation("Second key in KeySchema must have string AttributeName");
}
validate_attr_name_length(supplementary_context, v->GetStringLength(), true, "RANGE key in KeySchema - ");
range_key = v->GetString();
}
return {hash_key, range_key};
}
arn_parts parse_arn(std::string_view arn, std::string_view arn_field_name, std::string_view type_name, std::string_view expected_postfix) {
// Expected ARN format arn:partition:service:region:account-id:resource-type/resource-id
// So (old arn produced by scylla for internal purposes) arn:scylla:alternator:${KEYSPACE_NAME}:scylla:table/${TABLE_NAME}
// or (KCL ready new arn) arn:aws:dynamodb:us-east-1:797456418907:table/${KEYSPACE_NAME}@${TABLE_NAME}/stream/2025-12-18T17:38:48.952
// According to Amazon account-id must be a number, but we don't check for it here.
// postfix is part of the string after table name, including the separator, e.g. "/stream/2025-12-18T17:38:48.952" in the second example above
// if expected_postfix is empty, then we reject ARN with any postfix
// otherwise we require that postfix starts with expected_postfix and we return it
if (!arn.starts_with("arn:")) {
throw api_error::access_denied(fmt::format("{}: Invalid {} ARN `{}` - missing `arn:` prefix", arn_field_name, type_name, arn));
}
// skip to the resource part
// we don't require ARNs to follow KCL requirements, except for number of parts.
auto index = arn.find(':'); // skip arn
bool is_scylla_arn = false;
std::string_view keyspace_name;
if (index != std::string_view::npos) {
auto start = index + 1;
index = arn.find(':', index + 1); // skip partition
if (index != std::string_view::npos) {
if (arn.substr(start, index - start) == "scylla") {
is_scylla_arn = true;
}
index = arn.find(':', index + 1); // skip service
if (index != std::string_view::npos) {
start = index + 1;
index = arn.find(':', index + 1); // skip region
if (index != std::string_view::npos) {
if (is_scylla_arn) {
keyspace_name = arn.substr(start, index - start);
}
index = arn.find(':', index + 1); // skip account-id
}
}
}
}
if (index == std::string_view::npos) {
throw api_error::access_denied(fmt::format("{}: Invalid {} ARN `{}` - missing arn:<partition>:<service>:<region>:<account-id> prefix", arn_field_name, type_name, arn));
}
// index is at last `:` before `table/<table-name>`
// this looks like a valid ARN up to this point
// as a sanity check make sure that what follows is a resource-type "table/"
if (arn.substr(index + 1, 6) != "table/") {
throw api_error::validation(fmt::format("{}: Invalid {} ARN `{}` - resource-type is not `table/`", arn_field_name, type_name, arn));
}
index += 7; // move past ":table/"
// Amazon's spec allows both ':' and '/' as resource separators
auto end_index = arn.substr(index).find_first_of("/:");
if (end_index == std::string_view::npos) {
end_index = arn.length();
}
else {
end_index += index; // adjust end_index to be relative to the start of the string, not to the start of the table name
}
auto table_name = arn.substr(index, end_index - index);
if (!is_scylla_arn) {
auto separator_pos = table_name.find_first_of("@");
if (separator_pos == std::string_view::npos) {
throw api_error::validation(fmt::format("{}: Invalid {} ARN `{}` - missing separator `@`", arn_field_name, type_name, arn));
}
keyspace_name = table_name.substr(0, separator_pos);
table_name = table_name.substr(separator_pos + 1);
}
index = end_index;
// caller might require a specific postfix after the table name
// so for example in arn:aws:dynamodb:us-east-1:797456418907:table/dynamodb_streams_verification_table_rc/stream/2025-12-18T17:38:48.952
// specific postfix could be "/stream/" to make sure ARN is for a stream, not for the table itself
// postfix will contain leading separator
std::string_view postfix = arn.substr(index);
if (postfix.empty() == expected_postfix.empty() && postfix.starts_with(expected_postfix)) {
// we will end here if
// - postfix and expected_postfix are both empty (thus `starts_with` will check against empty string and return true)
// - both are not empty and postfix starts with expected_postfix
return { keyspace_name, table_name, postfix };
}
throw api_error::validation(fmt::format("{}: Invalid {} ARN `{}` - expected `{}` after table name `{}`", arn_field_name, type_name, arn, expected_postfix, table_name));
}
static schema_ptr get_table_from_arn(service::storage_proxy& proxy, std::string_view arn) {
// NOTE: This code returns AccessDeniedException if it's problematic to parse or recognize an arn.
// Technically, a properly formatted, but nonexistent arn *should* return AccessDeniedException,
// while an incorrectly formatted one should return ValidationException.
// Unfortunately, the rules are really uncertain, since DynamoDB
// states that arns are of the form arn:partition:service:region:account-id:resource-type/resource-id
// or similar - yet, for some arns that do not fit that pattern (e.g. "john"),
// it still returns AccessDeniedException rather than ValidationException.
// Consequently, this code simply falls back to AccessDeniedException,
// concluding that an error is an error and code which uses tagging
// must be ready for handling AccessDeniedException instances anyway.
try {
auto parts = parse_arn(arn, "ResourceArn", "table", "");
return proxy.data_dictionary().find_schema(parts.keyspace_name, parts.table_name);
} catch (const data_dictionary::no_such_column_family& e) {
throw api_error::resource_not_found(fmt::format("ResourceArn: Invalid table ARN `{}` - not found", arn));
} catch (const std::out_of_range& e) {
throw api_error::access_denied(fmt::format("ResourceArn: Invalid table ARN `{}` - {}", arn, e.what()));
}
}
static bool is_legal_tag_char(char c) {
// FIXME: According to docs, unicode strings should also be accepted.
// Alternator currently uses a simplified ASCII approach
return std::isalnum(c) || std::isspace(c)
|| c == '+' || c == '-' || c == '=' || c == '.' || c == '_' || c == ':' || c == '/' ;
}
static bool validate_legal_tag_chars(std::string_view tag) {
return std::all_of(tag.begin(), tag.end(), &is_legal_tag_char);
}
static const std::unordered_set<std::string_view> allowed_write_isolation_values = {
"f", "forbid", "forbid_rmw",
"a", "always", "always_use_lwt",
"o", "only_rmw_uses_lwt",
"u", "unsafe", "unsafe_rmw",
};
static void validate_tags(const std::map<sstring, sstring>& tags) {
auto it = tags.find(rmw_operation::WRITE_ISOLATION_TAG_KEY);
if (it != tags.end()) {
std::string_view value = it->second;
if (!allowed_write_isolation_values.contains(value)) {
throw api_error::validation(
fmt::format("Incorrect write isolation tag {}. Allowed values: {}", value, allowed_write_isolation_values));
}
}
}
static rmw_operation::write_isolation parse_write_isolation(std::string_view value) {
if (!value.empty()) {
switch (value[0]) {
case 'f':
return rmw_operation::write_isolation::FORBID_RMW;
case 'a':
return rmw_operation::write_isolation::LWT_ALWAYS;
case 'o':
return rmw_operation::write_isolation::LWT_RMW_ONLY;
case 'u':
return rmw_operation::write_isolation::UNSAFE_RMW;
}
}
// Shouldn't happen as validate_tags() / set_default_write_isolation()
// verify allow only a closed set of values.
return rmw_operation::default_write_isolation;
}
// This default_write_isolation is always overwritten in main.cc, which calls
// set_default_write_isolation().
rmw_operation::write_isolation rmw_operation::default_write_isolation =
rmw_operation::write_isolation::LWT_ALWAYS;
void rmw_operation::set_default_write_isolation(std::string_view value) {
if (value.empty()) {
throw std::runtime_error("When Alternator is enabled, write "
"isolation policy must be selected, using the "
"'--alternator-write-isolation' option. "
"See docs/alternator/alternator.md for instructions.");
}
if (!allowed_write_isolation_values.contains(value)) {
throw std::runtime_error(fmt::format("Invalid --alternator-write-isolation "
"setting '{}'. Allowed values: {}.",
value, allowed_write_isolation_values));
}
default_write_isolation = parse_write_isolation(value);
}
// Alternator uses tags whose keys start with the "system:" prefix for
// internal purposes. Those should not be readable by ListTagsOfResource,
// nor writable with TagResource or UntagResource (see #24098).
// Only a few specific system tags, currently only "system:write_isolation"
// and "system:initial_tablets", are deliberately intended to be set and read
// by the user, so are not considered "internal".
static bool tag_key_is_internal(std::string_view tag_key) {
return tag_key.starts_with("system:")
&& tag_key != rmw_operation::WRITE_ISOLATION_TAG_KEY
&& tag_key != INITIAL_TABLETS_TAG_KEY;
}
enum class update_tags_action { add_tags, delete_tags };
static void update_tags_map(const rjson::value& tags, std::map<sstring, sstring>& tags_map, update_tags_action action) {
if (action == update_tags_action::add_tags) {
for (auto it = tags.Begin(); it != tags.End(); ++it) {
if (!it->IsObject()) {
throw api_error::validation("invalid tag object");
}
const rjson::value* key = rjson::find(*it, "Key");
const rjson::value* value = rjson::find(*it, "Value");
if (!key || !key->IsString() || !value || !value->IsString()) {
throw api_error::validation("string Key and Value required");
}
auto tag_key = rjson::to_string_view(*key);
auto tag_value = rjson::to_string_view(*value);
if (tag_key.empty()) {
throw api_error::validation("A tag Key cannot be empty");
}
if (tag_key.size() > 128) {
throw api_error::validation("A tag Key is limited to 128 characters");
}
if (!validate_legal_tag_chars(tag_key)) {
throw api_error::validation("A tag Key can only contain letters, spaces, and [+-=._:/]");
}
if (tag_key_is_internal(tag_key)) {
throw api_error::validation(fmt::format("Tag key '{}' is reserved for internal use", tag_key));
}
// Note tag values are limited similarly to tag keys, but have a
// longer length limit, and *can* be empty.
if (tag_value.size() > 256) {
throw api_error::validation("A tag Value is limited to 256 characters");
}
if (!validate_legal_tag_chars(tag_value)) {
throw api_error::validation("A tag Value can only contain letters, spaces, and [+-=._:/]");
}
tags_map[sstring(tag_key)] = sstring(tag_value);
}
} else if (action == update_tags_action::delete_tags) {
for (auto it = tags.Begin(); it != tags.End(); ++it) {
auto tag_key = rjson::to_string_view(*it);
if (tag_key_is_internal(tag_key)) {
throw api_error::validation(fmt::format("Tag key '{}' is reserved for internal use", tag_key));
}
tags_map.erase(sstring(tag_key));
}
}
if (tags_map.size() > 50) {
throw api_error::validation("Number of Tags exceed the current limit for the provided ResourceArn");
}
validate_tags(tags_map);
}
const std::map<sstring, sstring>& get_tags_of_table_or_throw(schema_ptr schema) {
auto tags_ptr = db::get_tags_of_table(schema);
if (tags_ptr) {
return *tags_ptr;
} else {
throw api_error::validation(format("Table {} does not have valid tagging information", schema->ks_name()));
}
}
future<executor::request_return_type> executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.tag_resource++;
const rjson::value* arn = rjson::find(request, "ResourceArn");
if (!arn || !arn->IsString()) {
co_return api_error::access_denied("Incorrect resource identifier");
}
schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
maybe_audit(audit_info, audit::statement_category::DDL, schema->ks_name(), schema->cf_name(), "TagResource", request);
get_stats_from_schema(_proxy, *schema)->api_operations.tag_resource++;
const rjson::value* tags = rjson::find(request, "Tags");
if (!tags || !tags->IsArray()) {
co_return api_error::validation("Cannot parse tags");
}
if (tags->Size() < 1) {
co_return api_error::validation("The number of tags must be at least 1") ;
}
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::ALTER, _stats);
co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
update_tags_map(*tags, tags_map, update_tags_action::add_tags);
});
co_return ""; // empty response
}
future<executor::request_return_type> executor::untag_resource(client_state& client_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.untag_resource++;
const rjson::value* arn = rjson::find(request, "ResourceArn");
if (!arn || !arn->IsString()) {
co_return api_error::access_denied("Incorrect resource identifier");
}
const rjson::value* tags = rjson::find(request, "TagKeys");
if (!tags || !tags->IsArray()) {
co_return api_error::validation(format("Cannot parse tag keys"));
}
schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
maybe_audit(audit_info, audit::statement_category::DDL, schema->ks_name(), schema->cf_name(), "UntagResource", request);
get_stats_from_schema(_proxy, *schema)->api_operations.untag_resource++;
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::ALTER, _stats);
co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
});
co_return ""; // empty response
}
future<executor::request_return_type> executor::list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.list_tags_of_resource++;
const rjson::value* arn = rjson::find(request, "ResourceArn");
if (!arn || !arn->IsString()) {
return make_ready_future<request_return_type>(api_error::access_denied("Incorrect resource identifier"));
}
schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "ListTagsOfResource", request);
get_stats_from_schema(_proxy, *schema)->api_operations.list_tags_of_resource++;
auto tags_map = get_tags_of_table_or_throw(schema);
rjson::value ret = rjson::empty_object();
rjson::add(ret, "Tags", rjson::empty_array());
rjson::value& tags = ret["Tags"];
for (auto& tag_entry : tags_map) {
if (tag_key_is_internal(tag_entry.first)) {
continue;
}
rjson::value new_entry = rjson::empty_object();
rjson::add(new_entry, "Key", rjson::from_string(tag_entry.first));
rjson::add(new_entry, "Value", rjson::from_string(tag_entry.second));
rjson::push_back(tags, std::move(new_entry));
}
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
}
struct billing_mode_type {
bool provisioned = false;
int rcu;
int wcu;
};
static billing_mode_type verify_billing_mode(const rjson::value& request) {
// Alternator does not yet support billing or throughput limitations, but
// let's verify that BillingMode is at least legal.
std::string billing_mode = get_string_attribute(request, "BillingMode", "PROVISIONED");
if (billing_mode == "PAY_PER_REQUEST") {
if (rjson::find(request, "ProvisionedThroughput")) {
throw api_error::validation("When BillingMode=PAY_PER_REQUEST, ProvisionedThroughput cannot be specified.");
}
} else if (billing_mode == "PROVISIONED") {
const rjson::value *provisioned_throughput = rjson::find(request, "ProvisionedThroughput");
if (!provisioned_throughput) {
throw api_error::validation("When BillingMode=PROVISIONED, ProvisionedThroughput must be specified.");
}
const rjson::value& throughput = *provisioned_throughput;
auto rcu = get_int_attribute(throughput, "ReadCapacityUnits");
auto wcu = get_int_attribute(throughput, "WriteCapacityUnits");
if (!rcu.has_value()) {
throw api_error::validation("provisionedThroughput.readCapacityUnits is missing, when BillingMode=PROVISIONED, ProvisionedThroughput must be specified.");
}
if (!wcu.has_value()) {
throw api_error::validation("provisionedThroughput.writeCapacityUnits is missing, when BillingMode=PROVISIONED, ProvisionedThroughput must be specified.");
}
return billing_mode_type{true, *rcu, *wcu};
} else {
throw api_error::validation("Unknown BillingMode={}. Must be PAY_PER_REQUEST or PROVISIONED.");
}
return billing_mode_type();
}
// Validate that a AttributeDefinitions parameter in CreateTable is valid, and
// throws user-facing api_error::validation if it's not.
// In particular, verify that the same AttributeName doesn't appear more than
// once (Issue #13870).
// Return the set of attribute names defined in AttributeDefinitions - this
// set is useful for later verifying that all of them are used by some
// KeySchema (issue #19784)
static std::unordered_set<std::string> validate_attribute_definitions(std::string_view supplementary_context, const rjson::value& attribute_definitions) {
if (!attribute_definitions.IsArray()) {
throw api_error::validation("AttributeDefinitions must be an array");
}
std::unordered_set<std::string> seen_attribute_names;
for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) {
const rjson::value* attribute_name = rjson::find(*it, "AttributeName");
if (!attribute_name) {
throw api_error::validation("AttributeName missing in AttributeDefinitions");
}
if (!attribute_name->IsString()) {
throw api_error::validation("AttributeName in AttributeDefinitions must be a string");
}
validate_attr_name_length(supplementary_context, attribute_name->GetStringLength(), true, "in AttributeDefinitions - ");
auto [it2, added] = seen_attribute_names.emplace(rjson::to_string_view(*attribute_name));
if (!added) {
throw api_error::validation(fmt::format("Duplicate AttributeName={} in AttributeDefinitions",
rjson::to_string_view(*attribute_name)));
}
const rjson::value* attribute_type = rjson::find(*it, "AttributeType");
if (!attribute_type) {
throw api_error::validation("AttributeType missing in AttributeDefinitions");
}
if (!attribute_type->IsString()) {
throw api_error::validation("AttributeType in AttributeDefinitions must be a string");
}
}
return seen_attribute_names;
}
// The following "extract_from_attrs_column_computation" implementation is
// what allows Alternator GSIs and LSIs to use in a materialized view's key a
// member from the ":attrs" map instead of a real column in the schema:
const bytes extract_from_attrs_column_computation::MAP_NAME = executor::ATTRS_COLUMN_NAME;
column_computation_ptr extract_from_attrs_column_computation::clone() const {
return std::make_unique<extract_from_attrs_column_computation>(*this);
}
// Serialize the *definition* of this column computation into a JSON
// string with a unique "type" string - TYPE_NAME - which then causes
// column_computation::deserialize() to create an object from this class.
bytes extract_from_attrs_column_computation::serialize() const {
rjson::value ret = rjson::empty_object();
rjson::add(ret, "type", TYPE_NAME);
rjson::add(ret, "attr_name", rjson::from_string(to_string_view(_attr_name)));
rjson::add(ret, "desired_type", represent_type(_desired_type).ident);
return to_bytes(rjson::print(ret));
}
// Construct an extract_from_attrs_column_computation object based on the
// saved output of serialize(). Calls on_internal_error() if the string
// doesn't match the expected output format of serialize(). "type" is not
// checked - we assume the caller (column_computation::deserialize()) won't
// call this constructor if "type" doesn't match.
extract_from_attrs_column_computation::extract_from_attrs_column_computation(const rjson::value &v) {
const rjson::value* attr_name = rjson::find(v, "attr_name");
if (attr_name->IsString()) {
_attr_name = bytes(to_bytes_view(rjson::to_string_view(*attr_name)));
const rjson::value* desired_type = rjson::find(v, "desired_type");
if (desired_type->IsString()) {
_desired_type = type_info_from_string(rjson::to_string_view(*desired_type)).atype;
switch (_desired_type) {
case alternator_type::S:
case alternator_type::B:
case alternator_type::N:
// We're done
return;
default:
// Fall through to on_internal_error below.
break;
}
}
}
on_internal_error(elogger, format("Improperly formatted alternator::extract_from_attrs_column_computation computed column definition: {}", v));
}
regular_column_transformation::result extract_from_attrs_column_computation::compute_value(
const schema& schema,
const partition_key& key,
const db::view::clustering_or_static_row& row) const
{
const column_definition* attrs_col = schema.get_column_definition(MAP_NAME);
if (!attrs_col || !attrs_col->is_regular() || !attrs_col->is_multi_cell()) {
on_internal_error(elogger, "extract_from_attrs_column_computation::compute_value() on a table without an attrs map");
}
// Look for the desired attribute _attr_name in the attrs_col map in row:
const atomic_cell_or_collection* attrs = row.cells().find_cell(attrs_col->id);
if (!attrs) {
return regular_column_transformation::result();
}
collection_mutation_view cmv = attrs->as_collection_mutation();
return cmv.with_deserialized(*attrs_col->type, [this] (const collection_mutation_view_description& cmvd) {
for (auto&& [key, cell] : cmvd.cells) {
if (key == _attr_name) {
return regular_column_transformation::result(cell,
std::bind(serialized_value_if_type, std::placeholders::_1, _desired_type));
}
}
return regular_column_transformation::result();
});
}
// extract_from_attrs_column_computation needs the whole row to compute
// value, it can't use just the partition key.
bytes extract_from_attrs_column_computation::compute_value(const schema&, const partition_key&) const {
on_internal_error(elogger, "extract_from_attrs_column_computation::compute_value called without row");
}
// Because `CreateTable` request creates GSI/LSI together with the base table (so the base table is empty),
// we can skip view building process and immediately mark the view as built on all nodes.
//
// However, we can do this only for tablet-based views because `view_building_worker` will automatically propagate
// this information to `system.built_views` table (see `view_building_worker::update_built_views()`).
// For vnode-based views, `view_builder` will process the view and mark it as built.
static future<> mark_view_schemas_as_built(utils::chunked_vector<mutation>& out, std::vector<schema_ptr> schemas, api::timestamp_type ts, service::storage_proxy& sp) {
auto token_metadata = sp.get_token_metadata_ptr();
for (auto& schema: schemas) {
if (schema->is_view()) {
for (auto& host_id: token_metadata->get_topology().get_all_host_ids()) {
auto view_status_mut = co_await sp.system_keyspace().make_view_build_status_mutation(ts, {schema->ks_name(), schema->cf_name()}, host_id, db::view::build_status::SUCCESS);
out.push_back(std::move(view_status_mut));
}
}
}
}
// Returns true if the given attribute name is already the target of any vector
// index on the schema. Analogous to schema::has_index(), but looks up by the
// indexed attribute name rather than the index name.
static bool has_vector_index_on_attribute(const schema& s, std::string_view attribute_name) {
for (const index_metadata& im : s.indices()) {
// No need to check if the secondary index is a vector index, because
// Alternator doesn't use secondary indexes for anything else (GSI and
// LSI are implemented as materialized views, not secondary indexes).
const auto& opts = im.options();
auto target_it = opts.find(cql3::statements::index_target::target_option_name);
if (target_it != opts.end() && target_it->second == attribute_name) {
return true;
}
}
return false;
}
// Returns the validated "Dimensions" value from a VectorAttribute JSON object
// or throws api_error::validation if invalid. The "source" parameter is used
// in error messages (e.g., "VectorIndexes" or "VectorIndexUpdates").
static int get_dimensions(const rjson::value& vector_attribute, std::string_view source) {
const rjson::value* dimensions_v = rjson::find(vector_attribute, "Dimensions");
if (!dimensions_v || !dimensions_v->IsInt() || dimensions_v->GetInt() <= 0 || (vector_dimension_t)dimensions_v->GetInt() > cql3::cql3_type::MAX_VECTOR_DIMENSION) {
throw api_error::validation(fmt::format("{} Dimensions must be an integer between 1 and {}.", source, cql3::cql3_type::MAX_VECTOR_DIMENSION));
}
return dimensions_v->GetInt();
}
future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization,
const db::tablets_mode_t::mode tablets_mode, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
throwing_assert(this_shard_id() == 0);
// We begin by parsing and validating the content of the CreateTable
// command. We can't inspect the current database schema at this point
// (e.g., verify that this table doesn't already exist) - we can only
// do this further down - after taking group0_guard.
std::string table_name = get_table_name(request);
validate_table_name(table_name);
if (table_name.find(executor::INTERNAL_TABLE_PREFIX) == 0) {
co_return api_error::validation(fmt::format("Prefix {} is reserved for accessing internal tables", executor::INTERNAL_TABLE_PREFIX));
}
std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
maybe_audit(audit_info, audit::statement_category::DDL, keyspace_name, table_name, "CreateTable", request);
const rjson::value* attribute_definitions = rjson::find(request, "AttributeDefinitions");
if (attribute_definitions == nullptr) {
co_return api_error::validation("Missing AttributeDefinitions in CreateTable request");
}
// Save the list of AttributeDefinitions in unused_attribute_definitions,
// and below remove each one as we see it in a KeySchema of the table or
// any of its GSIs or LSIs. If anything remains in this set at the end of
// this function, it's an error.
std::unordered_set<std::string> unused_attribute_definitions =
validate_attribute_definitions("", *attribute_definitions);
tracing::add_alternator_table_name(trace_state, table_name);
schema_builder builder(keyspace_name, table_name);
auto [hash_key, range_key] = parse_key_schema(request, "");
add_column(builder, hash_key, *attribute_definitions, column_kind::partition_key);
unused_attribute_definitions.erase(hash_key);
if (!range_key.empty()) {
add_column(builder, range_key, *attribute_definitions, column_kind::clustering_key);
unused_attribute_definitions.erase(range_key);
}
builder.with_column(bytes(executor::ATTRS_COLUMN_NAME), attrs_type(), column_kind::regular_column);
billing_mode_type bm = verify_billing_mode(request);
schema_ptr partial_schema = builder.build();
// Parse Local/GlobalSecondaryIndexes parameters before creating the
// base table, so if we have a parse errors we can fail without creating
// any table.
std::vector<schema_builder> view_builders;
std::unordered_set<std::string> index_names;
const rjson::value* lsi = rjson::find(request, "LocalSecondaryIndexes");
if (lsi) {
if (!lsi->IsArray()) {
throw api_error::validation("LocalSecondaryIndexes must be an array.");
}
for (const rjson::value& l : lsi->GetArray()) {
const rjson::value* index_name_v = rjson::find(l, "IndexName");
if (!index_name_v || !index_name_v->IsString()) {
throw api_error::validation("LocalSecondaryIndexes IndexName must be a string.");
}
std::string_view index_name = rjson::to_string_view(*index_name_v);
auto [it, added] = index_names.emplace(index_name);
if (!added) {
co_return api_error::validation(fmt::format("Duplicate IndexName '{}', ", index_name));
}
std::string vname(lsi_name(table_name, index_name));
elogger.trace("Adding LSI {}", index_name);
if (range_key.empty()) {
co_return api_error::validation("LocalSecondaryIndex requires that the base table have a range key");
}
// FIXME: read and handle "Projection" parameter. This will
// require the MV code to copy just parts of the attrs map.
schema_builder view_builder(keyspace_name, vname);
auto [view_hash_key, view_range_key] = parse_key_schema(l, "Local Secondary Index");
if (view_hash_key != hash_key) {
co_return api_error::validation("LocalSecondaryIndex hash key must match the base table hash key");
}
add_column(view_builder, view_hash_key, *attribute_definitions, column_kind::partition_key);
unused_attribute_definitions.erase(view_hash_key);
if (view_range_key.empty()) {
co_return api_error::validation("LocalSecondaryIndex must specify a sort key");
}
unused_attribute_definitions.erase(view_range_key);
if (view_range_key == hash_key) {
co_return api_error::validation("LocalSecondaryIndex sort key cannot be the same as hash key");
}
add_column(view_builder, view_range_key, *attribute_definitions, column_kind::clustering_key, view_range_key != range_key);
// Base key columns which aren't part of the index's key need to
// be added to the view nonetheless, as (additional) clustering
// key(s).
if (!range_key.empty() && view_range_key != range_key) {
add_column(view_builder, range_key, *attribute_definitions, column_kind::clustering_key);
}
view_builder.with_column(bytes(executor::ATTRS_COLUMN_NAME), attrs_type(), column_kind::regular_column);
// Note above we don't need to add virtual columns, as all
// base columns were copied to view. TODO: reconsider the need
// for virtual columns when we support Projection.
// LSIs have no tags, but Scylla's "synchronous_updates" feature
// (which an LSIs need), is actually implemented as a tag so we
// need to add it here:
std::map<sstring, sstring> tags_map = {{db::SYNCHRONOUS_VIEW_UPDATES_TAG_KEY, "true"}};
view_builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));
view_builders.emplace_back(std::move(view_builder));
}
}
const rjson::value* gsi = rjson::find(request, "GlobalSecondaryIndexes");
if (gsi) {
if (!gsi->IsArray()) {
co_return api_error::validation("GlobalSecondaryIndexes must be an array.");
}
for (const rjson::value& g : gsi->GetArray()) {
const rjson::value* index_name_v = rjson::find(g, "IndexName");
if (!index_name_v || !index_name_v->IsString()) {
co_return api_error::validation("GlobalSecondaryIndexes IndexName must be a string.");
}
std::string_view index_name = rjson::to_string_view(*index_name_v);
auto [it, added] = index_names.emplace(index_name);
if (!added) {
co_return api_error::validation(fmt::format("Duplicate IndexName '{}', ", index_name));
}
std::string vname(view_name(table_name, index_name));
elogger.trace("Adding GSI {}", index_name);
// FIXME: read and handle "Projection" parameter. This will
// require the MV code to copy just parts of the attrs map.
schema_builder view_builder(keyspace_name, vname);
auto [view_hash_key, view_range_key] = parse_key_schema(g, "GlobalSecondaryIndexes");
// If an attribute is already a real column in the base table
// (i.e., a key attribute), we can use it directly as a view key.
// Otherwise, we need to add it as a "computed column", which
// extracts and deserializes the attribute from the ":attrs" map.
bool view_hash_key_real_column = partial_schema->get_column_definition(to_bytes(view_hash_key));
add_column(view_builder, view_hash_key, *attribute_definitions, column_kind::partition_key, !view_hash_key_real_column);
unused_attribute_definitions.erase(view_hash_key);
if (!view_range_key.empty()) {
bool view_range_key_real_column = partial_schema->get_column_definition(to_bytes(view_range_key));
add_column(view_builder, view_range_key, *attribute_definitions, column_kind::clustering_key, !view_range_key_real_column);
if (!partial_schema->get_column_definition(to_bytes(view_range_key)) &&
!partial_schema->get_column_definition(to_bytes(view_hash_key))) {
// FIXME: This warning should go away. See issue #6714
elogger.warn("Only 1 regular column from the base table should be used in the GSI key in order to ensure correct liveness management without assumptions");
}
unused_attribute_definitions.erase(view_range_key);
}
// Base key columns which aren't part of the index's key need to
// be added to the view nonetheless, as (additional) clustering
// key(s).
// NOTE: DescribeTable's implementation depends on those keys being added AFTER user specified keys.
bool spurious_base_key_added_as_range_key = false;
if (hash_key != view_hash_key && hash_key != view_range_key) {
add_column(view_builder, hash_key, *attribute_definitions, column_kind::clustering_key);
spurious_base_key_added_as_range_key = true;
}
if (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
add_column(view_builder, range_key, *attribute_definitions, column_kind::clustering_key);
spurious_base_key_added_as_range_key = true;
}
std::map<sstring, sstring> tags;
if (view_range_key.empty() && spurious_base_key_added_as_range_key) {
tags[SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY] = "true";
}
view_builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(std::move(tags)));
view_builders.emplace_back(std::move(view_builder));
}
}
if (!unused_attribute_definitions.empty()) {
co_return api_error::validation(fmt::format(
"AttributeDefinitions defines spurious attributes not used by any KeySchema: {}",
unused_attribute_definitions));
}
// Parse VectorIndexes parameters and apply them to "builder". This all
// happens before we actually create the table, so if we have a parse
// errors we can still fail without creating any table.
const rjson::value* vector_indexes = rjson::find(request, "VectorIndexes");
if (vector_indexes) {
if (!vector_indexes->IsArray()) {
co_return api_error::validation("VectorIndexes must be an array.");
}
std::unordered_set<std::string> seen_attribute_names;
for (const rjson::value& v : vector_indexes->GetArray()) {
const rjson::value* index_name_v = rjson::find(v, "IndexName");
if (!index_name_v || !index_name_v->IsString()) {
co_return api_error::validation("VectorIndexes IndexName must be a string.");
}
std::string_view index_name = rjson::to_string_view(*index_name_v);
// Limit the length and character choice of a vector index's
// name to the same rules as table names. This is slightly
// different from GSI/LSI names, where we limit not the length
// of the index name but its sum with the base table name.
validate_table_name(index_name, "VectorIndexes IndexName");
if (!index_names.emplace(index_name).second) {
co_return api_error::validation(fmt::format("Duplicate IndexName '{}', ", index_name));
}
const rjson::value* vector_attribute_v = rjson::find(v, "VectorAttribute");
if (!vector_attribute_v || !vector_attribute_v->IsObject()) {
co_return api_error::validation("VectorIndexes VectorAttribute must be an object.");
}
const rjson::value* attribute_name_v = rjson::find(*vector_attribute_v, "AttributeName");
if (!attribute_name_v || !attribute_name_v->IsString()) {
co_return api_error::validation("VectorIndexes AttributeName must be a string.");
}
std::string_view attribute_name = rjson::to_string_view(*attribute_name_v);
validate_attr_name_length("VectorIndexes", attribute_name.size(), /*is_key=*/false, "AttributeName ");
if (!seen_attribute_names.emplace(attribute_name).second) {
co_return api_error::validation(fmt::format("Duplicate vector index on the same AttributeName '{}'", attribute_name));
}
// attribute_name must not be one of the key columns of the base
// or GSIs or LSIs, because those have mandatory types (defined in
// AttributeDefinitions) which will never be a vector.
for (auto it = attribute_definitions->Begin(); it != attribute_definitions->End(); ++it) {
if (rjson::to_string_view((*it)["AttributeName"]) == attribute_name) {
co_return api_error::validation(fmt::format(
"VectorIndexes AttributeName '{}' is a key column of type {} so cannot be used as a vector index target.", attribute_name, rjson::to_string_view((*it)["AttributeType"])));
}
}
int dimensions = get_dimensions(*vector_attribute_v, "VectorIndexes");
// The optional Projection parameter is only supported with
// ProjectionType=KEYS_ONLY. Other values are not yet supported.
const rjson::value* projection_v = rjson::find(v, "Projection");
if (projection_v) {
if (!projection_v->IsObject()) {
co_return api_error::validation("VectorIndexes Projection must be an object.");
}
const rjson::value* projection_type_v = rjson::find(*projection_v, "ProjectionType");
if (!projection_type_v || !projection_type_v->IsString() ||
rjson::to_string_view(*projection_type_v) != "KEYS_ONLY") {
co_return api_error::validation("VectorIndexes Projection: only ProjectionType=KEYS_ONLY is currently supported.");
}
}
// Add a vector index metadata entry to the base table schema.
index_options_map index_options;
index_options[db::index::secondary_index::custom_class_option_name] = "vector_index";
index_options[cql3::statements::index_target::target_option_name] = sstring(attribute_name);
index_options["dimensions"] = std::to_string(dimensions);
builder.with_index(index_metadata{sstring(index_name), index_options,
index_metadata_kind::custom, index_metadata::is_local_index(false)});
}
// If we have any vector indexes, we will use CDC and the CDC log
// name will need to fit our length limits, so validate it now.
if (vector_indexes->Size() > 0) {
validate_cdc_log_name_length(builder.cf_name());
}
}
// We don't yet support configuring server-side encryption (SSE) via the
// SSESpecifiction attribute, but an SSESpecification with Enabled=false
// is simply the default, and should be accepted:
rjson::value* sse_specification = rjson::find(request, "SSESpecification");
if (sse_specification && sse_specification->IsObject()) {
rjson::value* enabled = rjson::find(*sse_specification, "Enabled");
if (!enabled || !enabled->IsBool()) {
co_return api_error("ValidationException", "SSESpecification needs boolean Enabled");
}
if (enabled->GetBool()) {
// TODO: full support for SSESpecification
co_return api_error("ValidationException", "SSESpecification: configuring encryption-at-rest is not yet supported.");
}
}
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
if (stream_specification && stream_specification->IsObject()) {
if (executor::add_stream_options(*stream_specification, builder, _proxy)) {
validate_cdc_log_name_length(builder.cf_name());
}
}
// Parse the "Tags" parameter early, so we can avoid creating the table
// at all if this parsing failed.
const rjson::value* tags = rjson::find(request, "Tags");
std::map<sstring, sstring> tags_map;
if (tags && tags->IsArray()) {
update_tags_map(*tags, tags_map, update_tags_action::add_tags);
}
if (bm.provisioned) {
tags_map[RCU_TAG_KEY] = std::to_string(bm.rcu);
tags_map[WCU_TAG_KEY] = std::to_string(bm.wcu);
}
set_table_creation_time(tags_map, db_clock::now());
builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));
co_await verify_create_permission(enforce_authorization, warn_authorization, client_state, _stats);
schema_ptr schema = builder.build();
for (auto& view_builder : view_builders) {
// Note below we don't need to add virtual columns, as all
// base columns were copied to view. TODO: reconsider the need
// for virtual columns when we support Projection.
for (const column_definition& regular_cdef : schema->regular_columns()) {
if (!view_builder.has_column(*cql3::to_identifier(regular_cdef))) {
view_builder.with_column(regular_cdef.name(), regular_cdef.type, column_kind::regular_column);
}
}
const bool include_all_columns = true;
view_builder.with_view_info(schema, include_all_columns, ""/*where clause*/);
}
size_t retries = _mm.get_concurrent_ddl_retries();
for (;;) {
auto group0_guard = co_await _mm.start_group0_operation();
auto ts = group0_guard.write_timestamp();
utils::chunked_vector<mutation> schema_mutations;
auto ksm = create_keyspace_metadata(keyspace_name, _proxy, _gossiper, ts, tags_map, _proxy.features(), tablets_mode);
locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
const auto& topo = _proxy.local_db().get_token_metadata().get_topology();
auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
// Alternator Streams doesn't yet work when the table uses tablets (#23838)
if (stream_specification && stream_specification->IsObject()) {
auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
if (stream_enabled && stream_enabled->IsBool() && stream_enabled->GetBool()) {
if (rs->uses_tablets()) {
co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
"If you want to use streams, create a table with vnodes by setting the tag 'system:initial_tablets' set to 'none'.");
}
}
}
// Vector indexes is a new feature that we decided to only support
// on tablets.
if (vector_indexes && vector_indexes->Size() > 0) {
if (!rs->uses_tablets()) {
co_return api_error::validation("Vector indexes are not supported on tables using vnodes.");
}
}
// Creating an index in tablets mode requires the keyspace to be RF-rack-valid.
// GSI and LSI indexes are based on materialized views which require RF-rack-validity to avoid consistency issues.
if (!view_builders.empty() || _proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
try {
locator::assert_rf_rack_valid_keyspace(keyspace_name, _proxy.local_db().get_token_metadata_ptr(), *rs);
} catch (const std::invalid_argument& ex) {
if (!view_builders.empty()) {
co_return api_error::validation(fmt::format("GlobalSecondaryIndexes and LocalSecondaryIndexes on a table "
"using tablets require the number of racks in the cluster to be either 1 or 3"));
} else {
co_return api_error::validation(fmt::format("Cannot create table '{}' with tablets: the configuration "
"option 'rf_rack_valid_keyspaces' is enabled, which enforces that tables using tablets can only be created in clusters "
"that have either 1 or 3 racks", table_name));
}
}
}
try {
schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
} catch (exceptions::already_exists_exception&) {
if (_proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
}
}
if (_proxy.data_dictionary().try_find_table(schema->id())) {
// This should never happen, the ID is supposed to be unique
co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
}
std::vector<schema_ptr> schemas;
schemas.push_back(schema);
for (schema_builder& view_builder : view_builders) {
schemas.push_back(view_builder.build());
}
co_await service::prepare_new_column_families_announcement(schema_mutations, _proxy, *ksm, schemas, ts);
if (ksm->uses_tablets()) {
co_await mark_view_schemas_as_built(schema_mutations, schemas, ts, _proxy);
}
// If a role is allowed to create a table, we must give it permissions to
// use (and eventually delete) the specific table it just created (and
// also the view tables). This is known as "auto-grant".
// Unfortunately, there is an API mismatch between this code (which uses
// separate group0_guard and vector<mutation>) and the function
// grant_applicable_permissions() which uses a combined "group0_batch"
// structure - so we need to do some ugly back-and-forth conversions
// between the pair to the group0_batch and back to the pair :-(
service::group0_batch mc(std::move(group0_guard));
mc.add_mutations(std::move(schema_mutations));
if (client_state.user()) {
auto resource = auth::make_data_resource(schema->ks_name(), schema->cf_name());
co_await auth::grant_applicable_permissions(
*client_state.get_auth_service(), *client_state.user(), resource, mc);
for (const schema_builder& view_builder : view_builders) {
resource = auth::make_data_resource(view_builder.ks_name(), view_builder.cf_name());
co_await auth::grant_applicable_permissions(
*client_state.get_auth_service(), *client_state.user(), resource, mc);
}
}
std::tie(schema_mutations, group0_guard) = co_await std::move(mc).extract();
try {
co_await _mm.announce(std::move(schema_mutations), std::move(group0_guard), fmt::format("alternator-executor: create {} table", table_name));
break;
} catch (const service::group0_concurrent_modification& ex) {
elogger.info("Failed to execute CreateTable {} due to concurrent schema modifications. {}.",
table_name, retries ? "Retrying" : "Number of retries exceeded, giving up");
if (retries--) {
continue;
}
throw;
}
}
co_await _mm.wait_for_schema_agreement(_proxy.local_db(), db::timeout_clock::now() + 10s, nullptr);
rjson::value status = rjson::empty_object();
executor::supplement_table_info(request, *schema, _proxy);
rjson::add(status, "TableDescription", std::move(request));
co_return rjson::print(std::move(status));
}
future<executor::request_return_type> executor::create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.create_table++;
elogger.trace("Creating table {}", request);
// Note: audit_info is captured by reference into the invoke_on() lambda and written on shard 0.
// This is safe because co_await keeps the caller's coroutine frame (and audit_info) alive for
// the entire duration of invoke_on(). Only the unique_ptr itself is read/written cross-shard —
// no concurrent access occurs since the caller is suspended during invoke_on().
co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
(service::migration_manager& mm) mutable -> future<executor::request_return_type> {
const db::tablets_mode_t::mode tablets_mode = _proxy.data_dictionary().get_config().tablets_mode_for_new_keyspaces(); // type cast
// `invoke_on` hopped us to shard 0, but `this` points to `executor` is from 'old' shard, we need to hop it too.
co_return co_await e.local().create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), enforce_authorization, warn_authorization, std::move(tablets_mode), audit_info);
});
}
// When UpdateTable adds a GSI, the type of its key columns must be specified
// in a AttributeDefinitions. If one of these key columns are *already* key
// columns of the base table or any of its prior GSIs or LSIs, the type
// given in AttributeDefinitions must match the type of the existing key -
// otherwise Alternator will not know which type to enforce in new writes.
// Also, if the table already has vector indexes, their key attributes cannot
// be redefined in AttributeDefinitions with a non-vector type.
// This function checks for such conflicts. It assumes that the structure of
// the given attribute_definitions was already validated (with
// validate_attribute_definitions()).
// This function should be called multiple times - once for the base schema
// and once for each of its views (existing GSIs and LSIs on this table).
static void check_attribute_definitions_conflicts(const rjson::value& attribute_definitions, const schema& schema) {
for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) {
const rjson::value& attribute_info = *it;
std::string_view attribute_name = rjson::to_string_view(attribute_info["AttributeName"]);
for (auto& def : schema.primary_key_columns()) {
if (attribute_name == def.name_as_text()) {
auto def_type = type_to_string(def.type);
std::string_view type = rjson::to_string_view(attribute_info["AttributeType"]);
if (type != def_type) {
throw api_error::validation(fmt::format("AttributeDefinitions redefined {} to {} already a key attribute of type {} in this table", def.name_as_text(), type, def_type));
}
break;
}
}
// Additionally, if we have a vector index, its key attribute is
// required to have a vector type, and cannot be listed in
// AttributeDefinitions with a non-vector key type.
if (has_vector_index_on_attribute(schema, attribute_name)) {
throw api_error::validation(fmt::format("AttributeDefinitions redefines {} but already a key of a vector index in this table", attribute_name));
}
}
}
future<executor::request_return_type> executor::update_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.update_table++;
elogger.trace("Updating table {}", request);
static const std::vector<sstring> unsupported = {
"ProvisionedThroughput",
"ReplicaUpdates",
"SSESpecification",
};
for (auto& s : unsupported) {
if (rjson::find(request, s)) {
co_await coroutine::return_exception(api_error::validation(s + " not supported"));
}
}
bool empty_request = true;
if (rjson::find(request, "BillingMode")) {
empty_request = false;
verify_billing_mode(request);
}
// Note: audit_info is captured by reference into the invoke_on() lambda and written on shard 0.
// This is safe because co_await keeps the caller's coroutine frame (and audit_info) alive for
// the entire duration of invoke_on(). Only the unique_ptr itself is read/written cross-shard —
// no concurrent access occurs since the caller is suspended during invoke_on().
co_return co_await _mm.container().invoke_on(0, [&p = _proxy.container(), request = std::move(request), gt = tracing::global_trace_state_ptr(std::move(trace_state)), enforce_authorization = bool(_enforce_authorization),
warn_authorization = bool(_warn_authorization), client_state_other_shard = client_state.move_to_other_shard(), empty_request, &e = this->container(), &audit_info]
(service::migration_manager& mm) mutable -> future<executor::request_return_type> {
schema_ptr schema;
size_t retries = mm.get_concurrent_ddl_retries();
for (;;) {
auto group0_guard = co_await mm.start_group0_operation();
schema_ptr tab = get_table(p.local(), request);
e.local().maybe_audit(audit_info, audit::statement_category::DDL, tab->ks_name(), tab->cf_name(), "UpdateTable", request);
tracing::add_alternator_table_name(gt, tab->cf_name());
// the ugly but harmless conversion to string_view here is because
// Seastar's sstring is missing a find(std::string_view) :-()
if (std::string_view(tab->cf_name()).find(INTERNAL_TABLE_PREFIX) == 0) {
co_await coroutine::return_exception(api_error::validation(fmt::format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
}
schema_builder builder(tab);
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
if (stream_specification && stream_specification->IsObject()) {
empty_request = false;
if (add_stream_options(*stream_specification, builder, p.local())) {
validate_cdc_log_name_length(builder.cf_name());
}
// Alternator Streams doesn't yet work when the table uses tablets (#23838)
auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
if (stream_enabled && stream_enabled->IsBool()) {
if (stream_enabled->GetBool()) {
if (p.local().local_db().find_keyspace(tab->ks_name()).get_replication_strategy().uses_tablets()) {
co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
"If you want to enable streams, re-create this table with vnodes (with the tag 'system:initial_tablets' set to 'none').");
}
if (tab->cdc_options().enabled()) {
co_return api_error::validation("Table already has an enabled stream: TableName: " + tab->cf_name());
}
}
else if (!tab->cdc_options().enabled()) {
co_return api_error::validation("Table has no stream to disable: TableName: " + tab->cf_name());
}
}
}
// Support VectorIndexUpdates to add or delete a vector index,
// similar to GlobalSecondaryIndexUpdates above. We handle this
// before builder.build() so we can use builder directly.
rjson::value* vector_index_updates = rjson::find(request, "VectorIndexUpdates");
if (vector_index_updates) {
if (!vector_index_updates->IsArray()) {
co_return api_error::validation("VectorIndexUpdates must be an array");
}
if (vector_index_updates->Size() > 1) {
// VectorIndexUpdates mirrors GlobalSecondaryIndexUpdates.
// Since DynamoDB artifically limits the latter to just a
// single operation (one Create or one Delete), we also
// place the same artificial limit on VectorIndexUpdates,
// and throw the same LimitExceeded error if the client
// tries to pass more than one operation.
co_return api_error::limit_exceeded("VectorIndexUpdates only allows one index creation or deletion");
}
}
if (vector_index_updates && vector_index_updates->Size() == 1) {
empty_request = false;
if (!(*vector_index_updates)[0].IsObject() || (*vector_index_updates)[0].MemberCount() != 1) {
co_return api_error::validation("VectorIndexUpdates array must contain one object with a Create or Delete operation");
}
auto it = (*vector_index_updates)[0].MemberBegin();
const std::string_view op = rjson::to_string_view(it->name);
if (!it->value.IsObject()) {
co_return api_error::validation("VectorIndexUpdates entries must be objects");
}
const rjson::value* index_name_v = rjson::find(it->value, "IndexName");
if (!index_name_v || !index_name_v->IsString()) {
co_return api_error::validation("VectorIndexUpdates operation must have IndexName");
}
sstring index_name = rjson::to_sstring(*index_name_v);
if (op == "Create") {
if (!p.local().local_db().find_keyspace(tab->ks_name()).get_replication_strategy().uses_tablets()) {
co_return api_error::validation("Vector indexes are not supported on tables using vnodes.");
}
validate_table_name(index_name, "VectorIndexUpdates IndexName");
// Check for duplicate index name against existing vector indexes, GSIs and LSIs.
if (tab->has_index(index_name)) {
// Alternator only uses a secondary index for vector
// search (GSI and LSI are implemented as materialized
// views, not secondary indexes), so the error message
// can refer to a "Vector index".
co_return api_error::validation(fmt::format(
"Vector index {} already exists in table {}", index_name, tab->cf_name()));
}
if (p.local().data_dictionary().has_schema(tab->ks_name(), gsi_name(tab->cf_name(), index_name, false)) ||
p.local().data_dictionary().has_schema(tab->ks_name(), lsi_name(tab->cf_name(), index_name, false))) {
co_return api_error::validation(fmt::format(
"GSI or LSI {} already exists in table {}, cannot reuse the name for a vector index", index_name, tab->cf_name()));
}
const rjson::value* vector_attribute_v = rjson::find(it->value, "VectorAttribute");
if (!vector_attribute_v || !vector_attribute_v->IsObject()) {
co_return api_error::validation("VectorIndexUpdates Create VectorAttribute must be an object.");
}
const rjson::value* attribute_name_v = rjson::find(*vector_attribute_v, "AttributeName");
if (!attribute_name_v || !attribute_name_v->IsString()) {
co_return api_error::validation("VectorIndexUpdates Create AttributeName must be a string.");
}
std::string_view attribute_name = rjson::to_string_view(*attribute_name_v);
validate_attr_name_length("VectorIndexUpdates", attribute_name.size(), /*is_key=*/false, "AttributeName ");
// attribute_name must not be a key column of the base
// table or any of its GSIs or LSIs, because those have
// mandatory types (defined in AttributeDefinitions) which
// will never be a vector.
for (const column_definition& cdef : tab->primary_key_columns()) {
if (cdef.name_as_text() == attribute_name) {
co_return api_error::validation(fmt::format(
"VectorIndexUpdates AttributeName '{}' is a key column and cannot be used as a vector index target.", attribute_name));
}
}
for (const auto& view : p.local().data_dictionary().find_column_family(tab).views()) {
for (const column_definition& cdef : view->primary_key_columns()) {
if (cdef.name_as_text() == attribute_name) {
co_return api_error::validation(fmt::format(
"VectorIndexUpdates AttributeName '{}' is a key column of a GSI or LSI and cannot be used as a vector index target.", attribute_name));
}
}
}
// attribute_name must not already be the target of an
// existing vector index.
if (has_vector_index_on_attribute(*tab, attribute_name)) {
co_return api_error::validation(fmt::format(
"VectorIndexUpdates AttributeName '{}' is already the target of an existing vector index.", attribute_name));
}
int dimensions = get_dimensions(*vector_attribute_v, "VectorIndexUpdates");
// The optional Projection parameter is only supported with
// ProjectionType=KEYS_ONLY. Other values are not yet supported.
const rjson::value* projection_v = rjson::find(it->value, "Projection");
if (projection_v) {
if (!projection_v->IsObject()) {
co_return api_error::validation("VectorIndexUpdates Projection must be an object.");
}
const rjson::value* projection_type_v = rjson::find(*projection_v, "ProjectionType");
if (!projection_type_v || !projection_type_v->IsString() ||
rjson::to_string_view(*projection_type_v) != "KEYS_ONLY") {
co_return api_error::validation("VectorIndexUpdates Projection: only ProjectionType=KEYS_ONLY is currently supported.");
}
}
// A vector index will use CDC on this table, so the CDC
// log table name will need to fit our length limits
validate_cdc_log_name_length(builder.cf_name());
index_options_map index_options;
index_options[db::index::secondary_index::custom_class_option_name] = "vector_index";
index_options[cql3::statements::index_target::target_option_name] = sstring(attribute_name);
index_options["dimensions"] = std::to_string(dimensions);
builder.with_index(index_metadata{index_name, index_options,
index_metadata_kind::custom, index_metadata::is_local_index(false)});
} else if (op == "Delete") {
if (!tab->has_index(index_name)) {
co_return api_error::resource_not_found(fmt::format(
"No vector index {} in table {}", index_name, tab->cf_name()));
}
builder.without_index(index_name);
} else {
// Update operation not yet supported, as we don't yet
// have any updatable properties of vector indexes.
co_return api_error::validation(fmt::format(
"VectorIndexUpdates supports a Create or Delete operation, saw '{}'", op));
}
}
schema = builder.build();
std::vector<view_ptr> new_views;
std::vector<std::string> dropped_views;
rjson::value* gsi_updates = rjson::find(request, "GlobalSecondaryIndexUpdates");
if (gsi_updates) {
if (!gsi_updates->IsArray()) {
co_return api_error::validation("GlobalSecondaryIndexUpdates must be an array");
}
if (gsi_updates->Size() > 1) {
// Although UpdateTable takes an array of operations and could
// support multiple Create and/or Delete operations in one
// command, DynamoDB doesn't actually allows this, and throws
// a LimitExceededException if this is attempted.
co_return api_error::limit_exceeded("GlobalSecondaryIndexUpdates only allows one index creation or deletion");
}
if (vector_index_updates && vector_index_updates->IsArray() &&
vector_index_updates->Size() && gsi_updates->Size()) {
co_return api_error::limit_exceeded("UpdateTable cannot have both VectorIndexUpdates and GlobalSecondaryIndexUpdates in the same request");
}
if (gsi_updates->Size() == 1) {
empty_request = false;
if (!(*gsi_updates)[0].IsObject() || (*gsi_updates)[0].MemberCount() != 1) {
co_return api_error::validation("GlobalSecondaryIndexUpdates array must contain one object with a Create, Delete or Update operation");
}
auto it = (*gsi_updates)[0].MemberBegin();
const std::string_view op = rjson::to_string_view(it->name);
if (!it->value.IsObject()) {
co_return api_error::validation("GlobalSecondaryIndexUpdates entries must be objects");
}
const rjson::value* index_name_v = rjson::find(it->value, "IndexName");
if (!index_name_v || !index_name_v->IsString()) {
co_return api_error::validation("GlobalSecondaryIndexUpdates operation must have IndexName");
}
std::string_view index_name = rjson::to_string_view(*index_name_v);
std::string_view table_name = schema->cf_name();
std::string_view keyspace_name = schema->ks_name();
std::string vname(view_name(table_name, index_name));
if (op == "Create") {
const rjson::value* attribute_definitions = rjson::find(request, "AttributeDefinitions");
if (!attribute_definitions) {
co_return api_error::validation("GlobalSecondaryIndexUpdates Create needs AttributeDefinitions");
}
std::unordered_set<std::string> unused_attribute_definitions =
validate_attribute_definitions("GlobalSecondaryIndexUpdates", *attribute_definitions);
check_attribute_definitions_conflicts(*attribute_definitions, *schema);
for (auto& view : p.local().data_dictionary().find_column_family(tab).views()) {
check_attribute_definitions_conflicts(*attribute_definitions, *view);
}
if (p.local().data_dictionary().has_schema(keyspace_name, vname)) {
// Surprisingly, DynamoDB uses validation error here, not resource_in_use
co_return api_error::validation(fmt::format(
"GSI {} already exists in table {}", index_name, table_name));
}
if (p.local().data_dictionary().has_schema(keyspace_name, lsi_name(table_name, index_name, false))) {
co_return api_error::validation(fmt::format(
"LSI {} already exists in table {}, can't use same name for GSI", index_name, table_name));
}
if (tab->has_index(sstring(index_name))) {
co_return api_error::validation(fmt::format(
"Vector index {} already exists in table {}, cannot reuse the name for a GSI", index_name, table_name));
}
try {
locator::assert_rf_rack_valid_keyspace(keyspace_name, p.local().local_db().get_token_metadata_ptr(),
p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy());
} catch (const std::invalid_argument& ex) {
co_return api_error::validation(fmt::format("GlobalSecondaryIndexes on a table "
"using tablets require the number of racks in the cluster to be either 1 or 3"));
}
elogger.trace("Adding GSI {}", index_name);
// FIXME: read and handle "Projection" parameter. This will
// require the MV code to copy just parts of the attrs map.
schema_builder view_builder(keyspace_name, vname);
auto [view_hash_key, view_range_key] = parse_key_schema(it->value, "GlobalSecondaryIndexUpdates");
// If an attribute is already a real column in the base
// table (i.e., a key attribute in the base table),
// we can use it directly as a view key. Otherwise, we
// need to add it as a "computed column", which extracts
// and deserializes the attribute from the ":attrs" map.
bool view_hash_key_real_column =
schema->get_column_definition(to_bytes(view_hash_key));
add_column(view_builder, view_hash_key, *attribute_definitions, column_kind::partition_key, !view_hash_key_real_column);
unused_attribute_definitions.erase(view_hash_key);
if (!view_range_key.empty()) {
bool view_range_key_real_column =
schema->get_column_definition(to_bytes(view_range_key));
add_column(view_builder, view_range_key, *attribute_definitions, column_kind::clustering_key, !view_range_key_real_column);
if (!schema->get_column_definition(to_bytes(view_range_key)) &&
!schema->get_column_definition(to_bytes(view_hash_key))) {
// FIXME: This warning should go away. See issue #6714
elogger.warn("Only 1 regular column from the base table should be used in the GSI key in order to ensure correct liveness management without assumptions");
}
unused_attribute_definitions.erase(view_range_key);
}
// Surprisingly, although DynamoDB checks for unused
// AttributeDefinitions in CreateTable, it does not
// check it in UpdateTable. We decided to check anyway.
if (!unused_attribute_definitions.empty()) {
co_return api_error::validation(fmt::format(
"AttributeDefinitions defines spurious attributes not used by any KeySchema: {}",
unused_attribute_definitions));
}
// Base key columns which aren't part of the index's key need to
// be added to the view nonetheless, as (additional) clustering
// key(s).
for (auto& def : schema->primary_key_columns()) {
if (def.name_as_text() != view_hash_key && def.name_as_text() != view_range_key) {
view_builder.with_column(def.name(), def.type, column_kind::clustering_key);
}
}
// GSIs have no tags:
view_builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>());
// Note below we don't need to add virtual columns, as all
// base columns were copied to view. TODO: reconsider the need
// for virtual columns when we support Projection.
for (const column_definition& regular_cdef : schema->regular_columns()) {
if (!view_builder.has_column(*cql3::to_identifier(regular_cdef))) {
view_builder.with_column(regular_cdef.name(), regular_cdef.type, column_kind::regular_column);
}
}
const bool include_all_columns = true;
view_builder.with_view_info(schema, include_all_columns, ""/*where clause*/);
new_views.emplace_back(view_builder.build());
} else if (op == "Delete") {
elogger.trace("Deleting GSI {}", index_name);
if (!p.local().data_dictionary().has_schema(keyspace_name, vname)) {
co_return api_error::resource_not_found(fmt::format("No GSI {} in table {}", index_name, table_name));
}
dropped_views.emplace_back(vname);
} else if (op == "Update") {
co_return api_error::validation("GlobalSecondaryIndexUpdates Update not yet supported");
} else {
co_return api_error::validation(fmt::format("GlobalSecondaryIndexUpdates supports a Create, Delete or Update operation, saw '{}'", op));
}
}
}
if (empty_request) {
co_return api_error::validation("UpdateTable requires one of GlobalSecondaryIndexUpdates, VectorIndexUpdates, StreamSpecification or BillingMode to be specified");
}
co_await verify_permission(enforce_authorization, warn_authorization, client_state_other_shard.get(), schema, auth::permission::ALTER, e.local()._stats);
auto m = co_await service::prepare_column_family_update_announcement(p.local(), schema, std::vector<view_ptr>(), group0_guard.write_timestamp());
for (view_ptr view : new_views) {
auto m2 = co_await service::prepare_new_view_announcement(p.local(), view, group0_guard.write_timestamp());
std::move(m2.begin(), m2.end(), std::back_inserter(m));
}
for (const std::string& view_name : dropped_views) {
auto m2 = co_await service::prepare_view_drop_announcement(p.local(), schema->ks_name(), view_name, group0_guard.write_timestamp());
std::move(m2.begin(), m2.end(), std::back_inserter(m));
}
// If a role is allowed to create a GSI, we should give it permissions
// to read the GSI it just created. This is known as "auto-grant".
// Also, when we delete a GSI we should revoke any permissions set on
// it - so if it's ever created again the old permissions wouldn't be
// remembered for the new GSI. This is known as "auto-revoke"
if (client_state_other_shard.get().user() && (!new_views.empty() || !dropped_views.empty())) {
service::group0_batch mc(std::move(group0_guard));
mc.add_mutations(std::move(m));
for (view_ptr view : new_views) {
auto resource = auth::make_data_resource(view->ks_name(), view->cf_name());
co_await auth::grant_applicable_permissions(
*client_state_other_shard.get().get_auth_service(), *client_state_other_shard.get().user(), resource, mc);
}
for (const auto& view_name : dropped_views) {
auto resource = auth::make_data_resource(schema->ks_name(), view_name);
co_await auth::revoke_all(*client_state_other_shard.get().get_auth_service(), resource, mc);
}
std::tie(m, group0_guard) = co_await std::move(mc).extract();
}
try {
co_await mm.announce(std::move(m), std::move(group0_guard), format("alternator-executor: update {} table", tab->cf_name()));
break;
} catch (const service::group0_concurrent_modification& ex) {
elogger.info("Failed to execute UpdateTable {} due to concurrent schema modifications. {}.",
tab->cf_name(), retries ? "Retrying" : "Number of retries exceeded, giving up");
if (retries--) {
continue;
}
throw;
}
}
co_await mm.wait_for_schema_agreement(p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
rjson::value status = rjson::empty_object();
supplement_table_info(request, *schema, p.local());
rjson::add(status, "TableDescription", std::move(request));
co_return rjson::print(std::move(status));
});
}
// attribute_collector is a helper class used to accept several attribute
// puts or deletes, and collect them as single collection mutation.
// The implementation is somewhat complicated by the need of cells in a
// collection to be sorted by key order.
class attribute_collector {
std::map<bytes, atomic_cell, serialized_compare> collected;
void add(bytes&& name, atomic_cell&& cell) {
collected.emplace(std::move(name), std::move(cell));
}
void add(const bytes& name, atomic_cell&& cell) {
collected.emplace(name, std::move(cell));
}
public:
attribute_collector() : collected(attrs_type()->get_keys_type()->as_less_comparator()) { }
void put(bytes&& name, const bytes& val, api::timestamp_type ts) {
add(std::move(name), atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
}
void put(const bytes& name, const bytes& val, api::timestamp_type ts) {
add(name, atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
}
void del(bytes&& name, api::timestamp_type ts) {
add(std::move(name), atomic_cell::make_dead(ts, gc_clock::now()));
}
void del(const bytes& name, api::timestamp_type ts) {
add(name, atomic_cell::make_dead(ts, gc_clock::now()));
}
collection_mutation_description to_mut() {
collection_mutation_description ret;
for (auto&& e : collected) {
ret.cells.emplace_back(e.first, std::move(e.second));
}
return ret;
}
bool empty() const {
return collected.empty();
}
};
// After calling pk_from_json() and ck_from_json() to extract the pk and ck
// components of a key, and if that succeeded, call check_key() to further
// check that the key doesn't have any spurious components.
void check_key(const rjson::value& key, const schema_ptr& schema) {
if (key.MemberCount() != (schema->clustering_key_size() == 0 ? 1 : 2)) {
throw api_error::validation("Given key attribute not in schema");
}
}
// Verify that a value parsed from the user input is legal. In particular,
// we check that the value is not an empty set, string or bytes - which is
// (somewhat artificially) forbidden by DynamoDB.
void validate_value(const rjson::value& v, const char* caller) {
if (!v.IsObject() || v.MemberCount() != 1) {
throw api_error::validation(format("{}: improperly formatted value '{}'", caller, v));
}
auto it = v.MemberBegin();
const std::string_view type = rjson::to_string_view(it->name);
if (type == "SS" || type == "BS" || type == "NS") {
if (!it->value.IsArray()) {
throw api_error::validation(format("{}: improperly formatted set '{}'", caller, v));
}
if (it->value.Size() == 0) {
throw api_error::validation(format("{}: empty set not allowed", caller));
}
} else if (type == "S" || type == "B") {
if (!it->value.IsString()) {
throw api_error::validation(format("{}: improperly formatted value '{}'", caller, v));
}
} else if (type == "N") {
if (!it->value.IsString()) {
// DynamoDB uses a SerializationException in this case, not ValidationException.
throw api_error::serialization(format("{}: number value must be encoded as string '{}'", caller, v));
}
} else if (type != "L" && type != "M" && type != "BOOL" && type != "NULL") {
// TODO: can do more sanity checks on the content of the above types.
throw api_error::validation(fmt::format("{}: unknown type {} for value {}", caller, type, v));
}
}
// The put_or_delete_item class builds the mutations needed by the PutItem and
// DeleteItem operations - either as stand-alone commands or part of a list
// of commands in BatchWriteItem.
// put_or_delete_item splits each operation into two stages: Constructing the
// object parses and validates the user input (throwing exceptions if there
// are input errors). Later, build() generates the actual mutation, with a
// specified timestamp. This split is needed because of the peculiar needs of
// BatchWriteItem and LWT. BatchWriteItem needs all parsing to happen before
// any writing happens (if one of the commands has an error, none of the
// writes should be done). LWT makes it impossible for the parse step to
// generate "mutation" objects, because the timestamp still isn't known.
class put_or_delete_item {
private:
partition_key _pk;
clustering_key _ck;
struct cell {
bytes column_name;
bytes value;
};
// PutItem: engaged _cells, write these cells to item (_pk, _ck).
// DeleteItem: disengaged _cells, delete the entire item (_pk, _ck).
std::optional<std::vector<cell>> _cells;
// WCU calculation takes into account some length in bytes,
// that length can have different meaning depends on the operation but the
// the calculation of length in bytes to WCU is the same.
uint64_t _length_in_bytes = 0;
public:
struct delete_item {};
struct put_item {};
put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item);
put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes);
// put_or_delete_item doesn't keep a reference to schema (so it can be
// moved between shards for LWT) so it needs to be given again to build():
mutation build(schema_ptr schema, api::timestamp_type ts) const;
const partition_key& pk() const { return _pk; }
const clustering_key& ck() const { return _ck; }
uint64_t length_in_bytes() const noexcept {
return _length_in_bytes;
}
void set_length_in_bytes(uint64_t length) noexcept {
_length_in_bytes = length;
}
bool is_put_item() noexcept {
return _cells.has_value();
}
};
put_or_delete_item::put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item)
: _pk(pk_from_json(key, schema)), _ck(ck_from_json(key, schema)) {
check_key(key, schema);
}
// find_attribute() checks whether the named attribute is stored in the
// schema as a real column (we do this for key attribute, and for a GSI key)
// and if so, returns that column. If not, the function returns nullptr,
// telling the caller that the attribute is stored serialized in the
// ATTRS_COLUMN_NAME map - not in a stand-alone column in the schema.
static inline const column_definition* find_attribute(const schema& schema, const bytes& attribute_name) {
const column_definition* cdef = schema.get_column_definition(attribute_name);
// Although ATTRS_COLUMN_NAME exists as an actual column, when used as an
// attribute name it should refer to an attribute inside ATTRS_COLUMN_NAME
// not to ATTRS_COLUMN_NAME itself. This if() is needed for #5009.
if (cdef && cdef->name() == executor::ATTRS_COLUMN_NAME) {
return nullptr;
}
return cdef;
}
// Get a list of all attributes that serve as a key attributes for any of the
// GSIs or LSIs of this table, and the declared type for each (can be only
// "S", "B", or "N"). The implementation below will also list the base table's
// key columns (they are the views' clustering keys).
std::unordered_map<bytes, std::string> si_key_attributes(data_dictionary::table t) {
std::unordered_map<bytes, std::string> ret;
for (const view_ptr& v : t.views()) {
for (const column_definition& cdef : v->partition_key_columns()) {
ret[cdef.name()] = type_to_string(cdef.type);
}
for (const column_definition& cdef : v->clustering_key_columns()) {
ret[cdef.name()] = type_to_string(cdef.type);
}
}
return ret;
}
// Get a map from attribute name (bytes) to dimensions (int) for all vector
// indexes defined on this table's schema. Used to validate written values
// against the vector index constraints.
static std::unordered_map<bytes, int> vector_index_attributes(const schema& s) {
std::unordered_map<bytes, int> ret;
for (const index_metadata& im : s.indices()) {
const auto& opts = im.options();
auto class_it = opts.find(db::index::secondary_index::custom_class_option_name);
if (class_it == opts.end() || class_it->second != "vector_index") {
continue;
}
auto target_it = opts.find(cql3::statements::index_target::target_option_name);
auto dims_it = opts.find("dimensions");
if (target_it == opts.end() || dims_it == opts.end()) {
continue;
}
try {
ret[to_bytes(target_it->second)] = std::stoi(dims_it->second);
} catch (...) {}
}
return ret;
}
// When an attribute is a key (hash or sort) of one of the GSIs or LSIs on a
// table, DynamoDB refuses an update to that attribute with an unsuitable
// value. Unsuitable values are:
// 1. An empty string (those are normally allowed as values, but not allowed
// as keys, including GSI keys).
// 2. A value with a type different than that declared for the GSI key.
// Normally non-key attributes can take values of any type (DynamoDB is
// schema-less), but as soon as an attribute is used as a GSI key, it
// must be set only to the specific type declared for that key.
// (Note that a missing value for an GSI key attribute is fine - the update
// will happen on the base table, but won't reach the view table. In this
// case, this function simply won't be called for this attribute.)
//
// This function checks if the given attribute update is an update to some
// GSI's key, and if the value is unsuitable, an api_error::validation is
// thrown. The checking here is similar to the checking done in
// get_key_from_typed_value() for the base table's key columns.
//
// validate_value_if_index_key() should only be called after validate_value()
// already validated that the value itself has a valid form.
static void validate_value_if_index_key(
std::unordered_map<bytes, std::string> key_attributes,
const bytes& attribute,
const rjson::value& value) {
auto it = key_attributes.find(attribute);
if (it == key_attributes.end()) {
// Given attribute is not a key column with a fixed type, so no
// more validation to do.
return;
}
const std::string& expected_type = it->second;
// We assume that validate_value() was previously called on this value,
// so value is known to be of the proper format (an object with one
// member, whose key and value are strings)
std::string_view value_type = rjson::to_string_view(value.MemberBegin()->name);
if (expected_type != value_type) {
throw api_error::validation(fmt::format(
"Type mismatch: expected type {} for GSI or LSI key attribute {}, got type {}",
expected_type, to_string_view(attribute), value_type));
}
std::string_view value_content = rjson::to_string_view(value.MemberBegin()->value);
if (value_content.empty()) {
throw api_error::validation(fmt::format(
"GSI or LSI key attribute {} cannot be set to an empty string", to_string_view(attribute)));
}
}
// When an attribute is the target of a vector index on the table, a write
// to that attribute is rejected unless the value is a DynamoDB list (type
// "L") of exactly the declared number of numeric (type "N") elements, where
// each number can be represented as a 32-bit float.
//
// validate_value_if_vector_index_attribute() should only be called after
// validate_value() already confirmed the value has a valid DynamoDB form.
static void validate_value_if_vector_index_attribute(
const std::unordered_map<bytes, int>& vector_attrs,
const bytes& attribute,
const rjson::value& value) {
auto it = vector_attrs.find(attribute);
if (it == vector_attrs.end()) {
return;
}
int dimensions = it->second;
std::string_view attr_name = to_string_view(attribute);
// value is a DynamoDB typed value: an object with one member whose key
// is the type tag. validate_value() already checked the overall shape.
std::string_view value_type = rjson::to_string_view(value.MemberBegin()->name);
if (value_type != "L") {
throw api_error::validation(fmt::format(
"Vector index attribute '{}' must be a list of {} numbers, got type {}",
attr_name, dimensions, value_type));
}
const rjson::value& list = value.MemberBegin()->value;
if (!list.IsArray() || (int)list.Size() != dimensions) {
throw api_error::validation(fmt::format(
"Vector index attribute '{}' must be a list of exactly {} numbers, got {} elements",
attr_name, dimensions, list.IsArray() ? (int)list.Size() : -1));
}
for (const rjson::value& elem : list.GetArray()) {
if (!elem.IsObject() || elem.MemberCount() != 1 ||
rjson::to_string_view(elem.MemberBegin()->name) != "N") {
throw api_error::validation(fmt::format(
"Vector index attribute '{}' must contain only numbers", attr_name));
}
std::string_view num_str = rjson::to_string_view(elem.MemberBegin()->value);
float f;
auto [ptr, ec] = std::from_chars(num_str.data(), num_str.data() + num_str.size(), f);
if (ec != std::errc{} || ptr != num_str.data() + num_str.size() || !std::isfinite(f)) {
throw api_error::validation(fmt::format(
"Vector index attribute '{}' element '{}' cannot be represented as a 32-bit float",
attr_name, num_str));
}
}
}
put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes)
: _pk(pk_from_json(item, schema)), _ck(ck_from_json(item, schema)) {
_cells = std::vector<cell>();
_cells->reserve(item.MemberCount());
auto vec_attrs = vector_index_attributes(*schema);
for (auto it = item.MemberBegin(); it != item.MemberEnd(); ++it) {
bytes column_name = to_bytes(rjson::to_string_view(it->name));
validate_value(it->value, "PutItem");
const column_definition* cdef = find_attribute(*schema, column_name);
validate_attr_name_length("", column_name.size(), cdef && cdef->is_primary_key());
_length_in_bytes += column_name.size();
if (!cdef) {
// This attribute may be a key column of one of the GSI or LSI,
// in which case there are some limitations on the value.
if (!key_attributes.empty()) {
validate_value_if_index_key(key_attributes, column_name, it->value);
}
// This attribute may also be a vector index target column,
// in which case it must be a list of the right number of floats.
if (!vec_attrs.empty()) {
validate_value_if_vector_index_attribute(vec_attrs, column_name, it->value);
}
bytes value = serialize_item(it->value);
if (value.size()) {
// ScyllaDB uses one extra byte compared to DynamoDB for the bytes length
_length_in_bytes += value.size() - 1;
}
_cells->push_back({std::move(column_name), serialize_item(it->value)});
} else if (!cdef->is_primary_key()) {
// Fixed-type regular columns were used for LSIs and also (in the
// slightly more distant past) GSIs, before they were moved to
// :attrs. We keep this branch for backward compatibility.
bytes value = get_key_from_typed_value(it->value, *cdef);
_cells->push_back({std::move(column_name), value});
if (value.size()) {
// ScyllaDB uses one extra byte compared to DynamoDB for the bytes length
_length_in_bytes += value.size() - 1;
}
}
}
if (_pk.representation().size() > 2) {
// ScyllaDB uses two extra bytes compared to DynamoDB for the key bytes length
_length_in_bytes += _pk.representation().size() - 2;
}
if (_ck.representation().size() > 2) {
// ScyllaDB uses two extra bytes compared to DynamoDB for the key bytes length
_length_in_bytes += _ck.representation().size() - 2;
}
}
mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) const {
mutation m(schema, _pk);
// If there's no clustering key, a tombstone should be created directly
// on a partition, not on a clustering row - otherwise it will look like
// an open-ended range tombstone, which will crash on KA/LA sstable format.
// Ref: #6035
const bool use_partition_tombstone = schema->clustering_key_size() == 0;
if (!_cells) {
if (use_partition_tombstone) {
m.partition().apply(tombstone(ts, gc_clock::now()));
} else {
// a DeleteItem operation:
m.partition().clustered_row(*schema, _ck).apply(tombstone(ts, gc_clock::now()));
}
return m;
}
// else, a PutItem operation:
auto& row = m.partition().clustered_row(*schema, _ck);
attribute_collector attrs_collector;
for (auto& c : *_cells) {
const column_definition* cdef = find_attribute(*schema, c.column_name);
if (!cdef) {
attrs_collector.put(c.column_name, c.value, ts);
} else {
row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, std::move(c.value)));
}
}
auto attrs = attrs_column(*schema);
if (!attrs_collector.empty()) {
auto serialized_map = attrs_collector.to_mut().serialize(*attrs_type());
row.cells().apply(attrs, std::move(serialized_map));
}
// To allow creation of an item with no attributes, we need a row marker.
row.apply(row_marker(ts));
// PutItem is supposed to completely replace the old item, so we need to
// also have a tombstone removing old cells. Important points:
// 1) Alternator's schema is dynamic, therefore we store data in a map
// in column :attrs. Since we're replacing a row, invalidating only
// :attrs is enough. Alternator base tables also had columns for LSI
// keys and GSI keys. New tables no longer have such columns, but old
// tables created in the past may still have them.
// 2) We use a collection tombstone for the :attrs column instead of a row
// tombstone. While a row tombstone would also replace the data, it has
// an undesirable side effect for CDC, which would report it as a
// separate deletion event. To model PutItem's "replace" semantic, we
// leverage a corner case: a collection tombstone at ts-1 paired with an
// upsert at ts is not reported by CDC as a separate REMOVE event. We
// can't use the timestamp ts, because when data and tombstone tie on
// timestamp, the tombstone wins. These tricks were introduced in
// Scylla to handle collection replacements in CQL (see #6084, PR #6491,
// e.g. cql3::maps::setter::execute()) and we utilize it to avoid
// emitting the REMOVE event (resolving #6930).
row.cells().apply(attrs, collection_mutation_description{tombstone{ts - 1, gc_clock::now()}}.serialize(*attrs.type));
// Note that for old tables created with regular LSI and GSI key columns,
// we must also delete the regular columns that are not part of the new
// schema consisting of pk, ck, and :attrs.
for (const auto& cdef : schema->regular_columns()) {
if (cdef.name_as_text() != executor::ATTRS_COLUMN_NAME) {
row.cells().apply(cdef, atomic_cell::make_dead(ts - 1, gc_clock::now()));
}
}
return m;
}
// The DynamoDB API doesn't let the client control the server's timeout, so
// we have a global default_timeout() for Alternator requests. The value of
// s_default_timeout_ms is overwritten in alternator::controller::start_server()
// based on the "alternator_timeout_in_ms" configuration parameter.
thread_local utils::updateable_value<uint32_t> executor::s_default_timeout_in_ms{10'000};
db::timeout_clock::time_point executor::default_timeout() {
return db::timeout_clock::now() + std::chrono::milliseconds(s_default_timeout_in_ms);
}
static lw_shared_ptr<query::read_command> previous_item_read_command(service::storage_proxy& proxy,
schema_ptr schema,
const clustering_key& ck,
shared_ptr<cql3::selection::selection> selection) {
std::vector<query::clustering_range> bounds;
if (schema->clustering_key_size() == 0) {
bounds.push_back(query::clustering_range::make_open_ended_both_sides());
} else {
bounds.push_back(query::clustering_range::make_singular(ck));
}
// FIXME: We pretend to take a selection (all callers currently give us a
// wildcard selection...) but here we read the entire item anyway. We
// should take the column list from selection instead of building it here.
auto regular_columns =
schema->regular_columns() | std::views::transform(&column_definition::id)
| std::ranges::to<query::column_id_vector>();
auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
return ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice),
query::tombstone_limit(proxy.get_tombstone_limit()));
}
static dht::partition_range_vector to_partition_ranges(const schema& schema, const partition_key& pk) {
return dht::partition_range_vector{dht::partition_range(dht::decorate_key(schema, pk))};
}
static dht::partition_range_vector to_partition_ranges(const dht::decorated_key& pk) {
return dht::partition_range_vector{dht::partition_range(pk)};
}
// Parse the different options for the ReturnValues parameter. We parse all
// the known options, but only UpdateItem actually supports all of them. The
// other operations (DeleteItem and PutItem) will refuse some of them.
rmw_operation::returnvalues rmw_operation::parse_returnvalues(const rjson::value& request) {
const rjson::value* attribute_value = rjson::find(request, "ReturnValues");
if (!attribute_value) {
return rmw_operation::returnvalues::NONE;
}
if (!attribute_value->IsString()) {
throw api_error::validation(format("Expected string value for ReturnValues, got: {}", *attribute_value));
}
auto s = rjson::to_string_view(*attribute_value);
if (s == "NONE") {
return rmw_operation::returnvalues::NONE;
} else if (s == "ALL_OLD") {
return rmw_operation::returnvalues::ALL_OLD;
} else if (s == "UPDATED_OLD") {
return rmw_operation::returnvalues::UPDATED_OLD;
} else if (s == "ALL_NEW") {
return rmw_operation::returnvalues::ALL_NEW;
} else if (s == "UPDATED_NEW") {
return rmw_operation::returnvalues::UPDATED_NEW;
} else {
throw api_error::validation(fmt::format("Unrecognized value for ReturnValues: {}", s));
}
}
rmw_operation::returnvalues_on_condition_check_failure
rmw_operation::parse_returnvalues_on_condition_check_failure(const rjson::value& request) {
const rjson::value* attribute_value = rjson::find(request, "ReturnValuesOnConditionCheckFailure");
if (!attribute_value) {
return rmw_operation::returnvalues_on_condition_check_failure::NONE;
}
if (!attribute_value->IsString()) {
throw api_error::validation(format("Expected string value for ReturnValuesOnConditionCheckFailure, got: {}", *attribute_value));
}
auto s = rjson::to_string_view(*attribute_value);
if (s == "NONE") {
return rmw_operation::returnvalues_on_condition_check_failure::NONE;
} else if (s == "ALL_OLD") {
return rmw_operation::returnvalues_on_condition_check_failure::ALL_OLD;
} else {
throw api_error::validation(fmt::format("Unrecognized value for ReturnValuesOnConditionCheckFailure: {}", s));
}
}
rmw_operation::rmw_operation(service::storage_proxy& proxy, rjson::value&& request)
: _request(std::move(request))
, _schema(get_table_for_write(proxy, _request))
, _write_isolation(get_write_isolation_for_schema(_schema))
, _consumed_capacity(_request)
, _returnvalues(parse_returnvalues(_request))
, _returnvalues_on_condition_check_failure(parse_returnvalues_on_condition_check_failure(_request))
{
// _pk and _ck will be assigned later, by the subclass's constructor
// (each operation puts the key in a slightly different location in
// the request).
}
std::optional<mutation> rmw_operation::apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) {
if (qr->row_count()) {
auto selection = cql3::selection::selection::wildcard(_schema);
uint64_t item_length = 0;
auto previous_item = executor::describe_single_item(_schema, slice, *selection, *qr, {}, &item_length);
if (_consumed_capacity._total_bytes < item_length) {
_consumed_capacity._total_bytes = item_length;
}
if (previous_item) {
if (should_fill_preimage()) {
cdc_opts.preimage = make_lw_shared<cql3::untyped_result_set>(*_schema, std::move(qr), *selection, slice);
}
return apply(std::make_unique<rjson::value>(std::move(*previous_item)), ts, cdc_opts);
}
}
return apply(std::unique_ptr<rjson::value>(), ts, cdc_opts);
}
rmw_operation::write_isolation rmw_operation::get_write_isolation_for_schema(schema_ptr schema) {
const auto tags_ptr = db::get_tags_of_table(schema);
if (!tags_ptr) {
// Tags missing entirely from this table. This can't happen for a
// normal Alternator table, but can happen if get_table_for_write()
// allowed writing to a non-Alternator table (e.g., an internal table).
// If it is a system table, LWT will not work (and is also pointless
// for non-distributed tables), so use UNSAFE_RMW.
if(is_internal_keyspace(schema->ks_name())) {
return write_isolation::UNSAFE_RMW;
} else {
return default_write_isolation;
}
}
auto it = tags_ptr->find(WRITE_ISOLATION_TAG_KEY);
if (it == tags_ptr->end() || it->second.empty()) {
return default_write_isolation;
}
return parse_write_isolation(it->second);
}
// shard_for_execute() checks whether execute() must be called on a specific
// other shard. Running execute() on a specific shard is necessary only if it
// will use LWT (storage_proxy::cas()). This is because cas() can only be
// called on the specific shard owning (as per get_cas_shard()) _pk's token.
// Knowing if execute() will call cas() or not may depend on whether there is
// a read-before-write, but not just on it - depending on configuration,
// execute() may unconditionally use cas() for every write. Unfortunately,
// this requires duplicating here a bit of logic from execute().
// The returned cas_shard must be passed to execute() to ensure
// the tablet shard won't change. The caller must hold the returned object for
// the duration of execution, even if we were already on the right shard - so it doesn't move.
std::optional<service::cas_shard> rmw_operation::shard_for_execute(bool needs_read_before_write) {
if (_write_isolation == write_isolation::FORBID_RMW ||
(_write_isolation == write_isolation::LWT_RMW_ONLY && !needs_read_before_write) ||
_write_isolation == write_isolation::UNSAFE_RMW) {
return {};
}
// If we're still here, cas() *will* be called by execute(), so let's
// find the appropriate shard to run it on:
const auto token = dht::get_token(*_schema, _pk);
return service::cas_shard(*_schema, token);
}
// Build the return value from the different RMW operations (UpdateItem,
// PutItem, DeleteItem). All these return nothing by default, but can
// optionally return Attributes if requested via the ReturnValues option.
static executor::request_return_type rmw_operation_return(rjson::value&& attributes, const consumed_capacity_counter& consumed_capacity, uint64_t& metric) {
rjson::value ret = rjson::empty_object();
consumed_capacity.add_consumed_capacity_to_response_if_needed(ret);
metric += consumed_capacity.get_consumed_capacity_units();
if (!attributes.IsNull()) {
rjson::add(ret, "Attributes", std::move(attributes));
}
return rjson::print(std::move(ret));
}
static future<std::unique_ptr<rjson::value>> get_previous_item(
service::storage_proxy& proxy,
service::client_state& client_state,
schema_ptr schema,
const partition_key& pk,
const clustering_key& ck,
service_permit permit,
db::consistency_level cl,
uint64_t& item_length)
{
auto selection = cql3::selection::selection::wildcard(schema);
auto command = previous_item_read_command(proxy, schema, ck, selection);
command->allow_limit = db::allow_per_partition_rate_limit::yes;
return proxy.query(schema, command, to_partition_ranges(*schema, pk), cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state)).then(
[schema, command, selection = std::move(selection), &item_length] (service::storage_proxy::coordinator_query_result qr) {
auto previous_item = executor::describe_single_item(schema, command->slice, *selection, *qr.query_result, {}, &item_length);
if (previous_item) {
return make_ready_future<std::unique_ptr<rjson::value>>(std::make_unique<rjson::value>(std::move(*previous_item)));
} else {
return make_ready_future<std::unique_ptr<rjson::value>>();
}
});
}
static future<std::unique_ptr<rjson::value>> get_previous_item(
service::storage_proxy& proxy,
service::client_state& client_state,
schema_ptr schema,
const partition_key& pk,
const clustering_key& ck,
service_permit permit,
alternator::stats& global_stats,
alternator::stats& per_table_stats,
uint64_t& item_length)
{
global_stats.reads_before_write++;
per_table_stats.reads_before_write++;
return get_previous_item(proxy, client_state, schema, pk, ck, permit, db::consistency_level::LOCAL_QUORUM, item_length);
}
static future<uint64_t> get_previous_item_size(
service::storage_proxy& proxy,
service::client_state& client_state,
schema_ptr schema,
const partition_key& pk,
const clustering_key& ck,
service_permit permit) {
uint64_t item_length = 0;
// The use of get_previous_item here is for DynamoDB calculation compatibility mode,
// and the actual value is ignored. For performance reasons, we use CL_LOCAL_ONE.
co_await get_previous_item(proxy, client_state, schema, pk, ck, permit, db::consistency_level::LOCAL_ONE, item_length);
co_return item_length;
}
future<executor::request_return_type> rmw_operation::execute(service::storage_proxy& proxy,
std::optional<service::cas_shard> cas_shard,
service::client_state& client_state,
tracing::trace_state_ptr trace_state,
service_permit permit,
bool needs_read_before_write,
stats& global_stats,
stats& per_table_stats,
uint64_t& wcu_total) {
auto cdc_opts = cdc::per_request_options{
.alternator = true,
.alternator_streams_increased_compatibility = schema()->cdc_options().enabled() && proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
};
if (needs_read_before_write) {
if (_write_isolation == write_isolation::FORBID_RMW) {
throw api_error::validation("Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information.");
}
global_stats.reads_before_write++;
per_table_stats.reads_before_write++;
if (_write_isolation == write_isolation::UNSAFE_RMW) {
// This is the old, unsafe, read before write which does first
// a read, then a write. TODO: remove this mode entirely.
return get_previous_item(proxy, client_state, schema(), _pk, _ck, permit, global_stats, per_table_stats, _consumed_capacity._total_bytes).then(
[this, &proxy, &wcu_total, trace_state, permit = std::move(permit), cdc_opts = std::move(cdc_opts)] (std::unique_ptr<rjson::value> previous_item) mutable {
std::optional<mutation> m = apply(std::move(previous_item), api::new_timestamp(), cdc_opts);
if (!m) {
return make_ready_future<executor::request_return_type>(api_error::conditional_check_failed("The conditional request failed", std::move(_return_attributes)));
}
return proxy.mutate(utils::chunked_vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes, false, std::move(cdc_opts)).then([this,&wcu_total] () mutable {
return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
});
});
}
} else if (_write_isolation != write_isolation::LWT_ALWAYS) {
std::optional<mutation> m = apply(nullptr, api::new_timestamp(), cdc_opts);
throwing_assert(m); // !needs_read_before_write, so apply() did not check a condition
return proxy.mutate(utils::chunked_vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes, false, std::move(cdc_opts)).then([this, &wcu_total] () mutable {
return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
});
}
throwing_assert(cas_shard);
// If we're still here, we need to do this write using LWT:
global_stats.write_using_lwt++;
per_table_stats.write_using_lwt++;
auto timeout = executor::default_timeout();
auto selection = cql3::selection::selection::wildcard(schema());
auto read_command = needs_read_before_write ?
previous_item_read_command(proxy, schema(), _ck, selection) :
nullptr;
return proxy.cas(schema(), std::move(*cas_shard), *this, read_command, to_partition_ranges(*schema(), _pk),
{timeout, std::move(permit), client_state, trace_state},
db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM, timeout, timeout, true, std::move(cdc_opts)).then([this, read_command, &wcu_total] (bool is_applied) mutable {
if (!is_applied) {
return make_ready_future<executor::request_return_type>(api_error::conditional_check_failed("The conditional request failed", std::move(_return_attributes)));
}
return make_ready_future<executor::request_return_type>(rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total));
});
}
static parsed::condition_expression get_parsed_condition_expression(parsed::expression_cache& parsed_expression_cache, rjson::value& request) {
rjson::value* condition_expression = rjson::find(request, "ConditionExpression");
if (!condition_expression) {
// Returning an empty() condition_expression means no condition.
return parsed::condition_expression{};
}
if (!condition_expression->IsString()) {
throw api_error::validation("ConditionExpression must be a string");
}
if (condition_expression->GetStringLength() == 0) {
throw api_error::validation("ConditionExpression must not be empty");
}
try {
return parsed_expression_cache.parse_condition_expression(rjson::to_string_view(*condition_expression), "ConditionExpression");
} catch(expressions_syntax_error& e) {
throw api_error::validation(e.what());
}
}
static bool check_needs_read_before_write(const parsed::condition_expression& condition_expression) {
// Theoretically, a condition expression may exist but not refer to the
// item at all. But this is not a useful case and there is no point in
// optimizing for it.
return !condition_expression.empty();
}
// Fail the expression if it has unused attribute names or values. This is
// how DynamoDB behaves, so we do too.
void verify_all_are_used(const rjson::value* field,
const std::unordered_set<std::string>& used, const char* field_name, const char* operation) {
if (!field) {
return;
}
for (auto it = field->MemberBegin(); it != field->MemberEnd(); ++it) {
if (!used.contains(rjson::to_string(it->name))) {
throw api_error::validation(
format("{} has spurious '{}', not used in {}",
field_name, rjson::to_string_view(it->name), operation));
}
}
}
class put_item_operation : public rmw_operation {
private:
put_or_delete_item _mutation_builder;
public:
parsed::condition_expression _condition_expression;
put_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& request)
: rmw_operation(proxy, std::move(request))
, _mutation_builder(rjson::get(_request, "Item"), schema(), put_or_delete_item::put_item{},
si_key_attributes(proxy.data_dictionary().find_table(schema()->ks_name(), schema()->cf_name()))) {
_pk = _mutation_builder.pk();
_ck = _mutation_builder.ck();
if (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::ALL_OLD) {
throw api_error::validation(format("PutItem supports only NONE or ALL_OLD for ReturnValues"));
}
_condition_expression = get_parsed_condition_expression(parsed_expression_cache, _request);
const rjson::value* expression_attribute_names = rjson::find(_request, "ExpressionAttributeNames");
const rjson::value* expression_attribute_values = rjson::find(_request, "ExpressionAttributeValues");
if (!_condition_expression.empty()) {
std::unordered_set<std::string> used_attribute_names;
std::unordered_set<std::string> used_attribute_values;
resolve_condition_expression(_condition_expression,
expression_attribute_names, expression_attribute_values,
used_attribute_names, used_attribute_values);
verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "PutItem");
verify_all_are_used(expression_attribute_values, used_attribute_values,"ExpressionAttributeValues", "PutItem");
} else {
if (expression_attribute_names) {
throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
}
if (expression_attribute_values) {
throw api_error::validation("ExpressionAttributeValues cannot be used without ConditionExpression");
}
}
_consumed_capacity += _mutation_builder.length_in_bytes();
}
bool needs_read_before_write() const {
return _request.HasMember("Expected") ||
check_needs_read_before_write(_condition_expression) ||
_returnvalues == returnvalues::ALL_OLD;
}
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override {
if (!verify_expected(_request, previous_item.get()) ||
!verify_condition_expression(_condition_expression, previous_item.get())) {
if (previous_item && _returnvalues_on_condition_check_failure ==
returnvalues_on_condition_check_failure::ALL_OLD) {
_return_attributes = std::move(*previous_item);
}
// If the update is to be cancelled because of an unfulfilled Expected
// condition, return an empty optional mutation, which is more
// efficient than throwing an exception.
return {};
}
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
_return_attributes = std::move(*previous_item);
} else {
_return_attributes = {};
}
return _mutation_builder.build(_schema, ts);
}
virtual ~put_item_operation() = default;
};
future<executor::request_return_type> executor::put_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.put_item++;
auto start_time = std::chrono::steady_clock::now();
elogger.trace("put_item {}", request);
auto op = make_shared<put_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
if (!audit_info) {
// On LWT shard bounce, audit_info is already set on the originating shard.
// The reference captured in the bounce lambda points back to the original
// coroutine frame, which remains alive for the entire cross-shard call.
// Only reads of this pointer occur on the target shard — no writes or frees.
maybe_audit(audit_info, audit::statement_category::DML, op->schema()->ks_name(),
op->schema()->cf_name(), "PutItem", op->request(), db::consistency_level::LOCAL_QUORUM);
}
tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
const bool needs_read_before_write = op->needs_read_before_write();
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
auto cas_shard = op->shard_for_execute(needs_read_before_write);
if (cas_shard && !cas_shard->this_shard()) {
_stats.api_operations.put_item--; // uncount on this shard, will be counted in other shard
_stats.shard_bounce_for_lwt++;
co_return co_await container().invoke_on(cas_shard->shard(), _ssg,
[request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit), &audit_info]
(executor& e) mutable {
return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt), &audit_info]
(service::client_state& client_state) mutable {
//FIXME: Instead of passing empty_service_permit() to the background operation,
// the current permit's lifetime should be prolonged, so that it's destructed
// only after all background operations are finished as well.
return e.put_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request), audit_info);
});
});
}
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(op->schema()));
per_table_stats->api_operations.put_item++;
uint64_t wcu_total = 0;
auto res = co_await op->execute(_proxy, std::move(cas_shard), client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
per_table_stats->operation_sizes.put_item_op_size_kb.add(bytes_to_kb_ceil(op->consumed_capacity()._total_bytes));
per_table_stats->wcu_total[stats::wcu_types::PUT_ITEM] += wcu_total;
_stats.wcu_total[stats::wcu_types::PUT_ITEM] += wcu_total;
per_table_stats->api_operations.put_item_latency.mark(std::chrono::steady_clock::now() - start_time);
_stats.api_operations.put_item_latency.mark(std::chrono::steady_clock::now() - start_time);
co_return res;
}
class delete_item_operation : public rmw_operation {
private:
put_or_delete_item _mutation_builder;
public:
parsed::condition_expression _condition_expression;
delete_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& request)
: rmw_operation(proxy, std::move(request))
, _mutation_builder(rjson::get(_request, "Key"), schema(), put_or_delete_item::delete_item{}) {
_pk = _mutation_builder.pk();
_ck = _mutation_builder.ck();
if (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::ALL_OLD) {
throw api_error::validation(format("DeleteItem supports only NONE or ALL_OLD for ReturnValues"));
}
_condition_expression = get_parsed_condition_expression(parsed_expression_cache, _request);
const rjson::value* expression_attribute_names = rjson::find(_request, "ExpressionAttributeNames");
const rjson::value* expression_attribute_values = rjson::find(_request, "ExpressionAttributeValues");
if (!_condition_expression.empty()) {
std::unordered_set<std::string> used_attribute_names;
std::unordered_set<std::string> used_attribute_values;
resolve_condition_expression(_condition_expression,
expression_attribute_names, expression_attribute_values,
used_attribute_names, used_attribute_values);
verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "DeleteItem");
verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "DeleteItem");
} else {
if (expression_attribute_names) {
throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
}
if (expression_attribute_values) {
throw api_error::validation("ExpressionAttributeValues cannot be used without ConditionExpression");
}
}
}
bool needs_read_before_write() const {
return _request.HasMember("Expected") ||
check_needs_read_before_write(_condition_expression) ||
_returnvalues == returnvalues::ALL_OLD;
}
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override {
if (!verify_expected(_request, previous_item.get()) ||
!verify_condition_expression(_condition_expression, previous_item.get())) {
if (previous_item && _returnvalues_on_condition_check_failure ==
returnvalues_on_condition_check_failure::ALL_OLD) {
_return_attributes = std::move(*previous_item);
}
// If the update is to be cancelled because of an unfulfilled Expected
// condition, return an empty optional mutation, which is more
// efficient than throwing an exception.
return {};
}
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
_return_attributes = std::move(*previous_item);
} else {
_return_attributes = {};
}
if (_consumed_capacity._total_bytes == 0) {
_consumed_capacity._total_bytes = 1;
}
return _mutation_builder.build(_schema, ts);
}
virtual ~delete_item_operation() = default;
};
future<executor::request_return_type> executor::delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.delete_item++;
auto start_time = std::chrono::steady_clock::now();
elogger.trace("delete_item {}", request);
auto op = make_shared<delete_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
if (!audit_info) {
// On LWT shard bounce, audit_info is already set on the originating shard.
// The reference captured in the bounce lambda points back to the original
// coroutine frame, which remains alive for the entire cross-shard call.
// Only reads of this pointer occur on the target shard — no writes or frees.
maybe_audit(audit_info, audit::statement_category::DML, op->schema()->ks_name(),
op->schema()->cf_name(), "DeleteItem", op->request(), db::consistency_level::LOCAL_QUORUM);
}
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(op->schema()));
tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
const bool needs_read_before_write = _proxy.data_dictionary().get_config().alternator_force_read_before_write() || op->needs_read_before_write();
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
auto cas_shard = op->shard_for_execute(needs_read_before_write);
if (cas_shard && !cas_shard->this_shard()) {
_stats.api_operations.delete_item--; // uncount on this shard, will be counted in other shard
_stats.shard_bounce_for_lwt++;
per_table_stats->shard_bounce_for_lwt++;
co_return co_await container().invoke_on(cas_shard->shard(), _ssg,
[request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit), &audit_info]
(executor& e) mutable {
return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt), &audit_info]
(service::client_state& client_state) mutable {
//FIXME: Instead of passing empty_service_permit() to the background operation,
// the current permit's lifetime should be prolonged, so that it's destructed
// only after all background operations are finished as well.
return e.delete_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request), audit_info);
});
});
}
per_table_stats->api_operations.delete_item++;
uint64_t wcu_total = 0;
auto res = co_await op->execute(_proxy, std::move(cas_shard), client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
if (op->consumed_capacity()._total_bytes > 1) {
per_table_stats->operation_sizes.delete_item_op_size_kb.add(bytes_to_kb_ceil(op->consumed_capacity()._total_bytes));
}
per_table_stats->wcu_total[stats::wcu_types::DELETE_ITEM] += wcu_total;
_stats.wcu_total[stats::wcu_types::DELETE_ITEM] += wcu_total;
per_table_stats->api_operations.delete_item_latency.mark(std::chrono::steady_clock::now() - start_time);
_stats.api_operations.delete_item_latency.mark(std::chrono::steady_clock::now() - start_time);
co_return res;
}
schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request) {
sstring table_name = rjson::to_sstring(batch_request->name); // JSON keys are always strings
try {
return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + table_name, table_name);
} catch(data_dictionary::no_such_column_family&) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
throw api_error::resource_not_found(format("Requested resource not found: Table: {} not found", table_name));
}
}
using primary_key = std::pair<partition_key, clustering_key>;
struct primary_key_hash {
schema_ptr _s;
size_t operator()(const primary_key& key) const {
return utils::hash_combine(partition_key::hashing(*_s)(key.first), clustering_key::hashing(*_s)(key.second));
}
};
struct primary_key_equal {
schema_ptr _s;
bool operator()(const primary_key& k1, const primary_key& k2) const {
return partition_key::equality(*_s)(k1.first, k2.first) && clustering_key::equality(*_s)(k1.second, k2.second);
}
};
// This is a cas_request subclass for applying given put_or_delete_items to
// one partition using LWT as part as BatchWriteItem. This is a write-only
// operation, not needing the previous value of the item (the mutation to be
// done is known prior to starting the operation). Nevertheless, we want to
// do this mutation via LWT to ensure that it is serialized with other LWT
// mutations to the same partition.
//
// The std::vector<put_or_delete_item> must remain alive until the
// storage_proxy::cas() future is resolved.
class put_or_delete_item_cas_request : public service::cas_request {
schema_ptr schema;
const std::vector<put_or_delete_item>& _mutation_builders;
public:
put_or_delete_item_cas_request(schema_ptr s, const std::vector<put_or_delete_item>& b) :
schema(std::move(s)), _mutation_builders(b) { }
virtual ~put_or_delete_item_cas_request() = default;
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) override {
std::optional<mutation> ret;
for (const put_or_delete_item& mutation_builder : _mutation_builders) {
// We assume all these builders have the same partition.
if (ret) {
ret->apply(mutation_builder.build(schema, ts));
} else {
ret = mutation_builder.build(schema, ts);
}
}
return ret;
}
};
future<> executor::cas_write(schema_ptr schema, service::cas_shard cas_shard, const dht::decorated_key& dk,
const std::vector<put_or_delete_item>& mutation_builders, service::client_state& client_state,
tracing::trace_state_ptr trace_state, service_permit permit)
{
if (!cas_shard.this_shard()) {
_stats.shard_bounce_for_lwt++;
return container().invoke_on(cas_shard.shard(), _ssg,
[cs = client_state.move_to_other_shard(),
&mb = mutation_builders,
&dk,
ks = schema->ks_name(),
cf = schema->cf_name(),
gt = tracing::global_trace_state_ptr(trace_state),
permit = std::move(permit)]
(executor& self) mutable {
return do_with(cs.get(), [&mb, &dk, ks = std::move(ks), cf = std::move(cf),
trace_state = tracing::trace_state_ptr(gt), &self]
(service::client_state& client_state) mutable {
auto schema = self._proxy.data_dictionary().find_schema(ks, cf);
service::cas_shard cas_shard(*schema, dk.token());
//FIXME: Instead of passing empty_service_permit() to the background operation,
// the current permit's lifetime should be prolonged, so that it's destructed
// only after all background operations are finished as well.
return self.cas_write(schema, std::move(cas_shard), dk, mb, client_state, std::move(trace_state), empty_service_permit());
});
});
}
auto timeout = executor::default_timeout();
auto op = std::make_unique<put_or_delete_item_cas_request>(schema, mutation_builders);
auto* op_ptr = op.get();
auto cdc_opts = cdc::per_request_options{
.alternator = true,
.alternator_streams_increased_compatibility =
schema->cdc_options().enabled() && _proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
};
return _proxy.cas(schema, std::move(cas_shard), *op_ptr, nullptr, to_partition_ranges(dk),
{timeout, std::move(permit), client_state, trace_state},
db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM,
timeout, timeout, true, std::move(cdc_opts)).finally([op = std::move(op)]{}).discard_result();
// We discarded cas()'s future value ("is_applied") because BatchWriteItem
// does not need to support conditional updates.
}
struct schema_decorated_key {
schema_ptr schema;
dht::decorated_key dk;
};
struct schema_decorated_key_hash {
size_t operator()(const schema_decorated_key& k) const {
return std::hash<dht::token>()(k.dk.token());
}
};
struct schema_decorated_key_equal {
bool operator()(const schema_decorated_key& k1, const schema_decorated_key& k2) const {
return k1.schema == k2.schema && k1.dk.equal(*k1.schema, k2.dk);
}
};
// FIXME: if we failed writing some of the mutations, need to return a list
// of these failed mutations rather than fail the whole write (issue #5650).
future<> executor::do_batch_write(
std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders,
service::client_state& client_state,
tracing::trace_state_ptr trace_state,
service_permit permit) {
if (mutation_builders.empty()) {
return make_ready_future<>();
}
// NOTE: technically, do_batch_write could be reworked to use LWT only for part
// of the batched requests and not use it for others, but it's not considered
// likely that a batch will contain both tables which always demand LWT and ones
// that don't - it's fragile to split a batch into multiple storage proxy requests though.
// Hence, the decision is conservative - if any table enforces LWT,the whole batch will use it.
const bool needs_lwt = std::ranges::any_of(mutation_builders | std::views::keys, [] (const schema_ptr& schema) {
return rmw_operation::get_write_isolation_for_schema(schema) == rmw_operation::write_isolation::LWT_ALWAYS;
});
if (!needs_lwt) {
// Do a normal write, without LWT:
utils::chunked_vector<mutation> mutations;
mutations.reserve(mutation_builders.size());
api::timestamp_type now = api::new_timestamp();
bool any_cdc_enabled = false;
for (auto& b : mutation_builders) {
mutations.push_back(b.second.build(b.first, now));
any_cdc_enabled |= b.first->cdc_options().enabled();
}
return _proxy.mutate(std::move(mutations),
db::consistency_level::LOCAL_QUORUM,
executor::default_timeout(),
trace_state,
std::move(permit),
db::allow_per_partition_rate_limit::yes,
false,
cdc::per_request_options{
.alternator = true,
.alternator_streams_increased_compatibility = any_cdc_enabled && _proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
});
} else {
// Do the write via LWT:
// Multiple mutations may be destined for the same partition, adding
// or deleting different items of one partition. Join them together
// because we can do them in one cas() call.
using map_type = std::unordered_map<schema_decorated_key,
std::vector<put_or_delete_item>,
schema_decorated_key_hash,
schema_decorated_key_equal>;
auto key_builders = std::make_unique<map_type>(1, schema_decorated_key_hash{}, schema_decorated_key_equal{});
for (auto&& b : std::move(mutation_builders)) {
auto [it, added] = key_builders->try_emplace(schema_decorated_key {
.schema = b.first,
.dk = dht::decorate_key(*b.first, b.second.pk())
});
it->second.push_back(std::move(b.second));
}
auto* key_builders_ptr = key_builders.get();
return parallel_for_each(*key_builders_ptr, [this, &client_state, trace_state, permit = std::move(permit)] (const auto& e) {
_stats.write_using_lwt++;
auto desired_shard = service::cas_shard(*e.first.schema, e.first.dk.token());
auto s = e.first.schema;
static const auto* injection_name = "alternator_executor_batch_write_wait";
return utils::get_local_injector().inject(injection_name, [s = std::move(s)] (auto& handler) -> future<> {
const auto ks = handler.get("keyspace");
const auto cf = handler.get("table");
const auto shard = std::atoll(handler.get("shard")->data());
if (ks == s->ks_name() && cf == s->cf_name() && shard == this_shard_id()) {
elogger.info("{}: hit", injection_name);
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
elogger.info("{}: continue", injection_name);
}
}).then([&e, desired_shard = std::move(desired_shard),
&client_state, trace_state = std::move(trace_state), permit = std::move(permit), this]() mutable
{
return cas_write(e.first.schema, std::move(desired_shard), e.first.dk,
std::move(e.second), client_state, std::move(trace_state), std::move(permit));
});
}).finally([key_builders = std::move(key_builders)]{});
}
}
sstring print_names_for_audit(const std::set<sstring>& names) {
sstring res;
// Might have been useful to loop twice, with the 1st loop learning the total size of the names for the res to then reserve()
for(const auto& name : names) {
if (!res.empty()) {
res += "|";
}
res += name;
}
return res;
}
future<executor::request_return_type> executor::batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.batch_write_item++;
auto start_time = std::chrono::steady_clock::now();
const rjson::value& request_items = get_member(request, "RequestItems", "BatchWriteItem content");
validate_is_object(request_items, "RequestItems");
if (request_items.ObjectEmpty()) {
co_return api_error::validation("RequestItems can't be empty");
}
const auto maximum_batch_write_size = _proxy.data_dictionary().get_config().alternator_max_items_in_batch_write();
size_t total_items = 0;
for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
if (!it->value.IsArray() || it->value.Empty()) {
co_return api_error::validation("Member of RequestItems must be a non-empty array of WriteRequest objects");
}
total_items += it->value.Size();
}
if (total_items > maximum_batch_write_size) {
co_return api_error::validation(fmt::format("Invalid length of BatchWriteItem command, got {} items, "
"maximum is {} (from configuration variable alternator_max_items_in_batch_write)", total_items, maximum_batch_write_size));
}
bool should_add_wcu = wcu_consumed_capacity_counter::should_add_capacity(request);
rjson::value consumed_capacity = rjson::empty_array();
std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders;
// WCU calculation is performed at the end of execution.
// We need to keep track of changes per table, both for internal metrics
// and to be able to return the values if should_add_wcu is true.
// For each table, we need its stats and schema.
std::vector<std::pair<lw_shared_ptr<stats>, schema_ptr>> per_table_wcu;
std::set<sstring> table_names; // for auditing
// FIXME: will_log() here doesn't pass keyspace/table, so keyspace-level audit
// filtering is bypassed — a batch spanning multiple tables is audited as a whole.
bool should_audit = _audit.local_is_initialized() && _audit.local().will_log(audit::statement_category::DML);
mutation_builders.reserve(request_items.MemberCount());
per_table_wcu.reserve(request_items.MemberCount());
for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
schema_ptr schema = get_table_from_batch_request(_proxy, it);
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(schema));
per_table_stats->api_operations.batch_write_item++;
per_table_stats->api_operations.batch_write_item_batch_total += it->value.Size();
per_table_stats->api_operations.batch_write_item_histogram.add(it->value.Size());
tracing::add_alternator_table_name(trace_state, schema->cf_name());
if (should_audit) {
table_names.insert(schema->cf_name());
}
std::unordered_set<primary_key, primary_key_hash, primary_key_equal> used_keys(
1, primary_key_hash{schema}, primary_key_equal{schema});
for (auto& request : it->value.GetArray()) {
auto& r = get_single_member(request, "RequestItems element");
const auto r_name = rjson::to_string_view(r.name);
if (r_name == "PutRequest") {
const rjson::value& item = get_member(r.value, "Item", "PutRequest");
validate_is_object(item, "Item in PutRequest");
auto&& put_item = put_or_delete_item(
item, schema, put_or_delete_item::put_item{},
si_key_attributes(_proxy.data_dictionary().find_table(schema->ks_name(), schema->cf_name())));
mutation_builders.emplace_back(schema, std::move(put_item));
auto mut_key = std::make_pair(mutation_builders.back().second.pk(), mutation_builders.back().second.ck());
if (used_keys.contains(mut_key)) {
co_return api_error::validation("Provided list of item keys contains duplicates");
}
used_keys.insert(std::move(mut_key));
} else if (r_name == "DeleteRequest") {
const rjson::value& key = get_member(r.value, "Key", "DeleteRequest");
validate_is_object(key, "Key in DeleteRequest");
mutation_builders.emplace_back(schema, put_or_delete_item(
key, schema, put_or_delete_item::delete_item{}));
auto mut_key = std::make_pair(mutation_builders.back().second.pk(),
mutation_builders.back().second.ck());
if (used_keys.contains(mut_key)) {
co_return api_error::validation("Provided list of item keys contains duplicates");
}
used_keys.insert(std::move(mut_key));
} else {
co_return api_error::validation(fmt::format("Unknown BatchWriteItem request type: {}", r_name));
}
}
per_table_wcu.emplace_back(std::make_pair(per_table_stats, schema));
}
for (const auto& b : mutation_builders) {
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, b.first, auth::permission::MODIFY, _stats);
}
// If alternator_force_read_before_write is true we will first get the previous item size
// and only then do send the mutation.
if (_proxy.data_dictionary().get_config().alternator_force_read_before_write()) {
std::vector<future<uint64_t>> previous_items_sizes;
previous_items_sizes.reserve(mutation_builders.size());
// Parallel get all previous item sizes
for (const auto& b : mutation_builders) {
previous_items_sizes.emplace_back(get_previous_item_size(
_proxy,
client_state,
b.first,
b.second.pk(),
b.second.ck(),
permit));
}
size_t pos = 0;
// We are going to wait for all the requests
for (auto&& pi : previous_items_sizes) {
auto res = co_await std::move(pi);
if (mutation_builders[pos].second.length_in_bytes() < res) {
mutation_builders[pos].second.set_length_in_bytes(res);
}
pos++;
}
}
size_t wcu_put_units = 0;
size_t wcu_delete_units = 0;
size_t pos = 0;
size_t total_wcu;
// Here we calculate the per-table WCU.
// The size in the mutation is based either on the operation size,
// or, if we performed a read-before-write, on the larger of the operation size
// and the previous item's size.
for (const auto& w : per_table_wcu) {
total_wcu = 0;
// The following loop goes over all items from the same table
while(pos < mutation_builders.size() && w.second->id() == mutation_builders[pos].first->id()) {
uint64_t item_size = mutation_builders[pos].second.length_in_bytes();
size_t wcu = wcu_consumed_capacity_counter::get_units(item_size ? item_size : 1);
total_wcu += wcu;
if (mutation_builders[pos].second.is_put_item()) {
w.first->wcu_total[stats::PUT_ITEM] += wcu;
wcu_put_units += wcu;
} else {
w.first->wcu_total[stats::DELETE_ITEM] += wcu;
wcu_delete_units += wcu;
}
w.first->operation_sizes.batch_write_item_op_size_kb.add(bytes_to_kb_ceil(item_size));
pos++;
}
if (should_add_wcu) {
rjson::value entry = rjson::empty_object();
rjson::add(entry, "TableName", rjson::from_string(w.second->cf_name()));
rjson::add(entry, "CapacityUnits", total_wcu);
rjson::push_back(consumed_capacity, std::move(entry));
}
}
_stats.wcu_total[stats::PUT_ITEM] += wcu_put_units;
_stats.wcu_total[stats::DELETE_ITEM] += wcu_delete_units;
_stats.api_operations.batch_write_item_batch_total += total_items;
_stats.api_operations.batch_write_item_histogram.add(total_items);
co_await do_batch_write(std::move(mutation_builders), client_state, trace_state, std::move(permit));
// FIXME: Issue #5650: If we failed writing some of the updates,
// need to return a list of these failed updates in UnprocessedItems
// rather than fail the whole write (issue #5650).
rjson::value ret = rjson::empty_object();
rjson::add(ret, "UnprocessedItems", rjson::empty_object());
if (should_add_wcu) {
rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
}
auto duration = std::chrono::steady_clock::now() - start_time;
_stats.api_operations.batch_write_item_latency.mark(duration);
for (const auto& w : per_table_wcu) {
w.first->api_operations.batch_write_item_latency.mark(duration);
}
maybe_audit(audit_info, audit::statement_category::DML, "",
print_names_for_audit(table_names), "BatchWriteItem", request, db::consistency_level::LOCAL_QUORUM);
co_return rjson::print(std::move(ret));
}
static const std::string_view get_item_type_string(const rjson::value& v) {
const rjson::value::Member& mem = get_single_member(v, "Item");
return rjson::to_string_view(mem.name);
}
/**
* Helper routine to extract data when we already have
* row, etc etc.
*
* Note: include_all_embedded_attributes means we should
* include all values in the `ATTRS_COLUMN_NAME` map column.
*
* We could change the behaviour to simply include all values
* from this column if the `ATTRS_COLUMN_NAME` is explicit in
* `attrs_to_get`, but I am scared to do that now in case
* there is some corner case in existing code.
*
* Explicit bool means we can be sure all previous calls are
* as before.
*/
void executor::describe_single_item(const cql3::selection::selection& selection,
const std::vector<managed_bytes_opt>& result_row,
const std::optional<attrs_to_get>& attrs_to_get,
rjson::value& item,
uint64_t* item_length_in_bytes,
bool include_all_embedded_attributes)
{
const auto& columns = selection.get_columns();
auto column_it = columns.begin();
for (const managed_bytes_opt& cell : result_row) {
if (!cell) {
++column_it;
continue;
}
std::string column_name = (*column_it)->name_as_text();
if (column_name != executor::ATTRS_COLUMN_NAME) {
if (item_length_in_bytes) {
(*item_length_in_bytes) += column_name.length() + cell->size();
}
if (!attrs_to_get || attrs_to_get->contains(column_name)) {
// item is expected to start empty, and column_name are unique
// so add() makes sense
rjson::add_with_string_name(item, column_name, rjson::empty_object());
rjson::value& field = item[column_name.c_str()];
cell->with_linearized([&] (bytes_view linearized_cell) {
rjson::add_with_string_name(field, type_to_string((*column_it)->type), json_key_column_value(linearized_cell, **column_it));
});
}
} else {
auto deserialized = attrs_type()->deserialize(*cell);
auto keys_and_values = value_cast<map_type_impl::native_type>(deserialized);
for (auto entry : keys_and_values) {
std::string attr_name = value_cast<sstring>(entry.first);
if (item_length_in_bytes) {
(*item_length_in_bytes) += attr_name.length();
}
if (include_all_embedded_attributes || !attrs_to_get || attrs_to_get->contains(attr_name)) {
bytes value = value_cast<bytes>(entry.second);
if (item_length_in_bytes && value.length()) {
// ScyllaDB uses one extra byte compared to DynamoDB for the bytes length
(*item_length_in_bytes) += value.length() - 1;
}
rjson::value v = deserialize_item(value);
if (attrs_to_get) {
auto it = attrs_to_get->find(attr_name);
if (it != attrs_to_get->end()) {
// attrs_to_get may have asked for only part of
// this attribute. hierarchy_filter() modifies v,
// and returns false when nothing is to be kept.
if (!hierarchy_filter(v, it->second)) {
continue;
}
}
}
// item is expected to start empty, and attribute
// names are unique so add() makes sense
rjson::add_with_string_name(item, attr_name, std::move(v));
} else if (item_length_in_bytes) {
(*item_length_in_bytes) += value_cast<bytes>(entry.second).length() - 1;
}
}
}
++column_it;
}
}
std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
const query::partition_slice& slice,
const cql3::selection::selection& selection,
const query::result& query_result,
const std::optional<attrs_to_get>& attrs_to_get,
uint64_t* item_length_in_bytes) {
rjson::value item = rjson::empty_object();
cql3::selection::result_set_builder builder(selection, gc_clock::now());
query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection));
auto result_set = builder.build();
if (result_set->empty()) {
if (item_length_in_bytes) {
// empty results is counted as having a minimal length (e.g. 1 byte).
(*item_length_in_bytes) += 1;
}
// If there is no matching item, we're supposed to return an empty
// object without an Item member - not one with an empty Item member
return {};
}
if (result_set->size() > 1) {
// If the result set contains multiple rows, the code should have
// called describe_multi_item(), not this function.
throw std::logic_error("describe_single_item() asked to describe multiple items");
}
describe_single_item(selection, *result_set->rows().begin(), attrs_to_get, item, item_length_in_bytes);
return item;
}
static bool check_needs_read_before_write(const parsed::value& v) {
return std::visit(overloaded_functor {
[&] (const parsed::constant& c) -> bool {
return false;
},
[&] (const parsed::value::function_call& f) -> bool {
return std::ranges::any_of(f._parameters, [&] (const parsed::value& param) {
return check_needs_read_before_write(param);
});
},
[&] (const parsed::path& p) -> bool {
return true;
}
}, v._value);
}
static bool check_needs_read_before_write(const attribute_path_map<parsed::update_expression::action>& update_expression) {
return std::ranges::any_of(update_expression, [](const auto& p) {
if (!p.second.has_value()) {
// If the action is not on the top-level attribute, we need to
// read the old item: we change only a part of the top-level
// attribute, and write the full top-level attribute back.
return true;
}
// Otherwise, the action p.second.get_value() is just on top-level
// attribute. Check if it needs read-before-write:
return std::visit(overloaded_functor {
[&] (const parsed::update_expression::action::set& a) -> bool {
return check_needs_read_before_write(a._rhs._v1) || (a._rhs._op != 'v' && check_needs_read_before_write(a._rhs._v2));
},
[&] (const parsed::update_expression::action::remove& a) -> bool {
return false;
},
[&] (const parsed::update_expression::action::add& a) -> bool {
return true;
},
[&] (const parsed::update_expression::action::del& a) -> bool {
return true;
}
}, p.second.get_value()._action);
});
}
/*!
* \brief estimate_value_size provides a rough size estimation
* for an rjson value object.
*
* When calculating RCU and WCU, we need to determine the length of the JSON representation
* (specifically, the length of each key and each value).
*
* When possible, this is calculated as a side effect of other operations.
* estimate_value_size is used when this calculation cannot be performed directly,
* but we still need an estimated value.
*
* It achieves this without streaming any values and uses a fixed size for numbers.
* The aim is not to provide a perfect 1-to-1 size calculation, as WCU is calculated
* in 1KB units. A ballpark estimate is sufficient.
*/
static size_t estimate_value_size(const rjson::value& value) {
size_t size = 0;
if (value.IsString()) {
size += value.GetStringLength();
}
else if (value.IsNumber()) {
size += 8;
}
else if (value.IsBool()) {
size += 5;
}
else if (value.IsArray()) {
for (auto& v : value.GetArray()) {
size += estimate_value_size(v); // Recursively calculate array element sizes
}
}
else if (value.IsObject()) {
for (auto it = value.MemberBegin(); it != value.MemberEnd(); ++it) {
size += it->name.GetStringLength(); // Size of the key
size += estimate_value_size(it->value); // Size of the value
}
}
return size;
}
class update_item_operation : public rmw_operation {
public:
// Some information parsed during the constructor to check for input
// errors, and cached to be used again during apply().
rjson::value* _attribute_updates;
// Instead of keeping a parsed::update_expression with an unsorted list
// list of actions, we keep them in an attribute_path_map which groups
// them by top-level attribute, and detects forbidden overlaps/conflicts.
attribute_path_map<parsed::update_expression::action> _update_expression;
// Saved list of GSI keys in the table being updated, used for
// validate_value_if_index_key()
std::unordered_map<bytes, std::string> _key_attributes;
// Saved map of vector index target attributes to their dimensions, used
// for validate_value_if_vector_index_attribute()
std::unordered_map<bytes, int> _vector_index_attributes;
parsed::condition_expression _condition_expression;
update_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& request);
virtual ~update_item_operation() = default;
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override;
bool needs_read_before_write() const;
private:
void delete_attribute(bytes&& column_name, const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
attribute_collector& modified_attrs) const;
void update_attribute(bytes&& column_name, const rjson::value& json_value, const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts,
deletable_row& row, attribute_collector& modified_attrs, const attribute_path_map_node<parsed::update_expression::action>* h = nullptr) const;
void apply_attribute_updates(const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
attribute_collector& modified_attrs, bool& any_updates, bool& any_deletes) const;
void apply_update_expression(const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
attribute_collector& modified_attrs, bool& any_updates, bool& any_deletes) const;
};
update_item_operation::update_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& update_info)
: rmw_operation(proxy, std::move(update_info))
{
const rjson::value* key = rjson::find(_request, "Key");
if (!key) {
throw api_error::validation("UpdateItem requires a Key parameter");
}
_pk = pk_from_json(*key, _schema);
_ck = ck_from_json(*key, _schema);
check_key(*key, _schema);
const rjson::value* expression_attribute_names = rjson::find(_request, "ExpressionAttributeNames");
const rjson::value* expression_attribute_values = rjson::find(_request, "ExpressionAttributeValues");
std::unordered_set<std::string> used_attribute_names;
std::unordered_set<std::string> used_attribute_values;
const rjson::value* update_expression = rjson::find(_request, "UpdateExpression");
if (update_expression) {
if (!update_expression->IsString()) {
throw api_error::validation("UpdateExpression must be a string");
}
try {
parsed::update_expression expr = parsed_expression_cache.parse_update_expression(rjson::to_string_view(*update_expression));
resolve_update_expression(expr,
expression_attribute_names, expression_attribute_values,
used_attribute_names, used_attribute_values);
for (auto& action : expr.actions()) {
// Unfortunately we need to copy the action's path, because
// we std::move the action object.
auto p = action._path;
attribute_path_map_add("UpdateExpression", _update_expression, p, std::move(action));
}
} catch(expressions_syntax_error& e) {
throw api_error::validation(e.what());
}
}
_attribute_updates = rjson::find(_request, "AttributeUpdates");
if (_attribute_updates) {
if (!_attribute_updates->IsObject()) {
throw api_error::validation("AttributeUpdates must be an object");
}
for (auto it = std::as_const(*_attribute_updates).MemberBegin(); it != std::as_const(*_attribute_updates).MemberEnd(); ++it) {
validate_attr_name_length("AttributeUpdates", it->name.GetStringLength(), false);
}
}
_condition_expression = get_parsed_condition_expression(parsed_expression_cache, _request);
resolve_condition_expression(_condition_expression,
expression_attribute_names, expression_attribute_values,
used_attribute_names, used_attribute_values);
verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "UpdateItem");
verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "UpdateItem");
// DynamoDB forbids having both old-style AttributeUpdates or Expected
// and new-style UpdateExpression or ConditionExpression in the same request
const rjson::value* expected = rjson::find(_request, "Expected");
if (update_expression && _attribute_updates) {
throw api_error::validation(
format("UpdateItem does not allow both AttributeUpdates and UpdateExpression to be given together"));
}
if (update_expression && expected) {
throw api_error::validation(
format("UpdateItem does not allow both old-style Expected and new-style UpdateExpression to be given together"));
}
if (_attribute_updates && !_condition_expression.empty()) {
throw api_error::validation(
format("UpdateItem does not allow both old-style AttributeUpdates and new-style ConditionExpression to be given together"));
}
if (_pk.representation().size() > 2) {
// ScyllaDB uses two extra bytes compared to DynamoDB for the key bytes length
_consumed_capacity._total_bytes += _pk.representation().size() - 2;
}
if (_ck.representation().size() > 2) {
// ScyllaDB uses two extra bytes compared to DynamoDB for the key bytes length
_consumed_capacity._total_bytes += _ck.representation().size() - 2;
}
if (expression_attribute_names) {
_consumed_capacity._total_bytes += estimate_value_size(*expression_attribute_names);
}
if (expression_attribute_values) {
_consumed_capacity._total_bytes += estimate_value_size(*expression_attribute_values);
}
_key_attributes = si_key_attributes(proxy.data_dictionary().find_table(
_schema->ks_name(), _schema->cf_name()));
_vector_index_attributes = vector_index_attributes(*_schema);
}
// These are the cases where update_item_operation::apply() needs to use
// "previous_item" for certain AttributeUpdates operations (ADD or DELETE)
static bool check_needs_read_before_write_attribute_updates(rjson::value *attribute_updates) {
if (!attribute_updates) {
return false;
}
// We already confirmed in update_item_operation::update_item_operation()
// that _attribute_updates, when it exists, is a map
for (auto it = attribute_updates->MemberBegin(); it != attribute_updates->MemberEnd(); ++it) {
rjson::value* action = rjson::find(it->value, "Action");
if (action) {
std::string_view action_s = rjson::to_string_view(*action);
if (action_s == "ADD") {
return true;
}
// For DELETE operation, it only needs a read before write if the
// "Value" option is used. Without it, it's just a delete.
if (action_s == "DELETE" && it->value.HasMember("Value")) {
return true;
}
}
}
return false;
}
bool
update_item_operation::needs_read_before_write() const {
return check_needs_read_before_write(_update_expression) ||
check_needs_read_before_write(_condition_expression) ||
check_needs_read_before_write_attribute_updates(_attribute_updates) ||
_request.HasMember("Expected") ||
(_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::UPDATED_NEW);
}
// action_result() returns the result of applying an UpdateItem action -
// this result is either a JSON object or an unset optional which indicates
// the action was a deletion. The caller (update_item_operation::apply()
// below) will either write this JSON as the content of a column, or
// use it as a piece in a bigger top-level attribute.
static std::optional<rjson::value> action_result(
const parsed::update_expression::action& action,
const rjson::value* previous_item) {
return std::visit(overloaded_functor {
[&] (const parsed::update_expression::action::set& a) -> std::optional<rjson::value> {
return calculate_value(a._rhs, previous_item);
},
[&] (const parsed::update_expression::action::remove& a) -> std::optional<rjson::value> {
return std::nullopt;
},
[&] (const parsed::update_expression::action::add& a) -> std::optional<rjson::value> {
parsed::value base;
parsed::value addition;
base.set_path(action._path);
addition.set_constant(a._valref);
rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item);
rjson::value v2 = calculate_value(addition, calculate_value_caller::UpdateExpression, previous_item);
rjson::value result;
// An ADD can be used to create a new attribute (when
// v1.IsNull()) or to add to a pre-existing attribute:
if (v1.IsNull()) {
const auto v2_type = get_item_type_string(v2);
if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS") {
result = v2;
} else {
throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v2));
}
} else {
const auto v1_type = get_item_type_string(v1);
if (v1_type == "N") {
if (get_item_type_string(v2) != "N") {
throw api_error::validation(fmt::format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
}
result = number_add(v1, v2);
} else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
if (get_item_type_string(v2) != v1_type) {
throw api_error::validation(fmt::format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
}
result = set_sum(v1, v2);
} else {
throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v1));
}
}
return result;
},
[&] (const parsed::update_expression::action::del& a) -> std::optional<rjson::value> {
parsed::value base;
parsed::value subset;
base.set_path(action._path);
subset.set_constant(a._valref);
rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item);
rjson::value v2 = calculate_value(subset, calculate_value_caller::UpdateExpression, previous_item);
if (!v1.IsNull()) {
return set_diff(v1, v2);
}
// When we return nullopt here, we ask to *delete* this attribute,
// which is unnecessary because we know the attribute does not
// exist anyway. This is a waste, but a small one. Note that also
// for the "remove" action above we don't bother to check if the
// previous_item add anything to remove.
return std::nullopt;
}
}, action._action);
}
}
// Print an attribute_path_map_node<action> as the list of paths it contains:
template <> struct fmt::formatter<alternator::attribute_path_map_node<alternator::parsed::update_expression::action>> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
// this function recursively call into itself, so we have to forward declare it.
auto format(const alternator::attribute_path_map_node<alternator::parsed::update_expression::action>& h, fmt::format_context& ctx) const
-> decltype(ctx.out());
};
auto fmt::formatter<alternator::attribute_path_map_node<alternator::parsed::update_expression::action>>::format(const alternator::attribute_path_map_node<alternator::parsed::update_expression::action>& h, fmt::format_context& ctx) const
-> decltype(ctx.out()) {
auto out = ctx.out();
if (h.has_value()) {
out = fmt::format_to(out, " {}", h.get_value()._path);
} else if (h.has_members()) {
for (auto& member : h.get_members()) {
out = fmt::format_to(out, "{}", *member.second);
}
} else if (h.has_indexes()) {
for (auto& index : h.get_indexes()) {
out = fmt::format_to(out, "{}", *index.second);
}
}
return out;
}
namespace alternator {
// Apply the hierarchy of actions in an attribute_path_map_node<action> to a
// JSON object which uses DynamoDB's serialization conventions. The complete,
// unmodified, previous_item is also necessary for the right-hand sides of the
// actions. Modifies obj in-place or returns false if it is to be removed.
static bool hierarchy_actions(
rjson::value& obj,
const attribute_path_map_node<parsed::update_expression::action>& h,
const rjson::value* previous_item)
{
if (!obj.IsObject() || obj.MemberCount() != 1) {
// This shouldn't happen. We shouldn't have stored malformed objects.
// But today Alternator does not validate the structure of nested
// documents before storing them, so this can happen on read.
throw api_error::validation(format("Malformed value object read: {}", obj));
}
const char* type = obj.MemberBegin()->name.GetString();
rjson::value& v = obj.MemberBegin()->value;
if (h.has_value()) {
// Action replacing everything in this position in the hierarchy
std::optional<rjson::value> newv = action_result(h.get_value(), previous_item);
if (newv) {
obj = std::move(*newv);
} else {
return false;
}
} else if (h.has_members()) {
if (type[0] != 'M' || !v.IsObject()) {
// A .something on a non-map doesn't work.
throw api_error::validation(fmt::format("UpdateExpression: document paths not valid for this item:{}", h));
}
for (const auto& member : h.get_members()) {
std::string attr = member.first;
const attribute_path_map_node<parsed::update_expression::action>& subh = *member.second;
rjson::value *subobj = rjson::find(v, attr);
if (subobj) {
if (!hierarchy_actions(*subobj, subh, previous_item)) {
rjson::remove_member(v, attr);
}
} else {
// When a.b does not exist, setting a.b itself (i.e.
// subh.has_value()) is fine, but setting a.b.c is not.
if (subh.has_value()) {
std::optional<rjson::value> newv = action_result(subh.get_value(), previous_item);
if (newv) {
// This is the !subobj case, so v doesn't have an
// attr member so we can use add()
rjson::add_with_string_name(v, attr, std::move(*newv));
} else {
// Removing a.b when a is a map but a.b doesn't exist
// is silently ignored. It's not considered an error.
}
} else {
throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
}
}
}
} else if (h.has_indexes()) {
if (type[0] != 'L' || !v.IsArray()) {
// A [i] on a non-list doesn't work.
throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
}
unsigned nremoved = 0;
for (const auto& index : h.get_indexes()) {
unsigned i = index.first - nremoved;
const attribute_path_map_node<parsed::update_expression::action>& subh = *index.second;
if (i < v.Size()) {
if (!hierarchy_actions(v[i], subh, previous_item)) {
v.Erase(v.Begin() + i);
// If we have the actions "REMOVE a[1] SET a[3] = :val",
// the index 3 refers to the original indexes, before any
// items were removed. So we offset the next indexes
// (which are guaranteed to be higher than i - indexes is
// a sorted map) by an increased "nremoved".
nremoved++;
}
} else {
// If a[7] does not exist, setting a[7] itself (i.e.
// subh.has_value()) is fine - and appends an item, though
// not necessarily with index 7. But setting a[7].b will
// not work.
if (subh.has_value()) {
std::optional<rjson::value> newv = action_result(subh.get_value(), previous_item);
if (newv) {
rjson::push_back(v, std::move(*newv));
} else {
// Removing a[7] when the list has fewer elements is
// silently ignored. It's not considered an error.
}
} else {
throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
}
}
}
}
return true;
}
void update_item_operation::delete_attribute(bytes&& column_name, const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts,
deletable_row& row, attribute_collector& modified_attrs) const {
if (_returnvalues == returnvalues::ALL_NEW) {
rjson::remove_member(_return_attributes, to_string_view(column_name));
} else if (_returnvalues == returnvalues::UPDATED_OLD && previous_item) {
std::string_view cn = to_string_view(column_name);
const rjson::value* col = rjson::find(*previous_item, cn);
if (col) {
// In the UPDATED_OLD case the item starts empty and column
// names are unique, so we can use add()
rjson::add_with_string_name(_return_attributes, cn, rjson::copy(*col));
}
}
const column_definition* cdef = find_attribute(*_schema, column_name);
if (cdef) {
row.cells().apply(*cdef, atomic_cell::make_dead(ts, gc_clock::now()));
} else {
modified_attrs.del(std::move(column_name), ts);
}
}
void update_item_operation::update_attribute(bytes&& column_name, const rjson::value& json_value, const std::unique_ptr<rjson::value>& previous_item,
const api::timestamp_type ts, deletable_row& row, attribute_collector& modified_attrs,
const attribute_path_map_node<parsed::update_expression::action>* h) const {
if (_returnvalues == returnvalues::ALL_NEW) {
rjson::replace_with_string_name(_return_attributes, to_string_view(column_name), rjson::copy(json_value));
} else if (_returnvalues == returnvalues::UPDATED_NEW) {
rjson::value&& v = rjson::copy(json_value);
if (h) {
// If the operation was only on specific attribute paths,
// leave only them in _return_attributes.
if (hierarchy_filter(v, *h)) {
// In the UPDATED_NEW case, _return_attributes starts
// empty and the attribute names are unique, so we can
// use add().
rjson::add_with_string_name(_return_attributes, to_string_view(column_name), std::move(v));
}
} else {
rjson::add_with_string_name(_return_attributes, to_string_view(column_name), std::move(v));
}
} else if (_returnvalues == returnvalues::UPDATED_OLD && previous_item) {
std::string_view cn = to_string_view(column_name);
const rjson::value* col = rjson::find(*previous_item, cn);
if (col) {
rjson::value&& v = rjson::copy(*col);
if (h) {
if (hierarchy_filter(v, *h)) {
// In the UPDATED_OLD case, _return_attributes starts
// empty and the attribute names are unique, so we can
// use add().
rjson::add_with_string_name(_return_attributes, cn, std::move(v));
}
} else {
rjson::add_with_string_name(_return_attributes, cn, std::move(v));
}
}
}
const column_definition* cdef = find_attribute(*_schema, column_name);
if (cdef) {
bytes column_value = get_key_from_typed_value(json_value, *cdef);
row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, column_value));
} else {
// This attribute may be a key column of one of the GSIs or LSIs,
// in which case there are some limitations on the value.
if (!_key_attributes.empty()) {
validate_value_if_index_key(_key_attributes, column_name, json_value);
}
// This attribute may also be a vector index target column,
// in which case it must be a list of the right number of floats.
if (!_vector_index_attributes.empty()) {
validate_value_if_vector_index_attribute(_vector_index_attributes, column_name, json_value);
}
modified_attrs.put(std::move(column_name), serialize_item(json_value), ts);
}
}
inline void update_item_operation::apply_attribute_updates(const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
attribute_collector& modified_attrs, bool& any_updates, bool& any_deletes) const {
for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
// Note that it.key() is the name of the column, *it is the operation
bytes column_name = to_bytes(rjson::to_string_view(it->name));
const column_definition* cdef = _schema->get_column_definition(column_name);
if (cdef && cdef->is_primary_key()) {
throw api_error::validation(format("UpdateItem cannot update key column {}", rjson::to_string_view(it->name)));
}
std::string action = rjson::to_string((it->value)["Action"]);
if (action == "DELETE") {
// The DELETE operation can do two unrelated tasks. Without a
// "Value" option, it is used to delete an attribute. With a
// "Value" option, it is used to delete a set of elements from
// a set attribute of the same type.
if (it->value.HasMember("Value")) {
// Subtracting sets needs a read of previous_item, so
// check_needs_read_before_write_attribute_updates()
// returns true in this case, and previous_item is
// available to us when the item exists.
const rjson::value* v1 = previous_item ? rjson::find(*previous_item, to_string_view(column_name)) : nullptr;
const rjson::value& v2 = (it->value)["Value"];
validate_value(v2, "AttributeUpdates");
const auto v2_type = get_item_type_string(v2);
if (v2_type != "SS" && v2_type != "NS" && v2_type != "BS") {
throw api_error::validation(fmt::format("AttributeUpdates DELETE operation with Value only valid for sets, got type {}", v2_type));
}
if (v1) {
std::optional<rjson::value> result = set_diff(*v1, v2);
if (result) {
any_updates = true;
update_attribute(std::move(column_name), *result, previous_item, ts, row, modified_attrs);
} else {
// DynamoDB does not allow empty sets - if the
// result is empty, delete the attribute.
any_deletes = true;
delete_attribute(std::move(column_name), previous_item, ts, row, modified_attrs);
}
} else {
// if the attribute or item don't exist, the DELETE
// operation should silently do nothing - and not
// create an empty item. It's a waste to call
// do_delete() on an attribute we already know is
// deleted, so we can just mark any_deletes = true.
any_deletes = true;
}
} else {
any_deletes = true;
delete_attribute(std::move(column_name), previous_item, ts, row, modified_attrs);
}
} else if (action == "PUT") {
const rjson::value& value = (it->value)["Value"];
validate_value(value, "AttributeUpdates");
any_updates = true;
update_attribute(std::move(column_name), value, previous_item, ts, row, modified_attrs);
} else if (action == "ADD") {
// Note that check_needs_read_before_write_attribute_updates()
// made sure we retrieved previous_item (if exists) when there
// is an ADD action.
const rjson::value* v1 = previous_item ? rjson::find(*previous_item, to_string_view(column_name)) : nullptr;
const rjson::value& v2 = (it->value)["Value"];
validate_value(v2, "AttributeUpdates");
// An ADD can be used to create a new attribute (when
// !v1) or to add to a pre-existing attribute:
if (!v1) {
const auto v2_type = get_item_type_string(v2);
if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS" || v2_type == "L") {
any_updates = true;
update_attribute(std::move(column_name), v2, previous_item, ts, row, modified_attrs);
} else {
throw api_error::validation(format("An operand in the AttributeUpdates ADD has an incorrect data type: {}", v2));
}
} else {
const auto v1_type = get_item_type_string(*v1);
const auto v2_type = get_item_type_string(v2);
if (v2_type != v1_type) {
throw api_error::validation(fmt::format("Operand type mismatch in AttributeUpdates ADD. Expected {}, got {}", v1_type, v2_type));
}
if (v1_type == "N") {
any_updates = true;
update_attribute(std::move(column_name), number_add(*v1, v2), previous_item, ts, row, modified_attrs);
} else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
any_updates = true;
update_attribute(std::move(column_name), set_sum(*v1, v2), previous_item, ts, row, modified_attrs);
} else if (v1_type == "L") {
// The DynamoDB documentation doesn't say it supports
// lists in ADD operations, but it turns out that it
// does. Interestingly, this is only true for
// AttributeUpdates (this code) - the similar ADD
// in UpdateExpression doesn't support lists.
any_updates = true;
update_attribute(std::move(column_name), list_concatenate(*v1, v2), previous_item, ts, row, modified_attrs);
} else {
throw api_error::validation(format("An operand in the AttributeUpdates ADD has an incorrect data type: {}", *v1));
}
}
} else {
throw api_error::validation(fmt::format("Unknown Action value '{}' in AttributeUpdates", action));
}
}
}
inline void update_item_operation::apply_update_expression(const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
attribute_collector& modified_attrs, bool& any_updates, bool& any_deletes) const {
for (auto& actions : _update_expression) {
// The actions of _update_expression are grouped by top-level
// attributes. Here, all actions in actions.second share the same
// top-level attribute actions.first.
std::string column_name = actions.first;
const column_definition* cdef = _schema->get_column_definition(to_bytes(column_name));
if (cdef && cdef->is_primary_key()) {
throw api_error::validation(fmt::format("UpdateItem cannot update key column {}", column_name));
}
if (actions.second.has_value()) {
// An action on a top-level attribute column_name. The single
// action is actions.second.get_value(). We can simply invoke
// the action and replace the attribute with its result:
std::optional<rjson::value> result = action_result(actions.second.get_value(), previous_item.get());
if (result) {
any_updates = true;
update_attribute(to_bytes(column_name), *result, previous_item, ts, row, modified_attrs, &actions.second);
} else {
any_deletes = true;
delete_attribute(to_bytes(column_name), previous_item, ts, row, modified_attrs);
}
} else {
// We have actions on a path or more than one path in the same
// top-level attribute column_name - but not on the top-level
// attribute as a whole. We already read the full top-level
// attribute (see check_needs_read_before_write()), and now we
// need to modify pieces of it and write back the entire
// top-level attribute.
if (!previous_item) {
throw api_error::validation(format("UpdateItem cannot update nested document path on non-existent item"));
}
const rjson::value* toplevel = rjson::find(*previous_item, column_name);
if (!toplevel) {
throw api_error::validation(fmt::format("UpdateItem cannot update document path: missing attribute {}", column_name));
}
rjson::value result = rjson::copy(*toplevel);
any_updates = true;
hierarchy_actions(result, actions.second, previous_item.get());
update_attribute(to_bytes(column_name), std::move(result), previous_item, ts, row, modified_attrs, &actions.second);
}
}
}
std::optional<mutation> update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const {
if (_consumed_capacity._total_bytes == 0) {
_consumed_capacity._total_bytes = 1;
}
if (!verify_expected(_request, previous_item.get()) || !verify_condition_expression(_condition_expression, previous_item.get())) {
if (previous_item && _returnvalues_on_condition_check_failure == returnvalues_on_condition_check_failure::ALL_OLD) {
_return_attributes = std::move(*previous_item);
}
// If the update is to be cancelled because of an unfulfilled
// condition, return an empty optional mutation, which is more
// efficient than throwing an exception.
return {};
}
// In the ReturnValues=ALL_NEW case, we make a copy of previous_item into
// _return_attributes and parts of it will be overwritten by the new
// updates (in do_update() and do_delete()). We need to make a copy and
// cannot overwrite previous_item directly because we still need its
// original content for update expressions. For example, the expression
// "REMOVE a SET b=a" is valid, and needs the original value of a to
// stick around.
// Note that for ReturnValues=ALL_OLD, we don't need to copy here, and
// can just move previous_item later, when we don't need it any more.
if (_returnvalues == returnvalues::ALL_NEW) {
if (previous_item) {
_return_attributes = rjson::copy(*previous_item);
} else {
// If there is no previous item, usually a new item is created
// and contains the given key. This may be cancelled at the end
// of this function if the update is just deletes.
_return_attributes = rjson::copy(rjson::get(_request, "Key"));
}
} else if (_returnvalues == returnvalues::UPDATED_OLD || _returnvalues == returnvalues::UPDATED_NEW) {
_return_attributes = rjson::empty_object();
}
mutation m(_schema, _pk);
bool any_updates = false;
bool any_deletes = false;
auto& row = m.partition().clustered_row(*_schema, _ck);
auto modified_attrs = attribute_collector();
if (!_update_expression.empty()) {
apply_update_expression(previous_item, ts, row, modified_attrs, any_updates, any_deletes);
}
if (_attribute_updates) {
apply_attribute_updates(previous_item, ts, row, modified_attrs, any_updates, any_deletes);
}
if (!modified_attrs.empty()) {
auto serialized_map = modified_attrs.to_mut().serialize(*attrs_type());
row.cells().apply(attrs_column(*_schema), std::move(serialized_map));
}
// To allow creation of an item with no attributes, we need a row marker.
// Note that unlike Scylla, even an "update" operation needs to add a row
// marker. An update with only DELETE operations must not add a row marker
// (this was issue #5862) but any other update, even an empty one, should.
if (any_updates || !any_deletes) {
row.apply(row_marker(ts));
} else if (_returnvalues == returnvalues::ALL_NEW && !previous_item) {
// There was no pre-existing item, and we're not creating one, so
// don't report the new item in the returned Attributes.
_return_attributes = rjson::null_value();
}
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
_return_attributes = std::move(*previous_item);
}
// ReturnValues=UPDATED_OLD/NEW never return an empty Attributes field,
// even if a new item was created. Instead it should be missing entirely.
if (_returnvalues == returnvalues::UPDATED_OLD || _returnvalues == returnvalues::UPDATED_NEW) {
if (_return_attributes.MemberCount() == 0) {
_return_attributes = rjson::null_value();
}
}
return m;
}
future<executor::request_return_type> executor::update_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.update_item++;
auto start_time = std::chrono::steady_clock::now();
elogger.trace("update_item {}", request);
auto op = make_shared<update_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
if (!audit_info) {
// On LWT shard bounce, audit_info is already set on the originating shard.
// The reference captured in the bounce lambda points back to the original
// coroutine frame, which remains alive for the entire cross-shard call.
// Only reads of this pointer occur on the target shard — no writes or frees.
maybe_audit(audit_info, audit::statement_category::DML, op->schema()->ks_name(),
op->schema()->cf_name(), "UpdateItem", op->request(), db::consistency_level::LOCAL_QUORUM);
}
tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
const bool needs_read_before_write = _proxy.data_dictionary().get_config().alternator_force_read_before_write() || op->needs_read_before_write();
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
auto cas_shard = op->shard_for_execute(needs_read_before_write);
if (cas_shard && !cas_shard->this_shard()) {
_stats.api_operations.update_item--; // uncount on this shard, will be counted in other shard
_stats.shard_bounce_for_lwt++;
co_return co_await container().invoke_on(cas_shard->shard(), _ssg,
[request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit), &audit_info]
(executor& e) mutable {
return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt), &audit_info]
(service::client_state& client_state) mutable {
//FIXME: Instead of passing empty_service_permit() to the background operation,
// the current permit's lifetime should be prolonged, so that it's destructed
// only after all background operations are finished as well.
return e.update_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request), audit_info);
});
});
}
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(op->schema()));
per_table_stats->api_operations.update_item++;
uint64_t wcu_total = 0;
auto res = co_await op->execute(_proxy, std::move(cas_shard), client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
per_table_stats->operation_sizes.update_item_op_size_kb.add(bytes_to_kb_ceil(op->consumed_capacity()._total_bytes));
per_table_stats->wcu_total[stats::wcu_types::UPDATE_ITEM] += wcu_total;
_stats.wcu_total[stats::wcu_types::UPDATE_ITEM] += wcu_total;
per_table_stats->api_operations.update_item_latency.mark(std::chrono::steady_clock::now() - start_time);
_stats.api_operations.update_item_latency.mark(std::chrono::steady_clock::now() - start_time);
co_return res;
}
static void check_big_object(const rjson::value& val, int& size_left);
static void check_big_array(const rjson::value& val, int& size_left);
bool is_big(const rjson::value& val, int big_size) {
if (val.IsString()) {
return ssize_t(val.GetStringLength()) > big_size;
} else if (val.IsObject()) {
check_big_object(val, big_size);
return big_size < 0;
} else if (val.IsArray()) {
check_big_array(val, big_size);
return big_size < 0;
}
return false;
}
static void check_big_array(const rjson::value& val, int& size_left) {
// Assume a fixed size of 10 bytes for each number, boolean, etc., or
// beginning of a sub-object. This doesn't have to be accurate.
size_left -= 10 * val.Size();
for (const auto& v : val.GetArray()) {
if (size_left < 0) {
return;
}
// Note that we avoid recursive calls for the leaves (anything except
// array or object) because usually those greatly outnumber the trunk.
if (v.IsString()) {
size_left -= v.GetStringLength();
} else if (v.IsObject()) {
check_big_object(v, size_left);
} else if (v.IsArray()) {
check_big_array(v, size_left);
}
}
}
static void check_big_object(const rjson::value& val, int& size_left) {
size_left -= 10 * val.MemberCount();
for (const auto& m : val.GetObject()) {
if (size_left < 0) {
return;
}
size_left -= m.name.GetStringLength();
if (m.value.IsString()) {
size_left -= m.value.GetStringLength();
} else if (m.value.IsObject()) {
check_big_object(m.value, size_left);
} else if (m.value.IsArray()) {
check_big_array(m.value, size_left);
}
}
}
future<executor::request_return_type> executor::list_tables(client_state& client_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.list_tables++;
elogger.trace("Listing tables {}", request);
maybe_audit(audit_info, audit::statement_category::QUERY, "", "", "ListTables", request);
co_await utils::get_local_injector().inject("alternator_list_tables", [] (auto& handler) -> future<> {
handler.set("waiting", true);
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
});
rjson::value* exclusive_start_json = rjson::find(request, "ExclusiveStartTableName");
rjson::value* limit_json = rjson::find(request, "Limit");
std::string exclusive_start = exclusive_start_json ? rjson::to_string(*exclusive_start_json) : "";
int limit = limit_json ? limit_json->GetInt() : 100;
if (limit < 1 || limit > 100) {
co_return api_error::validation("Limit must be greater than 0 and no greater than 100");
}
auto tables = _proxy.data_dictionary().get_tables(); // hold on to temporary, table_names isn't a container, it's a view
auto table_names = tables
| std::views::filter([this] (data_dictionary::table t) {
return t.schema()->ks_name().find(KEYSPACE_NAME_PREFIX) == 0 &&
!t.schema()->is_view() &&
!cdc::is_log_for_some_table(_proxy.local_db(), t.schema()->ks_name(), t.schema()->cf_name());
})
| std::views::transform([] (data_dictionary::table t) {
return t.schema()->cf_name();
});
rjson::value response = rjson::empty_object();
rjson::add(response, "TableNames", rjson::empty_array());
rjson::value& all_tables = response["TableNames"];
//TODO(sarna): Dynamo doesn't declare any ordering when listing tables,
// but our implementation is vulnerable to changes, because the tables
// are stored in an unordered map. We may consider (partially) sorting
// the results before returning them to the client, especially if there
// is an implicit order of elements that Dynamo imposes.
auto table_names_it = [&table_names, &exclusive_start] {
if (!exclusive_start.empty()) {
auto it = std::ranges::find_if(table_names, [&exclusive_start] (const sstring& table_name) { return table_name == exclusive_start; });
return std::next(it, it != table_names.end());
} else {
return table_names.begin();
}
}();
while (limit > 0 && table_names_it != table_names.end()) {
rjson::push_back(all_tables, rjson::from_string(*table_names_it));
--limit;
++table_names_it;
}
if (table_names_it != table_names.end()) {
auto& last_table_name = *std::prev(all_tables.End());
rjson::add(response, "LastEvaluatedTableName", rjson::copy(last_table_name));
}
co_return rjson::print(std::move(response));
}
future<executor::request_return_type> executor::describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.describe_endpoints++;
maybe_audit(audit_info, audit::statement_category::QUERY, "", "", "DescribeEndpoints", request);
// The alternator_describe_endpoints configuration can be used to disable
// the DescribeEndpoints operation, or set it to return a fixed string
std::string override = _proxy.data_dictionary().get_config().alternator_describe_endpoints();
if (!override.empty()) {
if (override == "disabled") {
_stats.unsupported_operations++;
co_return api_error::unknown_operation(
"DescribeEndpoints disabled by configuration (alternator_describe_endpoints=disabled)");
}
host_header = std::move(override);
}
rjson::value response = rjson::empty_object();
// Without having any configuration parameter to say otherwise, we tell
// the user to return to the same endpoint they used to reach us. The only
// way we can know this is through the "Host:" header in the request,
// which typically exists (and in fact is mandatory in HTTP 1.1).
// A "Host:" header includes both host name and port, exactly what we need
// to return.
if (host_header.empty()) {
co_return api_error::validation("DescribeEndpoints needs a 'Host:' header in request");
}
rjson::add(response, "Endpoints", rjson::empty_array());
rjson::push_back(response["Endpoints"], rjson::empty_object());
rjson::add(response["Endpoints"][0], "Address", rjson::from_string(host_header));
rjson::add(response["Endpoints"][0], "CachePeriodInMinutes", rjson::value(1440));
co_return rjson::print(std::move(response));
}
static locator::replication_strategy_config_options get_network_topology_options(service::storage_proxy& sp, gms::gossiper& gossiper, int rf) {
locator::replication_strategy_config_options options;
for (const auto& dc : sp.get_token_metadata_ptr()->get_datacenter_racks_token_owners() | std::views::keys) {
options.emplace(dc, std::to_string(rf));
}
return options;
}
future<executor::request_return_type> executor::describe_continuous_backups(client_state& client_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
_stats.api_operations.describe_continuous_backups++;
// Unlike most operations which return ResourceNotFound when the given
// table doesn't exists, this operation returns a TableNoteFoundException.
// So we can't use the usual get_table() wrapper and need a bit more code:
std::string table_name = get_table_name(request);
sstring ks_name = sstring(executor::KEYSPACE_NAME_PREFIX) + table_name;
maybe_audit(audit_info, audit::statement_category::QUERY, ks_name, table_name, "DescribeContinuousBackups", request);
schema_ptr schema;
try {
schema = _proxy.data_dictionary().find_schema(ks_name, table_name);
} catch(data_dictionary::no_such_column_family&) {
// DynamoDB returns validation error even when table does not exist
// and the table name is invalid.
validate_table_name(table_name);
throw api_error::table_not_found(
fmt::format("Table {} not found", table_name));
}
rjson::value desc = rjson::empty_object();
rjson::add(desc, "ContinuousBackupsStatus", "DISABLED");
rjson::value pitr = rjson::empty_object();
rjson::add(pitr, "PointInTimeRecoveryStatus", "DISABLED");
rjson::add(desc, "PointInTimeRecoveryDescription", std::move(pitr));
rjson::value response = rjson::empty_object();
rjson::add(response, "ContinuousBackupsDescription", std::move(desc));
co_return rjson::print(std::move(response));
}
// Create the metadata for the keyspace in which we put the alternator
// table if it doesn't already exist.
// Currently, we automatically configure the keyspace based on the number
// of nodes in the cluster: A cluster with 3 or more live nodes, gets RF=3.
// A smaller cluster (presumably, a test only), gets RF=1. The user may
// manually create the keyspace to override this predefined behavior.
static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type ts,
const std::map<sstring, sstring>& tags_map, const gms::feature_service& feat, const db::tablets_mode_t::mode tablets_mode) {
// Whether to use tablets for the table (actually for the keyspace of the
// table) is determined by tablets_mode (taken from the configuration
// option "tablets_mode_for_new_keyspaces"), as well as the presence and
// the value of a per-table tag system:initial_tablets
// (INITIAL_TABLETS_TAG_KEY).
// Setting the tag with a numeric value will enable a specific initial number
// of tablets (setting the value to 0 means enabling tablets with
// an automatic selection of the best number of tablets).
// Setting this tag to any non-numeric value (e.g., an empty string or the
// word "none") will ask to disable tablets.
// When vnodes are asked for by the tag value, but tablets are enforced by config,
// throw an exception to the client.
std::optional<unsigned> initial_tablets;
if (feat.tablets) {
auto it = tags_map.find(INITIAL_TABLETS_TAG_KEY);
if (it != tags_map.end()) {
// Tag set. If it's a valid number, use it. If not - e.g., it's
// empty or a word like "none", disable tablets by setting
// initial_tablets to a disengaged optional.
try {
initial_tablets = std::stol(tags_map.at(INITIAL_TABLETS_TAG_KEY));
} catch (...) {
if (tablets_mode == db::tablets_mode_t::mode::enforced) {
throw api_error::validation(format("Tag {} containing non-numerical value requests vnodes, but vnodes are forbidden by configuration option `tablets_mode_for_new_keyspaces: enforced`", INITIAL_TABLETS_TAG_KEY));
}
initial_tablets = std::nullopt;
elogger.trace("Following {} tag containing non-numerical value, Alternator will attempt to create a keyspace {} with vnodes.", INITIAL_TABLETS_TAG_KEY, keyspace_name);
}
} else {
// No per-table tag present, use the value from config
if (tablets_mode == db::tablets_mode_t::mode::enabled || tablets_mode == db::tablets_mode_t::mode::enforced) {
initial_tablets = 0;
elogger.trace("Following the `tablets_mode_for_new_keyspaces` flag from the settings, Alternator will attempt to create a keyspace {} with tablets.", keyspace_name);
} else {
initial_tablets = std::nullopt;
elogger.trace("Following the `tablets_mode_for_new_keyspaces` flag from the settings, Alternator will attempt to create a keyspace {} with vnodes.", keyspace_name);
}
}
}
int endpoint_count = gossiper.num_endpoints();
int rf = 3;
if (endpoint_count < rf) {
rf = 1;
elogger.warn("Creating keyspace '{}' for Alternator with unsafe RF={} because cluster only has {} nodes.",
keyspace_name, rf, endpoint_count);
}
auto opts = get_network_topology_options(sp, gossiper, rf);
cql3::statements::ks_prop_defs props;
opts["class"] = sstring("NetworkTopologyStrategy");
props.add_property(cql3::statements::ks_prop_defs::KW_REPLICATION, opts);
std::map<sstring, sstring> tablet_opts;
if (initial_tablets) {
tablet_opts["initial"] = std::to_string(*initial_tablets);
}
tablet_opts["enabled"] = initial_tablets ? "true" : "false";
props.add_property(cql3::statements::ks_prop_defs::KW_TABLETS, std::move(tablet_opts));
props.validate();
return props.as_ks_metadata(sstring(keyspace_name), *sp.get_token_metadata_ptr(), feat, sp.local_db().get_config());
}
future<> executor::start() {
// Currently, nothing to do on initialization. We delay the keyspace
// creation (create_keyspace()) until a table is actually created.
return make_ready_future<>();
}
future<> executor::stop() {
co_await _describe_table_info_manager->stop();
// disconnect from the value source, but keep the value unchanged.
s_default_timeout_in_ms = utils::updateable_value<uint32_t>{s_default_timeout_in_ms()};
co_await _parsed_expression_cache->stop();
}
} // namespace alternator