mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-13 19:32:02 +00:00
Compare commits
1 Commits
SCYLLADB-1
...
fix-invali
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
45d5f9b827 |
@@ -4,8 +4,6 @@ on:
|
||||
milestone:
|
||||
types: [created, closed]
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
sync-milestone-to-jira:
|
||||
uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -36,6 +36,4 @@ compile_commands.json
|
||||
clang_build
|
||||
.idea/
|
||||
nuke
|
||||
rust/**/target
|
||||
rust/**/Cargo.lock
|
||||
test/resource/wasm/rust/target
|
||||
rust/target
|
||||
|
||||
@@ -299,7 +299,6 @@ target_sources(scylla-main
|
||||
serializer.cc
|
||||
service/direct_failure_detector/failure_detector.cc
|
||||
sstables_loader.cc
|
||||
sstables_loader_helpers.cc
|
||||
table_helper.cc
|
||||
tasks/task_handler.cc
|
||||
tasks/task_manager.cc
|
||||
|
||||
@@ -247,18 +247,6 @@ bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2, bool v1_from
|
||||
if (!v1) {
|
||||
return false;
|
||||
}
|
||||
if (!v1->IsObject() || v1->MemberCount() != 1) {
|
||||
if (v1_from_query) {
|
||||
throw api_error::serialization("CONTAINS operator encountered malformed AttributeValue");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (!v2.IsObject() || v2.MemberCount() != 1) {
|
||||
if (v2_from_query) {
|
||||
throw api_error::serialization("CONTAINS operator encountered malformed AttributeValue");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
const auto& kv1 = *v1->MemberBegin();
|
||||
const auto& kv2 = *v2.MemberBegin();
|
||||
if (kv1.name == "S" && kv2.name == "S") {
|
||||
@@ -277,17 +265,9 @@ bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2, bool v1_from
|
||||
}
|
||||
}
|
||||
} else if (kv1.name == "L") {
|
||||
if (!kv1.value.IsArray()) {
|
||||
if (v1_from_query) {
|
||||
throw api_error::serialization("CONTAINS operator received a malformed list");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
for (auto i = kv1.value.Begin(); i != kv1.value.End(); ++i) {
|
||||
if (!i->IsObject() || i->MemberCount() != 1) {
|
||||
if (v1_from_query) {
|
||||
throw api_error::serialization("CONTAINS operator received a list whose element is malformed");
|
||||
}
|
||||
clogger.error("check_CONTAINS received a list whose element is malformed");
|
||||
return false;
|
||||
}
|
||||
const auto& el = *i->MemberBegin();
|
||||
@@ -701,7 +681,7 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
|
||||
case parsed::primitive_condition::type::VALUE:
|
||||
if (calculated_values.size() != 1) {
|
||||
// Shouldn't happen unless we have a bug in the parser
|
||||
throw std::logic_error(format("Unexpected values {} in primitive_condition", cond._values.size()));
|
||||
throw std::logic_error(format("Unexpected values in primitive_condition", cond._values.size()));
|
||||
}
|
||||
// Unwrap the boolean wrapped as the value (if it is a boolean)
|
||||
if (calculated_values[0].IsObject() && calculated_values[0].MemberCount() == 1) {
|
||||
|
||||
@@ -38,7 +38,6 @@ controller::controller(
|
||||
sharded<auth::service>& auth_service,
|
||||
sharded<qos::service_level_controller>& sl_controller,
|
||||
sharded<vector_search::vector_store_client>& vsc,
|
||||
sharded<updateable_timeout_config>& timeout_config,
|
||||
const db::config& config,
|
||||
seastar::scheduling_group sg)
|
||||
: protocol_server(sg)
|
||||
@@ -53,7 +52,6 @@ controller::controller(
|
||||
, _auth_service(auth_service)
|
||||
, _sl_controller(sl_controller)
|
||||
, _vsc(vsc)
|
||||
, _timeout_config(timeout_config)
|
||||
, _config(config)
|
||||
{
|
||||
}
|
||||
@@ -101,7 +99,7 @@ future<> controller::start_server() {
|
||||
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_ss), std::ref(_mm), std::ref(_sys_dist_ks), std::ref(_sys_ks),
|
||||
sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), std::ref(_vsc), _ssg.value(),
|
||||
sharded_parameter(get_timeout_in_ms, std::ref(_config))).get();
|
||||
_server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper), std::ref(_auth_service), std::ref(_sl_controller), std::ref(_timeout_config)).get();
|
||||
_server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper), std::ref(_auth_service), std::ref(_sl_controller)).get();
|
||||
// Note: from this point on, if start_server() throws for any reason,
|
||||
// it must first call stop_server() to stop the executor and server
|
||||
// services we just started - or Scylla will cause an assertion
|
||||
|
||||
@@ -48,8 +48,6 @@ namespace vector_search {
|
||||
class vector_store_client;
|
||||
}
|
||||
|
||||
class updateable_timeout_config;
|
||||
|
||||
namespace alternator {
|
||||
|
||||
// This is the official DynamoDB API version.
|
||||
@@ -74,7 +72,6 @@ class controller : public protocol_server {
|
||||
sharded<auth::service>& _auth_service;
|
||||
sharded<qos::service_level_controller>& _sl_controller;
|
||||
sharded<vector_search::vector_store_client>& _vsc;
|
||||
sharded<updateable_timeout_config>& _timeout_config;
|
||||
const db::config& _config;
|
||||
|
||||
std::vector<socket_address> _listen_addresses;
|
||||
@@ -95,7 +92,6 @@ public:
|
||||
sharded<auth::service>& auth_service,
|
||||
sharded<qos::service_level_controller>& sl_controller,
|
||||
sharded<vector_search::vector_store_client>& vsc,
|
||||
sharded<updateable_timeout_config>& timeout_config,
|
||||
const db::config& config,
|
||||
seastar::scheduling_group sg);
|
||||
|
||||
|
||||
@@ -1362,33 +1362,6 @@ static int get_dimensions(const rjson::value& vector_attribute, std::string_view
|
||||
return dimensions_v->GetInt();
|
||||
}
|
||||
|
||||
// As noted in issue #5052, in Alternator the CreateTable and UpdateTable are
|
||||
// currently synchronous - they return only after the operation is complete.
|
||||
// After announce() of the new schema finished, the schema change is committed
|
||||
// and a majority of nodes know it - but it's possible that some live nodes
|
||||
// have not yet applied the new schema. If we return to the user now, and the
|
||||
// user sends a node request that relies on the new schema, it might fail.
|
||||
// So before returning, we must verify that *all* nodes have applied the new
|
||||
// schema. This is what wait_for_schema_agreement_after_ddl() does.
|
||||
//
|
||||
// Note that wait_for_schema_agreement_after_ddl() has a timeout (currently
|
||||
// hard-coded to 30 seconds). If the timeout is reached an InternalServerError
|
||||
// is returned. The user, who doesn't know if the CreateTable succeeded or not,
|
||||
// can retry the request and will get a ResourceInUseException and know the
|
||||
// table already exists. So a CreateTable that returns a ResourceInUseException
|
||||
// should also call wait_for_schema_agreement_after_ddl().
|
||||
//
|
||||
// When issue #5052 is resolved, this function can be removed - we will need
|
||||
// to check if we reached schema agreement, but not to *wait* for it.
|
||||
static future<> wait_for_schema_agreement_after_ddl(service::migration_manager& mm, const replica::database& db) {
|
||||
static constexpr auto schema_agreement_seconds = 30;
|
||||
try {
|
||||
co_await mm.wait_for_schema_agreement(db, db::timeout_clock::now() + std::chrono::seconds(schema_agreement_seconds), nullptr);
|
||||
} catch (const service::migration_manager::schema_agreement_timeout&) {
|
||||
throw api_error::internal(fmt::format("The operation was successful, but unable to confirm cluster-wide schema agreement after {} seconds. Please retry the operation, and wait for the retry to report an error since the operation was already done.", schema_agreement_seconds));
|
||||
}
|
||||
}
|
||||
|
||||
future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization,
|
||||
const db::tablets_mode_t::mode tablets_mode, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
|
||||
throwing_assert(this_shard_id() == 0);
|
||||
@@ -1722,26 +1695,13 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
|
||||
}
|
||||
}
|
||||
}
|
||||
bool table_already_exists = false;
|
||||
try {
|
||||
schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
|
||||
} catch (exceptions::already_exists_exception&) {
|
||||
if (_proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
|
||||
table_already_exists = true;
|
||||
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
|
||||
}
|
||||
}
|
||||
if (table_already_exists) {
|
||||
// The user may have retried a CreateTable operation after it timed
|
||||
// out in wait_for_schema_agreement_after_ddl(). So before we may
|
||||
// return ResourceInUseException (which can lead the user to start
|
||||
// using the table which it now knows exists), we need to wait for
|
||||
// schema agreement, just like the original CreateTable did. Again
|
||||
// we fail with InternalServerError if schema agreement still cannot
|
||||
// be reached. We can release group0_guard before waiting.
|
||||
release_guard(std::move(group0_guard));
|
||||
co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
|
||||
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
|
||||
}
|
||||
if (_proxy.data_dictionary().try_find_table(schema->id())) {
|
||||
// This should never happen, the ID is supposed to be unique
|
||||
co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
|
||||
@@ -1790,7 +1750,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
|
||||
}
|
||||
}
|
||||
|
||||
co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
|
||||
co_await _mm.wait_for_schema_agreement(_proxy.local_db(), db::timeout_clock::now() + 10s, nullptr);
|
||||
rjson::value status = rjson::empty_object();
|
||||
executor::supplement_table_info(request, *schema, _proxy);
|
||||
rjson::add(status, "TableDescription", std::move(request));
|
||||
@@ -1900,7 +1860,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
|
||||
if (stream_specification && stream_specification->IsObject()) {
|
||||
empty_request = false;
|
||||
if (add_stream_options(*stream_specification, builder, p.local(), tab->cdc_options())) {
|
||||
if (add_stream_options(*stream_specification, builder, p.local())) {
|
||||
validate_cdc_log_name_length(builder.cf_name());
|
||||
// On tablet tables, defer stream enablement and block
|
||||
// tablet merges (see defer_enabling_streams_block_tablet_merges).
|
||||
@@ -1915,23 +1875,6 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
if (tab->cdc_options().enabled() || tab->cdc_options().enable_requested()) {
|
||||
co_return api_error::validation("Table already has an enabled stream: TableName: " + tab->cf_name());
|
||||
}
|
||||
// When re-enabling streams on an Alternator table, drop the old
|
||||
// CDC log table first as a separate schema change, so the
|
||||
// subsequent UpdateTable creates a fresh one with a new UUID
|
||||
// (= new StreamArn). See #7239.
|
||||
auto logname = cdc::log_name(tab->cf_name());
|
||||
auto& local_db = p.local().local_db();
|
||||
if (local_db.has_schema(tab->ks_name(), logname)
|
||||
&& cdc::is_log_schema(*local_db.find_schema(tab->ks_name(), logname))) {
|
||||
auto drop_m = co_await service::prepare_column_family_drop_announcement(
|
||||
p.local(), tab->ks_name(), logname,
|
||||
group0_guard.write_timestamp());
|
||||
co_await mm.announce(std::move(drop_m), std::move(group0_guard),
|
||||
format("alternator-executor: drop old CDC log for {}", tab->cf_name()));
|
||||
co_await mm.wait_for_schema_agreement(
|
||||
p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if (!tab->cdc_options().enabled() && !tab->cdc_options().enable_requested()) {
|
||||
co_return api_error::validation("Table has no stream to disable: TableName: " + tab->cf_name());
|
||||
@@ -2246,7 +2189,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
throw;
|
||||
}
|
||||
}
|
||||
co_await wait_for_schema_agreement_after_ddl(mm, p.local().local_db());
|
||||
co_await mm.wait_for_schema_agreement(p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
|
||||
|
||||
rjson::value status = rjson::empty_object();
|
||||
supplement_table_info(request, *schema, p.local());
|
||||
|
||||
@@ -30,7 +30,6 @@
|
||||
#include "utils/updateable_value.hh"
|
||||
|
||||
#include "tracing/trace_state.hh"
|
||||
#include "cdc/cdc_options.hh"
|
||||
|
||||
|
||||
namespace db {
|
||||
@@ -200,7 +199,7 @@ private:
|
||||
tracing::trace_state_ptr trace_state, service_permit permit);
|
||||
|
||||
public:
|
||||
static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp, const cdc::options& existing_cdc_opts = {});
|
||||
static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
|
||||
static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
|
||||
static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp);
|
||||
};
|
||||
|
||||
@@ -485,7 +485,7 @@ std::optional<bytes> unwrap_bytes(const rjson::value& value, bool from_query) {
|
||||
return rjson::base64_decode(value);
|
||||
} catch (...) {
|
||||
if (from_query) {
|
||||
throw api_error::serialization("Invalid base64 data");
|
||||
throw api_error::serialization(format("Invalid base64 data"));
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
@@ -835,7 +835,7 @@ void server::set_routes(routes& r) {
|
||||
//FIXME: A way to immediately invalidate the cache should be considered,
|
||||
// e.g. when the system table which stores the keys is changed.
|
||||
// For now, this propagation may take up to 1 minute.
|
||||
server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& auth_service, qos::service_level_controller& sl_controller, updateable_timeout_config& timeout_config)
|
||||
server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& auth_service, qos::service_level_controller& sl_controller)
|
||||
: _http_server("http-alternator")
|
||||
, _https_server("https-alternator")
|
||||
, _executor(exec)
|
||||
@@ -847,7 +847,7 @@ server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gos
|
||||
, _max_users_query_size_in_trace_output(1024)
|
||||
, _enabled_servers{}
|
||||
, _pending_requests("alternator::server::pending_requests")
|
||||
, _timeout_config(timeout_config)
|
||||
, _timeout_config(_proxy.data_dictionary().get_config())
|
||||
, _callbacks{
|
||||
{"CreateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
|
||||
return e.create_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request), audit_info);
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
#include <seastar/net/tls.hh>
|
||||
#include <optional>
|
||||
#include "alternator/auth.hh"
|
||||
#include "timeout_config.hh"
|
||||
#include "service/qos/service_level_controller.hh"
|
||||
#include "utils/small_vector.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
@@ -54,8 +53,8 @@ class server : public peering_sharded_service<server> {
|
||||
named_gate _pending_requests;
|
||||
// In some places we will need a CQL updateable_timeout_config object even
|
||||
// though it isn't really relevant for Alternator which defines its own
|
||||
// timeouts separately.
|
||||
updateable_timeout_config& _timeout_config;
|
||||
// timeouts separately. We can create this object only once.
|
||||
updateable_timeout_config _timeout_config;
|
||||
client_options_cache_type _connection_options_keys_and_values;
|
||||
|
||||
alternator_callbacks_map _callbacks;
|
||||
@@ -99,7 +98,7 @@ class server : public peering_sharded_service<server> {
|
||||
utils::scoped_item_list<ongoing_request> _ongoing_requests;
|
||||
|
||||
public:
|
||||
server(executor& executor, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& service, qos::service_level_controller& sl_controller, updateable_timeout_config& timeout_config);
|
||||
server(executor& executor, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& service, qos::service_level_controller& sl_controller);
|
||||
|
||||
future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port,
|
||||
std::optional<uint16_t> port_proxy_protocol, std::optional<uint16_t> https_port_proxy_protocol,
|
||||
|
||||
@@ -243,10 +243,7 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
|
||||
if (!is_alternator_keyspace(ks_name)) {
|
||||
continue;
|
||||
}
|
||||
// Use get_base_table instead of is_log_for_some_table because the
|
||||
// latter requires CDC to be enabled, but we want to list streams
|
||||
// that have been disabled but whose log table still exists (#7239).
|
||||
if (cdc::get_base_table(db.real_database(), ks_name, cf_name)) {
|
||||
if (cdc::is_log_for_some_table(db.real_database(), ks_name, cf_name)) {
|
||||
rjson::value new_entry = rjson::empty_object();
|
||||
|
||||
auto arn = stream_arn{ i->schema(), cdc::get_base_table(db.real_database(), *i->schema()) };
|
||||
@@ -395,7 +392,7 @@ std::istream& operator>>(std::istream& is, stream_view_type& type) {
|
||||
return is;
|
||||
}
|
||||
|
||||
static stream_view_type cdc_options_to_stream_view_type(const cdc::options& opts) {
|
||||
static stream_view_type cdc_options_to_steam_view_type(const cdc::options& opts) {
|
||||
stream_view_type type = stream_view_type::KEYS_ONLY;
|
||||
if (opts.preimage() && opts.postimage()) {
|
||||
type = stream_view_type::NEW_AND_OLD_IMAGES;
|
||||
@@ -841,7 +838,6 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
auto& opts = bs->cdc_options();
|
||||
|
||||
auto status = "DISABLED";
|
||||
bool stream_disabled = !opts.enabled();
|
||||
|
||||
if (opts.enabled()) {
|
||||
if (!_cdc_metadata.streams_available()) {
|
||||
@@ -857,7 +853,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
|
||||
rjson::add(stream_desc, "StreamStatus", rjson::from_string(status));
|
||||
|
||||
stream_view_type type = cdc_options_to_stream_view_type(opts);
|
||||
stream_view_type type = cdc_options_to_steam_view_type(opts);
|
||||
|
||||
rjson::add(stream_desc, "StreamArn", stream_arn);
|
||||
rjson::add(stream_desc, "StreamViewType", type);
|
||||
@@ -865,9 +861,10 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
|
||||
describe_key_schema(stream_desc, *bs);
|
||||
|
||||
// For disabled streams, we still fall through to enumerate shards
|
||||
// below. All shards will have EndingSequenceNumber set, indicating
|
||||
// they are closed. See issue #7239.
|
||||
if (!opts.enabled()) {
|
||||
rjson::add(ret, "StreamDescription", std::move(stream_desc));
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
// TODO: label
|
||||
// TODO: creation time
|
||||
@@ -950,12 +947,6 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
auto expired = [&]() -> std::optional<db_clock::time_point> {
|
||||
auto j = std::next(i);
|
||||
if (j == e) {
|
||||
// For a disabled stream, all shards are closed (#7239).
|
||||
// Use "now" as the ending sequence number for the last
|
||||
// generation's shards.
|
||||
if (stream_disabled) {
|
||||
return db_clock::now();
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
// add this so we sort of match potential
|
||||
@@ -1306,7 +1297,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
|
||||
| std::ranges::to<query::column_id_vector>()
|
||||
;
|
||||
|
||||
stream_view_type type = cdc_options_to_stream_view_type(base->cdc_options());
|
||||
stream_view_type type = cdc_options_to_steam_view_type(base->cdc_options());
|
||||
|
||||
auto selection = cql3::selection::selection::for_columns(schema, std::move(columns));
|
||||
auto partition_slice = query::partition_slice(
|
||||
@@ -1490,17 +1481,17 @@ future<executor::request_return_type> executor::get_records(client_state& client
|
||||
|
||||
auto& shard = iter.shard;
|
||||
|
||||
if (!base->cdc_options().enabled()) {
|
||||
// Stream is disabled -- all shards are closed (#7239).
|
||||
// Don't return NextShardIterator.
|
||||
} else if (shard.time < ts && ts < high_ts) {
|
||||
if (shard.time < ts && ts < high_ts) {
|
||||
// The DynamoDB documentation states that when a shard is
|
||||
// closed, reading it until the end has NextShardIterator
|
||||
// "set to null". Our test test_streams_closed_read
|
||||
// confirms that by "null" they meant not set at all.
|
||||
} else {
|
||||
// Shard is still open with no records in the scanned window.
|
||||
// Return the original iterator so the client can poll again.
|
||||
// We could have return the same iterator again, but we did
|
||||
// a search from it until high_ts and found nothing, so we
|
||||
// can also start the next search from high_ts.
|
||||
// TODO: but why? It's simpler just to leave the iterator be.
|
||||
shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
|
||||
rjson::add(ret, "NextShardIterator", iter);
|
||||
}
|
||||
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
@@ -1510,13 +1501,17 @@ future<executor::request_return_type> executor::get_records(client_state& client
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp, const cdc::options& existing_cdc_opts) {
|
||||
bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
|
||||
auto stream_enabled = rjson::find(stream_specification, "StreamEnabled");
|
||||
if (!stream_enabled || !stream_enabled->IsBool()) {
|
||||
throw api_error::validation("StreamSpecification needs boolean StreamEnabled");
|
||||
}
|
||||
|
||||
if (stream_enabled->GetBool()) {
|
||||
if (!sp.features().alternator_streams) {
|
||||
throw api_error::validation("StreamSpecification: alternator streams feature not enabled in cluster.");
|
||||
}
|
||||
|
||||
cdc::options opts;
|
||||
opts.enabled(true);
|
||||
opts.tablet_merge_blocked(true);
|
||||
@@ -1542,13 +1537,8 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
|
||||
builder.with_cdc_options(opts);
|
||||
return true;
|
||||
} else {
|
||||
// When disabling, preserve the existing CDC options (preimage,
|
||||
// postimage, ttl, etc.) so that DescribeStream can still report
|
||||
// the correct StreamViewType on a disabled stream.
|
||||
cdc::options opts = existing_cdc_opts;
|
||||
cdc::options opts;
|
||||
opts.enabled(false);
|
||||
opts.enable_requested(false);
|
||||
opts.tablet_merge_blocked(false);
|
||||
builder.with_cdc_options(opts);
|
||||
return false;
|
||||
}
|
||||
@@ -1556,36 +1546,33 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
|
||||
|
||||
void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp) {
|
||||
auto& opts = schema.cdc_options();
|
||||
// Report stream info when:
|
||||
// 1. Log table exists (covers both enabled and disabled-but-readable).
|
||||
// 2. enable_requested (ENABLING state, log not yet created).
|
||||
auto db = sp.data_dictionary();
|
||||
auto log_name = cdc::log_name(schema.cf_name());
|
||||
auto log_cf = db.try_find_table(schema.ks_name(), log_name);
|
||||
if (log_cf) {
|
||||
auto log_schema = log_cf->schema();
|
||||
stream_arn arn(log_schema, cdc::get_base_table(db.real_database(), *log_schema));
|
||||
if (opts.enabled()) {
|
||||
auto db = sp.data_dictionary();
|
||||
auto cf = db.find_table(schema.ks_name(), cdc::log_name(schema.cf_name()));
|
||||
stream_arn arn(cf.schema(), cdc::get_base_table(db.real_database(), *cf.schema()));
|
||||
rjson::add(descr, "LatestStreamArn", arn);
|
||||
rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*log_schema)));
|
||||
|
||||
auto stream_desc = rjson::empty_object();
|
||||
rjson::add(stream_desc, "StreamEnabled", opts.enabled());
|
||||
|
||||
stream_view_type mode = cdc_options_to_stream_view_type(opts);
|
||||
rjson::add(stream_desc, "StreamViewType", mode);
|
||||
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
|
||||
} else if (opts.enable_requested()) {
|
||||
// DynamoDB returns StreamEnabled=true in StreamSpecification even when
|
||||
// the stream status is ENABLING (not yet fully active). We mirror this
|
||||
// behavior: enable_requested means the user asked for streams but CDC
|
||||
// is not yet finalized, so we still report StreamEnabled=true.
|
||||
auto stream_desc = rjson::empty_object();
|
||||
rjson::add(stream_desc, "StreamEnabled", true);
|
||||
|
||||
stream_view_type mode = cdc_options_to_stream_view_type(opts);
|
||||
rjson::add(stream_desc, "StreamViewType", mode);
|
||||
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
|
||||
rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*cf.schema())));
|
||||
} else if (!opts.enable_requested()) {
|
||||
return;
|
||||
}
|
||||
// For both enabled() and enable_requested():
|
||||
// DynamoDB returns StreamEnabled=true in StreamSpecification even when
|
||||
// the stream status is ENABLING (not yet fully active). We mirror this
|
||||
// behavior: enable_requested means the user asked for streams but CDC
|
||||
// is not yet finalized, so we still report StreamEnabled=true.
|
||||
auto stream_desc = rjson::empty_object();
|
||||
rjson::add(stream_desc, "StreamEnabled", true);
|
||||
|
||||
auto mode = stream_view_type::KEYS_ONLY;
|
||||
if (opts.preimage() && opts.postimage()) {
|
||||
mode = stream_view_type::NEW_AND_OLD_IMAGES;
|
||||
} else if (opts.preimage()) {
|
||||
mode = stream_view_type::OLD_IMAGE;
|
||||
} else if (opts.postimage()) {
|
||||
mode = stream_view_type::NEW_IMAGE;
|
||||
}
|
||||
rjson::add(stream_desc, "StreamViewType", mode);
|
||||
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
|
||||
}
|
||||
|
||||
} // namespace alternator
|
||||
|
||||
@@ -974,54 +974,6 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/tablets/restore",
|
||||
"operations":[
|
||||
{
|
||||
"method":"POST",
|
||||
"summary":"Starts copying SSTables from a designated bucket in object storage to a specified keyspace",
|
||||
"type":"string",
|
||||
"nickname":"tablet_aware_restore",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"keyspace",
|
||||
"description":"Name of a keyspace to copy SSTables to",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
},
|
||||
{
|
||||
"name":"table",
|
||||
"description":"Name of a table to copy SSTables to",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
},
|
||||
{
|
||||
"name":"snapshot",
|
||||
"description":"Name of the snapshot to restore from",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
},
|
||||
{
|
||||
"name":"backup_location",
|
||||
"description":"JSON array of backup location objects. Each object must contain: 'datacenter' (string), 'endpoint' (string), 'bucket' (string), and 'manifests' (array of strings). Currently, the array must contain exactly one entry.",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"array",
|
||||
"paramType":"body"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/keyspace_compaction/{keyspace}",
|
||||
"operations":[
|
||||
|
||||
@@ -527,56 +527,11 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
|
||||
co_return json::json_return_type(fmt::to_string(task_id));
|
||||
});
|
||||
|
||||
ss::tablet_aware_restore.set(r, [&ctx, &sst_loader](std::unique_ptr<http::request> req) -> future<json_return_type> {
|
||||
std::string keyspace = req->get_query_param("keyspace");
|
||||
std::string table = req->get_query_param("table");
|
||||
std::string snapshot = req->get_query_param("snapshot");
|
||||
|
||||
rjson::chunked_content content = co_await util::read_entire_stream(*req->content_stream);
|
||||
rjson::value parsed = rjson::parse(std::move(content));
|
||||
if (!parsed.IsArray()) {
|
||||
throw httpd::bad_param_exception("backup locations (in body) must be a JSON array");
|
||||
}
|
||||
|
||||
const auto& locations = parsed.GetArray();
|
||||
if (locations.Size() != 1) {
|
||||
throw httpd::bad_param_exception("backup locations array (in body) must contain exactly one entry");
|
||||
}
|
||||
|
||||
const auto& location = locations[0];
|
||||
if (!location.IsObject()) {
|
||||
throw httpd::bad_param_exception("backup location (in body) must be a JSON object");
|
||||
}
|
||||
|
||||
auto endpoint = rjson::to_string_view(location["endpoint"]);
|
||||
auto bucket = rjson::to_string_view(location["bucket"]);
|
||||
auto dc = rjson::to_string_view(location["datacenter"]);
|
||||
|
||||
if (!location.HasMember("manifests") || !location["manifests"].IsArray()) {
|
||||
throw httpd::bad_param_exception("backup location entry must have 'manifests' array");
|
||||
}
|
||||
|
||||
auto manifests = location["manifests"].GetArray() |
|
||||
std::views::transform([] (const auto& m) { return sstring(rjson::to_string_view(m)); }) |
|
||||
std::ranges::to<utils::chunked_vector<sstring>>();
|
||||
|
||||
if (manifests.empty()) {
|
||||
throw httpd::bad_param_exception("backup location 'manifests' array must not be empty");
|
||||
}
|
||||
|
||||
apilog.info("Tablet restore for {}:{} called. Parameters: snapshot={} datacenter={} endpoint={} bucket={} manifests_count={}",
|
||||
keyspace, table, snapshot, dc, endpoint, bucket, manifests.size());
|
||||
|
||||
auto table_id = validate_table(ctx.db.local(), keyspace, table);
|
||||
auto task_id = co_await sst_loader.local().restore_tablets(table_id, keyspace, table, snapshot, sstring(endpoint), sstring(bucket), std::move(manifests));
|
||||
co_return json::json_return_type(fmt::to_string(task_id));
|
||||
});
|
||||
}
|
||||
|
||||
void unset_sstables_loader(http_context& ctx, routes& r) {
|
||||
ss::load_new_ss_tables.unset(r);
|
||||
ss::start_restore.unset(r);
|
||||
ss::tablet_aware_restore.unset(r);
|
||||
}
|
||||
|
||||
void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g) {
|
||||
|
||||
@@ -194,36 +194,22 @@ future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token
|
||||
std::move(audited_keyspaces),
|
||||
std::move(audited_tables),
|
||||
std::move(audited_categories),
|
||||
std::cref(cfg));
|
||||
}
|
||||
|
||||
future<> audit::start_storage(const db::config& cfg) {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
|
||||
return local_audit._storage_helper_ptr->start(cfg).then([&local_audit] {
|
||||
local_audit._storage_running = true;
|
||||
std::cref(cfg))
|
||||
.then([&cfg] {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
|
||||
return local_audit.start(cfg);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> audit::stop_storage() {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit_instance().invoke_on_all([] (audit& local_audit) {
|
||||
local_audit._storage_running = false;
|
||||
return local_audit._storage_helper_ptr->stop();
|
||||
});
|
||||
}
|
||||
|
||||
future<> audit::stop_audit() {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit::audit::audit_instance().invoke_on_all([] (auto& local_audit) {
|
||||
SCYLLA_ASSERT(!local_audit._storage_running);
|
||||
return local_audit.shutdown();
|
||||
}).then([] {
|
||||
return audit::audit::audit_instance().stop();
|
||||
@@ -237,6 +223,14 @@ audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& k
|
||||
return std::make_unique<audit_info>(cat, keyspace, table, batch);
|
||||
}
|
||||
|
||||
future<> audit::start(const db::config& cfg) {
|
||||
return _storage_helper_ptr->start(cfg);
|
||||
}
|
||||
|
||||
future<> audit::stop() {
|
||||
return _storage_helper_ptr->stop();
|
||||
}
|
||||
|
||||
future<> audit::shutdown() {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
@@ -247,12 +241,6 @@ future<> audit::log(const audit_info& audit_info, const service::client_state& c
|
||||
const sstring& username = client_state.user() ? client_state.user()->name.value_or(anonymous_username) : no_username;
|
||||
socket_address client_ip = client_state.get_client_address().addr();
|
||||
socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
|
||||
if (!_storage_running) {
|
||||
on_internal_error_noexcept(logger, fmt::format("Audit log dropped (storage not ready): node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
|
||||
node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
|
||||
audit_info.query(), client_ip, audit_info.table(), username));
|
||||
return make_ready_future<>();
|
||||
}
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
logger.debug("Log written: node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
|
||||
node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
|
||||
@@ -298,11 +286,6 @@ future<> inspect(const audit_info_alternator& ai, const service::client_state& c
|
||||
|
||||
future<> audit::log_login(const sstring& username, socket_address client_ip, bool error) noexcept {
|
||||
socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
|
||||
if (!_storage_running) {
|
||||
on_internal_error_noexcept(logger, fmt::format("Audit login log dropped (storage not ready): node_ip {} client_ip {} username {} error {}",
|
||||
node_ip, client_ip, username, error ? "true" : "false"));
|
||||
return make_ready_future<>();
|
||||
}
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
logger.debug("Login log written: node_ip {}, client_ip {}, username {}, error {}",
|
||||
node_ip, client_ip, username, error ? "true" : "false");
|
||||
|
||||
@@ -141,7 +141,6 @@ private:
|
||||
category_set _audited_categories;
|
||||
|
||||
std::unique_ptr<storage_helper> _storage_helper_ptr;
|
||||
bool _storage_running = false;
|
||||
|
||||
const db::config& _cfg;
|
||||
utils::observer<sstring> _cfg_keyspaces_observer;
|
||||
@@ -164,8 +163,6 @@ public:
|
||||
return audit_instance().local();
|
||||
}
|
||||
static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
|
||||
static future<> start_storage(const db::config& cfg);
|
||||
static future<> stop_storage();
|
||||
static future<> stop_audit();
|
||||
static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
|
||||
audit(locator::shared_token_metadata& stm,
|
||||
@@ -177,6 +174,8 @@ public:
|
||||
category_set&& audited_categories,
|
||||
const db::config& cfg);
|
||||
~audit();
|
||||
future<> start(const db::config& cfg);
|
||||
future<> stop();
|
||||
future<> shutdown();
|
||||
bool should_log(const audit_info& audit_info) const;
|
||||
bool will_log(statement_category cat, std::string_view keyspace = {}, std::string_view table = {}) const;
|
||||
|
||||
@@ -185,14 +185,24 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
|
||||
static const sstring q = format("SELECT role, name, value FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, ROLE_ATTRIBUTES_CF);
|
||||
auto rs = co_await fetch(q);
|
||||
for (const auto& r : *rs) {
|
||||
if (!r.has("value")) {
|
||||
continue;
|
||||
}
|
||||
rec->attributes[r.get_as<sstring>("name")] =
|
||||
r.get_as<sstring>("value");
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
}
|
||||
// permissions
|
||||
{
|
||||
static const sstring q = format("SELECT role, resource, permissions FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, PERMISSIONS_CF);
|
||||
auto rs = co_await fetch(q);
|
||||
for (const auto& r : *rs) {
|
||||
auto resource = r.get_as<sstring>("resource");
|
||||
auto perms_strings = r.get_set<sstring>("permissions");
|
||||
std::unordered_set<sstring> perms_set(perms_strings.begin(), perms_strings.end());
|
||||
auto pset = permissions::from_strings(perms_set);
|
||||
rec->permissions[std::move(resource)] = std::move(pset);
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
}
|
||||
co_return rec;
|
||||
}
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@ public:
|
||||
std::unordered_set<role_name_t> members;
|
||||
sstring salted_hash;
|
||||
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
|
||||
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
|
||||
private:
|
||||
friend cache;
|
||||
// cached permissions include effects of role's inheritance
|
||||
|
||||
@@ -76,11 +76,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
|
||||
if (results->empty()) {
|
||||
co_return permissions::NONE;
|
||||
}
|
||||
const auto& row = results->one();
|
||||
if (!row.has(PERMISSIONS_NAME)) {
|
||||
co_return permissions::NONE;
|
||||
}
|
||||
co_return permissions::from_strings(row.get_set<sstring>(PERMISSIONS_NAME));
|
||||
co_return permissions::from_strings(results->one().get_set<sstring>(PERMISSIONS_NAME));
|
||||
}
|
||||
|
||||
future<>
|
||||
|
||||
@@ -258,11 +258,13 @@ future<> ldap_role_manager::start() {
|
||||
} catch (const seastar::sleep_aborted&) {
|
||||
co_return; // ignore
|
||||
}
|
||||
try {
|
||||
co_await _cache.reload_all_permissions();
|
||||
} catch (...) {
|
||||
mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
|
||||
}
|
||||
co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
|
||||
try {
|
||||
co_await c.reload_all_permissions();
|
||||
} catch (...) {
|
||||
mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
return _std_mgr.start();
|
||||
|
||||
@@ -157,20 +157,6 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
|
||||
return create_legacy_keyspace_if_missing(mm);
|
||||
});
|
||||
}
|
||||
// Authorizer must be started before the permission loader is set,
|
||||
// because the loader calls _authorizer->authorize().
|
||||
// The loader must be set before starting the role manager, because
|
||||
// LDAP role manager starts a pruner fiber that calls
|
||||
// reload_all_permissions() which asserts _permission_loader is set.
|
||||
co_await _authorizer->start();
|
||||
if (!_used_by_maintenance_socket) {
|
||||
// Maintenance socket mode can't cache permissions because it has
|
||||
// different authorizer. We can't mix cached permissions, they could be
|
||||
// different in normal mode.
|
||||
_cache.set_permission_loader(std::bind(
|
||||
&service::get_uncached_permissions,
|
||||
this, std::placeholders::_1, std::placeholders::_2));
|
||||
}
|
||||
co_await _role_manager->start();
|
||||
if (this_shard_id() == 0) {
|
||||
// Role manager and password authenticator have this odd startup
|
||||
@@ -179,19 +165,21 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
|
||||
// creation therefore we need to wait here.
|
||||
co_await _role_manager->ensure_superuser_is_created();
|
||||
}
|
||||
// Authenticator must be started after ensure_superuser_is_created()
|
||||
// because password_authenticator queries system.roles for the
|
||||
// superuser entry created by the role manager.
|
||||
co_await _authenticator->start();
|
||||
co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
|
||||
if (!_used_by_maintenance_socket) {
|
||||
// Maintenance socket mode can't cache permissions because it has
|
||||
// different authorizer. We can't mix cached permissions, they could be
|
||||
// different in normal mode.
|
||||
_cache.set_permission_loader(std::bind(
|
||||
&service::get_uncached_permissions,
|
||||
this, std::placeholders::_1, std::placeholders::_2));
|
||||
}
|
||||
}
|
||||
|
||||
future<> service::stop() {
|
||||
_as.request_abort();
|
||||
// Reverse of start() order.
|
||||
co_await _authenticator->stop();
|
||||
co_await _role_manager->stop();
|
||||
_cache.set_permission_loader(nullptr);
|
||||
co_await _authorizer->stop();
|
||||
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
|
||||
}
|
||||
|
||||
future<> service::ensure_superuser_is_created() {
|
||||
|
||||
@@ -267,7 +267,7 @@ struct extract_row_visitor {
|
||||
visit_collection(v);
|
||||
},
|
||||
[&] (const abstract_type& o) {
|
||||
throw std::runtime_error(format("extract_changes: unknown collection type: {}", o.name()));
|
||||
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
|
||||
}
|
||||
));
|
||||
}
|
||||
|
||||
@@ -137,24 +137,6 @@ endfunction()
|
||||
|
||||
option(Scylla_WITH_DEBUG_INFO "Enable debug info" OFF)
|
||||
|
||||
# Time trace profiling: adds -ftime-trace to all C++ compilations (Clang only).
|
||||
# Each .o produces a companion .json file in the build directory that can be
|
||||
# analyzed with ClangBuildAnalyzer or loaded in chrome://tracing.
|
||||
#
|
||||
# Usage:
|
||||
# cmake -DScylla_TIME_TRACE=ON ...
|
||||
# ninja
|
||||
# # Analyze results (requires ClangBuildAnalyzer):
|
||||
# ClangBuildAnalyzer --all <build-dir> capture.bin
|
||||
# ClangBuildAnalyzer --analyze capture.bin
|
||||
option(Scylla_TIME_TRACE "Enable Clang -ftime-trace for build profiling" OFF)
|
||||
if(Scylla_TIME_TRACE)
|
||||
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
message(FATAL_ERROR "Scylla_TIME_TRACE requires Clang (found ${CMAKE_CXX_COMPILER_ID})")
|
||||
endif()
|
||||
add_compile_options(-ftime-trace)
|
||||
endif()
|
||||
|
||||
macro(update_build_flags config)
|
||||
cmake_parse_arguments (
|
||||
parsed_args
|
||||
|
||||
@@ -1088,7 +1088,7 @@ void compaction_manager::register_metrics() {
|
||||
sm::make_gauge("normalized_backlog", [this] { return _last_backlog / available_memory(); },
|
||||
sm::description("Holds the sum of normalized compaction backlog for all tables in the system. Backlog is normalized by dividing backlog by shard's available memory.")),
|
||||
sm::make_counter("validation_errors", [this] { return _validation_errors; },
|
||||
sm::description("Holds the number of encountered validation errors.")).set_skip_when_empty(),
|
||||
sm::description("Holds the number of encountered validation errors.")),
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
55
configure.py
55
configure.py
@@ -285,12 +285,8 @@ def generate_compdb(compdb, ninja, buildfile, modes):
|
||||
os.symlink(compdb_target, compdb)
|
||||
except FileExistsError:
|
||||
# if there is already a valid compile_commands.json link in the
|
||||
# source root, we are done. if it's a stale link, update it.
|
||||
if os.path.islink(compdb):
|
||||
current_target = os.readlink(compdb)
|
||||
if not os.path.exists(current_target):
|
||||
os.unlink(compdb)
|
||||
os.symlink(compdb_target, compdb)
|
||||
# source root, we are done.
|
||||
pass
|
||||
return
|
||||
|
||||
|
||||
@@ -564,7 +560,6 @@ scylla_tests = set([
|
||||
'test/boost/crc_test',
|
||||
'test/boost/dict_trainer_test',
|
||||
'test/boost/dirty_memory_manager_test',
|
||||
'test/boost/tablet_aware_restore_test',
|
||||
'test/boost/double_decker_test',
|
||||
'test/boost/duration_test',
|
||||
'test/boost/dynamic_bitset_test',
|
||||
@@ -598,7 +593,6 @@ scylla_tests = set([
|
||||
'test/boost/linearizing_input_stream_test',
|
||||
'test/boost/lister_test',
|
||||
'test/boost/locator_topology_test',
|
||||
'test/boost/lock_tables_metadata_test',
|
||||
'test/boost/log_heap_test',
|
||||
'test/boost/logalloc_standard_allocator_segment_pool_backend_test',
|
||||
'test/boost/logalloc_test',
|
||||
@@ -859,10 +853,6 @@ arg_parser.add_argument('--coverage', action = 'store_true', help = 'Compile scy
|
||||
arg_parser.add_argument('--build-dir', action='store', default='build',
|
||||
help='Build directory path')
|
||||
arg_parser.add_argument('--disable-precompiled-header', action='store_true', default=False, help='Disable precompiled header for scylla binary')
|
||||
arg_parser.add_argument('--time-trace', action='store_true', default=False,
|
||||
help='Enable Clang -ftime-trace for build profiling. '
|
||||
'Each .o produces a .json file analyzable with '
|
||||
'ClangBuildAnalyzer or chrome://tracing')
|
||||
arg_parser.add_argument('-h', '--help', action='store_true', help='show this help message and exit')
|
||||
args = arg_parser.parse_args()
|
||||
if args.help:
|
||||
@@ -1173,8 +1163,6 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'index/secondary_index_manager.cc',
|
||||
'index/secondary_index.cc',
|
||||
'index/vector_index.cc',
|
||||
'index/fulltext_index.cc',
|
||||
'index/index_option_utils.cc',
|
||||
'utils/UUID_gen.cc',
|
||||
'utils/i_filter.cc',
|
||||
'utils/bloom_filter.cc',
|
||||
@@ -1337,7 +1325,6 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'ent/ldap/ldap_connection.cc',
|
||||
'reader_concurrency_semaphore.cc',
|
||||
'sstables_loader.cc',
|
||||
'sstables_loader_helpers.cc',
|
||||
'utils/utf8.cc',
|
||||
'utils/ascii.cc',
|
||||
'utils/like_matcher.cc',
|
||||
@@ -1477,7 +1464,6 @@ idls = ['idl/gossip_digest.idl.hh',
|
||||
'idl/frozen_mutation.idl.hh',
|
||||
'idl/reconcilable_result.idl.hh',
|
||||
'idl/streaming.idl.hh',
|
||||
'idl/sstables_loader.idl.hh',
|
||||
'idl/paging_state.idl.hh',
|
||||
'idl/frozen_schema.idl.hh',
|
||||
'idl/repair.idl.hh',
|
||||
@@ -1673,7 +1659,6 @@ deps['test/boost/combined_tests'] += [
|
||||
'test/boost/auth_cache_test.cc',
|
||||
'test/boost/auth_test.cc',
|
||||
'test/boost/batchlog_manager_test.cc',
|
||||
'test/boost/table_helper_test.cc',
|
||||
'test/boost/cache_algorithm_test.cc',
|
||||
'test/boost/castas_fcts_test.cc',
|
||||
'test/boost/cdc_test.cc',
|
||||
@@ -1725,7 +1710,7 @@ deps['test/boost/combined_tests'] += [
|
||||
'test/boost/sstable_compression_config_test.cc',
|
||||
'test/boost/sstable_directory_test.cc',
|
||||
'test/boost/sstable_set_test.cc',
|
||||
'test/boost/sstable_tablet_streaming_test.cc',
|
||||
'test/boost/sstable_tablet_streaming.cc',
|
||||
'test/boost/statement_restrictions_test.cc',
|
||||
'test/boost/storage_proxy_test.cc',
|
||||
'test/boost/tablets_test.cc',
|
||||
@@ -1980,9 +1965,6 @@ user_cflags += ' -fextend-variable-liveness=none'
|
||||
if args.target != '':
|
||||
user_cflags += ' -march=' + args.target
|
||||
|
||||
if args.time_trace:
|
||||
user_cflags += ' -ftime-trace'
|
||||
|
||||
for mode in modes:
|
||||
# Those flags are passed not only to Scylla objects, but also to libraries
|
||||
# that we compile ourselves.
|
||||
@@ -2475,9 +2457,6 @@ def write_build_file(f,
|
||||
command = reloc/build_deb.sh --reloc-pkg $in --builddir $out
|
||||
rule unified
|
||||
command = unified/build_unified.sh --build-dir $builddir/$mode --unified-pkg $out
|
||||
rule collect_pkgs
|
||||
command = rm -rf $out && mkdir -p $out && cp $pkgs $out/
|
||||
description = COLLECT $out
|
||||
rule rust_header
|
||||
command = cxxbridge --include rust/cxx.h --header $in > $out
|
||||
description = RUST_HEADER $out
|
||||
@@ -2963,8 +2942,6 @@ def write_build_file(f,
|
||||
build dist-tar: phony dist-unified-tar dist-server-tar dist-python3-tar dist-cqlsh-tar
|
||||
|
||||
build dist: phony dist-unified dist-server dist-python3 dist-cqlsh
|
||||
|
||||
build collect-dist: phony {' '.join([f'collect-dist-{mode}' for mode in default_modes])}
|
||||
'''))
|
||||
|
||||
f.write(textwrap.dedent(f'''\
|
||||
@@ -2972,28 +2949,7 @@ def write_build_file(f,
|
||||
rule dist-check
|
||||
command = ./tools/testing/dist-check/dist-check.sh --mode $mode
|
||||
'''))
|
||||
deb_arch = {'x86_64': 'amd64', 'aarch64': 'arm64'}[arch]
|
||||
deb_ver = f'{scylla_version}-{scylla_release}-1'
|
||||
rpm_ver = f'{scylla_version}-{scylla_release}'
|
||||
for mode in build_modes:
|
||||
server_rpms_dir = f'$builddir/dist/{mode}/redhat/RPMS/{arch}'
|
||||
server_rpms = [f'{server_rpms_dir}/{scylla_product}{suffix}-{rpm_ver}.{arch}.rpm'
|
||||
for suffix in ['', '-server', '-server-debuginfo', '-conf', '-kernel-conf', '-node-exporter']]
|
||||
cqlsh_rpms = [f'tools/cqlsh/build/redhat/RPMS/{arch}/{scylla_product}-cqlsh-{rpm_ver}.{arch}.rpm']
|
||||
python3_rpms = [f'tools/python3/build/redhat/RPMS/{arch}/{scylla_product}-python3-{rpm_ver}.{arch}.rpm']
|
||||
all_rpms = server_rpms + cqlsh_rpms + python3_rpms
|
||||
|
||||
server_deb_dir = f'$builddir/dist/{mode}/debian'
|
||||
server_debs = [f'{server_deb_dir}/{scylla_product}{suffix}_{deb_ver}_{deb_arch}.deb'
|
||||
for suffix in ['', '-server', '-server-dbg', '-conf', '-kernel-conf', '-node-exporter']]
|
||||
server_debs += [f'{server_deb_dir}/scylla-enterprise{suffix}_{deb_ver}_all.deb'
|
||||
for suffix in ['', '-server', '-conf', '-kernel-conf', '-node-exporter']]
|
||||
cqlsh_debs = [f'tools/cqlsh/build/debian/{scylla_product}-cqlsh_{deb_ver}_{deb_arch}.deb',
|
||||
f'tools/cqlsh/build/debian/scylla-enterprise-cqlsh_{deb_ver}_all.deb']
|
||||
python3_debs = [f'tools/python3/build/debian/{scylla_product}-python3_{deb_ver}_{deb_arch}.deb',
|
||||
f'tools/python3/build/debian/scylla-enterprise-python3_{deb_ver}_all.deb']
|
||||
all_debs = server_debs + cqlsh_debs + python3_debs
|
||||
|
||||
f.write(textwrap.dedent(f'''\
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
@@ -3001,11 +2957,6 @@ def write_build_file(f,
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-package.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
|
||||
build $builddir/{mode}/dist/rpm: collect_pkgs | {' '.join(all_rpms)} $builddir/dist/{mode}/redhat dist-cqlsh-rpm dist-python3-rpm
|
||||
pkgs = {' '.join(all_rpms)}
|
||||
build $builddir/{mode}/dist/deb: collect_pkgs | {' '.join(all_debs)} $builddir/dist/{mode}/debian dist-cqlsh-deb dist-python3-deb
|
||||
pkgs = {' '.join(all_debs)}
|
||||
build collect-dist-{mode}: phony $builddir/{mode}/dist/rpm $builddir/{mode}/dist/deb
|
||||
build {mode}-dist: phony dist-server-{mode} dist-server-debuginfo-{mode} dist-python3-{mode} dist-unified-{mode} dist-cqlsh-{mode}
|
||||
build dist-{mode}: phony {mode}-dist
|
||||
build dist-check-{mode}: dist-check
|
||||
|
||||
@@ -136,9 +136,9 @@ public:
|
||||
{}
|
||||
|
||||
future<> insert(auth::authenticated_user user, cql3::prepared_cache_key_type prep_cache_key, value_type v) noexcept {
|
||||
return _cache.insert(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
|
||||
return _cache.get_ptr(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
|
||||
return make_ready_future<value_type>(std::move(v));
|
||||
});
|
||||
}).discard_result();
|
||||
}
|
||||
|
||||
value_ptr find(const auth::authenticated_user& user, const cql3::prepared_cache_key_type& prep_cache_key) {
|
||||
|
||||
@@ -1070,7 +1070,7 @@ try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database
|
||||
.args = {},
|
||||
};
|
||||
} else {
|
||||
throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument, got {}", fc.args[0]));
|
||||
throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument", fc.args[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
#include "cql3/prepare_context.hh"
|
||||
#include "cql3/expr/expr-utils.hh"
|
||||
#include "types/list.hh"
|
||||
#include "types/tuple.hh"
|
||||
#include <iterator>
|
||||
#include <ranges>
|
||||
|
||||
@@ -117,34 +116,6 @@ void validate_token_relation(const std::vector<const column_definition*> column_
|
||||
}
|
||||
}
|
||||
|
||||
void validate_tuples_size(const expression& rhs, size_t valid_size) {
|
||||
auto coll = as_if<collection_constructor>(&rhs);
|
||||
if (!coll) {
|
||||
// Pre-prepare, the IN list arrives as a collection_constructor.
|
||||
// After prepare it would be a constant of list type whose elements
|
||||
// are serialized; arity validation has already happened earlier in
|
||||
// that case, so nothing to do here.
|
||||
return;
|
||||
}
|
||||
for (const auto& expr : coll->elements) {
|
||||
size_t expr_size = 0;
|
||||
if (auto tuple = as_if<tuple_constructor>(&expr)) {
|
||||
expr_size = tuple->elements.size();
|
||||
} else {
|
||||
auto the_const = as_if<constant>(&expr);
|
||||
if (the_const && the_const->type->without_reversed().is_tuple()) {
|
||||
const tuple_type_impl* const_tuple = dynamic_cast<const tuple_type_impl*>(&the_const->type->without_reversed());
|
||||
expr_size = const_tuple->size();
|
||||
} else {
|
||||
continue; // not a tuple; perhaps we need to set expr_size to 1 here when #12554 is fixed
|
||||
}
|
||||
}
|
||||
if (expr_size != valid_size) {
|
||||
throw exceptions::invalid_request_exception(format("Expected {} elements in value tuple, but got {}: {}", valid_size, expr_size, expr));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void preliminary_binop_vaidation_checks(const binary_operator& binop) {
|
||||
if (binop.op == oper_t::NEQ) {
|
||||
throw exceptions::invalid_request_exception(format("Unsupported \"!=\" relation: {:user}", binop));
|
||||
@@ -171,10 +142,6 @@ void preliminary_binop_vaidation_checks(const binary_operator& binop) {
|
||||
throw exceptions::invalid_request_exception("LIKE cannot be used for Multi-column relations");
|
||||
}
|
||||
|
||||
if (binop.op == oper_t::IN) {
|
||||
validate_tuples_size(binop.rhs, lhs_tup->elements.size());
|
||||
}
|
||||
|
||||
if (auto rhs_tup = as_if<tuple_constructor>(&binop.rhs)) {
|
||||
if (lhs_tup->elements.size() != rhs_tup->elements.size()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
|
||||
@@ -343,102 +343,102 @@ to_predicates(
|
||||
auto cdef = col.col;
|
||||
auto type = &cdef->type->without_reversed();
|
||||
if (oper.op == oper_t::IS_NOT) {
|
||||
return to_vector(predicate{
|
||||
.solve_for = nullptr,
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_not_null_single_column = is_null_constant(oper.rhs),
|
||||
.op = oper.op,
|
||||
});
|
||||
return to_vector(predicate{
|
||||
.solve_for = nullptr,
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_not_null_single_column = is_null_constant(oper.rhs),
|
||||
.op = oper.op,
|
||||
});
|
||||
}
|
||||
if (is_compare(oper.op)) {
|
||||
auto solve = [oper] (const query_options& options) {
|
||||
managed_bytes_opt val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
return oper.op == oper_t::EQ ? value_set(value_list{*val})
|
||||
: to_range(oper.op, std::move(*val));
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = (oper.op == oper_t::EQ),
|
||||
.equality = (oper.op == oper_t::EQ),
|
||||
.is_slice = expr::is_slice(oper.op),
|
||||
.is_upper_bound = (oper.op == oper_t::LT || oper.op == oper_t::LTE),
|
||||
.is_lower_bound = (oper.op == oper_t::GT || oper.op == oper_t::GTE),
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
auto solve = [oper] (const query_options& options) {
|
||||
managed_bytes_opt val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
return oper.op == oper_t::EQ ? value_set(value_list{*val})
|
||||
: to_range(oper.op, std::move(*val));
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = (oper.op == oper_t::EQ),
|
||||
.equality = (oper.op == oper_t::EQ),
|
||||
.is_slice = expr::is_slice(oper.op),
|
||||
.is_upper_bound = (oper.op == oper_t::LT || oper.op == oper_t::LTE),
|
||||
.is_lower_bound = (oper.op == oper_t::GT || oper.op == oper_t::GTE),
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
} else if (oper.op == oper_t::IN) {
|
||||
auto solve = [oper, type, cdef] (const query_options& options) {
|
||||
return get_IN_values(oper.rhs, options, type->as_less_comparator(), cdef->name_as_text());
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = false,
|
||||
.is_in = true,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
auto solve = [oper, type, cdef] (const query_options& options) {
|
||||
return get_IN_values(oper.rhs, options, type->as_less_comparator(), cdef->name_as_text());
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = false,
|
||||
.is_in = true,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
} else if (oper.op == oper_t::CONTAINS || oper.op == oper_t::CONTAINS_KEY) {
|
||||
auto solve = [oper] (const query_options& options) {
|
||||
managed_bytes_opt val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
return value_set(value_list{*val});
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = false,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
auto solve = [oper] (const query_options& options) {
|
||||
managed_bytes_opt val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
return value_set(value_list{*val});
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = false,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
}
|
||||
return cannot_solve_on_column(oper, col.col);
|
||||
},
|
||||
[&] (const subscript& s) -> std::vector<predicate> {
|
||||
const column_value& col = get_subscripted_column(s);
|
||||
|
||||
if (oper.op == oper_t::EQ) {
|
||||
auto solve = [s, oper] (const query_options& options) {
|
||||
managed_bytes_opt sval = evaluate(s.sub, options).to_managed_bytes_opt();
|
||||
if (!sval) {
|
||||
return empty_value_set; // NULL can't be a map key
|
||||
}
|
||||
if (oper.op == oper_t::EQ) {
|
||||
auto solve = [s, oper] (const query_options& options) {
|
||||
managed_bytes_opt sval = evaluate(s.sub, options).to_managed_bytes_opt();
|
||||
if (!sval) {
|
||||
return empty_value_set; // NULL can't be a map key
|
||||
}
|
||||
|
||||
managed_bytes_opt rval = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!rval) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
managed_bytes_opt elements[] = {sval, rval};
|
||||
managed_bytes val = tuple_type_impl::build_value_fragmented(elements);
|
||||
return value_set(value_list{val});
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = true,
|
||||
.equality = true,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
.is_subscript = true,
|
||||
});
|
||||
}
|
||||
return cannot_solve_on_column(oper, col.col);
|
||||
managed_bytes_opt rval = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!rval) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
managed_bytes_opt elements[] = {sval, rval};
|
||||
managed_bytes val = tuple_type_impl::build_value_fragmented(elements);
|
||||
return value_set(value_list{val});
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = true,
|
||||
.equality = true,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
.is_subscript = true,
|
||||
});
|
||||
}
|
||||
return cannot_solve_on_column(oper, col.col);
|
||||
},
|
||||
[&] (const tuple_constructor& tuple) -> std::vector<predicate> {
|
||||
auto columns = tuple.elements
|
||||
| std::views::transform([] (const expression& e) { return as<column_value>(e).col; })
|
||||
| std::ranges::to<std::vector>();
|
||||
| std::views::transform([] (const expression& e) { return as<column_value>(e).col; })
|
||||
| std::ranges::to<std::vector>();
|
||||
for (unsigned i = 0; i < columns.size(); ++i) {
|
||||
if (!columns[i]->is_clustering_key() || columns[i]->position() != i) {
|
||||
on_internal_error(rlogger, "to_predicates: multi-column relation not on a clustering key prefix");
|
||||
@@ -481,42 +481,42 @@ to_predicates(
|
||||
if (!(oper.op == oper_t::EQ || is_slice(oper.op))) {
|
||||
return cannot_solve(oper);
|
||||
}
|
||||
auto solve = [oper] (const query_options& options) -> value_set {
|
||||
auto val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no token values match.
|
||||
}
|
||||
if (oper.op == oper_t::EQ) {
|
||||
return value_list{*val};
|
||||
} else if (oper.op == oper_t::GT) {
|
||||
return interval<managed_bytes>::make_starting_with(interval_bound(std::move(*val), exclusive));
|
||||
} else if (oper.op == oper_t::GTE) {
|
||||
return interval<managed_bytes>::make_starting_with(interval_bound(std::move(*val), inclusive));
|
||||
}
|
||||
static const managed_bytes MININT = managed_bytes(serialized(std::numeric_limits<int64_t>::min())),
|
||||
MAXINT = managed_bytes(serialized(std::numeric_limits<int64_t>::max()));
|
||||
// Undocumented feature: when the user types `token(...) < MININT`, we interpret
|
||||
// that as MAXINT for some reason.
|
||||
const auto adjusted_val = (*val == MININT) ? MAXINT : *val;
|
||||
if (oper.op == oper_t::LT) {
|
||||
return interval<managed_bytes>::make_ending_with(interval_bound(std::move(adjusted_val), exclusive));
|
||||
} else if (oper.op == oper_t::LTE) {
|
||||
return interval<managed_bytes>::make_ending_with(interval_bound(std::move(adjusted_val), inclusive));
|
||||
}
|
||||
throw std::logic_error(format("get_token_interval unexpected operator {}", oper.op));
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_partition_key_token{table_schema_opt},
|
||||
.is_singleton = (oper.op == oper_t::EQ),
|
||||
.equality = (oper.op == oper_t::EQ),
|
||||
.is_slice = expr::is_slice(oper.op),
|
||||
.is_upper_bound = (oper.op == oper_t::LT || oper.op == oper_t::LTE),
|
||||
.is_lower_bound = (oper.op == oper_t::GT || oper.op == oper_t::GTE),
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
auto solve = [oper] (const query_options& options) -> value_set {
|
||||
auto val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no token values match.
|
||||
}
|
||||
if (oper.op == oper_t::EQ) {
|
||||
return value_list{*val};
|
||||
} else if (oper.op == oper_t::GT) {
|
||||
return interval<managed_bytes>::make_starting_with(interval_bound(std::move(*val), exclusive));
|
||||
} else if (oper.op == oper_t::GTE) {
|
||||
return interval<managed_bytes>::make_starting_with(interval_bound(std::move(*val), inclusive));
|
||||
}
|
||||
static const managed_bytes MININT = managed_bytes(serialized(std::numeric_limits<int64_t>::min())),
|
||||
MAXINT = managed_bytes(serialized(std::numeric_limits<int64_t>::max()));
|
||||
// Undocumented feature: when the user types `token(...) < MININT`, we interpret
|
||||
// that as MAXINT for some reason.
|
||||
const auto adjusted_val = (*val == MININT) ? MAXINT : *val;
|
||||
if (oper.op == oper_t::LT) {
|
||||
return interval<managed_bytes>::make_ending_with(interval_bound(std::move(adjusted_val), exclusive));
|
||||
} else if (oper.op == oper_t::LTE) {
|
||||
return interval<managed_bytes>::make_ending_with(interval_bound(std::move(adjusted_val), inclusive));
|
||||
}
|
||||
throw std::logic_error(format("get_token_interval unexpected operator {}", oper.op));
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_partition_key_token{table_schema_opt},
|
||||
.is_singleton = (oper.op == oper_t::EQ),
|
||||
.equality = (oper.op == oper_t::EQ),
|
||||
.is_slice = expr::is_slice(oper.op),
|
||||
.is_upper_bound = (oper.op == oper_t::LT || oper.op == oper_t::LTE),
|
||||
.is_lower_bound = (oper.op == oper_t::GT || oper.op == oper_t::GTE),
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
},
|
||||
[&] (const binary_operator&) -> std::vector<predicate> {
|
||||
return cannot_solve(oper);
|
||||
@@ -555,7 +555,7 @@ to_predicates(
|
||||
return cannot_solve(oper);
|
||||
},
|
||||
}, oper.lhs);
|
||||
},
|
||||
},
|
||||
[] (const column_value& cv) -> std::vector<predicate> {
|
||||
return cannot_solve(cv);
|
||||
},
|
||||
@@ -806,26 +806,26 @@ bool is_empty_restriction(const expression& e) {
|
||||
static
|
||||
std::function<bytes_opt (const query_options&)>
|
||||
build_value_for_fn(const column_definition& cdef, const expression& e, const schema& s) {
|
||||
auto ac = to_predicate_on_column(e, &cdef, &s);
|
||||
return [ac] (const query_options& options) -> bytes_opt {
|
||||
value_set possible_vals = solve(ac, options);
|
||||
return std::visit(overloaded_functor {
|
||||
[&](const value_list& val_list) -> bytes_opt {
|
||||
if (val_list.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
if (val_list.size() != 1) {
|
||||
on_internal_error(expr_logger, format("expr::value_for - multiple possible values for column: {}", ac.filter));
|
||||
}
|
||||
|
||||
return to_bytes(val_list.front());
|
||||
},
|
||||
[&](const interval<managed_bytes>&) -> bytes_opt {
|
||||
on_internal_error(expr_logger, format("expr::value_for - possible values are a range: {}", ac.filter));
|
||||
auto ac = to_predicate_on_column(e, &cdef, &s);
|
||||
return [ac] (const query_options& options) -> bytes_opt {
|
||||
value_set possible_vals = solve(ac, options);
|
||||
return std::visit(overloaded_functor {
|
||||
[&](const value_list& val_list) -> bytes_opt {
|
||||
if (val_list.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
}, possible_vals);
|
||||
};
|
||||
|
||||
if (val_list.size() != 1) {
|
||||
on_internal_error(expr_logger, format("expr::value_for - multiple possible values for column: {}", ac.filter));
|
||||
}
|
||||
|
||||
return to_bytes(val_list.front());
|
||||
},
|
||||
[&](const interval<managed_bytes>&) -> bytes_opt {
|
||||
on_internal_error(expr_logger, format("expr::value_for - possible values are a range: {}", ac.filter));
|
||||
}
|
||||
}, possible_vals);
|
||||
};
|
||||
}
|
||||
|
||||
bool contains_multi_column_restriction(const expression& e) {
|
||||
@@ -1337,11 +1337,11 @@ statement_restrictions::ck_restrictions_need_filtering() const {
|
||||
}
|
||||
|
||||
return has_partition_key_unrestricted_components()
|
||||
|| clustering_key_restrictions_need_filtering()
|
||||
// If token restrictions are present in an indexed query, then all other restrictions need to be filtered.
|
||||
// A single token restriction can have multiple matching partition key values.
|
||||
// Because of this we can't create a clustering prefix with more than token restriction.
|
||||
|| (_uses_secondary_indexing && has_token_restrictions());
|
||||
|| clustering_key_restrictions_need_filtering()
|
||||
// If token restrictions are present in an indexed query, then all other restrictions need to be filtered.
|
||||
// A single token restriction can have multiple matching partition key values.
|
||||
// Because of this we can't create a clustering prefix with more than token restriction.
|
||||
|| (_uses_secondary_indexing && has_token_restrictions());
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -1705,28 +1705,28 @@ dht::partition_range_vector statement_restrictions::get_partition_key_ranges(con
|
||||
get_partition_key_ranges_fn_t
|
||||
statement_restrictions::build_partition_key_ranges_fn() const {
|
||||
return std::visit(overloaded_functor{
|
||||
[&] (const no_partition_range_restrictions&) -> get_partition_key_ranges_fn_t {
|
||||
return [] (const query_options& options) -> dht::partition_range_vector{
|
||||
return {dht::partition_range::make_open_ended_both_sides()};
|
||||
[&] (const no_partition_range_restrictions&) -> get_partition_key_ranges_fn_t {
|
||||
return [] (const query_options& options) -> dht::partition_range_vector{
|
||||
return {dht::partition_range::make_open_ended_both_sides()};
|
||||
};
|
||||
},
|
||||
[&] (const token_range_restrictions& r) -> get_partition_key_ranges_fn_t {
|
||||
return [&] (const query_options& options) -> dht::partition_range_vector {
|
||||
return partition_ranges_from_token(r.token_restrictions, options, *_schema);
|
||||
};
|
||||
},
|
||||
[&] (const single_column_partition_range_restrictions& r) -> get_partition_key_ranges_fn_t {
|
||||
if (_partition_range_is_simple) {
|
||||
return [&] (const query_options& options) {
|
||||
// Special case to avoid extra allocations required for a Cartesian product.
|
||||
return partition_ranges_from_EQs(r.per_column_restrictions, options, *_schema);
|
||||
};
|
||||
},
|
||||
[&] (const token_range_restrictions& r) -> get_partition_key_ranges_fn_t {
|
||||
return [&] (const query_options& options) -> dht::partition_range_vector {
|
||||
return partition_ranges_from_token(r.token_restrictions, options, *_schema);
|
||||
} else {
|
||||
return [&] (const query_options& options) {
|
||||
return partition_ranges_from_singles(r.per_column_restrictions, options, *_schema);
|
||||
};
|
||||
},
|
||||
[&] (const single_column_partition_range_restrictions& r) -> get_partition_key_ranges_fn_t {
|
||||
if (_partition_range_is_simple) {
|
||||
return [&] (const query_options& options) {
|
||||
// Special case to avoid extra allocations required for a Cartesian product.
|
||||
return partition_ranges_from_EQs(r.per_column_restrictions, options, *_schema);
|
||||
};
|
||||
} else {
|
||||
return [&] (const query_options& options) {
|
||||
return partition_ranges_from_singles(r.per_column_restrictions, options, *_schema);
|
||||
};
|
||||
}
|
||||
}}, _partition_range_restrictions);
|
||||
}
|
||||
}}, _partition_range_restrictions);
|
||||
}
|
||||
|
||||
namespace {
|
||||
@@ -1970,28 +1970,28 @@ build_get_multi_column_clustering_bounds_fn(
|
||||
}
|
||||
});
|
||||
}
|
||||
return [schema, range_builders, all_natural, all_reverse] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
multi_column_range_accumulator acc;
|
||||
for (auto& builder : range_builders) {
|
||||
builder(acc, options);
|
||||
}
|
||||
auto bounds = std::move(acc.ranges);
|
||||
return [schema, range_builders, all_natural, all_reverse] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
multi_column_range_accumulator acc;
|
||||
for (auto& builder : range_builders) {
|
||||
builder(acc, options);
|
||||
}
|
||||
auto bounds = std::move(acc.ranges);
|
||||
|
||||
if (!all_natural && !all_reverse) {
|
||||
std::vector<query::clustering_range> bounds_in_clustering_order;
|
||||
for (const auto& b : bounds) {
|
||||
const auto eqv = get_equivalent_ranges(b, *schema);
|
||||
bounds_in_clustering_order.insert(bounds_in_clustering_order.end(), eqv.cbegin(), eqv.cend());
|
||||
}
|
||||
return bounds_in_clustering_order;
|
||||
if (!all_natural && !all_reverse) {
|
||||
std::vector<query::clustering_range> bounds_in_clustering_order;
|
||||
for (const auto& b : bounds) {
|
||||
const auto eqv = get_equivalent_ranges(b, *schema);
|
||||
bounds_in_clustering_order.insert(bounds_in_clustering_order.end(), eqv.cbegin(), eqv.cend());
|
||||
}
|
||||
if (all_reverse) {
|
||||
for (auto& crange : bounds) {
|
||||
crange = query::clustering_range(crange.end(), crange.start());
|
||||
}
|
||||
return bounds_in_clustering_order;
|
||||
}
|
||||
if (all_reverse) {
|
||||
for (auto& crange : bounds) {
|
||||
crange = query::clustering_range(crange.end(), crange.start());
|
||||
}
|
||||
return bounds;
|
||||
};
|
||||
}
|
||||
return bounds;
|
||||
};
|
||||
}
|
||||
|
||||
/// Reverses the range if the type is reversed. Why don't we have interval::reverse()??
|
||||
@@ -2288,17 +2288,17 @@ build_range_from_raw_bounds_fn(
|
||||
std::vector<std::function<query::clustering_range (const query_options&)>> range_builders;
|
||||
for (const auto& e : exprs | std::views::transform(&predicate::filter)) {
|
||||
if (auto b = find_clustering_order(e)) {
|
||||
range_builders.emplace_back([bb = *b, &schema] (const query_options& options) {
|
||||
auto* b = &bb;
|
||||
cql3::raw_value tup_val = expr::evaluate(b->rhs, options);
|
||||
if (tup_val.is_null()) {
|
||||
on_internal_error(rlogger, format("range_from_raw_bounds: unexpected atom {}", *b));
|
||||
}
|
||||
range_builders.emplace_back([bb = *b, &schema] (const query_options& options) {
|
||||
auto* b = &bb;
|
||||
cql3::raw_value tup_val = expr::evaluate(b->rhs, options);
|
||||
if (tup_val.is_null()) {
|
||||
on_internal_error(rlogger, format("range_from_raw_bounds: unexpected atom {}", *b));
|
||||
}
|
||||
|
||||
const auto r = to_range(
|
||||
const auto r = to_range(
|
||||
b->op, clustering_key_prefix::from_optional_exploded(schema, expr::get_tuple_elements(tup_val, *type_of(b->rhs))));
|
||||
return r;
|
||||
});
|
||||
return r;
|
||||
});
|
||||
}
|
||||
}
|
||||
return [range_builders] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
@@ -2322,9 +2322,9 @@ build_range_from_raw_bounds_fn(
|
||||
get_clustering_bounds_fn_t
|
||||
statement_restrictions::build_get_clustering_bounds_fn() const {
|
||||
if (_clustering_prefix_restrictions.empty()) {
|
||||
return [&] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
return {query::clustering_range::make_open_ended_both_sides()};
|
||||
};
|
||||
return [&] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
return {query::clustering_range::make_open_ended_both_sides()};
|
||||
};
|
||||
}
|
||||
if (_clustering_prefix_restrictions[0].is_multi_column) {
|
||||
bool all_natural = true, all_reverse = true; ///< Whether column types are reversed or natural.
|
||||
@@ -2342,14 +2342,14 @@ statement_restrictions::build_get_clustering_bounds_fn() const {
|
||||
}
|
||||
}
|
||||
}
|
||||
return build_get_multi_column_clustering_bounds_fn(_schema, _clustering_prefix_restrictions,
|
||||
all_natural, all_reverse);
|
||||
} else {
|
||||
return [&] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
return get_single_column_clustering_bounds(options, *_schema, _clustering_prefix_restrictions);
|
||||
};
|
||||
}
|
||||
return build_get_multi_column_clustering_bounds_fn(_schema, _clustering_prefix_restrictions,
|
||||
all_natural, all_reverse);
|
||||
} else {
|
||||
return [&] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
return get_single_column_clustering_bounds(options, *_schema, _clustering_prefix_restrictions);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<query::clustering_range> statement_restrictions::get_clustering_bounds(const query_options& options) const {
|
||||
return _get_clustering_bounds_fn(options);
|
||||
@@ -2475,11 +2475,11 @@ void statement_restrictions::prepare_indexed_global(const schema& idx_tbl_schema
|
||||
_idx_tbl_ck_prefix->reserve(_idx_tbl_ck_prefix->size() + idx_tbl_schema.clustering_key_size());
|
||||
auto *single_column_partition_key_restrictions = std::get_if<single_column_partition_range_restrictions>(&_partition_range_restrictions);
|
||||
if (single_column_partition_key_restrictions) {
|
||||
for (const auto& e : single_column_partition_key_restrictions->per_column_restrictions) {
|
||||
const auto col = require_on_single_column(e);
|
||||
const auto pos = _schema->position(*col) + 1;
|
||||
(*_idx_tbl_ck_prefix)[pos] = replace_column_def(e, &idx_tbl_schema.clustering_column_at(pos));
|
||||
}
|
||||
for (const auto& e : single_column_partition_key_restrictions->per_column_restrictions) {
|
||||
const auto col = require_on_single_column(e);
|
||||
const auto pos = _schema->position(*col) + 1;
|
||||
(*_idx_tbl_ck_prefix)[pos] = replace_column_def(e, &idx_tbl_schema.clustering_column_at(pos));
|
||||
}
|
||||
}
|
||||
|
||||
if (std::ranges::any_of(*_idx_tbl_ck_prefix | std::views::drop(1) | std::views::transform(&predicate::filter), is_empty_restriction)) {
|
||||
@@ -2621,10 +2621,10 @@ statement_restrictions::build_get_global_index_clustering_ranges_fn() const {
|
||||
return {};
|
||||
}
|
||||
|
||||
return [&] (const query_options& options) {
|
||||
// Multi column restrictions are not added to _idx_tbl_ck_prefix, they are handled later by filtering.
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
return [&] (const query_options& options) {
|
||||
// Multi column restrictions are not added to _idx_tbl_ck_prefix, they are handled later by filtering.
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
}
|
||||
|
||||
std::vector<query::clustering_range> statement_restrictions::get_global_index_clustering_ranges(
|
||||
@@ -2643,14 +2643,14 @@ statement_restrictions::build_get_global_index_token_clustering_ranges_fn() cons
|
||||
// In old indexes the token column was of type blob.
|
||||
// This causes problems with sorting and must be handled separately.
|
||||
if (token_column.type != long_type) {
|
||||
return [&] (const query_options& options) {
|
||||
return get_index_v1_token_range_clustering_bounds(options, token_column, _idx_tbl_ck_prefix->at(0));
|
||||
};
|
||||
return [&] (const query_options& options) {
|
||||
return get_index_v1_token_range_clustering_bounds(options, token_column, _idx_tbl_ck_prefix->at(0));
|
||||
};
|
||||
}
|
||||
|
||||
return [&] (const query_options& options) {
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
return [&] (const query_options& options) {
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
}
|
||||
|
||||
std::vector<query::clustering_range> statement_restrictions::get_global_index_token_clustering_ranges(
|
||||
@@ -2664,10 +2664,10 @@ statement_restrictions::build_get_local_index_clustering_ranges_fn() const {
|
||||
return {};
|
||||
}
|
||||
|
||||
return [&] (const query_options& options) {
|
||||
// Multi column restrictions are not added to _idx_tbl_ck_prefix, they are handled later by filtering.
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
return [&] (const query_options& options) {
|
||||
// Multi column restrictions are not added to _idx_tbl_ck_prefix, they are handled later by filtering.
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
}
|
||||
|
||||
std::vector<query::clustering_range> statement_restrictions::get_local_index_clustering_ranges(
|
||||
|
||||
@@ -351,9 +351,6 @@ public:
|
||||
if (agg.state_to_result_function) {
|
||||
ret.push_back(agg.state_to_result_function);
|
||||
}
|
||||
if (agg.state_reduction_function) {
|
||||
ret.push_back(agg.state_reduction_function);
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
|
||||
@@ -71,7 +71,7 @@ future<shared_ptr<result_message>> modification_statement::execute_without_check
|
||||
using namespace service::strong_consistency;
|
||||
if (const auto* redirect = get_if<need_redirect>(&mutate_result)) {
|
||||
bool is_write = true;
|
||||
co_return co_await redirect_statement(qp, options, redirect->target, timeout, is_write, coordinator.get().get_stats());
|
||||
co_return co_await redirect_statement(qp, options, redirect->target, timeout, is_write);
|
||||
}
|
||||
utils::get_local_injector().inject("sc_modification_statement_timeout", [&] {
|
||||
throw exceptions::mutation_write_timeout_exception{"", "", options.get_consistency(), 0, 0, db::write_type::SIMPLE};
|
||||
|
||||
@@ -47,7 +47,7 @@ future<::shared_ptr<result_message>> select_statement::do_execute(query_processo
|
||||
using namespace service::strong_consistency;
|
||||
if (const auto* redirect = get_if<need_redirect>(&query_result)) {
|
||||
bool is_write = false;
|
||||
co_return co_await redirect_statement(qp, options, redirect->target, timeout, is_write, coordinator.get().get_stats());
|
||||
co_return co_await redirect_statement(qp, options, redirect->target, timeout, is_write);
|
||||
}
|
||||
|
||||
co_return co_await process_results(get<lw_shared_ptr<query::result>>(std::move(query_result)),
|
||||
|
||||
@@ -12,23 +12,19 @@
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "replica/database.hh"
|
||||
#include "locator/tablet_replication_strategy.hh"
|
||||
#include "service/strong_consistency/coordinator.hh"
|
||||
|
||||
namespace cql3::statements::strong_consistency {
|
||||
future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement(query_processor& qp,
|
||||
const query_options& options,
|
||||
const locator::tablet_replica& target,
|
||||
db::timeout_clock::time_point timeout,
|
||||
bool is_write,
|
||||
service::strong_consistency::stats& stats)
|
||||
bool is_write)
|
||||
{
|
||||
auto&& func_values_cache = const_cast<cql3::query_options&>(options).take_cached_pk_function_calls();
|
||||
const auto my_host_id = qp.db().real_database().get_token_metadata().get_topology().my_host_id();
|
||||
if (target.host != my_host_id) {
|
||||
++(is_write ? stats.write_node_bounces : stats.read_node_bounces);
|
||||
co_return qp.bounce_to_node(target, std::move(func_values_cache), timeout, is_write);
|
||||
}
|
||||
++(is_write ? stats.write_shard_bounces : stats.read_shard_bounces);
|
||||
co_return qp.bounce_to_shard(target.shard, std::move(func_values_cache));
|
||||
}
|
||||
|
||||
|
||||
@@ -11,8 +11,6 @@
|
||||
#include "cql3/cql_statement.hh"
|
||||
#include "locator/tablets.hh"
|
||||
|
||||
namespace service::strong_consistency { struct stats; }
|
||||
|
||||
namespace cql3::statements::strong_consistency {
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement(
|
||||
@@ -20,8 +18,7 @@ future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement
|
||||
const query_options& options,
|
||||
const locator::tablet_replica& target,
|
||||
db::timeout_clock::time_point timeout,
|
||||
bool is_write,
|
||||
service::strong_consistency::stats& stats);
|
||||
bool is_write);
|
||||
|
||||
bool is_strongly_consistent(data_dictionary::database db, std::string_view ks_name);
|
||||
|
||||
|
||||
@@ -339,7 +339,7 @@ static storage_options::object_storage object_storage_from_map(std::string_view
|
||||
}
|
||||
if (values.size() > allowed_options.size()) {
|
||||
throw std::runtime_error(fmt::format("Extraneous options for {}: {}; allowed: {}",
|
||||
type, fmt::join(values | std::views::keys, ","),
|
||||
fmt::join(values | std::views::keys, ","), type,
|
||||
fmt::join(allowed_options | std::views::keys, ",")));
|
||||
}
|
||||
options.type = std::string(type);
|
||||
|
||||
@@ -776,7 +776,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
|
||||
friend std::ostream& operator<<(std::ostream&, const segment&);
|
||||
friend class segment_manager;
|
||||
|
||||
constexpr size_t sector_overhead(size_t size) const {
|
||||
size_t sector_overhead(size_t size) const {
|
||||
return (size / (_alignment - detail::sector_overhead_size)) * detail::sector_overhead_size;
|
||||
}
|
||||
|
||||
@@ -1028,21 +1028,18 @@ public:
|
||||
co_return me;
|
||||
}
|
||||
|
||||
std::tuple<size_t, size_t> buffer_usage_size(size_t s) const {
|
||||
/**
|
||||
* Allocate a new buffer
|
||||
*/
|
||||
void new_buffer(size_t s) {
|
||||
SCYLLA_ASSERT(_buffer.empty());
|
||||
|
||||
auto overhead = segment_overhead_size;
|
||||
if (_file_pos == 0) {
|
||||
overhead += descriptor_header_size;
|
||||
}
|
||||
|
||||
return {s + overhead, overhead};
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocate a new buffer
|
||||
*/
|
||||
void new_buffer(size_t size_in) {
|
||||
SCYLLA_ASSERT(_buffer.empty());
|
||||
auto [s, overhead] = buffer_usage_size(size_in);
|
||||
s += overhead;
|
||||
// add bookkeep data reqs.
|
||||
auto a = align_up(s + sector_overhead(s), _alignment);
|
||||
auto k = std::max(a, default_size);
|
||||
@@ -1430,9 +1427,6 @@ public:
|
||||
|
||||
position_type next_position(size_t size) const {
|
||||
auto used = _buffer_ostream_size - _buffer_ostream.size();
|
||||
if (used == 0) { // new chunk/segment
|
||||
std::tie(size, std::ignore) = buffer_usage_size(size);
|
||||
}
|
||||
used += size;
|
||||
return _file_pos + used + sector_overhead(used);
|
||||
}
|
||||
@@ -1576,6 +1570,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
clogger.debug("Attempting oversized alloc of {} entry writer", writer.num_entries);
|
||||
|
||||
auto size = writer.size();
|
||||
auto max_file_size = cfg.commitlog_segment_size_in_mb * 1024 * 1024;
|
||||
|
||||
// check if this cannot be written at all...
|
||||
if (!cfg.allow_going_over_size_limit) {
|
||||
@@ -1584,11 +1579,11 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
// more worst case
|
||||
auto size_with_meta_overhead = size_with_sector_overhead
|
||||
+ (1 + size_with_sector_overhead/max_mutation_size) * (segment::entry_overhead_size + segment::fragmented_entry_overhead_size + segment::segment_overhead_size)
|
||||
* (1 + size_with_sector_overhead/max_size) * segment::descriptor_header_size
|
||||
* (1 + size_with_sector_overhead/max_file_size) * segment::descriptor_header_size
|
||||
;
|
||||
// this is not really true. We could have some space in current segment,
|
||||
// but again, lets be conservative.
|
||||
auto max_file_size_avail = max_disk_size - max_size;
|
||||
auto max_file_size_avail = max_disk_size - max_file_size;
|
||||
|
||||
if (size_with_meta_overhead > max_file_size_avail) {
|
||||
throw std::invalid_argument(fmt::format("Mutation of {} bytes is too large for potentially available disk space of {}", size, max_file_size_avail));
|
||||
@@ -1775,13 +1770,11 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
co_await s->close();
|
||||
s = co_await get_segment();
|
||||
}
|
||||
// bytes not counting overhead
|
||||
auto pos = s->position();
|
||||
auto max = std::max<size_t>(pos, max_size);
|
||||
auto buf_rem = std::min(max_size - max, s->_buffer_ostream.size());
|
||||
// bytes not counting overhead
|
||||
auto buf_rem = std::min(max_size - s->position(), s->_buffer_ostream.size());
|
||||
|
||||
size_t avail;
|
||||
if (buf_rem >= align) {
|
||||
if (buf_rem > align) {
|
||||
auto rem2 = buf_rem - (1 + buf_rem/sector_size) * detail::sector_overhead_size;
|
||||
avail = std::min(rem2, max_mutation_size)
|
||||
- segment::entry_overhead_size
|
||||
@@ -1791,7 +1784,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
} else {
|
||||
co_await s->cycle();
|
||||
auto pos = s->position();
|
||||
auto max = std::max<size_t>(pos, max_size);
|
||||
auto max = std::max<size_t>(pos, max_file_size);
|
||||
auto file_rem = max - pos;
|
||||
|
||||
if (file_rem < align) {
|
||||
|
||||
@@ -217,7 +217,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
|
||||
if (cm_it == local_cm.end()) {
|
||||
if (!cer.get_column_mapping()) {
|
||||
rlogger.debug("replaying at {} v={} at {}", fm.column_family_id(), fm.schema_version(), rp);
|
||||
throw std::runtime_error(format("unknown schema version {}, table={}", fm.schema_version(), fm.column_family_id()));
|
||||
throw std::runtime_error(format("unknown schema version {}, table=", fm.schema_version(), fm.column_family_id()));
|
||||
}
|
||||
rlogger.debug("new schema version {} in entry {}", fm.schema_version(), rp);
|
||||
cm_it = local_cm.emplace(fm.schema_version(), *cer.get_column_mapping()).first;
|
||||
|
||||
@@ -1429,13 +1429,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, enable_shard_aware_drivers(this, "enable_shard_aware_drivers", value_status::Used, true, "Enable native transport drivers to use connection-per-shard for better performance.")
|
||||
, enable_ipv6_dns_lookup(this, "enable_ipv6_dns_lookup", value_status::Used, false, "Use IPv6 address resolution")
|
||||
, abort_on_internal_error(this, "abort_on_internal_error", liveness::LiveUpdate, value_status::Used, false, "Abort the server instead of throwing exception when internal invariants are violated.")
|
||||
, abort_on_malformed_sstable_error(this, "abort_on_malformed_sstable_error", liveness::LiveUpdate, value_status::Used,
|
||||
#if defined(DEBUG) || defined(DEVEL)
|
||||
true,
|
||||
#else
|
||||
false,
|
||||
#endif
|
||||
"Abort the server and generate a coredump instead of throwing an exception when any sstable parse error is detected (malformed_sstable_exception, bufsize_mismatch_exception, parse_assert() failures, or BTI parse errors). Intended for debugging memory corruption that may manifest as sstable corruption. Defaults to true in debug and dev builds.")
|
||||
, max_partition_key_restrictions_per_query(this, "max_partition_key_restrictions_per_query", liveness::LiveUpdate, value_status::Used, 100,
|
||||
"Maximum number of distinct partition keys restrictions per query. This limit places a bound on the size of IN tuples, "
|
||||
"especially when multiple partition key columns have IN restrictions. Increasing this value can result in server instability.")
|
||||
@@ -1928,7 +1921,7 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
|
||||
{"lwt", feature::UNUSED},
|
||||
{"udf", feature::UDF},
|
||||
{"cdc", feature::UNUSED},
|
||||
{"alternator-streams", feature::UNUSED},
|
||||
{"alternator-streams", feature::ALTERNATOR_STREAMS},
|
||||
{"alternator-ttl", feature::UNUSED },
|
||||
{"consistent-topology-changes", feature::UNUSED},
|
||||
{"broadcast-tables", feature::BROADCAST_TABLES},
|
||||
|
||||
@@ -115,6 +115,7 @@ struct experimental_features_t {
|
||||
enum class feature {
|
||||
UNUSED,
|
||||
UDF,
|
||||
ALTERNATOR_STREAMS,
|
||||
BROADCAST_TABLES,
|
||||
KEYSPACE_STORAGE_OPTIONS,
|
||||
STRONGLY_CONSISTENT_TABLES,
|
||||
@@ -456,7 +457,6 @@ public:
|
||||
named_value<bool> enable_shard_aware_drivers;
|
||||
named_value<bool> enable_ipv6_dns_lookup;
|
||||
named_value<bool> abort_on_internal_error;
|
||||
named_value<bool> abort_on_malformed_sstable_error;
|
||||
named_value<uint32_t> max_partition_key_restrictions_per_query;
|
||||
named_value<uint32_t> max_clustering_key_restrictions_per_query;
|
||||
named_value<uint64_t> max_memory_for_unlimited_query_soft_limit;
|
||||
|
||||
@@ -327,7 +327,7 @@ redistribute(const std::vector<float>& p, unsigned me, unsigned k) {
|
||||
}
|
||||
}
|
||||
|
||||
hr_logger.trace(" pp after1={}", pp);
|
||||
hr_logger.trace(" pp after1=", pp);
|
||||
if (d.first == me) {
|
||||
// We only care what "me" sends, and only the elements in
|
||||
// the sorted list earlier than me could have forced it to
|
||||
|
||||
@@ -29,9 +29,6 @@ class large_data_handler {
|
||||
public:
|
||||
struct stats {
|
||||
int64_t partitions_bigger_than_threshold = 0; // number of large partition updates exceeding threshold_bytes
|
||||
int64_t rows_bigger_than_threshold = 0; // number of large row updates exceeding row_threshold_bytes
|
||||
int64_t cells_bigger_than_threshold = 0; // number of large cell updates exceeding cell_threshold_bytes
|
||||
int64_t collections_bigger_than_threshold = 0; // number of large collection updates exceeding collection_elements_count_threshold
|
||||
};
|
||||
|
||||
private:
|
||||
@@ -85,7 +82,6 @@ public:
|
||||
const clustering_key_prefix* clustering_key, uint64_t row_size) {
|
||||
SCYLLA_ASSERT(running());
|
||||
if (row_size > _row_threshold_bytes) [[unlikely]] {
|
||||
++_stats.rows_bigger_than_threshold;
|
||||
return with_sem([&sst, &partition_key, clustering_key, row_size, this] {
|
||||
return record_large_rows(sst, partition_key, clustering_key, row_size);
|
||||
}).then([] {
|
||||
@@ -106,8 +102,6 @@ public:
|
||||
const clustering_key_prefix* clustering_key, const column_definition& cdef, uint64_t cell_size, uint64_t collection_elements) {
|
||||
SCYLLA_ASSERT(running());
|
||||
above_threshold_result above_threshold{.size = cell_size > _cell_threshold_bytes, .elements = collection_elements > _collection_elements_count_threshold};
|
||||
_stats.cells_bigger_than_threshold += above_threshold.size;
|
||||
_stats.collections_bigger_than_threshold += above_threshold.elements;
|
||||
if (above_threshold.size || above_threshold.elements) [[unlikely]] {
|
||||
return with_sem([&sst, &partition_key, clustering_key, &cdef, cell_size, collection_elements, this] {
|
||||
return record_large_cells(sst, partition_key, clustering_key, cdef, cell_size, collection_elements);
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "db/snapshot-ctl.hh"
|
||||
#include "db/snapshot/backup_task.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "sstables/exceptions.hh"
|
||||
#include "sstables/sstables.hh"
|
||||
#include "sstables/sstable_directory.hh"
|
||||
#include "sstables/sstables_manager.hh"
|
||||
@@ -163,23 +164,22 @@ future<> backup_task_impl::process_snapshot_dir() {
|
||||
auto file_path = _snapshot_dir / name;
|
||||
auto st = co_await file_stat(directory, name);
|
||||
total += st.size;
|
||||
auto result = sstables::parse_path(file_path, "", "");
|
||||
if (!result) {
|
||||
_files.emplace_back(name);
|
||||
continue;
|
||||
}
|
||||
auto desc = std::move(*result);
|
||||
const auto& gen = desc.generation;
|
||||
_sstable_comps[gen].emplace_back(name);
|
||||
_sstables_in_snapshot.insert(desc.generation);
|
||||
++num_sstable_comps;
|
||||
try {
|
||||
auto desc = sstables::parse_path(file_path, "", "");
|
||||
const auto& gen = desc.generation;
|
||||
_sstable_comps[gen].emplace_back(name);
|
||||
_sstables_in_snapshot.insert(desc.generation);
|
||||
++num_sstable_comps;
|
||||
|
||||
// When the SSTable is only linked-to by the snapshot directory,
|
||||
// it is already deleted from the table's base directory, and
|
||||
// therefore it better be uploaded earlier to free-up its capacity.
|
||||
if (desc.component == sstables::component_type::Data && st.number_of_links == 1) {
|
||||
snap_log.debug("backup_task: SSTable with generation {} is already deleted from the table", gen);
|
||||
_deleted_sstables.push_back(gen);
|
||||
// When the SSTable is only linked-to by the snapshot directory,
|
||||
// it is already deleted from the table's base directory, and
|
||||
// therefore it better be uploaded earlier to free-up its capacity.
|
||||
if (desc.component == sstables::component_type::Data && st.number_of_links == 1) {
|
||||
snap_log.debug("backup_task: SSTable with generation {} is already deleted from the table", gen);
|
||||
_deleted_sstables.push_back(gen);
|
||||
}
|
||||
} catch (const sstables::malformed_sstable_exception&) {
|
||||
_files.emplace_back(name);
|
||||
}
|
||||
}
|
||||
_total_progress.total = total;
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "replica/database.hh"
|
||||
#include "db/consistency_level_type.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "db/config.hh"
|
||||
#include "schema/schema_builder.hh"
|
||||
#include "timeout_config.hh"
|
||||
#include "types/types.hh"
|
||||
@@ -21,6 +22,8 @@
|
||||
#include "cdc/generation.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
|
||||
#include "service/migration_manager.hh"
|
||||
#include "locator/host_id.hh"
|
||||
|
||||
@@ -38,10 +41,27 @@ static logging::logger dlogger("system_distributed_keyspace");
|
||||
extern logging::logger cdc_log;
|
||||
|
||||
namespace db {
|
||||
namespace {
|
||||
const auto set_wait_for_sync_to_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if ((builder.ks_name() == system_distributed_keyspace::NAME_EVERYWHERE && builder.cf_name() == system_distributed_keyspace::CDC_GENERATIONS_V2) ||
|
||||
(builder.ks_name() == system_distributed_keyspace::NAME && builder.cf_name() == system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION))
|
||||
{
|
||||
builder.set_wait_for_sync_to_commitlog(true);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Set-of-bytes type; used as the type of the `streams` column of the
// CDC generations table defined in this file.
// NOTE(review): the `false` argument is presumably the multi-cell flag (i.e. a frozen collection) - confirm.
extern thread_local data_type cdc_streams_set_type;
|
||||
thread_local data_type cdc_streams_set_type = set_type_impl::get_instance(bytes_type, false);
|
||||
|
||||
/* See `token_range_description` struct */
|
||||
// List-of-bytes type; it is the second element of the tuple type defined below.
thread_local data_type cdc_streams_list_type = list_type_impl::get_instance(bytes_type, false);
|
||||
// Tuple type whose elements follow the fields named in the trailing comments, in that order.
thread_local data_type cdc_token_range_description_type = tuple_type_impl::get_instance(
|
||||
{ long_type // dht::token token_range_end;
|
||||
, cdc_streams_list_type // std::vector<stream_id> streams;
|
||||
, byte_type // uint8_t sharding_ignore_msb;
|
||||
});
|
||||
// List whose elements are the token-range-description tuples defined above.
thread_local data_type cdc_generation_description_type = list_type_impl::get_instance(cdc_token_range_description_type, false);
|
||||
|
||||
schema_ptr view_build_status() {
|
||||
static thread_local auto schema = [] {
|
||||
@@ -57,6 +77,42 @@ schema_ptr view_build_status() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
/* An internal table used by nodes to exchange CDC generation data. */
|
||||
schema_ptr cdc_generations_v2() {
    // Built once per thread on first use and then reused.
    thread_local auto schema = [] {
        const auto ks = system_distributed_keyspace::NAME_EVERYWHERE;
        const auto cf = system_distributed_keyspace::CDC_GENERATIONS_V2;
        auto id = generate_legacy_id(ks, cf);
        schema_builder builder(ks, cf, {id});

        /* The unique identifier of this generation. */
        builder.with_column("id", uuid_type, column_kind::partition_key);

        /* A generation maps every token in the token ring to a set of stream IDs.
         * The mapping is assembled from smaller ones, each covering a subrange of the ring;
         * together these subranges cover the whole ring, and each is stored as one row.
         * The clustering key of a row is the end of the range that row describes.
         * The start of the range is the range_end of the previous row (in clustering order,
         * i.e. integer order), or of the last row of the partition if this is the first row. */
        builder.with_column("range_end", long_type, column_kind::clustering_key);

        /* The set of streams mapped to in this range.
         * The number of streams for a single range is bounded from above by the number of shards
         * on the node owning that range, hence by the maximum shard count over all nodes.
         * Counting about 20B per stream, a cluster whose nodes have at most 128 shards
         * yields a serialized set of at most ~2.5 KB. */
        builder.with_column("streams", cdc_streams_set_type);

        /* The `ignore_msb` sharding parameter of the node that owned this token range
         * when the generation was first created. Together with the stream set it fully
         * describes the mapping for this range. */
        builder.with_column("ignore_msb", byte_type);

        /* Sanity-check column: the number of ranges in the generation. Once the generation
         * is fully inserted it must equal the number of rows in the partition. */
        builder.with_column("num_ranges", int32_type, column_kind::static_column);

        builder.with_hash_version();
        return builder.build();
    }();
    return schema;
}
|
||||
|
||||
/* A user-facing table providing identifiers of the streams used in CDC generations. */
|
||||
schema_ptr cdc_desc() {
|
||||
@@ -99,43 +155,14 @@ static const sstring CDC_TIMESTAMPS_KEY = "timestamps";
|
||||
schema_ptr service_levels() {
|
||||
static thread_local auto schema = [] {
|
||||
auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS);
|
||||
return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS, std::make_optional(id))
|
||||
auto builder = schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS, std::make_optional(id))
|
||||
.with_column("service_level", utf8_type, column_kind::partition_key)
|
||||
.with_column("timeout", duration_type)
|
||||
.with_column("workload_type", utf8_type)
|
||||
.with_column("shares", int32_type)
|
||||
.with_hash_version()
|
||||
.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
.with_column("shares", int32_type);
|
||||
if (utils::get_local_injector().is_enabled("service_levels_v1_table_without_shares")) {
|
||||
builder.remove_column("shares");
|
||||
}
|
||||
|
||||
schema_ptr snapshot_sstables() {
|
||||
static thread_local auto schema = [] {
|
||||
auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::SNAPSHOT_SSTABLES);
|
||||
return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::SNAPSHOT_SSTABLES, std::make_optional(id))
|
||||
// Name of the snapshot
|
||||
.with_column("snapshot_name", utf8_type, column_kind::partition_key)
|
||||
// Keyspace where the snapshot was taken
|
||||
.with_column("keyspace", utf8_type, column_kind::partition_key)
|
||||
// Table within the keyspace
|
||||
.with_column("table", utf8_type, column_kind::partition_key)
|
||||
// Datacenter where this SSTable is located
|
||||
.with_column("datacenter", utf8_type, column_kind::partition_key)
|
||||
// Rack where this SSTable is located
|
||||
.with_column("rack", utf8_type, column_kind::partition_key)
|
||||
// First token in the token range covered by this SSTable
|
||||
.with_column("first_token", long_type, column_kind::clustering_key)
|
||||
// Unique identifier for the SSTable (UUID)
|
||||
.with_column("sstable_id", uuid_type, column_kind::clustering_key)
|
||||
// Last token in the token range covered by this SSTable
|
||||
.with_column("last_token", long_type)
|
||||
// TOC filename of the SSTable
|
||||
.with_column("toc_name", utf8_type)
|
||||
// Prefix path in object storage where the SSTable was backed up
|
||||
.with_column("prefix", utf8_type)
|
||||
// Flag if the SSTable was downloaded already
|
||||
.with_column("downloaded", boolean_type)
|
||||
return builder
|
||||
.with_hash_version()
|
||||
.build();
|
||||
}();
|
||||
@@ -155,15 +182,19 @@ schema_ptr snapshot_sstables() {
|
||||
static std::vector<schema_ptr> ensured_tables() {
|
||||
return {
|
||||
view_build_status(),
|
||||
cdc_generations_v2(),
|
||||
cdc_desc(),
|
||||
cdc_timestamps(),
|
||||
service_levels(),
|
||||
snapshot_sstables(),
|
||||
};
|
||||
}
|
||||
|
||||
std::vector<schema_ptr> system_distributed_keyspace::all_distributed_tables() {
|
||||
return {view_build_status(), cdc_desc(), cdc_timestamps(), service_levels(), snapshot_sstables()};
|
||||
return {view_build_status(), cdc_desc(), cdc_timestamps(), service_levels()};
|
||||
}
|
||||
|
||||
std::vector<schema_ptr> system_distributed_keyspace::all_everywhere_tables() {
|
||||
return {cdc_generations_v2()};
|
||||
}
|
||||
|
||||
system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm, service::storage_proxy& sp)
|
||||
@@ -172,6 +203,36 @@ system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor&
|
||||
, _sp(sp) {
|
||||
}
|
||||
|
||||
static std::vector<std::pair<std::string_view, data_type>> new_service_levels_columns(bool workload_prioritization_enabled) {
|
||||
std::vector<std::pair<std::string_view, data_type>> new_columns {{"timeout", duration_type}, {"workload_type", utf8_type}};
|
||||
if (workload_prioritization_enabled) {
|
||||
new_columns.push_back({"shares", int32_type});
|
||||
}
|
||||
return new_columns;
|
||||
};
|
||||
|
||||
// Returns the service_levels schema as currently known to `db`; if the table
// does not exist there yet, falls back to the built-in service_levels() schema.
static schema_ptr get_current_service_levels(data_dictionary::database db) {
    const auto ks = system_distributed_keyspace::NAME;
    const auto cf = system_distributed_keyspace::SERVICE_LEVELS;
    if (!db.has_schema(ks, cf)) {
        return service_levels();
    }
    return db.find_schema(ks, cf);
}
|
||||
|
||||
// Builds a service_levels schema that extends the current one with whichever
// of the columns from new_service_levels_columns() it does not have yet.
// Asserts that it runs on shard 0.
static schema_ptr get_updated_service_levels(data_dictionary::database db, bool workload_prioritization_enabled) {
    SCYLLA_ASSERT(this_shard_id() == 0);

    const auto current = get_current_service_levels(db);
    schema_builder updated(current);
    for (const auto& [name, type] : new_service_levels_columns(workload_prioritization_enabled)) {
        const bytes column_name = to_bytes(name.data());
        // Only add columns the current schema lacks.
        if (!current->get_column_definition(column_name)) {
            updated.with_column(column_name, type, column_kind::regular_column);
        }
    }
    updated.with_hash_version();
    return updated.build();
}
|
||||
|
||||
future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tables) {
|
||||
if (this_shard_id() != 0) {
|
||||
_started = true;
|
||||
@@ -182,9 +243,11 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
|
||||
|
||||
while (true) {
|
||||
// Check if there is any work to do before taking the group 0 guard.
|
||||
bool keyspaces_setup = db.has_keyspace(NAME);
|
||||
bool workload_prioritization_enabled = _sp.features().workload_prioritization;
|
||||
bool keyspaces_setup = db.has_keyspace(NAME) && db.has_keyspace(NAME_EVERYWHERE);
|
||||
bool tables_setup = std::all_of(tables.begin(), tables.end(), [db] (schema_ptr t) { return db.has_schema(t->ks_name(), t->cf_name()); } );
|
||||
if (keyspaces_setup && tables_setup) {
|
||||
bool service_levels_up_to_date = get_current_service_levels(db)->equal_columns(*get_updated_service_levels(db, workload_prioritization_enabled));
|
||||
if (keyspaces_setup && tables_setup && service_levels_up_to_date) {
|
||||
dlogger.info("system_distributed(_everywhere) keyspaces and tables are up-to-date. Not creating");
|
||||
_started = true;
|
||||
co_return;
|
||||
@@ -195,25 +258,51 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
|
||||
utils::chunked_vector<mutation> mutations;
|
||||
sstring description;
|
||||
|
||||
auto ksm = keyspace_metadata::new_keyspace(
|
||||
auto sd_ksm = keyspace_metadata::new_keyspace(
|
||||
NAME,
|
||||
"org.apache.cassandra.locator.SimpleStrategy",
|
||||
{{"replication_factor", "3"}},
|
||||
std::nullopt, std::nullopt);
|
||||
if (!db.has_keyspace(NAME)) {
|
||||
mutations = service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts);
|
||||
mutations = service::prepare_new_keyspace_announcement(db.real_database(), sd_ksm, ts);
|
||||
description += format(" create {} keyspace;", NAME);
|
||||
} else {
|
||||
dlogger.info("{} keyspace is already present. Not creating", NAME);
|
||||
}
|
||||
|
||||
// Get mutations for creating tables.
|
||||
auto sde_ksm = keyspace_metadata::new_keyspace(
|
||||
NAME_EVERYWHERE,
|
||||
"org.apache.cassandra.locator.EverywhereStrategy",
|
||||
{},
|
||||
std::nullopt, std::nullopt);
|
||||
if (!db.has_keyspace(NAME_EVERYWHERE)) {
|
||||
auto sde_mutations = service::prepare_new_keyspace_announcement(db.real_database(), sde_ksm, ts);
|
||||
std::move(sde_mutations.begin(), sde_mutations.end(), std::back_inserter(mutations));
|
||||
description += format(" create {} keyspace;", NAME_EVERYWHERE);
|
||||
} else {
|
||||
dlogger.info("{} keyspace is already present. Not creating", NAME_EVERYWHERE);
|
||||
}
|
||||
|
||||
// Get mutations for creating and updating tables.
|
||||
auto num_keyspace_mutations = mutations.size();
|
||||
co_await coroutine::parallel_for_each(ensured_tables(),
|
||||
[this, &mutations, db, ts, ksm] (auto&& table) -> future<> {
|
||||
[this, &mutations, db, ts, sd_ksm, sde_ksm, workload_prioritization_enabled] (auto&& table) -> future<> {
|
||||
auto ksm = table->ks_name() == NAME ? sd_ksm : sde_ksm;
|
||||
|
||||
// Ensure that the service_levels table contains new columns.
|
||||
if (table->cf_name() == SERVICE_LEVELS) {
|
||||
table = get_updated_service_levels(db, workload_prioritization_enabled);
|
||||
}
|
||||
|
||||
if (!db.has_schema(table->ks_name(), table->cf_name())) {
|
||||
co_return co_await service::prepare_new_column_family_announcement(mutations, _sp, *ksm, std::move(table), ts);
|
||||
}
|
||||
|
||||
// The service_levels table exists. Update it if it lacks new columns.
|
||||
if (table->cf_name() == SERVICE_LEVELS && !get_current_service_levels(db)->equal_columns(*table)) {
|
||||
auto update_mutations = co_await service::prepare_column_family_update_announcement(_sp, table, std::vector<view_ptr>(), ts);
|
||||
std::move(update_mutations.begin(), update_mutations.end(), std::back_inserter(mutations));
|
||||
}
|
||||
});
|
||||
if (mutations.size() > num_keyspace_mutations) {
|
||||
description += " create and update system_distributed(_everywhere) tables";
|
||||
@@ -235,6 +324,15 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
|
||||
}
|
||||
}
|
||||
|
||||
// On shard 0, and only when the workload_prioritization feature is enabled,
// runs create_tables() with the service_levels schema that includes the
// workload-prioritization columns. Does nothing on other shards.
future<> system_distributed_keyspace::start_workload_prioritization() {
    if (this_shard_id() != 0) {
        co_return;
    }
    if (!_qp.db().features().workload_prioritization) {
        co_return;
    }
    co_await create_tables({get_updated_service_levels(_qp.db(), true)});
}
|
||||
|
||||
future<> system_distributed_keyspace::start() {
|
||||
if (this_shard_id() != 0) {
|
||||
_started = true;
|
||||
@@ -277,6 +375,90 @@ static db::consistency_level quorum_if_many(size_t num_token_owners) {
|
||||
return num_token_owners > 1 ? db::consistency_level::QUORUM : db::consistency_level::ONE;
|
||||
}
|
||||
|
||||
// Writes the CDC generation `desc` under identifier `id` into the
// NAME_EVERYWHERE.CDC_GENERATIONS_V2 table.
//
// The generation is split into mutations (batches of rows) which are written
// with consistency level ALL and a 60s timeout each, up to `concurrency`
// mutations in flight at a time. `ctx.num_token_owners` is used to size the
// batches so that the memory used by the operation stays bounded.
future<>
system_distributed_keyspace::insert_cdc_generation(
        utils::UUID id,
        const cdc::topology_description& desc,
        context ctx) {
    using namespace std::chrono_literals;

    const size_t concurrency = 10;
    // Clamp to at least one replica: the value is used as a divisor below, and
    // a context reporting zero token owners must not cause a division by zero.
    const size_t num_replicas = std::max<size_t>(ctx.num_token_owners, 1);

    // To insert the data quickly and efficiently we send it in batches of multiple rows
    // (each batch represented by a single mutation). We also send multiple such batches concurrently.
    // However, we need to limit the memory consumption of the operation.
    // I assume that the memory consumption grows linearly with the number of replicas
    // (we send to all replicas ``at the same time''), with the batch size (the data must
    // be copied for each replica?) and with concurrency. These assumptions may be too conservative
    // but that won't hurt in a significant way (it may hurt the efficiency of the operation a little).
    // Thus, if we want to limit the memory consumption to L, it should be true that
    // mutation_size * num_replicas * concurrency <= L, hence
    // mutation_size <= L / (num_replicas * concurrency).
    // For example, say L = 10MB, concurrency = 10, num_replicas = 100; we get
    // mutation_size <= 10MB / 1000 = 10KB.
    // On the other hand we must have mutation_size >= size of a single row,
    // so we will use mutation_size <= max(size of single row, L/(num_replicas*concurrency)).

    // It has been tested that sending 1MB batches to 3 replicas with concurrency 20 works OK,
    // which would correspond to L ~= 60MB. Hence that's the limit we use here.
    const size_t L = 60'000'000;
    const auto mutation_size_threshold = std::max(size_t(1), L / (num_replicas * concurrency));

    auto s = _qp.db().real_database().find_schema(
        system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2);
    auto ms = co_await cdc::get_cdc_generation_mutations_v2(s, id, desc, mutation_size_threshold, api::new_timestamp());
    co_await max_concurrent_for_each(ms, concurrency, [&] (mutation& m) -> future<> {
        co_await _sp.mutate(
            { std::move(m) },
            db::consistency_level::ALL,
            db::timeout_clock::now() + 60s,
            nullptr, // trace_state
            empty_service_permit(),
            db::allow_per_partition_rate_limit::no,
            false // raw_counters
        );
    });
}
|
||||
|
||||
// Reads the CDC generation stored under `id` in NAME_EVERYWHERE.CDC_GENERATIONS_V2.
// Returns std::nullopt when the partition has no rows. Throws std::runtime_error
// when the number of rows read differs from the value of the `num_ranges` column.
future<std::optional<cdc::topology_description>>
system_distributed_keyspace::read_cdc_generation(utils::UUID id) {
    utils::chunked_vector<cdc::token_range_description> ranges;
    size_t claimed_num_ranges = 0;

    // Converts one result row into a token_range_description and remembers
    // the row count that the `num_ranges` column claims.
    auto collect_row = [&] (const cql3::untyped_result_set_row& row) {
        std::vector<cdc::stream_id> streams;
        row.get_list_data<bytes>("streams", std::back_inserter(streams));
        auto range_end = dht::token::from_int64(row.get_as<int64_t>("range_end"));
        auto ignore_msb = uint8_t(row.get_as<int8_t>("ignore_msb"));
        ranges.push_back(cdc::token_range_description{range_end, std::move(streams), ignore_msb});
        claimed_num_ranges = row.get_as<int32_t>("num_ranges");
        return make_ready_future<stop_iteration>(stop_iteration::no);
    };

    co_await _qp.query_internal(
            // This should be a local read so 20s should be more than enough
            format("SELECT range_end, streams, ignore_msb, num_ranges FROM {}.{} WHERE id = ? USING TIMEOUT 20s", NAME_EVERYWHERE, CDC_GENERATIONS_V2),
            db::consistency_level::ONE, // we wrote the generation with ALL so ONE must see it (or there's something really wrong)
            { id },
            1000, // for ~1KB rows, ~1MB page size
            collect_row);

    if (ranges.empty()) {
        co_return std::nullopt;
    }

    // Paranoid sanity check. Partial reads should not happen since generations should be retrieved only after they
    // were written successfully with CL=ALL. But nobody uses EverywhereStrategy tables so they weren't ever properly
    // tested, so just in case...
    const bool row_count_matches = ranges.size() == claimed_num_ranges;
    if (!row_count_matches) {
        throw std::runtime_error(format(
                "read_cdc_generation: wrong number of rows. The `num_ranges` column claimed {} rows,"
                " but reading the partition returned {}.", claimed_num_ranges, ranges.size()));
    }

    co_return std::optional{cdc::topology_description(std::move(ranges))};
}
|
||||
|
||||
static future<utils::chunked_vector<mutation>> get_cdc_streams_descriptions_v2_mutation(
|
||||
const replica::database& db,
|
||||
db_clock::time_point time,
|
||||
@@ -448,83 +630,65 @@ system_distributed_keyspace::cdc_current_generation_timestamp(context ctx) {
|
||||
co_return timestamp_cql->one().get_as<db_clock::time_point>("time");
|
||||
}
|
||||
|
||||
future<> system_distributed_keyspace::insert_snapshot_sstable(sstring snapshot_name, sstring ks, sstring table, sstring dc, sstring rack, sstables::sstable_id sstable_id, dht::token first_token, dht::token last_token, sstring toc_name, sstring prefix, db::consistency_level cl) {
|
||||
// Not inserting the downloaded column so that re-populating on restore
|
||||
// retry doesn't overwrite downloaded=true set by a previous attempt
|
||||
static const sstring query = format("INSERT INTO {}.{} (snapshot_name, \"keyspace\", \"table\", datacenter, rack, first_token, sstable_id, last_token, toc_name, prefix) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) USING TTL {}", NAME, SNAPSHOT_SSTABLES, SNAPSHOT_SSTABLES_TTL_SECONDS);
|
||||
|
||||
return _qp.execute_internal(
|
||||
query,
|
||||
cl,
|
||||
internal_distributed_query_state(),
|
||||
{ std::move(snapshot_name), std::move(ks), std::move(table), std::move(dc), std::move(rack),
|
||||
dht::token::to_int64(first_token), sstable_id.uuid(), dht::token::to_int64(last_token), std::move(toc_name), std::move(prefix) },
|
||||
cql3::query_processor::cache_internal::yes).discard_result();
|
||||
future<qos::service_levels_info> system_distributed_keyspace::get_service_levels(qos::query_context ctx) const {
|
||||
return qos::get_service_levels(_qp, NAME, SERVICE_LEVELS, db::consistency_level::ONE, ctx);
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<snapshot_sstable_entry>>
|
||||
system_distributed_keyspace::get_snapshot_sstables(sstring snapshot_name, sstring ks, sstring table, sstring dc, sstring rack, db::consistency_level cl, std::optional<dht::token> start_token, std::optional<dht::token> end_token) const {
|
||||
utils::chunked_vector<snapshot_sstable_entry> sstables;
|
||||
future<qos::service_levels_info> system_distributed_keyspace::get_service_level(sstring service_level_name) const {
|
||||
return qos::get_service_level(_qp, NAME, SERVICE_LEVELS, service_level_name, db::consistency_level::ONE);
|
||||
}
|
||||
|
||||
static const sstring base_query = format("SELECT toc_name, prefix, sstable_id, first_token, last_token, downloaded FROM {}.{}"
|
||||
" WHERE snapshot_name = ? AND \"keyspace\" = ? AND \"table\" = ? AND datacenter = ? AND rack = ?", NAME, SNAPSHOT_SSTABLES);
|
||||
|
||||
auto read_row = [&] (const cql3::untyped_result_set_row& row) {
|
||||
sstables.emplace_back(sstables::sstable_id(row.get_as<utils::UUID>("sstable_id")), dht::token::from_int64(row.get_as<int64_t>("first_token")), dht::token::from_int64(row.get_as<int64_t>("last_token")), row.get_as<sstring>("toc_name"), row.get_as<sstring>("prefix"), is_downloaded(row.get_or<bool>("downloaded", false)));
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
future<> system_distributed_keyspace::set_service_level(sstring service_level_name, qos::service_level_options slo) const {
|
||||
static sstring prepared_query = format("INSERT INTO {}.{} (service_level) VALUES (?);", NAME, SERVICE_LEVELS);
|
||||
co_await _qp.execute_internal(prepared_query, db::consistency_level::ONE, internal_distributed_query_state(), {service_level_name}, cql3::query_processor::cache_internal::no);
|
||||
auto to_data_value = [&] (const qos::service_level_options::timeout_type& tv) {
|
||||
return std::visit(overloaded_functor {
|
||||
[&] (const qos::service_level_options::unset_marker&) {
|
||||
return data_value::make_null(duration_type);
|
||||
},
|
||||
[&] (const qos::service_level_options::delete_marker&) {
|
||||
return data_value::make_null(duration_type);
|
||||
},
|
||||
[&] (const lowres_clock::duration& d) {
|
||||
return data_value(cql_duration(months_counter{0},
|
||||
days_counter{0},
|
||||
nanoseconds_counter{std::chrono::duration_cast<std::chrono::nanoseconds>(d).count()}));
|
||||
},
|
||||
}, tv);
|
||||
};
|
||||
|
||||
if (start_token && end_token) {
|
||||
co_await _qp.query_internal(
|
||||
base_query + " AND first_token >= ? AND first_token <= ?",
|
||||
cl,
|
||||
{ snapshot_name, ks, table, dc, rack, dht::token::to_int64(*start_token), dht::token::to_int64(*end_token) },
|
||||
1000,
|
||||
read_row);
|
||||
} else if (start_token) {
|
||||
co_await _qp.query_internal(
|
||||
base_query + " AND first_token >= ?",
|
||||
cl,
|
||||
{ snapshot_name, ks, table, dc, rack, dht::token::to_int64(*start_token) },
|
||||
1000,
|
||||
read_row);
|
||||
} else if (end_token) {
|
||||
co_await _qp.query_internal(
|
||||
base_query + " AND first_token <= ?",
|
||||
cl,
|
||||
{ snapshot_name, ks, table, dc, rack, dht::token::to_int64(*end_token) },
|
||||
1000,
|
||||
read_row);
|
||||
} else {
|
||||
co_await _qp.query_internal(
|
||||
base_query,
|
||||
cl,
|
||||
{ snapshot_name, ks, table, dc, rack },
|
||||
1000,
|
||||
read_row);
|
||||
}
|
||||
|
||||
co_return sstables;
|
||||
auto to_data_value_g = [&] <typename T> (const std::variant<qos::service_level_options::unset_marker, qos::service_level_options::delete_marker, T>& v) {
|
||||
return std::visit(overloaded_functor {
|
||||
[&] (const qos::service_level_options::unset_marker&) {
|
||||
return data_value::make_null(data_type_for<T>());
|
||||
},
|
||||
[&] (const qos::service_level_options::delete_marker&) {
|
||||
return data_value::make_null(data_type_for<T>());
|
||||
},
|
||||
[&] (const T& v) {
|
||||
return data_value(v);
|
||||
},
|
||||
}, v);
|
||||
};
|
||||
data_value workload = slo.workload == qos::service_level_options::workload_type::unspecified
|
||||
? data_value::make_null(utf8_type)
|
||||
: data_value(qos::service_level_options::to_string(slo.workload));
|
||||
co_await _qp.execute_internal(format("UPDATE {}.{} SET timeout = ?, workload_type = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS),
|
||||
db::consistency_level::ONE,
|
||||
internal_distributed_query_state(),
|
||||
{to_data_value(slo.timeout),
|
||||
workload,
|
||||
service_level_name},
|
||||
cql3::query_processor::cache_internal::no);
|
||||
co_await _qp.execute_internal(format("UPDATE {}.{} SET shares = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS),
|
||||
db::consistency_level::ONE,
|
||||
internal_distributed_query_state(),
|
||||
{to_data_value_g(slo.shares), service_level_name},
|
||||
cql3::query_processor::cache_internal::no);
|
||||
}
|
||||
|
||||
future<> system_distributed_keyspace::update_sstable_download_status(sstring snapshot_name,
|
||||
sstring ks,
|
||||
sstring table,
|
||||
sstring dc,
|
||||
sstring rack,
|
||||
sstables::sstable_id sstable_id,
|
||||
dht::token start_token,
|
||||
is_downloaded downloaded) const {
|
||||
static const sstring update_query = format("UPDATE {}.{} USING TTL {} SET downloaded = ? WHERE snapshot_name = ? AND \"keyspace\" = ? AND \"table\" = ? AND "
|
||||
"datacenter = ? AND rack = ? AND first_token = ? AND sstable_id = ?",
|
||||
NAME,
|
||||
SNAPSHOT_SSTABLES,
|
||||
SNAPSHOT_SSTABLES_TTL_SECONDS);
|
||||
co_await _qp.execute_internal(update_query,
|
||||
consistency_level::ONE,
|
||||
internal_distributed_query_state(),
|
||||
{downloaded == is_downloaded::yes ? true : false, snapshot_name, ks, table, dc, rack, dht::token::to_int64(start_token), sstable_id.uuid()},
|
||||
cql3::query_processor::cache_internal::no);
|
||||
future<> system_distributed_keyspace::drop_service_level(sstring service_level_name) const {
|
||||
static sstring prepared_query = format("DELETE FROM {}.{} WHERE service_level= ?;", NAME, SERVICE_LEVELS);
|
||||
return _qp.execute_internal(prepared_query, db::consistency_level::ONE, internal_distributed_query_state(), {service_level_name}, cql3::query_processor::cache_internal::no).discard_result();
|
||||
}
|
||||
|
||||
} // namespace db
|
||||
}
|
||||
|
||||
@@ -9,17 +9,14 @@
|
||||
#pragma once
|
||||
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "db/consistency_level_type.hh"
|
||||
#include "service/qos/qos_common.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "cdc/generation_id.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "dht/token.hh"
|
||||
#include "sstables/types.hh"
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <seastar/util/bool_class.hh>
|
||||
|
||||
#include <optional>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace cql3 {
|
||||
@@ -27,6 +24,7 @@ class query_processor;
|
||||
}
|
||||
|
||||
namespace cdc {
|
||||
class stream_id;
|
||||
class topology_description;
|
||||
class streams_version;
|
||||
} // namespace cdc
|
||||
@@ -36,27 +34,23 @@ namespace service {
|
||||
class migration_manager;
|
||||
}
|
||||
|
||||
|
||||
namespace db {
|
||||
|
||||
using is_downloaded = bool_class<class is_downloaded_tag>;
|
||||
|
||||
// One row of the snapshot_sstables table, as returned by get_snapshot_sstables().
// Member order matters: entries are constructed positionally from the row's
// sstable_id, first_token, last_token, toc_name, prefix and downloaded columns.
struct snapshot_sstable_entry {
|
||||
// Unique identifier of the SSTable.
sstables::sstable_id sstable_id;
|
||||
// First token in the token range covered by the SSTable.
dht::token first_token;
|
||||
// Last token in the token range covered by the SSTable.
dht::token last_token;
|
||||
// TOC filename of the SSTable.
sstring toc_name;
|
||||
// Prefix path in object storage where the SSTable was backed up.
sstring prefix;
|
||||
// Whether the SSTable was already downloaded; defaults to "no".
is_downloaded downloaded{is_downloaded::no};
|
||||
};
|
||||
|
||||
class system_distributed_keyspace {
|
||||
public:
|
||||
static constexpr auto NAME = "system_distributed";
|
||||
static constexpr auto NAME_EVERYWHERE = "system_distributed_everywhere";
|
||||
|
||||
static constexpr auto VIEW_BUILD_STATUS = "view_build_status";
|
||||
static constexpr auto SERVICE_LEVELS = "service_levels";
|
||||
|
||||
/* Nodes use this table to communicate new CDC stream generations to other nodes. */
|
||||
static constexpr auto CDC_TOPOLOGY_DESCRIPTION = "cdc_generation_descriptions";
|
||||
|
||||
/* Nodes use this table to communicate new CDC stream generations to other nodes.
|
||||
* Resides in system_distributed_everywhere. */
|
||||
static constexpr auto CDC_GENERATIONS_V2 = "cdc_generation_descriptions_v2";
|
||||
|
||||
/* This table is used by CDC clients to learn about available CDC streams. */
|
||||
static constexpr auto CDC_DESC_V2 = "cdc_streams_descriptions_v2";
|
||||
|
||||
@@ -68,12 +62,6 @@ public:
|
||||
* in the old table also appear in the new table, if necessary. */
|
||||
static constexpr auto CDC_DESC_V1 = "cdc_streams_descriptions";
|
||||
|
||||
/* This table is used by the backup and restore code to store per-sstable metadata.
|
||||
* The data the coordinator node puts in this table comes from the snapshot manifests. */
|
||||
static constexpr auto SNAPSHOT_SSTABLES = "snapshot_sstables";
|
||||
|
||||
static constexpr uint64_t SNAPSHOT_SSTABLES_TTL_SECONDS = std::chrono::seconds(std::chrono::days(3)).count();
|
||||
|
||||
/* Information required to modify/query some system_distributed tables, passed from the caller. */
|
||||
struct context {
|
||||
/* How many different token owners (endpoints) are there in the token ring? */
|
||||
@@ -89,14 +77,19 @@ private:
|
||||
|
||||
public:
|
||||
static std::vector<schema_ptr> all_distributed_tables();
|
||||
static std::vector<schema_ptr> all_everywhere_tables();
|
||||
|
||||
system_distributed_keyspace(cql3::query_processor&, service::migration_manager&, service::storage_proxy&);
|
||||
|
||||
future<> start();
|
||||
future<> start_workload_prioritization();
|
||||
future<> stop();
|
||||
|
||||
bool started() const { return _started; }
|
||||
|
||||
future<> insert_cdc_generation(utils::UUID, const cdc::topology_description&, context);
|
||||
future<std::optional<cdc::topology_description>> read_cdc_generation(utils::UUID);
|
||||
|
||||
future<> create_cdc_desc(db_clock::time_point, const cdc::topology_description&, context);
|
||||
future<bool> cdc_desc_exists(db_clock::time_point, context);
|
||||
|
||||
@@ -112,25 +105,10 @@ public:
|
||||
// NOTE: currently used only by alternator
|
||||
future<db_clock::time_point> cdc_current_generation_timestamp(context);
|
||||
|
||||
/* Inserts a single SSTable entry for a given snapshot, keyspace, table, datacenter,
|
||||
* and rack. The row is written with the specified TTL (in seconds). Uses consistency
|
||||
* level `EACH_QUORUM` by default.*/
|
||||
future<> insert_snapshot_sstable(sstring snapshot_name, sstring ks, sstring table, sstring dc, sstring rack, sstables::sstable_id sstable_id, dht::token first_token, dht::token last_token, sstring toc_name, sstring prefix, db::consistency_level cl = db::consistency_level::EACH_QUORUM);
|
||||
|
||||
/* Retrieves all SSTable entries for a given snapshot, keyspace, table, datacenter, and rack.
|
||||
* If `start_token` and `end_token` are provided, only entries whose `first_token` is in the range [`start_token`, `end_token`] will be returned.
|
||||
* Returns a vector of `snapshot_sstable_entry` structs containing `sstable_id`, `first_token`, `last_token`,
|
||||
* `toc_name`, and `prefix`. Uses consistency level `LOCAL_QUORUM` by default. */
|
||||
future<utils::chunked_vector<snapshot_sstable_entry>> get_snapshot_sstables(sstring snapshot_name, sstring ks, sstring table, sstring dc, sstring rack, db::consistency_level cl = db::consistency_level::LOCAL_QUORUM, std::optional<dht::token> start_token = std::nullopt, std::optional<dht::token> end_token = std::nullopt) const;
|
||||
|
||||
future<> update_sstable_download_status(sstring snapshot_name,
|
||||
sstring ks,
|
||||
sstring table,
|
||||
sstring dc,
|
||||
sstring rack,
|
||||
sstables::sstable_id sstable_id,
|
||||
dht::token start_token,
|
||||
is_downloaded downloaded) const;
|
||||
future<qos::service_levels_info> get_service_levels(qos::query_context ctx) const;
|
||||
future<qos::service_levels_info> get_service_level(sstring service_level_name) const;
|
||||
future<> set_service_level(sstring service_level_name, qos::service_level_options slo) const;
|
||||
future<> drop_service_level(sstring service_level_name) const;
|
||||
|
||||
private:
|
||||
future<> create_tables(std::vector<schema_ptr> tables);
|
||||
|
||||
@@ -1146,8 +1146,7 @@ schema_ptr system_keyspace::sstables_registry() {
|
||||
static thread_local auto schema = [] {
|
||||
auto id = generate_legacy_id(NAME, SSTABLES_REGISTRY);
|
||||
return schema_builder(NAME, SSTABLES_REGISTRY, id)
|
||||
.with_column("table_id", uuid_type, column_kind::partition_key)
|
||||
.with_column("node_owner", uuid_type, column_kind::partition_key)
|
||||
.with_column("owner", uuid_type, column_kind::partition_key)
|
||||
.with_column("generation", timeuuid_type, column_kind::clustering_key)
|
||||
.with_column("status", utf8_type)
|
||||
.with_column("state", utf8_type)
|
||||
@@ -1310,7 +1309,6 @@ schema_ptr system_keyspace::view_building_tasks() {
|
||||
return schema_builder(NAME, VIEW_BUILDING_TASKS, std::make_optional(id))
|
||||
.with_column("key", utf8_type, column_kind::partition_key)
|
||||
.with_column("id", timeuuid_type, column_kind::clustering_key)
|
||||
.with_column("min_task_id", timeuuid_type, column_kind::static_column)
|
||||
.with_column("type", utf8_type)
|
||||
.with_column("aborted", boolean_type)
|
||||
.with_column("base_id", uuid_type)
|
||||
@@ -2751,36 +2749,12 @@ future<mutation> system_keyspace::make_remove_view_build_status_on_host_mutation
|
||||
|
||||
static constexpr auto VIEW_BUILDING_KEY = "view_building";
|
||||
|
||||
future<std::pair<db::view::building_tasks, std::optional<utils::UUID>>> system_keyspace::get_view_building_tasks() {
|
||||
future<db::view::building_tasks> system_keyspace::get_view_building_tasks() {
|
||||
static const sstring query = format("SELECT id, type, aborted, base_id, view_id, last_token, host_id, shard FROM {}.{} WHERE key = '{}'", NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
|
||||
using namespace db::view;
|
||||
|
||||
// When the VIEW_BUILDING_TASKS_MIN_TASK_ID feature is active, read the static
|
||||
// column min_task_id first and use it as a lower bound for the clustering row
|
||||
// scan. This skips tombstoned rows below the boundary, avoiding dead-cell
|
||||
// warnings from the tombstone_warn_threshold check.
|
||||
std::optional<utils::UUID> min_task_id;
|
||||
if (_db.features().view_building_tasks_min_task_id) {
|
||||
auto schema = view_building_tasks();
|
||||
auto pk = partition_key::from_single_value(*schema, data_value(VIEW_BUILDING_KEY).serialize_nonnull());
|
||||
auto dk = dht::decorate_key(*schema, pk);
|
||||
auto col_id = schema->get_column_definition("min_task_id")->id;
|
||||
query::partition_slice slice(
|
||||
query::clustering_row_ranges{},
|
||||
{col_id},
|
||||
{},
|
||||
query::partition_slice::option_set::of<query::partition_slice::option::always_return_static_content>());
|
||||
auto cmd = query::read_command(schema->id(), schema->version(), slice,
|
||||
_db.get_query_max_result_size(), query::tombstone_limit::max);
|
||||
auto [qr, _cache_temp] = co_await _db.query(schema, cmd, query::result_options::only_result(),
|
||||
{dht::partition_range::make_singular(dk)}, nullptr, db::no_timeout);
|
||||
auto rs = query::result_set::from_raw_result(schema, slice, *qr);
|
||||
if (!rs.empty()) {
|
||||
min_task_id = rs.row(0).get<utils::UUID>("min_task_id");
|
||||
}
|
||||
}
|
||||
|
||||
building_tasks tasks;
|
||||
auto process_row = [&] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
|
||||
co_await _qp.query_internal(query, [&] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
|
||||
auto id = row.get_as<utils::UUID>("id");
|
||||
auto type = task_type_from_string(row.get_as<sstring>("type"));
|
||||
auto aborted = row.get_as<bool>("aborted");
|
||||
@@ -2805,18 +2779,8 @@ future<std::pair<db::view::building_tasks, std::optional<utils::UUID>>> system_k
|
||||
break;
|
||||
}
|
||||
co_return stop_iteration::no;
|
||||
};
|
||||
|
||||
if (min_task_id) {
|
||||
static const sstring bounded_query = format("SELECT id, type, aborted, base_id, view_id, last_token, host_id, shard FROM {}.{} WHERE key = '{}' AND id >= ?",
|
||||
NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
|
||||
co_await _qp.query_internal(bounded_query, db::consistency_level::LOCAL_ONE, {*min_task_id}, 1000, std::move(process_row));
|
||||
} else {
|
||||
static const sstring full_query = format("SELECT id, type, aborted, base_id, view_id, last_token, host_id, shard FROM {}.{} WHERE key = '{}'",
|
||||
NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
|
||||
co_await _qp.query_internal(full_query, std::move(process_row));
|
||||
}
|
||||
co_return std::pair{std::move(tasks), std::move(min_task_id)};
|
||||
});
|
||||
co_return tasks;
|
||||
}
|
||||
|
||||
future<mutation> system_keyspace::make_view_building_task_mutation(api::timestamp_type ts, const db::view::view_building_task& task) {
|
||||
@@ -3509,37 +3473,37 @@ system_keyspace::read_cdc_generation_opt(utils::UUID id) {
|
||||
co_return cdc::topology_description{std::move(entries)};
|
||||
}
|
||||
|
||||
// NOTE(review): this span holds two signatures of the same function that share one
// closing brace; it looks like both sides of a diff rendered without +/- markers.
//
// Variant keyed by (tid, node_owner): inserts one row into system.<SSTABLES_REGISTRY>
// with columns table_id, node_owner, generation, status, state, version and format.
future<> system_keyspace::sstables_registry_create_entry(table_id tid, locator::host_id node_owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc) {
|
||||
// Function-local static: the statement text is formatted once.
static const auto req = format("INSERT INTO system.{} (table_id, node_owner, generation, status, state, version, format) VALUES (?, ?, ?, ?, ?, ?, ?)", SSTABLES_REGISTRY);
|
||||
slogger.trace("Inserting {}.{}.{} into {}", tid, node_owner, desc.generation, SSTABLES_REGISTRY);
|
||||
// `state` is stored as the string from sstables::state_to_dir(); version and
// format are stored as their fmt::to_string() text. The result is discarded.
co_await execute_cql(req, tid.id, node_owner.uuid(), desc.generation, status, sstables::state_to_dir(state), fmt::to_string(desc.version), fmt::to_string(desc.format)).discard_result();
|
||||
// Variant keyed by a single `owner` table id: inserts one row with columns
// owner, generation, status, state, version and format.
future<> system_keyspace::sstables_registry_create_entry(table_id owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc) {
|
||||
// Function-local static: the statement text is formatted once.
static const auto req = format("INSERT INTO system.{} (owner, generation, status, state, version, format) VALUES (?, ?, ?, ?, ?, ?)", SSTABLES_REGISTRY);
|
||||
slogger.trace("Inserting {}.{} into {}", owner, desc.generation, SSTABLES_REGISTRY);
|
||||
// Same value encoding as the variant above, without the node_owner column.
co_await execute_cql(req, owner.id, desc.generation, status, sstables::state_to_dir(state), fmt::to_string(desc.version), fmt::to_string(desc.format)).discard_result();
|
||||
}
|
||||
|
||||
// NOTE(review): this span holds two signatures of the same function that share one
// closing brace; it looks like both sides of a diff rendered without +/- markers.
//
// Variant keyed by (tid, node_owner): sets the `status` column of the row selected
// by table_id, node_owner and generation in system.<SSTABLES_REGISTRY>.
future<> system_keyspace::sstables_registry_update_entry_status(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstring status) {
|
||||
static const auto req = format("UPDATE system.{} SET status = ? WHERE table_id = ? AND node_owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Updating {}.{}.{} -> status={} in {}", tid, node_owner, gen, status, SSTABLES_REGISTRY);
|
||||
// Bound values follow the placeholder order: new status first, then the key columns.
co_await execute_cql(req, status, tid.id, node_owner.uuid(), gen).discard_result();
|
||||
// Variant keyed by a single `owner` table id: sets the `status` column of the row
// selected by owner and generation.
future<> system_keyspace::sstables_registry_update_entry_status(table_id owner, sstables::generation_type gen, sstring status) {
|
||||
static const auto req = format("UPDATE system.{} SET status = ? WHERE owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Updating {}.{} -> status={} in {}", owner, gen, status, SSTABLES_REGISTRY);
|
||||
// Bound values follow the placeholder order: new status first, then the key columns.
co_await execute_cql(req, status, owner.id, gen).discard_result();
|
||||
}
|
||||
|
||||
// NOTE(review): this span interleaves two variants of the same function (two
// signatures and statements, one shared `new_state` line and one closing brace); it
// looks like both sides of a diff rendered without +/- markers.
//
// Variant keyed by (tid, node_owner): sets the `state` column of the row selected by
// table_id, node_owner and generation in system.<SSTABLES_REGISTRY>.
future<> system_keyspace::sstables_registry_update_entry_state(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstables::sstable_state state) {
|
||||
static const auto req = format("UPDATE system.{} SET state = ? WHERE table_id = ? AND node_owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
// Variant keyed by a single `owner` table id: sets the `state` column of the row
// selected by owner and generation.
future<> system_keyspace::sstables_registry_update_entry_state(table_id owner, sstables::generation_type gen, sstables::sstable_state state) {
|
||||
static const auto req = format("UPDATE system.{} SET state = ? WHERE owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
// The state is stored as the string returned by sstables::state_to_dir().
auto new_state = sstables::state_to_dir(state);
|
||||
// Trace and execute lines of the (tid, node_owner) variant.
slogger.trace("Updating {}.{}.{} -> state={} in {}", tid, node_owner, gen, new_state, SSTABLES_REGISTRY);
|
||||
co_await execute_cql(req, new_state, tid.id, node_owner.uuid(), gen).discard_result();
|
||||
// Trace and execute lines of the owner-only variant.
slogger.trace("Updating {}.{} -> state={} in {}", owner, gen, new_state, SSTABLES_REGISTRY);
|
||||
co_await execute_cql(req, new_state, owner.id, gen).discard_result();
|
||||
}
|
||||
|
||||
// NOTE(review): this span holds two signatures of the same function that share one
// closing brace; it looks like both sides of a diff rendered without +/- markers.
//
// Variant keyed by (tid, node_owner): deletes the row selected by table_id,
// node_owner and generation from system.<SSTABLES_REGISTRY>.
future<> system_keyspace::sstables_registry_delete_entry(table_id tid, locator::host_id node_owner, sstables::generation_type gen) {
|
||||
static const auto req = format("DELETE FROM system.{} WHERE table_id = ? AND node_owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Removing {}.{}.{} from {}", tid, node_owner, gen, SSTABLES_REGISTRY);
|
||||
// The statement result is discarded.
co_await execute_cql(req, tid.id, node_owner.uuid(), gen).discard_result();
|
||||
// Variant keyed by a single `owner` table id: deletes the row selected by owner
// and generation.
future<> system_keyspace::sstables_registry_delete_entry(table_id owner, sstables::generation_type gen) {
|
||||
static const auto req = format("DELETE FROM system.{} WHERE owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Removing {}.{} from {}", owner, gen, SSTABLES_REGISTRY);
|
||||
// The statement result is discarded.
co_await execute_cql(req, owner.id, gen).discard_result();
|
||||
|
||||
}
|
||||
|
||||
future<> system_keyspace::sstables_registry_list(table_id tid, locator::host_id node_owner, sstable_registry_entry_consumer consumer) {
|
||||
static const auto req = format("SELECT status, state, generation, version, format FROM system.{} WHERE table_id = ? AND node_owner = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Listing {}.{} entries from {}", tid, node_owner, SSTABLES_REGISTRY);
|
||||
future<> system_keyspace::sstables_registry_list(table_id owner, sstable_registry_entry_consumer consumer) {
|
||||
static const auto req = format("SELECT status, state, generation, version, format FROM system.{} WHERE owner = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Listing {} entries from {}", owner, SSTABLES_REGISTRY);
|
||||
|
||||
co_await _qp.query_internal(req, db::consistency_level::ONE, { tid.id, node_owner.uuid() }, 1000, [ consumer = std::move(consumer) ] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
|
||||
co_await _qp.query_internal(req, db::consistency_level::ONE, { owner.id }, 1000, [ consumer = std::move(consumer) ] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
|
||||
auto status = row.get_as<sstring>("status");
|
||||
auto state = sstables::state_from_dir(row.get_as<sstring>("state"));
|
||||
auto gen = sstables::generation_type(row.get_as<utils::UUID>("generation"));
|
||||
|
||||
@@ -572,7 +572,7 @@ public:
|
||||
future<mutation> make_remove_view_build_status_on_host_mutation(api::timestamp_type ts, system_keyspace_view_name view_name, locator::host_id host_id);
|
||||
|
||||
// system.view_building_tasks
|
||||
future<std::pair<db::view::building_tasks, std::optional<utils::UUID>>> get_view_building_tasks();
|
||||
future<db::view::building_tasks> get_view_building_tasks();
|
||||
future<mutation> make_view_building_task_mutation(api::timestamp_type ts, const db::view::view_building_task& task);
|
||||
future<mutation> make_remove_view_building_task_mutation(api::timestamp_type ts, utils::UUID id);
|
||||
|
||||
@@ -671,12 +671,12 @@ public:
|
||||
future<mutation> make_view_builder_version_mutation(api::timestamp_type ts, view_builder_version_t version);
|
||||
future<view_builder_version_t> get_view_builder_version();
|
||||
|
||||
future<> sstables_registry_create_entry(table_id tid, locator::host_id node_owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc);
|
||||
future<> sstables_registry_update_entry_status(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstring status);
|
||||
future<> sstables_registry_update_entry_state(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstables::sstable_state state);
|
||||
future<> sstables_registry_delete_entry(table_id tid, locator::host_id node_owner, sstables::generation_type gen);
|
||||
future<> sstables_registry_create_entry(table_id owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc);
|
||||
future<> sstables_registry_update_entry_status(table_id owner, sstables::generation_type gen, sstring status);
|
||||
future<> sstables_registry_update_entry_state(table_id owner, sstables::generation_type gen, sstables::sstable_state state);
|
||||
future<> sstables_registry_delete_entry(table_id owner, sstables::generation_type gen);
|
||||
using sstable_registry_entry_consumer = sstables::sstables_registry::entry_consumer;
|
||||
future<> sstables_registry_list(table_id tid, locator::host_id node_owner, sstable_registry_entry_consumer consumer);
|
||||
future<> sstables_registry_list(table_id owner, sstable_registry_entry_consumer consumer);
|
||||
|
||||
future<std::optional<sstring>> load_group0_upgrade_state();
|
||||
future<> save_group0_upgrade_state(sstring);
|
||||
|
||||
@@ -15,24 +15,24 @@ class system_keyspace_sstables_registry : public sstables::sstables_registry {
|
||||
public:
|
||||
system_keyspace_sstables_registry(system_keyspace& keyspace) : _keyspace(keyspace.shared_from_this()) {}
|
||||
|
||||
// NOTE(review): two signatures of this override share one closing brace; it looks
// like both sides of a diff rendered without +/- markers.
//
// (tid, node_owner) variant: forwards every argument unchanged to
// system_keyspace::sstables_registry_create_entry via the held _keyspace pointer.
virtual seastar::future<> create_entry(table_id tid, locator::host_id node_owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc) override {
|
||||
return _keyspace->sstables_registry_create_entry(tid, node_owner, status, state, desc);
|
||||
// owner-only variant: same forwarding, without the node_owner argument.
virtual seastar::future<> create_entry(table_id owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc) override {
|
||||
return _keyspace->sstables_registry_create_entry(owner, status, state, desc);
|
||||
}
|
||||
|
||||
// NOTE(review): two signatures of this override share one closing brace; it looks
// like both sides of a diff rendered without +/- markers.
//
// (tid, node_owner) variant: forwards every argument unchanged to
// system_keyspace::sstables_registry_update_entry_status via the held _keyspace pointer.
virtual seastar::future<> update_entry_status(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstring status) override {
|
||||
return _keyspace->sstables_registry_update_entry_status(tid, node_owner, gen, status);
|
||||
// owner-only variant: same forwarding, without the node_owner argument.
virtual seastar::future<> update_entry_status(table_id owner, sstables::generation_type gen, sstring status) override {
|
||||
return _keyspace->sstables_registry_update_entry_status(owner, gen, status);
|
||||
}
|
||||
|
||||
// NOTE(review): two signatures of this override share one closing brace; it looks
// like both sides of a diff rendered without +/- markers.
//
// (tid, node_owner) variant: forwards every argument unchanged to
// system_keyspace::sstables_registry_update_entry_state via the held _keyspace pointer.
virtual seastar::future<> update_entry_state(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstables::sstable_state state) override {
|
||||
return _keyspace->sstables_registry_update_entry_state(tid, node_owner, gen, state);
|
||||
// owner-only variant: same forwarding, without the node_owner argument.
virtual seastar::future<> update_entry_state(table_id owner, sstables::generation_type gen, sstables::sstable_state state) override {
|
||||
return _keyspace->sstables_registry_update_entry_state(owner, gen, state);
|
||||
}
|
||||
|
||||
// NOTE(review): two signatures of this override share one closing brace; it looks
// like both sides of a diff rendered without +/- markers.
//
// (tid, node_owner) variant: forwards every argument unchanged to
// system_keyspace::sstables_registry_delete_entry via the held _keyspace pointer.
virtual seastar::future<> delete_entry(table_id tid, locator::host_id node_owner, sstables::generation_type gen) override {
|
||||
return _keyspace->sstables_registry_delete_entry(tid, node_owner, gen);
|
||||
// owner-only variant: same forwarding, without the node_owner argument.
virtual seastar::future<> delete_entry(table_id owner, sstables::generation_type gen) override {
|
||||
return _keyspace->sstables_registry_delete_entry(owner, gen);
|
||||
}
|
||||
|
||||
// NOTE(review): two signatures of this override share one closing brace; it looks
// like both sides of a diff rendered without +/- markers.
//
// (tid, node_owner) variant: forwards to system_keyspace::sstables_registry_list via
// the held _keyspace pointer; the consumer callback is moved into the call.
virtual seastar::future<> sstables_registry_list(table_id tid, locator::host_id node_owner, entry_consumer consumer) override {
|
||||
return _keyspace->sstables_registry_list(tid, node_owner, std::move(consumer));
|
||||
// owner-only variant: same forwarding, without the node_owner argument.
virtual seastar::future<> sstables_registry_list(table_id owner, entry_consumer consumer) override {
|
||||
return _keyspace->sstables_registry_list(owner, std::move(consumer));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
|
||||
#include "db/view/view_update_backlog.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
|
||||
#include <seastar/core/cacheline.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
@@ -42,16 +41,13 @@ class node_update_backlog {
|
||||
std::chrono::milliseconds _interval;
|
||||
std::atomic<clock::time_point> _last_update;
|
||||
std::atomic<update_backlog> _max;
|
||||
utils::updateable_value<uint32_t> _view_flow_control_delay_limit_in_ms;
|
||||
|
||||
public:
|
||||
explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval,
|
||||
utils::updateable_value<uint32_t> view_flow_control_delay_limit_in_ms = utils::updateable_value<uint32_t>(1000))
|
||||
explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval)
|
||||
: _backlogs(shards)
|
||||
, _interval(interval)
|
||||
, _last_update(clock::now() - _interval)
|
||||
, _max(update_backlog::no_backlog())
|
||||
, _view_flow_control_delay_limit_in_ms(std::move(view_flow_control_delay_limit_in_ms)) {
|
||||
, _max(update_backlog::no_backlog()) {
|
||||
if (utils::get_local_injector().enter("update_backlog_immediately")) {
|
||||
_interval = std::chrono::milliseconds(0);
|
||||
_last_update = clock::now();
|
||||
@@ -63,9 +59,6 @@ public:
|
||||
update_backlog fetch_shard(unsigned shard);
|
||||
seastar::future<std::optional<update_backlog>> fetch_if_changed();
|
||||
|
||||
std::chrono::microseconds calculate_throttling_delay(update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout) const;
|
||||
|
||||
// Exposed for testing only.
|
||||
update_backlog load() const {
|
||||
return _max.load(std::memory_order_relaxed);
|
||||
|
||||
@@ -150,14 +150,14 @@ row_locker::unlock(const dht::decorated_key* pk, bool partition_exclusive,
|
||||
auto pli = _two_level_locks.find(*pk);
|
||||
if (pli == _two_level_locks.end()) {
|
||||
// This shouldn't happen... We can't unlock this lock if we can't find it...
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition {}", *pk);
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition", *pk);
|
||||
return;
|
||||
}
|
||||
SCYLLA_ASSERT(&pli->first == pk);
|
||||
if (cpk) {
|
||||
auto rli = pli->second._row_locks.find(*cpk);
|
||||
if (rli == pli->second._row_locks.end()) {
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row {}", *cpk);
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row", *cpk);
|
||||
return;
|
||||
}
|
||||
SCYLLA_ASSERT(&rli->first == cpk);
|
||||
|
||||
@@ -45,7 +45,6 @@
|
||||
#include "db/view/view_builder.hh"
|
||||
#include "db/view/view_updating_consumer.hh"
|
||||
#include "db/view/view_update_generator.hh"
|
||||
#include "db/view/node_view_update_backlog.hh"
|
||||
#include "db/view/regular_column_transformation.hh"
|
||||
#include "db/system_keyspace_view_types.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
@@ -3493,27 +3492,18 @@ future<> delete_ghost_rows_visitor::do_accept_new_row(partition_key pk, clusteri
|
||||
}
|
||||
}
|
||||
|
||||
// View updates are asynchronous, and because of this limiting their concurrency requires
|
||||
// a special approach. The current algorithm places all of the pending view updates in the backlog
|
||||
// and artificially slows down new responses to coordinator requests based on how full the backlog is.
|
||||
// This function calculates how much a request should be slowed down based on the backlog's fullness.
|
||||
// The equation is basically: delay(in seconds) = view_fullness_ratio^3
|
||||
// The more full the backlog gets the more aggressively the requests are slowed down.
|
||||
// The delay is limited to the amount of time left until timeout.
|
||||
// After the timeout the request fails, so there's no point in waiting longer than that.
|
||||
// The second argument defines this timeout point - we can't delay the request more than this time point.
|
||||
// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
|
||||
std::chrono::microseconds node_update_backlog::calculate_throttling_delay(update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout) const {
|
||||
std::chrono::microseconds calculate_view_update_throttling_delay(db::view::update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout,
|
||||
uint32_t view_flow_control_delay_limit_in_ms) {
|
||||
auto adjust = [] (float x) { return x * x * x; };
|
||||
auto budget = std::max(db::timeout_clock::duration(0),
|
||||
timeout - db::timeout_clock::now());
|
||||
std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * _view_flow_control_delay_limit_in_ms() * 1000));
|
||||
auto budget = std::max(service::storage_proxy::clock_type::duration(0),
|
||||
timeout - service::storage_proxy::clock_type::now());
|
||||
std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * view_flow_control_delay_limit_in_ms * 1000));
|
||||
// "budget" has millisecond resolution and can potentially be long
|
||||
// in the future so converting it to microseconds may overflow.
|
||||
// So to compare budget and ret we need to convert both to the lower
|
||||
// resolution.
|
||||
if (std::chrono::duration_cast<db::timeout_clock::duration>(ret) < budget) {
|
||||
if (std::chrono::duration_cast<service::storage_proxy::clock_type::duration>(ret) < budget) {
|
||||
return ret;
|
||||
} else {
|
||||
// budget is small (< ret) so can be converted to microseconds
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include <exception>
|
||||
#include <ranges>
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include <seastar/coroutine/parallel_for_each.hh>
|
||||
#include <seastar/core/on_internal_error.hh>
|
||||
#include "db/view/view_building_coordinator.hh"
|
||||
@@ -180,10 +179,7 @@ future<> view_building_coordinator::clean_finished_tasks() {
|
||||
co_return;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
|
||||
// Collect tasks eligible for deletion: must still be in state and not aborted.
|
||||
std::vector<utils::UUID> tasks_to_delete;
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
for (auto& [replica, tasks]: _finished_tasks) {
|
||||
for (auto& task_id: tasks) {
|
||||
// The task might be aborted in the meantime. In this case we cannot remove it because we need it to create a new task.
|
||||
@@ -193,65 +189,15 @@ future<> view_building_coordinator::clean_finished_tasks() {
|
||||
// If yes, we can just remove it instead of aborting it.
|
||||
auto task_opt = _vb_sm.building_state.get_task(*_vb_sm.building_state.currently_processed_base_table, replica, task_id);
|
||||
if (task_opt && !task_opt->get().aborted) {
|
||||
tasks_to_delete.push_back(task_id);
|
||||
builder.del_task(task_id);
|
||||
vbc_logger.debug("Removing finished task with ID: {}", task_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!tasks_to_delete.empty()) {
|
||||
// Find the minimum UUID (by timeuuid ordering) among tasks that are NOT being
|
||||
// deleted — i.e., alive tasks that must remain in the table.
|
||||
// Everything strictly below this boundary is safe to cover with one range tombstone.
|
||||
const std::unordered_set<utils::UUID> to_delete_set(tasks_to_delete.begin(), tasks_to_delete.end());
|
||||
std::optional<utils::UUID> min_alive_uuid;
|
||||
for (auto& [base_id, base_tasks] : _vb_sm.building_state.tasks_state) {
|
||||
for (auto& [replica, rep_tasks] : base_tasks) {
|
||||
auto check = [&](const utils::UUID& id) {
|
||||
if (!to_delete_set.contains(id)
|
||||
&& (!min_alive_uuid || timeuuid_tri_compare(id, *min_alive_uuid) < 0)) {
|
||||
min_alive_uuid = id;
|
||||
}
|
||||
};
|
||||
for (auto& [id, task] : rep_tasks.staging_tasks) {
|
||||
check(id);
|
||||
}
|
||||
for (auto& [view_id, task_m] : rep_tasks.view_tasks) {
|
||||
for (auto& [id, task] : task_m) {
|
||||
check(id);
|
||||
}
|
||||
}
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
}
|
||||
|
||||
if (min_alive_uuid) {
|
||||
vbc_logger.debug("Removing finished tasks before ID: {} using range tombstone", *min_alive_uuid);
|
||||
builder.del_tasks_before(*min_alive_uuid);
|
||||
for (auto& task_id : tasks_to_delete) {
|
||||
// Tasks below min_alive_uuid are already covered by the range tombstone.
|
||||
if (timeuuid_tri_compare(task_id, *min_alive_uuid) < 0) {
|
||||
continue;
|
||||
}
|
||||
vbc_logger.debug("Removing finished task with ID: {}", task_id);
|
||||
builder.del_task(task_id);
|
||||
}
|
||||
} else {
|
||||
// No alive tasks remain — one range tombstone covers everything.
|
||||
vbc_logger.debug("No alive tasks remain, removing all finished tasks using range tombstone");
|
||||
builder.del_all_tasks();
|
||||
}
|
||||
|
||||
if (_db.features().view_building_tasks_min_task_id) {
|
||||
// If min_alive_uuid == std::nullopt, set min_task_id to a fresh UUID,
|
||||
// so future scans start past all the just-deleted rows (new tasks created
|
||||
// later will have larger UUIDs).
|
||||
builder.set_min_task_id(min_alive_uuid ? *min_alive_uuid : utils::UUID_gen::get_time_UUID());
|
||||
}
|
||||
|
||||
co_await commit_mutations(std::move(guard), {builder.build()}, "remove finished view building tasks");
|
||||
for (auto& [_, tasks_set]: _finished_tasks) {
|
||||
tasks_set.clear();
|
||||
}
|
||||
co_await commit_mutations(std::move(guard), {builder.build()}, "remove finished view building tasks");
|
||||
for (auto& [_, tasks_set]: _finished_tasks) {
|
||||
tasks_set.clear();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -587,7 +533,7 @@ void view_building_coordinator::generate_tablet_migration_updates(utils::chunked
|
||||
}
|
||||
|
||||
auto last_token = tmap.get_last_token(gid.tablet);
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
|
||||
auto create_task_copy_on_pending_replica = [&] (const view_building_task& task) {
|
||||
auto new_id = builder.new_id();
|
||||
@@ -655,7 +601,7 @@ void view_building_coordinator::generate_tablet_resize_updates(utils::chunked_ve
|
||||
return;
|
||||
}
|
||||
bool is_split = old_tmap.tablet_count() < new_tmap.tablet_count();
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
|
||||
auto create_task_copy = [&] (const view_building_task& task, dht::token last_token) -> utils::UUID {
|
||||
auto new_id = builder.new_id();
|
||||
@@ -725,7 +671,7 @@ void view_building_coordinator::abort_tasks(utils::chunked_vector<canonical_muta
|
||||
}
|
||||
vbc_logger.debug("Generating abort mutations for tasks for table {}", table_id);
|
||||
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
auto abort_task_map = [&] (const task_map& task_map) {
|
||||
for (auto& [id, _]: task_map) {
|
||||
vbc_logger.debug("Aborting task {}", id);
|
||||
@@ -754,7 +700,7 @@ void abort_view_building_tasks(const view_building_state_machine& vb_sm,
|
||||
}
|
||||
vbc_logger.debug("Generating abort mutations for tasks for table {} on replica {} and last token {}", table_id, replica, last_token);
|
||||
|
||||
view_building_task_mutation_builder builder(write_timestamp, vb_sm.building_state.make_task_uuid_generator(write_timestamp));
|
||||
view_building_task_mutation_builder builder(write_timestamp);
|
||||
auto abort_task_map = [&] (const task_map& task_map) {
|
||||
for (auto& [id, task]: task_map) {
|
||||
if (task.last_token == last_token) {
|
||||
@@ -796,7 +742,7 @@ void view_building_coordinator::rollback_aborted_tasks(utils::chunked_vector<can
|
||||
return;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
auto& base_tasks = _vb_sm.building_state.tasks_state.at(table_id);
|
||||
for (auto& [_, replica_tasks]: base_tasks) {
|
||||
for (auto& [_, building_task_map]: replica_tasks.view_tasks) {
|
||||
@@ -813,7 +759,7 @@ void view_building_coordinator::rollback_aborted_tasks(utils::chunked_vector<can
|
||||
return;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
auto& replica_tasks = _vb_sm.building_state.tasks_state.at(table_id).at(replica);
|
||||
for (auto& [_, building_task_map]: replica_tasks.view_tasks) {
|
||||
rollback_task_map(builder, building_task_map);
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
*/
|
||||
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "utils/UUID_gen.hh"
|
||||
|
||||
namespace db {
|
||||
|
||||
@@ -23,10 +22,9 @@ view_building_task::view_building_task(utils::UUID id, task_type type, bool abor
|
||||
, replica(replica)
|
||||
, last_token(last_token) {}
|
||||
|
||||
view_building_state::view_building_state(building_tasks tasks_state, std::optional<table_id> processed_base_table, std::optional<utils::UUID> min_alive_uuid)
|
||||
view_building_state::view_building_state(building_tasks tasks_state, std::optional<table_id> processed_base_table)
|
||||
: tasks_state(std::move(tasks_state))
|
||||
, currently_processed_base_table(std::move(processed_base_table))
|
||||
, min_alive_uuid(std::move(min_alive_uuid)) {}
|
||||
, currently_processed_base_table(std::move(processed_base_table)) {}
|
||||
|
||||
views_state::views_state(std::map<table_id, std::vector<table_id>> views_per_base, view_build_status_map status_map)
|
||||
: views_per_base(std::move(views_per_base))
|
||||
@@ -129,24 +127,6 @@ std::map<dht::token, std::vector<view_building_task>> view_building_state::colle
|
||||
return tasks;
|
||||
}
|
||||
|
||||
task_uuid_generator::task_uuid_generator(api::timestamp_type base_ts)
|
||||
: _next_ts(base_ts) {}
|
||||
|
||||
utils::UUID task_uuid_generator::operator()() {
|
||||
return utils::UUID_gen::get_random_time_UUID_from_micros(
|
||||
std::chrono::microseconds{_next_ts++});
|
||||
}
|
||||
|
||||
task_uuid_generator view_building_state::make_task_uuid_generator(api::timestamp_type ts) const {
|
||||
if (min_alive_uuid) {
|
||||
auto lower_bound = utils::UUID_gen::micros_timestamp(*min_alive_uuid);
|
||||
if (ts <= lower_bound) {
|
||||
ts = lower_bound + 1;
|
||||
}
|
||||
}
|
||||
return task_uuid_generator{ts};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
#include "db/view/view_build_status.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include <fmt/base.h>
|
||||
#include "schema/schema_fwd.hh"
|
||||
@@ -65,16 +64,6 @@ struct replica_tasks {
|
||||
using base_table_tasks = std::map<locator::tablet_replica, replica_tasks>;
|
||||
using building_tasks = std::map<table_id, base_table_tasks>;
|
||||
|
||||
// Generates unique timeuuids with strictly increasing microsecond timestamps.
|
||||
// Each call to operator() returns a new timeuuid whose timestamp is one
|
||||
// microsecond greater than the previous one.
|
||||
class task_uuid_generator {
|
||||
api::timestamp_type _next_ts;
|
||||
public:
|
||||
explicit task_uuid_generator(api::timestamp_type base_ts);
|
||||
utils::UUID operator()();
|
||||
};
|
||||
|
||||
// Represents cluster-wide view building state (only for tablet-based views).
|
||||
// The state stores all unfinished view building tasks for all tablet-based views
|
||||
// and table_id of currently processed base table by view building coordinator.
|
||||
@@ -84,22 +73,14 @@ public:
|
||||
struct view_building_state {
|
||||
building_tasks tasks_state;
|
||||
std::optional<table_id> currently_processed_base_table;
|
||||
std::optional<utils::UUID> min_alive_uuid;
|
||||
|
||||
view_building_state(building_tasks tasks_state, std::optional<table_id> processed_base_table, std::optional<utils::UUID> min_alive_uuid);
|
||||
view_building_state(building_tasks tasks_state, std::optional<table_id> processed_base_table);
|
||||
view_building_state() = default;
|
||||
|
||||
std::optional<std::reference_wrapper<const view_building_task>> get_task(table_id base_id, locator::tablet_replica replica, utils::UUID id) const;
|
||||
std::vector<std::reference_wrapper<const view_building_task>> get_tasks_for_host(table_id base_id, locator::host_id host) const;
|
||||
std::map<dht::token, std::vector<view_building_task>> collect_tasks_by_last_token(table_id base_table_id) const;
|
||||
std::map<dht::token, std::vector<view_building_task>> collect_tasks_by_last_token(table_id base_table_id, const locator::tablet_replica& replica) const;
|
||||
|
||||
// Creates a generator that produces unique timeuuids suitable for view
|
||||
// building task IDs. The generated uuids have strictly increasing
|
||||
// microsecond timestamps starting from write_timestamp. If min_alive_uuid
|
||||
// is set, all generated uuids are guaranteed to be greater than
|
||||
// *min_alive_uuid in timeuuid order.
|
||||
task_uuid_generator make_task_uuid_generator(api::timestamp_type write_timestamp) const;
|
||||
};
|
||||
|
||||
// Represents global state of tablet-based views.
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace db {
|
||||
namespace view {
|
||||
|
||||
utils::UUID view_building_task_mutation_builder::new_id() {
|
||||
return _uuid_gen();
|
||||
return utils::UUID_gen::get_time_UUID();
|
||||
}
|
||||
|
||||
clustering_key view_building_task_mutation_builder::get_ck(utils::UUID id) {
|
||||
@@ -52,30 +52,6 @@ view_building_task_mutation_builder& view_building_task_mutation_builder::del_ta
|
||||
return *this;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder& view_building_task_mutation_builder::del_tasks_before(utils::UUID id) {
|
||||
auto ck = get_ck(id);
|
||||
range_tombstone rt(
|
||||
position_in_partition::before_all_clustered_rows(),
|
||||
position_in_partition_view(ck, bound_weight::before_all_prefixed),
|
||||
tombstone{_ts, gc_clock::now()});
|
||||
_m.partition().apply_row_tombstone(*_s, std::move(rt));
|
||||
return *this;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder& view_building_task_mutation_builder::del_all_tasks() {
|
||||
range_tombstone rt(
|
||||
position_in_partition::before_all_clustered_rows(),
|
||||
position_in_partition::after_all_clustered_rows(),
|
||||
tombstone{_ts, gc_clock::now()});
|
||||
_m.partition().apply_row_tombstone(*_s, std::move(rt));
|
||||
return *this;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder& view_building_task_mutation_builder::set_min_task_id(utils::UUID id) {
|
||||
_m.set_static_cell("min_task_id", data_value(id), _ts);
|
||||
return *this;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "mutation/mutation.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
@@ -20,19 +19,17 @@ namespace view {
|
||||
// Factory for mutations to `system.view_building_tasks` table.
|
||||
class view_building_task_mutation_builder {
|
||||
api::timestamp_type _ts;
|
||||
task_uuid_generator _uuid_gen;
|
||||
schema_ptr _s;
|
||||
mutation _m;
|
||||
|
||||
public:
|
||||
view_building_task_mutation_builder(api::timestamp_type ts, task_uuid_generator uuid_gen)
|
||||
view_building_task_mutation_builder(api::timestamp_type ts)
|
||||
: _ts(ts)
|
||||
, _uuid_gen(std::move(uuid_gen))
|
||||
, _s(db::system_keyspace::view_building_tasks())
|
||||
, _m(_s, partition_key::from_single_value(*_s, data_value("view_building").serialize_nonnull()))
|
||||
{ }
|
||||
|
||||
utils::UUID new_id();
|
||||
static utils::UUID new_id();
|
||||
|
||||
view_building_task_mutation_builder& set_type(utils::UUID id, db::view::view_building_task::task_type type);
|
||||
view_building_task_mutation_builder& set_aborted(utils::UUID id, bool aborted);
|
||||
@@ -41,12 +38,6 @@ public:
|
||||
view_building_task_mutation_builder& set_last_token(utils::UUID id, dht::token last_token);
|
||||
view_building_task_mutation_builder& set_replica(utils::UUID id, const locator::tablet_replica& replica);
|
||||
view_building_task_mutation_builder& del_task(utils::UUID id);
|
||||
// Deletes all tasks with clustering key < id using a range tombstone.
|
||||
view_building_task_mutation_builder& del_tasks_before(utils::UUID id);
|
||||
// Deletes all tasks using a range tombstone covering the entire clustering range.
|
||||
view_building_task_mutation_builder& del_all_tasks();
|
||||
// Sets the static column min_task_id to `id`.
|
||||
view_building_task_mutation_builder& set_min_task_id(utils::UUID id);
|
||||
|
||||
mutation build() {
|
||||
return std::move(_m);
|
||||
|
||||
@@ -275,12 +275,11 @@ future<> view_building_worker::create_staging_sstable_tasks() {
|
||||
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
auto guard = co_await _group0.client().start_operation(_as);
|
||||
auto uuid_gen = _vb_state_machine.building_state.make_task_uuid_generator(guard.write_timestamp());
|
||||
auto my_host_id = _db.get_token_metadata().get_topology().my_host_id();
|
||||
for (auto& [table_id, sst_infos]: _sstables_to_register) {
|
||||
for (auto& sst_info: sst_infos) {
|
||||
view_building_task task {
|
||||
uuid_gen(), view_building_task::task_type::process_staging, false,
|
||||
utils::UUID_gen::get_time_UUID(), view_building_task::task_type::process_staging, false,
|
||||
table_id, ::table_id{}, {my_host_id, sst_info.shard}, sst_info.last_token
|
||||
};
|
||||
auto mut = co_await _sys_ks.make_view_building_task_mutation(guard.write_timestamp(), task);
|
||||
@@ -716,7 +715,7 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
|
||||
vbw_logger.info("Building range {} for base table {} and views {} was aborted.", range, base_id, views_ids);
|
||||
} catch (...) {
|
||||
eptr = std::current_exception();
|
||||
vbw_logger.warn("Error during processing range {} for base table {} and views {}: {}", range, base_id, views_ids, eptr);
|
||||
vbw_logger.warn("Error during processing range {} for base table {} and views {}: ", range, base_id, views_ids, eptr);
|
||||
}
|
||||
reader.close().get();
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ public:
|
||||
// Returns the number of bytes in the backlog divided by the maximum number of bytes
|
||||
// that the backlog can hold before employing admission control. While the backlog
|
||||
// is below the threshold, the coordinator will slow down the view updates up to
|
||||
// node_update_backlog::calculate_throttling_delay()::delay_limit_us. Above the threshold,
|
||||
// calculate_view_update_throttling_delay()::delay_limit_us. Above the threshold,
|
||||
// the coordinator will reject the writes that would increase the backlog. On the
|
||||
// replica, the writes will start failing only after reaching the hard limit '_max'.
|
||||
float relative_size() const {
|
||||
@@ -70,4 +70,18 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// View updates are asynchronous, and because of this limiting their concurrency requires
|
||||
// a special approach. The current algorithm places all of the pending view updates in the backlog
|
||||
// and artificially slows down new responses to coordinator requests based on how full the backlog is.
|
||||
// This function calculates how much a request should be slowed down based on the backlog's fullness.
|
||||
// The equation is basically: delay(in seconds) = view_fullness_ratio^3
|
||||
// The more full the backlog gets the more aggressively the requests are slowed down.
|
||||
// The delay is limited to the amount of time left until timeout.
|
||||
// After the timeout the request fails, so there's no point in waiting longer than that.
|
||||
// The second argument defines this timeout point - we can't delay the request more than this time point.
|
||||
// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
|
||||
std::chrono::microseconds calculate_view_update_throttling_delay(
|
||||
update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout,
|
||||
uint32_t view_flow_control_delay_limit_in_ms);
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
*/
|
||||
|
||||
#include "db/view/view_update_backlog.hh"
|
||||
#include "db/view/node_view_update_backlog.hh"
|
||||
#include <seastar/core/timed_out_error.hh>
|
||||
#include "gms/inet_address.hh"
|
||||
#include <seastar/util/defer.hh>
|
||||
@@ -96,10 +95,9 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as)
|
||||
view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as)
|
||||
: _db(db)
|
||||
, _proxy(proxy)
|
||||
, _node_update_backlog(node_backlog)
|
||||
, _progress_tracker(std::make_unique<progress_tracker>())
|
||||
, _early_abort_subscription(as.subscribe([this] () noexcept { do_abort(); }))
|
||||
{
|
||||
@@ -114,7 +112,7 @@ future<> view_update_generator::start() {
|
||||
_started = seastar::async([this]() mutable {
|
||||
auto drop_sstable_references = defer([&] () noexcept {
|
||||
// Clear sstable references so sstables_manager::stop() doesn't hang.
|
||||
vug_logger.info("leaving {} unstaged sstables and {} sstables with tables unprocessed",
|
||||
vug_logger.info("leaving {} unstaged sstables unprocessed",
|
||||
_sstables_to_move.size(), _sstables_with_tables.size());
|
||||
_sstables_to_move.clear();
|
||||
_sstables_with_tables.clear();
|
||||
@@ -500,7 +498,7 @@ future<> view_update_generator::generate_and_propagate_view_updates(const replic
|
||||
// the one which limits the number of incoming client requests by delaying the response to the client.
|
||||
if (batch_num > 0) {
|
||||
update_backlog local_backlog = _db.get_view_update_backlog();
|
||||
std::chrono::microseconds throttle_delay = _node_update_backlog.calculate_throttling_delay(local_backlog, timeout);
|
||||
std::chrono::microseconds throttle_delay = calculate_view_update_throttling_delay(local_backlog, timeout, _db.get_config().view_flow_control_delay_limit_in_ms());
|
||||
|
||||
co_await seastar::sleep(throttle_delay);
|
||||
|
||||
|
||||
@@ -52,7 +52,6 @@ using allow_hints = bool_class<allow_hints_tag>;
|
||||
|
||||
namespace db::view {
|
||||
|
||||
class node_update_backlog;
|
||||
class stats;
|
||||
struct wait_for_all_updates_tag {};
|
||||
using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
|
||||
@@ -64,7 +63,6 @@ public:
|
||||
private:
|
||||
replica::database& _db;
|
||||
sharded<service::storage_proxy>& _proxy;
|
||||
node_update_backlog& _node_update_backlog;
|
||||
seastar::abort_source _as;
|
||||
future<> _started = make_ready_future<>();
|
||||
seastar::condition_variable _pending_sstables;
|
||||
@@ -77,7 +75,7 @@ private:
|
||||
optimized_optional<abort_source::subscription> _early_abort_subscription;
|
||||
void do_abort() noexcept;
|
||||
public:
|
||||
view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as);
|
||||
view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as);
|
||||
~view_update_generator();
|
||||
|
||||
future<> start();
|
||||
|
||||
68
dist/CMakeLists.txt
vendored
68
dist/CMakeLists.txt
vendored
@@ -141,72 +141,4 @@ add_dependencies(dist
|
||||
dist-python3
|
||||
dist-server)
|
||||
|
||||
set(dist_rpm_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/rpm")
|
||||
set(dist_deb_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/deb")
|
||||
|
||||
# Map system processor to Debian architecture names
|
||||
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
|
||||
set(deb_arch "amd64")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
||||
set(deb_arch "arm64")
|
||||
else()
|
||||
message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
endif()
|
||||
|
||||
set(rpm_ver "${Scylla_VERSION}-${Scylla_RELEASE}")
|
||||
set(deb_ver "${Scylla_VERSION}-${Scylla_RELEASE}-1")
|
||||
set(rpm_arch "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
|
||||
set(server_rpms_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/redhat/RPMS/${rpm_arch}")
|
||||
set(server_rpms
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-server-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-server-debuginfo-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-conf-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-kernel-conf-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-node-exporter-${rpm_ver}.${rpm_arch}.rpm")
|
||||
set(cqlsh_rpms
|
||||
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-cqlsh-${rpm_ver}.${rpm_arch}.rpm")
|
||||
set(python3_rpms
|
||||
"${CMAKE_SOURCE_DIR}/tools/python3/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-python3-${rpm_ver}.${rpm_arch}.rpm")
|
||||
|
||||
set(server_debs_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/debian")
|
||||
set(server_debs
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-server_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-server-dbg_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-conf_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-kernel-conf_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-node-exporter_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/scylla-enterprise_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-server_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-conf_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-kernel-conf_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-node-exporter_${deb_ver}_all.deb")
|
||||
set(cqlsh_debs
|
||||
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/${Scylla_PRODUCT}-cqlsh_${deb_ver}_${deb_arch}.deb"
|
||||
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/scylla-enterprise-cqlsh_${deb_ver}_all.deb")
|
||||
set(python3_debs
|
||||
"${CMAKE_SOURCE_DIR}/tools/python3/build/debian/${Scylla_PRODUCT}-python3_${deb_ver}_${deb_arch}.deb"
|
||||
"${CMAKE_SOURCE_DIR}/tools/python3/build/debian/scylla-enterprise-python3_${deb_ver}_all.deb")
|
||||
|
||||
add_custom_target(collect-dist-rpm
|
||||
COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_rpm_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_rpm_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${server_rpms} ${cqlsh_rpms} ${python3_rpms} ${dist_rpm_dir}/
|
||||
DEPENDS dist
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
|
||||
COMMENT "Collecting RPMs into ${dist_rpm_dir}")
|
||||
|
||||
add_custom_target(collect-dist-deb
|
||||
COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_deb_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_deb_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${server_debs} ${cqlsh_debs} ${python3_debs} ${dist_deb_dir}/
|
||||
DEPENDS dist
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
|
||||
COMMENT "Collecting DEBs into ${dist_deb_dir}")
|
||||
|
||||
add_custom_target(collect-dist
|
||||
DEPENDS collect-dist-rpm collect-dist-deb)
|
||||
|
||||
add_subdirectory(debuginfo)
|
||||
|
||||
16
dist/common/supervisor/scylla-server.sh
vendored
16
dist/common/supervisor/scylla-server.sh
vendored
@@ -9,22 +9,6 @@ for f in "$etcdir"/scylla.d/*.conf; do
|
||||
done
|
||||
|
||||
if is_privileged; then
|
||||
# Override pipe-based core_pattern that may not work inside a container
|
||||
# (e.g. Ubuntu host's apport). File-based patterns resolve inside the
|
||||
# container's mount namespace, so coredumps land in the right place.
|
||||
# Derive workdir from scylla.yaml, matching the Python entrypoint logic.
|
||||
_workdir=$(python3 -c "import yaml; cfg=yaml.safe_load(open('/etc/scylla/scylla.yaml')); print(cfg.get('workdir') or '/var/lib/scylla')" 2>/dev/null || echo "/var/lib/scylla")
|
||||
_coredump_dir="${_workdir}/coredump"
|
||||
core_pattern=$(cat /proc/sys/kernel/core_pattern 2>/dev/null || true)
|
||||
if [[ "$core_pattern" == "|"* ]]; then
|
||||
if ! mkdir -p "$_coredump_dir" 2>/dev/null; then
|
||||
echo "WARNING: could not create coredump directory $_coredump_dir" >&2
|
||||
elif echo "${_coredump_dir}/core.%e.%p.%t" > /proc/sys/kernel/core_pattern 2>/dev/null; then
|
||||
echo "kernel.core_pattern overridden to file-based pattern: ${_coredump_dir}/core.%e.%p.%t" >&2
|
||||
else
|
||||
echo "WARNING: pipe-based core_pattern detected but could not override. Coredumps may be lost." >&2
|
||||
fi
|
||||
fi
|
||||
"$scriptsdir"/scylla_prepare
|
||||
fi
|
||||
execsudo /usr/bin/env SCYLLA_HOME=$SCYLLA_HOME SCYLLA_CONF=$SCYLLA_CONF "$bindir"/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE $CPUSET $SCYLLA_DOCKER_ARGS
|
||||
|
||||
1
dist/docker/docker-entrypoint.py
vendored
1
dist/docker/docker-entrypoint.py
vendored
@@ -24,7 +24,6 @@ try:
|
||||
setup.developerMode()
|
||||
setup.cpuSet()
|
||||
setup.io()
|
||||
setup.coredumpSetup()
|
||||
setup.cqlshrc()
|
||||
setup.write_rackdc_properties()
|
||||
setup.arguments()
|
||||
|
||||
66
dist/docker/scyllasetup.py
vendored
66
dist/docker/scyllasetup.py
vendored
@@ -3,7 +3,6 @@ import logging
|
||||
import yaml
|
||||
import os
|
||||
import socket
|
||||
import errno
|
||||
|
||||
def is_bind_mount(path):
|
||||
# Check if the file or its parent is a mount point (bind mount or otherwise)
|
||||
@@ -48,7 +47,6 @@ class ScyllaSetup:
|
||||
self._dc = arguments.dc
|
||||
self._rack = arguments.rack
|
||||
self._blocked_reactor_notify_ms = arguments.blocked_reactor_notify_ms
|
||||
self._coredump_dir = None
|
||||
|
||||
def _run(self, *args, **kwargs):
|
||||
logging.info('running: {}'.format(args))
|
||||
@@ -134,70 +132,6 @@ class ScyllaSetup:
|
||||
f.write(f"dc={dc}\n")
|
||||
f.write(f"rack={rack}\n")
|
||||
|
||||
CORE_PATTERN_PATH = '/proc/sys/kernel/core_pattern'
|
||||
|
||||
def _get_coredump_dir(self):
|
||||
"""Return the coredump directory, deriving it from scylla.yaml workdir if needed."""
|
||||
if self._coredump_dir is not None:
|
||||
return self._coredump_dir
|
||||
conf_dir = "/etc/scylla"
|
||||
try:
|
||||
with open(os.path.join(conf_dir, "scylla.yaml")) as f:
|
||||
cfg = yaml.safe_load(f) or {}
|
||||
except Exception:
|
||||
cfg = {}
|
||||
workdir = cfg.get('workdir') or '/var/lib/scylla'
|
||||
self._coredump_dir = os.path.join(workdir, 'coredump')
|
||||
return self._coredump_dir
|
||||
|
||||
def coredumpSetup(self):
|
||||
"""Configure coredump handling for containers.
|
||||
|
||||
The host's kernel.core_pattern may pipe core dumps to a handler
|
||||
(e.g. Ubuntu's apport) that does not exist or work correctly
|
||||
inside the container. This method tries to switch to a file-based
|
||||
core_pattern so that coredumps are written directly to disk.
|
||||
|
||||
Writing to /proc/sys/kernel/core_pattern requires privileges
|
||||
(root with CAP_SYS_ADMIN). When the container lacks permission
|
||||
a warning is logged with guidance for the operator.
|
||||
"""
|
||||
coredump_dir = self._get_coredump_dir()
|
||||
|
||||
try:
|
||||
os.makedirs(coredump_dir, exist_ok=True)
|
||||
except OSError as e:
|
||||
logging.warning('Could not create coredump directory %s: %s',
|
||||
coredump_dir, e)
|
||||
return
|
||||
|
||||
try:
|
||||
with open(self.CORE_PATTERN_PATH) as f:
|
||||
current = f.read().strip()
|
||||
except Exception as e:
|
||||
logging.debug('Could not read %s: %s', self.CORE_PATTERN_PATH, e)
|
||||
return
|
||||
|
||||
if not current.startswith('|'):
|
||||
return
|
||||
|
||||
desired = f'{coredump_dir}/core.%e.%p.%t'
|
||||
try:
|
||||
with open(self.CORE_PATTERN_PATH, 'w') as f:
|
||||
f.write(desired + '\n')
|
||||
logging.info('kernel.core_pattern set to %s', desired)
|
||||
except OSError as e:
|
||||
if e.errno in (errno.EACCES, errno.EPERM, errno.EROFS):
|
||||
logging.warning(
|
||||
'kernel.core_pattern pipes to a program that may not work '
|
||||
'inside the container, and we lack permission to override it. '
|
||||
'To fix this, either run with --privileged or set on the host: '
|
||||
'sysctl -w kernel.core_pattern="%s"', desired)
|
||||
else:
|
||||
logging.debug('Unexpected OSError setting core_pattern: %s', e)
|
||||
except Exception as e:
|
||||
logging.debug('Unexpected error in coredumpSetup: %s', e)
|
||||
|
||||
def arguments(self):
|
||||
args = []
|
||||
if self._memory is not None:
|
||||
|
||||
@@ -324,13 +324,6 @@ experimental:
|
||||
stream events. Without this option, such no-op operations may still
|
||||
generate spurious stream events.
|
||||
<https://github.com/scylladb/scylladb/issues/28368>
|
||||
* When a stream is disabled, no new records are written but the existing
|
||||
stream data is preserved and remains readable through its original
|
||||
StreamArn. The data expires via TTL after 24 hours. Re-enabling the
|
||||
stream purges the old data immediately and produces a new StreamArn.
|
||||
In contrast, DynamoDB keeps the old stream and its data readable for
|
||||
24 hours through the old StreamArn even after re-enabling.
|
||||
<https://scylladb.atlassian.net/browse/SCYLLADB-1873>
|
||||
|
||||
## Unimplemented API features
|
||||
|
||||
|
||||
@@ -1,11 +1,5 @@
|
||||
# Alternator Vector Search
|
||||
|
||||
```{admonition} Availability
|
||||
:class: important
|
||||
|
||||
The Vector Search feature is only available in [ScyllaDB Cloud](https://cloud.docs.scylladb.com/) - a fully managed DBaaS running ScyllaDB.
|
||||
```
|
||||
|
||||
## Introduction
|
||||
|
||||
Alternator vector search is a ScyllaDB extension to the DynamoDB-compatible
|
||||
|
||||
@@ -415,7 +415,7 @@ An empty list is allowed, and it's equivalent to numeric replication factor of 0
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE Excelsior
|
||||
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc2' : []};
|
||||
WITH replication = { 'class' : 'NetworkTopologyStrategy', dc2' : []};
|
||||
|
||||
|
||||
Altering from a rack list to a numeric replication factor is not supported.
|
||||
@@ -1017,11 +1017,11 @@ For example:
|
||||
|
||||
CREATE TABLE customer_data (
|
||||
cust_id uuid,
|
||||
"cust_first-name" text,
|
||||
"cust_last-name" text,
|
||||
cust_first-name text,
|
||||
cust_last-name text,
|
||||
cust_phone text,
|
||||
"cust_get-sms" text,
|
||||
PRIMARY KEY (cust_id)
|
||||
cust_get-sms text,
|
||||
PRIMARY KEY (customer_id)
|
||||
) WITH cdc = { 'enabled' : 'true', 'preimage' : 'true' };
|
||||
|
||||
.. _cql-caching-options:
|
||||
|
||||
@@ -24,8 +24,7 @@ For example:
|
||||
|
||||
INSERT INTO NerdMovies (movie, director, main_actor, year)
|
||||
VALUES ('Serenity', 'Joss Whedon', 'Nathan Fillion', 2005)
|
||||
IF NOT EXISTS
|
||||
USING TTL 86400;
|
||||
USING TTL 86400 IF NOT EXISTS;
|
||||
|
||||
The ``INSERT`` statement writes one or more columns for a given row in a table. Note that since a row is identified by
|
||||
its ``PRIMARY KEY``, at least the columns composing it must be specified. The list of columns to insert to must be
|
||||
|
||||
@@ -71,7 +71,7 @@ used. If it is used, the statement will be a no-op if the materialized view alre
|
||||
MV Select Statement
|
||||
...................
|
||||
|
||||
The select statement of a materialized view creation defines which of the base table columns are included in the view. That
|
||||
The select statement of a materialized view creation defines which of the base table is included in the view. That
|
||||
statement is limited in a number of ways:
|
||||
|
||||
- The :ref:`selection <selection-clause>` is limited to those that only select columns of the base table. In other
|
||||
|
||||
@@ -507,7 +507,7 @@ For example::
|
||||
|
||||
CREATE TABLE superheroes (
|
||||
name frozen<full_name> PRIMARY KEY,
|
||||
home frozen<address>
|
||||
home address
|
||||
);
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -167,11 +167,6 @@ All tables in a keyspace are uploaded, the destination object names will look li
|
||||
or
|
||||
`gs://bucket/some/prefix/to/store/data/.../sstable`
|
||||
|
||||
# System tables
|
||||
There are a few system tables that object storage related code needs to touch in order to operate.
|
||||
* [system_distributed.snapshot_sstables](docs/dev/snapshot_sstables.md) - Used during restore by worker nodes to get the list of SSTables that need to be downloaded from object storage and restored locally.
|
||||
* [system.sstables](docs/dev/system_keyspace.md#systemsstables) - Used to keep track of SSTables on object storage when a keyspace is created with object storage storage_options.
|
||||
|
||||
# Manipulating S3 data
|
||||
|
||||
This section intends to give an overview of where, when and how we store data in S3 and provide a quick set of commands
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
# system\_distributed.snapshot\_sstables
|
||||
|
||||
## Purpose
|
||||
|
||||
This table is used during tablet-aware restore to exchange per-SSTable metadata between
|
||||
the coordinator and worker nodes. When the restore process starts, the coordinator node
|
||||
populates this table with information about each SSTable extracted from the snapshot
|
||||
manifests. Worker nodes then read from this table to determine which SSTables need to
|
||||
be downloaded from object storage and restored locally.
|
||||
|
||||
Rows are inserted with a TTL so that stale restore metadata is automatically cleaned up.
|
||||
|
||||
## Schema
|
||||
|
||||
~~~
|
||||
CREATE TABLE system_distributed.snapshot_sstables (
|
||||
snapshot_name text,
|
||||
"keyspace" text,
|
||||
"table" text,
|
||||
datacenter text,
|
||||
rack text,
|
||||
first_token bigint,
|
||||
sstable_id uuid,
|
||||
last_token bigint,
|
||||
toc_name text,
|
||||
prefix text,
|
||||
PRIMARY KEY ((snapshot_name, "keyspace", "table", datacenter, rack), first_token, sstable_id)
|
||||
)
|
||||
~~~
|
||||
|
||||
Column descriptions:
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `snapshot_name` | text (partition key) | Name of the snapshot |
|
||||
| `keyspace` | text (partition key) | Keyspace the snapshot was taken from |
|
||||
| `table` | text (partition key) | Table within the keyspace |
|
||||
| `datacenter` | text (partition key) | Datacenter where the SSTable is located |
|
||||
| `rack` | text (partition key) | Rack where the SSTable is located |
|
||||
| `first_token` | bigint (clustering key) | First token in the token range covered by this SSTable |
|
||||
| `sstable_id` | uuid (clustering key) | Unique identifier for the SSTable |
|
||||
| `last_token` | bigint | Last token in the token range covered by this SSTable |
|
||||
| `toc_name` | text | TOC filename of the SSTable (e.g. `me-3gdq_0bki_2cvk01yl83nj0tp5gh-big-TOC.txt`) |
|
||||
| `prefix` | text | Prefix path in object storage where the SSTable was backed up |
|
||||
|
||||
## APIs
|
||||
|
||||
The following C++ APIs are provided in `db::system_distributed_keyspace`:
|
||||
|
||||
- insert\_snapshot\_sstable
|
||||
|
||||
- get\_snapshot\_sstables
|
||||
@@ -274,8 +274,6 @@ globally driven by the topology change coordinator and serialized per-tablet. Tr
|
||||
|
||||
- repair - tablet replicas are repaired
|
||||
|
||||
- restore - tablet replicas download SSTables from object storage during cluster-wide backup restore
|
||||
|
||||
Each tablet has its own state machine for keeping state of transition stored in group0 which is part of the tablet state. It involves
|
||||
these properties of a tablet:
|
||||
|
||||
@@ -392,9 +390,6 @@ stateDiagram-v2
|
||||
|
||||
The repair tablet transition kind is different. It passes only through the repair and end_repair stages, because token ownership does not change.
|
||||
|
||||
The restore tablet transition kind is also simple. It uses a single `restore` stage and does not change token
|
||||
ownership. See the [Tablet-aware restore](#tablet-aware-restore) section below for details.
|
||||
|
||||
The behavioral difference between "migration" and "intranode_migration" transitions is in the way "streaming" stage
|
||||
is performed. In case of intra-node migration, streaming is done by fast duplication of data by creating hard links to
|
||||
sstable files on the destination shard. Original sstable files on the source shard will be removed by the standard "cleanup" stage.
|
||||
@@ -989,18 +984,3 @@ Losing a committed entry can be observed by external systems. For example, the l
|
||||
schema version in the cluster can go back in time from the driver's perspective. This
|
||||
is outside the scope of the recovery procedure, though, and it shouldn't cause
|
||||
problems in practice.
|
||||
|
||||
# Tablet restore transition
|
||||
|
||||
The `restore` tablet transition kind is used by the tablet-aware restore to download SSTables
|
||||
from object storage. The transition contains `restore_config` with snapshot name, endpoint and
|
||||
bucket.
|
||||
|
||||
Like `repair`, the `restore` transition does not change token ownership — replicas remain intact.
|
||||
The topology coordinator processes a tablet in this stage by calling the `RESTORE_TABLET` RPC on
|
||||
all tablet replicas. Each replica then downloads and attaches the SSTables that are contained in
|
||||
the tablet's token range. Whether the operation succeeds or fails, the transition is cleared; a
|
||||
failure to download SSTables is propagated back to the user by the API handler itself.
|
||||
|
||||
Restore transitions are serialized per-tablet like any other transition (invariant [INV-TABL-2]),
|
||||
so they do not run concurrently with migrations or repairs on the same tablet.
|
||||
|
||||
@@ -106,7 +106,6 @@ The most important table is `system.view_building_tasks`, which stores all unfin
|
||||
CREATE TABLE system.view_building_tasks (
|
||||
key text,
|
||||
id timeuuid,
|
||||
min_task_id timeuuid STATIC, -- lower bound for task scans; see "Tombstone avoidance" below
|
||||
type text,
|
||||
aborted boolean,
|
||||
base_id uuid,
|
||||
@@ -118,26 +117,6 @@ CREATE TABLE system.view_building_tasks (
|
||||
)
|
||||
```
|
||||
|
||||
### Tombstone avoidance
|
||||
|
||||
`system.view_building_tasks` is a single partition. When `finished_task_gc_fiber()` removes
|
||||
finished tasks in batches, the deleted rows remain as tombstones in SSTables until compaction,
|
||||
causing `tombstone_warn_threshold` warnings on subsequent reloads in large clusters.
|
||||
|
||||
Two mechanisms address this:
|
||||
|
||||
**Range tombstone on GC.** Instead of one row tombstone per deleted task, the coordinator emits
|
||||
a single range tombstone `[before_all, min_alive_uuid)` where `min_alive_uuid` is the smallest
|
||||
timeuuid among surviving tasks. Tasks above the boundary (rare) still get individual row tombstones.
|
||||
When all tasks are deleted, a single full-partition range tombstone is used.
|
||||
|
||||
**Bounded scan on reload.** Physical rows remain until compaction and are still counted as dead cells.
|
||||
After each GC batch, `min_task_id = min_alive_uuid` is written atomically as a static cell (same Raft
|
||||
batch as the range tombstone). On reload, `min_task_id` is read using a **static-only partition slice**
|
||||
(empty `_row_ranges` + `always_return_static_content`) — this makes the SSTable reader stop immediately
|
||||
after the static row, before any clustering tombstones, so zero dead cells are counted. The value is
|
||||
then used as `AND id >= min_task_id` to skip all tombstoned rows in the main scan.
|
||||
|
||||
The view building coordinator stores the base table currently being processed in `system.scylla_local`
|
||||
under `view_building_processing_base` key.
|
||||
The entry is managed by group0.
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
Migrate a Keyspace from Vnodes to Tablets :label-caution:`Experimental`
|
||||
=========================================================================
|
||||
Migrate a Keyspace from Vnodes to Tablets
|
||||
==========================================
|
||||
|
||||
This procedure describes how to migrate an existing keyspace from vnodes
|
||||
to tablets. Tablets are designed to be the long-term replacement for vnodes,
|
||||
@@ -8,9 +8,6 @@ balancing, automatic cleanups, and improved streaming performance. Migrating to
|
||||
tablets is strongly recommended. See :doc:`Data Distribution with Tablets </architecture/tablets/>`
|
||||
for details.
|
||||
|
||||
ℹ️ This feature is experimental and will change in future releases, including
|
||||
the removal of current limitations.
|
||||
|
||||
.. note::
|
||||
|
||||
The migration is an online operation. This means that the keyspace remains
|
||||
|
||||
@@ -16,7 +16,7 @@ Cluster and Node Limits
|
||||
* - Nodes per cluster
|
||||
- Low hundreds
|
||||
* - Node size
|
||||
- 4096 CPUs
|
||||
- 256 vcpu
|
||||
|
||||
See :ref:`Hardware Requirements <system-requirements-hardware>` for storage
|
||||
and memory requirements and limits.
|
||||
|
||||
@@ -4,7 +4,7 @@ Upgrade ScyllaDB
|
||||
|
||||
.. toctree::
|
||||
|
||||
ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2/index>
|
||||
ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
|
||||
ScyllaDB 2026.x Patch Upgrades <upgrade-guide-from-2026.x.y-to-2026.x.z>
|
||||
ScyllaDB Image <ami-upgrade>
|
||||
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
==========================================================
|
||||
Upgrade - ScyllaDB 2025.x to ScyllaDB 2026.1
|
||||
==========================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2026.1>
|
||||
Metrics Update <metric-update-2025.x-to-2026.1>
|
||||
|
||||
* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1>`
|
||||
* :doc:`Metrics Update Between 2025.x and 2026.1 <metric-update-2025.x-to-2026.1>`
|
||||
@@ -0,0 +1,82 @@
|
||||
.. |SRC_VERSION| replace:: 2025.x
|
||||
.. |NEW_VERSION| replace:: 2026.1
|
||||
.. |PRECEDING_VERSION| replace:: 2025.4
|
||||
|
||||
================================================================
|
||||
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
|
||||
================================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
|
||||
|
||||
|
||||
New Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 150
|
||||
:header-rows: 1
|
||||
|
||||
* - Metric
|
||||
- Description
|
||||
* - scylla_alternator_operation_size_kb
|
||||
- Histogram of item sizes involved in a request.
|
||||
* - scylla_column_family_total_disk_space_before_compression
|
||||
- Hypothetical total disk space used if data files weren't compressed
|
||||
* - scylla_group_name_auto_repair_enabled_nr
|
||||
- Number of tablets with auto repair enabled.
|
||||
* - scylla_group_name_auto_repair_needs_repair_nr
|
||||
- Number of tablets with auto repair enabled that currently need repair.
|
||||
* - scylla_lsa_compact_time_ms
|
||||
- Total time spent on segment compaction that was not accounted under ``reclaim_time_ms``.
|
||||
* - scylla_lsa_evict_time_ms
|
||||
- Total time spent on evicting objects that was not accounted under ``reclaim_time_ms``.
|
||||
* - scylla_lsa_reclaim_time_ms
|
||||
- Total time spent in reclaiming LSA memory back to std allocator.
|
||||
* - scylla_object_storage_memory_usage
|
||||
- Total number of bytes consumed by the object storage client.
|
||||
* - scylla_tablet_ops_failed
|
||||
- Number of failed tablet auto repair attempts.
|
||||
* - scylla_tablet_ops_succeeded
|
||||
- Number of successful tablet auto repair attempts.
|
||||
|
||||
Renamed Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are renamed in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 150
|
||||
:header-rows: 1
|
||||
|
||||
* - Metric Name in |PRECEDING_VERSION|
|
||||
- Metric Name in |NEW_VERSION|
|
||||
* - scylla_s3_memory_usage
|
||||
- scylla_object_storage_memory_usage
|
||||
|
||||
Removed Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are removed in ScyllaDB |NEW_VERSION|.
|
||||
|
||||
* scylla_redis_current_connections
|
||||
* scylla_redis_op_latency
|
||||
* scylla_redis_operation
|
||||
* scylla_redis_operation
|
||||
* scylla_redis_requests_latency
|
||||
* scylla_redis_requests_served
|
||||
* scylla_redis_requests_serving
|
||||
|
||||
New and Updated Metrics in Previous Releases
|
||||
-------------------------------------------------------
|
||||
|
||||
* `Metrics Update Between 2025.3 and 2025.4 <https://docs.scylladb.com/manual/branch-2025.4/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.html>`_
|
||||
* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
|
||||
* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
.. |SCYLLA_NAME| replace:: ScyllaDB
|
||||
|
||||
.. |SRC_VERSION| replace:: 2026.1
|
||||
.. |NEW_VERSION| replace:: 2026.2
|
||||
.. |SRC_VERSION| replace:: 2025.x
|
||||
.. |NEW_VERSION| replace:: 2026.1
|
||||
|
||||
.. |ROLLBACK| replace:: rollback
|
||||
.. _ROLLBACK: ./#rollback-procedure
|
||||
|
||||
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2026.1 to 2026.2
|
||||
.. _SCYLLA_METRICS: ../metric-update-2026.1-to-2026.2
|
||||
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2026.1
|
||||
.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2026.1
|
||||
|
||||
=======================================================================================
|
||||
Upgrade from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|
|
||||
@@ -1,13 +0,0 @@
|
||||
==========================================================
|
||||
Upgrade - ScyllaDB 2026.1 to ScyllaDB 2026.2
|
||||
==========================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
Upgrade ScyllaDB <upgrade-guide-from-2026.1-to-2026.2>
|
||||
Metrics Update <metric-update-2026.1-to-2026.2>
|
||||
|
||||
* :doc:`Upgrade from ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2>`
|
||||
* :doc:`Metrics Update Between 2026.1 and 2026.2 <metric-update-2026.1-to-2026.2>`
|
||||
@@ -1,126 +0,0 @@
|
||||
.. |SRC_VERSION| replace:: 2026.1
|
||||
.. |NEW_VERSION| replace:: 2026.2
|
||||
.. |PRECEDING_VERSION| replace:: 2026.1
|
||||
|
||||
================================================================
|
||||
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
|
||||
================================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
|
||||
|
||||
|
||||
New Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 150
|
||||
:header-rows: 1
|
||||
|
||||
* - Metric
|
||||
- Description
|
||||
* - scylla_auth_cache_permissions
|
||||
- Total number of permission sets currently cached across all roles.
|
||||
* - scylla_auth_cache_roles
|
||||
- Number of roles currently cached.
|
||||
* - scylla_cql_forwarded_requests
|
||||
- Counts the total number of attempts to forward CQL requests to other nodes.
|
||||
One request may be forwarded multiple times, particularly when a write is
|
||||
handled by a non-replica node.
|
||||
* - scylla_cql_write_consistency_levels_disallowed_violations
|
||||
- Counts the number of write_consistency_levels_disallowed guardrail violations,
|
||||
i.e. attempts to write with a forbidden consistency level.
|
||||
* - scylla_cql_write_consistency_levels_warned_violations
|
||||
- Counts the number of write_consistency_levels_warned guardrail violations,
|
||||
i.e. attempts to write with a discouraged consistency level.
|
||||
* - scylla_cql_writes_per_consistency_level
|
||||
- Counts the number of writes for each consistency level.
|
||||
* - scylla_io_queue_integrated_disk_queue_length
|
||||
- Length of the integrated disk queue.
|
||||
* - scylla_io_queue_integrated_queue_length
|
||||
- Length of the integrated queue.
|
||||
* - scylla_logstor_sm_bytes_freed
|
||||
- Counts the number of data bytes freed.
|
||||
* - scylla_logstor_sm_bytes_read
|
||||
- Counts the number of bytes read from the disk.
|
||||
* - scylla_logstor_sm_bytes_written
|
||||
- Counts the number of bytes written to the disk.
|
||||
* - scylla_logstor_sm_compaction_bytes_written
|
||||
- Counts the number of bytes written to the disk by compaction.
|
||||
* - scylla_logstor_sm_compaction_data_bytes_written
|
||||
- Counts the number of data bytes written to the disk by compaction.
|
||||
* - scylla_logstor_sm_compaction_records_rewritten
|
||||
- Counts the number of records rewritten during compaction.
|
||||
* - scylla_logstor_sm_compaction_records_skipped
|
||||
- Counts the number of records skipped during compaction.
|
||||
* - scylla_logstor_sm_compaction_segments_freed
|
||||
- Counts the number of segments freed by compaction.
|
||||
* - scylla_logstor_sm_disk_usage
|
||||
- Total disk usage.
|
||||
* - scylla_logstor_sm_free_segments
|
||||
- Counts the number of free segments currently available.
|
||||
* - scylla_logstor_sm_segment_pool_compaction_segments_get
|
||||
- Counts the number of segments taken from the segment pool for compaction.
|
||||
* - scylla_logstor_sm_segment_pool_normal_segments_get
|
||||
- Counts the number of segments taken from the segment pool for normal writes.
|
||||
* - scylla_logstor_sm_segment_pool_normal_segments_wait
|
||||
- Counts the number of times normal writes had to wait for a segment to become
|
||||
available in the segment pool.
|
||||
* - scylla_logstor_sm_segment_pool_segments_put
|
||||
- Counts the number of segments returned to the segment pool.
|
||||
* - scylla_logstor_sm_segment_pool_separator_segments_get
|
||||
- Counts the number of segments taken from the segment pool for separator writes.
|
||||
* - scylla_logstor_sm_segment_pool_size
|
||||
- Counts the number of segments in the segment pool.
|
||||
* - scylla_logstor_sm_segments_allocated
|
||||
- Counts the number of segments allocated.
|
||||
* - scylla_logstor_sm_segments_compacted
|
||||
- Counts the number of segments compacted.
|
||||
* - scylla_logstor_sm_segments_freed
|
||||
- Counts the number of segments freed.
|
||||
* - scylla_logstor_sm_segments_in_use
|
||||
- Counts the number of segments currently in use.
|
||||
* - scylla_logstor_sm_separator_buffer_flushed
|
||||
- Counts the number of times the separator buffer has been flushed.
|
||||
* - scylla_logstor_sm_separator_bytes_written
|
||||
- Counts the number of bytes written to the separator.
|
||||
* - scylla_logstor_sm_separator_data_bytes_written
|
||||
- Counts the number of data bytes written to the separator.
|
||||
* - scylla_logstor_sm_separator_flow_control_delay
|
||||
- Current delay applied to writes to control separator debt in microseconds.
|
||||
* - scylla_logstor_sm_separator_segments_freed
|
||||
- Counts the number of segments freed by the separator.
|
||||
* - scylla_transport_cql_pending_response_memory
|
||||
- Holds the total memory in bytes consumed by responses waiting to be sent.
|
||||
* - scylla_transport_cql_request_histogram_bytes
|
||||
- A histogram of received bytes in CQL messages of a specific kind and
|
||||
specific scheduling group.
|
||||
* - scylla_transport_cql_requests_serving
|
||||
- Holds the number of requests that are being processed right now.
|
||||
* - scylla_transport_cql_response_histogram_bytes
|
||||
- A histogram of sent bytes in CQL messages of a specific kind and
|
||||
specific scheduling group.
|
||||
* - scylla_transport_requests_forwarded_failed
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
but failed to execute there.
|
||||
* - scylla_transport_requests_forwarded_prepared_not_found
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
but failed there because the statement was not prepared on the target.
|
||||
When this happens, the coordinator performs an additional remote call
|
||||
to prepare the statement on the replica and retries the EXECUTE request
|
||||
afterwards.
|
||||
* - scylla_transport_requests_forwarded_redirected
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
but that replica responded with a redirect to another node. This can
|
||||
happen when the replica has stale information about the cluster topology or
|
||||
when the request is handled by a node that is not a replica for the data
|
||||
being accessed by the request.
|
||||
* - scylla_transport_requests_forwarded_successfully
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
and executed successfully there.
|
||||
|
||||
@@ -289,8 +289,8 @@ private:
|
||||
|
||||
sstring _host;
|
||||
host_options& _options;
|
||||
std::optional<output_stream<char>> _output;
|
||||
std::optional<input_stream<char>> _input;
|
||||
output_stream<char> _output;
|
||||
input_stream<char> _input;
|
||||
seastar::connected_socket _socket;
|
||||
std::optional<temporary_buffer<char>> _in_buffer;
|
||||
std::optional<future<>> _pending;
|
||||
@@ -347,8 +347,8 @@ future<> kmip_host::impl::connection::connect() {
|
||||
// #998 Set keepalive to try avoiding connection going stale in between commands.
|
||||
s.set_keepalive_parameters(net::tcp_keepalive_params{60s, 60s, 10});
|
||||
s.set_keepalive(true);
|
||||
_input.emplace(s.input());
|
||||
_output.emplace(s.output());
|
||||
_input = s.input();
|
||||
_output = s.output();
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -367,9 +367,9 @@ int kmip_host::impl::connection::send(void* data, unsigned int len, unsigned int
|
||||
}
|
||||
kmip_log.trace("{}: Sending {} bytes", *this, len);
|
||||
|
||||
auto f = _output->write(reinterpret_cast<char *>(data), len).then([this] {
|
||||
auto f = _output.write(reinterpret_cast<char *>(data), len).then([this] {
|
||||
kmip_log.trace("{}: send done. flushing...", *this);
|
||||
return _output->flush();
|
||||
return _output.flush();
|
||||
});
|
||||
// if the call failed already, we still want to
|
||||
// drop back to "wait_for_io()", because we cannot throw
|
||||
@@ -405,7 +405,7 @@ int kmip_host::impl::connection::recv(void* data, unsigned int len, unsigned int
|
||||
}
|
||||
|
||||
kmip_log.trace("{}: issue read", *this);
|
||||
auto f = _input->read().then([this](temporary_buffer<char> buf) {
|
||||
auto f = _input.read().then([this](temporary_buffer<char> buf) {
|
||||
kmip_log.trace("{}: got {} bytes", *this, buf.size());
|
||||
_in_buffer = std::move(buf);
|
||||
});
|
||||
@@ -462,8 +462,8 @@ void kmip_host::impl::connection::attach(KMIP_CMD* cmd) {
|
||||
}
|
||||
|
||||
future<> kmip_host::impl::connection::close() {
|
||||
return _output->close().finally([this] {
|
||||
return _input->close();
|
||||
return _output.close().finally([this] {
|
||||
return _input.close();
|
||||
});
|
||||
}
|
||||
|
||||
@@ -598,7 +598,7 @@ future<int> kmip_host::impl::do_cmd(KMIP_CMD* cmd, con_ptr cp, Func& f, bool ret
|
||||
|
||||
template<typename Func>
|
||||
future<kmip_host::impl::kmip_cmd> kmip_host::impl::do_cmd(kmip_cmd cmd_in, Func && f) {
|
||||
kmip_log.trace("{}: begin do_cmd {}", *this, cmd_in);
|
||||
kmip_log.trace("{}: begin do_cmd", *this, cmd_in);
|
||||
KMIP_CMD* cmd = cmd_in;
|
||||
|
||||
// #998 Need to do retry loop, because we can have either timed out connection,
|
||||
|
||||
@@ -616,7 +616,7 @@ future<rjson::value> encryption::kms_host::impl::do_post(std::string_view target
|
||||
static auto get_xml_node = [](node_type* node, const char* what) {
|
||||
auto res = node->first_node(what);
|
||||
if (!res) {
|
||||
throw malformed_response_error(fmt::format("XML parse error: {}", what));
|
||||
throw malformed_response_error(fmt::format("XML parse error", what));
|
||||
}
|
||||
return res;
|
||||
};
|
||||
|
||||
@@ -109,7 +109,6 @@ std::set<std::string_view> feature_service::supported_feature_set() const {
|
||||
"UUID_SSTABLE_IDENTIFIERS"sv,
|
||||
"GROUP0_SCHEMA_VERSIONING"sv,
|
||||
"VIEW_BUILD_STATUS_ON_GROUP0"sv,
|
||||
"CDC_GENERATIONS_V2"sv,
|
||||
};
|
||||
|
||||
if (is_test_only_feature_deprecated()) {
|
||||
|
||||
@@ -83,6 +83,7 @@ public:
|
||||
gms::feature alternator_ttl { *this, "ALTERNATOR_TTL"sv };
|
||||
gms::feature cql_row_ttl { *this, "CQL_ROW_TTL"sv };
|
||||
gms::feature range_scan_data_variant { *this, "RANGE_SCAN_DATA_VARIANT"sv };
|
||||
gms::feature cdc_generations_v2 { *this, "CDC_GENERATIONS_V2"sv };
|
||||
gms::feature user_defined_aggregates { *this, "UDA"sv };
|
||||
// Historically max_result_size contained only two fields: soft_limit and
|
||||
// hard_limit. It was somehow obscure because for normal paged queries both
|
||||
@@ -182,7 +183,6 @@ public:
|
||||
gms::feature arbitrary_tablet_boundaries { *this, "ARBITRARY_TABLET_BOUNDARIES"sv };
|
||||
gms::feature large_data_virtual_tables { *this, "LARGE_DATA_VIRTUAL_TABLES"sv };
|
||||
gms::feature keyspace_multi_rf_change { *this, "KEYSPACE_MULTI_RF_CHANGE"sv };
|
||||
gms::feature view_building_tasks_min_task_id { *this, "VIEW_BUILDING_TASKS_MIN_TASK_ID"sv };
|
||||
public:
|
||||
|
||||
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
|
||||
|
||||
@@ -399,10 +399,9 @@ future<> gossiper::do_send_ack2_msg(locator::host_id from, utils::chunked_vector
|
||||
}
|
||||
}
|
||||
gms::gossip_digest_ack2 ack2_msg(std::move(delta_ep_state_map));
|
||||
auto ack2_msg_str = fmt::format("{}", ack2_msg);
|
||||
logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
|
||||
logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
|
||||
co_await ser::gossip_rpc_verbs::send_gossip_digest_ack2(&_messaging, from, std::move(ack2_msg));
|
||||
logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
|
||||
logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
|
||||
}
|
||||
|
||||
// Depends on
|
||||
@@ -965,7 +964,8 @@ future<> gossiper::failure_detector_loop_for_node(locator::host_id host_id, gene
|
||||
diff = now - last;
|
||||
if (!failed) {
|
||||
last = now;
|
||||
} else if (diff > max_duration) {
|
||||
}
|
||||
if (diff > max_duration) {
|
||||
logger.info("failure_detector_loop: Mark node {}/{} as DOWN", host_id, node);
|
||||
co_await container().invoke_on(0, [host_id] (gms::gossiper& g) {
|
||||
return g.convict(host_id);
|
||||
|
||||
@@ -53,7 +53,6 @@ set(idl_headers
|
||||
group0.idl.hh
|
||||
hinted_handoff.idl.hh
|
||||
sstables.idl.hh
|
||||
sstables_loader.idl.hh
|
||||
storage_proxy.idl.hh
|
||||
storage_service.idl.hh
|
||||
strong_consistency/state_machine.idl.hh
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
// Return type of the restore_tablet verb declared below; it declares no fields.
class restore_result {
|
||||
};
|
||||
|
||||
verb [[]] restore_tablet (raft::server_id dst_id, locator::global_tablet_id gid) -> restore_result;
|
||||
@@ -72,7 +72,6 @@ struct raft_topology_cmd_result {
|
||||
success
|
||||
};
|
||||
service::raft_topology_cmd_result::command_status status;
|
||||
sstring error_message [[version 2026.2]];
|
||||
};
|
||||
|
||||
struct raft_snapshot {
|
||||
|
||||
@@ -5,8 +5,6 @@ target_sources(index
|
||||
PRIVATE
|
||||
secondary_index.cc
|
||||
secondary_index_manager.cc
|
||||
fulltext_index.cc
|
||||
index_option_utils.cc
|
||||
vector_index.cc)
|
||||
target_include_directories(index
|
||||
PUBLIC
|
||||
|
||||
@@ -1,96 +0,0 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#include "cql3/statements/index_target.hh"
|
||||
#include "cql3/util.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "schema/schema.hh"
|
||||
#include "index/fulltext_index.hh"
|
||||
#include "index/index_option_utils.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "utils/UUID_gen.hh"
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
namespace secondary_index {
|
||||
|
||||
// Supported text analyzers for fulltext indexing.
|
||||
// This list corresponds to analyzers expected to be provided
|
||||
// by the backend search engine (Tantivy).
|
||||
static const std::vector<sstring> analyzer_values = {
|
||||
"standard", "english", "german", "french", "spanish", "italian", "portuguese", "russian", "chinese", "japanese", "korean", "simple", "whitespace"};
|
||||
|
||||
// Maps each option name accepted by a fulltext index to its validator.
// check_index_options() looks an option up here and calls the validator with
// (index type name, option name, option value); the argument bound up front
// via std::bind_front is the list of values that option may take.
const static std::unordered_map<sstring, std::function<void(std::string_view, const sstring&, const sstring&)>> fulltext_index_options = {
|
||||
// 'analyzer' specifies the built-in text analyzer to use for tokenization.
|
||||
{"analyzer", std::bind_front(util::validate_enumerated_option, analyzer_values)},
|
||||
// 'positions' controls whether token positions are stored in the index.
|
||||
// Required for phrase queries. Set to false to save space.
|
||||
{"positions", std::bind_front(util::validate_enumerated_option, util::boolean_values)},
|
||||
};
|
||||
|
||||
// Always reports that no view should exist for a fulltext index.
bool fulltext_index::view_should_exist() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Produces the description of this index.
// Reads the target option from the index metadata, extracts the column name
// from that target string, and delegates to describe_with_target() with the
// column name passed through cql3::util::maybe_quote().
std::optional<cql3::description> fulltext_index::describe(const index_metadata& im, const schema& base_schema) const {
|
||||
// NOTE(review): .at() presumably throws if the target option is absent — confirm the options container type.
auto target = im.options().at(cql3::statements::index_target::target_option_name);
|
||||
auto target_column = cql3::statements::index_target::column_name_from_target_string(target);
|
||||
return describe_with_target(im, base_schema, cql3::util::maybe_quote(target_column));
|
||||
}
|
||||
|
||||
// Validates the index target list against the base table schema.
// Throws exceptions::invalid_request_exception unless all of the following hold:
//   - there is exactly one target,
//   - that target is an index_target::single_column,
//   - the column is defined in `schema`,
//   - the column's type kind is utf8 or ascii.
void fulltext_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {
|
||||
using cql3::statements::index_target;
|
||||
|
||||
if (targets.size() != 1) {
|
||||
throw exceptions::invalid_request_exception("Fulltext index must have exactly one target column");
|
||||
}
|
||||
|
||||
auto& target = targets[0];
|
||||
if (!std::holds_alternative<index_target::single_column>(target->value)) {
|
||||
throw exceptions::invalid_request_exception("Fulltext index target must be a single column");
|
||||
}
|
||||
|
||||
auto& column = std::get<index_target::single_column>(target->value);
|
||||
// Printable column name, used only in the error messages below.
auto c_name = column->to_string();
|
||||
auto const* c_def = schema.get_column_definition(column->name());
|
||||
if (c_def == nullptr) {
|
||||
throw exceptions::invalid_request_exception(format("Column {} not found in schema", c_name));
|
||||
}
|
||||
|
||||
// Only the utf8 and ascii type kinds are accepted as fulltext targets.
auto kind = c_def->type->get_kind();
|
||||
if (kind != abstract_type::kind::utf8 && kind != abstract_type::kind::ascii) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
format("Fulltext index is only supported on text, varchar, or ascii columns, but column {} has an incompatible type", c_name));
|
||||
}
|
||||
}
|
||||
|
||||
// Validates the raw index options supplied for a fulltext index.
// Every option name must be a key of fulltext_index_options, otherwise
// exceptions::invalid_request_exception is thrown. The option's value is then
// checked by the validator registered for that name.
// NOTE(review): `auto option` copies each entry of get_raw_options(); a const
// reference would presumably avoid the copy — confirm the element type first.
void fulltext_index::check_index_options(const cql3::statements::index_specific_prop_defs& properties) const {
|
||||
for (auto option : properties.get_raw_options()) {
|
||||
auto it = fulltext_index_options.find(option.first);
|
||||
if (it == fulltext_index_options.end()) {
|
||||
throw exceptions::invalid_request_exception(format("Unsupported option {} for fulltext index", option.first));
|
||||
}
|
||||
// Validator arguments: index type name, option name, option value.
it->second(index_type_name(), option.first, option.second);
|
||||
}
|
||||
}
|
||||
|
||||
// Validates a fulltext index definition: first the target column
// (check_target), then the index options (check_index_options).
// The feature_service and database parameters are unnamed and not used here.
void fulltext_index::validate(const schema& schema, const cql3::statements::index_specific_prop_defs& properties,
|
||||
const std::vector<::shared_ptr<cql3::statements::index_target>>& targets, const gms::feature_service&, const data_dictionary::database&) const {
|
||||
check_target(schema, targets);
|
||||
check_index_options(properties);
|
||||
}
|
||||
|
||||
// Returns the UUID produced by utils::UUID_gen::get_time_UUID().
// The `schema` argument is not consulted.
utils::UUID fulltext_index::index_version(const schema& schema) {
|
||||
return utils::UUID_gen::get_time_UUID();
|
||||
}
|
||||
|
||||
// Creates a new fulltext_index and returns it as an owning pointer to its
// custom_index base class.
std::unique_ptr<secondary_index::custom_index> fulltext_index_factory() {
|
||||
return std::make_unique<fulltext_index>();
|
||||
}
|
||||
|
||||
} // namespace secondary_index
|
||||
@@ -1,43 +0,0 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "schema/schema.hh"
|
||||
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
#include "cql3/statements/index_target.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace secondary_index {
|
||||
|
||||
// custom_index implementation whose index type name is "fulltext".
// Declares the custom_index overrides (describe, view_should_exist, validate,
// index_version) plus two private helpers that validate the target column and
// the index options.
class fulltext_index : public custom_index {
|
||||
public:
|
||||
// Name identifying this index type.
std::string_view index_type_name() const override {
|
||||
return "fulltext";
|
||||
}
|
||||
|
||||
fulltext_index() = default;
|
||||
~fulltext_index() override = default;
|
||||
std::optional<cql3::description> describe(const index_metadata& im, const schema& base_schema) const override;
|
||||
bool view_should_exist() const override;
|
||||
void validate(const schema& schema, const cql3::statements::index_specific_prop_defs& properties,
|
||||
const std::vector<::shared_ptr<cql3::statements::index_target>>& targets, const gms::feature_service& fs,
|
||||
const data_dictionary::database& db) const override;
|
||||
utils::UUID index_version(const schema& schema) override;
|
||||
|
||||
private:
|
||||
// Validation helpers for the index target list and the index options.
void check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const;
|
||||
void check_index_options(const cql3::statements::index_specific_prop_defs& properties) const;
|
||||
};
|
||||
|
||||
std::unique_ptr<secondary_index::custom_index> fulltext_index_factory();
|
||||
|
||||
} // namespace secondary_index
|
||||
@@ -1,70 +0,0 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#include "index/index_option_utils.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <fmt/ranges.h>
|
||||
#include <seastar/core/format.hh>
|
||||
|
||||
namespace secondary_index::util {
|
||||
|
||||
void validate_enumerated_option(
|
||||
const std::vector<sstring>& supported_values, std::string_view index_type_name, const sstring& value_name, const sstring& value) {
|
||||
bool is_valid = std::any_of(supported_values.begin(), supported_values.end(), [&](const std::string& v) {
|
||||
return boost::iequals(value, v);
|
||||
});
|
||||
|
||||
if (!is_valid) {
|
||||
throw exceptions::invalid_request_exception(seastar::format("Invalid value in option '{}' for {} index: '{}'."
|
||||
" Supported are case-insensitive: {}",
|
||||
value_name, index_type_name, value, fmt::join(supported_values, ", ")));
|
||||
}
|
||||
}
|
||||
|
||||
void validate_positive_option(int max, std::string_view index_type_name, const sstring& value_name, const sstring& value) {
|
||||
int num_value;
|
||||
size_t len;
|
||||
try {
|
||||
num_value = std::stoi(value, &len);
|
||||
} catch (...) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for {} index: '{}' is not an integer", value_name, index_type_name, value));
|
||||
}
|
||||
if (len != value.size()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for {} index: '{}' is not an integer", value_name, index_type_name, value));
|
||||
}
|
||||
|
||||
if (num_value <= 0 || num_value > max) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for {} index: '{}' is out of valid range [1 - {}]", value_name, index_type_name, value, max));
|
||||
}
|
||||
}
|
||||
|
||||
void validate_factor_option(float min, float max, std::string_view index_type_name, const sstring& value_name, const sstring& value) {
|
||||
float num_value;
|
||||
size_t len;
|
||||
try {
|
||||
num_value = std::stof(value, &len);
|
||||
} catch (...) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for {} index: '{}' is not a float", value_name, index_type_name, value));
|
||||
}
|
||||
if (len != value.size()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for {} index: '{}' is not a float", value_name, index_type_name, value));
|
||||
}
|
||||
|
||||
if (!(num_value >= min && num_value <= max)) {
|
||||
throw exceptions::invalid_request_exception(seastar::format(
|
||||
"Invalid value in option '{}' for {} index: '{}' is out of valid range [{} - {}]", value_name, index_type_name, value, min, max));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace secondary_index::util
|
||||
@@ -1,26 +0,0 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <seastar/core/sstring.hh>
|
||||
|
||||
namespace secondary_index::util {
|
||||
|
||||
inline const std::vector<seastar::sstring> boolean_values = {"false", "true"};
|
||||
|
||||
void validate_enumerated_option(const std::vector<seastar::sstring>& supported_values, std::string_view index_type_name, const seastar::sstring& value_name,
|
||||
const seastar::sstring& value);
|
||||
|
||||
void validate_positive_option(int max, std::string_view index_type_name, const seastar::sstring& value_name, const seastar::sstring& value);
|
||||
|
||||
void validate_factor_option(float min, float max, std::string_view index_type_name, const seastar::sstring& value_name, const seastar::sstring& value);
|
||||
|
||||
} // namespace secondary_index::util
|
||||
@@ -9,21 +9,17 @@
|
||||
*/
|
||||
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <optional>
|
||||
#include <ranges>
|
||||
#include <seastar/core/shared_ptr.hh>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
#include "index/fulltext_index.hh"
|
||||
#include "index/vector_index.hh"
|
||||
|
||||
#include "cql3/expr/expression.hh"
|
||||
#include "cql3/util.hh"
|
||||
#include "index/target_parser.hh"
|
||||
#include "schema/schema.hh"
|
||||
#include "utils/histogram_metrics_helper.hh"
|
||||
@@ -215,7 +211,6 @@ std::optional<std::function<std::unique_ptr<custom_index>()>> secondary_index_ma
|
||||
std::transform(lower_class_name.begin(), lower_class_name.end(), lower_class_name.begin(), ::tolower);
|
||||
|
||||
const static std::unordered_map<std::string_view, std::function<std::unique_ptr<custom_index>()>> classes = {
|
||||
{"fulltext_index", fulltext_index_factory},
|
||||
{"vector_index", vector_index_factory},
|
||||
};
|
||||
|
||||
@@ -238,49 +233,6 @@ std::optional<std::unique_ptr<custom_index>> secondary_index_manager::get_custom
|
||||
return (*custom_class_factory)();
|
||||
}
|
||||
|
||||
std::optional<cql3::description> custom_index::describe_with_target(
|
||||
const index_metadata& im,
|
||||
const schema& base_schema,
|
||||
const sstring& target_cql) const {
|
||||
static const std::unordered_set<sstring> system_options = {
|
||||
cql3::statements::index_target::target_option_name,
|
||||
db::index::secondary_index::custom_class_option_name,
|
||||
db::index::secondary_index::index_version_option_name,
|
||||
};
|
||||
|
||||
fragmented_ostringstream os;
|
||||
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON "
|
||||
<< cql3::util::maybe_quote(base_schema.ks_name()) << "."
|
||||
<< cql3::util::maybe_quote(base_schema.cf_name()) << "(" << target_cql << ")"
|
||||
<< " USING '" << index_type_name() << "_index'";
|
||||
|
||||
std::map<sstring, sstring> user_options;
|
||||
for (const auto& [key, value] : im.options()) {
|
||||
if (!system_options.contains(key)) {
|
||||
user_options.emplace(key, value);
|
||||
}
|
||||
}
|
||||
if (!user_options.empty()) {
|
||||
os << " WITH OPTIONS = {";
|
||||
bool first = true;
|
||||
for (const auto& [key, value] : user_options) {
|
||||
if (!first) {
|
||||
os << ", ";
|
||||
}
|
||||
os << "'" << key << "': '" << value << "'";
|
||||
first = false;
|
||||
}
|
||||
os << "}";
|
||||
}
|
||||
|
||||
return cql3::description{
|
||||
.keyspace = base_schema.ks_name(),
|
||||
.type = "index",
|
||||
.name = im.name(),
|
||||
.create_statement = std::move(os).to_managed_string(),
|
||||
};
|
||||
}
|
||||
|
||||
stats::stats(const sstring& ks_name, const sstring& index_name) {
|
||||
metrics.add_group("index",
|
||||
{seastar::metrics::make_histogram("query_latencies", seastar::metrics::description("Index query latencies"), {idx(index_name), ks(ks_name)},
|
||||
|
||||
@@ -100,7 +100,6 @@ public:
|
||||
class custom_index {
|
||||
public:
|
||||
virtual ~custom_index() = default;
|
||||
virtual std::string_view index_type_name() const = 0;
|
||||
/// Returns a custom description of the index, or std::nullopt if the default index description logic should be used instead.
|
||||
virtual std::optional<cql3::description> describe(const index_metadata& im, const schema& base_schema) const = 0;
|
||||
virtual bool view_should_exist() const = 0;
|
||||
@@ -108,12 +107,6 @@ public:
|
||||
const std::vector<::shared_ptr<cql3::statements::index_target>> &targets, const gms::feature_service& fs,
|
||||
const data_dictionary::database& db) const = 0;
|
||||
virtual utils::UUID index_version(const schema& schema) = 0;
|
||||
|
||||
protected:
|
||||
std::optional<cql3::description> describe_with_target(
|
||||
const index_metadata& im,
|
||||
const schema& base_schema,
|
||||
const sstring& target_cql) const;
|
||||
};
|
||||
|
||||
struct stats {
|
||||
|
||||
@@ -14,19 +14,66 @@
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "schema/schema.hh"
|
||||
#include "index/vector_index.hh"
|
||||
#include "index/index_option_utils.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "index/target_parser.hh"
|
||||
#include "types/concrete_types.hh"
|
||||
#include "utils/UUID_gen.hh"
|
||||
#include "types/types.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
#include <ranges>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
namespace secondary_index {
|
||||
|
||||
static void validate_positive_option(int max, const sstring& value_name, const sstring& value) {
|
||||
int num_value;
|
||||
size_t len;
|
||||
try {
|
||||
num_value = std::stoi(value, &len);
|
||||
} catch (...) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is not an integer", value_name, value));
|
||||
}
|
||||
if (len != value.size()) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is not an integer", value_name, value));
|
||||
}
|
||||
|
||||
if (num_value <= 0 || num_value > max) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is out of valid range [1 - {}]", value_name, value, max));
|
||||
}
|
||||
}
|
||||
|
||||
static void validate_factor_option(float min, float max, const sstring& value_name, const sstring& value) {
|
||||
float num_value;
|
||||
size_t len;
|
||||
try {
|
||||
num_value = std::stof(value, &len);
|
||||
} catch (...) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is not a float", value_name, value));
|
||||
}
|
||||
if (len != value.size()) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is not a float", value_name, value));
|
||||
}
|
||||
|
||||
if (!(num_value >= min && num_value <= max)) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is out of valid range [{} - {}]", value_name, value, min, max));
|
||||
}
|
||||
}
|
||||
|
||||
static void validate_enumerated_option(const std::vector<sstring>& supported_values, const sstring& value_name, const sstring& value) {
|
||||
bool is_valid = std::any_of(supported_values.begin(), supported_values.end(),
|
||||
[&](const std::string& func) { return boost::iequals(value, func); });
|
||||
|
||||
if (!is_valid) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for vector index: '{}'. Supported are case-insensitive: {}",
|
||||
value_name,
|
||||
value,
|
||||
fmt::join(supported_values, ", ")));
|
||||
}
|
||||
}
|
||||
|
||||
// Values accepted for the `similarity_function` option. The order is kept as
// is because the list is joined verbatim into the "Supported are ..." error.
static const std::vector<sstring> similarity_function_values = {
    "cosine",
    "euclidean",
    "dot_product",
};
|
||||
@@ -35,29 +82,33 @@ static const std::vector<sstring> quantization_values = {
|
||||
"f32", "f16", "bf16", "i8", "b1"
|
||||
};
|
||||
|
||||
const static std::unordered_map<sstring, std::function<void(std::string_view, const sstring&, const sstring&)>> vector_index_options = {
|
||||
static const std::vector<sstring> boolean_values = {
|
||||
"false", "true"
|
||||
};
|
||||
|
||||
const static std::unordered_map<sstring, std::function<void(const sstring&, const sstring&)>> vector_index_options = {
|
||||
// `similarity_function` defines method of calculating similarity between vectors
|
||||
// Used internally by vector store during both indexing and querying
|
||||
// CQL implements corresponding functions in cql3/functions/similarity_functions.hh
|
||||
{"similarity_function", std::bind_front(util::validate_enumerated_option, similarity_function_values)},
|
||||
{"similarity_function", std::bind_front(validate_enumerated_option, similarity_function_values)},
|
||||
// 'maximum_node_connections', 'construction_beam_width', 'search_beam_width' define HNSW index parameters
|
||||
// Used internally by vector store.
|
||||
{"maximum_node_connections", std::bind_front(util::validate_positive_option, 512)},
|
||||
{"construction_beam_width", std::bind_front(util::validate_positive_option, 4096)},
|
||||
{"search_beam_width", std::bind_front(util::validate_positive_option, 4096)},
|
||||
{"maximum_node_connections", std::bind_front(validate_positive_option, 512)},
|
||||
{"construction_beam_width", std::bind_front(validate_positive_option, 4096)},
|
||||
{"search_beam_width", std::bind_front(validate_positive_option, 4096)},
|
||||
// 'quantization' enables compression of vectors in vector store (not in base table!)
|
||||
// Used internally by vector store. Scylla only checks it to enable rescoring.
|
||||
{"quantization", std::bind_front(util::validate_enumerated_option, quantization_values)},
|
||||
{"quantization", std::bind_front(validate_enumerated_option, quantization_values)},
|
||||
// 'oversampling' defines factor by which number of candidates retrieved from vector store is multiplied.
|
||||
// It can improve accuracy of ANN queries, especially for quantized vectors when combined with rescoring.
|
||||
// Used by Scylla during query processing to increase query limit sent to vector store.
|
||||
{"oversampling", std::bind_front(util::validate_factor_option, 1.0f, 100.0f)},
|
||||
{"oversampling", std::bind_front(validate_factor_option, 1.0f, 100.0f)},
|
||||
// 'rescoring' enables recalculating of similarity scores of candidates retrieved from vector store when quantization is used.
|
||||
{"rescoring", std::bind_front(util::validate_enumerated_option, util::boolean_values)},
|
||||
{"rescoring", std::bind_front(validate_enumerated_option, boolean_values)},
|
||||
// 'source_model' is a Cassandra SAI option specifying the embedding model name.
|
||||
// Used by Cassandra libraries (e.g., CassIO) to tag indexes with the model that produced the vectors.
|
||||
// Accepted for compatibility but not used by ScyllaDB.
|
||||
{"source_model", [](std::string_view, const sstring&, const sstring&) { /* accepted for Cassandra compatibility */ }},
|
||||
{"source_model", [](const sstring&, const sstring&) { /* accepted for Cassandra compatibility */ }},
|
||||
};
|
||||
|
||||
static constexpr auto TC_TARGET_KEY = "tc";
|
||||
@@ -204,8 +255,43 @@ bool vector_index::view_should_exist() const {
|
||||
}
|
||||
|
||||
std::optional<cql3::description> vector_index::describe(const index_metadata& im, const schema& base_schema) const {
|
||||
return describe_with_target(im, base_schema,
|
||||
targets_to_cql(im.options().at(cql3::statements::index_target::target_option_name)));
|
||||
static const std::unordered_set<sstring> system_options = {
|
||||
cql3::statements::index_target::target_option_name,
|
||||
db::index::secondary_index::custom_class_option_name,
|
||||
db::index::secondary_index::index_version_option_name,
|
||||
};
|
||||
|
||||
fragmented_ostringstream os;
|
||||
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " << cql3::util::maybe_quote(base_schema.ks_name()) << "."
|
||||
<< cql3::util::maybe_quote(base_schema.cf_name()) << "(" << targets_to_cql(im.options().at(cql3::statements::index_target::target_option_name)) << ")"
|
||||
<< " USING 'vector_index'";
|
||||
|
||||
// Collect user-provided options (excluding system keys like target, class_name, index_version).
|
||||
std::map<sstring, sstring> user_options;
|
||||
for (const auto& [key, value] : im.options()) {
|
||||
if (!system_options.contains(key)) {
|
||||
user_options.emplace(key, value);
|
||||
}
|
||||
}
|
||||
if (!user_options.empty()) {
|
||||
os << " WITH OPTIONS = {";
|
||||
bool first = true;
|
||||
for (const auto& [key, value] : user_options) {
|
||||
if (!first) {
|
||||
os << ", ";
|
||||
}
|
||||
os << "'" << key << "': '" << value << "'";
|
||||
first = false;
|
||||
}
|
||||
os << "}";
|
||||
}
|
||||
|
||||
return cql3::description{
|
||||
.keyspace = base_schema.ks_name(),
|
||||
.type = "index",
|
||||
.name = im.name(),
|
||||
.create_statement = std::move(os).to_managed_string(),
|
||||
};
|
||||
}
|
||||
|
||||
void vector_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {
|
||||
@@ -343,7 +429,7 @@ void vector_index::check_index_options(const cql3::statements::index_specific_pr
|
||||
if (it == vector_index_options.end()) {
|
||||
throw exceptions::invalid_request_exception(format("Unsupported option {} for vector index", option.first));
|
||||
}
|
||||
it->second(index_type_name(), option.first, option.second);
|
||||
it->second(option.first, option.second);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -20,8 +20,6 @@ namespace secondary_index {
|
||||
|
||||
class vector_index: public custom_index {
|
||||
public:
|
||||
std::string_view index_type_name() const override { return "vector"; }
|
||||
|
||||
// The minimal TTL for the CDC used by Vector Search.
|
||||
// Required to ensure that the data is not deleted until the vector index is fully built.
|
||||
static constexpr int VS_TTL_SECONDS = 86400; // 24 hours
|
||||
|
||||
3
init.cc
3
init.cc
@@ -87,6 +87,9 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
|
||||
}
|
||||
}
|
||||
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::ALTERNATOR_STREAMS)) {
|
||||
disabled.insert("ALTERNATOR_STREAMS"s);
|
||||
}
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::KEYSPACE_STORAGE_OPTIONS)) {
|
||||
disabled.insert("KEYSPACE_STORAGE_OPTIONS"s);
|
||||
}
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
#include <ranges>
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/serialization.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include <seastar/util/backtrace.hh>
|
||||
|
||||
enum class allow_prefixes { no, yes };
|
||||
@@ -104,12 +103,7 @@ public:
|
||||
static managed_bytes serialize_value(RangeOfSerializedComponents&& values) {
|
||||
auto size = serialized_size(values);
|
||||
if (size > std::numeric_limits<size_type>::max()) {
|
||||
// Matches Cassandra's wording so CQL-level compatibility tests
|
||||
// (and client-visible error messages) line up.
|
||||
// Issues #10366 (SELECT) and #12247 (INSERT) both require a
|
||||
// clean InvalidRequest here rather than a generic server error.
|
||||
throw exceptions::invalid_request_exception(format("Key length of {:d} is longer than maximum of {:d}",
|
||||
size, std::numeric_limits<size_type>::max()));
|
||||
throw std::runtime_error(format("Key size too large: {:d} > {:d}", size, std::numeric_limits<size_type>::max()));
|
||||
}
|
||||
managed_bytes b(managed_bytes::initialized_later(), size);
|
||||
serialize_value(values, managed_bytes_mutable_view(b));
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user