mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-13 11:22:01 +00:00
Compare commits
2 Commits
scylladb_1
...
copilot/fi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
93fbc0a683 | ||
|
|
520466b407 |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -36,6 +36,4 @@ compile_commands.json
|
||||
clang_build
|
||||
.idea/
|
||||
nuke
|
||||
rust/**/target
|
||||
rust/**/Cargo.lock
|
||||
test/resource/wasm/rust/target
|
||||
rust/target
|
||||
|
||||
@@ -681,7 +681,7 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
|
||||
case parsed::primitive_condition::type::VALUE:
|
||||
if (calculated_values.size() != 1) {
|
||||
// Shouldn't happen unless we have a bug in the parser
|
||||
throw std::logic_error(format("Unexpected values {} in primitive_condition", cond._values.size()));
|
||||
throw std::logic_error(format("Unexpected values in primitive_condition", cond._values.size()));
|
||||
}
|
||||
// Unwrap the boolean wrapped as the value (if it is a boolean)
|
||||
if (calculated_values[0].IsObject() && calculated_values[0].MemberCount() == 1) {
|
||||
|
||||
@@ -1362,33 +1362,6 @@ static int get_dimensions(const rjson::value& vector_attribute, std::string_view
|
||||
return dimensions_v->GetInt();
|
||||
}
|
||||
|
||||
// As noted in issue #5052, in Alternator the CreateTable and UpdateTable are
|
||||
// currently synchronous - they return only after the operation is complete.
|
||||
// After announce() of the new schema finished, the schema change is committed
|
||||
// and a majority of nodes know it - but it's possible that some live nodes
|
||||
// have not yet applied the new schema. If we return to the user now, and the
|
||||
// user sends a node request that relies on the new schema, it might fail.
|
||||
// So before returning, we must verify that *all* nodes have applied the new
|
||||
// schema. This is what wait_for_schema_agreement_after_ddl() does.
|
||||
//
|
||||
// Note that wait_for_schema_agreement_after_ddl() has a timeout (currently
|
||||
// hard-coded to 30 seconds). If the timeout is reached an InternalServerError
|
||||
// is returned. The user, who doesn't know if the CreateTable succeeded or not,
|
||||
// can retry the request and will get a ResourceInUseException and know the
|
||||
// table already exists. So a CreateTable that returns a ResourceInUseException
|
||||
// should also call wait_for_schema_agreement_after_ddl().
|
||||
//
|
||||
// When issue #5052 is resolved, this function can be removed - we will need
|
||||
// to check if we reached schema agreement, but not to *wait* for it.
|
||||
static future<> wait_for_schema_agreement_after_ddl(service::migration_manager& mm, const replica::database& db) {
|
||||
static constexpr auto schema_agreement_seconds = 30;
|
||||
try {
|
||||
co_await mm.wait_for_schema_agreement(db, db::timeout_clock::now() + std::chrono::seconds(schema_agreement_seconds), nullptr);
|
||||
} catch (const service::migration_manager::schema_agreement_timeout&) {
|
||||
throw api_error::internal(fmt::format("The operation was successful, but unable to confirm cluster-wide schema agreement after {} seconds. Please retry the operation, and wait for the retry to report an error since the operation was already done.", schema_agreement_seconds));
|
||||
}
|
||||
}
|
||||
|
||||
future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization,
|
||||
const db::tablets_mode_t::mode tablets_mode, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
|
||||
throwing_assert(this_shard_id() == 0);
|
||||
@@ -1722,26 +1695,13 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
|
||||
}
|
||||
}
|
||||
}
|
||||
bool table_already_exists = false;
|
||||
try {
|
||||
schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
|
||||
} catch (exceptions::already_exists_exception&) {
|
||||
if (_proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
|
||||
table_already_exists = true;
|
||||
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
|
||||
}
|
||||
}
|
||||
if (table_already_exists) {
|
||||
// The user may have retried a CreateTable operation after it timed
|
||||
// out in wait_for_schema_agreement_after_ddl(). So before we may
|
||||
// return ResourceInUseException (which can lead the user to start
|
||||
// using the table which it now knows exists), we need to wait for
|
||||
// schema agreement, just like the original CreateTable did. Again
|
||||
// we fail with InternalServerError if schema agreement still cannot
|
||||
// be reached. We can release group0_guard before waiting.
|
||||
release_guard(std::move(group0_guard));
|
||||
co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
|
||||
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
|
||||
}
|
||||
if (_proxy.data_dictionary().try_find_table(schema->id())) {
|
||||
// This should never happen, the ID is supposed to be unique
|
||||
co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
|
||||
@@ -1790,7 +1750,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
|
||||
}
|
||||
}
|
||||
|
||||
co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
|
||||
co_await _mm.wait_for_schema_agreement(_proxy.local_db(), db::timeout_clock::now() + 10s, nullptr);
|
||||
rjson::value status = rjson::empty_object();
|
||||
executor::supplement_table_info(request, *schema, _proxy);
|
||||
rjson::add(status, "TableDescription", std::move(request));
|
||||
@@ -1900,7 +1860,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
|
||||
if (stream_specification && stream_specification->IsObject()) {
|
||||
empty_request = false;
|
||||
if (add_stream_options(*stream_specification, builder, p.local(), tab->cdc_options())) {
|
||||
if (add_stream_options(*stream_specification, builder, p.local())) {
|
||||
validate_cdc_log_name_length(builder.cf_name());
|
||||
// On tablet tables, defer stream enablement and block
|
||||
// tablet merges (see defer_enabling_streams_block_tablet_merges).
|
||||
@@ -1915,23 +1875,6 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
if (tab->cdc_options().enabled() || tab->cdc_options().enable_requested()) {
|
||||
co_return api_error::validation("Table already has an enabled stream: TableName: " + tab->cf_name());
|
||||
}
|
||||
// When re-enabling streams on an Alternator table, drop the old
|
||||
// CDC log table first as a separate schema change, so the
|
||||
// subsequent UpdateTable creates a fresh one with a new UUID
|
||||
// (= new StreamArn). See #7239.
|
||||
auto logname = cdc::log_name(tab->cf_name());
|
||||
auto& local_db = p.local().local_db();
|
||||
if (local_db.has_schema(tab->ks_name(), logname)
|
||||
&& cdc::is_log_schema(*local_db.find_schema(tab->ks_name(), logname))) {
|
||||
auto drop_m = co_await service::prepare_column_family_drop_announcement(
|
||||
p.local(), tab->ks_name(), logname,
|
||||
group0_guard.write_timestamp());
|
||||
co_await mm.announce(std::move(drop_m), std::move(group0_guard),
|
||||
format("alternator-executor: drop old CDC log for {}", tab->cf_name()));
|
||||
co_await mm.wait_for_schema_agreement(
|
||||
p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if (!tab->cdc_options().enabled() && !tab->cdc_options().enable_requested()) {
|
||||
co_return api_error::validation("Table has no stream to disable: TableName: " + tab->cf_name());
|
||||
@@ -2246,7 +2189,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
throw;
|
||||
}
|
||||
}
|
||||
co_await wait_for_schema_agreement_after_ddl(mm, p.local().local_db());
|
||||
co_await mm.wait_for_schema_agreement(p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
|
||||
|
||||
rjson::value status = rjson::empty_object();
|
||||
supplement_table_info(request, *schema, p.local());
|
||||
|
||||
@@ -30,7 +30,6 @@
|
||||
#include "utils/updateable_value.hh"
|
||||
|
||||
#include "tracing/trace_state.hh"
|
||||
#include "cdc/cdc_options.hh"
|
||||
|
||||
|
||||
namespace db {
|
||||
@@ -200,7 +199,7 @@ private:
|
||||
tracing::trace_state_ptr trace_state, service_permit permit);
|
||||
|
||||
public:
|
||||
static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp, const cdc::options& existing_cdc_opts = {});
|
||||
static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
|
||||
static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
|
||||
static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp);
|
||||
};
|
||||
|
||||
@@ -243,10 +243,7 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
|
||||
if (!is_alternator_keyspace(ks_name)) {
|
||||
continue;
|
||||
}
|
||||
// Use get_base_table instead of is_log_for_some_table because the
|
||||
// latter requires CDC to be enabled, but we want to list streams
|
||||
// that have been disabled but whose log table still exists (#7239).
|
||||
if (cdc::get_base_table(db.real_database(), ks_name, cf_name)) {
|
||||
if (cdc::is_log_for_some_table(db.real_database(), ks_name, cf_name)) {
|
||||
rjson::value new_entry = rjson::empty_object();
|
||||
|
||||
auto arn = stream_arn{ i->schema(), cdc::get_base_table(db.real_database(), *i->schema()) };
|
||||
@@ -395,7 +392,7 @@ std::istream& operator>>(std::istream& is, stream_view_type& type) {
|
||||
return is;
|
||||
}
|
||||
|
||||
static stream_view_type cdc_options_to_stream_view_type(const cdc::options& opts) {
|
||||
static stream_view_type cdc_options_to_steam_view_type(const cdc::options& opts) {
|
||||
stream_view_type type = stream_view_type::KEYS_ONLY;
|
||||
if (opts.preimage() && opts.postimage()) {
|
||||
type = stream_view_type::NEW_AND_OLD_IMAGES;
|
||||
@@ -841,7 +838,6 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
auto& opts = bs->cdc_options();
|
||||
|
||||
auto status = "DISABLED";
|
||||
bool stream_disabled = !opts.enabled();
|
||||
|
||||
if (opts.enabled()) {
|
||||
if (!_cdc_metadata.streams_available()) {
|
||||
@@ -857,7 +853,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
|
||||
rjson::add(stream_desc, "StreamStatus", rjson::from_string(status));
|
||||
|
||||
stream_view_type type = cdc_options_to_stream_view_type(opts);
|
||||
stream_view_type type = cdc_options_to_steam_view_type(opts);
|
||||
|
||||
rjson::add(stream_desc, "StreamArn", stream_arn);
|
||||
rjson::add(stream_desc, "StreamViewType", type);
|
||||
@@ -865,9 +861,10 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
|
||||
describe_key_schema(stream_desc, *bs);
|
||||
|
||||
// For disabled streams, we still fall through to enumerate shards
|
||||
// below. All shards will have EndingSequenceNumber set, indicating
|
||||
// they are closed. See issue #7239.
|
||||
if (!opts.enabled()) {
|
||||
rjson::add(ret, "StreamDescription", std::move(stream_desc));
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
// TODO: label
|
||||
// TODO: creation time
|
||||
@@ -950,12 +947,6 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
auto expired = [&]() -> std::optional<db_clock::time_point> {
|
||||
auto j = std::next(i);
|
||||
if (j == e) {
|
||||
// For a disabled stream, all shards are closed (#7239).
|
||||
// Use "now" as the ending sequence number for the last
|
||||
// generation's shards.
|
||||
if (stream_disabled) {
|
||||
return db_clock::now();
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
// add this so we sort of match potential
|
||||
@@ -1306,7 +1297,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
|
||||
| std::ranges::to<query::column_id_vector>()
|
||||
;
|
||||
|
||||
stream_view_type type = cdc_options_to_stream_view_type(base->cdc_options());
|
||||
stream_view_type type = cdc_options_to_steam_view_type(base->cdc_options());
|
||||
|
||||
auto selection = cql3::selection::selection::for_columns(schema, std::move(columns));
|
||||
auto partition_slice = query::partition_slice(
|
||||
@@ -1490,17 +1481,17 @@ future<executor::request_return_type> executor::get_records(client_state& client
|
||||
|
||||
auto& shard = iter.shard;
|
||||
|
||||
if (!base->cdc_options().enabled()) {
|
||||
// Stream is disabled -- all shards are closed (#7239).
|
||||
// Don't return NextShardIterator.
|
||||
} else if (shard.time < ts && ts < high_ts) {
|
||||
if (shard.time < ts && ts < high_ts) {
|
||||
// The DynamoDB documentation states that when a shard is
|
||||
// closed, reading it until the end has NextShardIterator
|
||||
// "set to null". Our test test_streams_closed_read
|
||||
// confirms that by "null" they meant not set at all.
|
||||
} else {
|
||||
// Shard is still open with no records in the scanned window.
|
||||
// Return the original iterator so the client can poll again.
|
||||
// We could have return the same iterator again, but we did
|
||||
// a search from it until high_ts and found nothing, so we
|
||||
// can also start the next search from high_ts.
|
||||
// TODO: but why? It's simpler just to leave the iterator be.
|
||||
shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
|
||||
rjson::add(ret, "NextShardIterator", iter);
|
||||
}
|
||||
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
@@ -1510,13 +1501,17 @@ future<executor::request_return_type> executor::get_records(client_state& client
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp, const cdc::options& existing_cdc_opts) {
|
||||
bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
|
||||
auto stream_enabled = rjson::find(stream_specification, "StreamEnabled");
|
||||
if (!stream_enabled || !stream_enabled->IsBool()) {
|
||||
throw api_error::validation("StreamSpecification needs boolean StreamEnabled");
|
||||
}
|
||||
|
||||
if (stream_enabled->GetBool()) {
|
||||
if (!sp.features().alternator_streams) {
|
||||
throw api_error::validation("StreamSpecification: alternator streams feature not enabled in cluster.");
|
||||
}
|
||||
|
||||
cdc::options opts;
|
||||
opts.enabled(true);
|
||||
opts.tablet_merge_blocked(true);
|
||||
@@ -1542,13 +1537,8 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
|
||||
builder.with_cdc_options(opts);
|
||||
return true;
|
||||
} else {
|
||||
// When disabling, preserve the existing CDC options (preimage,
|
||||
// postimage, ttl, etc.) so that DescribeStream can still report
|
||||
// the correct StreamViewType on a disabled stream.
|
||||
cdc::options opts = existing_cdc_opts;
|
||||
cdc::options opts;
|
||||
opts.enabled(false);
|
||||
opts.enable_requested(false);
|
||||
opts.tablet_merge_blocked(false);
|
||||
builder.with_cdc_options(opts);
|
||||
return false;
|
||||
}
|
||||
@@ -1556,36 +1546,33 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
|
||||
|
||||
void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp) {
|
||||
auto& opts = schema.cdc_options();
|
||||
// Report stream info when:
|
||||
// 1. Log table exists (covers both enabled and disabled-but-readable).
|
||||
// 2. enable_requested (ENABLING state, log not yet created).
|
||||
auto db = sp.data_dictionary();
|
||||
auto log_name = cdc::log_name(schema.cf_name());
|
||||
auto log_cf = db.try_find_table(schema.ks_name(), log_name);
|
||||
if (log_cf) {
|
||||
auto log_schema = log_cf->schema();
|
||||
stream_arn arn(log_schema, cdc::get_base_table(db.real_database(), *log_schema));
|
||||
if (opts.enabled()) {
|
||||
auto db = sp.data_dictionary();
|
||||
auto cf = db.find_table(schema.ks_name(), cdc::log_name(schema.cf_name()));
|
||||
stream_arn arn(cf.schema(), cdc::get_base_table(db.real_database(), *cf.schema()));
|
||||
rjson::add(descr, "LatestStreamArn", arn);
|
||||
rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*log_schema)));
|
||||
|
||||
auto stream_desc = rjson::empty_object();
|
||||
rjson::add(stream_desc, "StreamEnabled", opts.enabled());
|
||||
|
||||
stream_view_type mode = cdc_options_to_stream_view_type(opts);
|
||||
rjson::add(stream_desc, "StreamViewType", mode);
|
||||
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
|
||||
} else if (opts.enable_requested()) {
|
||||
// DynamoDB returns StreamEnabled=true in StreamSpecification even when
|
||||
// the stream status is ENABLING (not yet fully active). We mirror this
|
||||
// behavior: enable_requested means the user asked for streams but CDC
|
||||
// is not yet finalized, so we still report StreamEnabled=true.
|
||||
auto stream_desc = rjson::empty_object();
|
||||
rjson::add(stream_desc, "StreamEnabled", true);
|
||||
|
||||
stream_view_type mode = cdc_options_to_stream_view_type(opts);
|
||||
rjson::add(stream_desc, "StreamViewType", mode);
|
||||
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
|
||||
rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*cf.schema())));
|
||||
} else if (!opts.enable_requested()) {
|
||||
return;
|
||||
}
|
||||
// For both enabled() and enable_requested():
|
||||
// DynamoDB returns StreamEnabled=true in StreamSpecification even when
|
||||
// the stream status is ENABLING (not yet fully active). We mirror this
|
||||
// behavior: enable_requested means the user asked for streams but CDC
|
||||
// is not yet finalized, so we still report StreamEnabled=true.
|
||||
auto stream_desc = rjson::empty_object();
|
||||
rjson::add(stream_desc, "StreamEnabled", true);
|
||||
|
||||
auto mode = stream_view_type::KEYS_ONLY;
|
||||
if (opts.preimage() && opts.postimage()) {
|
||||
mode = stream_view_type::NEW_AND_OLD_IMAGES;
|
||||
} else if (opts.preimage()) {
|
||||
mode = stream_view_type::OLD_IMAGE;
|
||||
} else if (opts.postimage()) {
|
||||
mode = stream_view_type::NEW_IMAGE;
|
||||
}
|
||||
rjson::add(stream_desc, "StreamViewType", mode);
|
||||
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
|
||||
}
|
||||
|
||||
} // namespace alternator
|
||||
|
||||
@@ -194,36 +194,22 @@ future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token
|
||||
std::move(audited_keyspaces),
|
||||
std::move(audited_tables),
|
||||
std::move(audited_categories),
|
||||
std::cref(cfg));
|
||||
}
|
||||
|
||||
future<> audit::start_storage(const db::config& cfg) {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
|
||||
return local_audit._storage_helper_ptr->start(cfg).then([&local_audit] {
|
||||
local_audit._storage_running = true;
|
||||
std::cref(cfg))
|
||||
.then([&cfg] {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
|
||||
return local_audit.start(cfg);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> audit::stop_storage() {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit_instance().invoke_on_all([] (audit& local_audit) {
|
||||
local_audit._storage_running = false;
|
||||
return local_audit._storage_helper_ptr->stop();
|
||||
});
|
||||
}
|
||||
|
||||
future<> audit::stop_audit() {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit::audit::audit_instance().invoke_on_all([] (auto& local_audit) {
|
||||
SCYLLA_ASSERT(!local_audit._storage_running);
|
||||
return local_audit.shutdown();
|
||||
}).then([] {
|
||||
return audit::audit::audit_instance().stop();
|
||||
@@ -237,6 +223,14 @@ audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& k
|
||||
return std::make_unique<audit_info>(cat, keyspace, table, batch);
|
||||
}
|
||||
|
||||
future<> audit::start(const db::config& cfg) {
|
||||
return _storage_helper_ptr->start(cfg);
|
||||
}
|
||||
|
||||
future<> audit::stop() {
|
||||
return _storage_helper_ptr->stop();
|
||||
}
|
||||
|
||||
future<> audit::shutdown() {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
@@ -247,12 +241,6 @@ future<> audit::log(const audit_info& audit_info, const service::client_state& c
|
||||
const sstring& username = client_state.user() ? client_state.user()->name.value_or(anonymous_username) : no_username;
|
||||
socket_address client_ip = client_state.get_client_address().addr();
|
||||
socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
|
||||
if (!_storage_running) {
|
||||
on_internal_error_noexcept(logger, fmt::format("Audit log dropped (storage not ready): node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
|
||||
node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
|
||||
audit_info.query(), client_ip, audit_info.table(), username));
|
||||
return make_ready_future<>();
|
||||
}
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
logger.debug("Log written: node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
|
||||
node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
|
||||
@@ -298,11 +286,6 @@ future<> inspect(const audit_info_alternator& ai, const service::client_state& c
|
||||
|
||||
future<> audit::log_login(const sstring& username, socket_address client_ip, bool error) noexcept {
|
||||
socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
|
||||
if (!_storage_running) {
|
||||
on_internal_error_noexcept(logger, fmt::format("Audit login log dropped (storage not ready): node_ip {} client_ip {} username {} error {}",
|
||||
node_ip, client_ip, username, error ? "true" : "false"));
|
||||
return make_ready_future<>();
|
||||
}
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
logger.debug("Login log written: node_ip {}, client_ip {}, username {}, error {}",
|
||||
node_ip, client_ip, username, error ? "true" : "false");
|
||||
|
||||
@@ -141,7 +141,6 @@ private:
|
||||
category_set _audited_categories;
|
||||
|
||||
std::unique_ptr<storage_helper> _storage_helper_ptr;
|
||||
bool _storage_running = false;
|
||||
|
||||
const db::config& _cfg;
|
||||
utils::observer<sstring> _cfg_keyspaces_observer;
|
||||
@@ -164,8 +163,6 @@ public:
|
||||
return audit_instance().local();
|
||||
}
|
||||
static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
|
||||
static future<> start_storage(const db::config& cfg);
|
||||
static future<> stop_storage();
|
||||
static future<> stop_audit();
|
||||
static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
|
||||
audit(locator::shared_token_metadata& stm,
|
||||
@@ -177,6 +174,8 @@ public:
|
||||
category_set&& audited_categories,
|
||||
const db::config& cfg);
|
||||
~audit();
|
||||
future<> start(const db::config& cfg);
|
||||
future<> stop();
|
||||
future<> shutdown();
|
||||
bool should_log(const audit_info& audit_info) const;
|
||||
bool will_log(statement_category cat, std::string_view keyspace = {}, std::string_view table = {}) const;
|
||||
|
||||
@@ -185,14 +185,24 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
|
||||
static const sstring q = format("SELECT role, name, value FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, ROLE_ATTRIBUTES_CF);
|
||||
auto rs = co_await fetch(q);
|
||||
for (const auto& r : *rs) {
|
||||
if (!r.has("value")) {
|
||||
continue;
|
||||
}
|
||||
rec->attributes[r.get_as<sstring>("name")] =
|
||||
r.get_as<sstring>("value");
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
}
|
||||
// permissions
|
||||
{
|
||||
static const sstring q = format("SELECT role, resource, permissions FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, PERMISSIONS_CF);
|
||||
auto rs = co_await fetch(q);
|
||||
for (const auto& r : *rs) {
|
||||
auto resource = r.get_as<sstring>("resource");
|
||||
auto perms_strings = r.get_set<sstring>("permissions");
|
||||
std::unordered_set<sstring> perms_set(perms_strings.begin(), perms_strings.end());
|
||||
auto pset = permissions::from_strings(perms_set);
|
||||
rec->permissions[std::move(resource)] = std::move(pset);
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
}
|
||||
co_return rec;
|
||||
}
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@ public:
|
||||
std::unordered_set<role_name_t> members;
|
||||
sstring salted_hash;
|
||||
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
|
||||
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
|
||||
private:
|
||||
friend cache;
|
||||
// cached permissions include effects of role's inheritance
|
||||
|
||||
@@ -76,11 +76,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
|
||||
if (results->empty()) {
|
||||
co_return permissions::NONE;
|
||||
}
|
||||
const auto& row = results->one();
|
||||
if (!row.has(PERMISSIONS_NAME)) {
|
||||
co_return permissions::NONE;
|
||||
}
|
||||
co_return permissions::from_strings(row.get_set<sstring>(PERMISSIONS_NAME));
|
||||
co_return permissions::from_strings(results->one().get_set<sstring>(PERMISSIONS_NAME));
|
||||
}
|
||||
|
||||
future<>
|
||||
|
||||
@@ -258,11 +258,13 @@ future<> ldap_role_manager::start() {
|
||||
} catch (const seastar::sleep_aborted&) {
|
||||
co_return; // ignore
|
||||
}
|
||||
try {
|
||||
co_await _cache.reload_all_permissions();
|
||||
} catch (...) {
|
||||
mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
|
||||
}
|
||||
co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
|
||||
try {
|
||||
co_await c.reload_all_permissions();
|
||||
} catch (...) {
|
||||
mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
return _std_mgr.start();
|
||||
|
||||
@@ -157,20 +157,6 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
|
||||
return create_legacy_keyspace_if_missing(mm);
|
||||
});
|
||||
}
|
||||
// Authorizer must be started before the permission loader is set,
|
||||
// because the loader calls _authorizer->authorize().
|
||||
// The loader must be set before starting the role manager, because
|
||||
// LDAP role manager starts a pruner fiber that calls
|
||||
// reload_all_permissions() which asserts _permission_loader is set.
|
||||
co_await _authorizer->start();
|
||||
if (!_used_by_maintenance_socket) {
|
||||
// Maintenance socket mode can't cache permissions because it has
|
||||
// different authorizer. We can't mix cached permissions, they could be
|
||||
// different in normal mode.
|
||||
_cache.set_permission_loader(std::bind(
|
||||
&service::get_uncached_permissions,
|
||||
this, std::placeholders::_1, std::placeholders::_2));
|
||||
}
|
||||
co_await _role_manager->start();
|
||||
if (this_shard_id() == 0) {
|
||||
// Role manager and password authenticator have this odd startup
|
||||
@@ -179,19 +165,21 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
|
||||
// creation therefore we need to wait here.
|
||||
co_await _role_manager->ensure_superuser_is_created();
|
||||
}
|
||||
// Authenticator must be started after ensure_superuser_is_created()
|
||||
// because password_authenticator queries system.roles for the
|
||||
// superuser entry created by the role manager.
|
||||
co_await _authenticator->start();
|
||||
co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
|
||||
if (!_used_by_maintenance_socket) {
|
||||
// Maintenance socket mode can't cache permissions because it has
|
||||
// different authorizer. We can't mix cached permissions, they could be
|
||||
// different in normal mode.
|
||||
_cache.set_permission_loader(std::bind(
|
||||
&service::get_uncached_permissions,
|
||||
this, std::placeholders::_1, std::placeholders::_2));
|
||||
}
|
||||
}
|
||||
|
||||
future<> service::stop() {
|
||||
_as.request_abort();
|
||||
// Reverse of start() order.
|
||||
co_await _authenticator->stop();
|
||||
co_await _role_manager->stop();
|
||||
_cache.set_permission_loader(nullptr);
|
||||
co_await _authorizer->stop();
|
||||
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
|
||||
}
|
||||
|
||||
future<> service::ensure_superuser_is_created() {
|
||||
|
||||
@@ -267,7 +267,7 @@ struct extract_row_visitor {
|
||||
visit_collection(v);
|
||||
},
|
||||
[&] (const abstract_type& o) {
|
||||
throw std::runtime_error(format("extract_changes: unknown collection type: {}", o.name()));
|
||||
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
|
||||
}
|
||||
));
|
||||
}
|
||||
|
||||
@@ -137,24 +137,6 @@ endfunction()
|
||||
|
||||
option(Scylla_WITH_DEBUG_INFO "Enable debug info" OFF)
|
||||
|
||||
# Time trace profiling: adds -ftime-trace to all C++ compilations (Clang only).
|
||||
# Each .o produces a companion .json file in the build directory that can be
|
||||
# analyzed with ClangBuildAnalyzer or loaded in chrome://tracing.
|
||||
#
|
||||
# Usage:
|
||||
# cmake -DScylla_TIME_TRACE=ON ...
|
||||
# ninja
|
||||
# # Analyze results (requires ClangBuildAnalyzer):
|
||||
# ClangBuildAnalyzer --all <build-dir> capture.bin
|
||||
# ClangBuildAnalyzer --analyze capture.bin
|
||||
option(Scylla_TIME_TRACE "Enable Clang -ftime-trace for build profiling" OFF)
|
||||
if(Scylla_TIME_TRACE)
|
||||
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
message(FATAL_ERROR "Scylla_TIME_TRACE requires Clang (found ${CMAKE_CXX_COMPILER_ID})")
|
||||
endif()
|
||||
add_compile_options(-ftime-trace)
|
||||
endif()
|
||||
|
||||
macro(update_build_flags config)
|
||||
cmake_parse_arguments (
|
||||
parsed_args
|
||||
|
||||
@@ -1088,7 +1088,7 @@ void compaction_manager::register_metrics() {
|
||||
sm::make_gauge("normalized_backlog", [this] { return _last_backlog / available_memory(); },
|
||||
sm::description("Holds the sum of normalized compaction backlog for all tables in the system. Backlog is normalized by dividing backlog by shard's available memory.")),
|
||||
sm::make_counter("validation_errors", [this] { return _validation_errors; },
|
||||
sm::description("Holds the number of encountered validation errors.")).set_skip_when_empty(),
|
||||
sm::description("Holds the number of encountered validation errors.")),
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
50
configure.py
50
configure.py
@@ -285,12 +285,8 @@ def generate_compdb(compdb, ninja, buildfile, modes):
|
||||
os.symlink(compdb_target, compdb)
|
||||
except FileExistsError:
|
||||
# if there is already a valid compile_commands.json link in the
|
||||
# source root, we are done. if it's a stale link, update it.
|
||||
if os.path.islink(compdb):
|
||||
current_target = os.readlink(compdb)
|
||||
if not os.path.exists(current_target):
|
||||
os.unlink(compdb)
|
||||
os.symlink(compdb_target, compdb)
|
||||
# source root, we are done.
|
||||
pass
|
||||
return
|
||||
|
||||
|
||||
@@ -597,7 +593,6 @@ scylla_tests = set([
|
||||
'test/boost/linearizing_input_stream_test',
|
||||
'test/boost/lister_test',
|
||||
'test/boost/locator_topology_test',
|
||||
'test/boost/lock_tables_metadata_test',
|
||||
'test/boost/log_heap_test',
|
||||
'test/boost/logalloc_standard_allocator_segment_pool_backend_test',
|
||||
'test/boost/logalloc_test',
|
||||
@@ -858,10 +853,6 @@ arg_parser.add_argument('--coverage', action = 'store_true', help = 'Compile scy
|
||||
arg_parser.add_argument('--build-dir', action='store', default='build',
|
||||
help='Build directory path')
|
||||
arg_parser.add_argument('--disable-precompiled-header', action='store_true', default=False, help='Disable precompiled header for scylla binary')
|
||||
arg_parser.add_argument('--time-trace', action='store_true', default=False,
|
||||
help='Enable Clang -ftime-trace for build profiling. '
|
||||
'Each .o produces a .json file analyzable with '
|
||||
'ClangBuildAnalyzer or chrome://tracing')
|
||||
arg_parser.add_argument('-h', '--help', action='store_true', help='show this help message and exit')
|
||||
args = arg_parser.parse_args()
|
||||
if args.help:
|
||||
@@ -1668,7 +1659,6 @@ deps['test/boost/combined_tests'] += [
|
||||
'test/boost/auth_cache_test.cc',
|
||||
'test/boost/auth_test.cc',
|
||||
'test/boost/batchlog_manager_test.cc',
|
||||
'test/boost/table_helper_test.cc',
|
||||
'test/boost/cache_algorithm_test.cc',
|
||||
'test/boost/castas_fcts_test.cc',
|
||||
'test/boost/cdc_test.cc',
|
||||
@@ -1720,7 +1710,7 @@ deps['test/boost/combined_tests'] += [
|
||||
'test/boost/sstable_compression_config_test.cc',
|
||||
'test/boost/sstable_directory_test.cc',
|
||||
'test/boost/sstable_set_test.cc',
|
||||
'test/boost/sstable_tablet_streaming_test.cc',
|
||||
'test/boost/sstable_tablet_streaming.cc',
|
||||
'test/boost/statement_restrictions_test.cc',
|
||||
'test/boost/storage_proxy_test.cc',
|
||||
'test/boost/tablets_test.cc',
|
||||
@@ -1975,9 +1965,6 @@ user_cflags += ' -fextend-variable-liveness=none'
|
||||
if args.target != '':
|
||||
user_cflags += ' -march=' + args.target
|
||||
|
||||
if args.time_trace:
|
||||
user_cflags += ' -ftime-trace'
|
||||
|
||||
for mode in modes:
|
||||
# Those flags are passed not only to Scylla objects, but also to libraries
|
||||
# that we compile ourselves.
|
||||
@@ -2470,9 +2457,6 @@ def write_build_file(f,
|
||||
command = reloc/build_deb.sh --reloc-pkg $in --builddir $out
|
||||
rule unified
|
||||
command = unified/build_unified.sh --build-dir $builddir/$mode --unified-pkg $out
|
||||
rule collect_pkgs
|
||||
command = rm -rf $out && mkdir -p $out && cp $pkgs $out/
|
||||
description = COLLECT $out
|
||||
rule rust_header
|
||||
command = cxxbridge --include rust/cxx.h --header $in > $out
|
||||
description = RUST_HEADER $out
|
||||
@@ -2958,8 +2942,6 @@ def write_build_file(f,
|
||||
build dist-tar: phony dist-unified-tar dist-server-tar dist-python3-tar dist-cqlsh-tar
|
||||
|
||||
build dist: phony dist-unified dist-server dist-python3 dist-cqlsh
|
||||
|
||||
build collect-dist: phony {' '.join([f'collect-dist-{mode}' for mode in default_modes])}
|
||||
'''))
|
||||
|
||||
f.write(textwrap.dedent(f'''\
|
||||
@@ -2967,28 +2949,7 @@ def write_build_file(f,
|
||||
rule dist-check
|
||||
command = ./tools/testing/dist-check/dist-check.sh --mode $mode
|
||||
'''))
|
||||
deb_arch = {'x86_64': 'amd64', 'aarch64': 'arm64'}[arch]
|
||||
deb_ver = f'{scylla_version}-{scylla_release}-1'
|
||||
rpm_ver = f'{scylla_version}-{scylla_release}'
|
||||
for mode in build_modes:
|
||||
server_rpms_dir = f'$builddir/dist/{mode}/redhat/RPMS/{arch}'
|
||||
server_rpms = [f'{server_rpms_dir}/{scylla_product}{suffix}-{rpm_ver}.{arch}.rpm'
|
||||
for suffix in ['', '-server', '-server-debuginfo', '-conf', '-kernel-conf', '-node-exporter']]
|
||||
cqlsh_rpms = [f'tools/cqlsh/build/redhat/RPMS/{arch}/{scylla_product}-cqlsh-{rpm_ver}.{arch}.rpm']
|
||||
python3_rpms = [f'tools/python3/build/redhat/RPMS/{arch}/{scylla_product}-python3-{rpm_ver}.{arch}.rpm']
|
||||
all_rpms = server_rpms + cqlsh_rpms + python3_rpms
|
||||
|
||||
server_deb_dir = f'$builddir/dist/{mode}/debian'
|
||||
server_debs = [f'{server_deb_dir}/{scylla_product}{suffix}_{deb_ver}_{deb_arch}.deb'
|
||||
for suffix in ['', '-server', '-server-dbg', '-conf', '-kernel-conf', '-node-exporter']]
|
||||
server_debs += [f'{server_deb_dir}/scylla-enterprise{suffix}_{deb_ver}_all.deb'
|
||||
for suffix in ['', '-server', '-conf', '-kernel-conf', '-node-exporter']]
|
||||
cqlsh_debs = [f'tools/cqlsh/build/debian/{scylla_product}-cqlsh_{deb_ver}_{deb_arch}.deb',
|
||||
f'tools/cqlsh/build/debian/scylla-enterprise-cqlsh_{deb_ver}_all.deb']
|
||||
python3_debs = [f'tools/python3/build/debian/{scylla_product}-python3_{deb_ver}_{deb_arch}.deb',
|
||||
f'tools/python3/build/debian/scylla-enterprise-python3_{deb_ver}_all.deb']
|
||||
all_debs = server_debs + cqlsh_debs + python3_debs
|
||||
|
||||
f.write(textwrap.dedent(f'''\
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
@@ -2996,11 +2957,6 @@ def write_build_file(f,
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-package.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
|
||||
build $builddir/{mode}/dist/rpm: collect_pkgs | {' '.join(all_rpms)} $builddir/dist/{mode}/redhat dist-cqlsh-rpm dist-python3-rpm
|
||||
pkgs = {' '.join(all_rpms)}
|
||||
build $builddir/{mode}/dist/deb: collect_pkgs | {' '.join(all_debs)} $builddir/dist/{mode}/debian dist-cqlsh-deb dist-python3-deb
|
||||
pkgs = {' '.join(all_debs)}
|
||||
build collect-dist-{mode}: phony $builddir/{mode}/dist/rpm $builddir/{mode}/dist/deb
|
||||
build {mode}-dist: phony dist-server-{mode} dist-server-debuginfo-{mode} dist-python3-{mode} dist-unified-{mode} dist-cqlsh-{mode}
|
||||
build dist-{mode}: phony {mode}-dist
|
||||
build dist-check-{mode}: dist-check
|
||||
|
||||
@@ -136,9 +136,9 @@ public:
|
||||
{}
|
||||
|
||||
future<> insert(auth::authenticated_user user, cql3::prepared_cache_key_type prep_cache_key, value_type v) noexcept {
|
||||
return _cache.insert(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
|
||||
return _cache.get_ptr(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
|
||||
return make_ready_future<value_type>(std::move(v));
|
||||
});
|
||||
}).discard_result();
|
||||
}
|
||||
|
||||
value_ptr find(const auth::authenticated_user& user, const cql3::prepared_cache_key_type& prep_cache_key) {
|
||||
|
||||
@@ -1070,7 +1070,7 @@ try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database
|
||||
.args = {},
|
||||
};
|
||||
} else {
|
||||
throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument, got {}", fc.args[0]));
|
||||
throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument", fc.args[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -339,7 +339,7 @@ static storage_options::object_storage object_storage_from_map(std::string_view
|
||||
}
|
||||
if (values.size() > allowed_options.size()) {
|
||||
throw std::runtime_error(fmt::format("Extraneous options for {}: {}; allowed: {}",
|
||||
type, fmt::join(values | std::views::keys, ","),
|
||||
fmt::join(values | std::views::keys, ","), type,
|
||||
fmt::join(allowed_options | std::views::keys, ",")));
|
||||
}
|
||||
options.type = std::string(type);
|
||||
|
||||
@@ -776,7 +776,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
|
||||
friend std::ostream& operator<<(std::ostream&, const segment&);
|
||||
friend class segment_manager;
|
||||
|
||||
constexpr size_t sector_overhead(size_t size) const {
|
||||
size_t sector_overhead(size_t size) const {
|
||||
return (size / (_alignment - detail::sector_overhead_size)) * detail::sector_overhead_size;
|
||||
}
|
||||
|
||||
@@ -1028,21 +1028,18 @@ public:
|
||||
co_return me;
|
||||
}
|
||||
|
||||
std::tuple<size_t, size_t> buffer_usage_size(size_t s) const {
|
||||
/**
|
||||
* Allocate a new buffer
|
||||
*/
|
||||
void new_buffer(size_t s) {
|
||||
SCYLLA_ASSERT(_buffer.empty());
|
||||
|
||||
auto overhead = segment_overhead_size;
|
||||
if (_file_pos == 0) {
|
||||
overhead += descriptor_header_size;
|
||||
}
|
||||
|
||||
return {s + overhead, overhead};
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocate a new buffer
|
||||
*/
|
||||
void new_buffer(size_t size_in) {
|
||||
SCYLLA_ASSERT(_buffer.empty());
|
||||
auto [s, overhead] = buffer_usage_size(size_in);
|
||||
s += overhead;
|
||||
// add bookkeep data reqs.
|
||||
auto a = align_up(s + sector_overhead(s), _alignment);
|
||||
auto k = std::max(a, default_size);
|
||||
@@ -1430,9 +1427,6 @@ public:
|
||||
|
||||
position_type next_position(size_t size) const {
|
||||
auto used = _buffer_ostream_size - _buffer_ostream.size();
|
||||
if (used == 0) { // new chunk/segment
|
||||
std::tie(size, std::ignore) = buffer_usage_size(size);
|
||||
}
|
||||
used += size;
|
||||
return _file_pos + used + sector_overhead(used);
|
||||
}
|
||||
@@ -1576,6 +1570,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
clogger.debug("Attempting oversized alloc of {} entry writer", writer.num_entries);
|
||||
|
||||
auto size = writer.size();
|
||||
auto max_file_size = cfg.commitlog_segment_size_in_mb * 1024 * 1024;
|
||||
|
||||
// check if this cannot be written at all...
|
||||
if (!cfg.allow_going_over_size_limit) {
|
||||
@@ -1584,11 +1579,11 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
// more worst case
|
||||
auto size_with_meta_overhead = size_with_sector_overhead
|
||||
+ (1 + size_with_sector_overhead/max_mutation_size) * (segment::entry_overhead_size + segment::fragmented_entry_overhead_size + segment::segment_overhead_size)
|
||||
* (1 + size_with_sector_overhead/max_size) * segment::descriptor_header_size
|
||||
* (1 + size_with_sector_overhead/max_file_size) * segment::descriptor_header_size
|
||||
;
|
||||
// this is not really true. We could have some space in current segment,
|
||||
// but again, lets be conservative.
|
||||
auto max_file_size_avail = max_disk_size - max_size;
|
||||
auto max_file_size_avail = max_disk_size - max_file_size;
|
||||
|
||||
if (size_with_meta_overhead > max_file_size_avail) {
|
||||
throw std::invalid_argument(fmt::format("Mutation of {} bytes is too large for potentially available disk space of {}", size, max_file_size_avail));
|
||||
@@ -1775,13 +1770,11 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
co_await s->close();
|
||||
s = co_await get_segment();
|
||||
}
|
||||
// bytes not counting overhead
|
||||
auto pos = s->position();
|
||||
auto max = std::max<size_t>(pos, max_size);
|
||||
auto buf_rem = std::min(max_size - max, s->_buffer_ostream.size());
|
||||
// bytes not counting overhead
|
||||
auto buf_rem = std::min(max_size - s->position(), s->_buffer_ostream.size());
|
||||
|
||||
size_t avail;
|
||||
if (buf_rem >= align) {
|
||||
if (buf_rem > align) {
|
||||
auto rem2 = buf_rem - (1 + buf_rem/sector_size) * detail::sector_overhead_size;
|
||||
avail = std::min(rem2, max_mutation_size)
|
||||
- segment::entry_overhead_size
|
||||
@@ -1791,7 +1784,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
} else {
|
||||
co_await s->cycle();
|
||||
auto pos = s->position();
|
||||
auto max = std::max<size_t>(pos, max_size);
|
||||
auto max = std::max<size_t>(pos, max_file_size);
|
||||
auto file_rem = max - pos;
|
||||
|
||||
if (file_rem < align) {
|
||||
|
||||
@@ -217,7 +217,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
|
||||
if (cm_it == local_cm.end()) {
|
||||
if (!cer.get_column_mapping()) {
|
||||
rlogger.debug("replaying at {} v={} at {}", fm.column_family_id(), fm.schema_version(), rp);
|
||||
throw std::runtime_error(format("unknown schema version {}, table={}", fm.schema_version(), fm.column_family_id()));
|
||||
throw std::runtime_error(format("unknown schema version {}, table=", fm.schema_version(), fm.column_family_id()));
|
||||
}
|
||||
rlogger.debug("new schema version {} in entry {}", fm.schema_version(), rp);
|
||||
cm_it = local_cm.emplace(fm.schema_version(), *cer.get_column_mapping()).first;
|
||||
|
||||
@@ -1921,7 +1921,7 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
|
||||
{"lwt", feature::UNUSED},
|
||||
{"udf", feature::UDF},
|
||||
{"cdc", feature::UNUSED},
|
||||
{"alternator-streams", feature::UNUSED},
|
||||
{"alternator-streams", feature::ALTERNATOR_STREAMS},
|
||||
{"alternator-ttl", feature::UNUSED },
|
||||
{"consistent-topology-changes", feature::UNUSED},
|
||||
{"broadcast-tables", feature::BROADCAST_TABLES},
|
||||
|
||||
@@ -115,6 +115,7 @@ struct experimental_features_t {
|
||||
enum class feature {
|
||||
UNUSED,
|
||||
UDF,
|
||||
ALTERNATOR_STREAMS,
|
||||
BROADCAST_TABLES,
|
||||
KEYSPACE_STORAGE_OPTIONS,
|
||||
STRONGLY_CONSISTENT_TABLES,
|
||||
|
||||
@@ -327,7 +327,7 @@ redistribute(const std::vector<float>& p, unsigned me, unsigned k) {
|
||||
}
|
||||
}
|
||||
|
||||
hr_logger.trace(" pp after1={}", pp);
|
||||
hr_logger.trace(" pp after1=", pp);
|
||||
if (d.first == me) {
|
||||
// We only care what "me" sends, and only the elements in
|
||||
// the sorted list earlier than me could have forced it to
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "replica/database.hh"
|
||||
#include "db/consistency_level_type.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "db/config.hh"
|
||||
#include "schema/schema_builder.hh"
|
||||
#include "timeout_config.hh"
|
||||
#include "types/types.hh"
|
||||
@@ -21,6 +22,8 @@
|
||||
#include "cdc/generation.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
|
||||
#include "service/migration_manager.hh"
|
||||
#include "locator/host_id.hh"
|
||||
|
||||
@@ -38,10 +41,27 @@ static logging::logger dlogger("system_distributed_keyspace");
|
||||
extern logging::logger cdc_log;
|
||||
|
||||
namespace db {
|
||||
namespace {
|
||||
const auto set_wait_for_sync_to_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if ((builder.ks_name() == system_distributed_keyspace::NAME_EVERYWHERE && builder.cf_name() == system_distributed_keyspace::CDC_GENERATIONS_V2) ||
|
||||
(builder.ks_name() == system_distributed_keyspace::NAME && builder.cf_name() == system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION))
|
||||
{
|
||||
builder.set_wait_for_sync_to_commitlog(true);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
extern thread_local data_type cdc_streams_set_type;
|
||||
thread_local data_type cdc_streams_set_type = set_type_impl::get_instance(bytes_type, false);
|
||||
|
||||
/* See `token_range_description` struct */
|
||||
thread_local data_type cdc_streams_list_type = list_type_impl::get_instance(bytes_type, false);
|
||||
thread_local data_type cdc_token_range_description_type = tuple_type_impl::get_instance(
|
||||
{ long_type // dht::token token_range_end;
|
||||
, cdc_streams_list_type // std::vector<stream_id> streams;
|
||||
, byte_type // uint8_t sharding_ignore_msb;
|
||||
});
|
||||
thread_local data_type cdc_generation_description_type = list_type_impl::get_instance(cdc_token_range_description_type, false);
|
||||
|
||||
schema_ptr view_build_status() {
|
||||
static thread_local auto schema = [] {
|
||||
@@ -57,6 +77,42 @@ schema_ptr view_build_status() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
/* An internal table used by nodes to exchange CDC generation data. */
|
||||
schema_ptr cdc_generations_v2() {
|
||||
thread_local auto schema = [] {
|
||||
auto id = generate_legacy_id(system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2);
|
||||
return schema_builder(system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2, {id})
|
||||
/* The unique identifier of this generation. */
|
||||
.with_column("id", uuid_type, column_kind::partition_key)
|
||||
/* The generation describes a mapping from all tokens in the token ring to a set of stream IDs.
|
||||
* This mapping is built from a bunch of smaller mappings, each describing how tokens in a subrange
|
||||
* of the token ring are mapped to stream IDs; these subranges together cover the entire token ring.
|
||||
* Each such range-local mapping is represented by a row of this table.
|
||||
* The clustering key of the row is the end of the range being described by this row.
|
||||
* The start of this range is the range_end of the previous row (in the clustering order, which is the integer order)
|
||||
* or of the last row of this partition if this is the first the first row. */
|
||||
.with_column("range_end", long_type, column_kind::clustering_key)
|
||||
/* The set of streams mapped to in this range.
|
||||
* The number of streams mapped to a single range in a CDC generation is bounded from above by the number
|
||||
* of shards on the owner of that range in the token ring.
|
||||
* In other words, the number of elements of this set is bounded by the maximum of the number of shards
|
||||
* over all nodes. The serialized size is obtained by counting about 20B for each stream.
|
||||
* For example, if all nodes in the cluster have at most 128 shards,
|
||||
* the serialized size of this set will be bounded by ~2.5 KB. */
|
||||
.with_column("streams", cdc_streams_set_type)
|
||||
/* The value of the `ignore_msb` sharding parameter of the node which was the owner of this token range
|
||||
* when the generation was first created. Together with the set of streams above it fully describes
|
||||
* the mapping for this particular range. */
|
||||
.with_column("ignore_msb", byte_type)
|
||||
/* Column used for sanity checking.
|
||||
* For a given generation it's equal to the number of ranges in this generation;
|
||||
* thus, after the generation is fully inserted, it must be equal to the number of rows in the partition. */
|
||||
.with_column("num_ranges", int32_type, column_kind::static_column)
|
||||
.with_hash_version()
|
||||
.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
/* A user-facing table providing identifiers of the streams used in CDC generations. */
|
||||
schema_ptr cdc_desc() {
|
||||
@@ -96,6 +152,23 @@ schema_ptr cdc_timestamps() {
|
||||
|
||||
static const sstring CDC_TIMESTAMPS_KEY = "timestamps";
|
||||
|
||||
schema_ptr service_levels() {
|
||||
static thread_local auto schema = [] {
|
||||
auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS);
|
||||
auto builder = schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS, std::make_optional(id))
|
||||
.with_column("service_level", utf8_type, column_kind::partition_key)
|
||||
.with_column("shares", int32_type);
|
||||
if (utils::get_local_injector().is_enabled("service_levels_v1_table_without_shares")) {
|
||||
builder.remove_column("shares");
|
||||
}
|
||||
|
||||
return builder
|
||||
.with_hash_version()
|
||||
.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
// This is the set of tables which this node ensures to exist in the cluster.
|
||||
// It does that by announcing the creation of these schemas on initialization
|
||||
// of the `system_distributed_keyspace` service (see `start()`), unless it first
|
||||
@@ -109,13 +182,19 @@ static const sstring CDC_TIMESTAMPS_KEY = "timestamps";
|
||||
static std::vector<schema_ptr> ensured_tables() {
|
||||
return {
|
||||
view_build_status(),
|
||||
cdc_generations_v2(),
|
||||
cdc_desc(),
|
||||
cdc_timestamps(),
|
||||
service_levels(),
|
||||
};
|
||||
}
|
||||
|
||||
std::vector<schema_ptr> system_distributed_keyspace::all_distributed_tables() {
|
||||
return {view_build_status(), cdc_desc(), cdc_timestamps()};
|
||||
return {view_build_status(), cdc_desc(), cdc_timestamps(), service_levels()};
|
||||
}
|
||||
|
||||
std::vector<schema_ptr> system_distributed_keyspace::all_everywhere_tables() {
|
||||
return {cdc_generations_v2()};
|
||||
}
|
||||
|
||||
system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm, service::storage_proxy& sp)
|
||||
@@ -124,6 +203,36 @@ system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor&
|
||||
, _sp(sp) {
|
||||
}
|
||||
|
||||
static std::vector<std::pair<std::string_view, data_type>> new_service_levels_columns(bool workload_prioritization_enabled) {
|
||||
std::vector<std::pair<std::string_view, data_type>> new_columns {{"timeout", duration_type}, {"workload_type", utf8_type}};
|
||||
if (workload_prioritization_enabled) {
|
||||
new_columns.push_back({"shares", int32_type});
|
||||
}
|
||||
return new_columns;
|
||||
};
|
||||
|
||||
static schema_ptr get_current_service_levels(data_dictionary::database db) {
|
||||
return db.has_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS)
|
||||
? db.find_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS)
|
||||
: service_levels();
|
||||
}
|
||||
|
||||
static schema_ptr get_updated_service_levels(data_dictionary::database db, bool workload_prioritization_enabled) {
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
auto schema = get_current_service_levels(db);
|
||||
schema_builder b(schema);
|
||||
for (const auto& col : new_service_levels_columns(workload_prioritization_enabled)) {
|
||||
auto& [col_name, col_type] = col;
|
||||
bytes options_name = to_bytes(col_name.data());
|
||||
if (schema->get_column_definition(options_name)) {
|
||||
continue;
|
||||
}
|
||||
b.with_column(options_name, col_type, column_kind::regular_column);
|
||||
}
|
||||
b.with_hash_version();
|
||||
return b.build();
|
||||
}
|
||||
|
||||
future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tables) {
|
||||
if (this_shard_id() != 0) {
|
||||
_started = true;
|
||||
@@ -134,9 +243,11 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
|
||||
|
||||
while (true) {
|
||||
// Check if there is any work to do before taking the group 0 guard.
|
||||
bool keyspaces_setup = db.has_keyspace(NAME);
|
||||
bool workload_prioritization_enabled = _sp.features().workload_prioritization;
|
||||
bool keyspaces_setup = db.has_keyspace(NAME) && db.has_keyspace(NAME_EVERYWHERE);
|
||||
bool tables_setup = std::all_of(tables.begin(), tables.end(), [db] (schema_ptr t) { return db.has_schema(t->ks_name(), t->cf_name()); } );
|
||||
if (keyspaces_setup && tables_setup) {
|
||||
bool service_levels_up_to_date = get_current_service_levels(db)->equal_columns(*get_updated_service_levels(db, workload_prioritization_enabled));
|
||||
if (keyspaces_setup && tables_setup && service_levels_up_to_date) {
|
||||
dlogger.info("system_distributed(_everywhere) keyspaces and tables are up-to-date. Not creating");
|
||||
_started = true;
|
||||
co_return;
|
||||
@@ -147,25 +258,51 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
|
||||
utils::chunked_vector<mutation> mutations;
|
||||
sstring description;
|
||||
|
||||
auto ksm = keyspace_metadata::new_keyspace(
|
||||
auto sd_ksm = keyspace_metadata::new_keyspace(
|
||||
NAME,
|
||||
"org.apache.cassandra.locator.SimpleStrategy",
|
||||
{{"replication_factor", "3"}},
|
||||
std::nullopt, std::nullopt);
|
||||
if (!db.has_keyspace(NAME)) {
|
||||
mutations = service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts);
|
||||
mutations = service::prepare_new_keyspace_announcement(db.real_database(), sd_ksm, ts);
|
||||
description += format(" create {} keyspace;", NAME);
|
||||
} else {
|
||||
dlogger.info("{} keyspace is already present. Not creating", NAME);
|
||||
}
|
||||
|
||||
// Get mutations for creating tables.
|
||||
auto sde_ksm = keyspace_metadata::new_keyspace(
|
||||
NAME_EVERYWHERE,
|
||||
"org.apache.cassandra.locator.EverywhereStrategy",
|
||||
{},
|
||||
std::nullopt, std::nullopt);
|
||||
if (!db.has_keyspace(NAME_EVERYWHERE)) {
|
||||
auto sde_mutations = service::prepare_new_keyspace_announcement(db.real_database(), sde_ksm, ts);
|
||||
std::move(sde_mutations.begin(), sde_mutations.end(), std::back_inserter(mutations));
|
||||
description += format(" create {} keyspace;", NAME_EVERYWHERE);
|
||||
} else {
|
||||
dlogger.info("{} keyspace is already present. Not creating", NAME_EVERYWHERE);
|
||||
}
|
||||
|
||||
// Get mutations for creating and updating tables.
|
||||
auto num_keyspace_mutations = mutations.size();
|
||||
co_await coroutine::parallel_for_each(ensured_tables(),
|
||||
[this, &mutations, db, ts, ksm] (auto&& table) -> future<> {
|
||||
[this, &mutations, db, ts, sd_ksm, sde_ksm, workload_prioritization_enabled] (auto&& table) -> future<> {
|
||||
auto ksm = table->ks_name() == NAME ? sd_ksm : sde_ksm;
|
||||
|
||||
// Ensure that the service_levels table contains new columns.
|
||||
if (table->cf_name() == SERVICE_LEVELS) {
|
||||
table = get_updated_service_levels(db, workload_prioritization_enabled);
|
||||
}
|
||||
|
||||
if (!db.has_schema(table->ks_name(), table->cf_name())) {
|
||||
co_return co_await service::prepare_new_column_family_announcement(mutations, _sp, *ksm, std::move(table), ts);
|
||||
}
|
||||
|
||||
// The service_levels table exists. Update it if it lacks new columns.
|
||||
if (table->cf_name() == SERVICE_LEVELS && !get_current_service_levels(db)->equal_columns(*table)) {
|
||||
auto update_mutations = co_await service::prepare_column_family_update_announcement(_sp, table, std::vector<view_ptr>(), ts);
|
||||
std::move(update_mutations.begin(), update_mutations.end(), std::back_inserter(mutations));
|
||||
}
|
||||
});
|
||||
if (mutations.size() > num_keyspace_mutations) {
|
||||
description += " create and update system_distributed(_everywhere) tables";
|
||||
@@ -187,6 +324,15 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
|
||||
}
|
||||
}
|
||||
|
||||
future<> system_distributed_keyspace::start_workload_prioritization() {
|
||||
if (this_shard_id() != 0) {
|
||||
co_return;
|
||||
}
|
||||
if (_qp.db().features().workload_prioritization) {
|
||||
co_await create_tables({get_updated_service_levels(_qp.db(), true)});
|
||||
}
|
||||
}
|
||||
|
||||
future<> system_distributed_keyspace::start() {
|
||||
if (this_shard_id() != 0) {
|
||||
_started = true;
|
||||
@@ -229,6 +375,90 @@ static db::consistency_level quorum_if_many(size_t num_token_owners) {
|
||||
return num_token_owners > 1 ? db::consistency_level::QUORUM : db::consistency_level::ONE;
|
||||
}
|
||||
|
||||
future<>
|
||||
system_distributed_keyspace::insert_cdc_generation(
|
||||
utils::UUID id,
|
||||
const cdc::topology_description& desc,
|
||||
context ctx) {
|
||||
using namespace std::chrono_literals;
|
||||
|
||||
const size_t concurrency = 10;
|
||||
const size_t num_replicas = ctx.num_token_owners;
|
||||
|
||||
// To insert the data quickly and efficiently we send it in batches of multiple rows
|
||||
// (each batch represented by a single mutation). We also send multiple such batches concurrently.
|
||||
// However, we need to limit the memory consumption of the operation.
|
||||
// I assume that the memory consumption grows linearly with the number of replicas
|
||||
// (we send to all replicas ``at the same time''), with the batch size (the data must
|
||||
// be copied for each replica?) and with concurrency. These assumptions may be too conservative
|
||||
// but that won't hurt in a significant way (it may hurt the efficiency of the operation a little).
|
||||
// Thus, if we want to limit the memory consumption to L, it should be true that
|
||||
// mutation_size * num_replicas * concurrency <= L, hence
|
||||
// mutation_size <= L / (num_replicas * concurrency).
|
||||
// For example, say L = 10MB, concurrency = 10, num_replicas = 100; we get
|
||||
// mutation_size <= 10MB / 1000 = 10KB.
|
||||
// On the other hand we must have mutation_size >= size of a single row,
|
||||
// so we will use mutation_size <= max(size of single row, L/(num_replicas*concurrency)).
|
||||
|
||||
// It has been tested that sending 1MB batches to 3 replicas with concurrency 20 works OK,
|
||||
// which would correspond to L ~= 60MB. Hence that's the limit we use here.
|
||||
const size_t L = 60'000'000;
|
||||
const auto mutation_size_threshold = std::max(size_t(1), L / (num_replicas * concurrency));
|
||||
|
||||
auto s = _qp.db().real_database().find_schema(
|
||||
system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2);
|
||||
auto ms = co_await cdc::get_cdc_generation_mutations_v2(s, id, desc, mutation_size_threshold, api::new_timestamp());
|
||||
co_await max_concurrent_for_each(ms, concurrency, [&] (mutation& m) -> future<> {
|
||||
co_await _sp.mutate(
|
||||
{ std::move(m) },
|
||||
db::consistency_level::ALL,
|
||||
db::timeout_clock::now() + 60s,
|
||||
nullptr, // trace_state
|
||||
empty_service_permit(),
|
||||
db::allow_per_partition_rate_limit::no,
|
||||
false // raw_counters
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
// Reads the CDC generation identified by `id` from the
// NAME_EVERYWHERE.CDC_GENERATIONS_V2 table.
//
// Returns std::nullopt when the partition has no rows; otherwise returns a
// topology_description built from every row that was read.
// Throws std::runtime_error if the number of rows read differs from the value
// stored in the `num_ranges` column.
future<std::optional<cdc::topology_description>>
system_distributed_keyspace::read_cdc_generation(utils::UUID id) {
    utils::chunked_vector<cdc::token_range_description> entries;
    size_t num_ranges = 0;

    // Invoked once per returned row: decodes the row into a
    // token_range_description and remembers the row count the table claims.
    auto on_row = [&] (const cql3::untyped_result_set_row& row) {
        std::vector<cdc::stream_id> streams;
        row.get_list_data<bytes>("streams", std::back_inserter(streams));

        entries.push_back(cdc::token_range_description{
                dht::token::from_int64(row.get_as<int64_t>("range_end")),
                std::move(streams),
                uint8_t(row.get_as<int8_t>("ignore_msb"))});

        num_ranges = row.get_as<int32_t>("num_ranges");
        return make_ready_future<stop_iteration>(stop_iteration::no);
    };

    co_await _qp.query_internal(
            // This should be a local read so 20s should be more than enough
            format("SELECT range_end, streams, ignore_msb, num_ranges FROM {}.{} WHERE id = ? USING TIMEOUT 20s", NAME_EVERYWHERE, CDC_GENERATIONS_V2),
            db::consistency_level::ONE, // we wrote the generation with ALL so ONE must see it (or there's something really wrong)
            { id },
            1000, // for ~1KB rows, ~1MB page size
            std::move(on_row));

    const auto rows_read = entries.size();
    if (rows_read == 0) {
        co_return std::nullopt;
    }

    // Paranoid sanity check. Partial reads should not happen since generations
    // should be retrieved only after they were written successfully with CL=ALL.
    // But nobody uses EverywhereStrategy tables so they weren't ever properly
    // tested, so just in case...
    if (rows_read != num_ranges) {
        throw std::runtime_error(format(
                "read_cdc_generation: wrong number of rows. The `num_ranges` column claimed {} rows,"
                " but reading the partition returned {}.", num_ranges, rows_read));
    }

    co_return std::optional{cdc::topology_description(std::move(entries))};
}
|
||||
|
||||
static future<utils::chunked_vector<mutation>> get_cdc_streams_descriptions_v2_mutation(
|
||||
const replica::database& db,
|
||||
db_clock::time_point time,
|
||||
@@ -400,4 +630,65 @@ system_distributed_keyspace::cdc_current_generation_timestamp(context ctx) {
|
||||
co_return timestamp_cql->one().get_as<db_clock::time_point>("time");
|
||||
}
|
||||
|
||||
// Fetches service levels from the NAME.SERVICE_LEVELS table by delegating to
// qos::get_service_levels with consistency level ONE and the caller's context.
future<qos::service_levels_info> system_distributed_keyspace::get_service_levels(qos::query_context ctx) const {
    const auto read_cl = db::consistency_level::ONE;
    return qos::get_service_levels(_qp, NAME, SERVICE_LEVELS, read_cl, ctx);
}
|
||||
|
||||
// Fetches the service level named `service_level_name` from the
// NAME.SERVICE_LEVELS table by delegating to qos::get_service_level with
// consistency level ONE.
future<qos::service_levels_info> system_distributed_keyspace::get_service_level(sstring service_level_name) const {
    const auto read_cl = db::consistency_level::ONE;
    return qos::get_service_level(_qp, NAME, SERVICE_LEVELS, service_level_name, read_cl);
}
|
||||
|
||||
// Creates or updates the row for `service_level_name` in NAME.SERVICE_LEVELS.
//
// Three statements are issued in order, all at consistency level ONE:
//   1. INSERT of the service_level key,
//   2. UPDATE of `timeout` and `workload_type`,
//   3. UPDATE of `shares`.
// Options holding unset_marker or delete_marker are written as nulls, and an
// unspecified workload type is written as a null as well.
future<> system_distributed_keyspace::set_service_level(sstring service_level_name, qos::service_level_options slo) const {
    using options = qos::service_level_options;

    static sstring prepared_query = format("INSERT INTO {}.{} (service_level) VALUES (?);", NAME, SERVICE_LEVELS);
    co_await _qp.execute_internal(
            prepared_query,
            db::consistency_level::ONE,
            internal_distributed_query_state(),
            {service_level_name},
            cql3::query_processor::cache_internal::no);

    // Converts a timeout option into a CQL duration value (null for markers).
    auto timeout_to_data_value = [&] (const options::timeout_type& timeout) {
        return std::visit(overloaded_functor {
            [&] (const options::unset_marker&) {
                return data_value::make_null(duration_type);
            },
            [&] (const options::delete_marker&) {
                return data_value::make_null(duration_type);
            },
            [&] (const lowres_clock::duration& d) {
                const auto nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(d).count();
                return data_value(cql_duration(months_counter{0},
                        days_counter{0},
                        nanoseconds_counter{nanos}));
            },
        }, timeout);
    };

    // Same conversion for any option stored as variant<unset, delete, T>.
    auto option_to_data_value = [&] <typename T> (const std::variant<options::unset_marker, options::delete_marker, T>& option) {
        return std::visit(overloaded_functor {
            [&] (const options::unset_marker&) {
                return data_value::make_null(data_type_for<T>());
            },
            [&] (const options::delete_marker&) {
                return data_value::make_null(data_type_for<T>());
            },
            [&] (const T& value) {
                return data_value(value);
            },
        }, option);
    };

    data_value workload = [&] () -> data_value {
        if (slo.workload == options::workload_type::unspecified) {
            return data_value::make_null(utf8_type);
        }
        return data_value(options::to_string(slo.workload));
    }();

    co_await _qp.execute_internal(
            format("UPDATE {}.{} SET timeout = ?, workload_type = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS),
            db::consistency_level::ONE,
            internal_distributed_query_state(),
            {timeout_to_data_value(slo.timeout),
                workload,
                service_level_name},
            cql3::query_processor::cache_internal::no);

    co_await _qp.execute_internal(
            format("UPDATE {}.{} SET shares = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS),
            db::consistency_level::ONE,
            internal_distributed_query_state(),
            {option_to_data_value(slo.shares), service_level_name},
            cql3::query_processor::cache_internal::no);
}
|
||||
|
||||
// Deletes the row for `service_level_name` from NAME.SERVICE_LEVELS at
// consistency level ONE; the statement's result set is discarded.
future<> system_distributed_keyspace::drop_service_level(sstring service_level_name) const {
    static sstring prepared_query = format("DELETE FROM {}.{} WHERE service_level= ?;", NAME, SERVICE_LEVELS);
    return _qp.execute_internal(
            prepared_query,
            db::consistency_level::ONE,
            internal_distributed_query_state(),
            {service_level_name},
            cql3::query_processor::cache_internal::no
        ).discard_result();
}
|
||||
|
||||
}
|
||||
|
||||
@@ -9,6 +9,9 @@
|
||||
#pragma once
|
||||
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "service/qos/qos_common.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "cdc/generation_id.hh"
|
||||
#include "locator/host_id.hh"
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
@@ -21,6 +24,7 @@ class query_processor;
|
||||
}
|
||||
|
||||
namespace cdc {
|
||||
class stream_id;
|
||||
class topology_description;
|
||||
class streams_version;
|
||||
} // namespace cdc
|
||||
@@ -35,8 +39,17 @@ namespace db {
|
||||
class system_distributed_keyspace {
|
||||
public:
|
||||
static constexpr auto NAME = "system_distributed";
|
||||
static constexpr auto NAME_EVERYWHERE = "system_distributed_everywhere";
|
||||
|
||||
static constexpr auto VIEW_BUILD_STATUS = "view_build_status";
|
||||
static constexpr auto SERVICE_LEVELS = "service_levels";
|
||||
|
||||
/* Nodes use this table to communicate new CDC stream generations to other nodes. */
|
||||
static constexpr auto CDC_TOPOLOGY_DESCRIPTION = "cdc_generation_descriptions";
|
||||
|
||||
/* Nodes use this table to communicate new CDC stream generations to other nodes.
|
||||
* Resides in system_distributed_everywhere. */
|
||||
static constexpr auto CDC_GENERATIONS_V2 = "cdc_generation_descriptions_v2";
|
||||
|
||||
/* This table is used by CDC clients to learn about available CDC streams. */
|
||||
static constexpr auto CDC_DESC_V2 = "cdc_streams_descriptions_v2";
|
||||
@@ -64,14 +77,19 @@ private:
|
||||
|
||||
public:
|
||||
static std::vector<schema_ptr> all_distributed_tables();
|
||||
static std::vector<schema_ptr> all_everywhere_tables();
|
||||
|
||||
system_distributed_keyspace(cql3::query_processor&, service::migration_manager&, service::storage_proxy&);
|
||||
|
||||
future<> start();
|
||||
future<> start_workload_prioritization();
|
||||
future<> stop();
|
||||
|
||||
bool started() const { return _started; }
|
||||
|
||||
future<> insert_cdc_generation(utils::UUID, const cdc::topology_description&, context);
|
||||
future<std::optional<cdc::topology_description>> read_cdc_generation(utils::UUID);
|
||||
|
||||
future<> create_cdc_desc(db_clock::time_point, const cdc::topology_description&, context);
|
||||
future<bool> cdc_desc_exists(db_clock::time_point, context);
|
||||
|
||||
@@ -87,6 +105,11 @@ public:
|
||||
// NOTE: currently used only by alternator
|
||||
future<db_clock::time_point> cdc_current_generation_timestamp(context);
|
||||
|
||||
future<qos::service_levels_info> get_service_levels(qos::query_context ctx) const;
|
||||
future<qos::service_levels_info> get_service_level(sstring service_level_name) const;
|
||||
future<> set_service_level(sstring service_level_name, qos::service_level_options slo) const;
|
||||
future<> drop_service_level(sstring service_level_name) const;
|
||||
|
||||
private:
|
||||
future<> create_tables(std::vector<schema_ptr> tables);
|
||||
};
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
|
||||
#include "db/view/view_update_backlog.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
|
||||
#include <seastar/core/cacheline.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
@@ -42,16 +41,13 @@ class node_update_backlog {
|
||||
std::chrono::milliseconds _interval;
|
||||
std::atomic<clock::time_point> _last_update;
|
||||
std::atomic<update_backlog> _max;
|
||||
utils::updateable_value<uint32_t> _view_flow_control_delay_limit_in_ms;
|
||||
|
||||
public:
|
||||
explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval,
|
||||
utils::updateable_value<uint32_t> view_flow_control_delay_limit_in_ms = utils::updateable_value<uint32_t>(1000))
|
||||
explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval)
|
||||
: _backlogs(shards)
|
||||
, _interval(interval)
|
||||
, _last_update(clock::now() - _interval)
|
||||
, _max(update_backlog::no_backlog())
|
||||
, _view_flow_control_delay_limit_in_ms(std::move(view_flow_control_delay_limit_in_ms)) {
|
||||
, _max(update_backlog::no_backlog()) {
|
||||
if (utils::get_local_injector().enter("update_backlog_immediately")) {
|
||||
_interval = std::chrono::milliseconds(0);
|
||||
_last_update = clock::now();
|
||||
@@ -63,9 +59,6 @@ public:
|
||||
update_backlog fetch_shard(unsigned shard);
|
||||
seastar::future<std::optional<update_backlog>> fetch_if_changed();
|
||||
|
||||
std::chrono::microseconds calculate_throttling_delay(update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout) const;
|
||||
|
||||
// Exposed for testing only.
|
||||
update_backlog load() const {
|
||||
return _max.load(std::memory_order_relaxed);
|
||||
|
||||
@@ -150,14 +150,14 @@ row_locker::unlock(const dht::decorated_key* pk, bool partition_exclusive,
|
||||
auto pli = _two_level_locks.find(*pk);
|
||||
if (pli == _two_level_locks.end()) {
|
||||
// This shouldn't happen... We can't unlock this lock if we can't find it...
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition {}", *pk);
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition", *pk);
|
||||
return;
|
||||
}
|
||||
SCYLLA_ASSERT(&pli->first == pk);
|
||||
if (cpk) {
|
||||
auto rli = pli->second._row_locks.find(*cpk);
|
||||
if (rli == pli->second._row_locks.end()) {
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row {}", *cpk);
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row", *cpk);
|
||||
return;
|
||||
}
|
||||
SCYLLA_ASSERT(&rli->first == cpk);
|
||||
|
||||
@@ -45,7 +45,6 @@
|
||||
#include "db/view/view_builder.hh"
|
||||
#include "db/view/view_updating_consumer.hh"
|
||||
#include "db/view/view_update_generator.hh"
|
||||
#include "db/view/node_view_update_backlog.hh"
|
||||
#include "db/view/regular_column_transformation.hh"
|
||||
#include "db/system_keyspace_view_types.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
@@ -3493,27 +3492,18 @@ future<> delete_ghost_rows_visitor::do_accept_new_row(partition_key pk, clusteri
|
||||
}
|
||||
}
|
||||
|
||||
// View updates are asynchronous, and because of this limiting their concurrency requires
|
||||
// a special approach. The current algorithm places all of the pending view updates in the backlog
|
||||
// and artificially slows down new responses to coordinator requests based on how full the backlog is.
|
||||
// This function calculates how much a request should be slowed down based on the backlog's fullness.
|
||||
// The equation is basically: delay(in seconds) = view_fullness_ratio^3
|
||||
// The more full the backlog gets the more aggressively the requests are slowed down.
|
||||
// The delay is limited to the amount of time left until timeout.
|
||||
// After the timeout the request fails, so there's no point in waiting longer than that.
|
||||
// The second argument defines this timeout point - we can't delay the request more than this time point.
|
||||
// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
|
||||
std::chrono::microseconds node_update_backlog::calculate_throttling_delay(update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout) const {
|
||||
std::chrono::microseconds calculate_view_update_throttling_delay(db::view::update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout,
|
||||
uint32_t view_flow_control_delay_limit_in_ms) {
|
||||
auto adjust = [] (float x) { return x * x * x; };
|
||||
auto budget = std::max(db::timeout_clock::duration(0),
|
||||
timeout - db::timeout_clock::now());
|
||||
std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * _view_flow_control_delay_limit_in_ms() * 1000));
|
||||
auto budget = std::max(service::storage_proxy::clock_type::duration(0),
|
||||
timeout - service::storage_proxy::clock_type::now());
|
||||
std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * view_flow_control_delay_limit_in_ms * 1000));
|
||||
// "budget" has millisecond resolution and can potentially be long
|
||||
// in the future so converting it to microseconds may overflow.
|
||||
// So to compare budget and ret we need to convert both to the lower
|
||||
// resolution.
|
||||
if (std::chrono::duration_cast<db::timeout_clock::duration>(ret) < budget) {
|
||||
if (std::chrono::duration_cast<service::storage_proxy::clock_type::duration>(ret) < budget) {
|
||||
return ret;
|
||||
} else {
|
||||
// budget is small (< ret) so can be converted to microseconds
|
||||
|
||||
@@ -715,7 +715,7 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
|
||||
vbw_logger.info("Building range {} for base table {} and views {} was aborted.", range, base_id, views_ids);
|
||||
} catch (...) {
|
||||
eptr = std::current_exception();
|
||||
vbw_logger.warn("Error during processing range {} for base table {} and views {}: {}", range, base_id, views_ids, eptr);
|
||||
vbw_logger.warn("Error during processing range {} for base table {} and views {}: ", range, base_id, views_ids, eptr);
|
||||
}
|
||||
reader.close().get();
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ public:
|
||||
// Returns the number of bytes in the backlog divided by the maximum number of bytes
|
||||
// that the backlog can hold before employing admission control. While the backlog
|
||||
// is below the threshold, the coordinator will slow down the view updates up to
|
||||
// node_update_backlog::calculate_throttling_delay()::delay_limit_us. Above the threshold,
|
||||
// calculate_view_update_throttling_delay()::delay_limit_us. Above the threshold,
|
||||
// the coordinator will reject the writes that would increase the backlog. On the
|
||||
// replica, the writes will start failing only after reaching the hard limit '_max'.
|
||||
float relative_size() const {
|
||||
@@ -70,4 +70,18 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// View updates are asynchronous, and because of this limiting their concurrency requires
|
||||
// a special approach. The current algorithm places all of the pending view updates in the backlog
|
||||
// and artificially slows down new responses to coordinator requests based on how full the backlog is.
|
||||
// This function calculates how much a request should be slowed down based on the backlog's fullness.
|
||||
// The equation is basically: delay(in seconds) = view_fullness_ratio^3
|
||||
// The more full the backlog gets the more aggressively the requests are slowed down.
|
||||
// The delay is limited to the amount of time left until timeout.
|
||||
// After the timeout the request fails, so there's no point in waiting longer than that.
|
||||
// The second argument defines this timeout point - we can't delay the request more than this time point.
|
||||
// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
|
||||
std::chrono::microseconds calculate_view_update_throttling_delay(
|
||||
update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout,
|
||||
uint32_t view_flow_control_delay_limit_in_ms);
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
*/
|
||||
|
||||
#include "db/view/view_update_backlog.hh"
|
||||
#include "db/view/node_view_update_backlog.hh"
|
||||
#include <seastar/core/timed_out_error.hh>
|
||||
#include "gms/inet_address.hh"
|
||||
#include <seastar/util/defer.hh>
|
||||
@@ -96,10 +95,9 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as)
|
||||
view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as)
|
||||
: _db(db)
|
||||
, _proxy(proxy)
|
||||
, _node_update_backlog(node_backlog)
|
||||
, _progress_tracker(std::make_unique<progress_tracker>())
|
||||
, _early_abort_subscription(as.subscribe([this] () noexcept { do_abort(); }))
|
||||
{
|
||||
@@ -114,7 +112,7 @@ future<> view_update_generator::start() {
|
||||
_started = seastar::async([this]() mutable {
|
||||
auto drop_sstable_references = defer([&] () noexcept {
|
||||
// Clear sstable references so sstables_manager::stop() doesn't hang.
|
||||
vug_logger.info("leaving {} unstaged sstables and {} sstables with tables unprocessed",
|
||||
vug_logger.info("leaving {} unstaged sstables unprocessed",
|
||||
_sstables_to_move.size(), _sstables_with_tables.size());
|
||||
_sstables_to_move.clear();
|
||||
_sstables_with_tables.clear();
|
||||
@@ -500,7 +498,7 @@ future<> view_update_generator::generate_and_propagate_view_updates(const replic
|
||||
// the one which limits the number of incoming client requests by delaying the response to the client.
|
||||
if (batch_num > 0) {
|
||||
update_backlog local_backlog = _db.get_view_update_backlog();
|
||||
std::chrono::microseconds throttle_delay = _node_update_backlog.calculate_throttling_delay(local_backlog, timeout);
|
||||
std::chrono::microseconds throttle_delay = calculate_view_update_throttling_delay(local_backlog, timeout, _db.get_config().view_flow_control_delay_limit_in_ms());
|
||||
|
||||
co_await seastar::sleep(throttle_delay);
|
||||
|
||||
|
||||
@@ -52,7 +52,6 @@ using allow_hints = bool_class<allow_hints_tag>;
|
||||
|
||||
namespace db::view {
|
||||
|
||||
class node_update_backlog;
|
||||
class stats;
|
||||
struct wait_for_all_updates_tag {};
|
||||
using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
|
||||
@@ -64,7 +63,6 @@ public:
|
||||
private:
|
||||
replica::database& _db;
|
||||
sharded<service::storage_proxy>& _proxy;
|
||||
node_update_backlog& _node_update_backlog;
|
||||
seastar::abort_source _as;
|
||||
future<> _started = make_ready_future<>();
|
||||
seastar::condition_variable _pending_sstables;
|
||||
@@ -77,7 +75,7 @@ private:
|
||||
optimized_optional<abort_source::subscription> _early_abort_subscription;
|
||||
void do_abort() noexcept;
|
||||
public:
|
||||
view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as);
|
||||
view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as);
|
||||
~view_update_generator();
|
||||
|
||||
future<> start();
|
||||
|
||||
68
dist/CMakeLists.txt
vendored
68
dist/CMakeLists.txt
vendored
@@ -141,72 +141,4 @@ add_dependencies(dist
|
||||
dist-python3
|
||||
dist-server)
|
||||
|
||||
set(dist_rpm_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/rpm")
|
||||
set(dist_deb_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/deb")
|
||||
|
||||
# Map system processor to Debian architecture names
|
||||
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
|
||||
set(deb_arch "amd64")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
||||
set(deb_arch "arm64")
|
||||
else()
|
||||
message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
endif()
|
||||
|
||||
set(rpm_ver "${Scylla_VERSION}-${Scylla_RELEASE}")
|
||||
set(deb_ver "${Scylla_VERSION}-${Scylla_RELEASE}-1")
|
||||
set(rpm_arch "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
|
||||
set(server_rpms_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/redhat/RPMS/${rpm_arch}")
|
||||
set(server_rpms
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-server-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-server-debuginfo-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-conf-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-kernel-conf-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-node-exporter-${rpm_ver}.${rpm_arch}.rpm")
|
||||
set(cqlsh_rpms
|
||||
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-cqlsh-${rpm_ver}.${rpm_arch}.rpm")
|
||||
set(python3_rpms
|
||||
"${CMAKE_SOURCE_DIR}/tools/python3/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-python3-${rpm_ver}.${rpm_arch}.rpm")
|
||||
|
||||
set(server_debs_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/debian")
|
||||
set(server_debs
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-server_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-server-dbg_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-conf_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-kernel-conf_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-node-exporter_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/scylla-enterprise_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-server_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-conf_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-kernel-conf_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-node-exporter_${deb_ver}_all.deb")
|
||||
set(cqlsh_debs
|
||||
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/${Scylla_PRODUCT}-cqlsh_${deb_ver}_${deb_arch}.deb"
|
||||
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/scylla-enterprise-cqlsh_${deb_ver}_all.deb")
|
||||
set(python3_debs
|
||||
"${CMAKE_SOURCE_DIR}/tools/python3/build/debian/${Scylla_PRODUCT}-python3_${deb_ver}_${deb_arch}.deb"
|
||||
"${CMAKE_SOURCE_DIR}/tools/python3/build/debian/scylla-enterprise-python3_${deb_ver}_all.deb")
|
||||
|
||||
add_custom_target(collect-dist-rpm
|
||||
COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_rpm_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_rpm_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${server_rpms} ${cqlsh_rpms} ${python3_rpms} ${dist_rpm_dir}/
|
||||
DEPENDS dist
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
|
||||
COMMENT "Collecting RPMs into ${dist_rpm_dir}")
|
||||
|
||||
add_custom_target(collect-dist-deb
|
||||
COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_deb_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_deb_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${server_debs} ${cqlsh_debs} ${python3_debs} ${dist_deb_dir}/
|
||||
DEPENDS dist
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
|
||||
COMMENT "Collecting DEBs into ${dist_deb_dir}")
|
||||
|
||||
add_custom_target(collect-dist
|
||||
DEPENDS collect-dist-rpm collect-dist-deb)
|
||||
|
||||
add_subdirectory(debuginfo)
|
||||
|
||||
@@ -324,13 +324,6 @@ experimental:
|
||||
stream events. Without this option, such no-op operations may still
|
||||
generate spurious stream events.
|
||||
<https://github.com/scylladb/scylladb/issues/28368>
|
||||
* When a stream is disabled, no new records are written but the existing
|
||||
stream data is preserved and remains readable through its original
|
||||
StreamArn. The data expires via TTL after 24 hours. Re-enabling the
|
||||
stream purges the old data immediately and produces a new StreamArn.
|
||||
In contrast, DynamoDB keeps the old stream and its data readable for
|
||||
24 hours through the old StreamArn even after re-enabling.
|
||||
<https://scylladb.atlassian.net/browse/SCYLLADB-1873>
|
||||
|
||||
## Unimplemented API features
|
||||
|
||||
|
||||
@@ -415,7 +415,7 @@ An empty list is allowed, and it's equivalent to numeric replication factor of 0
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE Excelsior
|
||||
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc2' : []};
|
||||
WITH replication = { 'class' : 'NetworkTopologyStrategy', dc2' : []};
|
||||
|
||||
|
||||
Altering from a rack list to a numeric replication factor is not supported.
|
||||
@@ -1017,11 +1017,11 @@ For example:
|
||||
|
||||
CREATE TABLE customer_data (
|
||||
cust_id uuid,
|
||||
"cust_first-name" text,
|
||||
"cust_last-name" text,
|
||||
cust_first-name text,
|
||||
cust_last-name text,
|
||||
cust_phone text,
|
||||
"cust_get-sms" text,
|
||||
PRIMARY KEY (cust_id)
|
||||
cust_get-sms text,
|
||||
PRIMARY KEY (customer_id)
|
||||
) WITH cdc = { 'enabled' : 'true', 'preimage' : 'true' };
|
||||
|
||||
.. _cql-caching-options:
|
||||
|
||||
@@ -24,8 +24,7 @@ For example:
|
||||
|
||||
INSERT INTO NerdMovies (movie, director, main_actor, year)
|
||||
VALUES ('Serenity', 'Joss Whedon', 'Nathan Fillion', 2005)
|
||||
IF NOT EXISTS
|
||||
USING TTL 86400;
|
||||
USING TTL 86400 IF NOT EXISTS;
|
||||
|
||||
The ``INSERT`` statement writes one or more columns for a given row in a table. Note that since a row is identified by
|
||||
its ``PRIMARY KEY``, at least the columns composing it must be specified. The list of columns to insert to must be
|
||||
|
||||
@@ -71,7 +71,7 @@ used. If it is used, the statement will be a no-op if the materialized view alre
|
||||
MV Select Statement
|
||||
...................
|
||||
|
||||
The select statement of a materialized view creation defines which of the base table is included in the view. That
|
||||
The select statement of a materialized view creation defines which of the base table columns are included in the view. That
|
||||
statement is limited in a number of ways:
|
||||
|
||||
- The :ref:`selection <selection-clause>` is limited to those that only select columns of the base table. In other
|
||||
|
||||
@@ -507,7 +507,7 @@ For example::
|
||||
|
||||
CREATE TABLE superheroes (
|
||||
name frozen<full_name> PRIMARY KEY,
|
||||
home frozen<address>
|
||||
home address
|
||||
);
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -4,7 +4,7 @@ Upgrade ScyllaDB
|
||||
|
||||
.. toctree::
|
||||
|
||||
ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2/index>
|
||||
ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
|
||||
ScyllaDB 2026.x Patch Upgrades <upgrade-guide-from-2026.x.y-to-2026.x.z>
|
||||
ScyllaDB Image <ami-upgrade>
|
||||
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
==========================================================
|
||||
Upgrade - ScyllaDB 2025.x to ScyllaDB 2026.1
|
||||
==========================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2026.1>
|
||||
Metrics Update <metric-update-2025.x-to-2026.1>
|
||||
|
||||
* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1>`
|
||||
* :doc:`Metrics Update Between 2025.x and 2026.1 <metric-update-2025.x-to-2026.1>`
|
||||
@@ -0,0 +1,82 @@
|
||||
.. |SRC_VERSION| replace:: 2025.x
|
||||
.. |NEW_VERSION| replace:: 2026.1
|
||||
.. |PRECEDING_VERSION| replace:: 2025.4
|
||||
|
||||
================================================================
|
||||
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
|
||||
================================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
|
||||
|
||||
|
||||
New Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 150
|
||||
:header-rows: 1
|
||||
|
||||
* - Metric
|
||||
- Description
|
||||
* - scylla_alternator_operation_size_kb
|
||||
- Histogram of item sizes involved in a request.
|
||||
* - scylla_column_family_total_disk_space_before_compression
|
||||
- Hypothetical total disk space used if data files weren't compressed.
|
||||
* - scylla_group_name_auto_repair_enabled_nr
|
||||
- Number of tablets with auto repair enabled.
|
||||
* - scylla_group_name_auto_repair_needs_repair_nr
|
||||
- Number of tablets with auto repair enabled that currently need repair.
|
||||
* - scylla_lsa_compact_time_ms
|
||||
- Total time spent on segment compaction that was not accounted under ``reclaim_time_ms``.
|
||||
* - scylla_lsa_evict_time_ms
|
||||
- Total time spent on evicting objects that was not accounted under ``reclaim_time_ms``.
|
||||
* - scylla_lsa_reclaim_time_ms
|
||||
- Total time spent in reclaiming LSA memory back to std allocator.
|
||||
* - scylla_object_storage_memory_usage
|
||||
- Total number of bytes consumed by the object storage client.
|
||||
* - scylla_tablet_ops_failed
|
||||
- Number of failed tablet auto repair attempts.
|
||||
* - scylla_tablet_ops_succeeded
|
||||
- Number of successful tablet auto repair attempts.
|
||||
|
||||
Renamed Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are renamed in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 150
|
||||
:header-rows: 1
|
||||
|
||||
* - Metric Name in |PRECEDING_VERSION|
|
||||
- Metric Name in |NEW_VERSION|
|
||||
* - scylla_s3_memory_usage
|
||||
- scylla_object_storage_memory_usage
|
||||
|
||||
Removed Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are removed in ScyllaDB |NEW_VERSION|.
|
||||
|
||||
* scylla_redis_current_connections
|
||||
* scylla_redis_op_latency
|
||||
* scylla_redis_operation
|
||||
* scylla_redis_operation
|
||||
* scylla_redis_requests_latency
|
||||
* scylla_redis_requests_served
|
||||
* scylla_redis_requests_serving
|
||||
|
||||
New and Updated Metrics in Previous Releases
|
||||
-------------------------------------------------------
|
||||
|
||||
* `Metrics Update Between 2025.3 and 2025.4 <https://docs.scylladb.com/manual/branch-2025.4/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.html>`_
|
||||
* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
|
||||
* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
.. |SCYLLA_NAME| replace:: ScyllaDB
|
||||
|
||||
.. |SRC_VERSION| replace:: 2026.1
|
||||
.. |NEW_VERSION| replace:: 2026.2
|
||||
.. |SRC_VERSION| replace:: 2025.x
|
||||
.. |NEW_VERSION| replace:: 2026.1
|
||||
|
||||
.. |ROLLBACK| replace:: rollback
|
||||
.. _ROLLBACK: ./#rollback-procedure
|
||||
|
||||
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2026.1 to 2026.2
|
||||
.. _SCYLLA_METRICS: ../metric-update-2026.1-to-2026.2
|
||||
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2026.1
|
||||
.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2026.1
|
||||
|
||||
=======================================================================================
|
||||
Upgrade from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|
|
||||
@@ -1,13 +0,0 @@
|
||||
==========================================================
|
||||
Upgrade - ScyllaDB 2026.1 to ScyllaDB 2026.2
|
||||
==========================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
Upgrade ScyllaDB <upgrade-guide-from-2026.1-to-2026.2>
|
||||
Metrics Update <metric-update-2026.1-to-2026.2>
|
||||
|
||||
* :doc:`Upgrade from ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2>`
|
||||
* :doc:`Metrics Update Between 2026.1 and 2026.2 <metric-update-2026.1-to-2026.2>`
|
||||
@@ -1,126 +0,0 @@
|
||||
.. |SRC_VERSION| replace:: 2026.1
|
||||
.. |NEW_VERSION| replace:: 2026.2
|
||||
.. |PRECEDING_VERSION| replace:: 2026.1
|
||||
|
||||
================================================================
|
||||
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
|
||||
================================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
|
||||
|
||||
|
||||
New Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 150
|
||||
:header-rows: 1
|
||||
|
||||
* - Metric
|
||||
- Description
|
||||
* - scylla_auth_cache_permissions
|
||||
- Total number of permission sets currently cached across all roles.
|
||||
* - scylla_auth_cache_roles
|
||||
- Number of roles currently cached.
|
||||
* - scylla_cql_forwarded_requests
|
||||
- Counts the total number of attempts to forward CQL requests to other nodes.
|
||||
One request may be forwarded multiple times, particularly when a write is
|
||||
handled by a non-replica node.
|
||||
* - scylla_cql_write_consistency_levels_disallowed_violations
|
||||
- Counts the number of write_consistency_levels_disallowed guardrail violations,
|
||||
i.e. attempts to write with a forbidden consistency level.
|
||||
* - scylla_cql_write_consistency_levels_warned_violations
|
||||
- Counts the number of write_consistency_levels_warned guardrail violations,
|
||||
i.e. attempts to write with a discouraged consistency level.
|
||||
* - scylla_cql_writes_per_consistency_level
|
||||
- Counts the number of writes for each consistency level.
|
||||
* - scylla_io_queue_integrated_disk_queue_length
|
||||
- Length of the integrated disk queue.
|
||||
* - scylla_io_queue_integrated_queue_length
|
||||
- Length of the integrated queue.
|
||||
* - scylla_logstor_sm_bytes_freed
|
||||
- Counts the number of data bytes freed.
|
||||
* - scylla_logstor_sm_bytes_read
|
||||
- Counts the number of bytes read from the disk.
|
||||
* - scylla_logstor_sm_bytes_written
|
||||
- Counts the number of bytes written to the disk.
|
||||
* - scylla_logstor_sm_compaction_bytes_written
|
||||
- Counts the number of bytes written to the disk by compaction.
|
||||
* - scylla_logstor_sm_compaction_data_bytes_written
|
||||
- Counts the number of data bytes written to the disk by compaction.
|
||||
* - scylla_logstor_sm_compaction_records_rewritten
|
||||
- Counts the number of records rewritten during compaction.
|
||||
* - scylla_logstor_sm_compaction_records_skipped
|
||||
- Counts the number of records skipped during compaction.
|
||||
* - scylla_logstor_sm_compaction_segments_freed
|
||||
- Counts the number of segments freed by compaction.
|
||||
* - scylla_logstor_sm_disk_usage
|
||||
- Total disk usage.
|
||||
* - scylla_logstor_sm_free_segments
|
||||
- Counts the number of free segments currently available.
|
||||
* - scylla_logstor_sm_segment_pool_compaction_segments_get
|
||||
- Counts the number of segments taken from the segment pool for compaction.
|
||||
* - scylla_logstor_sm_segment_pool_normal_segments_get
|
||||
- Counts the number of segments taken from the segment pool for normal writes.
|
||||
* - scylla_logstor_sm_segment_pool_normal_segments_wait
|
||||
- Counts the number of times normal writes had to wait for a segment to become
|
||||
available in the segment pool.
|
||||
* - scylla_logstor_sm_segment_pool_segments_put
|
||||
- Counts the number of segments returned to the segment pool.
|
||||
* - scylla_logstor_sm_segment_pool_separator_segments_get
|
||||
- Counts the number of segments taken from the segment pool for separator writes.
|
||||
* - scylla_logstor_sm_segment_pool_size
|
||||
- Counts the number of segments in the segment pool.
|
||||
* - scylla_logstor_sm_segments_allocated
|
||||
- Counts the number of segments allocated.
|
||||
* - scylla_logstor_sm_segments_compacted
|
||||
- Counts the number of segments compacted.
|
||||
* - scylla_logstor_sm_segments_freed
|
||||
- Counts the number of segments freed.
|
||||
* - scylla_logstor_sm_segments_in_use
|
||||
- Counts the number of segments currently in use.
|
||||
* - scylla_logstor_sm_separator_buffer_flushed
|
||||
- Counts the number of times the separator buffer has been flushed.
|
||||
* - scylla_logstor_sm_separator_bytes_written
|
||||
- Counts the number of bytes written to the separator.
|
||||
* - scylla_logstor_sm_separator_data_bytes_written
|
||||
- Counts the number of data bytes written to the separator.
|
||||
* - scylla_logstor_sm_separator_flow_control_delay
|
||||
- Current delay applied to writes to control separator debt in microseconds.
|
||||
* - scylla_logstor_sm_separator_segments_freed
|
||||
- Counts the number of segments freed by the separator.
|
||||
* - scylla_transport_cql_pending_response_memory
|
||||
- Holds the total memory in bytes consumed by responses waiting to be sent.
|
||||
* - scylla_transport_cql_request_histogram_bytes
|
||||
- A histogram of received bytes in CQL messages of a specific kind and
|
||||
specific scheduling group.
|
||||
* - scylla_transport_cql_requests_serving
|
||||
- Holds the number of requests that are being processed right now.
|
||||
* - scylla_transport_cql_response_histogram_bytes
|
||||
- A histogram of sent bytes in CQL messages of a specific kind and
|
||||
specific scheduling group.
|
||||
* - scylla_transport_requests_forwarded_failed
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
but failed to execute there.
|
||||
* - scylla_transport_requests_forwarded_prepared_not_found
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
but failed there because the statement was not prepared on the target.
|
||||
When this happens, the coordinator performs an additional remote call
|
||||
to prepare the statement on the replica and retries the EXECUTE request
|
||||
afterwards.
|
||||
* - scylla_transport_requests_forwarded_redirected
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
but that replica responded with a redirect to another node. This can
|
||||
happen when the replica has stale information about the cluster topology or
|
||||
when the request is handled by a node that is not a replica for the data
|
||||
being accessed by the request.
|
||||
* - scylla_transport_requests_forwarded_successfully
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
and executed successfully there.
|
||||
|
||||
@@ -598,7 +598,7 @@ future<int> kmip_host::impl::do_cmd(KMIP_CMD* cmd, con_ptr cp, Func& f, bool ret
|
||||
|
||||
template<typename Func>
|
||||
future<kmip_host::impl::kmip_cmd> kmip_host::impl::do_cmd(kmip_cmd cmd_in, Func && f) {
|
||||
kmip_log.trace("{}: begin do_cmd {}", *this, cmd_in);
|
||||
kmip_log.trace("{}: begin do_cmd", *this, cmd_in);
|
||||
KMIP_CMD* cmd = cmd_in;
|
||||
|
||||
// #998 Need to do retry loop, because we can have either timed out connection,
|
||||
|
||||
@@ -616,7 +616,7 @@ future<rjson::value> encryption::kms_host::impl::do_post(std::string_view target
|
||||
static auto get_xml_node = [](node_type* node, const char* what) {
|
||||
auto res = node->first_node(what);
|
||||
if (!res) {
|
||||
throw malformed_response_error(fmt::format("XML parse error: {}", what));
|
||||
throw malformed_response_error(fmt::format("XML parse error", what));
|
||||
}
|
||||
return res;
|
||||
};
|
||||
|
||||
@@ -109,7 +109,6 @@ std::set<std::string_view> feature_service::supported_feature_set() const {
|
||||
"UUID_SSTABLE_IDENTIFIERS"sv,
|
||||
"GROUP0_SCHEMA_VERSIONING"sv,
|
||||
"VIEW_BUILD_STATUS_ON_GROUP0"sv,
|
||||
"CDC_GENERATIONS_V2"sv,
|
||||
};
|
||||
|
||||
if (is_test_only_feature_deprecated()) {
|
||||
|
||||
@@ -83,6 +83,7 @@ public:
|
||||
gms::feature alternator_ttl { *this, "ALTERNATOR_TTL"sv };
|
||||
gms::feature cql_row_ttl { *this, "CQL_ROW_TTL"sv };
|
||||
gms::feature range_scan_data_variant { *this, "RANGE_SCAN_DATA_VARIANT"sv };
|
||||
gms::feature cdc_generations_v2 { *this, "CDC_GENERATIONS_V2"sv };
|
||||
gms::feature user_defined_aggregates { *this, "UDA"sv };
|
||||
// Historically max_result_size contained only two fields: soft_limit and
|
||||
// hard_limit. It was somehow obscure because for normal paged queries both
|
||||
|
||||
@@ -399,10 +399,9 @@ future<> gossiper::do_send_ack2_msg(locator::host_id from, utils::chunked_vector
|
||||
}
|
||||
}
|
||||
gms::gossip_digest_ack2 ack2_msg(std::move(delta_ep_state_map));
|
||||
auto ack2_msg_str = fmt::format("{}", ack2_msg);
|
||||
logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
|
||||
logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
|
||||
co_await ser::gossip_rpc_verbs::send_gossip_digest_ack2(&_messaging, from, std::move(ack2_msg));
|
||||
logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
|
||||
logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
|
||||
}
|
||||
|
||||
// Depends on
|
||||
@@ -965,7 +964,8 @@ future<> gossiper::failure_detector_loop_for_node(locator::host_id host_id, gene
|
||||
diff = now - last;
|
||||
if (!failed) {
|
||||
last = now;
|
||||
} else if (diff > max_duration) {
|
||||
}
|
||||
if (diff > max_duration) {
|
||||
logger.info("failure_detector_loop: Mark node {}/{} as DOWN", host_id, node);
|
||||
co_await container().invoke_on(0, [host_id] (gms::gossiper& g) {
|
||||
return g.convict(host_id);
|
||||
|
||||
3
init.cc
3
init.cc
@@ -87,6 +87,9 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
|
||||
}
|
||||
}
|
||||
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::ALTERNATOR_STREAMS)) {
|
||||
disabled.insert("ALTERNATOR_STREAMS"s);
|
||||
}
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::KEYSPACE_STORAGE_OPTIONS)) {
|
||||
disabled.insert("KEYSPACE_STORAGE_OPTIONS"s);
|
||||
}
|
||||
|
||||
61
main.cc
61
main.cc
@@ -1358,7 +1358,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
};
|
||||
spcfg.hinted_handoff_enabled = hinted_handoff_enabled;
|
||||
spcfg.available_memory = memory::stats().total_memory();
|
||||
spcfg.maintenance_mode = maintenance_mode_enabled{cfg->maintenance_mode()};
|
||||
smp_service_group_config storage_proxy_smp_service_group_config;
|
||||
// Assuming less than 1kB per queued request, this limits storage_proxy submit_to() queues to 5MB or less
|
||||
storage_proxy_smp_service_group_config.max_nonlocal_requests = 5000;
|
||||
@@ -1367,7 +1366,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
spcfg.write_mv_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
|
||||
spcfg.hints_write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
|
||||
spcfg.write_ack_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
|
||||
static db::view::node_update_backlog node_backlog(smp::count, 10ms, cfg->view_flow_control_delay_limit_in_ms);
|
||||
static db::view::node_update_backlog node_backlog(smp::count, 10ms);
|
||||
scheduling_group_key_config storage_proxy_stats_cfg =
|
||||
make_scheduling_group_key_config<service::storage_proxy_stats::stats>();
|
||||
storage_proxy_stats_cfg.constructor = [plain_constructor = storage_proxy_stats_cfg.constructor] (void* ptr) {
|
||||
@@ -1811,18 +1810,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
utils::get_local_injector().inject("stop_after_starting_migration_manager",
|
||||
[] { std::raise(SIGSTOP); });
|
||||
|
||||
// Audit must be constructed before the maintenance socket so
|
||||
// that on shutdown (reverse destruction order) the audit service
|
||||
// outlives the maintenance socket and in-flight queries can
|
||||
// still reach audit::inspect() safely.
|
||||
checkpoint(stop_signal, "starting audit service");
|
||||
audit::audit::start_audit(*cfg, token_metadata, qp, mm).handle_exception([&] (auto&& e) {
|
||||
startlog.error("audit start failed: {}", e);
|
||||
}).get();
|
||||
auto audit_stop = defer([] {
|
||||
audit::audit::stop_audit().get();
|
||||
});
|
||||
|
||||
// XXX: stop_raft has to happen before query_processor and migration_manager
|
||||
// is stopped, since some groups keep using the query
|
||||
// processor until are stopped inside stop_raft.
|
||||
@@ -1854,7 +1841,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "starting view update generator");
|
||||
view_update_generator.start(std::ref(db), std::ref(proxy), std::ref(node_backlog), std::ref(stop_signal.as_sharded_abort_source())).get();
|
||||
view_update_generator.start(std::ref(db), std::ref(proxy), std::ref(stop_signal.as_sharded_abort_source())).get();
|
||||
auto stop_view_update_generator = defer_verbose_shutdown("view update generator", [] {
|
||||
view_update_generator.stop().get();
|
||||
});
|
||||
@@ -2300,12 +2287,10 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
ss.local().wait_for_group0_stop().get();
|
||||
});
|
||||
|
||||
if (!group0_service.maintenance_mode() && sys_ks.local().bootstrap_complete()) {
|
||||
// Setup group0 early in case the node is bootstrapped already and the group exists.
|
||||
// Need to do it before allowing incoming messaging service connections since
|
||||
// storage proxy's and migration manager's verbs may access group0.
|
||||
group0_service.setup_group0_if_exist(sys_ks.local(), ss.local(), qp.local(), mm.local()).get();
|
||||
}
|
||||
// Setup group0 early in case the node is bootstrapped already and the group exists.
|
||||
// Need to do it before allowing incoming messaging service connections since
|
||||
// storage proxy's and migration manager's verbs may access group0.
|
||||
group0_service.setup_group0_if_exist(sys_ks.local(), ss.local(), qp.local(), mm.local()).get();
|
||||
|
||||
// The call to setup_group0_if_exists() above guarantees that, if group0 is
|
||||
// created and started, the locally persisted group0 state has been applied
|
||||
@@ -2355,22 +2340,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
}).get();
|
||||
stop_signal.ready(false);
|
||||
|
||||
// At this point, `locator::topology` should be stable, i.e. we should have complete information
|
||||
// about the layout of the cluster (= list of nodes along with the racks/DCs).
|
||||
startlog.info("Verifying that all of the keyspaces are RF-rack-valid");
|
||||
db.local().check_rf_rack_validity(token_metadata.local().get());
|
||||
|
||||
startlog.info("Verifying that all of the tablet keyspaces use rack list replication factors");
|
||||
db.local().check_rack_list_everywhere(cfg->enforce_rack_list());
|
||||
|
||||
// The table-based audit backend needs Raft (via join_cluster)
|
||||
// to create its keyspace and table.
|
||||
checkpoint(stop_signal, "starting audit storage");
|
||||
audit::audit::start_storage(*cfg).get();
|
||||
auto audit_storage_stop = defer([] {
|
||||
audit::audit::stop_storage().get();
|
||||
});
|
||||
|
||||
if (cfg->maintenance_socket() != "ignore") {
|
||||
// Enable role operations now that node joined the cluster
|
||||
maintenance_auth_service.invoke_on_all([](auth::service& svc) {
|
||||
@@ -2380,6 +2349,24 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
start_cql(*cql_maintenance_server_ctl, stop_maintenance_cql, "maintenance native server");
|
||||
}
|
||||
|
||||
// At this point, `locator::topology` should be stable, i.e. we should have complete information
|
||||
// about the layout of the cluster (= list of nodes along with the racks/DCs).
|
||||
startlog.info("Verifying that all of the keyspaces are RF-rack-valid");
|
||||
db.local().check_rf_rack_validity(token_metadata.local().get());
|
||||
|
||||
startlog.info("Verifying that all of the tablet keyspaces use rack list replication factors");
|
||||
db.local().check_rack_list_everywhere(cfg->enforce_rack_list());
|
||||
|
||||
// Start audit service after join_cluster so that the table-based audit backend
|
||||
// can properly create its keyspace and table.
|
||||
checkpoint(stop_signal, "starting audit service");
|
||||
audit::audit::start_audit(*cfg, token_metadata, qp, mm).handle_exception([&] (auto&& e) {
|
||||
startlog.error("audit start failed: {}", e);
|
||||
}).get();
|
||||
auto audit_stop = defer([] {
|
||||
audit::audit::stop_audit().get();
|
||||
});
|
||||
|
||||
// Semantic validation of sstable compression parameters from config.
|
||||
// Adding here (i.e., after `join_cluster`) to ensure that the
|
||||
// required SSTABLE_COMPRESSION_DICTS cluster feature has been negotiated.
|
||||
|
||||
@@ -48,8 +48,8 @@ static void set_field(atomic_cell_value& out, unsigned offset, T val) {
|
||||
}
|
||||
|
||||
template <FragmentRange Buffer>
|
||||
static void set_value(atomic_cell_value_mutable_view b, unsigned value_offset, const Buffer& value) {
|
||||
auto v = b.substr(value_offset, value.size_bytes());
|
||||
static void set_value(managed_bytes& b, unsigned value_offset, const Buffer& value) {
|
||||
auto v = managed_bytes_mutable_view(b).substr(value_offset, value.size_bytes());
|
||||
for (auto frag : value) {
|
||||
write_fragmented(v, single_fragmented_view(frag));
|
||||
}
|
||||
@@ -141,36 +141,20 @@ public:
|
||||
SCYLLA_ASSERT(is_live_and_has_ttl(cell));
|
||||
return gc_clock::duration(get_field<int32_t>(cell, ttl_offset));
|
||||
}
|
||||
static size_t dead_serialized_size() {
|
||||
return flags_size + timestamp_size + deletion_time_size;
|
||||
}
|
||||
static size_t live_serialized_size(size_t value_size) {
|
||||
return flags_size + timestamp_size + value_size;
|
||||
}
|
||||
static size_t live_expiring_serialized_size(size_t value_size) {
|
||||
return flags_size + timestamp_size + expiry_size + ttl_size + value_size;
|
||||
}
|
||||
static void write_dead(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
|
||||
static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
|
||||
managed_bytes b(managed_bytes::initialized_later(), flags_size + timestamp_size + deletion_time_size);
|
||||
b[0] = 0;
|
||||
set_field(b, timestamp_offset, timestamp);
|
||||
set_field(b, deletion_time_offset, static_cast<int64_t>(deletion_time.time_since_epoch().count()));
|
||||
}
|
||||
static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
|
||||
managed_bytes b(managed_bytes::initialized_later(), dead_serialized_size());
|
||||
write_dead(b, timestamp, deletion_time);
|
||||
return b;
|
||||
}
|
||||
template <FragmentRange Buffer>
|
||||
static void write_live(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, const Buffer& value) {
|
||||
static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value) {
|
||||
auto value_offset = flags_size + timestamp_size;
|
||||
managed_bytes b(managed_bytes::initialized_later(), value_offset + value.size_bytes());
|
||||
b[0] = LIVE_FLAG;
|
||||
set_field(b, timestamp_offset, timestamp);
|
||||
set_value(b, value_offset, value);
|
||||
}
|
||||
template <FragmentRange Buffer>
|
||||
static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value) {
|
||||
managed_bytes b(managed_bytes::initialized_later(), live_serialized_size(value.size_bytes()));
|
||||
write_live(b, timestamp, value);
|
||||
return b;
|
||||
}
|
||||
static managed_bytes make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
|
||||
@@ -182,18 +166,14 @@ public:
|
||||
return b;
|
||||
}
|
||||
template <FragmentRange Buffer>
|
||||
static void write_live(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
|
||||
static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
|
||||
auto value_offset = flags_size + timestamp_size + expiry_size + ttl_size;
|
||||
managed_bytes b(managed_bytes::initialized_later(), value_offset + value.size_bytes());
|
||||
b[0] = EXPIRY_FLAG | LIVE_FLAG;
|
||||
set_field(b, timestamp_offset, timestamp);
|
||||
set_field(b, expiry_offset, static_cast<int64_t>(expiry.time_since_epoch().count()));
|
||||
set_field(b, ttl_offset, static_cast<int32_t>(ttl.count()));
|
||||
set_value(b, value_offset, value);
|
||||
}
|
||||
template <FragmentRange Buffer>
|
||||
static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
|
||||
managed_bytes b(managed_bytes::initialized_later(), live_expiring_serialized_size(value.size_bytes()));
|
||||
write_live(b, timestamp, value, expiry, ttl);
|
||||
return b;
|
||||
}
|
||||
static managed_bytes make_live_uninitialized(api::timestamp_type timestamp, size_t size) {
|
||||
|
||||
@@ -113,10 +113,10 @@ auto fmt::formatter<canonical_mutation>::format(const canonical_mutation& cm, fm
|
||||
auto&& entry = _cm.static_column_at(id);
|
||||
_os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
|
||||
}
|
||||
virtual void accept_static_cell(column_id id, collection_mutation cm) override {
|
||||
virtual void accept_static_cell(column_id id, collection_mutation_view cmv) override {
|
||||
print_separator();
|
||||
auto&& entry = _cm.static_column_at(id);
|
||||
_os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cm));
|
||||
_os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
|
||||
}
|
||||
virtual stop_iteration accept_row_tombstone(range_tombstone rt) override {
|
||||
print_separator();
|
||||
@@ -137,10 +137,10 @@ auto fmt::formatter<canonical_mutation>::format(const canonical_mutation& cm, fm
|
||||
auto&& entry = _cm.regular_column_at(id);
|
||||
_os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
|
||||
}
|
||||
virtual void accept_row_cell(column_id id, collection_mutation cm) override {
|
||||
virtual void accept_row_cell(column_id id, collection_mutation_view cmv) override {
|
||||
print_separator();
|
||||
auto&& entry = _cm.regular_column_at(id);
|
||||
_os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cm));
|
||||
_os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
|
||||
}
|
||||
out_t finalize() {
|
||||
if (_in_row) {
|
||||
|
||||
@@ -7,14 +7,12 @@
|
||||
*/
|
||||
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/on_internal_error.hh"
|
||||
#include "types/collection.hh"
|
||||
#include "types/user.hh"
|
||||
#include "types/concrete_types.hh"
|
||||
#include "mutation/mutation_partition.hh"
|
||||
#include "compaction/compaction_garbage_collector.hh"
|
||||
#include "combine.hh"
|
||||
#include "idl/mutation.dist.impl.hh"
|
||||
|
||||
#include "collection_mutation.hh"
|
||||
|
||||
@@ -226,26 +224,13 @@ compact_and_expire_result collection_mutation_description::compact_and_expire(co
|
||||
return res;
|
||||
}
|
||||
|
||||
/// A CollectionMutationAdaptor is a static interface that adapts a collection
|
||||
/// element (an iterator value type) to the serialization requirements of
|
||||
/// serialize_collection_mutation(). It provides static methods to measure the
|
||||
/// serialized sizes and to write the key and value of each element into a buffer.
|
||||
template <typename Adaptor, typename Element>
|
||||
concept CollectionMutationAdaptor = requires(const Element& e, managed_bytes_mutable_view& out) {
|
||||
{ Adaptor::key_size(e) } -> std::convertible_to<size_t>;
|
||||
{ Adaptor::value_size(e) } -> std::convertible_to<size_t>;
|
||||
{ Adaptor::write_key(e, out) };
|
||||
{ Adaptor::write_value(e, out) };
|
||||
};
|
||||
|
||||
template <typename Adaptor, typename Iterator>
|
||||
requires CollectionMutationAdaptor<Adaptor, std::iter_value_t<Iterator>>
|
||||
template <typename Iterator>
|
||||
static collection_mutation serialize_collection_mutation(
|
||||
const abstract_type& type,
|
||||
const tombstone& tomb,
|
||||
std::ranges::subrange<Iterator> cells) {
|
||||
auto element_size = [] (size_t c, auto&& e) -> size_t {
|
||||
return c + 8 + Adaptor::key_size(e) + Adaptor::value_size(e);
|
||||
return c + 8 + e.first.size() + e.second.serialize().size();
|
||||
};
|
||||
auto size = std::ranges::fold_left(cells, (size_t)4, element_size);
|
||||
size += 1;
|
||||
@@ -259,112 +244,32 @@ static collection_mutation serialize_collection_mutation(
|
||||
write<int64_t>(out, tomb.timestamp);
|
||||
write<int64_t>(out, tomb.deletion_time.time_since_epoch().count());
|
||||
}
|
||||
auto writek = [&out] (auto& kv) {
|
||||
write<int32_t>(out, Adaptor::key_size(kv));
|
||||
Adaptor::write_key(kv, out);
|
||||
auto writek = [&out] (bytes_view v) {
|
||||
write<int32_t>(out, v.size());
|
||||
write_fragmented(out, single_fragmented_view(v));
|
||||
};
|
||||
auto writev = [&out] (auto& kv) {
|
||||
write<int32_t>(out, Adaptor::value_size(kv));
|
||||
Adaptor::write_value(kv, out);
|
||||
auto writev = [&out] (managed_bytes_view v) {
|
||||
write<int32_t>(out, v.size());
|
||||
write_fragmented(out, v);
|
||||
};
|
||||
// FIXME: overflow?
|
||||
write<int32_t>(out, std::ranges::distance(cells));
|
||||
for (auto&& kv : cells) {
|
||||
writek(kv);
|
||||
writev(kv);
|
||||
auto&& k = kv.first;
|
||||
auto&& v = kv.second;
|
||||
writek(k);
|
||||
|
||||
writev(v.serialize());
|
||||
}
|
||||
return collection_mutation(type, std::move(ret));
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
/// A key-value pair where the key is bytes-like and the value is an atomic_cell-like type
|
||||
/// with a serialize() method returning managed_bytes_view.
|
||||
template <typename T>
|
||||
concept AtomicCellKV = requires(const T& kv) {
|
||||
{ kv.first.size() } -> std::convertible_to<size_t>;
|
||||
{ kv.second.serialize() } -> std::convertible_to<managed_bytes_view>;
|
||||
};
|
||||
|
||||
struct atomic_cell_adaptor {
|
||||
static size_t key_size(const AtomicCellKV auto& v) { return v.first.size(); }
|
||||
static size_t value_size(const AtomicCellKV auto& v) { return v.second.serialize().size(); }
|
||||
|
||||
static void write_key(const AtomicCellKV auto& v, managed_bytes_mutable_view& out) {
|
||||
write_fragmented(out, single_fragmented_view(v.first));
|
||||
}
|
||||
static void write_value(const AtomicCellKV auto& v, managed_bytes_mutable_view& out) {
|
||||
write_fragmented(out, v.second.serialize());
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
collection_mutation collection_mutation_description::serialize(const abstract_type& type) const {
|
||||
return serialize_collection_mutation<atomic_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
|
||||
return serialize_collection_mutation(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
|
||||
}
|
||||
|
||||
collection_mutation collection_mutation_view_description::serialize(const abstract_type& type) const {
|
||||
return serialize_collection_mutation<atomic_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
struct serialized_cell_adaptor {
|
||||
static size_t key_size(const ser::collection_element_view& v) {
|
||||
return v.key().view().size_bytes();
|
||||
}
|
||||
|
||||
static size_t value_size(const ser::collection_element_view& v) {
|
||||
struct collection_cell_visitor {
|
||||
size_t operator()(const ser::live_cell_view& lcv) const { return atomic_cell_type::live_serialized_size(lcv.value().view().size_bytes()); }
|
||||
size_t operator()(const ser::expiring_cell_view& ecv) const { return atomic_cell_type::live_expiring_serialized_size(ecv.c().value().view().size_bytes()); }
|
||||
size_t operator()(const ser::dead_cell_view& dcv) const { return atomic_cell_type::dead_serialized_size(); }
|
||||
size_t operator()(const ser::counter_cell_view& ccv) const { utils::on_internal_error("Trying to deserialize counter cell from collection"); }
|
||||
size_t operator()(const ser::unknown_variant_type&) const { utils::on_internal_error("Trying to deserialize cell in unknown state"); };
|
||||
};
|
||||
return boost::apply_visitor(collection_cell_visitor{}, v.value());
|
||||
}
|
||||
|
||||
static void write_key(const ser::collection_element_view& v, managed_bytes_mutable_view& out) {
|
||||
write_fragmented(out, v.key().view());
|
||||
}
|
||||
|
||||
static void write_value(const ser::collection_element_view& v, managed_bytes_mutable_view& out) {
|
||||
struct collection_cell_visitor {
|
||||
managed_bytes_mutable_view& out;
|
||||
|
||||
void operator()(const ser::live_cell_view& lcv) const {
|
||||
const auto v = lcv.value().view();
|
||||
atomic_cell_type::write_live(out, lcv.created_at(), v);
|
||||
out.remove_prefix(atomic_cell_type::live_serialized_size(v.size_bytes()));
|
||||
}
|
||||
void operator()(const ser::expiring_cell_view& ecv) const {
|
||||
const auto v = ecv.c().value().view();
|
||||
atomic_cell_type::write_live(out, ecv.c().created_at(), v, ecv.expiry(), ecv.ttl());
|
||||
out.remove_prefix(atomic_cell_type::live_expiring_serialized_size(v.size_bytes()));
|
||||
}
|
||||
void operator()(const ser::dead_cell_view& dcv) const {
|
||||
atomic_cell_type::write_dead(out, dcv.tomb().timestamp(), dcv.tomb().deletion_time());
|
||||
out.remove_prefix(atomic_cell_type::dead_serialized_size());
|
||||
}
|
||||
void operator()(const ser::counter_cell_view& ccv) const {
|
||||
utils::on_internal_error("Trying to deserialize counter cell from collection");
|
||||
}
|
||||
void operator()(const ser::unknown_variant_type&) const {
|
||||
utils::on_internal_error("Trying to deserialize cell in unknown state");
|
||||
}
|
||||
};
|
||||
boost::apply_visitor(collection_cell_visitor{out}, v.value());
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
collection_mutation read_from_collection_cell_view(const abstract_type& type, const ser::collection_cell_view& collection) {
|
||||
auto tomb = collection.tomb();
|
||||
auto cells = collection.elements();
|
||||
return serialize_collection_mutation<serialized_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
|
||||
return serialize_collection_mutation(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
|
||||
}
|
||||
|
||||
template <typename C>
|
||||
|
||||
@@ -23,10 +23,6 @@ class row_tombstone;
|
||||
|
||||
class collection_mutation;
|
||||
|
||||
namespace ser {
|
||||
class collection_cell_view;
|
||||
}
|
||||
|
||||
// An auxiliary struct used to (de)construct collection_mutations.
|
||||
// Unlike collection_mutation which is a serialized blob, this struct allows to inspect logical units of information
|
||||
// (tombstone and cells) inside the mutation easily.
|
||||
@@ -134,12 +130,6 @@ collection_mutation merge(const abstract_type&, collection_mutation_view, collec
|
||||
|
||||
collection_mutation difference(const abstract_type&, collection_mutation_view, collection_mutation_view);
|
||||
|
||||
// Transcode a collection from the IDL representation directly into the
|
||||
// collection_mutation serialization format, without using any intermediary representation.
|
||||
// Only the final collection-mutation blob is allocated, no intermediate allocations needed.
|
||||
// Safe to use in LSA, it won't produce garbage.
|
||||
collection_mutation read_from_collection_cell_view(const abstract_type&, const ser::collection_cell_view&);
|
||||
|
||||
// Serializes the given collection of cells to a sequence of bytes ready to be sent over the CQL protocol.
|
||||
bytes_ostream serialize_for_cql(const abstract_type&, collection_mutation_view);
|
||||
|
||||
|
||||
@@ -97,9 +97,9 @@ public:
|
||||
r.append_cell(id, atomic_cell_or_collection(std::move(cell)));
|
||||
}
|
||||
|
||||
virtual void accept_static_cell(column_id id, collection_mutation collection) override {
|
||||
virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
|
||||
row& r = _static_row.maybe_create();
|
||||
r.append_cell(id, std::move(collection));
|
||||
r.append_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
|
||||
}
|
||||
|
||||
virtual stop_iteration accept_row_tombstone(range_tombstone rt) override {
|
||||
@@ -125,9 +125,9 @@ public:
|
||||
r.append_cell(id, std::move(cell));
|
||||
}
|
||||
|
||||
virtual void accept_row_cell(column_id id, collection_mutation collection) override {
|
||||
virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {
|
||||
row& r = _current_row->cells();
|
||||
r.append_cell(id, std::move(collection));
|
||||
r.append_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
|
||||
}
|
||||
|
||||
auto on_end_of_partition() {
|
||||
|
||||
@@ -707,10 +707,9 @@ struct fmt::formatter<shadowable_tombstone> : fmt::formatter<string_view> {
|
||||
template <typename FormatContext>
|
||||
auto format(const shadowable_tombstone& t, FormatContext& ctx) const {
|
||||
if (t) {
|
||||
auto& tomb = t.tomb();
|
||||
return fmt::format_to(ctx.out(),
|
||||
"{{shadowable tombstone: timestamp={}, deletion_time={}}}",
|
||||
tomb.timestamp, tomb.deletion_time.time_since_epoch().count());
|
||||
t.tomb().timestamp, t.tomb(), t.tomb().deletion_time.time_since_epoch().count());
|
||||
} else {
|
||||
return fmt::format_to(ctx.out(),
|
||||
"{{shadowable tombstone: none}}");
|
||||
|
||||
@@ -86,6 +86,37 @@ atomic_cell read_atomic_cell(const abstract_type& type, atomic_cell_variant cv,
|
||||
return boost::apply_visitor(atomic_cell_visitor(type, cm), cv);
|
||||
}
|
||||
|
||||
collection_mutation read_collection_cell(const abstract_type& type, ser::collection_cell_view cv)
|
||||
{
|
||||
collection_mutation_description mut;
|
||||
mut.tomb = cv.tomb();
|
||||
auto&& elements = cv.elements();
|
||||
mut.cells.reserve(elements.size());
|
||||
|
||||
visit(type, make_visitor(
|
||||
[&] (const collection_type_impl& ctype) {
|
||||
auto& value_type = *ctype.value_comparator();
|
||||
for (auto&& e : elements) {
|
||||
mut.cells.emplace_back(e.key(), read_atomic_cell(value_type, e.value(), atomic_cell::collection_member::yes));
|
||||
}
|
||||
},
|
||||
[&] (const user_type_impl& utype) {
|
||||
for (auto&& e : elements) {
|
||||
bytes key = e.key();
|
||||
auto idx = deserialize_field_index(key);
|
||||
SCYLLA_ASSERT(idx < utype.size());
|
||||
|
||||
mut.cells.emplace_back(key, read_atomic_cell(*utype.type(idx), e.value(), atomic_cell::collection_member::yes));
|
||||
}
|
||||
},
|
||||
[&] (const abstract_type& o) {
|
||||
throw std::runtime_error(format("attempted to read a collection cell with type: {}", o.name()));
|
||||
}
|
||||
));
|
||||
|
||||
return mut.serialize(type);
|
||||
}
|
||||
|
||||
template<typename Visitor>
|
||||
void read_and_visit_row(ser::row_view rv, const column_mapping& cm, column_kind kind, Visitor&& visitor)
|
||||
{
|
||||
@@ -111,7 +142,14 @@ void read_and_visit_row(ser::row_view rv, const column_mapping& cm, column_kind
|
||||
if (_col.is_atomic()) {
|
||||
throw std::runtime_error("An atomic cell expected, got a collection");
|
||||
}
|
||||
_visitor.accept_collection(_id, read_from_collection_cell_view(*_col.type(), ccv));
|
||||
// FIXME: Pass view to cell to avoid copy
|
||||
auto&& outer = current_allocator();
|
||||
with_allocator(standard_allocator(), [&] {
|
||||
auto cell = read_collection_cell(*_col.type(), ccv);
|
||||
with_allocator(outer, [&] {
|
||||
_visitor.accept_collection(_id, cell);
|
||||
});
|
||||
});
|
||||
}
|
||||
void operator()(ser::unknown_variant_type&) const {
|
||||
throw std::runtime_error("Trying to deserialize unknown cell type");
|
||||
@@ -160,8 +198,8 @@ void mutation_partition_view::do_accept(const column_mapping& cm, Visitor& visit
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) const {
|
||||
_visitor.accept_static_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, collection_mutation cm) const {
|
||||
_visitor.accept_static_cell(id, std::move(cm));
|
||||
void accept_collection(column_id id, const collection_mutation& cm) const {
|
||||
_visitor.accept_static_cell(id, cm);
|
||||
}
|
||||
};
|
||||
read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
|
||||
@@ -180,8 +218,8 @@ void mutation_partition_view::do_accept(const column_mapping& cm, Visitor& visit
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) const {
|
||||
_visitor.accept_row_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, collection_mutation cm) const {
|
||||
_visitor.accept_row_cell(id, std::move(cm));
|
||||
void accept_collection(column_id id, const collection_mutation& cm) const {
|
||||
_visitor.accept_row_cell(id, cm);
|
||||
}
|
||||
};
|
||||
read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
|
||||
@@ -202,8 +240,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Vis
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) const {
|
||||
_visitor.accept_static_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, collection_mutation cm) const {
|
||||
_visitor.accept_static_cell(id, std::move(cm));
|
||||
void accept_collection(column_id id, const collection_mutation& cm) const {
|
||||
_visitor.accept_static_cell(id, cm);
|
||||
}
|
||||
};
|
||||
read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
|
||||
@@ -225,8 +263,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Vis
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) const {
|
||||
_visitor.accept_row_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, collection_mutation cm) const {
|
||||
_visitor.accept_row_cell(id, std::move(cm));
|
||||
void accept_collection(column_id id, const collection_mutation& cm) const {
|
||||
_visitor.accept_row_cell(id, cm);
|
||||
}
|
||||
};
|
||||
read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
|
||||
@@ -248,8 +286,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Asy
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) const {
|
||||
_visitor.accept_static_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, collection_mutation cm) const {
|
||||
_visitor.accept_static_cell(id, std::move(cm));
|
||||
void accept_collection(column_id id, const collection_mutation& cm) const {
|
||||
_visitor.accept_static_cell(id, cm);
|
||||
}
|
||||
};
|
||||
read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
|
||||
@@ -270,8 +308,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Asy
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) const {
|
||||
_visitor.accept_row_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, collection_mutation cm) const {
|
||||
_visitor.accept_row_cell(id, std::move(cm));
|
||||
void accept_collection(column_id id, const collection_mutation& cm) const {
|
||||
_visitor.accept_row_cell(id, cm);
|
||||
}
|
||||
};
|
||||
read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
|
||||
@@ -299,8 +337,8 @@ mutation_partition_view::accept_ordered_result mutation_partition_view::do_accep
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) const {
|
||||
_visitor.accept_static_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, collection_mutation cm) const {
|
||||
_visitor.accept_static_cell(id, std::move(cm));
|
||||
void accept_collection(column_id id, const collection_mutation& cm) const {
|
||||
_visitor.accept_static_cell(id, cm);
|
||||
}
|
||||
};
|
||||
read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
|
||||
@@ -338,8 +376,8 @@ mutation_partition_view::accept_ordered_result mutation_partition_view::do_accep
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) const {
|
||||
_visitor.accept_row_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, collection_mutation cm) const {
|
||||
_visitor.accept_row_cell(id, std::move(cm));
|
||||
void accept_collection(column_id id, const collection_mutation& cm) const {
|
||||
_visitor.accept_row_cell(id, cm);
|
||||
}
|
||||
};
|
||||
read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
|
||||
@@ -463,40 +501,44 @@ mutation_partition_view mutation_partition_view::from_view(ser::mutation_partiti
|
||||
|
||||
clustering_row read_clustered_row(const schema& s, ser::clustering_row_view crv) {
|
||||
class clustering_row_builder {
|
||||
const schema& _s;
|
||||
clustering_row _row;
|
||||
public:
|
||||
clustering_row_builder(clustering_key key, row_tombstone t, row_marker m)
|
||||
: _row(std::move(key), std::move(t), std::move(m), row()) { }
|
||||
clustering_row_builder(const schema& s, clustering_key key, row_tombstone t, row_marker m)
|
||||
: _s(s), _row(std::move(key), std::move(t), std::move(m), row()) { }
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) {
|
||||
_row.cells().append_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, collection_mutation cm) {
|
||||
_row.cells().append_cell(id, std::move(cm));
|
||||
void accept_collection(column_id id, const collection_mutation& cm) {
|
||||
_row.cells().append_cell(id, collection_mutation(*_s.regular_column_at(id).type, cm));
|
||||
}
|
||||
clustering_row get() && { return std::move(_row); }
|
||||
};
|
||||
|
||||
auto cr = crv.row();
|
||||
auto t = row_tombstone(cr.deleted_at(), shadowable_tombstone(cr.shadowable_deleted_at()));
|
||||
clustering_row_builder builder(cr.key(), std::move(t), read_row_marker(cr.marker()));
|
||||
clustering_row_builder builder(s, cr.key(), std::move(t), read_row_marker(cr.marker()));
|
||||
read_and_visit_row(cr.cells(), s.get_column_mapping(), column_kind::regular_column, builder);
|
||||
return std::move(builder).get();
|
||||
}
|
||||
|
||||
static_row read_static_row(const schema& s, ser::static_row_view sr) {
|
||||
class static_row_builder {
|
||||
const schema& _s;
|
||||
static_row _row;
|
||||
public:
|
||||
explicit static_row_builder(const schema& s)
|
||||
: _s(s) { }
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) {
|
||||
_row.cells().append_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, collection_mutation cm) {
|
||||
_row.cells().append_cell(id, std::move(cm));
|
||||
void accept_collection(column_id id, const collection_mutation& cm) {
|
||||
_row.cells().append_cell(id, collection_mutation(*_s.static_column_at(id).type, cm));
|
||||
}
|
||||
static_row get() && { return std::move(_row); }
|
||||
};
|
||||
|
||||
static_row_builder builder;
|
||||
static_row_builder builder(s);
|
||||
read_and_visit_row(sr.cells(), s.get_column_mapping(), column_kind::static_column, builder);
|
||||
return std::move(builder).get();
|
||||
}
|
||||
|
||||
@@ -23,31 +23,31 @@ class converting_mutation_partition_applier;
|
||||
|
||||
template<typename T>
|
||||
concept MutationViewVisitor = requires (T& visitor, tombstone t, atomic_cell ac,
|
||||
collection_mutation cm, range_tombstone rt,
|
||||
collection_mutation_view cmv, range_tombstone rt,
|
||||
position_in_partition_view pipv, row_tombstone row_tomb,
|
||||
row_marker rm) {
|
||||
visitor.accept_partition_tombstone(t);
|
||||
visitor.accept_static_cell(column_id(), std::move(ac));
|
||||
visitor.accept_static_cell(column_id(), std::move(cm));
|
||||
visitor.accept_static_cell(column_id(), cmv);
|
||||
visitor.accept_row_tombstone(rt);
|
||||
visitor.accept_row(pipv, row_tomb, rm,
|
||||
is_dummy::no, is_continuous::yes);
|
||||
visitor.accept_row_cell(column_id(), std::move(ac));
|
||||
visitor.accept_row_cell(column_id(), std::move(cm));
|
||||
visitor.accept_row_cell(column_id(), cmv);
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
concept AsyncMutationViewVisitor = requires (T& visitor, tombstone t, atomic_cell ac,
|
||||
collection_mutation cm, range_tombstone rt,
|
||||
collection_mutation_view cmv, range_tombstone rt,
|
||||
position_in_partition_view pipv, row_tombstone row_tomb,
|
||||
row_marker rm) {
|
||||
{ visitor.accept_partition_tombstone(t) } -> std::same_as<void>;
|
||||
{ visitor.accept_static_cell(column_id(), std::move(ac)) } -> std::same_as<void>;
|
||||
{ visitor.accept_static_cell(column_id(), std::move(cm)) } -> std::same_as<void>;
|
||||
{ visitor.accept_static_cell(column_id(), cmv) } -> std::same_as<void>;
|
||||
{ visitor.accept_row_tombstone(rt) } -> std::same_as<future<>>;
|
||||
{ visitor.accept_row(pipv, row_tomb, rm, is_dummy::no, is_continuous::yes) } -> std::same_as<future<>>;
|
||||
{ visitor.accept_row_cell(column_id(), std::move(ac)) } -> std::same_as<void>;
|
||||
{ visitor.accept_row_cell(column_id(), std::move(cm)) } -> std::same_as<void>;
|
||||
{ visitor.accept_row_cell(column_id(), cmv) } -> std::same_as<void>;
|
||||
{ visitor.accept_end_of_partition() } -> std::same_as<future<>>;
|
||||
};
|
||||
|
||||
@@ -56,11 +56,11 @@ public:
|
||||
virtual ~mutation_partition_view_virtual_visitor();
|
||||
virtual void accept_partition_tombstone(tombstone t) = 0;
|
||||
virtual void accept_static_cell(column_id, atomic_cell ac) = 0;
|
||||
virtual void accept_static_cell(column_id, collection_mutation cm) = 0;
|
||||
virtual void accept_static_cell(column_id, collection_mutation_view cmv) = 0;
|
||||
virtual stop_iteration accept_row_tombstone(range_tombstone rt) = 0;
|
||||
virtual stop_iteration accept_row(position_in_partition_view pipv, row_tombstone rt, row_marker rm, is_dummy, is_continuous) = 0;
|
||||
virtual void accept_row_cell(column_id, atomic_cell ac) = 0;
|
||||
virtual void accept_row_cell(column_id, collection_mutation cm) = 0;
|
||||
virtual void accept_row_cell(column_id, collection_mutation_view cmv) = 0;
|
||||
};
|
||||
|
||||
// View on serialized mutation partition. See mutation_partition_serializer.
|
||||
|
||||
@@ -46,12 +46,8 @@ public:
|
||||
}
|
||||
|
||||
virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
|
||||
accept_static_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
|
||||
}
|
||||
|
||||
void accept_static_cell(column_id id, collection_mutation&& collection) {
|
||||
row& r = _partition.static_row().maybe_create();
|
||||
r.append_cell(id, std::move(collection));
|
||||
r.append_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
|
||||
}
|
||||
|
||||
virtual void accept_row_tombstone(const range_tombstone& rt) override {
|
||||
@@ -76,12 +72,8 @@ public:
|
||||
}
|
||||
|
||||
virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {
|
||||
accept_row_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
|
||||
}
|
||||
|
||||
void accept_row_cell(column_id id, collection_mutation collection) {
|
||||
row& r = _current_row->cells();
|
||||
r.append_cell(id, std::move(collection));
|
||||
r.append_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ Usage:
|
||||
import argparse, os, sys
|
||||
from typing import Sequence
|
||||
|
||||
from test.pylib.driver_utils import safe_driver_shutdown
|
||||
|
||||
def read_statements(path: str) -> list[tuple[int, str]]:
|
||||
stms: list[tuple[int, str]] = []
|
||||
@@ -57,7 +58,7 @@ def exec_statements(statements: list[tuple[int, str]], socket_path: str, timeout
|
||||
print(f"ERROR executing statement from file line {lineno}: {s}\n{e}", file=sys.stderr)
|
||||
return 1
|
||||
finally:
|
||||
cluster.shutdown()
|
||||
safe_driver_shutdown(cluster)
|
||||
return 0
|
||||
|
||||
def main(argv: Sequence[str]) -> int:
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:8b22f9a548a03c88250d31e97ea3e8f77b4d90c502bcf74336c24056557f947f
|
||||
size 6698412
|
||||
oid sha256:524c54493b72c5e1b783f14dfa49d733e21b24cc2ec776e9c6e578095073162d
|
||||
size 6646304
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:31e515a62f006649b0dc4671b51b2643fba9a70884c09b90fbc2237044954254
|
||||
size 6707108
|
||||
oid sha256:fec2bb253d43139da954cee3441fc8bc74824246b080f23bf1f824714d0adc45
|
||||
size 6646576
|
||||
|
||||
@@ -239,10 +239,7 @@ private:
|
||||
|
||||
// Drop waiters that we lost track of; this can happen due to a snapshot transfer,
|
||||
// or a leader removed from cluster while some entries added on it are uncommitted.
|
||||
// When `snp` is provided (snapshot transfer case), waiters whose term matches
|
||||
// the snapshot term are resolved successfully, since the snapshot-term match proves
|
||||
// they were committed and included in the snapshot (by the Log Matching Property).
|
||||
void drop_waiters(const snapshot_descriptor* snp = nullptr);
|
||||
void drop_waiters(std::optional<index_t> idx = {});
|
||||
|
||||
// Wake up all waiters that wait for entries with idx smaller than or equal to the one provided
|
||||
// to be applied.
|
||||
@@ -559,10 +556,12 @@ future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abor
|
||||
auto snap_term = _fsm->log_term_for(snap_idx);
|
||||
SCYLLA_ASSERT(snap_term);
|
||||
SCYLLA_ASSERT(snap_idx >= eid.idx);
|
||||
if (snap_term == eid.term) {
|
||||
if (type == wait_type::committed && snap_term == eid.term) {
|
||||
logger.trace("[{}] wait_for_entry {}.{}: entry got truncated away, but has the snapshot's term"
|
||||
" (snapshot index: {})", id(), eid.term, eid.idx, snap_idx);
|
||||
co_return;
|
||||
|
||||
// We don't do this for `wait_type::applied` - see below why.
|
||||
}
|
||||
|
||||
logger.trace("[{}] wait_for_entry {}.{}: entry got truncated away", id(), eid.term, eid.idx);
|
||||
@@ -573,6 +572,20 @@ future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abor
|
||||
throw dropped_entry();
|
||||
}
|
||||
|
||||
if (type == wait_type::applied && _fsm->log_last_snapshot_idx() >= eid.idx) {
|
||||
// We know the entry was committed but the wait type is `applied`
|
||||
// and we don't know if the entry was applied with `state_machine::apply`
|
||||
// (we may've loaded a snapshot before we managed to apply the entry).
|
||||
// As specified by `add_entry`, throw `commit_status_unknown` in this case.
|
||||
//
|
||||
// FIXME: replace this with a different exception type - `commit_status_unknown`
|
||||
// gives too much uncertainty while we know that the entry was committed
|
||||
// and had to be applied on at least one server. Some callers of `add_entry`
|
||||
// need to know only that the current state includes that entry, whether it was done
|
||||
// through `apply` on this server or through receiving a snapshot.
|
||||
throw commit_status_unknown();
|
||||
}
|
||||
|
||||
co_return;
|
||||
}
|
||||
}
|
||||
@@ -747,8 +760,6 @@ future<> server_impl::add_entry(command command, wait_type type, seastar::abort_
|
||||
throw not_a_leader{leader};
|
||||
}
|
||||
auto eid = co_await add_entry_on_leader(std::move(command), as);
|
||||
co_await utils::get_local_injector().inject("block_raft_add_entry_before_wait_for_entry",
|
||||
utils::wait_for_message(std::chrono::minutes(5)));
|
||||
co_return co_await wait_for_entry(eid, type, as);
|
||||
}
|
||||
|
||||
@@ -984,24 +995,17 @@ void server_impl::notify_waiters(std::map<index_t, op_status>& waiters,
|
||||
}
|
||||
}
|
||||
|
||||
void server_impl::drop_waiters(const snapshot_descriptor* snp) {
|
||||
void server_impl::drop_waiters(std::optional<index_t> idx) {
|
||||
auto drop = [&] (std::map<index_t, op_status>& waiters) {
|
||||
while (waiters.size() != 0) {
|
||||
auto it = waiters.begin();
|
||||
if (snp && it->first > snp->idx) {
|
||||
if (idx && it->first > *idx) {
|
||||
break;
|
||||
}
|
||||
auto [entry_idx, status] = std::move(*it);
|
||||
waiters.erase(it);
|
||||
if (snp && status.term == snp->term) {
|
||||
// entry_idx <= snapshot index and the entry's term matches the snapshot term.
|
||||
// By the Log Matching Property the entry was committed and included in the snapshot.
|
||||
status.done.set_value();
|
||||
_stats.waiters_awoken++;
|
||||
} else {
|
||||
status.done.set_exception(commit_status_unknown());
|
||||
_stats.waiters_dropped++;
|
||||
}
|
||||
status.done.set_exception(commit_status_unknown());
|
||||
_stats.waiters_dropped++;
|
||||
}
|
||||
};
|
||||
drop(_awaited_commits);
|
||||
@@ -1427,7 +1431,7 @@ future<> server_impl::applier_fiber() {
|
||||
// Apply the snapshot to the state machine
|
||||
logger.trace("[{}] apply_fiber applying snapshot {}", _id, snp.id);
|
||||
co_await _state_machine->load_snapshot(snp.id);
|
||||
drop_waiters(&snp);
|
||||
drop_waiters(snp.idx);
|
||||
_applied_idx = snp.idx;
|
||||
_applied_index_changed.broadcast();
|
||||
_stats.sm_load_snapshot++;
|
||||
@@ -1936,7 +1940,7 @@ std::unique_ptr<server> create_server(server_id uuid, std::unique_ptr<rpc> rpc,
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const server_impl& s) {
|
||||
fmt::print(os, "[id: {}, fsm ({})]\n", s._id, *s._fsm);
|
||||
fmt::print(os, "[id: {}, fsm ()]\n", s._id, *s._fsm);
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
@@ -79,18 +79,18 @@ public:
|
||||
// The caller may pass a pointer to an abort_source to make the operation abortable.
|
||||
// If it passes nullptr, the operation is unabortable.
|
||||
//
|
||||
// Successful `add_entry` does not guarantee that `state_machine::apply` will be called
|
||||
// locally for this entry. Between the commit and the application we may load a snapshot
|
||||
// containing this entry, so the state machine's state 'jumps' forward in time, skipping
|
||||
// the local entry application. For `wait_type::applied` this should be fine, because
|
||||
// state machine implementations shouldn't care whether an entry was applied via
|
||||
// `state_machine::apply` or via a snapshot load.
|
||||
// Successful `add_entry` with `wait_type::committed` does not guarantee that `state_machine::apply` will be called
|
||||
// locally for this entry. Between the commit and the application we may receive a snapshot containing this entry,
|
||||
// so the state machine's state 'jumps' forward in time, skipping the entry application.
|
||||
// However, for `wait_type::applied`, we guarantee that the entry will be applied locally with `state_machine::apply`.
|
||||
// If a snapshot causes the state machine to jump over the entry, `add_entry` will return `commit_status_unknown`
|
||||
// (even if the snapshot included that entry).
|
||||
//
|
||||
// Exceptions:
|
||||
// raft::commit_status_unknown
|
||||
// Thrown if the leader has changed and the log entry has either
|
||||
// been replaced by the new leader or the server has lost track of it.
|
||||
// It may also be thrown in case of a transport error while forwarding add_entry to the leader.
|
||||
// It may also be thrown in case of a transport error while forwarding add_entry to the leader.
|
||||
// raft::dropped_entry
|
||||
// Thrown if the entry was replaced because of a leader change.
|
||||
// raft::request_aborted
|
||||
|
||||
@@ -1022,7 +1022,8 @@ void database::drop_keyspace(const sstring& name) {
|
||||
static bool is_system_table(const schema& s) {
|
||||
auto& k = s.ks_name();
|
||||
return k == db::system_keyspace::NAME ||
|
||||
k == db::system_distributed_keyspace::NAME;
|
||||
k == db::system_distributed_keyspace::NAME ||
|
||||
k == db::system_distributed_keyspace::NAME_EVERYWHERE;
|
||||
}
|
||||
|
||||
sstables::sstables_manager& database::get_sstables_manager(const schema& s) const {
|
||||
@@ -1141,7 +1142,7 @@ future<> database::create_local_system_table(
|
||||
cfg.memtable_scheduling_group = default_scheduling_group();
|
||||
cfg.memtable_to_cache_scheduling_group = default_scheduling_group();
|
||||
}
|
||||
auto lock = co_await get_tables_metadata().hold_write_lock();
|
||||
auto lock = get_tables_metadata().hold_write_lock();
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
add_column_family(ks, table, std::move(cfg), replica::database::is_new_cf::no);
|
||||
@@ -1327,27 +1328,9 @@ future<global_table_ptr> get_table_on_all_shards(sharded<database>& sharded_db,
|
||||
|
||||
future<tables_metadata_lock_on_all_shards> database::lock_tables_metadata(sharded<database>& sharded_db) {
|
||||
tables_metadata_lock_on_all_shards locks;
|
||||
// Acquire write lock on shard 0 first, and then on the remaining shards.
|
||||
//
|
||||
// Parallel acquisition on all shards could deadlock when two
|
||||
// fibers call lock_tables_metadata() concurrently: parallel_for_each
|
||||
// sends SMP messages to all shards even when the local shard's lock
|
||||
// attempt blocks. If task reordering (SEASTAR_SHUFFLE_TASK_QUEUE in
|
||||
// debug/sanitize builds) causes fiber A to win on shard X while
|
||||
// fiber B wins on shard Y, neither can make progress — classic
|
||||
// cross-shard lock-ordering deadlock.
|
||||
//
|
||||
// Acquiring the write lock on shard 0 first, and then on the remaining
|
||||
// shards, eliminates this: whichever fiber acquires shard 0 first is
|
||||
// guaranteed to acquire locks on all other shards before the other fiber
|
||||
// can acquire the lock on shard 0.
|
||||
co_await sharded_db.invoke_on(0, [&locks, &sharded_db] (auto& db) -> future<> {
|
||||
co_await sharded_db.invoke_on_all([&] (auto& db) -> future<> {
|
||||
locks.assign_lock(co_await db.get_tables_metadata().hold_write_lock());
|
||||
co_await sharded_db.invoke_on_others([&locks] (auto& db) -> future<> {
|
||||
locks.assign_lock(co_await db.get_tables_metadata().hold_write_lock());
|
||||
});
|
||||
});
|
||||
|
||||
co_return locks;
|
||||
}
|
||||
|
||||
|
||||
@@ -48,6 +48,7 @@ bool is_system_keyspace(std::string_view name) {
|
||||
|
||||
static const std::unordered_set<std::string_view> internal_keyspaces = {
|
||||
db::system_distributed_keyspace::NAME,
|
||||
db::system_distributed_keyspace::NAME_EVERYWHERE,
|
||||
db::system_keyspace::NAME,
|
||||
db::schema_tables::NAME,
|
||||
auth::meta::legacy::AUTH_KS,
|
||||
|
||||
@@ -4624,7 +4624,7 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
|
||||
sstables::shared_sstable sst;
|
||||
replica::enable_backlog_tracker enable_backlog_tracker;
|
||||
};
|
||||
std::unordered_map<size_t, std::vector<removed_sstable>> per_cg_remove;
|
||||
std::vector<removed_sstable> remove;
|
||||
|
||||
_stats.pending_sstable_deletions++;
|
||||
auto undo_stats = defer([this] {
|
||||
@@ -4633,7 +4633,7 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
|
||||
|
||||
auto permit = co_await get_sstable_list_permit();
|
||||
|
||||
co_await _cache.invalidate(row_cache::external_updater([this, &rp, &per_cg_remove, truncated_at] {
|
||||
co_await _cache.invalidate(row_cache::external_updater([this, &rp, &remove, truncated_at] {
|
||||
// FIXME: the following isn't exception safe.
|
||||
for_each_compaction_group([&] (compaction_group& cg) {
|
||||
|
||||
@@ -4648,7 +4648,7 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
|
||||
if (p->originated_on_this_node().value_or(false) && p->get_stats_metadata().position.shard_id() == this_shard_id()) {
|
||||
rp = std::max(p->get_stats_metadata().position, rp);
|
||||
}
|
||||
per_cg_remove[cg.group_id()].emplace_back(removed_sstable{cg, p, enable_backlog_tracker});
|
||||
remove.emplace_back(removed_sstable{cg, p, enable_backlog_tracker});
|
||||
return;
|
||||
}
|
||||
pruned->insert(p);
|
||||
@@ -4665,19 +4665,16 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
|
||||
}));
|
||||
rebuild_statistics();
|
||||
|
||||
co_await coroutine::parallel_for_each(per_cg_remove, [&] (auto& entry) {
|
||||
auto& removed = entry.second;
|
||||
std::vector<sstables::shared_sstable> del;
|
||||
del.reserve(removed.size());
|
||||
for (auto& r : removed) {
|
||||
if (r.enable_backlog_tracker) {
|
||||
remove_sstable_from_backlog_tracker(r.cg.get_backlog_tracker(), r.sst);
|
||||
}
|
||||
erase_sstable_cleanup_state(r.sst);
|
||||
del.emplace_back(std::move(r.sst));
|
||||
std::vector<sstables::shared_sstable> del;
|
||||
del.reserve(remove.size());
|
||||
for (auto& r : remove) {
|
||||
if (r.enable_backlog_tracker) {
|
||||
remove_sstable_from_backlog_tracker(r.cg.get_backlog_tracker(), r.sst);
|
||||
}
|
||||
return delete_sstables_atomically(permit, std::move(del));
|
||||
});
|
||||
erase_sstable_cleanup_state(r.sst);
|
||||
del.emplace_back(r.sst);
|
||||
};
|
||||
co_await delete_sstables_atomically(permit, std::move(del));
|
||||
co_return rp;
|
||||
}
|
||||
|
||||
@@ -5612,7 +5609,7 @@ future<> compaction_group::cleanup() {
|
||||
auto updater = row_cache::external_updater(std::make_unique<compaction_group_cleaner>(*this));
|
||||
|
||||
auto p_range = to_partition_range(token_range());
|
||||
tlogger.debug("Invalidating range {} for compaction group {} of table {}.{} during cleanup.",
|
||||
tlogger.debug("Invalidating range {} for compaction group {} of table {} during cleanup.",
|
||||
p_range, group_id(), _t.schema()->ks_name(), _t.schema()->cf_name());
|
||||
// Since permit is still held, all actions below will be executed atomically:
|
||||
co_await _t._cache.invalidate(std::move(updater), p_range);
|
||||
|
||||
@@ -1328,14 +1328,8 @@ class interval_printer(gdb.printing.PrettyPrinter):
|
||||
def __init__(self, val):
|
||||
self.val = val['_interval']
|
||||
|
||||
def inspect_bound(self, bound_name):
|
||||
if f'_{bound_name}_exists' in self.val:
|
||||
if not self.val[f'_{bound_name}_exists']:
|
||||
return False, False, None
|
||||
|
||||
return True, bool(self.val[f'_{bound_name}_inclusive']), self.val[f'_{bound_name}_value']
|
||||
|
||||
bound = std_optional(self.val[f'_{bound_name}'])
|
||||
def inspect_bound(self, bound_opt):
|
||||
bound = std_optional(bound_opt)
|
||||
if not bound:
|
||||
return False, False, None
|
||||
|
||||
@@ -1344,8 +1338,8 @@ class interval_printer(gdb.printing.PrettyPrinter):
|
||||
return True, bool(bound['_inclusive']), bound['_value']
|
||||
|
||||
def to_string(self):
|
||||
has_start, start_inclusive, start_value = self.inspect_bound('start')
|
||||
has_end, end_inclusive, end_value = self.inspect_bound('end')
|
||||
has_start, start_inclusive, start_value = self.inspect_bound(self.val['_start'])
|
||||
has_end, end_inclusive, end_value = self.inspect_bound(self.val['_end'])
|
||||
|
||||
if self.val['_singular']:
|
||||
return '{{{}}}'.format(str(start_value))
|
||||
@@ -5472,9 +5466,10 @@ class scylla_compaction_tasks(gdb.Command):
|
||||
try:
|
||||
task_list = list(intrusive_list(cm['_tasks']))
|
||||
except gdb.error: # 6.2 compatibility
|
||||
task_list = [seastar_shared_ptr(t).get().dereference() for t in std_list(cm['_tasks'])]
|
||||
task_list = list(std_list(cm['_tasks']))
|
||||
|
||||
for task in task_list:
|
||||
task = seastar_shared_ptr(task).get().dereference()
|
||||
schema = schema_ptr(task['_compacting_table'].dereference()['_schema'])
|
||||
key = 'type={}, state={:5}, {}'.format(task['_type'], str(task['_state']), schema.table_name())
|
||||
task_hist.add(key)
|
||||
|
||||
@@ -122,9 +122,11 @@ future<> service::client_state::check_internal_table_permissions(std::string_vie
|
||||
auth::permission::ALTER, auth::permission::DROP>();
|
||||
|
||||
if (forbidden_permissions.contains(cmd.permission)) {
|
||||
if (ks == db::system_distributed_keyspace::NAME
|
||||
if ((ks == db::system_distributed_keyspace::NAME || ks == db::system_distributed_keyspace::NAME_EVERYWHERE)
|
||||
&& (table_name == db::system_distributed_keyspace::CDC_DESC_V2
|
||||
|| table_name == db::system_distributed_keyspace::CDC_TIMESTAMPS)) {
|
||||
|| table_name == db::system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION
|
||||
|| table_name == db::system_distributed_keyspace::CDC_TIMESTAMPS
|
||||
|| table_name == db::system_distributed_keyspace::CDC_GENERATIONS_V2)) {
|
||||
return make_exception_future(exceptions::unauthorized_exception(
|
||||
format("Cannot {} {}", auth::permissions::to_string(cmd.permission), cmd.resource)));
|
||||
}
|
||||
|
||||
@@ -239,7 +239,7 @@ future<> migration_manager::wait_for_schema_agreement(const replica::database& d
|
||||
as->check();
|
||||
}
|
||||
if (db::timeout_clock::now() > deadline) {
|
||||
throw schema_agreement_timeout();
|
||||
throw std::runtime_error("Unable to reach schema agreement");
|
||||
}
|
||||
co_await (as ? sleep_abortable(std::chrono::milliseconds(500), *as) : sleep(std::chrono::milliseconds(500)));
|
||||
}
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
#include "gms/endpoint_state.hh"
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/timed_out_error.hh>
|
||||
#include "gms/inet_address.hh"
|
||||
#include "gms/feature.hh"
|
||||
#include "gms/i_endpoint_state_change_subscriber.hh"
|
||||
@@ -134,19 +133,6 @@ public:
|
||||
* Known peers in the cluster have the same schema version as us.
|
||||
*/
|
||||
bool have_schema_agreement();
|
||||
// Thrown by wait_for_schema_agreement() when the deadline is reached.
|
||||
struct schema_agreement_timeout : public seastar::timed_out_error {
|
||||
const char* what() const noexcept override {
|
||||
return "Unable to reach schema agreement";
|
||||
}
|
||||
};
|
||||
/**
|
||||
* Waits until all known live peers have the same schema version as this
|
||||
* node. Returns normally once agreement is reached, or throws
|
||||
* schema_agreement_timeout if the deadline is reached before agreement.
|
||||
* If as != nullptr, can also throw abort_requested_exception if the abort
|
||||
* source fires.
|
||||
*/
|
||||
future<> wait_for_schema_agreement(const replica::database& db, db::timeout_clock::time_point deadline, seastar::abort_source* as);
|
||||
|
||||
// Maximum number of retries one should attempt when trying to perform
|
||||
|
||||
@@ -438,10 +438,9 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
|
||||
|
||||
const auto cache_key = qp.compute_id(req, "", cql3::internal_dialect());
|
||||
auto ps_ptr = qp.get_prepared(cache_key);
|
||||
shared_ptr<cql_transport::messages::result_message::prepared> prepared_msg;
|
||||
if (!ps_ptr) {
|
||||
prepared_msg = co_await qp.prepare(req, qs, cql3::internal_dialect());
|
||||
ps_ptr = prepared_msg->get_prepared();
|
||||
const auto msg_ptr = co_await qp.prepare(req, qs, cql3::internal_dialect());
|
||||
ps_ptr = msg_ptr->get_prepared();
|
||||
if (!ps_ptr) {
|
||||
on_internal_error(paxos_state::logger, "prepared statement is null");
|
||||
}
|
||||
@@ -450,8 +449,8 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
|
||||
-1, service::node_local_only::yes);
|
||||
const auto st = ps_ptr->statement;
|
||||
|
||||
const auto result_ptr = co_await st->execute(qp, qs, qo, std::nullopt);
|
||||
co_return cql3::untyped_result_set(result_ptr);
|
||||
const auto msg_ptr = co_await st->execute(qp, qs, qo, std::nullopt);
|
||||
co_return cql3::untyped_result_set(msg_ptr);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include "service/qos/raft_service_level_distributed_data_accessor.hh"
|
||||
#include "service_level_controller.hh"
|
||||
#include "db/system_distributed_keyspace.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/topology_state_machine.hh"
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
|
||||
namespace db {
|
||||
class system_keyspace;
|
||||
class system_distributed_keyspace;
|
||||
}
|
||||
namespace cql3 {
|
||||
class query_processor;
|
||||
|
||||
@@ -434,8 +434,6 @@ future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
|
||||
}
|
||||
|
||||
future<> group0_state_machine::enable_in_memory_state_machine() {
|
||||
co_await utils::get_local_injector().inject("group0_state_machine_enable_in_memory_fail",
|
||||
[] { return std::make_exception_ptr(std::runtime_error("injected failure in enable_in_memory_state_machine")); });
|
||||
auto read_apply_mutex_holder = co_await _client.hold_read_apply_mutex(_abort_source);
|
||||
if (!_in_memory_state_machine_enabled) {
|
||||
_in_memory_state_machine_enabled = true;
|
||||
|
||||
@@ -452,16 +452,14 @@ future<> raft_group0::start_server_for_group0(raft::group_id group0_id, service:
|
||||
auto srv_for_group0 = create_server_for_group0(group0_id, my_id, ss, qp, mm);
|
||||
auto& persistence = srv_for_group0.persistence;
|
||||
auto& server = *srv_for_group0.server;
|
||||
co_await with_scheduling_group(_sg, [this, &srv_for_group0, group0_id] (this auto self) -> future<> {
|
||||
co_await with_scheduling_group(_sg, [this, &srv_for_group0] (this auto self) -> future<> {
|
||||
auto& state_machine = dynamic_cast<group0_state_machine&>(srv_for_group0.state_machine);
|
||||
co_await _raft_gr.start_server_for_group(std::move(srv_for_group0));
|
||||
// Set _group0 immediately after the server is registered in _raft_gr._servers.
|
||||
// This ensures abort_and_drain()/destroy() can find and clean up the server
|
||||
// even if enable_in_memory_state_machine() or later steps throw.
|
||||
_group0.emplace<raft::group_id>(group0_id);
|
||||
co_await state_machine.enable_in_memory_state_machine();
|
||||
});
|
||||
|
||||
_group0.emplace<raft::group_id>(group0_id);
|
||||
|
||||
// Fix for scylladb/scylladb#16683:
|
||||
// If the snapshot index is 0, trigger creation of a new snapshot
|
||||
// so bootstrapping nodes will receive a snapshot transfer.
|
||||
@@ -683,6 +681,16 @@ bool raft_group0::maintenance_mode() {
|
||||
}
|
||||
|
||||
future<> raft_group0::setup_group0_if_exist(db::system_keyspace& sys_ks, service::storage_service& ss, cql3::query_processor& qp, service::migration_manager& mm) {
|
||||
if (maintenance_mode()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
if (!sys_ks.bootstrap_complete()) {
|
||||
// If bootstrap did not complete yet, there is no group 0 to setup at this point
|
||||
// -- it will be done after we start gossiping, in `setup_group0`.
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto group0_id = raft::group_id{co_await sys_ks.get_raft_group0_id()};
|
||||
if (group0_id) {
|
||||
// Group 0 ID is present => we've already joined group 0 earlier.
|
||||
@@ -703,6 +711,15 @@ future<> raft_group0::setup_group0(
|
||||
db::system_keyspace& sys_ks, const std::unordered_set<gms::inet_address>& initial_contact_nodes, shared_ptr<group0_handshaker> handshaker,
|
||||
service::storage_service& ss, cql3::query_processor& qp, service::migration_manager& mm,
|
||||
const join_node_request_params& params) {
|
||||
if (maintenance_mode()) {
|
||||
// The node is in maintenance mode.
|
||||
co_return;
|
||||
}
|
||||
|
||||
if (joined_group0()) {
|
||||
// Group 0 is already set up, there is nothing to do.
|
||||
co_return;
|
||||
}
|
||||
// Reaching this point is possible only in two cases:
|
||||
// - the node is bootstrapping,
|
||||
// - the node is restarting in the Raft-based recovery procedure and has not joined the new group 0 yet.
|
||||
@@ -1019,7 +1036,7 @@ with_timeout(abort_source& as, db::timeout_clock::duration d, F&& fun) {
|
||||
} catch (...) {
|
||||
// There should be no other exceptions, but just in case, catch and discard.
|
||||
// we want to propagate exceptions from `f`, not from sleep.
|
||||
group0_log.error("unexpected exception from sleep_and_abort: {}", std::current_exception());
|
||||
group0_log.error("unexpected exception from sleep_and_abort", std::current_exception());
|
||||
}
|
||||
|
||||
// Translate aborts caused by timeout to `timed_out_error`.
|
||||
|
||||
@@ -271,10 +271,6 @@ public:
|
||||
seastar::scheduling_group get_scheduling_group() {
|
||||
return _sg;
|
||||
}
|
||||
|
||||
// Returns true if in maintenance mode
|
||||
bool maintenance_mode();
|
||||
|
||||
private:
|
||||
static void init_rpc_verbs(raft_group0& shard0_this);
|
||||
static future<> uninit_rpc_verbs(netw::messaging_service& ms);
|
||||
@@ -336,6 +332,9 @@ private:
|
||||
// Does not affect non-members. This behavior is only guaranteed if no concurrent membership changes are happening.
|
||||
future<> modify_raft_voter_status(const std::unordered_set<raft::server_id>& voters_add, const std::unordered_set<raft::server_id>& voters_del,
|
||||
abort_source& as, std::optional<raft_timeout> timeout = std::nullopt);
|
||||
|
||||
// Returns true if in maintenance mode
|
||||
bool maintenance_mode();
|
||||
};
|
||||
|
||||
} // end of namespace service
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#include "service/session.hh"
|
||||
#include "utils/log.hh"
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/core/timer.hh>
|
||||
|
||||
namespace service {
|
||||
|
||||
@@ -59,35 +58,18 @@ void session_manager::initiate_close_of_sessions_except(const std::unordered_set
|
||||
}
|
||||
|
||||
future<> session_manager::drain_closing_sessions() {
|
||||
slogger.info("drain_closing_sessions: waiting for lock");
|
||||
seastar::timer<lowres_clock> lock_timer([this] {
|
||||
slogger.warn("drain_closing_sessions: still waiting for lock, available units {}",
|
||||
_session_drain_sem.available_units());
|
||||
});
|
||||
lock_timer.arm_periodic(std::chrono::minutes(5));
|
||||
auto lock = co_await get_units(_session_drain_sem, 1);
|
||||
lock_timer.cancel();
|
||||
auto n = std::distance(_closing_sessions.begin(), _closing_sessions.end());
|
||||
slogger.info("drain_closing_sessions: acquired lock, {} sessions to drain", n);
|
||||
auto i = _closing_sessions.begin();
|
||||
while (i != _closing_sessions.end()) {
|
||||
session& s = *i;
|
||||
++i;
|
||||
auto id = s.id();
|
||||
slogger.info("drain_closing_sessions: waiting for session {} to close, gate count {}", id, s.gate_count());
|
||||
std::optional<seastar::timer<lowres_clock>> warn_timer;
|
||||
warn_timer.emplace([&s, id] {
|
||||
slogger.warn("drain_closing_sessions: session {} still not closed, gate count {}",
|
||||
id, s.gate_count());
|
||||
});
|
||||
warn_timer->arm_periodic(std::chrono::minutes(5));
|
||||
slogger.debug("draining session {}", id);
|
||||
co_await s.close();
|
||||
warn_timer.reset();
|
||||
if (_sessions.erase(id)) {
|
||||
slogger.info("drain_closing_sessions: session {} closed", id);
|
||||
slogger.debug("session {} closed", id);
|
||||
}
|
||||
}
|
||||
slogger.info("drain_closing_sessions: done");
|
||||
}
|
||||
|
||||
} // namespace service
|
||||
|
||||
@@ -95,10 +95,6 @@ public:
|
||||
return _id;
|
||||
}
|
||||
|
||||
size_t gate_count() const {
|
||||
return _gate.get_count();
|
||||
}
|
||||
|
||||
/// Post-condition of successfully resolved future: There are no guards alive for this session, and
|
||||
/// and it's impossible to create more such guards later.
|
||||
/// Can be called concurrently.
|
||||
|
||||
@@ -1940,7 +1940,7 @@ public:
|
||||
// Calculates how much to delay completing the request. The delay adds to the request's inherent latency.
|
||||
template<typename Func>
|
||||
void delay(tracing::trace_state_ptr trace, Func&& on_resume) {
|
||||
auto delay = _proxy->_max_view_update_backlog.calculate_throttling_delay(_view_backlog, _expire_timer.get_timeout());
|
||||
auto delay = db::view::calculate_view_update_throttling_delay(_view_backlog, _expire_timer.get_timeout(), _proxy->data_dictionary().get_config().view_flow_control_delay_limit_in_ms());
|
||||
stats().last_mv_flow_control_delay = delay;
|
||||
stats().mv_flow_control_delay += delay.count();
|
||||
if (delay.count() == 0) {
|
||||
@@ -3337,7 +3337,6 @@ storage_proxy::storage_proxy(sharded<replica::database>& db, storage_proxy::conf
|
||||
, _hints_for_views_manager(*this, _db.local().get_config().view_hints_directory(), {}, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db, cfg.hints_sched_group)
|
||||
, _stats_key(stats_key)
|
||||
, _features(feat)
|
||||
, _maintenance_mode(cfg.maintenance_mode)
|
||||
, _background_write_throttle_threahsold(cfg.available_memory / 10)
|
||||
, _mutate_stage{"storage_proxy_mutate", &storage_proxy::do_mutate}
|
||||
, _max_view_update_backlog(max_view_update_backlog)
|
||||
@@ -7104,7 +7103,7 @@ host_id_vector_replica_set storage_proxy::get_endpoints_for_reading(const schema
|
||||
auto endpoints = erm.get_replicas_for_reading(token);
|
||||
// Skip for non-debug builds and maintenance mode.
|
||||
if constexpr (tools::build_info::is_debug_build()) {
|
||||
if (!_maintenance_mode) {
|
||||
if (!_db.local().get_config().maintenance_mode()) {
|
||||
validate_read_replicas(erm, endpoints);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,7 +40,6 @@
|
||||
#include "dht/token_range_endpoints.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/cas_shard.hh"
|
||||
#include "service/maintenance_mode.hh"
|
||||
#include "service/storage_proxy_fwd.hh"
|
||||
|
||||
class reconcilable_result;
|
||||
@@ -198,7 +197,6 @@ public:
|
||||
// with writes.
|
||||
smp_service_group write_ack_smp_service_group = default_smp_service_group();
|
||||
scheduling_group hints_sched_group;
|
||||
maintenance_mode_enabled maintenance_mode = maintenance_mode_enabled::no;
|
||||
};
|
||||
private:
|
||||
|
||||
@@ -296,7 +294,6 @@ private:
|
||||
scheduling_group_key _stats_key;
|
||||
storage_proxy_stats::global_stats _global_stats;
|
||||
gms::feature_service& _features;
|
||||
maintenance_mode_enabled _maintenance_mode;
|
||||
|
||||
class remote;
|
||||
std::unique_ptr<remote> _remote;
|
||||
|
||||
@@ -496,7 +496,7 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
|
||||
};
|
||||
|
||||
auto process_normal_node = [&] (raft::server_id id, locator::host_id host_id, std::optional<gms::inet_address> ip, const replica_state& rs) -> future<> {
|
||||
rtlogger.trace("loading topology: raft id={} ip={} node state={} dc={} rack={} tokens state={} tokens={} shards={} cleanup={}",
|
||||
rtlogger.trace("loading topology: raft id={} ip={} node state={} dc={} rack={} tokens state={} tokens={} shards={}",
|
||||
id, ip, rs.state, rs.datacenter, rs.rack, _topology_state_machine._topology.tstate, rs.ring.value().tokens, rs.shard_count, rs.cleanup);
|
||||
// Save tokens, not needed for raft topology management, but needed by legacy
|
||||
// Also ip -> id mapping is needed for address map recreation on reboot
|
||||
@@ -1614,44 +1614,45 @@ future<> storage_service::join_topology(sharded<service::storage_proxy>& proxy,
|
||||
|
||||
SCYLLA_ASSERT(_group0);
|
||||
|
||||
auto request_id = utils::UUID_gen::get_time_UUID();
|
||||
if (!_group0->maintenance_mode() && !_group0->joined_group0()) {
|
||||
join_node_request_params join_params {
|
||||
.host_id = _group0->load_my_id(),
|
||||
.cluster_name = _db.local().get_config().cluster_name(),
|
||||
.snitch_name = _db.local().get_snitch_name(),
|
||||
.datacenter = _snitch.local()->get_datacenter(),
|
||||
.rack = _snitch.local()->get_rack(),
|
||||
.release_version = version::release(),
|
||||
.num_tokens = _db.local().get_config().join_ring() ? _db.local().get_config().num_tokens() : 0,
|
||||
.tokens_string = _db.local().get_config().join_ring() ? _db.local().get_config().initial_token() : sstring(),
|
||||
.shard_count = smp::count,
|
||||
.ignore_msb = _db.local().get_config().murmur3_partitioner_ignore_msb_bits(),
|
||||
.supported_features = _feature_service.supported_feature_set() | std::ranges::to<std::vector<sstring>>(),
|
||||
.request_id = request_id,
|
||||
};
|
||||
join_node_request_params join_params {
|
||||
.host_id = _group0->load_my_id(),
|
||||
.cluster_name = _db.local().get_config().cluster_name(),
|
||||
.snitch_name = _db.local().get_snitch_name(),
|
||||
.datacenter = _snitch.local()->get_datacenter(),
|
||||
.rack = _snitch.local()->get_rack(),
|
||||
.release_version = version::release(),
|
||||
.num_tokens = _db.local().get_config().join_ring() ? _db.local().get_config().num_tokens() : 0,
|
||||
.tokens_string = _db.local().get_config().join_ring() ? _db.local().get_config().initial_token() : sstring(),
|
||||
.shard_count = smp::count,
|
||||
.ignore_msb = _db.local().get_config().murmur3_partitioner_ignore_msb_bits(),
|
||||
.supported_features = _feature_service.supported_feature_set() | std::ranges::to<std::vector<sstring>>(),
|
||||
.request_id = utils::UUID_gen::get_time_UUID(),
|
||||
};
|
||||
|
||||
if (raft_replace_info) {
|
||||
join_params.replaced_id = raft_replace_info->raft_id;
|
||||
join_params.ignore_nodes = utils::split_comma_separated_list(_db.local().get_config().ignore_dead_nodes_for_replace());
|
||||
if (!locator::check_host_ids_contain_only_uuid(join_params.ignore_nodes)) {
|
||||
slogger.warn("Warning: Using IP addresses for '--ignore-dead-nodes-for-replace' is deprecated and will"
|
||||
" be disabled in a future release. Please use host IDs instead. Provided values: {}",
|
||||
_db.local().get_config().ignore_dead_nodes_for_replace());
|
||||
}
|
||||
if (raft_replace_info) {
|
||||
join_params.replaced_id = raft_replace_info->raft_id;
|
||||
join_params.ignore_nodes = utils::split_comma_separated_list(_db.local().get_config().ignore_dead_nodes_for_replace());
|
||||
if (!locator::check_host_ids_contain_only_uuid(join_params.ignore_nodes)) {
|
||||
slogger.warn("Warning: Using IP addresses for '--ignore-dead-nodes-for-replace' is deprecated and will"
|
||||
" be disabled in a future release. Please use host IDs instead. Provided values: {}",
|
||||
_db.local().get_config().ignore_dead_nodes_for_replace());
|
||||
}
|
||||
|
||||
// We use the legacy handshaker in the Raft-based recovery procedure to join the new group 0 without involving
|
||||
// the topology coordinator. We can assume this node has already been accepted by the topology coordinator once
|
||||
// and joined topology.
|
||||
::shared_ptr<group0_handshaker> handshaker =
|
||||
!_db.local().get_config().recovery_leader.is_set()
|
||||
? ::make_shared<join_node_rpc_handshaker>(*this, join_params)
|
||||
: _group0->make_legacy_handshaker(raft::is_voter::no);
|
||||
co_await _group0->setup_group0(_sys_ks.local(), initial_contact_nodes, std::move(handshaker),
|
||||
*this, _qp, _migration_manager.local(), join_params);
|
||||
}
|
||||
|
||||
// setup_group0 will do nothing if the node has already set up group 0 in setup_group0_if_exist in main.cc, which
|
||||
// happens when the node is restarting and not joining the new group 0 in the Raft-based recovery procedure.
|
||||
// It does not matter which handshaker we choose in this case since it will not be used.
|
||||
//
|
||||
// We use the legacy handshaker in the Raft-based recovery procedure to join the new group 0 without involving
|
||||
// the topology coordinator. We can assume this node has already been accepted by the topology coordinator once
|
||||
// and joined topology.
|
||||
::shared_ptr<group0_handshaker> handshaker =
|
||||
!_db.local().get_config().recovery_leader.is_set()
|
||||
? ::make_shared<join_node_rpc_handshaker>(*this, join_params)
|
||||
: _group0->make_legacy_handshaker(raft::is_voter::no);
|
||||
co_await _group0->setup_group0(_sys_ks.local(), initial_contact_nodes, std::move(handshaker),
|
||||
*this, _qp, _migration_manager.local(), join_params);
|
||||
|
||||
raft::server& raft_server = _group0->group0_server();
|
||||
|
||||
// This is the moment when the locator::topology has gathered information about other nodes
|
||||
@@ -1699,7 +1700,7 @@ future<> storage_service::join_topology(sharded<service::storage_proxy>& proxy,
|
||||
throw std::runtime_error("Crashed in crash_before_topology_request_completion");
|
||||
});
|
||||
|
||||
auto err = co_await wait_for_topology_request_completion(request_id);
|
||||
auto err = co_await wait_for_topology_request_completion(join_params.request_id);
|
||||
if (!err.empty()) {
|
||||
throw std::runtime_error(fmt::format("{} failed. See earlier errors ({})", raft_replace_info ? "Replace" : "Bootstrap", err));
|
||||
}
|
||||
@@ -4493,20 +4494,10 @@ future<> storage_service::local_topology_barrier() {
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: waiting for stale token metadata versions to be released", version);
|
||||
{
|
||||
seastar::timer<lowres_clock> warn_timer([&ss, version] {
|
||||
rtlogger.warn("raft_topology_cmd::barrier_and_drain version {}: still waiting for stale versions, "
|
||||
"stale versions (version: use_count): {}",
|
||||
version, ss._shared_token_metadata.describe_stale_versions());
|
||||
});
|
||||
warn_timer.arm_periodic(std::chrono::minutes(5));
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
}
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: stale versions released, draining closing sessions", version);
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: done", version);
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
}
|
||||
|
||||
@@ -4518,9 +4509,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
auto& raft_server = _group0->group0_server();
|
||||
auto group0_holder = _group0->hold_group0_gate();
|
||||
// do barrier to make sure we always see the latest topology
|
||||
rtlogger.info("topology cmd rpc {} index={}: starting read_barrier, term={}", cmd.cmd, cmd_index, term);
|
||||
co_await raft_server.read_barrier(&_group0_as);
|
||||
rtlogger.info("topology cmd rpc {} index={}: read_barrier completed", cmd.cmd, cmd_index);
|
||||
if (raft_server.get_current_term() != term) {
|
||||
// Return an error since the command is from outdated leader
|
||||
co_return result;
|
||||
@@ -5960,12 +5949,18 @@ future<join_node_request_result> storage_service::join_node_request_handler(join
|
||||
if (const auto *p = _topology_state_machine._topology.find(params.host_id)) {
|
||||
const auto& rs = p->second;
|
||||
if (rs.state == node_state::left) {
|
||||
rtlogger.warn("the node {} attempted to join but it was removed from the cluster. Rejecting the node", params.host_id);
|
||||
rtlogger.warn("the node {} attempted to join",
|
||||
" but it was removed from the cluster. Rejecting"
|
||||
" the node",
|
||||
params.host_id);
|
||||
result.result = join_node_request_result::rejected{
|
||||
.reason = "The node has already been removed from the cluster",
|
||||
};
|
||||
} else {
|
||||
rtlogger.warn("the node {} attempted to join again after an unfinished attempt but it is no longer allowed to do so. Rejecting the node", params.host_id);
|
||||
rtlogger.warn("the node {} attempted to join",
|
||||
" again after an unfinished attempt but it is no longer"
|
||||
" allowed to do so. Rejecting the node",
|
||||
params.host_id);
|
||||
result.result = join_node_request_result::rejected{
|
||||
.reason = "The node requested to join before but didn't finish the procedure. "
|
||||
"Please clear the data directory and restart.",
|
||||
|
||||
@@ -2117,14 +2117,10 @@ public:
|
||||
co_return std::move(plan);
|
||||
}
|
||||
|
||||
// Returns the schema and tablet-aware replication strategy for a given table.
|
||||
// Returns {nullptr, nullptr} if the table has been dropped concurrently (race between
|
||||
// the token metadata snapshot and the live schema).
|
||||
std::tuple<schema_ptr, const tablet_aware_replication_strategy*> get_schema_and_rs(table_id table) {
|
||||
auto t = _db.get_tables_metadata().get_table_if_exists(table);
|
||||
if (!t) {
|
||||
lblogger.debug("Table {} no longer exists, skipping", table);
|
||||
return {nullptr, nullptr};
|
||||
on_internal_error(lblogger, format("Table {} does not exist", table));
|
||||
}
|
||||
|
||||
auto s = t->schema();
|
||||
@@ -2139,8 +2135,6 @@ public:
|
||||
return {s, rs};
|
||||
}
|
||||
|
||||
// Returns the tablet-aware replication strategy for a given table, or nullptr
|
||||
// if the table has been dropped concurrently.
|
||||
const tablet_aware_replication_strategy* get_rs(table_id id) {
|
||||
auto [s, rs] = get_schema_and_rs(id);
|
||||
return rs;
|
||||
@@ -2164,7 +2158,6 @@ public:
|
||||
sstring target_tablet_count_reason; // Winning rule for target_tablet_count value.
|
||||
std::optional<uint64_t> avg_tablet_size; // nullopt when stats not yet available.
|
||||
bool pow2_count; // Whether tablet count for the table should be a power of two.
|
||||
bool tablet_merges_allowed; // Whether merges are allowed for the table.
|
||||
|
||||
// Final tablet count.
|
||||
// It's target_tablet_count aligned to power of 2 if pow2_count == true.
|
||||
@@ -2319,17 +2312,6 @@ public:
|
||||
table_plan.current_tablet_count = tablet_count;
|
||||
table_plan.pow2_count = tablet_options.pow2_count.value_or(
|
||||
_db.features().arbitrary_tablet_boundaries ? db::tablet_options::default_pow2_count : true);
|
||||
table_plan.tablet_merges_allowed = !s->tablet_merges_forbidden();
|
||||
if (!table_plan.tablet_merges_allowed) {
|
||||
// Block merge decisions for Alternator tablet tables whose
|
||||
// stream configuration forbids merges. Tablet merges produce
|
||||
// 2 parents per child which is incompatible with the DynamoDB
|
||||
// Streams API. If a merge is already in progress on the tmap,
|
||||
// suppressing new_resize_decision here causes the existing
|
||||
// revocation logic in tables_being_resized to cancel the merge.
|
||||
lblogger.debug("Table {} ({}.{}): suppressing new merge decision because tablet merges are forbidden",
|
||||
table, s->ks_name(), s->cf_name());
|
||||
}
|
||||
|
||||
rs_by_table[table] = rs;
|
||||
|
||||
@@ -2437,9 +2419,6 @@ public:
|
||||
}
|
||||
const auto& tmap = _tm->tablets().get_tablet_map(table);
|
||||
auto [s, rs] = get_schema_and_rs(table);
|
||||
if (s == nullptr || rs == nullptr) {
|
||||
continue;
|
||||
}
|
||||
auto tablet_options = combine_tablet_options(
|
||||
tables | std::views::transform([&] (table_id table) { return _db.get_tables_metadata().get_table_if_exists(table); })
|
||||
| std::views::filter([] (auto t) { return t != nullptr; })
|
||||
@@ -2572,7 +2551,7 @@ public:
|
||||
} else if (table_plan.target_tablet_count_aligned < table_plan.current_tablet_count) {
|
||||
// Needed to avoid oscillations, because we reduce the count by a factor of 2.
|
||||
// FIXME: Once we have a way to split individual tablets, we can achieve exactly the desired tablet count.
|
||||
if (table_plan.tablet_merges_allowed && div_ceil(table_plan.current_tablet_count, 2) >= table_plan.target_tablet_count_aligned) {
|
||||
if (div_ceil(table_plan.current_tablet_count, 2) >= table_plan.target_tablet_count_aligned) {
|
||||
auto& tmap = _tm->tablets().get_tablet_map(table);
|
||||
auto cur_decision = tmap.resize_decision();
|
||||
if (cur_decision.is_merge()) {
|
||||
@@ -2622,6 +2601,21 @@ public:
|
||||
resize_decision new_resize_decision;
|
||||
new_resize_decision.way = table_plan.resize_decision;
|
||||
|
||||
// Block merge decisions for Alternator tablet tables whose
|
||||
// stream configuration forbids merges. Tablet merges produce
|
||||
// 2 parents per child which is incompatible with the DynamoDB
|
||||
// Streams API. If a merge is already in progress on the tmap,
|
||||
// suppressing new_resize_decision here causes the existing
|
||||
// revocation logic in tables_being_resized to cancel the merge.
|
||||
if (new_resize_decision.is_merge()) {
|
||||
auto [s, rs] = get_schema_and_rs(table);
|
||||
if (s->tablet_merges_forbidden()) {
|
||||
lblogger.debug("Table {} ({}.{}): suppressing new merge decision because tablet merges are forbidden",
|
||||
table, s->ks_name(), s->cf_name());
|
||||
new_resize_decision = {};
|
||||
}
|
||||
}
|
||||
|
||||
table_size_desc size_desc {
|
||||
.avg_tablet_size = *table_plan.avg_tablet_size,
|
||||
.resize_decision = tmap.resize_decision(),
|
||||
@@ -3293,10 +3287,6 @@ public:
|
||||
std::unordered_map<sstring, int> rack_load;
|
||||
|
||||
auto rs = get_rs(tablet.table);
|
||||
if (rs == nullptr) {
|
||||
// Table was dropped concurrently. Skip this tablet.
|
||||
return skip_info{};
|
||||
}
|
||||
|
||||
auto get_viable_targets = [&] () {
|
||||
std::unordered_set<host_id> viable_targets;
|
||||
|
||||
@@ -4237,7 +4237,6 @@ public:
|
||||
, _topology_cmd_rpc_tracker(topology_cmd_rpc_tracker)
|
||||
, _async_gate("topology_coordinator")
|
||||
{
|
||||
_lifecycle_notifier.register_subscriber(this);
|
||||
_db.get_notifier().register_listener(this);
|
||||
// When the delay_cdc_stream_finalization error injection is disabled
|
||||
// (test releases it), wake the topology coordinator so it retries
|
||||
@@ -4401,7 +4400,6 @@ future<bool> topology_coordinator::maybe_retry_failed_rf_change_tablet_rebuilds(
|
||||
}
|
||||
|
||||
future<> topology_coordinator::refresh_tablet_load_stats() {
|
||||
co_await utils::get_local_injector().inject("refresh_tablet_load_stats_pause", utils::wait_for_message(5min));
|
||||
auto tm = get_token_metadata_ptr();
|
||||
|
||||
locator::load_stats stats;
|
||||
@@ -4725,6 +4723,7 @@ future<> topology_coordinator::run() {
|
||||
|
||||
co_await _async_gate.close();
|
||||
co_await std::move(tablet_load_stats_refresher);
|
||||
co_await _tablet_load_stats_refresh.join();
|
||||
co_await std::move(cdc_generation_publisher);
|
||||
co_await std::move(cdc_streams_gc);
|
||||
co_await std::move(gossiper_orphan_remover);
|
||||
@@ -4737,8 +4736,6 @@ future<> topology_coordinator::stop() {
|
||||
co_await _db.get_notifier().unregister_listener(this);
|
||||
utils::get_local_injector().unregister_on_disable("delay_cdc_stream_finalization");
|
||||
_topo_sm.on_tablet_split_ready = nullptr;
|
||||
co_await _lifecycle_notifier.unregister_subscriber(this);
|
||||
co_await _tablet_load_stats_refresh.join();
|
||||
|
||||
// if topology_coordinator::run() is aborted either because we are not a
|
||||
// leader anymore, or we are shutting down as a leader, we have to handle
|
||||
@@ -4800,6 +4797,7 @@ future<> run_topology_coordinator(
|
||||
topology_cmd_rpc_tracker};
|
||||
|
||||
std::exception_ptr ex;
|
||||
lifecycle_notifier.register_subscriber(&coordinator);
|
||||
try {
|
||||
rtlogger.info("start topology coordinator fiber");
|
||||
co_await with_scheduling_group(group0.get_scheduling_group(), [&] {
|
||||
@@ -4820,7 +4818,7 @@ future<> run_topology_coordinator(
|
||||
}
|
||||
on_fatal_internal_error(rtlogger, format("unhandled exception in topology_coordinator::run: {}", ex));
|
||||
}
|
||||
co_await utils::get_local_injector().inject("topology_coordinator_pause_before_stop", utils::wait_for_message(5min));
|
||||
co_await lifecycle_notifier.unregister_subscriber(&coordinator);
|
||||
co_await coordinator.stop();
|
||||
}
|
||||
|
||||
|
||||
@@ -502,7 +502,7 @@ public:
|
||||
}
|
||||
if (_row_start != _partition_end) {
|
||||
on_internal_error(sstlog, format(
|
||||
"partition_reversing_data_source: invariant broken: _row_start({}) == _row_end({}), but"
|
||||
"partition_reversing_data_source: invariant broken: _row_start == _row_end({}), but"
|
||||
" != _partition_end({})", _row_start, _row_end, _partition_end));
|
||||
}
|
||||
look_in_last_block = true;
|
||||
|
||||
@@ -505,7 +505,7 @@ public:
|
||||
return consume_range_tombstone_boundary(std::move(pos), end_tombstone, start_tombstone);
|
||||
}
|
||||
default:
|
||||
on_parse_error(format("Invalid boundary type {}", static_cast<std::underlying_type<sstables::bound_kind_m>::type>(kind)), _sst->get_filename());
|
||||
on_parse_error(format("Invalid boundary type", static_cast<std::underlying_type<sstables::bound_kind_m>::type>(kind)), _sst->get_filename());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2221,7 +2221,7 @@ public:
|
||||
case bound_kind_m::excl_end_incl_start:
|
||||
return consume_range_tombstone(ecp, bound_kind::incl_start, start_tombstone);
|
||||
default:
|
||||
on_parse_error(format("Invalid boundary type {}", static_cast<std::underlying_type_t<bound_kind_m>>(kind)), {});
|
||||
on_parse_error(format("Invalid boundary type", static_cast<std::underlying_type_t<bound_kind_m>>(kind)), {});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -543,16 +543,11 @@ future<> filesystem_storage::wipe(const sstable& sst, sync_dir sync) noexcept {
|
||||
// during SSTable writing and removed before sealing. If the write
|
||||
// failed before sealing, the file may still be on disk and must be
|
||||
// cleaned up explicitly.
|
||||
// The component is only defined for the `ms` sstable format; for
|
||||
// older formats it is absent from the component map and looking up
|
||||
// its filename would throw std::out_of_range.
|
||||
// Use file_exists() to avoid a C++ exception on the common path
|
||||
// where the file was already removed before sealing.
|
||||
if (sstable_version_constants::get_component_map(sst.get_version()).contains(component_type::TemporaryHashes)) {
|
||||
auto temp_hashes = filename(sst, dir_name.native(), sst._generation, component_type::TemporaryHashes);
|
||||
if (co_await file_exists(temp_hashes)) {
|
||||
co_await sst.sstable_write_io_check(remove_file, std::move(temp_hashes));
|
||||
}
|
||||
auto temp_hashes = filename(sst, dir_name.native(), sst._generation, component_type::TemporaryHashes);
|
||||
if (co_await file_exists(temp_hashes)) {
|
||||
co_await sst.sstable_write_io_check(remove_file, std::move(temp_hashes));
|
||||
}
|
||||
if (sync) {
|
||||
co_await sst.sstable_write_io_check(sync_directory, dir_name.native());
|
||||
|
||||
@@ -32,7 +32,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/thread.hh>
|
||||
#include <seastar/util/log.hh>
|
||||
#include <map>
|
||||
#include <set>
|
||||
@@ -255,7 +254,6 @@ inline void trie_writer<Output>::lay_out_children(ptr<writer_node> x) {
|
||||
}
|
||||
|
||||
while (unwritten_children.size()) {
|
||||
seastar::thread::maybe_yield();
|
||||
// Find the smallest child which doesn't fit.
|
||||
// (If all fit, then this will be the past-the-end iterator).
|
||||
// Its predecessor will be the biggest child which does fit.
|
||||
@@ -352,7 +350,6 @@ template <trie_writer_sink Output>
|
||||
inline void trie_writer<Output>::complete_until_depth(size_t depth) {
|
||||
expensive_log("writer_node::complete_until_depth: start,_stack={}, depth={}, _current_depth={}", _stack.size(), depth, _current_depth);
|
||||
while (_current_depth > depth) {
|
||||
seastar::thread::maybe_yield();
|
||||
// Every node must be smaller than a page, and the transition chain
|
||||
// must be short enough to ensure that.
|
||||
//
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
|
||||
#include "cql3/statements/property_definitions.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/coroutine/parallel_for_each.hh>
|
||||
#include "table_helper.hh"
|
||||
@@ -136,32 +135,10 @@ future<> table_helper::cache_table_info(cql3::query_processor& qp, service::migr
|
||||
}
|
||||
|
||||
future<> table_helper::insert(cql3::query_processor& qp, service::migration_manager& mm, service::query_state& qs, noncopyable_function<cql3::query_options ()> opt_maker) {
|
||||
// _prepared_stmt is a checked_weak_ptr into the prepared statements
|
||||
// cache and can be invalidated by a concurrent purge (e.g. on a schema
|
||||
// change). cache_table_info() (re-)prepares and assigns _prepared_stmt,
|
||||
// but the pin protecting the entry is dropped when try_prepare()
|
||||
// returns. In release the chain of ready-future co_awaits back to here
|
||||
// resumes synchronously, but debug builds preempt on every co_await
|
||||
// even for ready futures, opening a window for a purge to drop the
|
||||
// entry and leave _prepared_stmt null. Loop until a synchronous
|
||||
// post-resume check finds _prepared_stmt valid; nothing can run between
|
||||
// that check and the dereference below. _insert_stmt is a strong
|
||||
// shared_ptr and is not affected by cache invalidation.
|
||||
while (true) {
|
||||
co_await cache_table_info(qp, mm, qs);
|
||||
if (_prepared_stmt) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Pin a strong ref locally: while we suspend in execute(), a concurrent
|
||||
// insert() on this shard may reset _insert_stmt to nullptr if the
|
||||
// prepared_statements_cache entry gets invalidated, freeing the object.
|
||||
auto stmt = _insert_stmt;
|
||||
co_await cache_table_info(qp, mm, qs);
|
||||
auto opts = opt_maker();
|
||||
opts.prepare(_prepared_stmt->bound_names);
|
||||
co_await utils::get_local_injector().inject("table_helper_insert_before_execute",
|
||||
utils::wait_for_message(std::chrono::seconds{30}));
|
||||
co_await stmt->execute(qp, qs, opts, std::nullopt);
|
||||
co_await _insert_stmt->execute(qp, qs, opts, std::nullopt);
|
||||
}
|
||||
|
||||
future<> table_helper::setup_keyspace(cql3::query_processor& qp, service::migration_manager& mm, std::string_view keyspace_name, sstring replication_strategy_name,
|
||||
|
||||
76
test.py
76
test.py
@@ -11,11 +11,9 @@ from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import dataclasses
|
||||
import math
|
||||
import shlex
|
||||
import textwrap
|
||||
from bisect import insort
|
||||
from random import randint
|
||||
|
||||
import pytest
|
||||
@@ -185,8 +183,6 @@ def parse_cmd_line() -> argparse.Namespace:
|
||||
help="Specific byte limit for failure injection (random by default)")
|
||||
parser.add_argument('--skip-internet-dependent-tests', action="store_true",
|
||||
help="Skip tests which depend on artifacts from the internet.")
|
||||
parser.add_argument('--keep-duplicates', action='store_true', default=False,
|
||||
help="Do not deduplicate test arguments.")
|
||||
parser.add_argument("--pytest-arg", action='store', type=str,
|
||||
default=None, dest="pytest_arg",
|
||||
help="Additional command line arguments to pass to pytest, for example ./test.py --pytest-arg=\"-v -x\"")
|
||||
@@ -245,73 +241,6 @@ def parse_cmd_line() -> argparse.Namespace:
|
||||
return args
|
||||
|
||||
|
||||
# TODO: Remove _CollectionArgument and _deduplicate_test_args once we update
|
||||
# to pytest 9.x, which fixes argument deduplication:
|
||||
# https://github.com/pytest-dev/pytest/issues/12083
|
||||
@dataclasses.dataclass(frozen=True, order=True)
|
||||
class _CollectionArgument:
|
||||
"""Resolved collection argument for deduplication.
|
||||
|
||||
A version-independent subset of pytest's CollectionArgument that
|
||||
includes the fields needed for normalization (parametrization and
|
||||
original_index were added in pytest 9.0).
|
||||
|
||||
``a in b`` means ``b`` subsumes (contains) ``a``. Adapted from
|
||||
pytest 9.0.3 ``_pytest.main.is_collection_argument_subsumed_by``.
|
||||
"""
|
||||
path: pathlib.Path
|
||||
parts: tuple[str, ...]
|
||||
parametrization: str
|
||||
original_index: int
|
||||
|
||||
def __contains__(self, other: _CollectionArgument) -> bool:
|
||||
if self.path != other.path:
|
||||
return not self.parts and other.path.is_relative_to(self.path)
|
||||
if len(self.parts) > len(other.parts) or other.parts[:len(self.parts)] != self.parts:
|
||||
return False
|
||||
return not self.parametrization or self.parametrization == other.parametrization
|
||||
|
||||
|
||||
def _deduplicate_test_args(args: list[str]) -> list[str]:
|
||||
"""Remove duplicate and subsumed test arguments.
|
||||
|
||||
Resolves and normalizes CLI test arguments, then applies the normalization
|
||||
algorithm from pytest 9.0.3 to remove exact duplicates and arguments whose
|
||||
paths are contained within another argument's path.
|
||||
For example, ``["test/cql", "test/cql/lua_test.cql"]`` becomes ``["test/cql"]``.
|
||||
"""
|
||||
if not args:
|
||||
return args
|
||||
invocation_path = pathlib.Path.cwd()
|
||||
resolved_sorted: list[_CollectionArgument] = []
|
||||
unresolved_indices: set[int] = set()
|
||||
for i, arg in enumerate(args):
|
||||
# Adapted from pytest 9.0.3 _pytest.main.resolve_collection_argument.
|
||||
base, squacket, rest = arg.partition("[")
|
||||
strpath, *parts = base.split("::")
|
||||
fspath = pathlib.Path(os.path.abspath(invocation_path / strpath))
|
||||
if not fspath.exists():
|
||||
# Keep unresolved args — let pytest report the error.
|
||||
unresolved_indices.add(i)
|
||||
continue
|
||||
insort(resolved_sorted, _CollectionArgument(
|
||||
path=fspath,
|
||||
parts=tuple(parts),
|
||||
parametrization=squacket + rest,
|
||||
original_index=i,
|
||||
))
|
||||
|
||||
# Normalize: remove duplicates and subsumed arguments using an O(n log n)
|
||||
# sort-based algorithm adapted from pytest 9.0.3.
|
||||
normalized = resolved_sorted[:1]
|
||||
for ca in resolved_sorted[1:]:
|
||||
if ca not in normalized[-1]:
|
||||
normalized.append(ca)
|
||||
|
||||
kept_indices = {ca.original_index for ca in normalized} | unresolved_indices
|
||||
return [arg for i, arg in enumerate(args) if i in kept_indices]
|
||||
|
||||
|
||||
def run_pytest(options: argparse.Namespace) -> int:
|
||||
# When tests are executed in parallel on different hosts, we need to distinguish results from them.
|
||||
# So HOST_ID needed to not overwrite results from different hosts during Jenkins will copy to one directory.
|
||||
@@ -320,8 +249,7 @@ def run_pytest(options: argparse.Namespace) -> int:
|
||||
|
||||
report_dir = temp_dir / 'report'
|
||||
junit_output_file = report_dir / f'pytest_cpp_{HOST_ID}.xml'
|
||||
files_to_run = options.name if options.keep_duplicates else _deduplicate_test_args(options.name)
|
||||
files_to_run = files_to_run or [str(TOP_SRC_DIR / 'test/')]
|
||||
files_to_run = options.name or [str(TOP_SRC_DIR / 'test/')]
|
||||
args = [
|
||||
'--color=yes',
|
||||
f'--repeat={options.repeat}',
|
||||
@@ -341,8 +269,6 @@ def run_pytest(options: argparse.Namespace) -> int:
|
||||
])
|
||||
if options.verbose:
|
||||
args.append('-v')
|
||||
if options.keep_duplicates:
|
||||
args.append('--keep-duplicates')
|
||||
if options.quiet:
|
||||
args.append('--quiet')
|
||||
args.extend(['-p','no:sugar'])
|
||||
|
||||
@@ -70,6 +70,11 @@ def run_alternator_cmd(pid, dir):
|
||||
# now that this parameter is used also by CQL's per-row TTL.
|
||||
#'--alternator-ttl-period-in-seconds', '0.5',
|
||||
'--alternator-allow-system-table-write=1',
|
||||
# Allow testing experimental features. Following issue #9467, we need
|
||||
# to add here specific experimental features as they are introduced.
|
||||
# We only list here Alternator-specific experimental features - CQL
|
||||
# ones are listed in test/cqlpy/run.py.
|
||||
'--experimental-features=alternator-streams',
|
||||
# this is required by test_streams.py test_parent_filtering and test_get_records_with_alternating_tablets_count
|
||||
# setting the value using scylla_config_temporary won't work, because the value is read
|
||||
# at the start and then periodically with `tablet-load-stats-refresh-interval-in-seconds`
|
||||
|
||||
@@ -476,7 +476,8 @@ def test_audit_query_table_operations(dynamodb, cql, alternator_audit_enabled):
|
||||
# table is pipe-separated "base_table|cdc_table". CL=LOCAL_QUORUM.
|
||||
# Produces 5 audit entries.
|
||||
def test_audit_streams_operations(dynamodb, dynamodbstreams, cql, alternator_audit_enabled):
|
||||
with new_test_table(dynamodb, StreamSpecification={"StreamEnabled": True, "StreamViewType": "NEW_AND_OLD_IMAGES"}, **HASH_ONLY_SCHEMA) as table:
|
||||
# With #23838 open, we will explicitly ask for a table with vnodes.
|
||||
with new_test_table(dynamodb, StreamSpecification={"StreamEnabled": True, "StreamViewType": "NEW_AND_OLD_IMAGES"}, Tags=[{'Key': 'system:initial_tablets', 'Value': 'none'}], **HASH_ONLY_SCHEMA) as table:
|
||||
ks_name = f"alternator_{table.name}"
|
||||
client = table.meta.client
|
||||
# Write data so that stream records exist.
|
||||
|
||||
@@ -15,6 +15,7 @@ extra_scylla_config_options:
|
||||
{
|
||||
experimental_features: [
|
||||
udf,
|
||||
alternator-streams,
|
||||
keyspace-storage-options
|
||||
],
|
||||
alternator_port: 8000,
|
||||
|
||||
@@ -48,8 +48,8 @@ def disable_stream(dynamodbstreams, table):
|
||||
# Wait for the stream to really be disabled. A table may have multiple
|
||||
# historic streams - we need all of them to become DISABLED. One of
|
||||
# them (the current one) may remain DISABLING for some time.
|
||||
exp = time.time() + 60
|
||||
while time.time() < exp:
|
||||
exp = time.process_time() + 60
|
||||
while time.process_time() < exp:
|
||||
streams = dynamodbstreams.list_streams(TableName=table.name)
|
||||
disabled = True
|
||||
for stream in streams['Streams']:
|
||||
@@ -60,7 +60,7 @@ def disable_stream(dynamodbstreams, table):
|
||||
if disabled:
|
||||
print('disabled stream on {}'.format(table.name))
|
||||
return
|
||||
time.sleep(0.1)
|
||||
time.sleep(0.5)
|
||||
pytest.fail("timed out")
|
||||
|
||||
# Cannot use fixtures. Because real dynamodb cannot _remove_ a stream
|
||||
@@ -105,8 +105,8 @@ def create_stream_test_table(dynamodb, StreamViewType=None, Tags=None):
|
||||
raise
|
||||
|
||||
def wait_for_active_stream(dynamodbstreams, table, timeout=60):
|
||||
exp = time.time() + timeout
|
||||
while time.time() < exp:
|
||||
exp = time.process_time() + timeout
|
||||
while time.process_time() < exp:
|
||||
streams = dynamodbstreams.list_streams(TableName=table.name)
|
||||
for stream in streams['Streams']:
|
||||
arn = stream['StreamArn']
|
||||
@@ -2205,6 +2205,7 @@ def test_stream_specification(test_table_stream_with_result, dynamodbstreams):
|
||||
# be missing? Or a "null" JSON type? Or an empty string? This test verifies
|
||||
# that the right answer is that NextShardIterator should be *missing*
|
||||
# (reproduces issue #7237).
|
||||
@pytest.mark.xfail(reason="disabled stream is deleted - issue #7239")
|
||||
def test_streams_closed_read(dynamodb, dynamodbstreams):
|
||||
# This test can't use the shared table test_table_ss_keys_only,
|
||||
# because it wants to disable streaming, so let's create a new table:
|
||||
@@ -2257,6 +2258,7 @@ def test_streams_closed_read(dynamodb, dynamodbstreams):
|
||||
# listed for the table, this ARN should continue to work, listing the
|
||||
# stream's shards should give an indication that they are all closed - but
|
||||
# all these shards should still be readable.
|
||||
@pytest.mark.xfail(reason="disabled stream is deleted - issue #7239")
|
||||
def test_streams_disabled_stream(dynamodb, dynamodbstreams):
|
||||
# This test can't use the shared table test_table_ss_keys_only,
|
||||
# because it wants to disable streaming, so let's create a new table:
|
||||
@@ -2578,70 +2580,3 @@ def test_stream_shard_filtering_missing_shard_id(test_table_ss_keys_only, dynamo
|
||||
# TODO: Can we test shard splitting? (shard splitting
|
||||
# requires the user to - periodically or following shards ending - to call
|
||||
# DescribeStream again. We don't do this in any of our tests.
|
||||
|
||||
# Count the total number of records currently visible on a stream by reading
|
||||
# all shards from the beginning (TRIM_HORIZON).
|
||||
def _count_stream_records(dynamodbstreams, arn):
|
||||
shards = []
|
||||
last_shard_id = None
|
||||
while True:
|
||||
kwargs = {'StreamArn': arn}
|
||||
if last_shard_id:
|
||||
kwargs['ExclusiveStartShardId'] = last_shard_id
|
||||
desc = dynamodbstreams.describe_stream(**kwargs)['StreamDescription']
|
||||
shards.extend(desc['Shards'])
|
||||
last_shard_id = desc.get('LastEvaluatedShardId')
|
||||
if not last_shard_id:
|
||||
break
|
||||
nrecords = 0
|
||||
for shard in shards:
|
||||
it = dynamodbstreams.get_shard_iterator(StreamArn=arn,
|
||||
ShardId=shard['ShardId'], ShardIteratorType='TRIM_HORIZON')['ShardIterator']
|
||||
while it:
|
||||
response = dynamodbstreams.get_records(ShardIterator=it)
|
||||
nrecords += len(response.get('Records', []))
|
||||
it = response.get('NextShardIterator')
|
||||
if not response.get('Records'):
|
||||
break
|
||||
return nrecords
|
||||
|
||||
def _wait_for_stream_records(dynamodbstreams, arn, timeout=15):
|
||||
"""Poll until at least one stream record is visible."""
|
||||
deadline = time.time() + timeout
|
||||
while time.time() < deadline:
|
||||
if _count_stream_records(dynamodbstreams, arn) > 0:
|
||||
return
|
||||
time.sleep(0.1)
|
||||
pytest.fail(f"Timed out waiting for stream records on {arn}")
|
||||
|
||||
# Test that after disabling and re-enabling a stream on a table, the old
|
||||
# stream data remains readable through the old ARN. In DynamoDB, it
|
||||
# remains readable for 24 hours. In Scylla, it is currently purged upon
|
||||
# re-enabling.
|
||||
@pytest.mark.xfail(reason="Scylla purges old stream data on re-enable "
|
||||
"instead of keeping it readable for 24h - SCYLLADB-1873")
|
||||
def test_streams_reenable(dynamodb, dynamodbstreams):
|
||||
with create_stream_test_table(dynamodb, StreamViewType='KEYS_ONLY') as table:
|
||||
(arn1, label1) = wait_for_active_stream(dynamodbstreams, table)
|
||||
|
||||
# Write some data while the first stream is active
|
||||
p = random_string()
|
||||
table.update_item(Key={'p': p, 'c': random_string()},
|
||||
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
|
||||
|
||||
_wait_for_stream_records(dynamodbstreams, arn1)
|
||||
|
||||
disable_stream(dynamodbstreams, table)
|
||||
|
||||
# Re-enable the stream
|
||||
table.update(StreamSpecification={'StreamEnabled': True, 'StreamViewType': 'KEYS_ONLY'})
|
||||
(arn2, label2) = wait_for_active_stream(dynamodbstreams, table)
|
||||
|
||||
# The new ARN must differ from the old one
|
||||
assert arn1 != arn2
|
||||
|
||||
# The new stream should have no old data.
|
||||
assert _count_stream_records(dynamodbstreams, arn2) == 0
|
||||
|
||||
# The old stream data should still be readable via the old ARN.
|
||||
assert _count_stream_records(dynamodbstreams, arn1) > 0
|
||||
|
||||
@@ -656,6 +656,12 @@ def test_ttl_expiration_lsi_key(dynamodb, waits_for_expiration):
|
||||
# content), and a special userIdentity flag saying that this is not a regular
|
||||
# REMOVE but an expiration. Reproduces issue #11523.
|
||||
def test_ttl_expiration_streams(dynamodb, dynamodbstreams, waits_for_expiration):
|
||||
# Alternator Streams currently doesn't work with tablets, so until
|
||||
# #23838 is solved, skip this test on tablets.
|
||||
for tag in TAGS:
|
||||
if tag['Key'] == 'system:initial_tablets' and tag['Value'].isdigit():
|
||||
skip_bug("Streams test skipped on tablets due to #23838")
|
||||
|
||||
# In my experiments, a 30-minute (1800 seconds) is the typical
|
||||
# expiration delay in this test. If the test doesn't finish within
|
||||
# max_duration, we report a failure.
|
||||
|
||||
@@ -150,8 +150,6 @@ add_scylla_test(lister_test
|
||||
KIND SEASTAR)
|
||||
add_scylla_test(locator_topology_test
|
||||
KIND SEASTAR)
|
||||
add_scylla_test(lock_tables_metadata_test
|
||||
KIND SEASTAR)
|
||||
add_scylla_test(log_heap_test
|
||||
KIND BOOST)
|
||||
add_scylla_test(logalloc_standard_allocator_segment_pool_backend_test
|
||||
@@ -325,7 +323,6 @@ add_scylla_test(combined_tests
|
||||
auth_cache_test.cc
|
||||
auth_test.cc
|
||||
batchlog_manager_test.cc
|
||||
table_helper_test.cc
|
||||
cache_algorithm_test.cc
|
||||
castas_fcts_test.cc
|
||||
cdc_test.cc
|
||||
@@ -377,7 +374,7 @@ add_scylla_test(combined_tests
|
||||
sstable_compression_config_test.cc
|
||||
sstable_directory_test.cc
|
||||
sstable_set_test.cc
|
||||
sstable_tablet_streaming_test.cc
|
||||
sstable_tablet_streaming.cc
|
||||
statement_restrictions_test.cc
|
||||
storage_proxy_test.cc
|
||||
tablets_test.cc
|
||||
|
||||
@@ -122,7 +122,7 @@ SEASTAR_TEST_CASE(test_reclaimed_bloom_filter_deletion_from_disk) {
|
||||
|
||||
auto mut1 = mutation(s, pks[0]);
|
||||
mut1.partition().apply_insert(*s, ss.make_ckey(0), ss.new_timestamp());
|
||||
auto sst = make_sstable_containing(env.make_sstable(s), {std::move(mut1)}).get();
|
||||
auto sst = make_sstable_containing(env.make_sstable(s), {std::move(mut1)});
|
||||
auto sst_test = sstables::test(sst);
|
||||
|
||||
const auto filter_path = (env.tempdir().path() / sst_test.filename(component_type::Filter)).native();
|
||||
@@ -269,7 +269,7 @@ SEASTAR_TEST_CASE(test_bloom_filter_reload_after_unlink) {
|
||||
mut.partition().apply_insert(*schema, ss.make_ckey(1), ss.new_timestamp());
|
||||
|
||||
// bloom filter will be reclaimed automatically due to low memory
|
||||
auto sst = make_sstable_containing(env.make_sstable(schema), {mut}).get();
|
||||
auto sst = make_sstable_containing(env.make_sstable(schema), {mut});
|
||||
auto& sst_mgr = env.manager();
|
||||
BOOST_REQUIRE_EQUAL(sst->filter_memory_size(), 0);
|
||||
|
||||
@@ -325,7 +325,7 @@ SEASTAR_TEST_CASE(test_bloom_filter_reclaim_after_unlink) {
|
||||
}
|
||||
|
||||
// create one sst; there is sufficient memory for the bloom filter, so it won't be reclaimed
|
||||
auto sst1 = make_sstable_containing(env.make_sstable(schema), mutations).get();
|
||||
auto sst1 = make_sstable_containing(env.make_sstable(schema), mutations);
|
||||
auto& sst_mgr = env.manager();
|
||||
auto sst1_filename = sst1->get_filename();
|
||||
BOOST_REQUIRE(sst1->filter_memory_size() != 0);
|
||||
@@ -358,7 +358,7 @@ SEASTAR_TEST_CASE(test_bloom_filter_reclaim_after_unlink) {
|
||||
|
||||
// create another sst and unlink it to trigger reload of components.
|
||||
// the reload should not attempt to load sst'1 bloom filter into memory depsite its presence in the _active list.
|
||||
auto sst2 = make_sstable_containing(env.make_sstable(schema), {mutations[0]}).get();
|
||||
auto sst2 = make_sstable_containing(env.make_sstable(schema), {mutations[0]});
|
||||
sst2->unlink().get();
|
||||
sst2.release();
|
||||
|
||||
|
||||
@@ -297,10 +297,11 @@ SEASTAR_THREAD_TEST_CASE(test_permissions_of_cdc_description) {
|
||||
BOOST_REQUIRE_THROW(e.execute_cql(stmt).get(), exceptions::unauthorized_exception);
|
||||
};
|
||||
|
||||
const std::string generations_v2 = "system_distributed_everywhere.cdc_generation_descriptions_v2";
|
||||
const std::string streams = "system_distributed.cdc_streams_descriptions_v2";
|
||||
const std::string timestamps = "system_distributed.cdc_generation_timestamps";
|
||||
|
||||
for (auto& t : {streams, timestamps}) {
|
||||
for (auto& t : {generations_v2, streams, timestamps}) {
|
||||
auto dot_pos = t.find_first_of('.');
|
||||
SCYLLA_ASSERT(dot_pos != std::string_view::npos && dot_pos != 0 && dot_pos != t.size() - 1);
|
||||
BOOST_REQUIRE(e.local_db().has_schema(t.substr(0, dot_pos), t.substr(dot_pos + 1)));
|
||||
@@ -316,15 +317,18 @@ SEASTAR_THREAD_TEST_CASE(test_permissions_of_cdc_description) {
|
||||
for (auto& t : {streams}) {
|
||||
assert_unauthorized(seastar::format("ALTER TABLE {} ALTER time TYPE blob", t));
|
||||
}
|
||||
assert_unauthorized(seastar::format("ALTER TABLE {} ALTER id TYPE blob", generations_v2));
|
||||
assert_unauthorized(seastar::format("ALTER TABLE {} ALTER key TYPE blob", timestamps));
|
||||
|
||||
// Allow DELETE
|
||||
for (auto& t : {streams}) {
|
||||
e.execute_cql(seastar::format("DELETE FROM {} WHERE time = toTimeStamp(now())", t)).get();
|
||||
}
|
||||
e.execute_cql(seastar::format("DELETE FROM {} WHERE id = uuid()", generations_v2)).get();
|
||||
e.execute_cql(seastar::format("DELETE FROM {} WHERE key = 'timestamps'", timestamps)).get();
|
||||
|
||||
// Allow UPDATE, INSERT
|
||||
e.execute_cql(seastar::format("INSERT INTO {} (id, range_end) VALUES (uuid(), 0)", generations_v2)).get();
|
||||
e.execute_cql(seastar::format("INSERT INTO {} (time, range_end) VALUES (toTimeStamp(now()), 0)", streams)).get();
|
||||
e.execute_cql(seastar::format("UPDATE {} SET expired = toTimeStamp(now()) WHERE key = 'timestamps' AND time = toTimeStamp(now())", timestamps)).get();
|
||||
}).get();
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user