mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-12 19:02:12 +00:00
Compare commits
15 Commits
scylladb_1
...
ykaul/comp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
339f1ae1a0 | ||
|
|
07d69aa8fa | ||
|
|
c50bfb995b | ||
|
|
e7dbccbdcd | ||
|
|
faa2f8ba76 | ||
|
|
7aca42aa31 | ||
|
|
92e0597807 | ||
|
|
0798c112d0 | ||
|
|
9650390482 | ||
|
|
a1e8ef8d6e | ||
|
|
ea00cfad3d | ||
|
|
0fd89d77b3 | ||
|
|
361a717d89 | ||
|
|
9df4fc3e2f | ||
|
|
d1b4fd5683 |
4
.github/CODEOWNERS
vendored
4
.github/CODEOWNERS
vendored
@@ -32,8 +32,8 @@ counters* @nuivall
|
||||
tests/counter_test* @nuivall
|
||||
|
||||
# DOCS
|
||||
/docs/ @annastuchlik @tzach
|
||||
/docs/alternator/ @annastuchlik @tzach @nyh
|
||||
docs/* @annastuchlik @tzach
|
||||
docs/alternator @annastuchlik @tzach @nyh
|
||||
|
||||
# GOSSIP
|
||||
gms/* @tgrabiec @asias @kbr-scylla
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -36,6 +36,4 @@ compile_commands.json
|
||||
clang_build
|
||||
.idea/
|
||||
nuke
|
||||
rust/**/target
|
||||
rust/**/Cargo.lock
|
||||
test/resource/wasm/rust/target
|
||||
rust/target
|
||||
|
||||
@@ -234,11 +234,15 @@ generate_scylla_version()
|
||||
|
||||
option(Scylla_USE_PRECOMPILED_HEADER "Use precompiled header for Scylla" ON)
|
||||
add_library(scylla-precompiled-header STATIC exported_templates.cc)
|
||||
target_include_directories(scylla-precompiled-header PRIVATE
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}"
|
||||
"${scylla_gen_build_dir}")
|
||||
target_link_libraries(scylla-precompiled-header PRIVATE
|
||||
absl::headers
|
||||
absl::btree
|
||||
absl::hash
|
||||
absl::raw_hash_set
|
||||
idl
|
||||
Seastar::seastar
|
||||
Snappy::snappy
|
||||
systemd
|
||||
|
||||
@@ -78,7 +78,7 @@ fi
|
||||
|
||||
# Default scylla product/version tags
|
||||
PRODUCT=scylla
|
||||
VERSION=2026.3.0-dev
|
||||
VERSION=2026.2.0-dev
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -681,7 +681,7 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
|
||||
case parsed::primitive_condition::type::VALUE:
|
||||
if (calculated_values.size() != 1) {
|
||||
// Shouldn't happen unless we have a bug in the parser
|
||||
throw std::logic_error(format("Unexpected values {} in primitive_condition", cond._values.size()));
|
||||
throw std::logic_error(format("Unexpected values in primitive_condition", cond._values.size()));
|
||||
}
|
||||
// Unwrap the boolean wrapped as the value (if it is a boolean)
|
||||
if (calculated_values[0].IsObject() && calculated_values[0].MemberCount() == 1) {
|
||||
|
||||
@@ -1362,33 +1362,6 @@ static int get_dimensions(const rjson::value& vector_attribute, std::string_view
|
||||
return dimensions_v->GetInt();
|
||||
}
|
||||
|
||||
// As noted in issue #5052, in Alternator the CreateTable and UpdateTable are
|
||||
// currently synchronous - they return only after the operation is complete.
|
||||
// After announce() of the new schema finished, the schema change is committed
|
||||
// and a majority of nodes know it - but it's possible that some live nodes
|
||||
// have not yet applied the new schema. If we return to the user now, and the
|
||||
// user sends a node request that relies on the new schema, it might fail.
|
||||
// So before returning, we must verify that *all* nodes have applied the new
|
||||
// schema. This is what wait_for_schema_agreement_after_ddl() does.
|
||||
//
|
||||
// Note that wait_for_schema_agreement_after_ddl() has a timeout (currently
|
||||
// hard-coded to 30 seconds). If the timeout is reached an InternalServerError
|
||||
// is returned. The user, who doesn't know if the CreateTable succeeded or not,
|
||||
// can retry the request and will get a ResourceInUseException and know the
|
||||
// table already exists. So a CreateTable that returns a ResourceInUseException
|
||||
// should also call wait_for_schema_agreement_after_ddl().
|
||||
//
|
||||
// When issue #5052 is resolved, this function can be removed - we will need
|
||||
// to check if we reached schema agreement, but not to *wait* for it.
|
||||
static future<> wait_for_schema_agreement_after_ddl(service::migration_manager& mm, const replica::database& db) {
|
||||
static constexpr auto schema_agreement_seconds = 30;
|
||||
try {
|
||||
co_await mm.wait_for_schema_agreement(db, db::timeout_clock::now() + std::chrono::seconds(schema_agreement_seconds), nullptr);
|
||||
} catch (const service::migration_manager::schema_agreement_timeout&) {
|
||||
throw api_error::internal(fmt::format("The operation was successful, but unable to confirm cluster-wide schema agreement after {} seconds. Please retry the operation, and wait for the retry to report an error since the operation was already done.", schema_agreement_seconds));
|
||||
}
|
||||
}
|
||||
|
||||
future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization,
|
||||
const db::tablets_mode_t::mode tablets_mode, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
|
||||
throwing_assert(this_shard_id() == 0);
|
||||
@@ -1722,26 +1695,13 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
|
||||
}
|
||||
}
|
||||
}
|
||||
bool table_already_exists = false;
|
||||
try {
|
||||
schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
|
||||
} catch (exceptions::already_exists_exception&) {
|
||||
if (_proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
|
||||
table_already_exists = true;
|
||||
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
|
||||
}
|
||||
}
|
||||
if (table_already_exists) {
|
||||
// The user may have retried a CreateTable operation after it timed
|
||||
// out in wait_for_schema_agreement_after_ddl(). So before we may
|
||||
// return ResourceInUseException (which can lead the user to start
|
||||
// using the table which it now knows exists), we need to wait for
|
||||
// schema agreement, just like the original CreateTable did. Again
|
||||
// we fail with InternalServerError if schema agreement still cannot
|
||||
// be reached. We can release group0_guard before waiting.
|
||||
release_guard(std::move(group0_guard));
|
||||
co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
|
||||
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
|
||||
}
|
||||
if (_proxy.data_dictionary().try_find_table(schema->id())) {
|
||||
// This should never happen, the ID is supposed to be unique
|
||||
co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
|
||||
@@ -1790,7 +1750,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
|
||||
}
|
||||
}
|
||||
|
||||
co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
|
||||
co_await _mm.wait_for_schema_agreement(_proxy.local_db(), db::timeout_clock::now() + 10s, nullptr);
|
||||
rjson::value status = rjson::empty_object();
|
||||
executor::supplement_table_info(request, *schema, _proxy);
|
||||
rjson::add(status, "TableDescription", std::move(request));
|
||||
@@ -1900,7 +1860,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
|
||||
if (stream_specification && stream_specification->IsObject()) {
|
||||
empty_request = false;
|
||||
if (add_stream_options(*stream_specification, builder, p.local(), tab->cdc_options())) {
|
||||
if (add_stream_options(*stream_specification, builder, p.local())) {
|
||||
validate_cdc_log_name_length(builder.cf_name());
|
||||
// On tablet tables, defer stream enablement and block
|
||||
// tablet merges (see defer_enabling_streams_block_tablet_merges).
|
||||
@@ -1915,23 +1875,6 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
if (tab->cdc_options().enabled() || tab->cdc_options().enable_requested()) {
|
||||
co_return api_error::validation("Table already has an enabled stream: TableName: " + tab->cf_name());
|
||||
}
|
||||
// When re-enabling streams on an Alternator table, drop the old
|
||||
// CDC log table first as a separate schema change, so the
|
||||
// subsequent UpdateTable creates a fresh one with a new UUID
|
||||
// (= new StreamArn). See #7239.
|
||||
auto logname = cdc::log_name(tab->cf_name());
|
||||
auto& local_db = p.local().local_db();
|
||||
if (local_db.has_schema(tab->ks_name(), logname)
|
||||
&& cdc::is_log_schema(*local_db.find_schema(tab->ks_name(), logname))) {
|
||||
auto drop_m = co_await service::prepare_column_family_drop_announcement(
|
||||
p.local(), tab->ks_name(), logname,
|
||||
group0_guard.write_timestamp());
|
||||
co_await mm.announce(std::move(drop_m), std::move(group0_guard),
|
||||
format("alternator-executor: drop old CDC log for {}", tab->cf_name()));
|
||||
co_await mm.wait_for_schema_agreement(
|
||||
p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if (!tab->cdc_options().enabled() && !tab->cdc_options().enable_requested()) {
|
||||
co_return api_error::validation("Table has no stream to disable: TableName: " + tab->cf_name());
|
||||
@@ -1949,7 +1892,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
}
|
||||
if (vector_index_updates->Size() > 1) {
|
||||
// VectorIndexUpdates mirrors GlobalSecondaryIndexUpdates.
|
||||
// Since DynamoDB artificially limits the latter to just a
|
||||
// Since DynamoDB artificially limits the latter to just a
|
||||
// single operation (one Create or one Delete), we also
|
||||
// place the same artificial limit on VectorIndexUpdates,
|
||||
// and throw the same LimitExceeded error if the client
|
||||
@@ -2246,7 +2189,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
throw;
|
||||
}
|
||||
}
|
||||
co_await wait_for_schema_agreement_after_ddl(mm, p.local().local_db());
|
||||
co_await mm.wait_for_schema_agreement(p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
|
||||
|
||||
rjson::value status = rjson::empty_object();
|
||||
supplement_table_info(request, *schema, p.local());
|
||||
|
||||
@@ -30,7 +30,6 @@
|
||||
#include "utils/updateable_value.hh"
|
||||
|
||||
#include "tracing/trace_state.hh"
|
||||
#include "cdc/cdc_options.hh"
|
||||
|
||||
|
||||
namespace db {
|
||||
@@ -200,7 +199,7 @@ private:
|
||||
tracing::trace_state_ptr trace_state, service_permit permit);
|
||||
|
||||
public:
|
||||
static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp, const cdc::options& existing_cdc_opts = {});
|
||||
static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
|
||||
static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
|
||||
static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp);
|
||||
};
|
||||
|
||||
@@ -1354,7 +1354,7 @@ static future<executor::request_return_type> query_vector(
|
||||
std::unordered_set<std::string> used_attribute_values;
|
||||
// Parse the Select parameter and determine which attributes to return.
|
||||
// For a vector index, the default Select is ALL_ATTRIBUTES (full items).
|
||||
// ALL_PROJECTED_ATTRIBUTES is significantly more efficient because it
|
||||
// ALL_PROJECTED_ATTRIBUTES is significantly more efficient because it
|
||||
// returns what the vector store returned without looking up additional
|
||||
// base-table data. Currently only the primary key attributes are projected
|
||||
// but in the future we'll implement projecting additional attributes into
|
||||
|
||||
@@ -167,8 +167,46 @@ static schema_ptr get_schema_from_arn(service::storage_proxy& proxy, const strea
|
||||
}
|
||||
}
|
||||
|
||||
// ShardId. Must be between 28 and 65 characters inclusive.
|
||||
// UUID is 36 bytes as string (including dashes).
|
||||
// Prepend a version/type marker (`S`) -> 37
|
||||
class stream_shard_id : public utils::UUID {
|
||||
public:
|
||||
using UUID = utils::UUID;
|
||||
static constexpr char marker = 'S';
|
||||
|
||||
stream_shard_id() = default;
|
||||
stream_shard_id(const UUID& uuid)
|
||||
: UUID(uuid)
|
||||
{}
|
||||
stream_shard_id(const table_id& tid)
|
||||
: UUID(tid.uuid())
|
||||
{}
|
||||
stream_shard_id(std::string_view v)
|
||||
: UUID(v.substr(1))
|
||||
{
|
||||
if (v[0] != marker) {
|
||||
throw std::invalid_argument(std::string(v));
|
||||
}
|
||||
}
|
||||
friend std::ostream& operator<<(std::ostream& os, const stream_shard_id& arn) {
|
||||
const UUID& uuid = arn;
|
||||
return os << marker << uuid;
|
||||
}
|
||||
friend std::istream& operator>>(std::istream& is, stream_shard_id& arn) {
|
||||
std::string s;
|
||||
is >> s;
|
||||
arn = stream_shard_id(s);
|
||||
return is;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace alternator
|
||||
|
||||
template<typename ValueType>
|
||||
struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_shard_id>
|
||||
: public from_string_helper<ValueType, alternator::stream_shard_id>
|
||||
{};
|
||||
template<typename ValueType>
|
||||
struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_arn>
|
||||
: public from_string_helper<ValueType, alternator::stream_arn>
|
||||
@@ -180,8 +218,7 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
|
||||
_stats.api_operations.list_streams++;
|
||||
|
||||
auto limit = rjson::get_opt<int>(request, "Limit").value_or(100);
|
||||
auto streams_start = rjson::get_opt<stream_arn>(request, "ExclusiveStartStreamArn");
|
||||
|
||||
auto streams_start = rjson::get_opt<stream_shard_id>(request, "ExclusiveStartStreamArn");
|
||||
auto table = find_table(_proxy, request);
|
||||
auto db = _proxy.data_dictionary();
|
||||
|
||||
@@ -207,34 +244,34 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
|
||||
cfs = db.get_tables();
|
||||
}
|
||||
|
||||
// We need to sort the tables to ensure a stable order for paging.
|
||||
// We sort by keyspace and table name, which will also allow us to skip to
|
||||
// the right position by ExclusiveStartStreamArn.
|
||||
auto cmp = [](std::string_view ks1, std::string_view cf1, std::string_view ks2, std::string_view cf2) {
|
||||
return ks1 == ks2 ? cf1 < cf2 : ks1 < ks2;
|
||||
};
|
||||
// # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
|
||||
// generate duplicates in a paged listing here. Can obviously miss things if they
|
||||
// are added between paged calls and end up with a "smaller" UUID/ARN, but that
|
||||
// is to be expected.
|
||||
if (std::cmp_less(limit, cfs.size()) || streams_start) {
|
||||
std::sort(cfs.begin(), cfs.end(),
|
||||
[&cmp](const data_dictionary::table& t1, const data_dictionary::table& t2) {
|
||||
return cmp(t1.schema()->ks_name(), t1.schema()->cf_name(),
|
||||
t2.schema()->ks_name(), t2.schema()->cf_name());
|
||||
});
|
||||
std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
|
||||
return t1.schema()->id().uuid() < t2.schema()->id().uuid();
|
||||
});
|
||||
}
|
||||
|
||||
auto i = cfs.begin();
|
||||
auto e = cfs.end();
|
||||
|
||||
if (streams_start) {
|
||||
i = std::upper_bound(i, e, *streams_start,
|
||||
[&cmp](const stream_arn& arn, const data_dictionary::table& t) {
|
||||
return cmp(arn.keyspace_name(), arn.table_name(),
|
||||
t.schema()->ks_name(), t.schema()->cf_name());
|
||||
});
|
||||
i = std::find_if(i, e, [&](const data_dictionary::table& t) {
|
||||
return t.schema()->id().uuid() == streams_start
|
||||
&& cdc::get_base_table(db.real_database(), *t.schema())
|
||||
&& is_alternator_keyspace(t.schema()->ks_name())
|
||||
;
|
||||
});
|
||||
if (i != e) {
|
||||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
auto ret = rjson::empty_object();
|
||||
auto streams = rjson::empty_array();
|
||||
std::optional<stream_arn> last;
|
||||
std::optional<stream_shard_id> last;
|
||||
|
||||
for (;limit > 0 && i != e; ++i) {
|
||||
auto s = i->schema();
|
||||
@@ -243,29 +280,21 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
|
||||
if (!is_alternator_keyspace(ks_name)) {
|
||||
continue;
|
||||
}
|
||||
// Use get_base_table instead of is_log_for_some_table because the
|
||||
// latter requires CDC to be enabled, but we want to list streams
|
||||
// that have been disabled but whose log table still exists (#7239).
|
||||
if (cdc::get_base_table(db.real_database(), ks_name, cf_name)) {
|
||||
if (cdc::is_log_for_some_table(db.real_database(), ks_name, cf_name)) {
|
||||
rjson::value new_entry = rjson::empty_object();
|
||||
|
||||
last = i->schema()->id();
|
||||
auto arn = stream_arn{ i->schema(), cdc::get_base_table(db.real_database(), *i->schema()) };
|
||||
rjson::add(new_entry, "StreamArn", arn);
|
||||
rjson::add(new_entry, "StreamLabel", rjson::from_string(stream_label(*s)));
|
||||
rjson::add(new_entry, "TableName", rjson::from_string(cdc::base_name(s->cf_name())));
|
||||
rjson::push_back(streams, std::move(new_entry));
|
||||
last = std::move(arn);
|
||||
--limit;
|
||||
}
|
||||
}
|
||||
|
||||
rjson::add(ret, "Streams", std::move(streams));
|
||||
|
||||
// Only emit LastEvaluatedStreamArn when we stopped because we hit the
|
||||
// limit (limit == 0), meaning there may be more streams to list.
|
||||
// If we exhausted all tables naturally (limit > 0), there are no more
|
||||
// streams, so we must not emit a cookie.
|
||||
if (last && limit == 0) {
|
||||
if (last) {
|
||||
rjson::add(ret, "LastEvaluatedStreamArn", *last);
|
||||
}
|
||||
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
|
||||
@@ -395,7 +424,7 @@ std::istream& operator>>(std::istream& is, stream_view_type& type) {
|
||||
return is;
|
||||
}
|
||||
|
||||
static stream_view_type cdc_options_to_stream_view_type(const cdc::options& opts) {
|
||||
static stream_view_type cdc_options_to_steam_view_type(const cdc::options& opts) {
|
||||
stream_view_type type = stream_view_type::KEYS_ONLY;
|
||||
if (opts.preimage() && opts.postimage()) {
|
||||
type = stream_view_type::NEW_AND_OLD_IMAGES;
|
||||
@@ -585,7 +614,7 @@ void stream_id_range::prepare_for_iterating()
|
||||
// the function returns `stream_id_range` that will allow iteration over children Streams shards for the Streams shard `parent`
|
||||
// a child Streams shard is defined as a Streams shard that touches token range that was previously covered by `parent` Streams shard
|
||||
// Streams shard contains a token, that represents end of the token range for that Streams shard (inclusive)
|
||||
// beginning of the token range is defined by previous Streams shard's token + 1
|
||||
// beginning of the token range is defined by previous Streams shard's token + 1
|
||||
// NOTE: With vnodes, ranges of Streams' shards wrap, while with tablets the biggest allowed token number is always a range end.
|
||||
// NOTE: both streams generation are guaranteed to cover whole range and be non-empty
|
||||
// NOTE: it's possible to get more than one stream shard with the same token value (thus some of those stream shards will be empty) -
|
||||
@@ -841,7 +870,6 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
auto& opts = bs->cdc_options();
|
||||
|
||||
auto status = "DISABLED";
|
||||
bool stream_disabled = !opts.enabled();
|
||||
|
||||
if (opts.enabled()) {
|
||||
if (!_cdc_metadata.streams_available()) {
|
||||
@@ -857,7 +885,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
|
||||
rjson::add(stream_desc, "StreamStatus", rjson::from_string(status));
|
||||
|
||||
stream_view_type type = cdc_options_to_stream_view_type(opts);
|
||||
stream_view_type type = cdc_options_to_steam_view_type(opts);
|
||||
|
||||
rjson::add(stream_desc, "StreamArn", stream_arn);
|
||||
rjson::add(stream_desc, "StreamViewType", type);
|
||||
@@ -865,9 +893,10 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
|
||||
describe_key_schema(stream_desc, *bs);
|
||||
|
||||
// For disabled streams, we still fall through to enumerate shards
|
||||
// below. All shards will have EndingSequenceNumber set, indicating
|
||||
// they are closed. See issue #7239.
|
||||
if (!opts.enabled()) {
|
||||
rjson::add(ret, "StreamDescription", std::move(stream_desc));
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
// TODO: label
|
||||
// TODO: creation time
|
||||
@@ -950,12 +979,6 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
|
||||
auto expired = [&]() -> std::optional<db_clock::time_point> {
|
||||
auto j = std::next(i);
|
||||
if (j == e) {
|
||||
// For a disabled stream, all shards are closed (#7239).
|
||||
// Use "now" as the ending sequence number for the last
|
||||
// generation's shards.
|
||||
if (stream_disabled) {
|
||||
return db_clock::now();
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
// add this so we sort of match potential
|
||||
@@ -1306,7 +1329,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
|
||||
| std::ranges::to<query::column_id_vector>()
|
||||
;
|
||||
|
||||
stream_view_type type = cdc_options_to_stream_view_type(base->cdc_options());
|
||||
stream_view_type type = cdc_options_to_steam_view_type(base->cdc_options());
|
||||
|
||||
auto selection = cql3::selection::selection::for_columns(schema, std::move(columns));
|
||||
auto partition_slice = query::partition_slice(
|
||||
@@ -1490,17 +1513,17 @@ future<executor::request_return_type> executor::get_records(client_state& client
|
||||
|
||||
auto& shard = iter.shard;
|
||||
|
||||
if (!base->cdc_options().enabled()) {
|
||||
// Stream is disabled -- all shards are closed (#7239).
|
||||
// Don't return NextShardIterator.
|
||||
} else if (shard.time < ts && ts < high_ts) {
|
||||
if (shard.time < ts && ts < high_ts) {
|
||||
// The DynamoDB documentation states that when a shard is
|
||||
// closed, reading it until the end has NextShardIterator
|
||||
// "set to null". Our test test_streams_closed_read
|
||||
// confirms that by "null" they meant not set at all.
|
||||
} else {
|
||||
// Shard is still open with no records in the scanned window.
|
||||
// Return the original iterator so the client can poll again.
|
||||
// We could have returned the same iterator again, but we did
|
||||
// a search from it until high_ts and found nothing, so we
|
||||
// can also start the next search from high_ts.
|
||||
// TODO: but why? It's simpler just to leave the iterator be.
|
||||
shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
|
||||
rjson::add(ret, "NextShardIterator", iter);
|
||||
}
|
||||
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
@@ -1510,13 +1533,17 @@ future<executor::request_return_type> executor::get_records(client_state& client
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp, const cdc::options& existing_cdc_opts) {
|
||||
bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
|
||||
auto stream_enabled = rjson::find(stream_specification, "StreamEnabled");
|
||||
if (!stream_enabled || !stream_enabled->IsBool()) {
|
||||
throw api_error::validation("StreamSpecification needs boolean StreamEnabled");
|
||||
}
|
||||
|
||||
if (stream_enabled->GetBool()) {
|
||||
if (!sp.features().alternator_streams) {
|
||||
throw api_error::validation("StreamSpecification: alternator streams feature not enabled in cluster.");
|
||||
}
|
||||
|
||||
cdc::options opts;
|
||||
opts.enabled(true);
|
||||
opts.tablet_merge_blocked(true);
|
||||
@@ -1542,13 +1569,8 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
|
||||
builder.with_cdc_options(opts);
|
||||
return true;
|
||||
} else {
|
||||
// When disabling, preserve the existing CDC options (preimage,
|
||||
// postimage, ttl, etc.) so that DescribeStream can still report
|
||||
// the correct StreamViewType on a disabled stream.
|
||||
cdc::options opts = existing_cdc_opts;
|
||||
cdc::options opts;
|
||||
opts.enabled(false);
|
||||
opts.enable_requested(false);
|
||||
opts.tablet_merge_blocked(false);
|
||||
builder.with_cdc_options(opts);
|
||||
return false;
|
||||
}
|
||||
@@ -1556,36 +1578,33 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
|
||||
|
||||
void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp) {
|
||||
auto& opts = schema.cdc_options();
|
||||
// Report stream info when:
|
||||
// 1. Log table exists (covers both enabled and disabled-but-readable).
|
||||
// 2. enable_requested (ENABLING state, log not yet created).
|
||||
auto db = sp.data_dictionary();
|
||||
auto log_name = cdc::log_name(schema.cf_name());
|
||||
auto log_cf = db.try_find_table(schema.ks_name(), log_name);
|
||||
if (log_cf) {
|
||||
auto log_schema = log_cf->schema();
|
||||
stream_arn arn(log_schema, cdc::get_base_table(db.real_database(), *log_schema));
|
||||
if (opts.enabled()) {
|
||||
auto db = sp.data_dictionary();
|
||||
auto cf = db.find_table(schema.ks_name(), cdc::log_name(schema.cf_name()));
|
||||
stream_arn arn(cf.schema(), cdc::get_base_table(db.real_database(), *cf.schema()));
|
||||
rjson::add(descr, "LatestStreamArn", arn);
|
||||
rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*log_schema)));
|
||||
|
||||
auto stream_desc = rjson::empty_object();
|
||||
rjson::add(stream_desc, "StreamEnabled", opts.enabled());
|
||||
|
||||
stream_view_type mode = cdc_options_to_stream_view_type(opts);
|
||||
rjson::add(stream_desc, "StreamViewType", mode);
|
||||
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
|
||||
} else if (opts.enable_requested()) {
|
||||
// DynamoDB returns StreamEnabled=true in StreamSpecification even when
|
||||
// the stream status is ENABLING (not yet fully active). We mirror this
|
||||
// behavior: enable_requested means the user asked for streams but CDC
|
||||
// is not yet finalized, so we still report StreamEnabled=true.
|
||||
auto stream_desc = rjson::empty_object();
|
||||
rjson::add(stream_desc, "StreamEnabled", true);
|
||||
|
||||
stream_view_type mode = cdc_options_to_stream_view_type(opts);
|
||||
rjson::add(stream_desc, "StreamViewType", mode);
|
||||
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
|
||||
rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*cf.schema())));
|
||||
} else if (!opts.enable_requested()) {
|
||||
return;
|
||||
}
|
||||
// For both enabled() and enable_requested():
|
||||
// DynamoDB returns StreamEnabled=true in StreamSpecification even when
|
||||
// the stream status is ENABLING (not yet fully active). We mirror this
|
||||
// behavior: enable_requested means the user asked for streams but CDC
|
||||
// is not yet finalized, so we still report StreamEnabled=true.
|
||||
auto stream_desc = rjson::empty_object();
|
||||
rjson::add(stream_desc, "StreamEnabled", true);
|
||||
|
||||
auto mode = stream_view_type::KEYS_ONLY;
|
||||
if (opts.preimage() && opts.postimage()) {
|
||||
mode = stream_view_type::NEW_AND_OLD_IMAGES;
|
||||
} else if (opts.preimage()) {
|
||||
mode = stream_view_type::OLD_IMAGE;
|
||||
} else if (opts.postimage()) {
|
||||
mode = stream_view_type::NEW_IMAGE;
|
||||
}
|
||||
rjson::add(stream_desc, "StreamViewType", mode);
|
||||
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
|
||||
}
|
||||
|
||||
} // namespace alternator
|
||||
|
||||
@@ -856,9 +856,7 @@ rest_exclude_node(sharded<service::storage_service>& ss, std::unique_ptr<http::r
|
||||
}
|
||||
|
||||
apilog.info("exclude_node: hosts={}", hosts);
|
||||
co_await ss.local().run_with_no_api_lock([hosts = std::move(hosts)] (service::storage_service& ss) {
|
||||
return ss.mark_excluded(hosts);
|
||||
});
|
||||
co_await ss.local().mark_excluded(hosts);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1733,9 +1731,7 @@ rest_create_vnode_tablet_migration(http_context& ctx, sharded<service::storage_s
|
||||
throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
|
||||
}
|
||||
auto keyspace = validate_keyspace(ctx, req);
|
||||
co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
|
||||
return ss.prepare_for_tablets_migration(keyspace);
|
||||
});
|
||||
co_await ss.local().prepare_for_tablets_migration(keyspace);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1747,9 +1743,7 @@ rest_get_vnode_tablet_migration(http_context& ctx, sharded<service::storage_serv
|
||||
throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
|
||||
}
|
||||
auto keyspace = validate_keyspace(ctx, req);
|
||||
auto status = co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
|
||||
return ss.get_tablets_migration_status_with_node_details(keyspace);
|
||||
});
|
||||
auto status = co_await ss.local().get_tablets_migration_status_with_node_details(keyspace);
|
||||
|
||||
ss::vnode_tablet_migration_status result;
|
||||
result.keyspace = status.keyspace;
|
||||
@@ -1774,9 +1768,7 @@ rest_set_vnode_tablet_migration_node_storage_mode(http_context& ctx, sharded<ser
|
||||
}
|
||||
auto mode_str = req->get_query_param("intended_mode");
|
||||
auto mode = service::intended_storage_mode_from_string(mode_str);
|
||||
co_await ss.local().run_with_no_api_lock([mode] (service::storage_service& ss) {
|
||||
return ss.set_node_intended_storage_mode(mode);
|
||||
});
|
||||
co_await ss.local().set_node_intended_storage_mode(mode);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1790,9 +1782,7 @@ rest_finalize_vnode_tablet_migration(http_context& ctx, sharded<service::storage
|
||||
auto keyspace = validate_keyspace(ctx, req);
|
||||
validate_keyspace(ctx, keyspace);
|
||||
|
||||
co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
|
||||
return ss.finalize_tablets_migration(keyspace);
|
||||
});
|
||||
co_await ss.local().finalize_tablets_migration(keyspace);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1869,106 +1859,90 @@ rest_bind(FuncType func, BindArgs&... args) {
|
||||
return std::bind_front(func, std::ref(args)...);
|
||||
}
|
||||
|
||||
// Hold the storage_service async gate for the duration of async REST
|
||||
// handlers so stop() drains in-flight requests before teardown.
|
||||
// Synchronous handlers don't yield and need no gate.
|
||||
static seastar::httpd::future_json_function
|
||||
gated(sharded<service::storage_service>& ss, seastar::httpd::future_json_function fn) {
|
||||
return [fn = std::move(fn), &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
|
||||
auto holder = ss.local().hold_async_gate();
|
||||
co_return co_await fn(std::move(req));
|
||||
};
|
||||
}
|
||||
|
||||
static seastar::httpd::json_request_function
|
||||
gated(sharded<service::storage_service>&, seastar::httpd::json_request_function fn) {
|
||||
return fn;
|
||||
}
|
||||
|
||||
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
|
||||
ss::get_token_endpoint.set(r, gated(ss, rest_bind(rest_get_token_endpoint, ctx, ss)));
|
||||
ss::get_release_version.set(r, gated(ss, rest_bind(rest_get_release_version, ss)));
|
||||
ss::get_scylla_release_version.set(r, gated(ss, rest_bind(rest_get_scylla_release_version, ss)));
|
||||
ss::get_schema_version.set(r, gated(ss, rest_bind(rest_get_schema_version, ss)));
|
||||
ss::get_range_to_endpoint_map.set(r, gated(ss, rest_bind(rest_get_range_to_endpoint_map, ctx, ss)));
|
||||
ss::get_pending_range_to_endpoint_map.set(r, gated(ss, rest_bind(rest_get_pending_range_to_endpoint_map, ctx)));
|
||||
ss::describe_ring.set(r, gated(ss, rest_bind(rest_describe_ring, ctx, ss)));
|
||||
ss::get_current_generation_number.set(r, gated(ss, rest_bind(rest_get_current_generation_number, ss)));
|
||||
ss::get_natural_endpoints.set(r, gated(ss, rest_bind(rest_get_natural_endpoints, ctx, ss)));
|
||||
ss::get_natural_endpoints_v2.set(r, gated(ss, rest_bind(rest_get_natural_endpoints_v2, ctx, ss)));
|
||||
ss::cdc_streams_check_and_repair.set(r, gated(ss, rest_bind(rest_cdc_streams_check_and_repair, ss)));
|
||||
ss::cleanup_all.set(r, gated(ss, rest_bind(rest_cleanup_all, ctx, ss)));
|
||||
ss::reset_cleanup_needed.set(r, gated(ss, rest_bind(rest_reset_cleanup_needed, ctx, ss)));
|
||||
ss::force_flush.set(r, gated(ss, rest_bind(rest_force_flush, ctx)));
|
||||
ss::force_keyspace_flush.set(r, gated(ss, rest_bind(rest_force_keyspace_flush, ctx)));
|
||||
ss::decommission.set(r, gated(ss, rest_bind(rest_decommission, ss, ssc)));
|
||||
ss::logstor_compaction.set(r, gated(ss, rest_bind(rest_logstor_compaction, ctx)));
|
||||
ss::logstor_flush.set(r, gated(ss, rest_bind(rest_logstor_flush, ctx)));
|
||||
ss::move.set(r, gated(ss, rest_bind(rest_move, ss)));
|
||||
ss::remove_node.set(r, gated(ss, rest_bind(rest_remove_node, ss)));
|
||||
ss::exclude_node.set(r, gated(ss, rest_bind(rest_exclude_node, ss)));
|
||||
ss::get_removal_status.set(r, gated(ss, rest_bind(rest_get_removal_status, ss)));
|
||||
ss::force_remove_completion.set(r, gated(ss, rest_bind(rest_force_remove_completion, ss)));
|
||||
ss::set_logging_level.set(r, gated(ss, rest_bind(rest_set_logging_level)));
|
||||
ss::get_logging_levels.set(r, gated(ss, rest_bind(rest_get_logging_levels)));
|
||||
ss::get_operation_mode.set(r, gated(ss, rest_bind(rest_get_operation_mode, ss)));
|
||||
ss::is_starting.set(r, gated(ss, rest_bind(rest_is_starting, ss)));
|
||||
ss::get_drain_progress.set(r, gated(ss, rest_bind(rest_get_drain_progress, ss)));
|
||||
ss::drain.set(r, gated(ss, rest_bind(rest_drain, ss)));
|
||||
ss::stop_gossiping.set(r, gated(ss, rest_bind(rest_stop_gossiping, ss)));
|
||||
ss::start_gossiping.set(r, gated(ss, rest_bind(rest_start_gossiping, ss)));
|
||||
ss::is_gossip_running.set(r, gated(ss, rest_bind(rest_is_gossip_running, ss)));
|
||||
ss::stop_daemon.set(r, gated(ss, rest_bind(rest_stop_daemon)));
|
||||
ss::is_initialized.set(r, gated(ss, rest_bind(rest_is_initialized, ss)));
|
||||
ss::join_ring.set(r, gated(ss, rest_bind(rest_join_ring)));
|
||||
ss::is_joined.set(r, gated(ss, rest_bind(rest_is_joined, ss)));
|
||||
ss::is_incremental_backups_enabled.set(r, gated(ss, rest_bind(rest_is_incremental_backups_enabled, ctx)));
|
||||
ss::set_incremental_backups_enabled.set(r, gated(ss, rest_bind(rest_set_incremental_backups_enabled, ctx)));
|
||||
ss::rebuild.set(r, gated(ss, rest_bind(rest_rebuild, ss)));
|
||||
ss::bulk_load.set(r, gated(ss, rest_bind(rest_bulk_load)));
|
||||
ss::bulk_load_async.set(r, gated(ss, rest_bind(rest_bulk_load_async)));
|
||||
ss::reschedule_failed_deletions.set(r, gated(ss, rest_bind(rest_reschedule_failed_deletions)));
|
||||
ss::sample_key_range.set(r, gated(ss, rest_bind(rest_sample_key_range)));
|
||||
ss::reset_local_schema.set(r, gated(ss, rest_bind(rest_reset_local_schema, ss)));
|
||||
ss::set_trace_probability.set(r, gated(ss, rest_bind(rest_set_trace_probability)));
|
||||
ss::get_trace_probability.set(r, gated(ss, rest_bind(rest_get_trace_probability)));
|
||||
ss::get_slow_query_info.set(r, gated(ss, rest_bind(rest_get_slow_query_info)));
|
||||
ss::set_slow_query.set(r, gated(ss, rest_bind(rest_set_slow_query)));
|
||||
ss::deliver_hints.set(r, gated(ss, rest_bind(rest_deliver_hints)));
|
||||
ss::get_cluster_name.set(r, gated(ss, rest_bind(rest_get_cluster_name, ss)));
|
||||
ss::get_partitioner_name.set(r, gated(ss, rest_bind(rest_get_partitioner_name, ss)));
|
||||
ss::get_tombstone_warn_threshold.set(r, gated(ss, rest_bind(rest_get_tombstone_warn_threshold)));
|
||||
ss::set_tombstone_warn_threshold.set(r, gated(ss, rest_bind(rest_set_tombstone_warn_threshold)));
|
||||
ss::get_tombstone_failure_threshold.set(r, gated(ss, rest_bind(rest_get_tombstone_failure_threshold)));
|
||||
ss::set_tombstone_failure_threshold.set(r, gated(ss, rest_bind(rest_set_tombstone_failure_threshold)));
|
||||
ss::get_batch_size_failure_threshold.set(r, gated(ss, rest_bind(rest_get_batch_size_failure_threshold)));
|
||||
ss::set_batch_size_failure_threshold.set(r, gated(ss, rest_bind(rest_set_batch_size_failure_threshold)));
|
||||
ss::set_hinted_handoff_throttle_in_kb.set(r, gated(ss, rest_bind(rest_set_hinted_handoff_throttle_in_kb)));
|
||||
ss::get_exceptions.set(r, gated(ss, rest_bind(rest_get_exceptions, ss)));
|
||||
ss::get_total_hints_in_progress.set(r, gated(ss, rest_bind(rest_get_total_hints_in_progress)));
|
||||
ss::get_total_hints.set(r, gated(ss, rest_bind(rest_get_total_hints)));
|
||||
ss::get_ownership.set(r, gated(ss, rest_bind(rest_get_ownership, ctx, ss)));
|
||||
ss::get_effective_ownership.set(r, gated(ss, rest_bind(rest_get_effective_ownership, ctx, ss)));
|
||||
ss::retrain_dict.set(r, gated(ss, rest_bind(rest_retrain_dict, ctx, ss, group0_client)));
|
||||
ss::estimate_compression_ratios.set(r, gated(ss, rest_bind(rest_estimate_compression_ratios, ctx, ss)));
|
||||
ss::sstable_info.set(r, gated(ss, rest_bind(rest_sstable_info, ctx)));
|
||||
ss::logstor_info.set(r, gated(ss, rest_bind(rest_logstor_info, ctx)));
|
||||
ss::reload_raft_topology_state.set(r, gated(ss, rest_bind(rest_reload_raft_topology_state, ss, group0_client)));
|
||||
ss::upgrade_to_raft_topology.set(r, gated(ss, rest_bind(rest_upgrade_to_raft_topology, ss)));
|
||||
ss::raft_topology_upgrade_status.set(r, gated(ss, rest_bind(rest_raft_topology_upgrade_status, ss)));
|
||||
ss::raft_topology_get_cmd_status.set(r, gated(ss, rest_bind(rest_raft_topology_get_cmd_status, ss)));
|
||||
ss::move_tablet.set(r, gated(ss, rest_bind(rest_move_tablet, ctx, ss)));
|
||||
ss::add_tablet_replica.set(r, gated(ss, rest_bind(rest_add_tablet_replica, ctx, ss)));
|
||||
ss::del_tablet_replica.set(r, gated(ss, rest_bind(rest_del_tablet_replica, ctx, ss)));
|
||||
ss::repair_tablet.set(r, gated(ss, rest_bind(rest_repair_tablet, ctx, ss)));
|
||||
ss::tablet_balancing_enable.set(r, gated(ss, rest_bind(rest_tablet_balancing_enable, ss)));
|
||||
ss::create_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_create_vnode_tablet_migration, ctx, ss)));
|
||||
ss::get_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_get_vnode_tablet_migration, ctx, ss)));
|
||||
ss::set_vnode_tablet_migration_node_storage_mode.set(r, gated(ss, rest_bind(rest_set_vnode_tablet_migration_node_storage_mode, ctx, ss)));
|
||||
ss::finalize_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_finalize_vnode_tablet_migration, ctx, ss)));
|
||||
ss::quiesce_topology.set(r, gated(ss, rest_bind(rest_quiesce_topology, ss)));
|
||||
sp::get_schema_versions.set(r, gated(ss, rest_bind(rest_get_schema_versions, ss)));
|
||||
ss::drop_quarantined_sstables.set(r, gated(ss, rest_bind(rest_drop_quarantined_sstables, ctx, ss)));
|
||||
ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
|
||||
ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
|
||||
ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
|
||||
ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
|
||||
ss::get_range_to_endpoint_map.set(r, rest_bind(rest_get_range_to_endpoint_map, ctx, ss));
|
||||
ss::get_pending_range_to_endpoint_map.set(r, rest_bind(rest_get_pending_range_to_endpoint_map, ctx));
|
||||
ss::describe_ring.set(r, rest_bind(rest_describe_ring, ctx, ss));
|
||||
ss::get_current_generation_number.set(r, rest_bind(rest_get_current_generation_number, ss));
|
||||
ss::get_natural_endpoints.set(r, rest_bind(rest_get_natural_endpoints, ctx, ss));
|
||||
ss::get_natural_endpoints_v2.set(r, rest_bind(rest_get_natural_endpoints_v2, ctx, ss));
|
||||
ss::cdc_streams_check_and_repair.set(r, rest_bind(rest_cdc_streams_check_and_repair, ss));
|
||||
ss::cleanup_all.set(r, rest_bind(rest_cleanup_all, ctx, ss));
|
||||
ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
|
||||
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
|
||||
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
|
||||
ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
|
||||
ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
|
||||
ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
|
||||
ss::move.set(r, rest_bind(rest_move, ss));
|
||||
ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
|
||||
ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
|
||||
ss::get_removal_status.set(r, rest_bind(rest_get_removal_status, ss));
|
||||
ss::force_remove_completion.set(r, rest_bind(rest_force_remove_completion, ss));
|
||||
ss::set_logging_level.set(r, rest_bind(rest_set_logging_level));
|
||||
ss::get_logging_levels.set(r, rest_bind(rest_get_logging_levels));
|
||||
ss::get_operation_mode.set(r, rest_bind(rest_get_operation_mode, ss));
|
||||
ss::is_starting.set(r, rest_bind(rest_is_starting, ss));
|
||||
ss::get_drain_progress.set(r, rest_bind(rest_get_drain_progress, ss));
|
||||
ss::drain.set(r, rest_bind(rest_drain, ss));
|
||||
ss::stop_gossiping.set(r, rest_bind(rest_stop_gossiping, ss));
|
||||
ss::start_gossiping.set(r, rest_bind(rest_start_gossiping, ss));
|
||||
ss::is_gossip_running.set(r, rest_bind(rest_is_gossip_running, ss));
|
||||
ss::stop_daemon.set(r, rest_bind(rest_stop_daemon));
|
||||
ss::is_initialized.set(r, rest_bind(rest_is_initialized, ss));
|
||||
ss::join_ring.set(r, rest_bind(rest_join_ring));
|
||||
ss::is_joined.set(r, rest_bind(rest_is_joined, ss));
|
||||
ss::is_incremental_backups_enabled.set(r, rest_bind(rest_is_incremental_backups_enabled, ctx));
|
||||
ss::set_incremental_backups_enabled.set(r, rest_bind(rest_set_incremental_backups_enabled, ctx));
|
||||
ss::rebuild.set(r, rest_bind(rest_rebuild, ss));
|
||||
ss::bulk_load.set(r, rest_bind(rest_bulk_load));
|
||||
ss::bulk_load_async.set(r, rest_bind(rest_bulk_load_async));
|
||||
ss::reschedule_failed_deletions.set(r, rest_bind(rest_reschedule_failed_deletions));
|
||||
ss::sample_key_range.set(r, rest_bind(rest_sample_key_range));
|
||||
ss::reset_local_schema.set(r, rest_bind(rest_reset_local_schema, ss));
|
||||
ss::set_trace_probability.set(r, rest_bind(rest_set_trace_probability));
|
||||
ss::get_trace_probability.set(r, rest_bind(rest_get_trace_probability));
|
||||
ss::get_slow_query_info.set(r, rest_bind(rest_get_slow_query_info));
|
||||
ss::set_slow_query.set(r, rest_bind(rest_set_slow_query));
|
||||
ss::deliver_hints.set(r, rest_bind(rest_deliver_hints));
|
||||
ss::get_cluster_name.set(r, rest_bind(rest_get_cluster_name, ss));
|
||||
ss::get_partitioner_name.set(r, rest_bind(rest_get_partitioner_name, ss));
|
||||
ss::get_tombstone_warn_threshold.set(r, rest_bind(rest_get_tombstone_warn_threshold));
|
||||
ss::set_tombstone_warn_threshold.set(r, rest_bind(rest_set_tombstone_warn_threshold));
|
||||
ss::get_tombstone_failure_threshold.set(r, rest_bind(rest_get_tombstone_failure_threshold));
|
||||
ss::set_tombstone_failure_threshold.set(r, rest_bind(rest_set_tombstone_failure_threshold));
|
||||
ss::get_batch_size_failure_threshold.set(r, rest_bind(rest_get_batch_size_failure_threshold));
|
||||
ss::set_batch_size_failure_threshold.set(r, rest_bind(rest_set_batch_size_failure_threshold));
|
||||
ss::set_hinted_handoff_throttle_in_kb.set(r, rest_bind(rest_set_hinted_handoff_throttle_in_kb));
|
||||
ss::get_exceptions.set(r, rest_bind(rest_get_exceptions, ss));
|
||||
ss::get_total_hints_in_progress.set(r, rest_bind(rest_get_total_hints_in_progress));
|
||||
ss::get_total_hints.set(r, rest_bind(rest_get_total_hints));
|
||||
ss::get_ownership.set(r, rest_bind(rest_get_ownership, ctx, ss));
|
||||
ss::get_effective_ownership.set(r, rest_bind(rest_get_effective_ownership, ctx, ss));
|
||||
ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
|
||||
ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
|
||||
ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
|
||||
ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
|
||||
ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
|
||||
ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
|
||||
ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
|
||||
ss::raft_topology_get_cmd_status.set(r, rest_bind(rest_raft_topology_get_cmd_status, ss));
|
||||
ss::move_tablet.set(r, rest_bind(rest_move_tablet, ctx, ss));
|
||||
ss::add_tablet_replica.set(r, rest_bind(rest_add_tablet_replica, ctx, ss));
|
||||
ss::del_tablet_replica.set(r, rest_bind(rest_del_tablet_replica, ctx, ss));
|
||||
ss::repair_tablet.set(r, rest_bind(rest_repair_tablet, ctx, ss));
|
||||
ss::tablet_balancing_enable.set(r, rest_bind(rest_tablet_balancing_enable, ss));
|
||||
ss::create_vnode_tablet_migration.set(r, rest_bind(rest_create_vnode_tablet_migration, ctx, ss));
|
||||
ss::get_vnode_tablet_migration.set(r, rest_bind(rest_get_vnode_tablet_migration, ctx, ss));
|
||||
ss::set_vnode_tablet_migration_node_storage_mode.set(r, rest_bind(rest_set_vnode_tablet_migration_node_storage_mode, ctx, ss));
|
||||
ss::finalize_vnode_tablet_migration.set(r, rest_bind(rest_finalize_vnode_tablet_migration, ctx, ss));
|
||||
ss::quiesce_topology.set(r, rest_bind(rest_quiesce_topology, ss));
|
||||
sp::get_schema_versions.set(r, rest_bind(rest_get_schema_versions, ss));
|
||||
ss::drop_quarantined_sstables.set(r, rest_bind(rest_drop_quarantined_sstables, ctx, ss));
|
||||
}
|
||||
|
||||
void unset_storage_service(http_context& ctx, routes& r) {
|
||||
|
||||
@@ -113,8 +113,8 @@ static category_set parse_audit_categories(const sstring& data) {
|
||||
return result;
|
||||
}
|
||||
|
||||
static audit::audited_tables_t parse_audit_tables(const sstring& data) {
|
||||
audit::audited_tables_t result;
|
||||
static std::map<sstring, std::set<sstring>> parse_audit_tables(const sstring& data) {
|
||||
std::map<sstring, std::set<sstring>> result;
|
||||
if (!data.empty()) {
|
||||
std::vector<sstring> tokens;
|
||||
boost::split(tokens, data, boost::is_any_of(","));
|
||||
@@ -139,8 +139,8 @@ static audit::audited_tables_t parse_audit_tables(const sstring& data) {
|
||||
return result;
|
||||
}
|
||||
|
||||
static audit::audited_keyspaces_t parse_audit_keyspaces(const sstring& data) {
|
||||
audit::audited_keyspaces_t result;
|
||||
static std::set<sstring> parse_audit_keyspaces(const sstring& data) {
|
||||
std::set<sstring> result;
|
||||
if (!data.empty()) {
|
||||
std::vector<sstring> tokens;
|
||||
boost::split(tokens, data, boost::is_any_of(","));
|
||||
@@ -156,8 +156,8 @@ audit::audit(locator::shared_token_metadata& token_metadata,
|
||||
cql3::query_processor& qp,
|
||||
service::migration_manager& mm,
|
||||
std::set<sstring>&& audit_modes,
|
||||
audited_keyspaces_t&& audited_keyspaces,
|
||||
audited_tables_t&& audited_tables,
|
||||
std::set<sstring>&& audited_keyspaces,
|
||||
std::map<sstring, std::set<sstring>>&& audited_tables,
|
||||
category_set&& audited_categories,
|
||||
const db::config& cfg)
|
||||
: _token_metadata(token_metadata)
|
||||
@@ -165,8 +165,8 @@ audit::audit(locator::shared_token_metadata& token_metadata,
|
||||
, _audited_tables(std::move(audited_tables))
|
||||
, _audited_categories(std::move(audited_categories))
|
||||
, _cfg(cfg)
|
||||
, _cfg_keyspaces_observer(cfg.audit_keyspaces.observe([this] (sstring const& new_value){ update_config<audited_keyspaces_t>(new_value, parse_audit_keyspaces, _audited_keyspaces); }))
|
||||
, _cfg_tables_observer(cfg.audit_tables.observe([this] (sstring const& new_value){ update_config<audited_tables_t>(new_value, parse_audit_tables, _audited_tables); }))
|
||||
, _cfg_keyspaces_observer(cfg.audit_keyspaces.observe([this] (sstring const& new_value){ update_config<std::set<sstring>>(new_value, parse_audit_keyspaces, _audited_keyspaces); }))
|
||||
, _cfg_tables_observer(cfg.audit_tables.observe([this] (sstring const& new_value){ update_config<std::map<sstring, std::set<sstring>>>(new_value, parse_audit_tables, _audited_tables); }))
|
||||
, _cfg_categories_observer(cfg.audit_categories.observe([this] (sstring const& new_value){ update_config<category_set>(new_value, parse_audit_categories, _audited_categories); }))
|
||||
{
|
||||
_storage_helper_ptr = create_storage_helper(std::move(audit_modes), qp, mm);
|
||||
@@ -181,8 +181,8 @@ future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token
|
||||
return make_ready_future<>();
|
||||
}
|
||||
category_set audited_categories = parse_audit_categories(cfg.audit_categories());
|
||||
audit::audited_tables_t audited_tables = parse_audit_tables(cfg.audit_tables());
|
||||
audit::audited_keyspaces_t audited_keyspaces = parse_audit_keyspaces(cfg.audit_keyspaces());
|
||||
std::map<sstring, std::set<sstring>> audited_tables = parse_audit_tables(cfg.audit_tables());
|
||||
std::set<sstring> audited_keyspaces = parse_audit_keyspaces(cfg.audit_keyspaces());
|
||||
|
||||
logger.info("Audit is enabled. Auditing to: \"{}\", with the following categories: \"{}\", keyspaces: \"{}\", and tables: \"{}\"",
|
||||
cfg.audit(), cfg.audit_categories(), cfg.audit_keyspaces(), cfg.audit_tables());
|
||||
@@ -194,36 +194,22 @@ future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token
|
||||
std::move(audited_keyspaces),
|
||||
std::move(audited_tables),
|
||||
std::move(audited_categories),
|
||||
std::cref(cfg));
|
||||
}
|
||||
|
||||
future<> audit::start_storage(const db::config& cfg) {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
|
||||
return local_audit._storage_helper_ptr->start(cfg).then([&local_audit] {
|
||||
local_audit._storage_running = true;
|
||||
std::cref(cfg))
|
||||
.then([&cfg] {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
|
||||
return local_audit.start(cfg);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Stops the audit storage helper on every shard.
// Returns a ready future without doing anything when the sharded audit
// instance is not initialized on the calling shard.
future<> audit::stop_storage() {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit_instance().invoke_on_all([] (audit& local_audit) {
|
||||
// Clear the running flag before initiating the helper's stop().
local_audit._storage_running = false;
|
||||
return local_audit._storage_helper_ptr->stop();
|
||||
});
|
||||
}
|
||||
|
||||
future<> audit::stop_audit() {
|
||||
if (!audit_instance().local_is_initialized()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return audit::audit::audit_instance().invoke_on_all([] (auto& local_audit) {
|
||||
SCYLLA_ASSERT(!local_audit._storage_running);
|
||||
return local_audit.shutdown();
|
||||
}).then([] {
|
||||
return audit::audit::audit_instance().stop();
|
||||
@@ -237,6 +223,14 @@ audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& k
|
||||
return std::make_unique<audit_info>(cat, keyspace, table, batch);
|
||||
}
|
||||
|
||||
// Starts this instance's storage helper with the given configuration and
// returns the helper's future.
future<> audit::start(const db::config& cfg) {
|
||||
return _storage_helper_ptr->start(cfg);
|
||||
}
|
||||
|
||||
// Stops this instance's storage helper and returns the helper's future.
future<> audit::stop() {
|
||||
return _storage_helper_ptr->stop();
|
||||
}
|
||||
|
||||
// Currently a no-op: returns an already-resolved future.
future<> audit::shutdown() {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
@@ -247,12 +241,6 @@ future<> audit::log(const audit_info& audit_info, const service::client_state& c
|
||||
const sstring& username = client_state.user() ? client_state.user()->name.value_or(anonymous_username) : no_username;
|
||||
socket_address client_ip = client_state.get_client_address().addr();
|
||||
socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
|
||||
if (!_storage_running) {
|
||||
on_internal_error_noexcept(logger, fmt::format("Audit log dropped (storage not ready): node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
|
||||
node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
|
||||
audit_info.query(), client_ip, audit_info.table(), username));
|
||||
return make_ready_future<>();
|
||||
}
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
logger.debug("Log written: node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
|
||||
node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
|
||||
@@ -298,11 +286,6 @@ future<> inspect(const audit_info_alternator& ai, const service::client_state& c
|
||||
|
||||
future<> audit::log_login(const sstring& username, socket_address client_ip, bool error) noexcept {
|
||||
socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
|
||||
if (!_storage_running) {
|
||||
on_internal_error_noexcept(logger, fmt::format("Audit login log dropped (storage not ready): node_ip {} client_ip {} username {} error {}",
|
||||
node_ip, client_ip, username, error ? "true" : "false"));
|
||||
return make_ready_future<>();
|
||||
}
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
logger.debug("Login log written: node_ip {}, client_ip {}, username {}, error {}",
|
||||
node_ip, client_ip, username, error ? "true" : "false");
|
||||
@@ -321,7 +304,7 @@ future<> inspect_login(const sstring& username, socket_address client_ip, bool e
|
||||
return audit::local_audit_instance().log_login(username, client_ip, error);
|
||||
}
|
||||
|
||||
bool audit::should_log_table(std::string_view keyspace, std::string_view name) const {
|
||||
bool audit::should_log_table(const sstring& keyspace, const sstring& name) const {
|
||||
auto keyspace_it = _audited_tables.find(keyspace);
|
||||
return keyspace_it != _audited_tables.cend() && keyspace_it->second.find(name) != keyspace_it->second.cend();
|
||||
}
|
||||
@@ -336,8 +319,8 @@ bool audit::will_log(statement_category cat, std::string_view keyspace, std::str
|
||||
// so it is logged whenever the category matches.
|
||||
return _audited_categories.contains(cat)
|
||||
&& (keyspace.empty()
|
||||
|| _audited_keyspaces.find(keyspace) != _audited_keyspaces.cend()
|
||||
|| should_log_table(keyspace, table)
|
||||
|| _audited_keyspaces.find(sstring(keyspace)) != _audited_keyspaces.cend()
|
||||
|| should_log_table(sstring(keyspace), sstring(table))
|
||||
|| cat == statement_category::AUTH
|
||||
|| cat == statement_category::ADMIN
|
||||
|| cat == statement_category::DCL);
|
||||
|
||||
@@ -129,19 +129,13 @@ public:
|
||||
class storage_helper;
|
||||
|
||||
class audit final : public seastar::async_sharded_service<audit> {
|
||||
public:
|
||||
// Transparent comparator (std::less<>) enables heterogeneous lookup with
|
||||
// string_view keys.
|
||||
using audited_keyspaces_t = std::set<sstring, std::less<>>;
|
||||
using audited_tables_t = std::map<sstring, std::set<sstring, std::less<>>, std::less<>>;
|
||||
private:
|
||||
locator::shared_token_metadata& _token_metadata;
|
||||
audited_keyspaces_t _audited_keyspaces;
|
||||
audited_tables_t _audited_tables;
|
||||
std::set<sstring> _audited_keyspaces;
|
||||
// Maps keyspace name to set of table names in that keyspace
|
||||
std::map<sstring, std::set<sstring>> _audited_tables;
|
||||
category_set _audited_categories;
|
||||
|
||||
std::unique_ptr<storage_helper> _storage_helper_ptr;
|
||||
bool _storage_running = false;
|
||||
|
||||
const db::config& _cfg;
|
||||
utils::observer<sstring> _cfg_keyspaces_observer;
|
||||
@@ -151,7 +145,7 @@ private:
|
||||
template<class T>
|
||||
void update_config(const sstring & new_value, std::function<T(const sstring&)> parse_func, T& cfg_parameter);
|
||||
|
||||
bool should_log_table(std::string_view keyspace, std::string_view name) const;
|
||||
bool should_log_table(const sstring& keyspace, const sstring& name) const;
|
||||
public:
|
||||
static seastar::sharded<audit>& audit_instance() {
|
||||
// FIXME: leaked intentionally to avoid shutdown problems, see #293
|
||||
@@ -164,19 +158,19 @@ public:
|
||||
return audit_instance().local();
|
||||
}
|
||||
static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
|
||||
static future<> start_storage(const db::config& cfg);
|
||||
static future<> stop_storage();
|
||||
static future<> stop_audit();
|
||||
static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
|
||||
audit(locator::shared_token_metadata& stm,
|
||||
cql3::query_processor& qp,
|
||||
service::migration_manager& mm,
|
||||
std::set<sstring>&& audit_modes,
|
||||
audited_keyspaces_t&& audited_keyspaces,
|
||||
audited_tables_t&& audited_tables,
|
||||
std::set<sstring>&& audited_keyspaces,
|
||||
std::map<sstring, std::set<sstring>>&& audited_tables,
|
||||
category_set&& audited_categories,
|
||||
const db::config& cfg);
|
||||
~audit();
|
||||
future<> start(const db::config& cfg);
|
||||
future<> stop();
|
||||
future<> shutdown();
|
||||
bool should_log(const audit_info& audit_info) const;
|
||||
bool will_log(statement_category cat, std::string_view keyspace = {}, std::string_view table = {}) const;
|
||||
|
||||
@@ -185,14 +185,24 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
|
||||
static const sstring q = format("SELECT role, name, value FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, ROLE_ATTRIBUTES_CF);
|
||||
auto rs = co_await fetch(q);
|
||||
for (const auto& r : *rs) {
|
||||
if (!r.has("value")) {
|
||||
continue;
|
||||
}
|
||||
rec->attributes[r.get_as<sstring>("name")] =
|
||||
r.get_as<sstring>("value");
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
}
|
||||
// permissions
|
||||
{
|
||||
static const sstring q = format("SELECT role, resource, permissions FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, PERMISSIONS_CF);
|
||||
auto rs = co_await fetch(q);
|
||||
for (const auto& r : *rs) {
|
||||
auto resource = r.get_as<sstring>("resource");
|
||||
auto perms_strings = r.get_set<sstring>("permissions");
|
||||
std::unordered_set<sstring> perms_set(perms_strings.begin(), perms_strings.end());
|
||||
auto pset = permissions::from_strings(perms_set);
|
||||
rec->permissions[std::move(resource)] = std::move(pset);
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
}
|
||||
co_return rec;
|
||||
}
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@ public:
|
||||
std::unordered_set<role_name_t> members;
|
||||
sstring salted_hash;
|
||||
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
|
||||
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
|
||||
private:
|
||||
friend cache;
|
||||
// cached permissions include effects of role's inheritance
|
||||
|
||||
@@ -76,11 +76,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
|
||||
if (results->empty()) {
|
||||
co_return permissions::NONE;
|
||||
}
|
||||
const auto& row = results->one();
|
||||
if (!row.has(PERMISSIONS_NAME)) {
|
||||
co_return permissions::NONE;
|
||||
}
|
||||
co_return permissions::from_strings(row.get_set<sstring>(PERMISSIONS_NAME));
|
||||
co_return permissions::from_strings(results->one().get_set<sstring>(PERMISSIONS_NAME));
|
||||
}
|
||||
|
||||
future<>
|
||||
|
||||
@@ -258,11 +258,13 @@ future<> ldap_role_manager::start() {
|
||||
} catch (const seastar::sleep_aborted&) {
|
||||
co_return; // ignore
|
||||
}
|
||||
try {
|
||||
co_await _cache.reload_all_permissions();
|
||||
} catch (...) {
|
||||
mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
|
||||
}
|
||||
co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
|
||||
try {
|
||||
co_await c.reload_all_permissions();
|
||||
} catch (...) {
|
||||
mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
return _std_mgr.start();
|
||||
|
||||
@@ -157,20 +157,6 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
|
||||
return create_legacy_keyspace_if_missing(mm);
|
||||
});
|
||||
}
|
||||
// Authorizer must be started before the permission loader is set,
|
||||
// because the loader calls _authorizer->authorize().
|
||||
// The loader must be set before starting the role manager, because
|
||||
// LDAP role manager starts a pruner fiber that calls
|
||||
// reload_all_permissions() which asserts _permission_loader is set.
|
||||
co_await _authorizer->start();
|
||||
if (!_used_by_maintenance_socket) {
|
||||
// Maintenance socket mode can't cache permissions because it has
|
||||
// different authorizer. We can't mix cached permissions, they could be
|
||||
// different in normal mode.
|
||||
_cache.set_permission_loader(std::bind(
|
||||
&service::get_uncached_permissions,
|
||||
this, std::placeholders::_1, std::placeholders::_2));
|
||||
}
|
||||
co_await _role_manager->start();
|
||||
if (this_shard_id() == 0) {
|
||||
// Role manager and password authenticator have this odd startup
|
||||
@@ -179,19 +165,21 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
|
||||
// creation therefore we need to wait here.
|
||||
co_await _role_manager->ensure_superuser_is_created();
|
||||
}
|
||||
// Authenticator must be started after ensure_superuser_is_created()
|
||||
// because password_authenticator queries system.roles for the
|
||||
// superuser entry created by the role manager.
|
||||
co_await _authenticator->start();
|
||||
co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
|
||||
if (!_used_by_maintenance_socket) {
|
||||
// Maintenance socket mode can't cache permissions because it has
|
||||
// different authorizer. We can't mix cached permissions, they could be
|
||||
// different in normal mode.
|
||||
_cache.set_permission_loader(std::bind(
|
||||
&service::get_uncached_permissions,
|
||||
this, std::placeholders::_1, std::placeholders::_2));
|
||||
}
|
||||
}
|
||||
|
||||
future<> service::stop() {
|
||||
_as.request_abort();
|
||||
// Reverse of start() order.
|
||||
co_await _authenticator->stop();
|
||||
co_await _role_manager->stop();
|
||||
_cache.set_permission_loader(nullptr);
|
||||
co_await _authorizer->stop();
|
||||
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
|
||||
}
|
||||
|
||||
future<> service::ensure_superuser_is_created() {
|
||||
|
||||
@@ -1625,7 +1625,7 @@ struct process_change_visitor {
|
||||
if (_enable_updating_state) {
|
||||
if (_request_options.alternator && _alternator_schema_has_no_clustering_key && _clustering_row_states.empty()) {
|
||||
// Alternator's table can be with or without clustering key. If the clustering key exists,
|
||||
// delete request will be `clustered_row_delete` and will be handled there.
|
||||
// delete request will be `clustered_row_delete` and will be hanlded there.
|
||||
// If the clustering key doesn't exist, delete request will be `partition_delete` and will be handled here.
|
||||
// The no-clustering-key case is slightly tricky, because insert of such item is handled by `clustered_row_cells`
|
||||
// and has some value as clustering_key (the value currently seems to be empty bytes object).
|
||||
@@ -1933,7 +1933,7 @@ public:
|
||||
if (_options.alternator && !_alternator_clustering_keys_to_ignore.empty()) {
|
||||
// we filter mutations for Alternator's changes here.
|
||||
// We do it per mutation object (user might submit a batch of those in one go
|
||||
// and some might be split because of different timestamps),
|
||||
// and some might be splitted because of different timestamps),
|
||||
// ignore key set is cleared afterwards.
|
||||
// If single mutation object contains two separate changes to the same row
|
||||
// and at least one of them is ignored, all of them will be ignored.
|
||||
|
||||
@@ -267,7 +267,7 @@ struct extract_row_visitor {
|
||||
visit_collection(v);
|
||||
},
|
||||
[&] (const abstract_type& o) {
|
||||
throw std::runtime_error(format("extract_changes: unknown collection type: {}", o.name()));
|
||||
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
|
||||
}
|
||||
));
|
||||
}
|
||||
|
||||
@@ -137,24 +137,6 @@ endfunction()
|
||||
|
||||
option(Scylla_WITH_DEBUG_INFO "Enable debug info" OFF)
|
||||
|
||||
# Time trace profiling: adds -ftime-trace to all C++ compilations (Clang only).
|
||||
# Each .o produces a companion .json file in the build directory that can be
|
||||
# analyzed with ClangBuildAnalyzer or loaded in chrome://tracing.
|
||||
#
|
||||
# Usage:
|
||||
# cmake -DScylla_TIME_TRACE=ON ...
|
||||
# ninja
|
||||
# # Analyze results (requires ClangBuildAnalyzer):
|
||||
# ClangBuildAnalyzer --all <build-dir> capture.bin
|
||||
# ClangBuildAnalyzer --analyze capture.bin
|
||||
option(Scylla_TIME_TRACE "Enable Clang -ftime-trace for build profiling" OFF)
|
||||
if(Scylla_TIME_TRACE)
|
||||
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
message(FATAL_ERROR "Scylla_TIME_TRACE requires Clang (found ${CMAKE_CXX_COMPILER_ID})")
|
||||
endif()
|
||||
add_compile_options(-ftime-trace)
|
||||
endif()
|
||||
|
||||
macro(update_build_flags config)
|
||||
cmake_parse_arguments (
|
||||
parsed_args
|
||||
|
||||
@@ -240,7 +240,7 @@ static max_purgeable get_max_purgeable_timestamp(const compaction_group_view& ta
|
||||
// and if the memtable also contains the key we're calculating max purgeable timestamp for.
|
||||
// First condition helps to not penalize the common scenario where memtable only contains
|
||||
// newer data.
|
||||
if (!table_s.skip_memtable_for_tombstone_gc() && memtable_min_timestamp <= compacting_max_timestamp && table_s.memtable_has_key(dk)) {
|
||||
if (memtable_min_timestamp <= compacting_max_timestamp && table_s.memtable_has_key(dk)) {
|
||||
timestamp = memtable_min_timestamp;
|
||||
source = max_purgeable::timestamp_source::memtable_possibly_shadowing_data;
|
||||
}
|
||||
|
||||
@@ -39,9 +39,6 @@ public:
|
||||
virtual future<lw_shared_ptr<const sstables::sstable_set>> main_sstable_set() const = 0;
|
||||
virtual future<lw_shared_ptr<const sstables::sstable_set>> maintenance_sstable_set() const = 0;
|
||||
virtual lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc() const = 0;
|
||||
// Returns true when tombstone GC considers only the repaired sstable set, meaning the
|
||||
// memtable does not need to be consulted (its data is always newer than any GC-eligible tombstone).
|
||||
virtual bool skip_memtable_for_tombstone_gc() const noexcept = 0;
|
||||
virtual std::unordered_set<sstables::shared_sstable> fully_expired_sstables(const std::vector<sstables::shared_sstable>& sstables, gc_clock::time_point compaction_time) const = 0;
|
||||
virtual const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept = 0;
|
||||
virtual compaction_strategy& get_compaction_strategy() const noexcept = 0;
|
||||
|
||||
@@ -1088,7 +1088,7 @@ void compaction_manager::register_metrics() {
|
||||
sm::make_gauge("normalized_backlog", [this] { return _last_backlog / available_memory(); },
|
||||
sm::description("Holds the sum of normalized compaction backlog for all tables in the system. Backlog is normalized by dividing backlog by shard's available memory.")),
|
||||
sm::make_counter("validation_errors", [this] { return _validation_errors; },
|
||||
sm::description("Holds the number of encountered validation errors.")).set_skip_when_empty(),
|
||||
sm::description("Holds the number of encountered validation errors.")),
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -406,11 +406,7 @@ commitlog_total_space_in_mb: -1
|
||||
# In short, `ms` needs more CPU during sstable writes,
|
||||
# but should behave better during reads,
|
||||
# although it might behave worse for very long clustering keys.
|
||||
#
|
||||
# `ms` sstable format works even better with `column_index_size_in_kb` set to 1,
|
||||
# so keep those two settings in sync (either both set, or both unset).
|
||||
sstable_format: ms
|
||||
column_index_size_in_kb: 1
|
||||
|
||||
# Auto-scaling of the promoted index prevents running out of memory
|
||||
# when the promoted index grows too large (due to partitions with many rows
|
||||
|
||||
69
configure.py
69
configure.py
@@ -285,12 +285,8 @@ def generate_compdb(compdb, ninja, buildfile, modes):
|
||||
os.symlink(compdb_target, compdb)
|
||||
except FileExistsError:
|
||||
# if there is already a valid compile_commands.json link in the
|
||||
# source root, we are done. if it's a stale link, update it.
|
||||
if os.path.islink(compdb):
|
||||
current_target = os.readlink(compdb)
|
||||
if not os.path.exists(current_target):
|
||||
os.unlink(compdb)
|
||||
os.symlink(compdb_target, compdb)
|
||||
# source root, we are done.
|
||||
pass
|
||||
return
|
||||
|
||||
|
||||
@@ -597,7 +593,6 @@ scylla_tests = set([
|
||||
'test/boost/linearizing_input_stream_test',
|
||||
'test/boost/lister_test',
|
||||
'test/boost/locator_topology_test',
|
||||
'test/boost/lock_tables_metadata_test',
|
||||
'test/boost/log_heap_test',
|
||||
'test/boost/logalloc_standard_allocator_segment_pool_backend_test',
|
||||
'test/boost/logalloc_test',
|
||||
@@ -858,10 +853,6 @@ arg_parser.add_argument('--coverage', action = 'store_true', help = 'Compile scy
|
||||
arg_parser.add_argument('--build-dir', action='store', default='build',
|
||||
help='Build directory path')
|
||||
arg_parser.add_argument('--disable-precompiled-header', action='store_true', default=False, help='Disable precompiled header for scylla binary')
|
||||
arg_parser.add_argument('--time-trace', action='store_true', default=False,
|
||||
help='Enable Clang -ftime-trace for build profiling. '
|
||||
'Each .o produces a .json file analyzable with '
|
||||
'ClangBuildAnalyzer or chrome://tracing')
|
||||
arg_parser.add_argument('-h', '--help', action='store_true', help='show this help message and exit')
|
||||
args = arg_parser.parse_args()
|
||||
if args.help:
|
||||
@@ -1668,7 +1659,6 @@ deps['test/boost/combined_tests'] += [
|
||||
'test/boost/auth_cache_test.cc',
|
||||
'test/boost/auth_test.cc',
|
||||
'test/boost/batchlog_manager_test.cc',
|
||||
'test/boost/table_helper_test.cc',
|
||||
'test/boost/cache_algorithm_test.cc',
|
||||
'test/boost/castas_fcts_test.cc',
|
||||
'test/boost/cdc_test.cc',
|
||||
@@ -1720,7 +1710,7 @@ deps['test/boost/combined_tests'] += [
|
||||
'test/boost/sstable_compression_config_test.cc',
|
||||
'test/boost/sstable_directory_test.cc',
|
||||
'test/boost/sstable_set_test.cc',
|
||||
'test/boost/sstable_tablet_streaming_test.cc',
|
||||
'test/boost/sstable_tablet_streaming.cc',
|
||||
'test/boost/statement_restrictions_test.cc',
|
||||
'test/boost/storage_proxy_test.cc',
|
||||
'test/boost/tablets_test.cc',
|
||||
@@ -1975,9 +1965,6 @@ user_cflags += ' -fextend-variable-liveness=none'
|
||||
if args.target != '':
|
||||
user_cflags += ' -march=' + args.target
|
||||
|
||||
if args.time_trace:
|
||||
user_cflags += ' -ftime-trace'
|
||||
|
||||
for mode in modes:
|
||||
# Those flags are passed not only to Scylla objects, but also to libraries
|
||||
# that we compile ourselves.
|
||||
@@ -2470,9 +2457,6 @@ def write_build_file(f,
|
||||
command = reloc/build_deb.sh --reloc-pkg $in --builddir $out
|
||||
rule unified
|
||||
command = unified/build_unified.sh --build-dir $builddir/$mode --unified-pkg $out
|
||||
rule collect_pkgs
|
||||
command = rm -rf $out && mkdir -p $out && cp $pkgs $out/
|
||||
description = COLLECT $out
|
||||
rule rust_header
|
||||
command = cxxbridge --include rust/cxx.h --header $in > $out
|
||||
description = RUST_HEADER $out
|
||||
@@ -2785,6 +2769,25 @@ def write_build_file(f,
|
||||
f.write('build {}: rust_source {}\n'.format(cc, src))
|
||||
obj = cc.replace('.cc', '.o')
|
||||
compiles[obj] = cc
|
||||
# Sources shared between scylla (compiled with PCH) and small tests
|
||||
# (with custom deps and partial link sets) must not use the PCH,
|
||||
# because -fpch-instantiate-templates injects symbol references that
|
||||
# the small test link sets cannot satisfy.
|
||||
small_test_srcs = set()
|
||||
for test_binary, test_deps in deps.items():
|
||||
if not test_binary.startswith('test/'):
|
||||
continue
|
||||
# Only exclude PCH for tests with truly small/partial link sets.
|
||||
# Tests that include scylla_core or similar large dep sets link
|
||||
# against enough objects to satisfy PCH-injected symbol refs.
|
||||
if len(test_deps) > 50:
|
||||
continue
|
||||
for src in test_deps:
|
||||
if src.endswith('.cc'):
|
||||
small_test_srcs.add(src)
|
||||
for src in small_test_srcs:
|
||||
obj = '$builddir/' + mode + '/' + src.replace('.cc', '.o')
|
||||
compiles_with_pch.discard(obj)
|
||||
for obj in compiles:
|
||||
src = compiles[obj]
|
||||
seastar_dep = f'$builddir/{mode}/seastar/libseastar.{seastar_lib_ext}'
|
||||
@@ -2958,8 +2961,6 @@ def write_build_file(f,
|
||||
build dist-tar: phony dist-unified-tar dist-server-tar dist-python3-tar dist-cqlsh-tar
|
||||
|
||||
build dist: phony dist-unified dist-server dist-python3 dist-cqlsh
|
||||
|
||||
build collect-dist: phony {' '.join([f'collect-dist-{mode}' for mode in default_modes])}
|
||||
'''))
|
||||
|
||||
f.write(textwrap.dedent(f'''\
|
||||
@@ -2967,28 +2968,7 @@ def write_build_file(f,
|
||||
rule dist-check
|
||||
command = ./tools/testing/dist-check/dist-check.sh --mode $mode
|
||||
'''))
|
||||
deb_arch = {'x86_64': 'amd64', 'aarch64': 'arm64'}[arch]
|
||||
deb_ver = f'{scylla_version}-{scylla_release}-1'
|
||||
rpm_ver = f'{scylla_version}-{scylla_release}'
|
||||
for mode in build_modes:
|
||||
server_rpms_dir = f'$builddir/dist/{mode}/redhat/RPMS/{arch}'
|
||||
server_rpms = [f'{server_rpms_dir}/{scylla_product}{suffix}-{rpm_ver}.{arch}.rpm'
|
||||
for suffix in ['', '-server', '-server-debuginfo', '-conf', '-kernel-conf', '-node-exporter']]
|
||||
cqlsh_rpms = [f'tools/cqlsh/build/redhat/RPMS/{arch}/{scylla_product}-cqlsh-{rpm_ver}.{arch}.rpm']
|
||||
python3_rpms = [f'tools/python3/build/redhat/RPMS/{arch}/{scylla_product}-python3-{rpm_ver}.{arch}.rpm']
|
||||
all_rpms = server_rpms + cqlsh_rpms + python3_rpms
|
||||
|
||||
server_deb_dir = f'$builddir/dist/{mode}/debian'
|
||||
server_debs = [f'{server_deb_dir}/{scylla_product}{suffix}_{deb_ver}_{deb_arch}.deb'
|
||||
for suffix in ['', '-server', '-server-dbg', '-conf', '-kernel-conf', '-node-exporter']]
|
||||
server_debs += [f'{server_deb_dir}/scylla-enterprise{suffix}_{deb_ver}_all.deb'
|
||||
for suffix in ['', '-server', '-conf', '-kernel-conf', '-node-exporter']]
|
||||
cqlsh_debs = [f'tools/cqlsh/build/debian/{scylla_product}-cqlsh_{deb_ver}_{deb_arch}.deb',
|
||||
f'tools/cqlsh/build/debian/scylla-enterprise-cqlsh_{deb_ver}_all.deb']
|
||||
python3_debs = [f'tools/python3/build/debian/{scylla_product}-python3_{deb_ver}_{deb_arch}.deb',
|
||||
f'tools/python3/build/debian/scylla-enterprise-python3_{deb_ver}_all.deb']
|
||||
all_debs = server_debs + cqlsh_debs + python3_debs
|
||||
|
||||
f.write(textwrap.dedent(f'''\
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
@@ -2996,11 +2976,6 @@ def write_build_file(f,
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-package.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz
|
||||
|
||||
build $builddir/{mode}/dist/rpm: collect_pkgs | {' '.join(all_rpms)} $builddir/dist/{mode}/redhat dist-cqlsh-rpm dist-python3-rpm
|
||||
pkgs = {' '.join(all_rpms)}
|
||||
build $builddir/{mode}/dist/deb: collect_pkgs | {' '.join(all_debs)} $builddir/dist/{mode}/debian dist-cqlsh-deb dist-python3-deb
|
||||
pkgs = {' '.join(all_debs)}
|
||||
build collect-dist-{mode}: phony $builddir/{mode}/dist/rpm $builddir/{mode}/dist/deb
|
||||
build {mode}-dist: phony dist-server-{mode} dist-server-debuginfo-{mode} dist-python3-{mode} dist-unified-{mode} dist-cqlsh-{mode}
|
||||
build dist-{mode}: phony {mode}-dist
|
||||
build dist-check-{mode}: dist-check
|
||||
|
||||
@@ -136,9 +136,9 @@ public:
|
||||
{}
|
||||
|
||||
future<> insert(auth::authenticated_user user, cql3::prepared_cache_key_type prep_cache_key, value_type v) noexcept {
|
||||
return _cache.insert(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
|
||||
return _cache.get_ptr(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
|
||||
return make_ready_future<value_type>(std::move(v));
|
||||
});
|
||||
}).discard_result();
|
||||
}
|
||||
|
||||
value_ptr find(const auth::authenticated_user& user, const cql3::prepared_cache_key_type& prep_cache_key) {
|
||||
|
||||
@@ -1070,7 +1070,7 @@ try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database
|
||||
.args = {},
|
||||
};
|
||||
} else {
|
||||
throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument, got {}", fc.args[0]));
|
||||
throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument", fc.args[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
84
cql3/prepared_cache_key_type.hh
Normal file
84
cql3/prepared_cache_key_type.hh
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (C) 2017-present ScyllaDB
|
||||
*
|
||||
* Modified by ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.1 and Apache-2.0)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "bytes.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "cql3/dialect.hh"
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
/// Identifier of a prepared CQL statement, stored as a raw byte string.
using cql_prepared_id_type = bytes;
|
||||
|
||||
/// \brief The key of the prepared statements cache
|
||||
///
|
||||
/// TODO: consolidate prepared_cache_key_type and the nested cache_key_type
|
||||
/// the latter was introduced for unifying the CQL and Thrift prepared
|
||||
/// statements so that they can be stored in the same cache.
|
||||
class prepared_cache_key_type {
|
||||
public:
|
||||
// derive from cql_prepared_id_type so we can customize the formatter of
|
||||
// cache_key_type
|
||||
struct cache_key_type : public cql_prepared_id_type {
|
||||
cache_key_type(cql_prepared_id_type&& id, cql3::dialect d) : cql_prepared_id_type(std::move(id)), dialect(d) {}
|
||||
cql3::dialect dialect; // Not part of hash, but we don't expect collisions because of that
|
||||
bool operator==(const cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
private:
|
||||
cache_key_type _key;
|
||||
|
||||
public:
|
||||
explicit prepared_cache_key_type(cql_prepared_id_type cql_id, dialect d) : _key(std::move(cql_id), d) {}
|
||||
|
||||
cache_key_type& key() { return _key; }
|
||||
const cache_key_type& key() const { return _key; }
|
||||
|
||||
static const cql_prepared_id_type& cql_id(const prepared_cache_key_type& key) {
|
||||
return key.key();
|
||||
}
|
||||
|
||||
bool operator==(const prepared_cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type::cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type::cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k.key());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// for prepared_statements_cache log printouts
|
||||
template <> struct fmt::formatter<cql3::prepared_cache_key_type::cache_key_type> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const cql3::prepared_cache_key_type::cache_key_type& p, fmt::format_context& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "{{cql_id: {}, dialect: {}}}", static_cast<const cql3::cql_prepared_id_type&>(p), p.dialect);
|
||||
}
|
||||
};
|
||||
|
||||
// Formats a prepared_cache_key_type by printing its wrapped cache_key_type.
template <> struct fmt::formatter<cql3::prepared_cache_key_type> {
    // No format-spec handling: the parse position is returned unchanged.
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

    auto format(const cql3::prepared_cache_key_type& p, fmt::format_context& ctx) const {
        const cql3::prepared_cache_key_type::cache_key_type& wrapped = p.key();
        return fmt::format_to(ctx.out(), "{}", wrapped);
    }
};
|
||||
@@ -12,6 +12,7 @@
|
||||
|
||||
#include "utils/loading_cache.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "cql3/prepared_cache_key_type.hh"
|
||||
#include "cql3/statements/prepared_statement.hh"
|
||||
#include "cql3/column_specification.hh"
|
||||
#include "cql3/dialect.hh"
|
||||
@@ -27,39 +28,6 @@ struct prepared_cache_entry_size {
|
||||
}
|
||||
};
|
||||
|
||||
typedef bytes cql_prepared_id_type;
|
||||
|
||||
/// \brief The key of the prepared statements cache
|
||||
///
|
||||
/// TODO: consolidate prepared_cache_key_type and the nested cache_key_type
|
||||
/// the latter was introduced for unifying the CQL and Thrift prepared
|
||||
/// statements so that they can be stored in the same cache.
|
||||
class prepared_cache_key_type {
|
||||
public:
|
||||
// derive from cql_prepared_id_type so we can customize the formatter of
|
||||
// cache_key_type
|
||||
struct cache_key_type : public cql_prepared_id_type {
|
||||
cache_key_type(cql_prepared_id_type&& id, cql3::dialect d) : cql_prepared_id_type(std::move(id)), dialect(d) {}
|
||||
cql3::dialect dialect; // Not part of hash, but we don't expect collisions because of that
|
||||
bool operator==(const cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
private:
|
||||
cache_key_type _key;
|
||||
|
||||
public:
|
||||
explicit prepared_cache_key_type(cql_prepared_id_type cql_id, dialect d) : _key(std::move(cql_id), d) {}
|
||||
|
||||
cache_key_type& key() { return _key; }
|
||||
const cache_key_type& key() const { return _key; }
|
||||
|
||||
static const cql_prepared_id_type& cql_id(const prepared_cache_key_type& key) {
|
||||
return key.key();
|
||||
}
|
||||
|
||||
bool operator==(const prepared_cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
class prepared_statements_cache {
|
||||
public:
|
||||
struct stats {
|
||||
@@ -164,35 +132,3 @@ public:
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type::cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type::cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k.key());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// for prepared_statements_cache log printouts
|
||||
template <> struct fmt::formatter<cql3::prepared_cache_key_type::cache_key_type> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const cql3::prepared_cache_key_type::cache_key_type& p, fmt::format_context& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "{{cql_id: {}, dialect: {}}}", static_cast<const cql3::cql_prepared_id_type&>(p), p.dialect);
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct fmt::formatter<cql3::prepared_cache_key_type> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const cql3::prepared_cache_key_type& p, fmt::format_context& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "{}", p.key());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -17,6 +17,9 @@
|
||||
#include <seastar/coroutine/as_future.hh>
|
||||
#include <seastar/coroutine/try_future.hh>
|
||||
|
||||
#include "cql3/prepared_statements_cache.hh"
|
||||
#include "cql3/authorized_prepared_statements_cache.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/mapreduce_service.hh"
|
||||
@@ -77,7 +80,7 @@ static service::query_state query_state_for_internal_call() {
|
||||
return {service::client_state::for_internal_calls(), empty_service_permit()};
|
||||
}
|
||||
|
||||
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm)
|
||||
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, const utils::loading_cache_config& auth_prep_cache_cfg, lang::manager& langm)
|
||||
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
|
||||
, _proxy(proxy)
|
||||
, _db(db)
|
||||
@@ -86,7 +89,7 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
|
||||
, _mcfg(mcfg)
|
||||
, _cql_config(cql_cfg)
|
||||
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
|
||||
, _authorized_prepared_cache(std::move(auth_prep_cache_cfg), authorized_prepared_statements_cache_log)
|
||||
, _authorized_prepared_cache(auth_prep_cache_cfg, authorized_prepared_statements_cache_log)
|
||||
, _auth_prepared_cache_cfg_cb([this] (uint32_t) { (void) _authorized_prepared_cache_config_action.trigger_later(); })
|
||||
, _authorized_prepared_cache_config_action([this] { update_authorized_prepared_cache_config(); return make_ready_future<>(); })
|
||||
, _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
|
||||
@@ -1074,7 +1077,7 @@ query_processor::execute_batch_without_checking_exception_message(
|
||||
::shared_ptr<statements::batch_statement> batch,
|
||||
service::query_state& query_state,
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
|
||||
std::unordered_map<prepared_cache_key_type, statements::prepared_statement::checked_weak_ptr> pending_authorization_entries) {
|
||||
auto access_future = co_await coroutine::as_future(batch->check_access(*this, query_state.get_client_state()));
|
||||
bool failed = access_future.failed();
|
||||
co_await audit::inspect(batch, query_state, options, failed);
|
||||
|
||||
@@ -22,13 +22,14 @@
|
||||
#include "cql3/statements/prepared_statement.hh"
|
||||
#include "cql3/cql_statement.hh"
|
||||
#include "cql3/dialect.hh"
|
||||
#include "cql3/query_options.hh"
|
||||
#include "cql3/stats.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "service/migration_listener.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "service/client_state.hh"
|
||||
#include "service/broadcast_tables/experimental/query_result.hh"
|
||||
#include "vector_search/vector_store_client.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/observable.hh"
|
||||
#include "utils/rolling_max_tracker.hh"
|
||||
@@ -41,6 +42,9 @@
|
||||
|
||||
|
||||
namespace lang { class manager; }
|
||||
namespace vector_search {
|
||||
class vector_store_client;
|
||||
}
|
||||
namespace service {
|
||||
class migration_manager;
|
||||
class query_state;
|
||||
@@ -58,6 +62,9 @@ struct query;
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
class prepared_statements_cache;
|
||||
class authorized_prepared_statements_cache;
|
||||
|
||||
namespace statements {
|
||||
class batch_statement;
|
||||
class schema_altering_statement;
|
||||
@@ -184,7 +191,7 @@ public:
|
||||
static std::vector<std::unique_ptr<statements::raw::parsed_statement>> parse_statements(std::string_view queries, dialect d);
|
||||
|
||||
query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc,
|
||||
memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm);
|
||||
memory_config mcfg, cql_config& cql_cfg, const utils::loading_cache_config& auth_prep_cache_cfg, lang::manager& langm);
|
||||
|
||||
~query_processor();
|
||||
|
||||
@@ -474,7 +481,7 @@ public:
|
||||
::shared_ptr<statements::batch_statement> stmt,
|
||||
service::query_state& query_state,
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
|
||||
std::unordered_map<prepared_cache_key_type, statements::prepared_statement::checked_weak_ptr> pending_authorization_entries) {
|
||||
return execute_batch_without_checking_exception_message(
|
||||
std::move(stmt),
|
||||
query_state,
|
||||
@@ -490,7 +497,7 @@ public:
|
||||
::shared_ptr<statements::batch_statement>,
|
||||
service::query_state& query_state,
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries);
|
||||
std::unordered_map<prepared_cache_key_type, statements::prepared_statement::checked_weak_ptr> pending_authorization_entries);
|
||||
|
||||
future<service::broadcast_tables::query_result>
|
||||
execute_broadcast_table_query(const service::broadcast_tables::query&);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -23,113 +23,15 @@ namespace cql3 {
|
||||
|
||||
namespace restrictions {
|
||||
|
||||
/// A set of discrete values.
|
||||
using value_list = std::vector<managed_bytes>; // Sorted and deduped using value comparator.
|
||||
|
||||
/// General set of values. Empty set and single-element sets are always value_list. interval is
|
||||
/// never singular and never has start > end. Universal set is a interval with both bounds null.
|
||||
using value_set = std::variant<value_list, interval<managed_bytes>>;
|
||||
|
||||
// For some boolean expression (say (X = 3) = TRUE, this represents a function that solves for X.
|
||||
// (here, it would return 3). The expression is obtained by equating some factors of the WHERE
|
||||
// clause to TRUE.
|
||||
using solve_for_t = std::function<value_set (const query_options&)>;
|
||||
|
||||
struct on_row {
|
||||
bool operator==(const on_row&) const = default;
|
||||
};
|
||||
|
||||
struct on_column {
|
||||
const column_definition* column;
|
||||
|
||||
bool operator==(const on_column&) const = default;
|
||||
};
|
||||
|
||||
// Placeholder type indicating we're solving for the partition key token.
|
||||
struct on_partition_key_token {
|
||||
const ::schema* schema;
|
||||
|
||||
bool operator==(const on_partition_key_token&) const = default;
|
||||
};
|
||||
|
||||
struct on_clustering_key_prefix {
|
||||
std::vector<const column_definition*> columns;
|
||||
|
||||
bool operator==(const on_clustering_key_prefix&) const = default;
|
||||
};
|
||||
|
||||
// A predicate on a column or a combination of columns. The WHERE clause analyzer
|
||||
// will attempt to convert predicates (that return true or false for a particular row)
|
||||
// to solvers (that return the set of column values that satisfy the predicate) when possible.
|
||||
struct predicate {
|
||||
// A function that returns the set of values that satisfy the filter. Can be unset,
|
||||
// in which case the filter must be interpreted.
|
||||
solve_for_t solve_for;
|
||||
// The original filter for this column.
|
||||
expr::expression filter;
|
||||
// What column the predicate can be solved for
|
||||
std::variant<
|
||||
on_row, // cannot determine, so predicate is on entire row
|
||||
on_column, // solving for a single column: e.g. c1 = 3
|
||||
on_partition_key_token, // solving for the token, e.g. token(pk1, pk2) >= :var
|
||||
on_clustering_key_prefix // solving for a clustering key prefix: e.g. (ck1, ck2) >= (3, 4)
|
||||
> on;
|
||||
// Whether the returned value_set will resolve to a single value.
|
||||
bool is_singleton = false;
|
||||
// Whether the returned value_set follows CQL comparison semantics
|
||||
bool comparable = true;
|
||||
bool is_multi_column = false;
|
||||
bool is_not_null_single_column = false;
|
||||
bool equality = false; // operator is EQ
|
||||
bool is_in = false; // operator is IN
|
||||
bool is_slice = false; // operator is LT/LTE/GT/GTE
|
||||
bool is_upper_bound = false; // operator is LT/LTE
|
||||
bool is_lower_bound = false; // operator is GT/GTE
|
||||
expr::comparison_order order = expr::comparison_order::cql;
|
||||
std::optional<expr::oper_t> op; // the binary operator, if any
|
||||
bool is_subscript = false; // whether the LHS is a subscript (map element access)
|
||||
};
|
||||
|
||||
///In some cases checking if columns have indexes is undesired or even
|
||||
///impossible, because e.g. the query runs on a pseudo-table, which does not
|
||||
///have an index-manager, or even a table object.
|
||||
using check_indexes = bool_class<class check_indexes_tag>;
|
||||
|
||||
// A function that returns the partition key ranges for a query. It is the solver of
|
||||
// WHERE clause fragments such as WHERE token(pk) > 1 or WHERE pk1 IN :list1 AND pk2 IN :list2.
|
||||
using get_partition_key_ranges_fn_t = std::function<dht::partition_range_vector (const query_options&)>;
|
||||
|
||||
// A function that returns the clustering key ranges for a query. It is the solver of
|
||||
// WHERE clause fragments such as WHERE ck > 1 or WHERE (ck1, ck2) > (1, 2).
|
||||
using get_clustering_bounds_fn_t = std::function<std::vector<query::clustering_range> (const query_options& options)>;
|
||||
|
||||
// A function that returns a singleton value, usable for a key (e.g. bytes_opt)
|
||||
using get_singleton_value_fn_t = std::function<bytes_opt (const query_options&)>;
|
||||
|
||||
struct no_partition_range_restrictions {
|
||||
};
|
||||
|
||||
struct token_range_restrictions {
|
||||
predicate token_restrictions;
|
||||
};
|
||||
|
||||
struct single_column_partition_range_restrictions {
|
||||
std::vector<predicate> per_column_restrictions;
|
||||
};
|
||||
|
||||
using partition_range_restrictions = std::variant<
|
||||
no_partition_range_restrictions,
|
||||
token_range_restrictions,
|
||||
single_column_partition_range_restrictions>;
|
||||
|
||||
// A map of per-column predicate vectors, ordered by schema position.
|
||||
using single_column_predicate_vectors = std::map<const column_definition*, std::vector<predicate>, expr::schema_pos_column_definition_comparator>;
|
||||
|
||||
/**
|
||||
* The restrictions corresponding to the relations specified on the where-clause of CQL query.
|
||||
*/
|
||||
class statement_restrictions {
|
||||
struct private_tag {}; // Tag for private constructor
|
||||
private:
|
||||
schema_ptr _schema;
|
||||
|
||||
@@ -179,7 +81,7 @@ private:
|
||||
bool _has_queriable_regular_index = false, _has_queriable_pk_index = false, _has_queriable_ck_index = false;
|
||||
bool _has_multi_column; ///< True iff _clustering_columns_restrictions has a multi-column restriction.
|
||||
|
||||
std::vector<expr::expression> _where; ///< The entire WHERE clause (factorized).
|
||||
std::optional<expr::expression> _where; ///< The entire WHERE clause.
|
||||
|
||||
/// Parts of _where defining the clustering slice.
|
||||
///
|
||||
@@ -194,7 +96,7 @@ private:
|
||||
/// 4.4 elements other than the last have only EQ or IN atoms
|
||||
/// 4.5 the last element has only EQ, IN, or is_slice() atoms
|
||||
/// 5. if multi-column, then each element is a binary_operator
|
||||
std::vector<predicate> _clustering_prefix_restrictions;
|
||||
std::vector<expr::expression> _clustering_prefix_restrictions;
|
||||
|
||||
/// Like _clustering_prefix_restrictions, but for the indexing table (if this is an index-reading statement).
|
||||
/// Recall that the index-table CK is (token, PK, CK) of the base table for a global index and (indexed column,
|
||||
@@ -203,7 +105,7 @@ private:
|
||||
/// Elements are conjunctions of single-column binary operators with the same LHS.
|
||||
/// Element order follows the indexing-table clustering key.
|
||||
/// In case of a global index the first element's (token restriction) RHS is a dummy value, it is filled later.
|
||||
std::optional<std::vector<predicate>> _idx_tbl_ck_prefix;
|
||||
std::optional<std::vector<expr::expression>> _idx_tbl_ck_prefix;
|
||||
|
||||
/// Parts of _where defining the partition range.
|
||||
///
|
||||
@@ -211,25 +113,16 @@ private:
|
||||
/// binary_operators on token. If single-column restrictions define the partition range, each element holds
|
||||
/// restrictions for one partition column. Each partition column has a corresponding element, but the elements
|
||||
/// are in arbitrary order.
|
||||
partition_range_restrictions _partition_range_restrictions;
|
||||
std::vector<expr::expression> _partition_range_restrictions;
|
||||
|
||||
bool _partition_range_is_simple; ///< False iff _partition_range_restrictions imply a Cartesian product.
|
||||
|
||||
|
||||
check_indexes _check_indexes = check_indexes::yes;
|
||||
/// Columns that appear on the LHS of an EQ restriction (not IN).
|
||||
/// For multi-column EQ like (ck1, ck2) = (1, 2), all columns in the tuple are included.
|
||||
std::unordered_set<const column_definition*> _columns_with_eq;
|
||||
std::vector<const column_definition*> _column_defs_for_filtering;
|
||||
schema_ptr _view_schema;
|
||||
std::optional<secondary_index::index> _idx_opt;
|
||||
expr::expression _idx_restrictions = expr::conjunction({});
|
||||
get_partition_key_ranges_fn_t _get_partition_key_ranges_fn;
|
||||
get_clustering_bounds_fn_t _get_clustering_bounds_fn;
|
||||
get_clustering_bounds_fn_t _get_global_index_clustering_ranges_fn;
|
||||
get_clustering_bounds_fn_t _get_global_index_token_clustering_ranges_fn;
|
||||
get_clustering_bounds_fn_t _get_local_index_clustering_ranges_fn;
|
||||
get_singleton_value_fn_t _value_for_index_partition_key_fn;
|
||||
public:
|
||||
/**
|
||||
* Creates a new empty <code>StatementRestrictions</code>.
|
||||
@@ -237,10 +130,9 @@ public:
|
||||
* @param cfm the column family meta data
|
||||
* @return a new empty <code>StatementRestrictions</code>.
|
||||
*/
|
||||
statement_restrictions(private_tag, schema_ptr schema, bool allow_filtering);
|
||||
statement_restrictions(schema_ptr schema, bool allow_filtering);
|
||||
|
||||
public:
|
||||
friend shared_ptr<const statement_restrictions> analyze_statement_restrictions(
|
||||
friend statement_restrictions analyze_statement_restrictions(
|
||||
data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
statements::statement_type type,
|
||||
@@ -250,15 +142,9 @@ public:
|
||||
bool for_view,
|
||||
bool allow_filtering,
|
||||
check_indexes do_check_indexes);
|
||||
friend shared_ptr<const statement_restrictions> make_trivial_statement_restrictions(
|
||||
schema_ptr schema,
|
||||
bool allow_filtering);
|
||||
|
||||
// Important: objects of this class capture `this` extensively and so must remain non-copyable.
|
||||
statement_restrictions(const statement_restrictions&) = delete;
|
||||
statement_restrictions& operator=(const statement_restrictions&) = delete;
|
||||
statement_restrictions(private_tag,
|
||||
data_dictionary::database db,
|
||||
private:
|
||||
statement_restrictions(data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
statements::statement_type type,
|
||||
const expr::expression& where_clause,
|
||||
@@ -325,7 +211,10 @@ public:
|
||||
|
||||
bool has_token_restrictions() const;
|
||||
|
||||
// Checks whether the given column has an EQ restriction (not IN).
|
||||
// Checks whether the given column has an EQ restriction.
|
||||
// EQ restriction is `col = ...` or `(col, col2) = ...`
|
||||
// IN restriction is NOT an EQ restriction, this function will not look for IN restrictions.
|
||||
// Uses column_definition::operator== for comparison; columns with the same name but different schema will not be equal.
|
||||
bool has_eq_restriction_on_column(const column_definition&) const;
|
||||
|
||||
/**
|
||||
@@ -335,6 +224,12 @@ public:
|
||||
*/
|
||||
std::vector<const column_definition*> get_column_defs_for_filtering(data_dictionary::database db) const;
|
||||
|
||||
/**
|
||||
* Gives a score that the index has - index with the highest score will be chosen
|
||||
* in find_idx()
|
||||
*/
|
||||
int score(const secondary_index::index& index) const;
|
||||
|
||||
/**
|
||||
* Determines the index to be used with the restriction.
|
||||
* @param db - the data_dictionary::database context (for extracting index manager)
|
||||
@@ -355,8 +250,18 @@ public:
|
||||
|
||||
size_t partition_key_restrictions_size() const;
|
||||
|
||||
bool parition_key_restrictions_have_supporting_index(const secondary_index::secondary_index_manager& index_manager, expr::allow_local_index allow_local) const;
|
||||
|
||||
size_t clustering_columns_restrictions_size() const;
|
||||
|
||||
bool clustering_columns_restrictions_have_supporting_index(
|
||||
const secondary_index::secondary_index_manager& index_manager,
|
||||
expr::allow_local_index allow_local) const;
|
||||
|
||||
bool multi_column_clustering_restrictions_are_supported_by(const secondary_index::index& index) const;
|
||||
|
||||
bounds_slice get_clustering_slice() const;
|
||||
|
||||
/**
|
||||
* Checks if the clustering key has some unrestricted components.
|
||||
* @return <code>true</code> if the clustering key has some unrestricted components, <code>false</code> otherwise.
|
||||
@@ -374,6 +279,15 @@ public:
|
||||
|
||||
schema_ptr get_view_schema() const { return _view_schema; }
|
||||
private:
|
||||
std::pair<std::optional<secondary_index::index>, expr::expression> do_find_idx(const secondary_index::secondary_index_manager& sim) const;
|
||||
void add_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering, bool for_view);
|
||||
void add_is_not_restriction(const expr::binary_operator& restr, schema_ptr schema, bool for_view);
|
||||
void add_single_column_parition_key_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering, bool for_view);
|
||||
void add_token_partition_key_restriction(const expr::binary_operator& restr);
|
||||
void add_single_column_clustering_key_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering);
|
||||
void add_multi_column_clustering_key_restriction(const expr::binary_operator& restr);
|
||||
void add_single_column_nonprimary_key_restriction(const expr::binary_operator& restr);
|
||||
|
||||
void process_partition_key_restrictions(bool for_view, bool allow_filtering, statements::statement_type type);
|
||||
|
||||
/**
|
||||
@@ -401,17 +315,7 @@ private:
|
||||
void add_clustering_restrictions_to_idx_ck_prefix(const schema& idx_tbl_schema);
|
||||
|
||||
unsigned int num_clustering_prefix_columns_that_need_not_be_filtered() const;
|
||||
void calculate_column_defs_for_filtering_and_erase_restrictions_used_for_index(
|
||||
data_dictionary::database db,
|
||||
const single_column_predicate_vectors& sc_pk_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_ck_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_nonpk_pred_vectors);
|
||||
get_partition_key_ranges_fn_t build_partition_key_ranges_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_clustering_bounds_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_global_index_clustering_ranges_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_global_index_token_clustering_ranges_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_local_index_clustering_ranges_fn() const;
|
||||
get_singleton_value_fn_t build_value_for_index_partition_key_fn() const;
|
||||
void calculate_column_defs_for_filtering_and_erase_restrictions_used_for_index(data_dictionary::database db);
|
||||
public:
|
||||
/**
|
||||
* Returns the specified range of the partition key.
|
||||
@@ -485,10 +389,7 @@ public:
|
||||
private:
|
||||
/// Prepares internal data for evaluating index-table queries. Must be called before
|
||||
/// get_local_index_clustering_ranges().
|
||||
void prepare_indexed_local(const schema& idx_tbl_schema,
|
||||
const single_column_predicate_vectors& sc_pk_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_ck_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_nonpk_pred_vectors);
|
||||
void prepare_indexed_local(const schema& idx_tbl_schema);
|
||||
|
||||
/// Prepares internal data for evaluating index-table queries. Must be called before
|
||||
/// get_global_index_clustering_ranges() or get_global_index_token_clustering_ranges().
|
||||
@@ -497,18 +398,15 @@ private:
|
||||
public:
|
||||
/// Calculates clustering ranges for querying a global-index table.
|
||||
std::vector<query::clustering_range> get_global_index_clustering_ranges(
|
||||
const query_options& options) const;
|
||||
const query_options& options, const schema& idx_tbl_schema) const;
|
||||
|
||||
/// Calculates clustering ranges for querying a global-index table for queries with token restrictions present.
|
||||
std::vector<query::clustering_range> get_global_index_token_clustering_ranges(
|
||||
const query_options& options) const;
|
||||
const query_options& options, const schema& idx_tbl_schema) const;
|
||||
|
||||
/// Calculates clustering ranges for querying a local-index table.
|
||||
std::vector<query::clustering_range> get_local_index_clustering_ranges(
|
||||
const query_options& options) const;
|
||||
|
||||
/// Finds the value of partition key of the index table
|
||||
bytes_opt value_for_index_partition_key(const query_options&) const;
|
||||
const query_options& options, const schema& idx_tbl_schema) const;
|
||||
|
||||
sstring to_string() const;
|
||||
|
||||
@@ -518,7 +416,7 @@ public:
|
||||
bool is_empty() const;
|
||||
};
|
||||
|
||||
shared_ptr<const statement_restrictions> analyze_statement_restrictions(
|
||||
statement_restrictions analyze_statement_restrictions(
|
||||
data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
statements::statement_type type,
|
||||
@@ -529,14 +427,23 @@ shared_ptr<const statement_restrictions> analyze_statement_restrictions(
|
||||
bool allow_filtering,
|
||||
check_indexes do_check_indexes);
|
||||
|
||||
shared_ptr<const statement_restrictions> make_trivial_statement_restrictions(
|
||||
schema_ptr schema,
|
||||
bool allow_filtering);
|
||||
|
||||
// Extracts all binary operators which have the given column on their left hand side.
|
||||
// Extracts only single-column restrictions.
|
||||
// Does not include multi-column restrictions.
|
||||
// Does not include token() restrictions.
|
||||
// Does not include boolean constant restrictions.
|
||||
// For example "WHERE c = 1 AND (a, c) = (2, 1) AND token(p) < 2 AND FALSE" will return {"c = 1"}.
|
||||
std::vector<expr::expression> extract_single_column_restrictions_for_column(const expr::expression&, const column_definition&);
|
||||
|
||||
|
||||
// Checks whether this expression is empty - doesn't restrict anything
|
||||
bool is_empty_restriction(const expr::expression&);
|
||||
|
||||
// Finds the value of the given column in the expression
|
||||
// In case of multiple possible values calls on_internal_error
|
||||
bytes_opt value_for(const column_definition&, const expr::expression&, const query_options&);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -90,20 +90,6 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
|
||||
auto& current_rf_per_dc = ks.metadata()->strategy_options();
|
||||
auto new_rf_per_dc = _attrs->get_replication_options();
|
||||
new_rf_per_dc.erase(ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY);
|
||||
// Check if multi-RF change is allowed: all DC changes must be 0->N or N->0.
|
||||
auto all_changes_are_0_N = [&] {
|
||||
for (const auto& [dc, new_rf] : new_rf_per_dc) {
|
||||
auto old_rf_val = size_t(0);
|
||||
if (auto it = current_rf_per_dc.find(dc); it != current_rf_per_dc.end()) {
|
||||
old_rf_val = locator::get_replication_factor(it->second);
|
||||
}
|
||||
auto new_rf_val = locator::get_replication_factor(new_rf);
|
||||
if (old_rf_val != new_rf_val && old_rf_val != 0 && new_rf_val != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
unsigned total_abs_rfs_diff = 0;
|
||||
for (const auto& [new_dc, new_rf] : new_rf_per_dc) {
|
||||
auto old_rf = locator::replication_strategy_config_option(sstring("0"));
|
||||
@@ -117,9 +103,7 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
|
||||
// first we need to report non-existing DCs, then if RFs aren't changed by too much.
|
||||
continue;
|
||||
}
|
||||
if (total_abs_rfs_diff += get_abs_rf_diff(old_rf, new_rf); total_abs_rfs_diff >= 2 &&
|
||||
!(qp.proxy().features().keyspace_multi_rf_change && locator::uses_rack_list_exclusively(current_rf_per_dc)
|
||||
&& locator::uses_rack_list_exclusively(new_ks->strategy_options()) && all_changes_are_0_N())) {
|
||||
if (total_abs_rfs_diff += get_abs_rf_diff(old_rf, new_rf); total_abs_rfs_diff >= 2) {
|
||||
throw exceptions::invalid_request_exception("Only one DC's RF can be changed at a time and not by more than 1");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include <seastar/core/execution_stage.hh>
|
||||
#include "cas_request.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "tracing/trace_state.hh"
|
||||
#include "utils/unique_view.hh"
|
||||
|
||||
@@ -89,10 +89,6 @@ public:
|
||||
|
||||
const std::vector<single_statement>& statements() const { return _statements; }
|
||||
|
||||
audit::audit_info_ptr audit_info() const {
|
||||
return audit::audit::create_audit_info(audit::statement_category::DML, sstring(), sstring(), true);
|
||||
}
|
||||
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual uint32_t get_bound_terms() const override;
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#include "cql3/expr/evaluate.hh"
|
||||
#include "cql3/query_options.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "cql3/values.hh"
|
||||
#include "timeout_config.hh"
|
||||
#include "service/broadcast_tables/experimental/lang.hh"
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "auth/service.hh"
|
||||
#include "cql3/statements/prepared_statement.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "unimplemented.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "transport/event.hh"
|
||||
|
||||
@@ -411,10 +411,10 @@ bool ks_prop_defs::get_durable_writes() const {
|
||||
|
||||
lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(sstring ks_name, const locator::token_metadata& tm, const gms::feature_service& feat, const db::config& cfg) {
|
||||
auto sc = get_replication_strategy_class().value();
|
||||
// if tablets options have not been specified, but tablets are globally enabled, set the value to 0. The strategy will
|
||||
// validate it and throw an error if it does not support tablets.
|
||||
// if tablets options have not been specified, but tablets are globally enabled, set the value to 0 for N.T.S. only
|
||||
auto enable_tablets = feat.tablets && cfg.enable_tablets_by_default();
|
||||
std::optional<unsigned> default_initial_tablets = enable_tablets ? std::optional<unsigned>(0) : std::nullopt;
|
||||
std::optional<unsigned> default_initial_tablets = enable_tablets && locator::abstract_replication_strategy::to_qualified_class_name(sc) == "org.apache.cassandra.locator.NetworkTopologyStrategy"
|
||||
? std::optional<unsigned>(0) : std::nullopt;
|
||||
auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
|
||||
bool uses_tablets = initial_tablets.has_value();
|
||||
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
|
||||
@@ -440,7 +440,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
|
||||
sc = old->strategy_name();
|
||||
options = old_options;
|
||||
}
|
||||
return data_dictionary::keyspace_metadata::new_keyspace(old->name(), *sc, options, initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options(), {}, old->next_strategy_options_opt());
|
||||
return data_dictionary::keyspace_metadata::new_keyspace(old->name(), *sc, options, initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -626,7 +626,7 @@ modification_statement::prepare(data_dictionary::database db, prepare_context& c
|
||||
// Since this cache is only meaningful for LWT queries, just clear the ids
|
||||
// if it's not a conditional statement so that the AST nodes don't
|
||||
// participate in the caching mechanism later.
|
||||
if (!prepared_stmt->has_conditions() && prepared_stmt->_restrictions) {
|
||||
if (!prepared_stmt->has_conditions() && prepared_stmt->_restrictions.has_value()) {
|
||||
ctx.clear_pk_function_calls_cache();
|
||||
}
|
||||
prepared_stmt->_may_use_token_aware_routing = ctx.get_partition_key_bind_indexes(*schema).size() != 0;
|
||||
|
||||
@@ -94,7 +94,7 @@ private:
|
||||
std::optional<bool> _is_raw_counter_shard_write;
|
||||
|
||||
protected:
|
||||
shared_ptr<const restrictions::statement_restrictions> _restrictions;
|
||||
std::optional<restrictions::statement_restrictions> _restrictions;
|
||||
public:
|
||||
typedef std::optional<std::unordered_map<sstring, bytes_opt>> json_cache_opt;
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ public:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
|
||||
@@ -109,7 +109,7 @@ public:
|
||||
std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats, const cql_config& cfg, bool for_view);
|
||||
private:
|
||||
std::vector<selection::prepared_selector> maybe_jsonize_select_clause(std::vector<selection::prepared_selector> select, data_dictionary::database db, schema_ptr schema);
|
||||
::shared_ptr<const restrictions::statement_restrictions> prepare_restrictions(
|
||||
::shared_ptr<restrictions::statement_restrictions> prepare_restrictions(
|
||||
data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
prepare_context& ctx,
|
||||
|
||||
@@ -1027,7 +1027,7 @@ view_indexed_table_select_statement::prepare(data_dictionary::database db,
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
@@ -1139,7 +1139,7 @@ lw_shared_ptr<const service::pager::paging_state> view_indexed_table_select_stat
|
||||
auto& last_base_pk = last_pos.partition;
|
||||
auto* last_base_ck = last_pos.position.has_key() ? &last_pos.position.key() : nullptr;
|
||||
|
||||
bytes_opt indexed_column_value = _restrictions->value_for_index_partition_key(options);
|
||||
bytes_opt indexed_column_value = restrictions::value_for(*cdef, _used_index_restrictions, options);
|
||||
|
||||
auto index_pk = [&]() {
|
||||
if (_index.metadata().local()) {
|
||||
@@ -1350,7 +1350,12 @@ dht::partition_range_vector view_indexed_table_select_statement::get_partition_r
|
||||
dht::partition_range_vector view_indexed_table_select_statement::get_partition_ranges_for_global_index_posting_list(const query_options& options) const {
|
||||
dht::partition_range_vector partition_ranges;
|
||||
|
||||
bytes_opt value = _restrictions->value_for_index_partition_key(options);
|
||||
const column_definition* cdef = _schema->get_column_definition(to_bytes(_index.target_column()));
|
||||
if (!cdef) {
|
||||
throw exceptions::invalid_request_exception("Indexed column not found in schema");
|
||||
}
|
||||
|
||||
bytes_opt value = restrictions::value_for(*cdef, _used_index_restrictions, options);
|
||||
if (value) {
|
||||
auto pk = partition_key::from_single_value(*_view_schema, *value);
|
||||
auto dk = dht::decorate_key(*_view_schema, pk);
|
||||
@@ -1369,11 +1374,11 @@ query::partition_slice view_indexed_table_select_statement::get_partition_slice_
|
||||
// Only EQ restrictions on base partition key can be used in an index view query
|
||||
if (pk_restrictions_is_single && _restrictions->partition_key_restrictions_is_all_eq()) {
|
||||
partition_slice_builder.with_ranges(
|
||||
_restrictions->get_global_index_clustering_ranges(options));
|
||||
_restrictions->get_global_index_clustering_ranges(options, *_view_schema));
|
||||
} else if (_restrictions->has_token_restrictions()) {
|
||||
// Restrictions like token(p1, p2) < 0 have all partition key components restricted, but require special handling.
|
||||
partition_slice_builder.with_ranges(
|
||||
_restrictions->get_global_index_token_clustering_ranges(options));
|
||||
_restrictions->get_global_index_token_clustering_ranges(options, *_view_schema));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1384,7 +1389,7 @@ query::partition_slice view_indexed_table_select_statement::get_partition_slice_
|
||||
partition_slice_builder partition_slice_builder{*_view_schema};
|
||||
|
||||
partition_slice_builder.with_ranges(
|
||||
_restrictions->get_local_index_clustering_ranges(options));
|
||||
_restrictions->get_local_index_clustering_ranges(options, *_view_schema));
|
||||
|
||||
return partition_slice_builder.build();
|
||||
}
|
||||
@@ -1602,7 +1607,7 @@ public:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
@@ -1640,7 +1645,7 @@ private:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const select_statement::parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
parallelized_select_statement::ordering_comparator_type ordering_comparator,
|
||||
@@ -2071,7 +2076,7 @@ static select_statement::ordering_comparator_type get_similarity_ordering_compar
|
||||
|
||||
::shared_ptr<cql3::statements::select_statement> vector_indexed_table_select_statement::prepare(data_dictionary::database db, schema_ptr schema,
|
||||
uint32_t bound_terms, lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
|
||||
std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<attributes> attrs) {
|
||||
|
||||
@@ -2584,7 +2589,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
|
||||
return make_unique<prepared_statement>(audit_info(), std::move(stmt), ctx, std::move(partition_key_bind_indices), std::move(warnings));
|
||||
}
|
||||
|
||||
::shared_ptr<const restrictions::statement_restrictions>
|
||||
::shared_ptr<restrictions::statement_restrictions>
|
||||
select_statement::prepare_restrictions(data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
prepare_context& ctx,
|
||||
@@ -2594,8 +2599,8 @@ select_statement::prepare_restrictions(data_dictionary::database db,
|
||||
restrictions::check_indexes do_check_indexes)
|
||||
{
|
||||
try {
|
||||
return restrictions::analyze_statement_restrictions(db, schema, statement_type::SELECT, _where_clause, ctx,
|
||||
selection->contains_only_static_columns(), for_view, allow_filtering, do_check_indexes);
|
||||
return ::make_shared<restrictions::statement_restrictions>(restrictions::analyze_statement_restrictions(db, schema, statement_type::SELECT, _where_clause, ctx,
|
||||
selection->contains_only_static_columns(), for_view, allow_filtering, do_check_indexes));
|
||||
} catch (const exceptions::unrecognized_entity_exception& e) {
|
||||
if (contains_alias(e.entity)) {
|
||||
throw exceptions::invalid_request_exception(format("Aliases aren't allowed in the WHERE clause (name: '{}')", e.entity));
|
||||
|
||||
@@ -200,7 +200,7 @@ public:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
@@ -372,7 +372,7 @@ public:
|
||||
|
||||
static ::shared_ptr<cql3::statements::select_statement> prepare(data_dictionary::database db, schema_ptr schema, uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
|
||||
std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<cql3::attributes> attrs);
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "cql3/cql_statement.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "raw/parsed_statement.hh"
|
||||
#include "service/qos/qos_common.hh"
|
||||
#include "service/query_state.hh"
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "cql3/cql_statement.hh"
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "unimplemented.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include <optional>
|
||||
#include "validation.hh"
|
||||
|
||||
@@ -66,7 +66,7 @@ public:
|
||||
: update_statement(std::move(audit_info), statement_type::INSERT, bound_terms, s, std::move(attrs), stats)
|
||||
, _value(std::move(v))
|
||||
, _default_unset(default_unset) {
|
||||
_restrictions = cql3::restrictions::make_trivial_statement_restrictions(s, false);
|
||||
_restrictions = restrictions::statement_restrictions(s, false);
|
||||
}
|
||||
private:
|
||||
virtual void execute_operations_for_key(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const json_cache_opt& json_cache) const override;
|
||||
|
||||
@@ -224,12 +224,10 @@ keyspace_metadata::keyspace_metadata(std::string_view name,
|
||||
bool durable_writes,
|
||||
std::vector<schema_ptr> cf_defs,
|
||||
user_types_metadata user_types,
|
||||
storage_options storage_opts,
|
||||
std::optional<locator::replication_strategy_config_options> next_options)
|
||||
storage_options storage_opts)
|
||||
: _name{name}
|
||||
, _strategy_name{locator::abstract_replication_strategy::to_qualified_class_name(strategy_name.empty() ? "NetworkTopologyStrategy" : strategy_name)}
|
||||
, _strategy_options{std::move(strategy_options)}
|
||||
, _next_strategy_options{std::move(next_options)}
|
||||
, _initial_tablets(initial_tablets)
|
||||
, _durable_writes{durable_writes}
|
||||
, _user_types{std::move(user_types)}
|
||||
@@ -275,15 +273,14 @@ keyspace_metadata::new_keyspace(std::string_view name,
|
||||
std::optional<consistency_config_option> consistency_option,
|
||||
bool durables_writes,
|
||||
storage_options storage_opts,
|
||||
std::vector<schema_ptr> cf_defs,
|
||||
std::optional<locator::replication_strategy_config_options> next_options)
|
||||
std::vector<schema_ptr> cf_defs)
|
||||
{
|
||||
return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, initial_tablets, consistency_option, durables_writes, cf_defs, user_types_metadata{}, storage_opts, next_options);
|
||||
return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, initial_tablets, consistency_option, durables_writes, cf_defs, user_types_metadata{}, storage_opts);
|
||||
}
|
||||
|
||||
lw_shared_ptr<keyspace_metadata>
|
||||
keyspace_metadata::new_keyspace(const keyspace_metadata& ksm) {
|
||||
return new_keyspace(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(), ksm.get_storage_options(), {}, ksm.next_strategy_options_opt());
|
||||
return new_keyspace(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(), ksm.get_storage_options());
|
||||
}
|
||||
|
||||
void keyspace_metadata::add_user_type(const user_type ut) {
|
||||
@@ -339,7 +336,7 @@ static storage_options::object_storage object_storage_from_map(std::string_view
|
||||
}
|
||||
if (values.size() > allowed_options.size()) {
|
||||
throw std::runtime_error(fmt::format("Extraneous options for {}: {}; allowed: {}",
|
||||
type, fmt::join(values | std::views::keys, ","),
|
||||
fmt::join(values | std::views::keys, ","), type,
|
||||
fmt::join(allowed_options | std::views::keys, ",")));
|
||||
}
|
||||
options.type = std::string(type);
|
||||
@@ -652,8 +649,8 @@ struct fmt::formatter<data_dictionary::user_types_metadata> {
|
||||
};
|
||||
|
||||
auto fmt::formatter<data_dictionary::keyspace_metadata>::format(const data_dictionary::keyspace_metadata& m, fmt::format_context& ctx) const -> decltype(ctx.out()) {
|
||||
fmt::format_to(ctx.out(), "KSMetaData{{name={}, strategyClass={}, strategyOptions={}, nextStrategyOptions={}, cfMetaData={}, durable_writes={}, tablets=",
|
||||
m.name(), m.strategy_name(), m.strategy_options(), m.next_strategy_options_opt(), m.cf_meta_data(), m.durable_writes());
|
||||
fmt::format_to(ctx.out(), "KSMetaData{{name={}, strategyClass={}, strategyOptions={}, cfMetaData={}, durable_writes={}, tablets=",
|
||||
m.name(), m.strategy_name(), m.strategy_options(), m.cf_meta_data(), m.durable_writes());
|
||||
if (m.initial_tablets()) {
|
||||
if (auto initial_tablets = m.initial_tablets().value()) {
|
||||
fmt::format_to(ctx.out(), "{{\"initial\":{}}}", initial_tablets);
|
||||
|
||||
@@ -28,9 +28,7 @@ namespace data_dictionary {
|
||||
class keyspace_metadata final {
|
||||
sstring _name;
|
||||
sstring _strategy_name;
|
||||
// If _next_strategy_options has value, there is ongoing rf change of this keyspace.
|
||||
locator::replication_strategy_config_options _strategy_options;
|
||||
std::optional<locator::replication_strategy_config_options> _next_strategy_options;
|
||||
std::optional<unsigned> _initial_tablets;
|
||||
std::unordered_map<sstring, schema_ptr> _cf_meta_data;
|
||||
bool _durable_writes;
|
||||
@@ -46,8 +44,7 @@ public:
|
||||
bool durable_writes,
|
||||
std::vector<schema_ptr> cf_defs = std::vector<schema_ptr>{},
|
||||
user_types_metadata user_types = user_types_metadata{},
|
||||
storage_options storage_opts = storage_options{},
|
||||
std::optional<locator::replication_strategy_config_options> next_options = std::nullopt);
|
||||
storage_options storage_opts = storage_options{});
|
||||
static lw_shared_ptr<keyspace_metadata>
|
||||
new_keyspace(std::string_view name,
|
||||
std::string_view strategy_name,
|
||||
@@ -56,8 +53,7 @@ public:
|
||||
std::optional<consistency_config_option> consistency_option,
|
||||
bool durables_writes = true,
|
||||
storage_options storage_opts = {},
|
||||
std::vector<schema_ptr> cf_defs = {},
|
||||
std::optional<locator::replication_strategy_config_options> next_options = std::nullopt);
|
||||
std::vector<schema_ptr> cf_defs = {});
|
||||
static lw_shared_ptr<keyspace_metadata>
|
||||
new_keyspace(const keyspace_metadata& ksm);
|
||||
void validate(const gms::feature_service&, const locator::topology&) const;
|
||||
@@ -70,18 +66,6 @@ public:
|
||||
const locator::replication_strategy_config_options& strategy_options() const {
|
||||
return _strategy_options;
|
||||
}
|
||||
void set_strategy_options(const locator::replication_strategy_config_options& options) {
|
||||
_strategy_options = options;
|
||||
}
|
||||
const std::optional<locator::replication_strategy_config_options>& next_strategy_options_opt() const {
|
||||
return _next_strategy_options;
|
||||
}
|
||||
void set_next_strategy_options(const locator::replication_strategy_config_options& options) {
|
||||
_next_strategy_options = options;
|
||||
}
|
||||
void clear_next_strategy_options() {
|
||||
_next_strategy_options = std::nullopt;
|
||||
}
|
||||
locator::replication_strategy_config_options strategy_options_v1() const;
|
||||
std::optional<unsigned> initial_tablets() const {
|
||||
return _initial_tablets;
|
||||
|
||||
@@ -776,7 +776,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
|
||||
friend std::ostream& operator<<(std::ostream&, const segment&);
|
||||
friend class segment_manager;
|
||||
|
||||
constexpr size_t sector_overhead(size_t size) const {
|
||||
size_t sector_overhead(size_t size) const {
|
||||
return (size / (_alignment - detail::sector_overhead_size)) * detail::sector_overhead_size;
|
||||
}
|
||||
|
||||
@@ -1028,21 +1028,18 @@ public:
|
||||
co_return me;
|
||||
}
|
||||
|
||||
std::tuple<size_t, size_t> buffer_usage_size(size_t s) const {
|
||||
/**
|
||||
* Allocate a new buffer
|
||||
*/
|
||||
void new_buffer(size_t s) {
|
||||
SCYLLA_ASSERT(_buffer.empty());
|
||||
|
||||
auto overhead = segment_overhead_size;
|
||||
if (_file_pos == 0) {
|
||||
overhead += descriptor_header_size;
|
||||
}
|
||||
|
||||
return {s + overhead, overhead};
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocate a new buffer
|
||||
*/
|
||||
void new_buffer(size_t size_in) {
|
||||
SCYLLA_ASSERT(_buffer.empty());
|
||||
auto [s, overhead] = buffer_usage_size(size_in);
|
||||
s += overhead;
|
||||
// add bookkeep data reqs.
|
||||
auto a = align_up(s + sector_overhead(s), _alignment);
|
||||
auto k = std::max(a, default_size);
|
||||
@@ -1430,9 +1427,6 @@ public:
|
||||
|
||||
position_type next_position(size_t size) const {
|
||||
auto used = _buffer_ostream_size - _buffer_ostream.size();
|
||||
if (used == 0) { // new chunk/segment
|
||||
std::tie(size, std::ignore) = buffer_usage_size(size);
|
||||
}
|
||||
used += size;
|
||||
return _file_pos + used + sector_overhead(used);
|
||||
}
|
||||
@@ -1576,6 +1570,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
clogger.debug("Attempting oversized alloc of {} entry writer", writer.num_entries);
|
||||
|
||||
auto size = writer.size();
|
||||
auto max_file_size = cfg.commitlog_segment_size_in_mb * 1024 * 1024;
|
||||
|
||||
// check if this cannot be written at all...
|
||||
if (!cfg.allow_going_over_size_limit) {
|
||||
@@ -1584,11 +1579,11 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
// more worst case
|
||||
auto size_with_meta_overhead = size_with_sector_overhead
|
||||
+ (1 + size_with_sector_overhead/max_mutation_size) * (segment::entry_overhead_size + segment::fragmented_entry_overhead_size + segment::segment_overhead_size)
|
||||
* (1 + size_with_sector_overhead/max_size) * segment::descriptor_header_size
|
||||
* (1 + size_with_sector_overhead/max_file_size) * segment::descriptor_header_size
|
||||
;
|
||||
// this is not really true. We could have some space in current segment,
|
||||
// but again, lets be conservative.
|
||||
auto max_file_size_avail = max_disk_size - max_size;
|
||||
auto max_file_size_avail = max_disk_size - max_file_size;
|
||||
|
||||
if (size_with_meta_overhead > max_file_size_avail) {
|
||||
throw std::invalid_argument(fmt::format("Mutation of {} bytes is too large for potentially available disk space of {}", size, max_file_size_avail));
|
||||
@@ -1775,13 +1770,11 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
co_await s->close();
|
||||
s = co_await get_segment();
|
||||
}
|
||||
// bytes not counting overhead
|
||||
auto pos = s->position();
|
||||
auto max = std::max<size_t>(pos, max_size);
|
||||
auto buf_rem = std::min(max_size - max, s->_buffer_ostream.size());
|
||||
// bytes not counting overhead
|
||||
auto buf_rem = std::min(max_size - s->position(), s->_buffer_ostream.size());
|
||||
|
||||
size_t avail;
|
||||
if (buf_rem >= align) {
|
||||
if (buf_rem > align) {
|
||||
auto rem2 = buf_rem - (1 + buf_rem/sector_size) * detail::sector_overhead_size;
|
||||
avail = std::min(rem2, max_mutation_size)
|
||||
- segment::entry_overhead_size
|
||||
@@ -1791,7 +1784,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
|
||||
} else {
|
||||
co_await s->cycle();
|
||||
auto pos = s->position();
|
||||
auto max = std::max<size_t>(pos, max_size);
|
||||
auto max = std::max<size_t>(pos, max_file_size);
|
||||
auto file_rem = max - pos;
|
||||
|
||||
if (file_rem < align) {
|
||||
|
||||
@@ -217,7 +217,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
|
||||
if (cm_it == local_cm.end()) {
|
||||
if (!cer.get_column_mapping()) {
|
||||
rlogger.debug("replaying at {} v={} at {}", fm.column_family_id(), fm.schema_version(), rp);
|
||||
throw std::runtime_error(format("unknown schema version {}, table={}", fm.schema_version(), fm.column_family_id()));
|
||||
throw std::runtime_error(format("unknown schema version {}, table=", fm.schema_version(), fm.column_family_id()));
|
||||
}
|
||||
rlogger.debug("new schema version {} in entry {}", fm.schema_version(), rp);
|
||||
cm_it = local_cm.emplace(fm.schema_version(), *cer.get_column_mapping()).first;
|
||||
|
||||
20
db/config.cc
20
db/config.cc
@@ -330,14 +330,14 @@ const config_type& config_type_for<std::vector<db::config::error_injection_at_st
|
||||
}
|
||||
|
||||
template <>
|
||||
const config_type& config_type_for<enum_option<netw::dict_training_loop::when>>() {
|
||||
const config_type& config_type_for<enum_option<netw::dict_training_when>>() {
|
||||
static config_type ct(
|
||||
"dictionary training conditions", printable_to_json<enum_option<netw::dict_training_loop::when>>);
|
||||
"dictionary training conditions", printable_to_json<enum_option<netw::dict_training_when>>);
|
||||
return ct;
|
||||
}
|
||||
|
||||
template <>
|
||||
const config_type& config_type_for<netw::advanced_rpc_compressor::tracker::algo_config>() {
|
||||
const config_type& config_type_for<netw::algo_config>() {
|
||||
static config_type ct(
|
||||
"advanced rpc compressor config", printable_vector_to_json<enum_option<netw::compression_algorithm>>);
|
||||
return ct;
|
||||
@@ -530,9 +530,9 @@ struct convert<db::config::error_injection_at_startup> {
|
||||
|
||||
|
||||
template <>
|
||||
class convert<enum_option<netw::dict_training_loop::when>> {
|
||||
class convert<enum_option<netw::dict_training_when>> {
|
||||
public:
|
||||
static bool decode(const Node& node, enum_option<netw::dict_training_loop::when>& rhs) {
|
||||
static bool decode(const Node& node, enum_option<netw::dict_training_when>& rhs) {
|
||||
std::string name;
|
||||
if (!convert<std::string>::decode(node, name)) {
|
||||
return false;
|
||||
@@ -1110,7 +1110,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"Specifies RPC compression algorithms supported by this node. ")
|
||||
, internode_compression_enable_advanced(this, "internode_compression_enable_advanced", liveness::MustRestart, value_status::Used, false,
|
||||
"Enables the new implementation of RPC compression. If disabled, Scylla will fall back to the old implementation.")
|
||||
, rpc_dict_training_when(this, "rpc_dict_training_when", liveness::LiveUpdate, value_status::Used, netw::dict_training_loop::when::type::NEVER,
|
||||
, rpc_dict_training_when(this, "rpc_dict_training_when", liveness::LiveUpdate, value_status::Used, netw::dict_training_when::type::NEVER,
|
||||
"Specifies when RPC compression dictionary training is performed by this node.\n"
|
||||
"* `never` disables it unconditionally.\n"
|
||||
"* `when_leader` enables it only whenever the node is the Raft leader.\n"
|
||||
@@ -1921,7 +1921,7 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
|
||||
{"lwt", feature::UNUSED},
|
||||
{"udf", feature::UDF},
|
||||
{"cdc", feature::UNUSED},
|
||||
{"alternator-streams", feature::UNUSED},
|
||||
{"alternator-streams", feature::ALTERNATOR_STREAMS},
|
||||
{"alternator-ttl", feature::UNUSED },
|
||||
{"consistent-topology-changes", feature::UNUSED},
|
||||
{"broadcast-tables", feature::BROADCAST_TABLES},
|
||||
@@ -2025,8 +2025,8 @@ template struct utils::config_file::named_value<enum_option<db::experimental_fea
|
||||
template struct utils::config_file::named_value<enum_option<db::replication_strategy_restriction_t>>;
|
||||
template struct utils::config_file::named_value<enum_option<db::consistency_level_restriction_t>>;
|
||||
template struct utils::config_file::named_value<enum_option<db::tablets_mode_t>>;
|
||||
template struct utils::config_file::named_value<enum_option<netw::dict_training_loop::when>>;
|
||||
template struct utils::config_file::named_value<netw::advanced_rpc_compressor::tracker::algo_config>;
|
||||
template struct utils::config_file::named_value<enum_option<netw::dict_training_when>>;
|
||||
template struct utils::config_file::named_value<netw::algo_config>;
|
||||
template struct utils::config_file::named_value<std::vector<enum_option<db::experimental_features_t>>>;
|
||||
template struct utils::config_file::named_value<std::vector<enum_option<db::replication_strategy_restriction_t>>>;
|
||||
template struct utils::config_file::named_value<std::vector<enum_option<db::consistency_level_restriction_t>>>;
|
||||
@@ -2094,7 +2094,7 @@ future<gms::inet_address> resolve(const config_file::named_value<sstring>& addre
|
||||
}
|
||||
}
|
||||
|
||||
co_return coroutine::exception(std::move(ex));
|
||||
co_return seastar::coroutine::exception(std::move(ex));
|
||||
}
|
||||
|
||||
static std::vector<seastar::metrics::relabel_config> get_relable_from_yaml(const YAML::Node& yaml, const std::string& name) {
|
||||
|
||||
15
db/config.hh
15
db/config.hh
@@ -9,6 +9,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <filesystem>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <seastar/core/sstring.hh>
|
||||
@@ -16,15 +17,14 @@
|
||||
#include <seastar/util/program-options.hh>
|
||||
#include <seastar/util/log.hh>
|
||||
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/replication_strategy_type.hh"
|
||||
#include "seastarx.hh"
|
||||
#include "utils/config_file.hh"
|
||||
#include "utils/enum_option.hh"
|
||||
#include "gms/inet_address.hh"
|
||||
#include "db/hints/host_filter.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "message/dict_trainer.hh"
|
||||
#include "message/advanced_rpc_compressor.hh"
|
||||
#include "message/rpc_compression_types.hh"
|
||||
#include "db/consistency_level_type.hh"
|
||||
#include "db/tri_mode_restriction.hh"
|
||||
#include "sstables/compressor.hh"
|
||||
@@ -115,6 +115,7 @@ struct experimental_features_t {
|
||||
enum class feature {
|
||||
UNUSED,
|
||||
UDF,
|
||||
ALTERNATOR_STREAMS,
|
||||
BROADCAST_TABLES,
|
||||
KEYSPACE_STORAGE_OPTIONS,
|
||||
STRONGLY_CONSISTENT_TABLES,
|
||||
@@ -324,9 +325,9 @@ public:
|
||||
named_value<uint32_t> internode_compression_zstd_min_message_size;
|
||||
named_value<uint32_t> internode_compression_zstd_max_message_size;
|
||||
named_value<bool> internode_compression_checksumming;
|
||||
named_value<netw::advanced_rpc_compressor::tracker::algo_config> internode_compression_algorithms;
|
||||
named_value<netw::algo_config> internode_compression_algorithms;
|
||||
named_value<bool> internode_compression_enable_advanced;
|
||||
named_value<enum_option<netw::dict_training_loop::when>> rpc_dict_training_when;
|
||||
named_value<enum_option<netw::dict_training_when>> rpc_dict_training_when;
|
||||
named_value<uint32_t> rpc_dict_training_min_time_seconds;
|
||||
named_value<uint64_t> rpc_dict_training_min_bytes;
|
||||
named_value<bool> inter_dc_tcp_nodelay;
|
||||
@@ -738,8 +739,8 @@ extern template struct utils::config_file::named_value<enum_option<db::experimen
|
||||
extern template struct utils::config_file::named_value<enum_option<db::replication_strategy_restriction_t>>;
|
||||
extern template struct utils::config_file::named_value<enum_option<db::consistency_level_restriction_t>>;
|
||||
extern template struct utils::config_file::named_value<enum_option<db::tablets_mode_t>>;
|
||||
extern template struct utils::config_file::named_value<enum_option<netw::dict_training_loop::when>>;
|
||||
extern template struct utils::config_file::named_value<netw::advanced_rpc_compressor::tracker::algo_config>;
|
||||
extern template struct utils::config_file::named_value<enum_option<netw::dict_training_when>>;
|
||||
extern template struct utils::config_file::named_value<netw::algo_config>;
|
||||
extern template struct utils::config_file::named_value<std::vector<enum_option<db::experimental_features_t>>>;
|
||||
extern template struct utils::config_file::named_value<std::vector<enum_option<db::replication_strategy_restriction_t>>>;
|
||||
extern template struct utils::config_file::named_value<std::vector<enum_option<db::consistency_level_restriction_t>>>;
|
||||
|
||||
@@ -277,7 +277,7 @@ filter_for_query(consistency_level cl,
|
||||
|
||||
host_id_vector_replica_set selected_endpoints;
|
||||
|
||||
// Preselect endpoints based on client preference. If the endpoints
|
||||
// Pre-select endpoints based on client preference. If the endpoints
|
||||
// selected this way aren't enough to satisfy CL requirements select the
|
||||
// remaining ones according to the load-balancing strategy as before.
|
||||
if (!preferred_endpoints.empty()) {
|
||||
|
||||
@@ -327,7 +327,7 @@ redistribute(const std::vector<float>& p, unsigned me, unsigned k) {
|
||||
}
|
||||
}
|
||||
|
||||
hr_logger.trace(" pp after1={}", pp);
|
||||
hr_logger.trace(" pp after1=", pp);
|
||||
if (d.first == me) {
|
||||
// We only care what "me" sends, and only the elements in
|
||||
// the sorted list earlier than me could have forced it to
|
||||
|
||||
@@ -33,11 +33,6 @@ enum class schema_feature {
|
||||
|
||||
// Per-table tablet options
|
||||
TABLET_OPTIONS,
|
||||
|
||||
// When enabled, `system_schema.keyspaces` will keep three replication values:
|
||||
// the initial, the current, and the target replication factor,
|
||||
// which reflect the phases of the multi RF change.
|
||||
KEYSPACE_MULTI_RF_CHANGE,
|
||||
};
|
||||
|
||||
using schema_features = enum_set<super_enum<schema_feature,
|
||||
@@ -48,8 +43,7 @@ using schema_features = enum_set<super_enum<schema_feature,
|
||||
schema_feature::TABLE_DIGEST_INSENSITIVE_TO_EXPIRY,
|
||||
schema_feature::GROUP0_SCHEMA_VERSIONING,
|
||||
schema_feature::IN_MEMORY_TABLES,
|
||||
schema_feature::TABLET_OPTIONS,
|
||||
schema_feature::KEYSPACE_MULTI_RF_CHANGE
|
||||
schema_feature::TABLET_OPTIONS
|
||||
>>;
|
||||
|
||||
}
|
||||
|
||||
@@ -216,7 +216,6 @@ schema_ptr keyspaces() {
|
||||
{"durable_writes", boolean_type},
|
||||
{"replication", map_type_impl::get_instance(utf8_type, utf8_type, false)},
|
||||
{"replication_v2", map_type_impl::get_instance(utf8_type, utf8_type, false)}, // with rack list RF
|
||||
{"next_replication", map_type_impl::get_instance(utf8_type, utf8_type, false)}, // target rack list RF for this RF change
|
||||
},
|
||||
// static columns
|
||||
{},
|
||||
@@ -1179,14 +1178,6 @@ utils::chunked_vector<mutation> make_create_keyspace_mutations(schema_features f
|
||||
// If the maps are different, the upgrade must be already done.
|
||||
store_map(m, ckey, "replication_v2", timestamp, cql3::statements::to_flattened_map(map));
|
||||
}
|
||||
if (features.contains<schema_feature::KEYSPACE_MULTI_RF_CHANGE>()) {
|
||||
const auto& next_map_opt = keyspace->next_strategy_options_opt();
|
||||
if (next_map_opt) {
|
||||
auto next_map = *next_map_opt;
|
||||
next_map["class"] = keyspace->strategy_name();
|
||||
store_map(m, ckey, "next_replication", timestamp, cql3::statements::to_flattened_map(next_map));
|
||||
}
|
||||
}
|
||||
|
||||
if (features.contains<schema_feature::SCYLLA_KEYSPACES>()) {
|
||||
schema_ptr scylla_keyspaces_s = scylla_keyspaces();
|
||||
@@ -1260,7 +1251,6 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
|
||||
// (or screw up shared pointers)
|
||||
const auto& replication = row.get_nonnull<map_type_impl::native_type>("replication");
|
||||
const auto& replication_v2 = row.get<map_type_impl::native_type>("replication_v2");
|
||||
const auto& next_replication = row.get<map_type_impl::native_type>("next_replication");
|
||||
|
||||
cql3::statements::property_definitions::map_type flat_strategy_options;
|
||||
for (auto& p : replication_v2 ? *replication_v2 : replication) {
|
||||
@@ -1269,17 +1259,6 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
|
||||
auto strategy_options = cql3::statements::from_flattened_map(flat_strategy_options);
|
||||
auto strategy_name = std::get<sstring>(strategy_options["class"]);
|
||||
strategy_options.erase("class");
|
||||
|
||||
std::optional<cql3::statements::property_definitions::extended_map_type> next_strategy_options = std::nullopt;
|
||||
if (next_replication) {
|
||||
cql3::statements::property_definitions::map_type flat_next_replication;
|
||||
for (auto& p : *next_replication) {
|
||||
flat_next_replication.emplace(value_cast<sstring>(p.first), value_cast<sstring>(p.second));
|
||||
}
|
||||
next_strategy_options = cql3::statements::from_flattened_map(flat_next_replication);
|
||||
next_strategy_options->erase("class");
|
||||
}
|
||||
|
||||
bool durable_writes = row.get_nonnull<bool>("durable_writes");
|
||||
|
||||
data_dictionary::storage_options storage_opts;
|
||||
@@ -1305,7 +1284,7 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
|
||||
}
|
||||
}
|
||||
}
|
||||
co_return keyspace_metadata::new_keyspace(keyspace_name, strategy_name, strategy_options, initial_tablets, consistency, durable_writes, storage_opts, {}, next_strategy_options);
|
||||
co_return keyspace_metadata::new_keyspace(keyspace_name, strategy_name, strategy_options, initial_tablets, consistency, durable_writes, storage_opts);
|
||||
}
|
||||
|
||||
template<typename V>
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "replica/database.hh"
|
||||
#include "db/consistency_level_type.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "db/config.hh"
|
||||
#include "schema/schema_builder.hh"
|
||||
#include "timeout_config.hh"
|
||||
#include "types/types.hh"
|
||||
@@ -21,6 +22,8 @@
|
||||
#include "cdc/generation.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
|
||||
#include "service/migration_manager.hh"
|
||||
#include "locator/host_id.hh"
|
||||
|
||||
@@ -38,10 +41,27 @@ static logging::logger dlogger("system_distributed_keyspace");
|
||||
extern logging::logger cdc_log;
|
||||
|
||||
namespace db {
|
||||
namespace {
|
||||
const auto set_wait_for_sync_to_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if ((builder.ks_name() == system_distributed_keyspace::NAME_EVERYWHERE && builder.cf_name() == system_distributed_keyspace::CDC_GENERATIONS_V2) ||
|
||||
(builder.ks_name() == system_distributed_keyspace::NAME && builder.cf_name() == system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION))
|
||||
{
|
||||
builder.set_wait_for_sync_to_commitlog(true);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
extern thread_local data_type cdc_streams_set_type;
|
||||
thread_local data_type cdc_streams_set_type = set_type_impl::get_instance(bytes_type, false);
|
||||
|
||||
/* See `token_range_description` struct */
|
||||
thread_local data_type cdc_streams_list_type = list_type_impl::get_instance(bytes_type, false);
|
||||
thread_local data_type cdc_token_range_description_type = tuple_type_impl::get_instance(
|
||||
{ long_type // dht::token token_range_end;
|
||||
, cdc_streams_list_type // std::vector<stream_id> streams;
|
||||
, byte_type // uint8_t sharding_ignore_msb;
|
||||
});
|
||||
thread_local data_type cdc_generation_description_type = list_type_impl::get_instance(cdc_token_range_description_type, false);
|
||||
|
||||
schema_ptr view_build_status() {
|
||||
static thread_local auto schema = [] {
|
||||
@@ -57,6 +77,42 @@ schema_ptr view_build_status() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
/* An internal table used by nodes to exchange CDC generation data. */
|
||||
schema_ptr cdc_generations_v2() {
    // The schema is built on the first call made by each thread and cached
    // in a thread_local; subsequent calls return the cached pointer.
    thread_local auto schema = [] {
        auto id = generate_legacy_id(system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2);
        schema_builder builder(system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2, {id});

        /* Partition key: the unique identifier of the generation. */
        builder.with_column("id", uuid_type, column_kind::partition_key);

        /* Clustering key: the end of the token sub-range described by the row.
         *
         * A generation maps every token in the ring to a set of stream IDs.
         * The mapping is assembled from smaller, range-local mappings whose
         * sub-ranges together cover the whole ring; each of them is one row
         * of this table. The sub-range of a row starts at the range_end of
         * the previous row (in clustering order, i.e. integer order), or at
         * the range_end of the last row of the partition if this is the
         * first row. */
        builder.with_column("range_end", long_type, column_kind::clustering_key);

        /* The set of streams mapped to in this range.
         *
         * The number of streams per range is bounded from above by the
         * number of shards on the node owning that range, hence by the
         * maximum shard count over all nodes. Counting about 20B per stream,
         * a cluster whose nodes have at most 128 shards keeps the serialized
         * size of this set within ~2.5 KB. */
        builder.with_column("streams", cdc_streams_set_type);

        /* The `ignore_msb` sharding parameter of the node that owned this
         * token range when the generation was first created. Together with
         * the stream set it fully describes the mapping for this range. */
        builder.with_column("ignore_msb", byte_type);

        /* Sanity-check column: the number of ranges in the generation. Once
         * the generation is fully inserted it must equal the number of rows
         * in the partition. */
        builder.with_column("num_ranges", int32_type, column_kind::static_column);

        builder.with_hash_version();
        return builder.build();
    }();
    return schema;
}
|
||||
|
||||
/* A user-facing table providing identifiers of the streams used in CDC generations. */
|
||||
schema_ptr cdc_desc() {
|
||||
@@ -96,6 +152,23 @@ schema_ptr cdc_timestamps() {
|
||||
|
||||
static const sstring CDC_TIMESTAMPS_KEY = "timestamps";
|
||||
|
||||
// Schema of the SERVICE_LEVELS table in the NAME keyspace: a `service_level`
// partition key plus an int32 `shares` column. Built on the first call made
// by each thread and cached afterwards.
schema_ptr service_levels() {
    static thread_local auto schema = [] {
        schema_builder builder(
                system_distributed_keyspace::NAME,
                system_distributed_keyspace::SERVICE_LEVELS,
                std::make_optional(generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS)));
        builder.with_column("service_level", utf8_type, column_kind::partition_key);
        builder.with_column("shares", int32_type);

        // With this error injection enabled, `shares` is removed again
        // right after being added (the add-then-remove order is deliberate
        // and kept as in the original).
        const bool without_shares = utils::get_local_injector().is_enabled("service_levels_v1_table_without_shares");
        if (without_shares) {
            builder.remove_column("shares");
        }

        builder.with_hash_version();
        return builder.build();
    }();
    return schema;
}
|
||||
|
||||
// This is the set of tables which this node ensures to exist in the cluster.
|
||||
// It does that by announcing the creation of these schemas on initialization
|
||||
// of the `system_distributed_keyspace` service (see `start()`), unless it first
|
||||
@@ -109,13 +182,19 @@ static const sstring CDC_TIMESTAMPS_KEY = "timestamps";
|
||||
static std::vector<schema_ptr> ensured_tables() {
|
||||
return {
|
||||
view_build_status(),
|
||||
cdc_generations_v2(),
|
||||
cdc_desc(),
|
||||
cdc_timestamps(),
|
||||
service_levels(),
|
||||
};
|
||||
}
|
||||
|
||||
std::vector<schema_ptr> system_distributed_keyspace::all_distributed_tables() {
|
||||
return {view_build_status(), cdc_desc(), cdc_timestamps()};
|
||||
return {view_build_status(), cdc_desc(), cdc_timestamps(), service_levels()};
|
||||
}
|
||||
|
||||
std::vector<schema_ptr> system_distributed_keyspace::all_everywhere_tables() {
|
||||
return {cdc_generations_v2()};
|
||||
}
|
||||
|
||||
system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm, service::storage_proxy& sp)
|
||||
@@ -124,6 +203,36 @@ system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor&
|
||||
, _sp(sp) {
|
||||
}
|
||||
|
||||
static std::vector<std::pair<std::string_view, data_type>> new_service_levels_columns(bool workload_prioritization_enabled) {
|
||||
std::vector<std::pair<std::string_view, data_type>> new_columns {{"timeout", duration_type}, {"workload_type", utf8_type}};
|
||||
if (workload_prioritization_enabled) {
|
||||
new_columns.push_back({"shares", int32_type});
|
||||
}
|
||||
return new_columns;
|
||||
};
|
||||
|
||||
// Returns the service_levels schema as known to `db`; if `db` has no such
// schema, falls back to the built-in definition from service_levels().
static schema_ptr get_current_service_levels(data_dictionary::database db) {
    if (!db.has_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS)) {
        return service_levels();
    }
    return db.find_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS);
}
|
||||
|
||||
// Builds a schema from the current service_levels schema, with every column
// from new_service_levels_columns() that it lacks added as a regular column.
// Asserts that it runs on shard 0.
static schema_ptr get_updated_service_levels(data_dictionary::database db, bool workload_prioritization_enabled) {
    SCYLLA_ASSERT(this_shard_id() == 0);

    auto current = get_current_service_levels(db);
    schema_builder updated(current);
    for (const auto& [column, type] : new_service_levels_columns(workload_prioritization_enabled)) {
        // NOTE(review): the name is handed over as a raw character pointer, so
        // presumably it must be NUL-terminated; that holds for the string
        // literals listed in new_service_levels_columns().
        bytes column_name = to_bytes(column.data());
        if (!current->get_column_definition(column_name)) {
            updated.with_column(column_name, type, column_kind::regular_column);
        }
    }
    updated.with_hash_version();
    return updated.build();
}
|
||||
|
||||
future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tables) {
|
||||
if (this_shard_id() != 0) {
|
||||
_started = true;
|
||||
@@ -134,9 +243,11 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
|
||||
|
||||
while (true) {
|
||||
// Check if there is any work to do before taking the group 0 guard.
|
||||
bool keyspaces_setup = db.has_keyspace(NAME);
|
||||
bool workload_prioritization_enabled = _sp.features().workload_prioritization;
|
||||
bool keyspaces_setup = db.has_keyspace(NAME) && db.has_keyspace(NAME_EVERYWHERE);
|
||||
bool tables_setup = std::all_of(tables.begin(), tables.end(), [db] (schema_ptr t) { return db.has_schema(t->ks_name(), t->cf_name()); } );
|
||||
if (keyspaces_setup && tables_setup) {
|
||||
bool service_levels_up_to_date = get_current_service_levels(db)->equal_columns(*get_updated_service_levels(db, workload_prioritization_enabled));
|
||||
if (keyspaces_setup && tables_setup && service_levels_up_to_date) {
|
||||
dlogger.info("system_distributed(_everywhere) keyspaces and tables are up-to-date. Not creating");
|
||||
_started = true;
|
||||
co_return;
|
||||
@@ -147,25 +258,51 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
|
||||
utils::chunked_vector<mutation> mutations;
|
||||
sstring description;
|
||||
|
||||
auto ksm = keyspace_metadata::new_keyspace(
|
||||
auto sd_ksm = keyspace_metadata::new_keyspace(
|
||||
NAME,
|
||||
"org.apache.cassandra.locator.SimpleStrategy",
|
||||
{{"replication_factor", "3"}},
|
||||
std::nullopt, std::nullopt);
|
||||
if (!db.has_keyspace(NAME)) {
|
||||
mutations = service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts);
|
||||
mutations = service::prepare_new_keyspace_announcement(db.real_database(), sd_ksm, ts);
|
||||
description += format(" create {} keyspace;", NAME);
|
||||
} else {
|
||||
dlogger.info("{} keyspace is already present. Not creating", NAME);
|
||||
}
|
||||
|
||||
// Get mutations for creating tables.
|
||||
auto sde_ksm = keyspace_metadata::new_keyspace(
|
||||
NAME_EVERYWHERE,
|
||||
"org.apache.cassandra.locator.EverywhereStrategy",
|
||||
{},
|
||||
std::nullopt, std::nullopt);
|
||||
if (!db.has_keyspace(NAME_EVERYWHERE)) {
|
||||
auto sde_mutations = service::prepare_new_keyspace_announcement(db.real_database(), sde_ksm, ts);
|
||||
std::move(sde_mutations.begin(), sde_mutations.end(), std::back_inserter(mutations));
|
||||
description += format(" create {} keyspace;", NAME_EVERYWHERE);
|
||||
} else {
|
||||
dlogger.info("{} keyspace is already present. Not creating", NAME_EVERYWHERE);
|
||||
}
|
||||
|
||||
// Get mutations for creating and updating tables.
|
||||
auto num_keyspace_mutations = mutations.size();
|
||||
co_await coroutine::parallel_for_each(ensured_tables(),
|
||||
[this, &mutations, db, ts, ksm] (auto&& table) -> future<> {
|
||||
[this, &mutations, db, ts, sd_ksm, sde_ksm, workload_prioritization_enabled] (auto&& table) -> future<> {
|
||||
auto ksm = table->ks_name() == NAME ? sd_ksm : sde_ksm;
|
||||
|
||||
// Ensure that the service_levels table contains new columns.
|
||||
if (table->cf_name() == SERVICE_LEVELS) {
|
||||
table = get_updated_service_levels(db, workload_prioritization_enabled);
|
||||
}
|
||||
|
||||
if (!db.has_schema(table->ks_name(), table->cf_name())) {
|
||||
co_return co_await service::prepare_new_column_family_announcement(mutations, _sp, *ksm, std::move(table), ts);
|
||||
}
|
||||
|
||||
// The service_levels table exists. Update it if it lacks new columns.
|
||||
if (table->cf_name() == SERVICE_LEVELS && !get_current_service_levels(db)->equal_columns(*table)) {
|
||||
auto update_mutations = co_await service::prepare_column_family_update_announcement(_sp, table, std::vector<view_ptr>(), ts);
|
||||
std::move(update_mutations.begin(), update_mutations.end(), std::back_inserter(mutations));
|
||||
}
|
||||
});
|
||||
if (mutations.size() > num_keyspace_mutations) {
|
||||
description += " create and update system_distributed(_everywhere) tables";
|
||||
@@ -187,6 +324,15 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
|
||||
}
|
||||
}
|
||||
|
||||
// On shard 0, and only when the workload_prioritization feature is enabled,
// runs create_tables() with the updated service_levels schema.
// Does nothing on other shards or when the feature is disabled.
future<> system_distributed_keyspace::start_workload_prioritization() {
    const bool on_shard_zero = this_shard_id() == 0;
    if (!on_shard_zero || !_qp.db().features().workload_prioritization) {
        co_return;
    }

    std::vector<schema_ptr> tables;
    tables.push_back(get_updated_service_levels(_qp.db(), true));
    co_await create_tables(std::move(tables));
}
|
||||
|
||||
future<> system_distributed_keyspace::start() {
|
||||
if (this_shard_id() != 0) {
|
||||
_started = true;
|
||||
@@ -229,6 +375,90 @@ static db::consistency_level quorum_if_many(size_t num_token_owners) {
|
||||
return num_token_owners > 1 ? db::consistency_level::QUORUM : db::consistency_level::ONE;
|
||||
}
|
||||
|
||||
// Writes the CDC generation `id`, described by `desc`, into the
// NAME_EVERYWHERE.CDC_GENERATIONS_V2 table.
//
// The generation is split into mutations by cdc::get_cdc_generation_mutations_v2
// (bounded by a size threshold computed below) and the mutations are written
// with CL=ALL and a 60s timeout each, at most `concurrency` at a time.
// `ctx.num_token_owners` is used as the replica count in the size calculation.
future<>
system_distributed_keyspace::insert_cdc_generation(
        utils::UUID id,
        const cdc::topology_description& desc,
        context ctx) {
    using namespace std::chrono_literals;

    const size_t concurrency = 10;
    // Clamp to at least 1: the replica count is used as a divisor below and a
    // zero owner count would otherwise cause an integer division by zero.
    const size_t num_replicas = std::max<size_t>(ctx.num_token_owners, 1);

    // To insert the data quickly and efficiently we send it in batches of multiple rows
    // (each batch represented by a single mutation). We also send multiple such batches concurrently.
    // However, we need to limit the memory consumption of the operation.
    // I assume that the memory consumption grows linearly with the number of replicas
    // (we send to all replicas ``at the same time''), with the batch size (the data must
    // be copied for each replica?) and with concurrency. These assumptions may be too conservative
    // but that won't hurt in a significant way (it may hurt the efficiency of the operation a little).
    // Thus, if we want to limit the memory consumption to L, it should be true that
    // mutation_size * num_replicas * concurrency <= L, hence
    // mutation_size <= L / (num_replicas * concurrency).
    // For example, say L = 10MB, concurrency = 10, num_replicas = 100; we get
    // mutation_size <= 10MB / 1000 = 10KB.
    // On the other hand we must have mutation_size >= size of a single row,
    // so we will use mutation_size <= max(size of single row, L/(num_replicas*concurrency)).

    // It has been tested that sending 1MB batches to 3 replicas with concurrency 20 works OK,
    // which would correspond to L ~= 60MB. Hence that's the limit we use here.
    const size_t L = 60'000'000;
    const auto mutation_size_threshold = std::max(size_t(1), L / (num_replicas * concurrency));

    auto s = _qp.db().real_database().find_schema(
        system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2);
    auto ms = co_await cdc::get_cdc_generation_mutations_v2(s, id, desc, mutation_size_threshold, api::new_timestamp());
    co_await max_concurrent_for_each(ms, concurrency, [&] (mutation& m) -> future<> {
        co_await _sp.mutate(
            { std::move(m) },
            db::consistency_level::ALL,
            db::timeout_clock::now() + 60s,
            nullptr, // trace_state
            empty_service_permit(),
            db::allow_per_partition_rate_limit::no,
            false // raw_counters
        );
    });
}
|
||||
|
||||
// Reads the CDC generation `id` from NAME_EVERYWHERE.CDC_GENERATIONS_V2.
//
// Returns std::nullopt when the query yields no rows. Throws std::runtime_error
// when the number of rows read differs from the value of the `num_ranges`
// column seen in the last row.
future<std::optional<cdc::topology_description>>
system_distributed_keyspace::read_cdc_generation(utils::UUID id) {
    utils::chunked_vector<cdc::token_range_description> ranges;
    size_t expected_ranges = 0;

    // Converts a single result row into a token range description and appends it to `ranges`.
    auto collect_row = [&] (const cql3::untyped_result_set_row& row) {
        std::vector<cdc::stream_id> streams;
        row.get_list_data<bytes>("streams", std::back_inserter(streams));

        auto range_end = dht::token::from_int64(row.get_as<int64_t>("range_end"));
        auto ignore_msb = uint8_t(row.get_as<int8_t>("ignore_msb"));
        ranges.push_back(cdc::token_range_description{range_end, std::move(streams), ignore_msb});

        expected_ranges = row.get_as<int32_t>("num_ranges");
        return make_ready_future<stop_iteration>(stop_iteration::no);
    };

    co_await _qp.query_internal(
            // This should be a local read so 20s should be more than enough
            format("SELECT range_end, streams, ignore_msb, num_ranges FROM {}.{} WHERE id = ? USING TIMEOUT 20s", NAME_EVERYWHERE, CDC_GENERATIONS_V2),
            db::consistency_level::ONE, // we wrote the generation with ALL so ONE must see it (or there's something really wrong)
            { id },
            1000, // for ~1KB rows, ~1MB page size
            std::move(collect_row));

    if (ranges.empty()) {
        co_return std::nullopt;
    }

    // Paranoid sanity check. Partial reads should not happen since generations should be retrieved only after they
    // were written successfully with CL=ALL. But nobody uses EverywhereStrategy tables so they weren't ever properly
    // tested, so just in case...
    const auto rows_read = ranges.size();
    if (rows_read != expected_ranges) {
        throw std::runtime_error(format(
            "read_cdc_generation: wrong number of rows. The `num_ranges` column claimed {} rows,"
            " but reading the partition returned {}.", expected_ranges, rows_read));
    }

    co_return std::optional{cdc::topology_description(std::move(ranges))};
}
|
||||
|
||||
static future<utils::chunked_vector<mutation>> get_cdc_streams_descriptions_v2_mutation(
|
||||
const replica::database& db,
|
||||
db_clock::time_point time,
|
||||
@@ -400,4 +630,65 @@ system_distributed_keyspace::cdc_current_generation_timestamp(context ctx) {
|
||||
co_return timestamp_cql->one().get_as<db_clock::time_point>("time");
|
||||
}
|
||||
|
||||
// Delegates to qos::get_service_levels() for the NAME.SERVICE_LEVELS table,
// using consistency level ONE and the caller-supplied query context.
future<qos::service_levels_info> system_distributed_keyspace::get_service_levels(qos::query_context ctx) const {
    const auto read_cl = db::consistency_level::ONE;
    return qos::get_service_levels(_qp, NAME, SERVICE_LEVELS, read_cl, ctx);
}
|
||||
|
||||
// Delegates to qos::get_service_level() to look up `service_level_name` in the
// NAME.SERVICE_LEVELS table, using consistency level ONE.
future<qos::service_levels_info> system_distributed_keyspace::get_service_level(sstring service_level_name) const {
    const auto read_cl = db::consistency_level::ONE;
    return qos::get_service_level(_qp, NAME, SERVICE_LEVELS, service_level_name, read_cl);
}
|
||||
|
||||
// Stores `slo` for `service_level_name` in the NAME.SERVICE_LEVELS table.
//
// Issues three statements in sequence, all at consistency level ONE and
// without caching the internal statement:
//   1. an INSERT of the service level key,
//   2. an UPDATE of the `timeout` and `workload_type` columns,
//   3. an UPDATE of the `shares` column.
// Unset and delete markers are both written as nulls of the column's type.
future<> system_distributed_keyspace::set_service_level(sstring service_level_name, qos::service_level_options slo) const {
    using slo_options = qos::service_level_options;
    const auto cl = db::consistency_level::ONE;
    const auto no_cache = cql3::query_processor::cache_internal::no;

    static sstring insert_query = format("INSERT INTO {}.{} (service_level) VALUES (?);", NAME, SERVICE_LEVELS);
    co_await _qp.execute_internal(insert_query, cl, internal_distributed_query_state(), {service_level_name}, no_cache);

    // Maps a timeout option onto a CQL duration value (or a null duration for the markers).
    auto timeout_as_value = [&] (const slo_options::timeout_type& timeout) {
        return std::visit(overloaded_functor {
            [&] (const slo_options::unset_marker&) {
                return data_value::make_null(duration_type);
            },
            [&] (const slo_options::delete_marker&) {
                return data_value::make_null(duration_type);
            },
            [&] (const lowres_clock::duration& d) {
                const auto nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(d).count();
                return data_value(cql_duration(months_counter{0},
                                               days_counter{0},
                                               nanoseconds_counter{nanos}));
            },
        }, timeout);
    };

    // Generic counterpart of the above for options holding a plain value of type T.
    auto option_as_value = [&] <typename T> (const std::variant<slo_options::unset_marker, slo_options::delete_marker, T>& option) {
        return std::visit(overloaded_functor {
            [&] (const slo_options::unset_marker&) {
                return data_value::make_null(data_type_for<T>());
            },
            [&] (const slo_options::delete_marker&) {
                return data_value::make_null(data_type_for<T>());
            },
            [&] (const T& value) {
                return data_value(value);
            },
        }, option);
    };

    // An unspecified workload type is stored as a null string.
    const bool workload_unspecified = slo.workload == slo_options::workload_type::unspecified;
    data_value workload = workload_unspecified
            ? data_value::make_null(utf8_type)
            : data_value(slo_options::to_string(slo.workload));

    const sstring timeout_query = format("UPDATE {}.{} SET timeout = ?, workload_type = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS);
    co_await _qp.execute_internal(timeout_query,
            cl,
            internal_distributed_query_state(),
            {timeout_as_value(slo.timeout),
                workload,
                service_level_name},
            no_cache);

    const sstring shares_query = format("UPDATE {}.{} SET shares = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS);
    co_await _qp.execute_internal(shares_query,
            cl,
            internal_distributed_query_state(),
            {option_as_value(slo.shares), service_level_name},
            no_cache);
}
|
||||
|
||||
// Deletes the row for `service_level_name` from the NAME.SERVICE_LEVELS table
// at consistency level ONE, discarding the query result.
future<> system_distributed_keyspace::drop_service_level(sstring service_level_name) const {
    static sstring delete_query = format("DELETE FROM {}.{} WHERE service_level= ?;", NAME, SERVICE_LEVELS);
    auto pending = _qp.execute_internal(
            delete_query,
            db::consistency_level::ONE,
            internal_distributed_query_state(),
            {service_level_name},
            cql3::query_processor::cache_internal::no);
    return std::move(pending).discard_result();
}
|
||||
|
||||
}
|
||||
|
||||
@@ -9,6 +9,9 @@
|
||||
#pragma once
|
||||
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "service/qos/qos_common.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "cdc/generation_id.hh"
|
||||
#include "locator/host_id.hh"
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
@@ -21,6 +24,7 @@ class query_processor;
|
||||
}
|
||||
|
||||
namespace cdc {
|
||||
class stream_id;
|
||||
class topology_description;
|
||||
class streams_version;
|
||||
} // namespace cdc
|
||||
@@ -35,8 +39,17 @@ namespace db {
|
||||
class system_distributed_keyspace {
|
||||
public:
|
||||
static constexpr auto NAME = "system_distributed";
|
||||
static constexpr auto NAME_EVERYWHERE = "system_distributed_everywhere";
|
||||
|
||||
static constexpr auto VIEW_BUILD_STATUS = "view_build_status";
|
||||
static constexpr auto SERVICE_LEVELS = "service_levels";
|
||||
|
||||
/* Nodes use this table to communicate new CDC stream generations to other nodes. */
|
||||
static constexpr auto CDC_TOPOLOGY_DESCRIPTION = "cdc_generation_descriptions";
|
||||
|
||||
/* Nodes use this table to communicate new CDC stream generations to other nodes.
|
||||
* Resides in system_distributed_everywhere. */
|
||||
static constexpr auto CDC_GENERATIONS_V2 = "cdc_generation_descriptions_v2";
|
||||
|
||||
/* This table is used by CDC clients to learn about available CDC streams. */
|
||||
static constexpr auto CDC_DESC_V2 = "cdc_streams_descriptions_v2";
|
||||
@@ -64,14 +77,19 @@ private:
|
||||
|
||||
public:
|
||||
static std::vector<schema_ptr> all_distributed_tables();
|
||||
static std::vector<schema_ptr> all_everywhere_tables();
|
||||
|
||||
system_distributed_keyspace(cql3::query_processor&, service::migration_manager&, service::storage_proxy&);
|
||||
|
||||
future<> start();
|
||||
future<> start_workload_prioritization();
|
||||
future<> stop();
|
||||
|
||||
bool started() const { return _started; }
|
||||
|
||||
future<> insert_cdc_generation(utils::UUID, const cdc::topology_description&, context);
|
||||
future<std::optional<cdc::topology_description>> read_cdc_generation(utils::UUID);
|
||||
|
||||
future<> create_cdc_desc(db_clock::time_point, const cdc::topology_description&, context);
|
||||
future<bool> cdc_desc_exists(db_clock::time_point, context);
|
||||
|
||||
@@ -87,6 +105,11 @@ public:
|
||||
// NOTE: currently used only by alternator
|
||||
future<db_clock::time_point> cdc_current_generation_timestamp(context);
|
||||
|
||||
future<qos::service_levels_info> get_service_levels(qos::query_context ctx) const;
|
||||
future<qos::service_levels_info> get_service_level(sstring service_level_name) const;
|
||||
future<> set_service_level(sstring service_level_name, qos::service_level_options slo) const;
|
||||
future<> drop_service_level(sstring service_level_name) const;
|
||||
|
||||
private:
|
||||
future<> create_tables(std::vector<schema_ptr> tables);
|
||||
};
|
||||
|
||||
@@ -300,7 +300,6 @@ schema_ptr system_keyspace::topology() {
|
||||
.with_column("upgrade_state", utf8_type, column_kind::static_column)
|
||||
.with_column("global_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.with_column("paused_rf_change_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.with_column("ongoing_rf_changes", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.set_comment("Current state of topology change machine")
|
||||
.with_hash_version()
|
||||
.build();
|
||||
@@ -3351,12 +3350,6 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("ongoing_rf_changes")) {
|
||||
for (auto&& v : deserialize_set_column(*topology(), some_row, "ongoing_rf_changes")) {
|
||||
ret.ongoing_rf_changes.insert(value_cast<utils::UUID>(v));
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("enabled_features")) {
|
||||
ret.enabled_features = decode_features(deserialize_set_column(*topology(), some_row, "enabled_features"));
|
||||
}
|
||||
|
||||
@@ -15,10 +15,11 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "db/view/view_build_status.hh"
|
||||
#include "gms/gossiper.hh"
|
||||
#include "gms/inet_address.hh"
|
||||
#include "gms/generation-number.hh"
|
||||
#include "gms/loaded_endpoint_state.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "query/query-result-set.hh"
|
||||
#include "db_clock.hh"
|
||||
#include "mutation_query.hh"
|
||||
#include "system_keyspace_view_types.hh"
|
||||
@@ -36,6 +37,10 @@ namespace netw {
|
||||
class shared_dict;
|
||||
};
|
||||
|
||||
namespace query {
|
||||
class result_set;
|
||||
}
|
||||
|
||||
namespace sstables {
|
||||
struct entry_descriptor;
|
||||
class generation_type;
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
|
||||
#include "db/view/view_update_backlog.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
|
||||
#include <seastar/core/cacheline.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
@@ -42,16 +41,13 @@ class node_update_backlog {
|
||||
std::chrono::milliseconds _interval;
|
||||
std::atomic<clock::time_point> _last_update;
|
||||
std::atomic<update_backlog> _max;
|
||||
utils::updateable_value<uint32_t> _view_flow_control_delay_limit_in_ms;
|
||||
|
||||
public:
|
||||
explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval,
|
||||
utils::updateable_value<uint32_t> view_flow_control_delay_limit_in_ms = utils::updateable_value<uint32_t>(1000))
|
||||
explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval)
|
||||
: _backlogs(shards)
|
||||
, _interval(interval)
|
||||
, _last_update(clock::now() - _interval)
|
||||
, _max(update_backlog::no_backlog())
|
||||
, _view_flow_control_delay_limit_in_ms(std::move(view_flow_control_delay_limit_in_ms)) {
|
||||
, _max(update_backlog::no_backlog()) {
|
||||
if (utils::get_local_injector().enter("update_backlog_immediately")) {
|
||||
_interval = std::chrono::milliseconds(0);
|
||||
_last_update = clock::now();
|
||||
@@ -63,9 +59,6 @@ public:
|
||||
update_backlog fetch_shard(unsigned shard);
|
||||
seastar::future<std::optional<update_backlog>> fetch_if_changed();
|
||||
|
||||
std::chrono::microseconds calculate_throttling_delay(update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout) const;
|
||||
|
||||
// Exposed for testing only.
|
||||
update_backlog load() const {
|
||||
return _max.load(std::memory_order_relaxed);
|
||||
|
||||
@@ -150,14 +150,14 @@ row_locker::unlock(const dht::decorated_key* pk, bool partition_exclusive,
|
||||
auto pli = _two_level_locks.find(*pk);
|
||||
if (pli == _two_level_locks.end()) {
|
||||
// This shouldn't happen... We can't unlock this lock if we can't find it...
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition {}", *pk);
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition", *pk);
|
||||
return;
|
||||
}
|
||||
SCYLLA_ASSERT(&pli->first == pk);
|
||||
if (cpk) {
|
||||
auto rli = pli->second._row_locks.find(*cpk);
|
||||
if (rli == pli->second._row_locks.end()) {
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row {}", *cpk);
|
||||
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row", *cpk);
|
||||
return;
|
||||
}
|
||||
SCYLLA_ASSERT(&rli->first == cpk);
|
||||
|
||||
@@ -29,6 +29,8 @@
|
||||
|
||||
#include "db/config.hh"
|
||||
#include "db/view/base_info.hh"
|
||||
#include "gms/gossiper.hh"
|
||||
#include "query/query-result-set.hh"
|
||||
#include "db/view/view_build_status.hh"
|
||||
#include "db/view/view_consumer.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
@@ -45,7 +47,6 @@
|
||||
#include "db/view/view_builder.hh"
|
||||
#include "db/view/view_updating_consumer.hh"
|
||||
#include "db/view/view_update_generator.hh"
|
||||
#include "db/view/node_view_update_backlog.hh"
|
||||
#include "db/view/regular_column_transformation.hh"
|
||||
#include "db/system_keyspace_view_types.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
@@ -1585,11 +1586,9 @@ future<stop_iteration> view_update_builder::on_results() {
|
||||
|
||||
auto tombstone = std::max(_update_partition_tombstone, _update_current_tombstone);
|
||||
if (tombstone && _existing && !_existing->is_end_of_partition()) {
|
||||
if (_existing->is_range_tombstone_change()) {
|
||||
_existing_current_tombstone = _existing->as_range_tombstone_change().tombstone();
|
||||
} else if (_existing->is_clustering_row()) {
|
||||
// We don't care if it's a range tombstone, as we're only looking for existing entries that get deleted
|
||||
if (_existing->is_clustering_row()) {
|
||||
auto existing = clustering_row(*_schema, _existing->as_clustering_row());
|
||||
existing.apply(std::max(_existing_partition_tombstone, _existing_current_tombstone));
|
||||
auto update = clustering_row(existing.key(), row_tombstone(std::move(tombstone)), row_marker(), ::row());
|
||||
generate_update(std::move(update), { std::move(existing) });
|
||||
} else if (_existing->is_static_row()) {
|
||||
@@ -1600,10 +1599,9 @@ future<stop_iteration> view_update_builder::on_results() {
|
||||
return should_stop_updates() ? stop() : advance_existings();
|
||||
}
|
||||
|
||||
// If we have updates and it's a range tombstone, it removes nothing pre-existing, so we can ignore it
|
||||
if (_update && !_update->is_end_of_partition()) {
|
||||
if (_update->is_range_tombstone_change()) {
|
||||
_update_current_tombstone = _update->as_range_tombstone_change().tombstone();
|
||||
} else if (_update->is_clustering_row()) {
|
||||
if (_update->is_clustering_row()) {
|
||||
_update->mutate_as_clustering_row(*_schema, [&] (clustering_row& cr) mutable {
|
||||
cr.apply(std::max(_update_partition_tombstone, _update_current_tombstone));
|
||||
});
|
||||
@@ -3493,27 +3491,18 @@ future<> delete_ghost_rows_visitor::do_accept_new_row(partition_key pk, clusteri
|
||||
}
|
||||
}
|
||||
|
||||
// View updates are asynchronous, and because of this limiting their concurrency requires
|
||||
// a special approach. The current algorithm places all of the pending view updates in the backlog
|
||||
// and artificially slows down new responses to coordinator requests based on how full the backlog is.
|
||||
// This function calculates how much a request should be slowed down based on the backlog's fullness.
|
||||
// The equation is basically: delay(in seconds) = view_fullness_ratio^3
|
||||
// The more full the backlog gets the more aggressively the requests are slowed down.
|
||||
// The delay is limited to the amount of time left until timeout.
|
||||
// After the timeout the request fails, so there's no point in waiting longer than that.
|
||||
// The second argument defines this timeout point - we can't delay the request more than this time point.
|
||||
// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
|
||||
std::chrono::microseconds node_update_backlog::calculate_throttling_delay(update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout) const {
|
||||
std::chrono::microseconds calculate_view_update_throttling_delay(db::view::update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout,
|
||||
uint32_t view_flow_control_delay_limit_in_ms) {
|
||||
auto adjust = [] (float x) { return x * x * x; };
|
||||
auto budget = std::max(db::timeout_clock::duration(0),
|
||||
timeout - db::timeout_clock::now());
|
||||
std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * _view_flow_control_delay_limit_in_ms() * 1000));
|
||||
auto budget = std::max(service::storage_proxy::clock_type::duration(0),
|
||||
timeout - service::storage_proxy::clock_type::now());
|
||||
std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * view_flow_control_delay_limit_in_ms * 1000));
|
||||
// "budget" has millisecond resolution and can potentially be long
|
||||
// in the future so converting it to microseconds may overflow.
|
||||
// So to compare budget and ret we need to convert both to the lower
|
||||
// resolution.
|
||||
if (std::chrono::duration_cast<db::timeout_clock::duration>(ret) < budget) {
|
||||
if (std::chrono::duration_cast<service::storage_proxy::clock_type::duration>(ret) < budget) {
|
||||
return ret;
|
||||
} else {
|
||||
// budget is small (< ret) so can be converted to microseconds
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/coroutine/parallel_for_each.hh>
|
||||
#include <seastar/core/on_internal_error.hh>
|
||||
#include "gms/gossiper.hh"
|
||||
#include "db/view/view_building_coordinator.hh"
|
||||
#include "db/view/view_build_status.hh"
|
||||
#include "locator/tablets.hh"
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
#include "raft/raft.hh"
|
||||
#include "raft/raft_fwd.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
#include "service/raft/raft_group0.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
|
||||
@@ -21,6 +21,8 @@
|
||||
#include "dht/token.hh"
|
||||
#include "replica/database.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
#include "service/raft/raft_group0.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
@@ -715,7 +717,7 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
|
||||
vbw_logger.info("Building range {} for base table {} and views {} was aborted.", range, base_id, views_ids);
|
||||
} catch (...) {
|
||||
eptr = std::current_exception();
|
||||
vbw_logger.warn("Error during processing range {} for base table {} and views {}: {}", range, base_id, views_ids, eptr);
|
||||
vbw_logger.warn("Error during processing range {} for base table {} and views {}: ", range, base_id, views_ids, eptr);
|
||||
}
|
||||
reader.close().get();
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
#include <flat_set>
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "raft/raft.hh"
|
||||
#include "raft/raft_fwd.hh"
|
||||
#include <seastar/core/gate.hh>
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "sstables/shared_sstable.hh"
|
||||
|
||||
@@ -43,7 +43,7 @@ public:
|
||||
// Returns the number of bytes in the backlog divided by the maximum number of bytes
|
||||
// that the backlog can hold before employing admission control. While the backlog
|
||||
// is below the threshold, the coordinator will slow down the view updates up to
|
||||
// node_update_backlog::calculate_throttling_delay()::delay_limit_us. Above the threshold,
|
||||
// calculate_view_update_throttling_delay()::delay_limit_us. Above the threshold,
|
||||
// the coordinator will reject the writes that would increase the backlog. On the
|
||||
// replica, the writes will start failing only after reaching the hard limit '_max'.
|
||||
float relative_size() const {
|
||||
@@ -70,4 +70,18 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// View updates are asynchronous, and because of this limiting their concurrency requires
|
||||
// a special approach. The current algorithm places all of the pending view updates in the backlog
|
||||
// and artificially slows down new responses to coordinator requests based on how full the backlog is.
|
||||
// This function calculates how much a request should be slowed down based on the backlog's fullness.
|
||||
// The equation is basically: delay(in seconds) = view_fullness_ratio^3
|
||||
// The more full the backlog gets the more aggressively the requests are slowed down.
|
||||
// The delay is limited to the amount of time left until timeout.
|
||||
// After the timeout the request fails, so there's no point in waiting longer than that.
|
||||
// The second argument defines this timeout point - we can't delay the request more than this time point.
|
||||
// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
|
||||
std::chrono::microseconds calculate_view_update_throttling_delay(
|
||||
update_backlog backlog,
|
||||
db::timeout_clock::time_point timeout,
|
||||
uint32_t view_flow_control_delay_limit_in_ms);
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
*/
|
||||
|
||||
#include "db/view/view_update_backlog.hh"
|
||||
#include "db/view/node_view_update_backlog.hh"
|
||||
#include <seastar/core/timed_out_error.hh>
|
||||
#include "gms/inet_address.hh"
|
||||
#include <seastar/util/defer.hh>
|
||||
@@ -96,10 +95,9 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as)
|
||||
view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as)
|
||||
: _db(db)
|
||||
, _proxy(proxy)
|
||||
, _node_update_backlog(node_backlog)
|
||||
, _progress_tracker(std::make_unique<progress_tracker>())
|
||||
, _early_abort_subscription(as.subscribe([this] () noexcept { do_abort(); }))
|
||||
{
|
||||
@@ -114,7 +112,7 @@ future<> view_update_generator::start() {
|
||||
_started = seastar::async([this]() mutable {
|
||||
auto drop_sstable_references = defer([&] () noexcept {
|
||||
// Clear sstable references so sstables_manager::stop() doesn't hang.
|
||||
vug_logger.info("leaving {} unstaged sstables and {} sstables with tables unprocessed",
|
||||
vug_logger.info("leaving {} unstaged sstables unprocessed",
|
||||
_sstables_to_move.size(), _sstables_with_tables.size());
|
||||
_sstables_to_move.clear();
|
||||
_sstables_with_tables.clear();
|
||||
@@ -242,9 +240,6 @@ future<> view_update_generator::process_staging_sstables(lw_shared_ptr<replica::
|
||||
_progress_tracker->on_sstable_registration(sst);
|
||||
}
|
||||
|
||||
utils::get_local_injector().inject("view_update_generator_pause_before_processing",
|
||||
utils::wait_for_message(std::chrono::minutes(5))).get();
|
||||
|
||||
// Generate view updates from staging sstables
|
||||
auto start_time = db_clock::now();
|
||||
auto [result, input_size] = generate_updates_from_staging_sstables(table, sstables);
|
||||
@@ -500,7 +495,7 @@ future<> view_update_generator::generate_and_propagate_view_updates(const replic
|
||||
// the one which limits the number of incoming client requests by delaying the response to the client.
|
||||
if (batch_num > 0) {
|
||||
update_backlog local_backlog = _db.get_view_update_backlog();
|
||||
std::chrono::microseconds throttle_delay = _node_update_backlog.calculate_throttling_delay(local_backlog, timeout);
|
||||
std::chrono::microseconds throttle_delay = calculate_view_update_throttling_delay(local_backlog, timeout, _db.get_config().view_flow_control_delay_limit_in_ms());
|
||||
|
||||
co_await seastar::sleep(throttle_delay);
|
||||
|
||||
|
||||
@@ -52,7 +52,6 @@ using allow_hints = bool_class<allow_hints_tag>;
|
||||
|
||||
namespace db::view {
|
||||
|
||||
class node_update_backlog;
|
||||
class stats;
|
||||
struct wait_for_all_updates_tag {};
|
||||
using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
|
||||
@@ -64,7 +63,6 @@ public:
|
||||
private:
|
||||
replica::database& _db;
|
||||
sharded<service::storage_proxy>& _proxy;
|
||||
node_update_backlog& _node_update_backlog;
|
||||
seastar::abort_source _as;
|
||||
future<> _started = make_ready_future<>();
|
||||
seastar::condition_variable _pending_sstables;
|
||||
@@ -77,7 +75,7 @@ private:
|
||||
optimized_optional<abort_source::subscription> _early_abort_subscription;
|
||||
void do_abort() noexcept;
|
||||
public:
|
||||
view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as);
|
||||
view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as);
|
||||
~view_update_generator();
|
||||
|
||||
future<> start();
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "cdc/metadata.hh"
|
||||
#include "db/config.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "query/query-result-set.hh"
|
||||
#include "db/virtual_table.hh"
|
||||
#include "partition_slice_builder.hh"
|
||||
#include "db/virtual_tables.hh"
|
||||
|
||||
68
dist/CMakeLists.txt
vendored
68
dist/CMakeLists.txt
vendored
@@ -141,72 +141,4 @@ add_dependencies(dist
|
||||
dist-python3
|
||||
dist-server)
|
||||
|
||||
set(dist_rpm_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/rpm")
|
||||
set(dist_deb_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/deb")
|
||||
|
||||
# Map system processor to Debian architecture names
|
||||
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
|
||||
set(deb_arch "amd64")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
||||
set(deb_arch "arm64")
|
||||
else()
|
||||
message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
endif()
|
||||
|
||||
set(rpm_ver "${Scylla_VERSION}-${Scylla_RELEASE}")
|
||||
set(deb_ver "${Scylla_VERSION}-${Scylla_RELEASE}-1")
|
||||
set(rpm_arch "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
|
||||
set(server_rpms_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/redhat/RPMS/${rpm_arch}")
|
||||
set(server_rpms
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-server-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-server-debuginfo-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-conf-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-kernel-conf-${rpm_ver}.${rpm_arch}.rpm"
|
||||
"${server_rpms_dir}/${Scylla_PRODUCT}-node-exporter-${rpm_ver}.${rpm_arch}.rpm")
|
||||
set(cqlsh_rpms
|
||||
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-cqlsh-${rpm_ver}.${rpm_arch}.rpm")
|
||||
set(python3_rpms
|
||||
"${CMAKE_SOURCE_DIR}/tools/python3/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-python3-${rpm_ver}.${rpm_arch}.rpm")
|
||||
|
||||
set(server_debs_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/debian")
|
||||
set(server_debs
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-server_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-server-dbg_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-conf_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-kernel-conf_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/${Scylla_PRODUCT}-node-exporter_${deb_ver}_${deb_arch}.deb"
|
||||
"${server_debs_dir}/scylla-enterprise_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-server_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-conf_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-kernel-conf_${deb_ver}_all.deb"
|
||||
"${server_debs_dir}/scylla-enterprise-node-exporter_${deb_ver}_all.deb")
|
||||
set(cqlsh_debs
|
||||
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/${Scylla_PRODUCT}-cqlsh_${deb_ver}_${deb_arch}.deb"
|
||||
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/scylla-enterprise-cqlsh_${deb_ver}_all.deb")
|
||||
set(python3_debs
|
||||
"${CMAKE_SOURCE_DIR}/tools/python3/build/debian/${Scylla_PRODUCT}-python3_${deb_ver}_${deb_arch}.deb"
|
||||
"${CMAKE_SOURCE_DIR}/tools/python3/build/debian/scylla-enterprise-python3_${deb_ver}_all.deb")
|
||||
|
||||
add_custom_target(collect-dist-rpm
|
||||
COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_rpm_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_rpm_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${server_rpms} ${cqlsh_rpms} ${python3_rpms} ${dist_rpm_dir}/
|
||||
DEPENDS dist
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
|
||||
COMMENT "Collecting RPMs into ${dist_rpm_dir}")
|
||||
|
||||
add_custom_target(collect-dist-deb
|
||||
COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_deb_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_deb_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${server_debs} ${cqlsh_debs} ${python3_debs} ${dist_deb_dir}/
|
||||
DEPENDS dist
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
|
||||
COMMENT "Collecting DEBs into ${dist_deb_dir}")
|
||||
|
||||
add_custom_target(collect-dist
|
||||
DEPENDS collect-dist-rpm collect-dist-deb)
|
||||
|
||||
add_subdirectory(debuginfo)
|
||||
|
||||
@@ -324,13 +324,6 @@ experimental:
|
||||
stream events. Without this option, such no-op operations may still
|
||||
generate spurious stream events.
|
||||
<https://github.com/scylladb/scylladb/issues/28368>
|
||||
* When a stream is disabled, no new records are written but the existing
|
||||
stream data is preserved and remains readable through its original
|
||||
StreamArn. The data expires via TTL after 24 hours. Re-enabling the
|
||||
stream purges the old data immediately and produces a new StreamArn.
|
||||
In contrast, DynamoDB keeps the old stream and its data readable for
|
||||
24 hours through the old StreamArn even after re-enabling.
|
||||
<https://scylladb.atlassian.net/browse/SCYLLADB-1873>
|
||||
|
||||
## Unimplemented API features
|
||||
|
||||
|
||||
@@ -415,7 +415,7 @@ An empty list is allowed, and it's equivalent to numeric replication factor of 0
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE Excelsior
|
||||
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc2' : []};
|
||||
WITH replication = { 'class' : 'NetworkTopologyStrategy', dc2' : []};
|
||||
|
||||
|
||||
Altering from a rack list to a numeric replication factor is not supported.
|
||||
@@ -1017,11 +1017,11 @@ For example:
|
||||
|
||||
CREATE TABLE customer_data (
|
||||
cust_id uuid,
|
||||
"cust_first-name" text,
|
||||
"cust_last-name" text,
|
||||
cust_first-name text,
|
||||
cust_last-name text,
|
||||
cust_phone text,
|
||||
"cust_get-sms" text,
|
||||
PRIMARY KEY (cust_id)
|
||||
cust_get-sms text,
|
||||
PRIMARY KEY (customer_id)
|
||||
) WITH cdc = { 'enabled' : 'true', 'preimage' : 'true' };
|
||||
|
||||
.. _cql-caching-options:
|
||||
|
||||
@@ -24,8 +24,7 @@ For example:
|
||||
|
||||
INSERT INTO NerdMovies (movie, director, main_actor, year)
|
||||
VALUES ('Serenity', 'Joss Whedon', 'Nathan Fillion', 2005)
|
||||
IF NOT EXISTS
|
||||
USING TTL 86400;
|
||||
USING TTL 86400 IF NOT EXISTS;
|
||||
|
||||
The ``INSERT`` statement writes one or more columns for a given row in a table. Note that since a row is identified by
|
||||
its ``PRIMARY KEY``, at least the columns composing it must be specified. The list of columns to insert to must be
|
||||
|
||||
@@ -507,7 +507,7 @@ For example::
|
||||
|
||||
CREATE TABLE superheroes (
|
||||
name frozen<full_name> PRIMARY KEY,
|
||||
home frozen<address>
|
||||
home address
|
||||
);
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -271,7 +271,7 @@ The json structure is as follows:
|
||||
}
|
||||
|
||||
The `manifest` member contains the following attributes:
|
||||
- `version` - representing the version of the manifest itself. It is incremented when members are added or removed from the manifest.
|
||||
- `version` - respresenting the version of the manifest itself. It is incremented when members are added or removed from the manifest.
|
||||
- `scope` - the scope of metadata stored in this manifest file. The following scopes are supported:
|
||||
- `node` - the manifest describes all SSTables owned by this node in this snapshot.
|
||||
|
||||
|
||||
@@ -12,9 +12,7 @@ Schema:
|
||||
CREATE TABLE system_schema.keyspaces (
|
||||
keyspace_name text PRIMARY KEY,
|
||||
durable_writes boolean,
|
||||
replication frozen<map<text, text>>,
|
||||
replication_v2 frozen<map<text, text>>,
|
||||
next_replication frozen<map<text, text>>
|
||||
replication frozen<map<text, text>>
|
||||
)
|
||||
```
|
||||
|
||||
@@ -33,8 +31,6 @@ Columns:
|
||||
stored as a flattened map of the extended options map (see below).
|
||||
|
||||
For `SimpleStrategy` there is a single option `"replication_factor"` specifying the replication factor.
|
||||
* `next_replication` - the target replication factor for the keyspace during rf change.
|
||||
If there is no ongoing rf change, `next_replication` value is not set.
|
||||
|
||||
Extended options map used by NetworkTopologyStrategy is a map where values can be either strings or lists of strings.
|
||||
|
||||
|
||||
@@ -146,25 +146,6 @@ AWS Security Token Service (STS) or the EC2 Instance Metadata Service.
|
||||
- When set, these values are used by the S3 client to sign requests.
|
||||
- If not set, requests are sent unsigned, which may not be accepted by all servers.
|
||||
|
||||
.. _admin-oci-object-storage:
|
||||
|
||||
Using Oracle OCI Object Storage
|
||||
=================================
|
||||
|
||||
Oracle Cloud Infrastructure (OCI) Object Storage is compatible with the Amazon
|
||||
S3 API, so it works with ScyllaDB without additional configuration.
|
||||
|
||||
To use OCI Object Storage, follow the same configuration as for AWS S3, and
|
||||
specify your OCI S3-compatible endpoint.
|
||||
|
||||
Example:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
object_storage_endpoints:
|
||||
- name: https://idedxcgnkfkt.compat.objectstorage.us-ashburn-1.oci.customer-oci.com:443
|
||||
aws_region: us-ashburn-1
|
||||
|
||||
.. _admin-compression:
|
||||
|
||||
Compression
|
||||
|
||||
@@ -231,46 +231,6 @@ Add New DC
|
||||
|
||||
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
|
||||
|
||||
If the keyspace uses rack list replication, update the replication factor in one ``ALTER KEYSPACE`` statement, under the following rules:
|
||||
* Existing datacenters must keep their current replication factor.
|
||||
* A new datacenter can be assigned a replication factor (**0 to N**).
|
||||
* An existing datacenter can be removed (**N to 0**).
|
||||
|
||||
.. warning::
|
||||
|
||||
While adding a new datacenter and altering keyspaces, do **not** perform any reads or writes that involve the new datacenter.
|
||||
In particular, avoid using global consistency levels (such as ``ALL``, ``EACH_QUORUM``) that would include the new datacenter in the operation.
|
||||
Use ``LOCAL_*`` consistency levels (e.g., ``LOCAL_QUORUM``, ``LOCAL_ONE``) until the new datacenter is fully operational.
|
||||
|
||||
Before
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE mykeyspace4;
|
||||
|
||||
CREATE KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
The following is **not** allowed because it changes the replication factor of ``<existing_dc>`` (adds ``<existing_rack4>``) and adds ``<new_dc>`` in the same statement:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>', '<existing_rack4>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
Add all the nodes to the new datacenter and then:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
After
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE mykeyspace4;
|
||||
CREATE KEYSPACE mykeyspace4 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
You can abort the keyspace alteration using :doc:`Task manager </operating-scylla/admin-tools/task-manager>`.
|
||||
|
||||
#. If any vnode keyspace was altered, run ``nodetool rebuild`` on each node in the new datacenter, specifying the existing datacenter name in the rebuild command.
|
||||
|
||||
For example:
|
||||
|
||||
@@ -102,34 +102,6 @@ Procedure
|
||||
|
||||
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
|
||||
|
||||
If the keyspace uses rack list replication, update the replication factor in one ``ALTER KEYSPACE`` statement, under the following rules:
|
||||
* Existing datacenters must keep their current replication factor.
|
||||
* An existing datacenter can be removed (**N to 0**).
|
||||
* A new datacenter can be assigned a replication factor (**0 to N**).
|
||||
|
||||
.. warning::
|
||||
|
||||
While removing a datacenter and altering keyspaces, do **not** perform any reads or writes that involve the datacenter being removed.
|
||||
In particular, avoid using global consistency levels (such as ``ALL``, ``EACH_QUORUM``) that would include the decommissioned datacenter in the operation.
|
||||
Use ``LOCAL_*`` consistency levels (e.g., ``LOCAL_QUORUM``, ``LOCAL_ONE``) until the datacenter is fully decommissioned.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> DESCRIBE nba4
|
||||
cqlsh> CREATE KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4', 'RAC5'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
|
||||
|
||||
The following is **not** allowed because it changes the replication factor of ``EUROPE-DC`` (adds ``RAC9``) and removes ``ASIA-DC`` in the same statement:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> ALTER KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8', 'RAC9']} AND tablets = { 'enabled': true };
|
||||
|
||||
Remove all replicas from the decommissioned datacenter:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> ALTER KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
|
||||
|
||||
.. note::
|
||||
|
||||
If table audit is enabled, the ``audit`` keyspace is automatically created with ``NetworkTopologyStrategy``.
|
||||
@@ -141,10 +113,6 @@ Procedure
|
||||
|
||||
Failure to do so will result in decommission errors such as "zero replica after the removal".
|
||||
|
||||
.. warning::
|
||||
|
||||
Removal of replicas from a datacenter cannot be aborted. To get back to the previous replication, wait until the ALTER KEYSPACE finishes and then add the replicas back by running another ALTER KEYSPACE statement.
|
||||
|
||||
#. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
|
||||
Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ Upgrade ScyllaDB
|
||||
|
||||
.. toctree::
|
||||
|
||||
ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2/index>
|
||||
ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
|
||||
ScyllaDB 2026.x Patch Upgrades <upgrade-guide-from-2026.x.y-to-2026.x.z>
|
||||
ScyllaDB Image <ami-upgrade>
|
||||
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
==========================================================
|
||||
Upgrade - ScyllaDB 2025.x to ScyllaDB 2026.1
|
||||
==========================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2026.1>
|
||||
Metrics Update <metric-update-2025.x-to-2026.1>
|
||||
|
||||
* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1>`
|
||||
* :doc:`Metrics Update Between 2025.x and 2026.1 <metric-update-2025.x-to-2026.1>`
|
||||
@@ -0,0 +1,82 @@
|
||||
.. |SRC_VERSION| replace:: 2025.x
|
||||
.. |NEW_VERSION| replace:: 2026.1
|
||||
.. |PRECEDING_VERSION| replace:: 2025.4
|
||||
|
||||
================================================================
|
||||
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
|
||||
================================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
|
||||
|
||||
|
||||
New Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 150
|
||||
:header-rows: 1
|
||||
|
||||
* - Metric
|
||||
- Description
|
||||
* - scylla_alternator_operation_size_kb
|
||||
- Histogram of item sizes involved in a request.
|
||||
* - scylla_column_family_total_disk_space_before_compression
|
||||
- Hypothetical total disk space used if data files weren't compressed
|
||||
* - scylla_group_name_auto_repair_enabled_nr
|
||||
- Number of tablets with auto repair enabled.
|
||||
* - scylla_group_name_auto_repair_needs_repair_nr
|
||||
- Number of tablets with auto repair enabled that currently need repair.
|
||||
* - scylla_lsa_compact_time_ms
|
||||
- Total time spent on segment compaction that was not accounted under ``reclaim_time_ms``.
|
||||
* - scylla_lsa_evict_time_ms
|
||||
- Total time spent on evicting objects that was not accounted under ``reclaim_time_ms``.
|
||||
* - scylla_lsa_reclaim_time_ms
|
||||
- Total time spent in reclaiming LSA memory back to std allocator.
|
||||
* - scylla_object_storage_memory_usage
|
||||
- Total number of bytes consumed by the object storage client.
|
||||
* - scylla_tablet_ops_failed
|
||||
- Number of failed tablet auto repair attempts.
|
||||
* - scylla_tablet_ops_succeeded
|
||||
- Number of successful tablet auto repair attempts.
|
||||
|
||||
Renamed Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are renamed in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 150
|
||||
:header-rows: 1
|
||||
|
||||
* - Metric Name in |PRECEDING_VERSION|
|
||||
- Metric Name in |NEW_VERSION|
|
||||
* - scylla_s3_memory_usage
|
||||
- scylla_object_storage_memory_usage
|
||||
|
||||
Removed Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are removed in ScyllaDB |NEW_VERSION|.
|
||||
|
||||
* scylla_redis_current_connections
|
||||
* scylla_redis_op_latency
|
||||
* scylla_redis_operation
|
||||
* scylla_redis_operation
|
||||
* scylla_redis_requests_latency
|
||||
* scylla_redis_requests_served
|
||||
* scylla_redis_requests_serving
|
||||
|
||||
New and Updated Metrics in Previous Releases
|
||||
-------------------------------------------------------
|
||||
|
||||
* `Metrics Update Between 2025.3 and 2025.4 <https://docs.scylladb.com/manual/branch-2025.4/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.html>`_
|
||||
* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
|
||||
* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
.. |SCYLLA_NAME| replace:: ScyllaDB
|
||||
|
||||
.. |SRC_VERSION| replace:: 2026.1
|
||||
.. |NEW_VERSION| replace:: 2026.2
|
||||
.. |SRC_VERSION| replace:: 2025.x
|
||||
.. |NEW_VERSION| replace:: 2026.1
|
||||
|
||||
.. |ROLLBACK| replace:: rollback
|
||||
.. _ROLLBACK: ./#rollback-procedure
|
||||
|
||||
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2026.1 to 2026.2
|
||||
.. _SCYLLA_METRICS: ../metric-update-2026.1-to-2026.2
|
||||
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2026.1
|
||||
.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2026.1
|
||||
|
||||
=======================================================================================
|
||||
Upgrade from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|
|
||||
@@ -1,13 +0,0 @@
|
||||
==========================================================
|
||||
Upgrade - ScyllaDB 2026.1 to ScyllaDB 2026.2
|
||||
==========================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
Upgrade ScyllaDB <upgrade-guide-from-2026.1-to-2026.2>
|
||||
Metrics Update <metric-update-2026.1-to-2026.2>
|
||||
|
||||
* :doc:`Upgrade from ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2>`
|
||||
* :doc:`Metrics Update Between 2026.1 and 2026.2 <metric-update-2026.1-to-2026.2>`
|
||||
@@ -1,126 +0,0 @@
|
||||
.. |SRC_VERSION| replace:: 2026.1
|
||||
.. |NEW_VERSION| replace:: 2026.2
|
||||
.. |PRECEDING_VERSION| replace:: 2026.1
|
||||
|
||||
================================================================
|
||||
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
|
||||
================================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
|
||||
|
||||
|
||||
New Metrics in |NEW_VERSION|
|
||||
--------------------------------------
|
||||
|
||||
The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 150
|
||||
:header-rows: 1
|
||||
|
||||
* - Metric
|
||||
- Description
|
||||
* - scylla_auth_cache_permissions
|
||||
- Total number of permission sets currently cached across all roles.
|
||||
* - scylla_auth_cache_roles
|
||||
- Number of roles currently cached.
|
||||
* - scylla_cql_forwarded_requests
|
||||
- Counts the total number of attempts to forward CQL requests to other nodes.
|
||||
One request may be forwarded multiple times, particularly when a write is
|
||||
handled by a non-replica node.
|
||||
* - scylla_cql_write_consistency_levels_disallowed_violations
|
||||
- Counts the number of write_consistency_levels_disallowed guardrail violations,
|
||||
i.e. attempts to write with a forbidden consistency level.
|
||||
* - scylla_cql_write_consistency_levels_warned_violations
|
||||
- Counts the number of write_consistency_levels_warned guardrail violations,
|
||||
i.e. attempts to write with a discouraged consistency level.
|
||||
* - scylla_cql_writes_per_consistency_level
|
||||
- Counts the number of writes for each consistency level.
|
||||
* - scylla_io_queue_integrated_disk_queue_length
|
||||
- Length of the integrated disk queue.
|
||||
* - scylla_io_queue_integrated_queue_length
|
||||
- Length of the integrated queue.
|
||||
* - scylla_logstor_sm_bytes_freed
|
||||
- Counts the number of data bytes freed.
|
||||
* - scylla_logstor_sm_bytes_read
|
||||
- Counts the number of bytes read from the disk.
|
||||
* - scylla_logstor_sm_bytes_written
|
||||
- Counts the number of bytes written to the disk.
|
||||
* - scylla_logstor_sm_compaction_bytes_written
|
||||
- Counts the number of bytes written to the disk by compaction.
|
||||
* - scylla_logstor_sm_compaction_data_bytes_written
|
||||
- Counts the number of data bytes written to the disk by compaction.
|
||||
* - scylla_logstor_sm_compaction_records_rewritten
|
||||
- Counts the number of records rewritten during compaction.
|
||||
* - scylla_logstor_sm_compaction_records_skipped
|
||||
- Counts the number of records skipped during compaction.
|
||||
* - scylla_logstor_sm_compaction_segments_freed
|
||||
- Counts the number of segments freed by compaction.
|
||||
* - scylla_logstor_sm_disk_usage
|
||||
- Total disk usage.
|
||||
* - scylla_logstor_sm_free_segments
|
||||
- Counts the number of free segments currently available.
|
||||
* - scylla_logstor_sm_segment_pool_compaction_segments_get
|
||||
- Counts the number of segments taken from the segment pool for compaction.
|
||||
* - scylla_logstor_sm_segment_pool_normal_segments_get
|
||||
- Counts the number of segments taken from the segment pool for normal writes.
|
||||
* - scylla_logstor_sm_segment_pool_normal_segments_wait
|
||||
- Counts the number of times normal writes had to wait for a segment to become
|
||||
available in the segment pool.
|
||||
* - scylla_logstor_sm_segment_pool_segments_put
|
||||
- Counts the number of segments returned to the segment pool.
|
||||
* - scylla_logstor_sm_segment_pool_separator_segments_get
|
||||
- Counts the number of segments taken from the segment pool for separator writes.
|
||||
* - scylla_logstor_sm_segment_pool_size
|
||||
- Counts the number of segments in the segment pool.
|
||||
* - scylla_logstor_sm_segments_allocated
|
||||
- Counts the number of segments allocated.
|
||||
* - scylla_logstor_sm_segments_compacted
|
||||
- Counts the number of segments compacted.
|
||||
* - scylla_logstor_sm_segments_freed
|
||||
- Counts the number of segments freed.
|
||||
* - scylla_logstor_sm_segments_in_use
|
||||
- Counts the number of segments currently in use.
|
||||
* - scylla_logstor_sm_separator_buffer_flushed
|
||||
- Counts the number of times the separator buffer has been flushed.
|
||||
* - scylla_logstor_sm_separator_bytes_written
|
||||
- Counts the number of bytes written to the separator.
|
||||
* - scylla_logstor_sm_separator_data_bytes_written
|
||||
- Counts the number of data bytes written to the separator.
|
||||
* - scylla_logstor_sm_separator_flow_control_delay
|
||||
- Current delay applied to writes to control separator debt in microseconds.
|
||||
* - scylla_logstor_sm_separator_segments_freed
|
||||
- Counts the number of segments freed by the separator.
|
||||
* - scylla_transport_cql_pending_response_memory
|
||||
- Holds the total memory in bytes consumed by responses waiting to be sent.
|
||||
* - scylla_transport_cql_request_histogram_bytes
|
||||
- A histogram of received bytes in CQL messages of a specific kind and
|
||||
specific scheduling group.
|
||||
* - scylla_transport_cql_requests_serving
|
||||
- Holds the number of requests that are being processed right now.
|
||||
* - scylla_transport_cql_response_histogram_bytes
|
||||
- A histogram of sent bytes in CQL messages of a specific kind and
|
||||
specific scheduling group.
|
||||
* - scylla_transport_requests_forwarded_failed
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
but failed to execute there.
|
||||
* - scylla_transport_requests_forwarded_prepared_not_found
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
but failed there because the statement was not prepared on the target.
|
||||
When this happens, the coordinator performs an additional remote call
|
||||
to prepare the statement on the replica and retries the EXECUTE request
|
||||
afterwards.
|
||||
* - scylla_transport_requests_forwarded_redirected
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
but that replica responded with a redirect to another node. This can
|
||||
happen when replica has stale information about the cluster topology or
|
||||
when the request is handled by a node that is not a replica for the data
|
||||
being accessed by the request.
|
||||
* - scylla_transport_requests_forwarded_successfully
|
||||
- Counts the number of requests that were forwarded to another replica
|
||||
and executed successfully there.
|
||||
|
||||
@@ -598,7 +598,7 @@ future<int> kmip_host::impl::do_cmd(KMIP_CMD* cmd, con_ptr cp, Func& f, bool ret
|
||||
|
||||
template<typename Func>
|
||||
future<kmip_host::impl::kmip_cmd> kmip_host::impl::do_cmd(kmip_cmd cmd_in, Func && f) {
|
||||
kmip_log.trace("{}: begin do_cmd {}", *this, cmd_in);
|
||||
kmip_log.trace("{}: begin do_cmd", *this, cmd_in);
|
||||
KMIP_CMD* cmd = cmd_in;
|
||||
|
||||
// #998 Need to do retry loop, because we can have either timed out connection,
|
||||
|
||||
@@ -616,7 +616,7 @@ future<rjson::value> encryption::kms_host::impl::do_post(std::string_view target
|
||||
static auto get_xml_node = [](node_type* node, const char* what) {
|
||||
auto res = node->first_node(what);
|
||||
if (!res) {
|
||||
throw malformed_response_error(fmt::format("XML parse error: {}", what));
|
||||
throw malformed_response_error(fmt::format("XML parse error", what));
|
||||
}
|
||||
return res;
|
||||
};
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <seastar/core/seastar.hh>
|
||||
#include <seastar/core/smp.hh>
|
||||
#include "db/schema_features.hh"
|
||||
#include "utils/log.hh"
|
||||
#include "gms/feature.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
@@ -109,7 +108,6 @@ std::set<std::string_view> feature_service::supported_feature_set() const {
|
||||
"UUID_SSTABLE_IDENTIFIERS"sv,
|
||||
"GROUP0_SCHEMA_VERSIONING"sv,
|
||||
"VIEW_BUILD_STATUS_ON_GROUP0"sv,
|
||||
"CDC_GENERATIONS_V2"sv,
|
||||
};
|
||||
|
||||
if (is_test_only_feature_deprecated()) {
|
||||
@@ -181,7 +179,6 @@ db::schema_features feature_service::cluster_schema_features() const {
|
||||
f.set<db::schema_feature::GROUP0_SCHEMA_VERSIONING>();
|
||||
f.set_if<db::schema_feature::IN_MEMORY_TABLES>(bool(in_memory_tables));
|
||||
f.set_if<db::schema_feature::TABLET_OPTIONS>(bool(tablet_options));
|
||||
f.set_if<db::schema_feature::KEYSPACE_MULTI_RF_CHANGE>(bool(keyspace_multi_rf_change));
|
||||
return f;
|
||||
}
|
||||
|
||||
|
||||
@@ -83,6 +83,7 @@ public:
|
||||
gms::feature alternator_ttl { *this, "ALTERNATOR_TTL"sv };
|
||||
gms::feature cql_row_ttl { *this, "CQL_ROW_TTL"sv };
|
||||
gms::feature range_scan_data_variant { *this, "RANGE_SCAN_DATA_VARIANT"sv };
|
||||
gms::feature cdc_generations_v2 { *this, "CDC_GENERATIONS_V2"sv };
|
||||
gms::feature user_defined_aggregates { *this, "UDA"sv };
|
||||
// Historically max_result_size contained only two fields: soft_limit and
|
||||
// hard_limit. It was somehow obscure because for normal paged queries both
|
||||
@@ -181,7 +182,6 @@ public:
|
||||
gms::feature writetime_ttl_individual_element { *this, "WRITETIME_TTL_INDIVIDUAL_ELEMENT"sv };
|
||||
gms::feature arbitrary_tablet_boundaries { *this, "ARBITRARY_TABLET_BOUNDARIES"sv };
|
||||
gms::feature large_data_virtual_tables { *this, "LARGE_DATA_VIRTUAL_TABLES"sv };
|
||||
gms::feature keyspace_multi_rf_change { *this, "KEYSPACE_MULTI_RF_CHANGE"sv };
|
||||
public:
|
||||
|
||||
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
|
||||
|
||||
@@ -399,10 +399,9 @@ future<> gossiper::do_send_ack2_msg(locator::host_id from, utils::chunked_vector
|
||||
}
|
||||
}
|
||||
gms::gossip_digest_ack2 ack2_msg(std::move(delta_ep_state_map));
|
||||
auto ack2_msg_str = fmt::format("{}", ack2_msg);
|
||||
logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
|
||||
logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
|
||||
co_await ser::gossip_rpc_verbs::send_gossip_digest_ack2(&_messaging, from, std::move(ack2_msg));
|
||||
logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
|
||||
logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
|
||||
}
|
||||
|
||||
// Depends on
|
||||
@@ -965,7 +964,8 @@ future<> gossiper::failure_detector_loop_for_node(locator::host_id host_id, gene
|
||||
diff = now - last;
|
||||
if (!failed) {
|
||||
last = now;
|
||||
} else if (diff > max_duration) {
|
||||
}
|
||||
if (diff > max_duration) {
|
||||
logger.info("failure_detector_loop: Mark node {}/{} as DOWN", host_id, node);
|
||||
co_await container().invoke_on(0, [host_id] (gms::gossiper& g) {
|
||||
return g.convict(host_id);
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "locator/token_metadata.hh"
|
||||
#include "locator/types.hh"
|
||||
#include "gms/gossip_address_map.hh"
|
||||
#include "gms/loaded_endpoint_state.hh"
|
||||
|
||||
namespace gms {
|
||||
|
||||
@@ -71,11 +72,6 @@ struct gossip_config {
|
||||
utils::updateable_value<utils::UUID> recovery_leader;
|
||||
};
|
||||
|
||||
struct loaded_endpoint_state {
|
||||
gms::inet_address endpoint;
|
||||
std::optional<locator::endpoint_dc_rack> opt_dc_rack;
|
||||
};
|
||||
|
||||
/**
|
||||
* This module is responsible for Gossiping information for the local endpoint. This abstraction
|
||||
* maintains the list of live and dead endpoints. Periodically i.e. every 1 second this module
|
||||
|
||||
23
gms/loaded_endpoint_state.hh
Normal file
23
gms/loaded_endpoint_state.hh
Normal file
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <optional>
|
||||
|
||||
#include "gms/inet_address.hh"
|
||||
#include "locator/types.hh"
|
||||
|
||||
namespace gms {
|
||||
|
||||
struct loaded_endpoint_state {
|
||||
inet_address endpoint;
|
||||
std::optional<locator::endpoint_dc_rack> opt_dc_rack;
|
||||
};
|
||||
|
||||
} // namespace gms
|
||||
@@ -11,7 +11,7 @@
|
||||
#include "query/query_id.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "tasks/types.hh"
|
||||
#include "service/session.hh"
|
||||
#include "service/session_id.hh"
|
||||
|
||||
namespace utils {
|
||||
class UUID final {
|
||||
@@ -43,4 +43,3 @@ class host_id final {
|
||||
};
|
||||
|
||||
} // namespace locator
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#include "utils/UUID_gen.hh"
|
||||
#include "types/types.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
#include "utils/rjson.hh"
|
||||
#include <ranges>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
3
init.cc
3
init.cc
@@ -87,6 +87,9 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
|
||||
}
|
||||
}
|
||||
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::ALTERNATOR_STREAMS)) {
|
||||
disabled.insert("ALTERNATOR_STREAMS"s);
|
||||
}
|
||||
if (!cfg.check_experimental(db::experimental_features_t::feature::KEYSPACE_STORAGE_OPTIONS)) {
|
||||
disabled.insert("KEYSPACE_STORAGE_OPTIONS"s);
|
||||
}
|
||||
|
||||
@@ -284,14 +284,3 @@ future<> instance_cache::stop() {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
template <>
|
||||
struct equal_to<seastar::scheduling_group> {
|
||||
bool operator()(seastar::scheduling_group& sg1, seastar::scheduling_group& sg2) const noexcept {
|
||||
return sg1 == sg2;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user