mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-28 12:17:02 +00:00
Compare commits
15 Commits
fix-invali
...
ykaul/comp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
339f1ae1a0 | ||
|
|
07d69aa8fa | ||
|
|
c50bfb995b | ||
|
|
e7dbccbdcd | ||
|
|
faa2f8ba76 | ||
|
|
7aca42aa31 | ||
|
|
92e0597807 | ||
|
|
0798c112d0 | ||
|
|
9650390482 | ||
|
|
a1e8ef8d6e | ||
|
|
ea00cfad3d | ||
|
|
0fd89d77b3 | ||
|
|
361a717d89 | ||
|
|
9df4fc3e2f | ||
|
|
d1b4fd5683 |
4
.github/CODEOWNERS
vendored
4
.github/CODEOWNERS
vendored
@@ -32,8 +32,8 @@ counters* @nuivall
|
||||
tests/counter_test* @nuivall
|
||||
|
||||
# DOCS
|
||||
/docs/ @annastuchlik @tzach
|
||||
/docs/alternator/ @annastuchlik @tzach @nyh
|
||||
docs/* @annastuchlik @tzach
|
||||
docs/alternator @annastuchlik @tzach @nyh
|
||||
|
||||
# GOSSIP
|
||||
gms/* @tgrabiec @asias @kbr-scylla
|
||||
|
||||
@@ -234,11 +234,15 @@ generate_scylla_version()
|
||||
|
||||
option(Scylla_USE_PRECOMPILED_HEADER "Use precompiled header for Scylla" ON)
|
||||
add_library(scylla-precompiled-header STATIC exported_templates.cc)
|
||||
target_include_directories(scylla-precompiled-header PRIVATE
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}"
|
||||
"${scylla_gen_build_dir}")
|
||||
target_link_libraries(scylla-precompiled-header PRIVATE
|
||||
absl::headers
|
||||
absl::btree
|
||||
absl::hash
|
||||
absl::raw_hash_set
|
||||
idl
|
||||
Seastar::seastar
|
||||
Snappy::snappy
|
||||
systemd
|
||||
|
||||
@@ -78,7 +78,7 @@ fi
|
||||
|
||||
# Default scylla product/version tags
|
||||
PRODUCT=scylla
|
||||
VERSION=2026.3.0-dev
|
||||
VERSION=2026.2.0-dev
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -1892,7 +1892,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
}
|
||||
if (vector_index_updates->Size() > 1) {
|
||||
// VectorIndexUpdates mirrors GlobalSecondaryIndexUpdates.
|
||||
// Since DynamoDB artificially limits the latter to just a
|
||||
// Since DynamoDB artifically limits the latter to just a
|
||||
// single operation (one Create or one Delete), we also
|
||||
// place the same artificial limit on VectorIndexUpdates,
|
||||
// and throw the same LimitExceeded error if the client
|
||||
|
||||
@@ -1354,7 +1354,7 @@ static future<executor::request_return_type> query_vector(
|
||||
std::unordered_set<std::string> used_attribute_values;
|
||||
// Parse the Select parameter and determine which attributes to return.
|
||||
// For a vector index, the default Select is ALL_ATTRIBUTES (full items).
|
||||
// ALL_PROJECTED_ATTRIBUTES is significantly more efficient because it
|
||||
// ALL_PROJECTED_ATTRIBUTES is significantly more efficent because it
|
||||
// returns what the vector store returned without looking up additional
|
||||
// base-table data. Currently only the primary key attributes are projected
|
||||
// but in the future we'll implement projecting additional attributes into
|
||||
|
||||
@@ -167,8 +167,46 @@ static schema_ptr get_schema_from_arn(service::storage_proxy& proxy, const strea
|
||||
}
|
||||
}
|
||||
|
||||
// ShardId. Must be between 28 and 65 characters inclusive.
|
||||
// UUID is 36 bytes as string (including dashes).
|
||||
// Prepend a version/type marker (`S`) -> 37
|
||||
class stream_shard_id : public utils::UUID {
|
||||
public:
|
||||
using UUID = utils::UUID;
|
||||
static constexpr char marker = 'S';
|
||||
|
||||
stream_shard_id() = default;
|
||||
stream_shard_id(const UUID& uuid)
|
||||
: UUID(uuid)
|
||||
{}
|
||||
stream_shard_id(const table_id& tid)
|
||||
: UUID(tid.uuid())
|
||||
{}
|
||||
stream_shard_id(std::string_view v)
|
||||
: UUID(v.substr(1))
|
||||
{
|
||||
if (v[0] != marker) {
|
||||
throw std::invalid_argument(std::string(v));
|
||||
}
|
||||
}
|
||||
friend std::ostream& operator<<(std::ostream& os, const stream_shard_id& arn) {
|
||||
const UUID& uuid = arn;
|
||||
return os << marker << uuid;
|
||||
}
|
||||
friend std::istream& operator>>(std::istream& is, stream_shard_id& arn) {
|
||||
std::string s;
|
||||
is >> s;
|
||||
arn = stream_shard_id(s);
|
||||
return is;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace alternator
|
||||
|
||||
template<typename ValueType>
|
||||
struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_shard_id>
|
||||
: public from_string_helper<ValueType, alternator::stream_shard_id>
|
||||
{};
|
||||
template<typename ValueType>
|
||||
struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_arn>
|
||||
: public from_string_helper<ValueType, alternator::stream_arn>
|
||||
@@ -180,8 +218,7 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
|
||||
_stats.api_operations.list_streams++;
|
||||
|
||||
auto limit = rjson::get_opt<int>(request, "Limit").value_or(100);
|
||||
auto streams_start = rjson::get_opt<stream_arn>(request, "ExclusiveStartStreamArn");
|
||||
|
||||
auto streams_start = rjson::get_opt<stream_shard_id>(request, "ExclusiveStartStreamArn");
|
||||
auto table = find_table(_proxy, request);
|
||||
auto db = _proxy.data_dictionary();
|
||||
|
||||
@@ -207,34 +244,34 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
|
||||
cfs = db.get_tables();
|
||||
}
|
||||
|
||||
// We need to sort the tables to ensure a stable order for paging.
|
||||
// We sort by keyspace and table name, which will also allow us to skip to
|
||||
// the right position by ExclusiveStartStreamArn.
|
||||
auto cmp = [](std::string_view ks1, std::string_view cf1, std::string_view ks2, std::string_view cf2) {
|
||||
return ks1 == ks2 ? cf1 < cf2 : ks1 < ks2;
|
||||
};
|
||||
// # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
|
||||
// generate duplicates in a paged listing here. Can obviously miss things if they
|
||||
// are added between paged calls and end up with a "smaller" UUID/ARN, but that
|
||||
// is to be expected.
|
||||
if (std::cmp_less(limit, cfs.size()) || streams_start) {
|
||||
std::sort(cfs.begin(), cfs.end(),
|
||||
[&cmp](const data_dictionary::table& t1, const data_dictionary::table& t2) {
|
||||
return cmp(t1.schema()->ks_name(), t1.schema()->cf_name(),
|
||||
t2.schema()->ks_name(), t2.schema()->cf_name());
|
||||
});
|
||||
std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
|
||||
return t1.schema()->id().uuid() < t2.schema()->id().uuid();
|
||||
});
|
||||
}
|
||||
|
||||
auto i = cfs.begin();
|
||||
auto e = cfs.end();
|
||||
|
||||
if (streams_start) {
|
||||
i = std::upper_bound(i, e, *streams_start,
|
||||
[&cmp](const stream_arn& arn, const data_dictionary::table& t) {
|
||||
return cmp(arn.keyspace_name(), arn.table_name(),
|
||||
t.schema()->ks_name(), t.schema()->cf_name());
|
||||
});
|
||||
i = std::find_if(i, e, [&](const data_dictionary::table& t) {
|
||||
return t.schema()->id().uuid() == streams_start
|
||||
&& cdc::get_base_table(db.real_database(), *t.schema())
|
||||
&& is_alternator_keyspace(t.schema()->ks_name())
|
||||
;
|
||||
});
|
||||
if (i != e) {
|
||||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
auto ret = rjson::empty_object();
|
||||
auto streams = rjson::empty_array();
|
||||
std::optional<stream_arn> last;
|
||||
std::optional<stream_shard_id> last;
|
||||
|
||||
for (;limit > 0 && i != e; ++i) {
|
||||
auto s = i->schema();
|
||||
@@ -245,24 +282,19 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
|
||||
}
|
||||
if (cdc::is_log_for_some_table(db.real_database(), ks_name, cf_name)) {
|
||||
rjson::value new_entry = rjson::empty_object();
|
||||
|
||||
last = i->schema()->id();
|
||||
auto arn = stream_arn{ i->schema(), cdc::get_base_table(db.real_database(), *i->schema()) };
|
||||
rjson::add(new_entry, "StreamArn", arn);
|
||||
rjson::add(new_entry, "StreamLabel", rjson::from_string(stream_label(*s)));
|
||||
rjson::add(new_entry, "TableName", rjson::from_string(cdc::base_name(s->cf_name())));
|
||||
rjson::push_back(streams, std::move(new_entry));
|
||||
last = std::move(arn);
|
||||
--limit;
|
||||
}
|
||||
}
|
||||
|
||||
rjson::add(ret, "Streams", std::move(streams));
|
||||
|
||||
// Only emit LastEvaluatedStreamArn when we stopped because we hit the
|
||||
// limit (limit == 0), meaning there may be more streams to list.
|
||||
// If we exhausted all tables naturally (limit > 0), there are no more
|
||||
// streams, so we must not emit a cookie.
|
||||
if (last && limit == 0) {
|
||||
if (last) {
|
||||
rjson::add(ret, "LastEvaluatedStreamArn", *last);
|
||||
}
|
||||
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
|
||||
@@ -582,7 +614,7 @@ void stream_id_range::prepare_for_iterating()
|
||||
// the function returns `stream_id_range` that will allow iteration over children Streams shards for the Streams shard `parent`
|
||||
// a child Streams shard is defined as a Streams shard that touches token range that was previously covered by `parent` Streams shard
|
||||
// Streams shard contains a token, that represents end of the token range for that Streams shard (inclusive)
|
||||
// beginning of the token range is defined by previous Streams shard's token + 1
|
||||
// begginning of the token range is defined by previous Streams shard's token + 1
|
||||
// NOTE: With vnodes, ranges of Streams' shards wrap, while with tablets the biggest allowed token number is always a range end.
|
||||
// NOTE: both streams generation are guaranteed to cover whole range and be non-empty
|
||||
// NOTE: it's possible to get more than one stream shard with the same token value (thus some of those stream shards will be empty) -
|
||||
|
||||
@@ -856,9 +856,7 @@ rest_exclude_node(sharded<service::storage_service>& ss, std::unique_ptr<http::r
|
||||
}
|
||||
|
||||
apilog.info("exclude_node: hosts={}", hosts);
|
||||
co_await ss.local().run_with_no_api_lock([hosts = std::move(hosts)] (service::storage_service& ss) {
|
||||
return ss.mark_excluded(hosts);
|
||||
});
|
||||
co_await ss.local().mark_excluded(hosts);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1733,9 +1731,7 @@ rest_create_vnode_tablet_migration(http_context& ctx, sharded<service::storage_s
|
||||
throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
|
||||
}
|
||||
auto keyspace = validate_keyspace(ctx, req);
|
||||
co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
|
||||
return ss.prepare_for_tablets_migration(keyspace);
|
||||
});
|
||||
co_await ss.local().prepare_for_tablets_migration(keyspace);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1747,9 +1743,7 @@ rest_get_vnode_tablet_migration(http_context& ctx, sharded<service::storage_serv
|
||||
throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
|
||||
}
|
||||
auto keyspace = validate_keyspace(ctx, req);
|
||||
auto status = co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
|
||||
return ss.get_tablets_migration_status_with_node_details(keyspace);
|
||||
});
|
||||
auto status = co_await ss.local().get_tablets_migration_status_with_node_details(keyspace);
|
||||
|
||||
ss::vnode_tablet_migration_status result;
|
||||
result.keyspace = status.keyspace;
|
||||
@@ -1774,9 +1768,7 @@ rest_set_vnode_tablet_migration_node_storage_mode(http_context& ctx, sharded<ser
|
||||
}
|
||||
auto mode_str = req->get_query_param("intended_mode");
|
||||
auto mode = service::intended_storage_mode_from_string(mode_str);
|
||||
co_await ss.local().run_with_no_api_lock([mode] (service::storage_service& ss) {
|
||||
return ss.set_node_intended_storage_mode(mode);
|
||||
});
|
||||
co_await ss.local().set_node_intended_storage_mode(mode);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1790,9 +1782,7 @@ rest_finalize_vnode_tablet_migration(http_context& ctx, sharded<service::storage
|
||||
auto keyspace = validate_keyspace(ctx, req);
|
||||
validate_keyspace(ctx, keyspace);
|
||||
|
||||
co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
|
||||
return ss.finalize_tablets_migration(keyspace);
|
||||
});
|
||||
co_await ss.local().finalize_tablets_migration(keyspace);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1869,106 +1859,90 @@ rest_bind(FuncType func, BindArgs&... args) {
|
||||
return std::bind_front(func, std::ref(args)...);
|
||||
}
|
||||
|
||||
// Hold the storage_service async gate for the duration of async REST
|
||||
// handlers so stop() drains in-flight requests before teardown.
|
||||
// Synchronous handlers don't yield and need no gate.
|
||||
static seastar::httpd::future_json_function
|
||||
gated(sharded<service::storage_service>& ss, seastar::httpd::future_json_function fn) {
|
||||
return [fn = std::move(fn), &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
|
||||
auto holder = ss.local().hold_async_gate();
|
||||
co_return co_await fn(std::move(req));
|
||||
};
|
||||
}
|
||||
|
||||
static seastar::httpd::json_request_function
|
||||
gated(sharded<service::storage_service>&, seastar::httpd::json_request_function fn) {
|
||||
return fn;
|
||||
}
|
||||
|
||||
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
|
||||
ss::get_token_endpoint.set(r, gated(ss, rest_bind(rest_get_token_endpoint, ctx, ss)));
|
||||
ss::get_release_version.set(r, gated(ss, rest_bind(rest_get_release_version, ss)));
|
||||
ss::get_scylla_release_version.set(r, gated(ss, rest_bind(rest_get_scylla_release_version, ss)));
|
||||
ss::get_schema_version.set(r, gated(ss, rest_bind(rest_get_schema_version, ss)));
|
||||
ss::get_range_to_endpoint_map.set(r, gated(ss, rest_bind(rest_get_range_to_endpoint_map, ctx, ss)));
|
||||
ss::get_pending_range_to_endpoint_map.set(r, gated(ss, rest_bind(rest_get_pending_range_to_endpoint_map, ctx)));
|
||||
ss::describe_ring.set(r, gated(ss, rest_bind(rest_describe_ring, ctx, ss)));
|
||||
ss::get_current_generation_number.set(r, gated(ss, rest_bind(rest_get_current_generation_number, ss)));
|
||||
ss::get_natural_endpoints.set(r, gated(ss, rest_bind(rest_get_natural_endpoints, ctx, ss)));
|
||||
ss::get_natural_endpoints_v2.set(r, gated(ss, rest_bind(rest_get_natural_endpoints_v2, ctx, ss)));
|
||||
ss::cdc_streams_check_and_repair.set(r, gated(ss, rest_bind(rest_cdc_streams_check_and_repair, ss)));
|
||||
ss::cleanup_all.set(r, gated(ss, rest_bind(rest_cleanup_all, ctx, ss)));
|
||||
ss::reset_cleanup_needed.set(r, gated(ss, rest_bind(rest_reset_cleanup_needed, ctx, ss)));
|
||||
ss::force_flush.set(r, gated(ss, rest_bind(rest_force_flush, ctx)));
|
||||
ss::force_keyspace_flush.set(r, gated(ss, rest_bind(rest_force_keyspace_flush, ctx)));
|
||||
ss::decommission.set(r, gated(ss, rest_bind(rest_decommission, ss, ssc)));
|
||||
ss::logstor_compaction.set(r, gated(ss, rest_bind(rest_logstor_compaction, ctx)));
|
||||
ss::logstor_flush.set(r, gated(ss, rest_bind(rest_logstor_flush, ctx)));
|
||||
ss::move.set(r, gated(ss, rest_bind(rest_move, ss)));
|
||||
ss::remove_node.set(r, gated(ss, rest_bind(rest_remove_node, ss)));
|
||||
ss::exclude_node.set(r, gated(ss, rest_bind(rest_exclude_node, ss)));
|
||||
ss::get_removal_status.set(r, gated(ss, rest_bind(rest_get_removal_status, ss)));
|
||||
ss::force_remove_completion.set(r, gated(ss, rest_bind(rest_force_remove_completion, ss)));
|
||||
ss::set_logging_level.set(r, gated(ss, rest_bind(rest_set_logging_level)));
|
||||
ss::get_logging_levels.set(r, gated(ss, rest_bind(rest_get_logging_levels)));
|
||||
ss::get_operation_mode.set(r, gated(ss, rest_bind(rest_get_operation_mode, ss)));
|
||||
ss::is_starting.set(r, gated(ss, rest_bind(rest_is_starting, ss)));
|
||||
ss::get_drain_progress.set(r, gated(ss, rest_bind(rest_get_drain_progress, ss)));
|
||||
ss::drain.set(r, gated(ss, rest_bind(rest_drain, ss)));
|
||||
ss::stop_gossiping.set(r, gated(ss, rest_bind(rest_stop_gossiping, ss)));
|
||||
ss::start_gossiping.set(r, gated(ss, rest_bind(rest_start_gossiping, ss)));
|
||||
ss::is_gossip_running.set(r, gated(ss, rest_bind(rest_is_gossip_running, ss)));
|
||||
ss::stop_daemon.set(r, gated(ss, rest_bind(rest_stop_daemon)));
|
||||
ss::is_initialized.set(r, gated(ss, rest_bind(rest_is_initialized, ss)));
|
||||
ss::join_ring.set(r, gated(ss, rest_bind(rest_join_ring)));
|
||||
ss::is_joined.set(r, gated(ss, rest_bind(rest_is_joined, ss)));
|
||||
ss::is_incremental_backups_enabled.set(r, gated(ss, rest_bind(rest_is_incremental_backups_enabled, ctx)));
|
||||
ss::set_incremental_backups_enabled.set(r, gated(ss, rest_bind(rest_set_incremental_backups_enabled, ctx)));
|
||||
ss::rebuild.set(r, gated(ss, rest_bind(rest_rebuild, ss)));
|
||||
ss::bulk_load.set(r, gated(ss, rest_bind(rest_bulk_load)));
|
||||
ss::bulk_load_async.set(r, gated(ss, rest_bind(rest_bulk_load_async)));
|
||||
ss::reschedule_failed_deletions.set(r, gated(ss, rest_bind(rest_reschedule_failed_deletions)));
|
||||
ss::sample_key_range.set(r, gated(ss, rest_bind(rest_sample_key_range)));
|
||||
ss::reset_local_schema.set(r, gated(ss, rest_bind(rest_reset_local_schema, ss)));
|
||||
ss::set_trace_probability.set(r, gated(ss, rest_bind(rest_set_trace_probability)));
|
||||
ss::get_trace_probability.set(r, gated(ss, rest_bind(rest_get_trace_probability)));
|
||||
ss::get_slow_query_info.set(r, gated(ss, rest_bind(rest_get_slow_query_info)));
|
||||
ss::set_slow_query.set(r, gated(ss, rest_bind(rest_set_slow_query)));
|
||||
ss::deliver_hints.set(r, gated(ss, rest_bind(rest_deliver_hints)));
|
||||
ss::get_cluster_name.set(r, gated(ss, rest_bind(rest_get_cluster_name, ss)));
|
||||
ss::get_partitioner_name.set(r, gated(ss, rest_bind(rest_get_partitioner_name, ss)));
|
||||
ss::get_tombstone_warn_threshold.set(r, gated(ss, rest_bind(rest_get_tombstone_warn_threshold)));
|
||||
ss::set_tombstone_warn_threshold.set(r, gated(ss, rest_bind(rest_set_tombstone_warn_threshold)));
|
||||
ss::get_tombstone_failure_threshold.set(r, gated(ss, rest_bind(rest_get_tombstone_failure_threshold)));
|
||||
ss::set_tombstone_failure_threshold.set(r, gated(ss, rest_bind(rest_set_tombstone_failure_threshold)));
|
||||
ss::get_batch_size_failure_threshold.set(r, gated(ss, rest_bind(rest_get_batch_size_failure_threshold)));
|
||||
ss::set_batch_size_failure_threshold.set(r, gated(ss, rest_bind(rest_set_batch_size_failure_threshold)));
|
||||
ss::set_hinted_handoff_throttle_in_kb.set(r, gated(ss, rest_bind(rest_set_hinted_handoff_throttle_in_kb)));
|
||||
ss::get_exceptions.set(r, gated(ss, rest_bind(rest_get_exceptions, ss)));
|
||||
ss::get_total_hints_in_progress.set(r, gated(ss, rest_bind(rest_get_total_hints_in_progress)));
|
||||
ss::get_total_hints.set(r, gated(ss, rest_bind(rest_get_total_hints)));
|
||||
ss::get_ownership.set(r, gated(ss, rest_bind(rest_get_ownership, ctx, ss)));
|
||||
ss::get_effective_ownership.set(r, gated(ss, rest_bind(rest_get_effective_ownership, ctx, ss)));
|
||||
ss::retrain_dict.set(r, gated(ss, rest_bind(rest_retrain_dict, ctx, ss, group0_client)));
|
||||
ss::estimate_compression_ratios.set(r, gated(ss, rest_bind(rest_estimate_compression_ratios, ctx, ss)));
|
||||
ss::sstable_info.set(r, gated(ss, rest_bind(rest_sstable_info, ctx)));
|
||||
ss::logstor_info.set(r, gated(ss, rest_bind(rest_logstor_info, ctx)));
|
||||
ss::reload_raft_topology_state.set(r, gated(ss, rest_bind(rest_reload_raft_topology_state, ss, group0_client)));
|
||||
ss::upgrade_to_raft_topology.set(r, gated(ss, rest_bind(rest_upgrade_to_raft_topology, ss)));
|
||||
ss::raft_topology_upgrade_status.set(r, gated(ss, rest_bind(rest_raft_topology_upgrade_status, ss)));
|
||||
ss::raft_topology_get_cmd_status.set(r, gated(ss, rest_bind(rest_raft_topology_get_cmd_status, ss)));
|
||||
ss::move_tablet.set(r, gated(ss, rest_bind(rest_move_tablet, ctx, ss)));
|
||||
ss::add_tablet_replica.set(r, gated(ss, rest_bind(rest_add_tablet_replica, ctx, ss)));
|
||||
ss::del_tablet_replica.set(r, gated(ss, rest_bind(rest_del_tablet_replica, ctx, ss)));
|
||||
ss::repair_tablet.set(r, gated(ss, rest_bind(rest_repair_tablet, ctx, ss)));
|
||||
ss::tablet_balancing_enable.set(r, gated(ss, rest_bind(rest_tablet_balancing_enable, ss)));
|
||||
ss::create_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_create_vnode_tablet_migration, ctx, ss)));
|
||||
ss::get_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_get_vnode_tablet_migration, ctx, ss)));
|
||||
ss::set_vnode_tablet_migration_node_storage_mode.set(r, gated(ss, rest_bind(rest_set_vnode_tablet_migration_node_storage_mode, ctx, ss)));
|
||||
ss::finalize_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_finalize_vnode_tablet_migration, ctx, ss)));
|
||||
ss::quiesce_topology.set(r, gated(ss, rest_bind(rest_quiesce_topology, ss)));
|
||||
sp::get_schema_versions.set(r, gated(ss, rest_bind(rest_get_schema_versions, ss)));
|
||||
ss::drop_quarantined_sstables.set(r, gated(ss, rest_bind(rest_drop_quarantined_sstables, ctx, ss)));
|
||||
ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
|
||||
ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
|
||||
ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
|
||||
ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
|
||||
ss::get_range_to_endpoint_map.set(r, rest_bind(rest_get_range_to_endpoint_map, ctx, ss));
|
||||
ss::get_pending_range_to_endpoint_map.set(r, rest_bind(rest_get_pending_range_to_endpoint_map, ctx));
|
||||
ss::describe_ring.set(r, rest_bind(rest_describe_ring, ctx, ss));
|
||||
ss::get_current_generation_number.set(r, rest_bind(rest_get_current_generation_number, ss));
|
||||
ss::get_natural_endpoints.set(r, rest_bind(rest_get_natural_endpoints, ctx, ss));
|
||||
ss::get_natural_endpoints_v2.set(r, rest_bind(rest_get_natural_endpoints_v2, ctx, ss));
|
||||
ss::cdc_streams_check_and_repair.set(r, rest_bind(rest_cdc_streams_check_and_repair, ss));
|
||||
ss::cleanup_all.set(r, rest_bind(rest_cleanup_all, ctx, ss));
|
||||
ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
|
||||
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
|
||||
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
|
||||
ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
|
||||
ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
|
||||
ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
|
||||
ss::move.set(r, rest_bind(rest_move, ss));
|
||||
ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
|
||||
ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
|
||||
ss::get_removal_status.set(r, rest_bind(rest_get_removal_status, ss));
|
||||
ss::force_remove_completion.set(r, rest_bind(rest_force_remove_completion, ss));
|
||||
ss::set_logging_level.set(r, rest_bind(rest_set_logging_level));
|
||||
ss::get_logging_levels.set(r, rest_bind(rest_get_logging_levels));
|
||||
ss::get_operation_mode.set(r, rest_bind(rest_get_operation_mode, ss));
|
||||
ss::is_starting.set(r, rest_bind(rest_is_starting, ss));
|
||||
ss::get_drain_progress.set(r, rest_bind(rest_get_drain_progress, ss));
|
||||
ss::drain.set(r, rest_bind(rest_drain, ss));
|
||||
ss::stop_gossiping.set(r, rest_bind(rest_stop_gossiping, ss));
|
||||
ss::start_gossiping.set(r, rest_bind(rest_start_gossiping, ss));
|
||||
ss::is_gossip_running.set(r, rest_bind(rest_is_gossip_running, ss));
|
||||
ss::stop_daemon.set(r, rest_bind(rest_stop_daemon));
|
||||
ss::is_initialized.set(r, rest_bind(rest_is_initialized, ss));
|
||||
ss::join_ring.set(r, rest_bind(rest_join_ring));
|
||||
ss::is_joined.set(r, rest_bind(rest_is_joined, ss));
|
||||
ss::is_incremental_backups_enabled.set(r, rest_bind(rest_is_incremental_backups_enabled, ctx));
|
||||
ss::set_incremental_backups_enabled.set(r, rest_bind(rest_set_incremental_backups_enabled, ctx));
|
||||
ss::rebuild.set(r, rest_bind(rest_rebuild, ss));
|
||||
ss::bulk_load.set(r, rest_bind(rest_bulk_load));
|
||||
ss::bulk_load_async.set(r, rest_bind(rest_bulk_load_async));
|
||||
ss::reschedule_failed_deletions.set(r, rest_bind(rest_reschedule_failed_deletions));
|
||||
ss::sample_key_range.set(r, rest_bind(rest_sample_key_range));
|
||||
ss::reset_local_schema.set(r, rest_bind(rest_reset_local_schema, ss));
|
||||
ss::set_trace_probability.set(r, rest_bind(rest_set_trace_probability));
|
||||
ss::get_trace_probability.set(r, rest_bind(rest_get_trace_probability));
|
||||
ss::get_slow_query_info.set(r, rest_bind(rest_get_slow_query_info));
|
||||
ss::set_slow_query.set(r, rest_bind(rest_set_slow_query));
|
||||
ss::deliver_hints.set(r, rest_bind(rest_deliver_hints));
|
||||
ss::get_cluster_name.set(r, rest_bind(rest_get_cluster_name, ss));
|
||||
ss::get_partitioner_name.set(r, rest_bind(rest_get_partitioner_name, ss));
|
||||
ss::get_tombstone_warn_threshold.set(r, rest_bind(rest_get_tombstone_warn_threshold));
|
||||
ss::set_tombstone_warn_threshold.set(r, rest_bind(rest_set_tombstone_warn_threshold));
|
||||
ss::get_tombstone_failure_threshold.set(r, rest_bind(rest_get_tombstone_failure_threshold));
|
||||
ss::set_tombstone_failure_threshold.set(r, rest_bind(rest_set_tombstone_failure_threshold));
|
||||
ss::get_batch_size_failure_threshold.set(r, rest_bind(rest_get_batch_size_failure_threshold));
|
||||
ss::set_batch_size_failure_threshold.set(r, rest_bind(rest_set_batch_size_failure_threshold));
|
||||
ss::set_hinted_handoff_throttle_in_kb.set(r, rest_bind(rest_set_hinted_handoff_throttle_in_kb));
|
||||
ss::get_exceptions.set(r, rest_bind(rest_get_exceptions, ss));
|
||||
ss::get_total_hints_in_progress.set(r, rest_bind(rest_get_total_hints_in_progress));
|
||||
ss::get_total_hints.set(r, rest_bind(rest_get_total_hints));
|
||||
ss::get_ownership.set(r, rest_bind(rest_get_ownership, ctx, ss));
|
||||
ss::get_effective_ownership.set(r, rest_bind(rest_get_effective_ownership, ctx, ss));
|
||||
ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
|
||||
ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
|
||||
ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
|
||||
ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
|
||||
ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
|
||||
ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
|
||||
ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
|
||||
ss::raft_topology_get_cmd_status.set(r, rest_bind(rest_raft_topology_get_cmd_status, ss));
|
||||
ss::move_tablet.set(r, rest_bind(rest_move_tablet, ctx, ss));
|
||||
ss::add_tablet_replica.set(r, rest_bind(rest_add_tablet_replica, ctx, ss));
|
||||
ss::del_tablet_replica.set(r, rest_bind(rest_del_tablet_replica, ctx, ss));
|
||||
ss::repair_tablet.set(r, rest_bind(rest_repair_tablet, ctx, ss));
|
||||
ss::tablet_balancing_enable.set(r, rest_bind(rest_tablet_balancing_enable, ss));
|
||||
ss::create_vnode_tablet_migration.set(r, rest_bind(rest_create_vnode_tablet_migration, ctx, ss));
|
||||
ss::get_vnode_tablet_migration.set(r, rest_bind(rest_get_vnode_tablet_migration, ctx, ss));
|
||||
ss::set_vnode_tablet_migration_node_storage_mode.set(r, rest_bind(rest_set_vnode_tablet_migration_node_storage_mode, ctx, ss));
|
||||
ss::finalize_vnode_tablet_migration.set(r, rest_bind(rest_finalize_vnode_tablet_migration, ctx, ss));
|
||||
ss::quiesce_topology.set(r, rest_bind(rest_quiesce_topology, ss));
|
||||
sp::get_schema_versions.set(r, rest_bind(rest_get_schema_versions, ss));
|
||||
ss::drop_quarantined_sstables.set(r, rest_bind(rest_drop_quarantined_sstables, ctx, ss));
|
||||
}
|
||||
|
||||
void unset_storage_service(http_context& ctx, routes& r) {
|
||||
|
||||
@@ -113,8 +113,8 @@ static category_set parse_audit_categories(const sstring& data) {
|
||||
return result;
|
||||
}
|
||||
|
||||
static audit::audited_tables_t parse_audit_tables(const sstring& data) {
|
||||
audit::audited_tables_t result;
|
||||
static std::map<sstring, std::set<sstring>> parse_audit_tables(const sstring& data) {
|
||||
std::map<sstring, std::set<sstring>> result;
|
||||
if (!data.empty()) {
|
||||
std::vector<sstring> tokens;
|
||||
boost::split(tokens, data, boost::is_any_of(","));
|
||||
@@ -139,8 +139,8 @@ static audit::audited_tables_t parse_audit_tables(const sstring& data) {
|
||||
return result;
|
||||
}
|
||||
|
||||
static audit::audited_keyspaces_t parse_audit_keyspaces(const sstring& data) {
|
||||
audit::audited_keyspaces_t result;
|
||||
static std::set<sstring> parse_audit_keyspaces(const sstring& data) {
|
||||
std::set<sstring> result;
|
||||
if (!data.empty()) {
|
||||
std::vector<sstring> tokens;
|
||||
boost::split(tokens, data, boost::is_any_of(","));
|
||||
@@ -156,8 +156,8 @@ audit::audit(locator::shared_token_metadata& token_metadata,
|
||||
cql3::query_processor& qp,
|
||||
service::migration_manager& mm,
|
||||
std::set<sstring>&& audit_modes,
|
||||
audited_keyspaces_t&& audited_keyspaces,
|
||||
audited_tables_t&& audited_tables,
|
||||
std::set<sstring>&& audited_keyspaces,
|
||||
std::map<sstring, std::set<sstring>>&& audited_tables,
|
||||
category_set&& audited_categories,
|
||||
const db::config& cfg)
|
||||
: _token_metadata(token_metadata)
|
||||
@@ -165,8 +165,8 @@ audit::audit(locator::shared_token_metadata& token_metadata,
|
||||
, _audited_tables(std::move(audited_tables))
|
||||
, _audited_categories(std::move(audited_categories))
|
||||
, _cfg(cfg)
|
||||
, _cfg_keyspaces_observer(cfg.audit_keyspaces.observe([this] (sstring const& new_value){ update_config<audited_keyspaces_t>(new_value, parse_audit_keyspaces, _audited_keyspaces); }))
|
||||
, _cfg_tables_observer(cfg.audit_tables.observe([this] (sstring const& new_value){ update_config<audited_tables_t>(new_value, parse_audit_tables, _audited_tables); }))
|
||||
, _cfg_keyspaces_observer(cfg.audit_keyspaces.observe([this] (sstring const& new_value){ update_config<std::set<sstring>>(new_value, parse_audit_keyspaces, _audited_keyspaces); }))
|
||||
, _cfg_tables_observer(cfg.audit_tables.observe([this] (sstring const& new_value){ update_config<std::map<sstring, std::set<sstring>>>(new_value, parse_audit_tables, _audited_tables); }))
|
||||
, _cfg_categories_observer(cfg.audit_categories.observe([this] (sstring const& new_value){ update_config<category_set>(new_value, parse_audit_categories, _audited_categories); }))
|
||||
{
|
||||
_storage_helper_ptr = create_storage_helper(std::move(audit_modes), qp, mm);
|
||||
@@ -181,8 +181,8 @@ future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token
|
||||
return make_ready_future<>();
|
||||
}
|
||||
category_set audited_categories = parse_audit_categories(cfg.audit_categories());
|
||||
audit::audited_tables_t audited_tables = parse_audit_tables(cfg.audit_tables());
|
||||
audit::audited_keyspaces_t audited_keyspaces = parse_audit_keyspaces(cfg.audit_keyspaces());
|
||||
std::map<sstring, std::set<sstring>> audited_tables = parse_audit_tables(cfg.audit_tables());
|
||||
std::set<sstring> audited_keyspaces = parse_audit_keyspaces(cfg.audit_keyspaces());
|
||||
|
||||
logger.info("Audit is enabled. Auditing to: \"{}\", with the following categories: \"{}\", keyspaces: \"{}\", and tables: \"{}\"",
|
||||
cfg.audit(), cfg.audit_categories(), cfg.audit_keyspaces(), cfg.audit_tables());
|
||||
@@ -304,7 +304,7 @@ future<> inspect_login(const sstring& username, socket_address client_ip, bool e
|
||||
return audit::local_audit_instance().log_login(username, client_ip, error);
|
||||
}
|
||||
|
||||
bool audit::should_log_table(std::string_view keyspace, std::string_view name) const {
|
||||
bool audit::should_log_table(const sstring& keyspace, const sstring& name) const {
|
||||
auto keyspace_it = _audited_tables.find(keyspace);
|
||||
return keyspace_it != _audited_tables.cend() && keyspace_it->second.find(name) != keyspace_it->second.cend();
|
||||
}
|
||||
@@ -319,8 +319,8 @@ bool audit::will_log(statement_category cat, std::string_view keyspace, std::str
|
||||
// so it is logged whenever the category matches.
|
||||
return _audited_categories.contains(cat)
|
||||
&& (keyspace.empty()
|
||||
|| _audited_keyspaces.find(keyspace) != _audited_keyspaces.cend()
|
||||
|| should_log_table(keyspace, table)
|
||||
|| _audited_keyspaces.find(sstring(keyspace)) != _audited_keyspaces.cend()
|
||||
|| should_log_table(sstring(keyspace), sstring(table))
|
||||
|| cat == statement_category::AUTH
|
||||
|| cat == statement_category::ADMIN
|
||||
|| cat == statement_category::DCL);
|
||||
|
||||
@@ -129,15 +129,10 @@ public:
|
||||
class storage_helper;
|
||||
|
||||
class audit final : public seastar::async_sharded_service<audit> {
|
||||
public:
|
||||
// Transparent comparator (std::less<>) enables heterogeneous lookup with
|
||||
// string_view keys.
|
||||
using audited_keyspaces_t = std::set<sstring, std::less<>>;
|
||||
using audited_tables_t = std::map<sstring, std::set<sstring, std::less<>>, std::less<>>;
|
||||
private:
|
||||
locator::shared_token_metadata& _token_metadata;
|
||||
audited_keyspaces_t _audited_keyspaces;
|
||||
audited_tables_t _audited_tables;
|
||||
std::set<sstring> _audited_keyspaces;
|
||||
// Maps keyspace name to set of table names in that keyspace
|
||||
std::map<sstring, std::set<sstring>> _audited_tables;
|
||||
category_set _audited_categories;
|
||||
|
||||
std::unique_ptr<storage_helper> _storage_helper_ptr;
|
||||
@@ -150,7 +145,7 @@ private:
|
||||
template<class T>
|
||||
void update_config(const sstring & new_value, std::function<T(const sstring&)> parse_func, T& cfg_parameter);
|
||||
|
||||
bool should_log_table(std::string_view keyspace, std::string_view name) const;
|
||||
bool should_log_table(const sstring& keyspace, const sstring& name) const;
|
||||
public:
|
||||
static seastar::sharded<audit>& audit_instance() {
|
||||
// FIXME: leaked intentionally to avoid shutdown problems, see #293
|
||||
@@ -169,8 +164,8 @@ public:
|
||||
cql3::query_processor& qp,
|
||||
service::migration_manager& mm,
|
||||
std::set<sstring>&& audit_modes,
|
||||
audited_keyspaces_t&& audited_keyspaces,
|
||||
audited_tables_t&& audited_tables,
|
||||
std::set<sstring>&& audited_keyspaces,
|
||||
std::map<sstring, std::set<sstring>>&& audited_tables,
|
||||
category_set&& audited_categories,
|
||||
const db::config& cfg);
|
||||
~audit();
|
||||
|
||||
@@ -1625,7 +1625,7 @@ struct process_change_visitor {
|
||||
if (_enable_updating_state) {
|
||||
if (_request_options.alternator && _alternator_schema_has_no_clustering_key && _clustering_row_states.empty()) {
|
||||
// Alternator's table can be with or without clustering key. If the clustering key exists,
|
||||
// delete request will be `clustered_row_delete` and will be handled there.
|
||||
// delete request will be `clustered_row_delete` and will be hanlded there.
|
||||
// If the clustering key doesn't exist, delete request will be `partition_delete` and will be handled here.
|
||||
// The no-clustering-key case is slightly tricky, because insert of such item is handled by `clustered_row_cells`
|
||||
// and has some value as clustering_key (the value currently seems to be empty bytes object).
|
||||
@@ -1933,7 +1933,7 @@ public:
|
||||
if (_options.alternator && !_alternator_clustering_keys_to_ignore.empty()) {
|
||||
// we filter mutations for Alternator's changes here.
|
||||
// We do it per mutation object (user might submit a batch of those in one go
|
||||
// and some might be split because of different timestamps),
|
||||
// and some might be splitted because of different timestamps),
|
||||
// ignore key set is cleared afterwards.
|
||||
// If single mutation object contains two separate changes to the same row
|
||||
// and at least one of them is ignored, all of them will be ignored.
|
||||
|
||||
@@ -240,7 +240,7 @@ static max_purgeable get_max_purgeable_timestamp(const compaction_group_view& ta
|
||||
// and if the memtable also contains the key we're calculating max purgeable timestamp for.
|
||||
// First condition helps to not penalize the common scenario where memtable only contains
|
||||
// newer data.
|
||||
if (!table_s.skip_memtable_for_tombstone_gc() && memtable_min_timestamp <= compacting_max_timestamp && table_s.memtable_has_key(dk)) {
|
||||
if (memtable_min_timestamp <= compacting_max_timestamp && table_s.memtable_has_key(dk)) {
|
||||
timestamp = memtable_min_timestamp;
|
||||
source = max_purgeable::timestamp_source::memtable_possibly_shadowing_data;
|
||||
}
|
||||
|
||||
@@ -39,9 +39,6 @@ public:
|
||||
virtual future<lw_shared_ptr<const sstables::sstable_set>> main_sstable_set() const = 0;
|
||||
virtual future<lw_shared_ptr<const sstables::sstable_set>> maintenance_sstable_set() const = 0;
|
||||
virtual lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc() const = 0;
|
||||
// Returns true when tombstone GC considers only the repaired sstable set, meaning the
|
||||
// memtable does not need to be consulted (its data is always newer than any GC-eligible tombstone).
|
||||
virtual bool skip_memtable_for_tombstone_gc() const noexcept = 0;
|
||||
virtual std::unordered_set<sstables::shared_sstable> fully_expired_sstables(const std::vector<sstables::shared_sstable>& sstables, gc_clock::time_point compaction_time) const = 0;
|
||||
virtual const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept = 0;
|
||||
virtual compaction_strategy& get_compaction_strategy() const noexcept = 0;
|
||||
|
||||
@@ -406,11 +406,7 @@ commitlog_total_space_in_mb: -1
|
||||
# In short, `ms` needs more CPU during sstable writes,
|
||||
# but should behave better during reads,
|
||||
# although it might behave worse for very long clustering keys.
|
||||
#
|
||||
# `ms` sstable format works even better with `column_index_size_in_kb` set to 1,
|
||||
# so keep those two settings in sync (either both set, or both unset).
|
||||
sstable_format: ms
|
||||
column_index_size_in_kb: 1
|
||||
|
||||
# Auto-scaling of the promoted index prevents running out of memory
|
||||
# when the promoted index grows too large (due to partitions with many rows
|
||||
|
||||
19
configure.py
19
configure.py
@@ -2769,6 +2769,25 @@ def write_build_file(f,
|
||||
f.write('build {}: rust_source {}\n'.format(cc, src))
|
||||
obj = cc.replace('.cc', '.o')
|
||||
compiles[obj] = cc
|
||||
# Sources shared between scylla (compiled with PCH) and small tests
|
||||
# (with custom deps and partial link sets) must not use the PCH,
|
||||
# because -fpch-instantiate-templates injects symbol references that
|
||||
# the small test link sets cannot satisfy.
|
||||
small_test_srcs = set()
|
||||
for test_binary, test_deps in deps.items():
|
||||
if not test_binary.startswith('test/'):
|
||||
continue
|
||||
# Only exclude PCH for tests with truly small/partial link sets.
|
||||
# Tests that include scylla_core or similar large dep sets link
|
||||
# against enough objects to satisfy PCH-injected symbol refs.
|
||||
if len(test_deps) > 50:
|
||||
continue
|
||||
for src in test_deps:
|
||||
if src.endswith('.cc'):
|
||||
small_test_srcs.add(src)
|
||||
for src in small_test_srcs:
|
||||
obj = '$builddir/' + mode + '/' + src.replace('.cc', '.o')
|
||||
compiles_with_pch.discard(obj)
|
||||
for obj in compiles:
|
||||
src = compiles[obj]
|
||||
seastar_dep = f'$builddir/{mode}/seastar/libseastar.{seastar_lib_ext}'
|
||||
|
||||
84
cql3/prepared_cache_key_type.hh
Normal file
84
cql3/prepared_cache_key_type.hh
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (C) 2017-present ScyllaDB
|
||||
*
|
||||
* Modified by ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.1 and Apache-2.0)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "bytes.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "cql3/dialect.hh"
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
typedef bytes cql_prepared_id_type;
|
||||
|
||||
/// \brief The key of the prepared statements cache
|
||||
///
|
||||
/// TODO: consolidate prepared_cache_key_type and the nested cache_key_type
|
||||
/// the latter was introduced for unifying the CQL and Thrift prepared
|
||||
/// statements so that they can be stored in the same cache.
|
||||
class prepared_cache_key_type {
|
||||
public:
|
||||
// derive from cql_prepared_id_type so we can customize the formatter of
|
||||
// cache_key_type
|
||||
struct cache_key_type : public cql_prepared_id_type {
|
||||
cache_key_type(cql_prepared_id_type&& id, cql3::dialect d) : cql_prepared_id_type(std::move(id)), dialect(d) {}
|
||||
cql3::dialect dialect; // Not part of hash, but we don't expect collisions because of that
|
||||
bool operator==(const cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
private:
|
||||
cache_key_type _key;
|
||||
|
||||
public:
|
||||
explicit prepared_cache_key_type(cql_prepared_id_type cql_id, dialect d) : _key(std::move(cql_id), d) {}
|
||||
|
||||
cache_key_type& key() { return _key; }
|
||||
const cache_key_type& key() const { return _key; }
|
||||
|
||||
static const cql_prepared_id_type& cql_id(const prepared_cache_key_type& key) {
|
||||
return key.key();
|
||||
}
|
||||
|
||||
bool operator==(const prepared_cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type::cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type::cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k.key());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// for prepared_statements_cache log printouts
|
||||
template <> struct fmt::formatter<cql3::prepared_cache_key_type::cache_key_type> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const cql3::prepared_cache_key_type::cache_key_type& p, fmt::format_context& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "{{cql_id: {}, dialect: {}}}", static_cast<const cql3::cql_prepared_id_type&>(p), p.dialect);
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct fmt::formatter<cql3::prepared_cache_key_type> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const cql3::prepared_cache_key_type& p, fmt::format_context& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "{}", p.key());
|
||||
}
|
||||
};
|
||||
@@ -12,6 +12,7 @@
|
||||
|
||||
#include "utils/loading_cache.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "cql3/prepared_cache_key_type.hh"
|
||||
#include "cql3/statements/prepared_statement.hh"
|
||||
#include "cql3/column_specification.hh"
|
||||
#include "cql3/dialect.hh"
|
||||
@@ -27,39 +28,6 @@ struct prepared_cache_entry_size {
|
||||
}
|
||||
};
|
||||
|
||||
typedef bytes cql_prepared_id_type;
|
||||
|
||||
/// \brief The key of the prepared statements cache
|
||||
///
|
||||
/// TODO: consolidate prepared_cache_key_type and the nested cache_key_type
|
||||
/// the latter was introduced for unifying the CQL and Thrift prepared
|
||||
/// statements so that they can be stored in the same cache.
|
||||
class prepared_cache_key_type {
|
||||
public:
|
||||
// derive from cql_prepared_id_type so we can customize the formatter of
|
||||
// cache_key_type
|
||||
struct cache_key_type : public cql_prepared_id_type {
|
||||
cache_key_type(cql_prepared_id_type&& id, cql3::dialect d) : cql_prepared_id_type(std::move(id)), dialect(d) {}
|
||||
cql3::dialect dialect; // Not part of hash, but we don't expect collisions because of that
|
||||
bool operator==(const cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
private:
|
||||
cache_key_type _key;
|
||||
|
||||
public:
|
||||
explicit prepared_cache_key_type(cql_prepared_id_type cql_id, dialect d) : _key(std::move(cql_id), d) {}
|
||||
|
||||
cache_key_type& key() { return _key; }
|
||||
const cache_key_type& key() const { return _key; }
|
||||
|
||||
static const cql_prepared_id_type& cql_id(const prepared_cache_key_type& key) {
|
||||
return key.key();
|
||||
}
|
||||
|
||||
bool operator==(const prepared_cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
class prepared_statements_cache {
|
||||
public:
|
||||
struct stats {
|
||||
@@ -164,35 +132,3 @@ public:
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type::cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type::cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k.key());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// for prepared_statements_cache log printouts
|
||||
template <> struct fmt::formatter<cql3::prepared_cache_key_type::cache_key_type> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const cql3::prepared_cache_key_type::cache_key_type& p, fmt::format_context& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "{{cql_id: {}, dialect: {}}}", static_cast<const cql3::cql_prepared_id_type&>(p), p.dialect);
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct fmt::formatter<cql3::prepared_cache_key_type> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const cql3::prepared_cache_key_type& p, fmt::format_context& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "{}", p.key());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -17,6 +17,9 @@
|
||||
#include <seastar/coroutine/as_future.hh>
|
||||
#include <seastar/coroutine/try_future.hh>
|
||||
|
||||
#include "cql3/prepared_statements_cache.hh"
|
||||
#include "cql3/authorized_prepared_statements_cache.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/mapreduce_service.hh"
|
||||
@@ -77,7 +80,7 @@ static service::query_state query_state_for_internal_call() {
|
||||
return {service::client_state::for_internal_calls(), empty_service_permit()};
|
||||
}
|
||||
|
||||
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm)
|
||||
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, const utils::loading_cache_config& auth_prep_cache_cfg, lang::manager& langm)
|
||||
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
|
||||
, _proxy(proxy)
|
||||
, _db(db)
|
||||
@@ -86,7 +89,7 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
|
||||
, _mcfg(mcfg)
|
||||
, _cql_config(cql_cfg)
|
||||
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
|
||||
, _authorized_prepared_cache(std::move(auth_prep_cache_cfg), authorized_prepared_statements_cache_log)
|
||||
, _authorized_prepared_cache(auth_prep_cache_cfg, authorized_prepared_statements_cache_log)
|
||||
, _auth_prepared_cache_cfg_cb([this] (uint32_t) { (void) _authorized_prepared_cache_config_action.trigger_later(); })
|
||||
, _authorized_prepared_cache_config_action([this] { update_authorized_prepared_cache_config(); return make_ready_future<>(); })
|
||||
, _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
|
||||
@@ -1074,7 +1077,7 @@ query_processor::execute_batch_without_checking_exception_message(
|
||||
::shared_ptr<statements::batch_statement> batch,
|
||||
service::query_state& query_state,
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
|
||||
std::unordered_map<prepared_cache_key_type, statements::prepared_statement::checked_weak_ptr> pending_authorization_entries) {
|
||||
auto access_future = co_await coroutine::as_future(batch->check_access(*this, query_state.get_client_state()));
|
||||
bool failed = access_future.failed();
|
||||
co_await audit::inspect(batch, query_state, options, failed);
|
||||
|
||||
@@ -22,13 +22,14 @@
|
||||
#include "cql3/statements/prepared_statement.hh"
|
||||
#include "cql3/cql_statement.hh"
|
||||
#include "cql3/dialect.hh"
|
||||
#include "cql3/query_options.hh"
|
||||
#include "cql3/stats.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "service/migration_listener.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "service/client_state.hh"
|
||||
#include "service/broadcast_tables/experimental/query_result.hh"
|
||||
#include "vector_search/vector_store_client.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/observable.hh"
|
||||
#include "utils/rolling_max_tracker.hh"
|
||||
@@ -41,6 +42,9 @@
|
||||
|
||||
|
||||
namespace lang { class manager; }
|
||||
namespace vector_search {
|
||||
class vector_store_client;
|
||||
}
|
||||
namespace service {
|
||||
class migration_manager;
|
||||
class query_state;
|
||||
@@ -58,6 +62,9 @@ struct query;
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
class prepared_statements_cache;
|
||||
class authorized_prepared_statements_cache;
|
||||
|
||||
namespace statements {
|
||||
class batch_statement;
|
||||
class schema_altering_statement;
|
||||
@@ -184,7 +191,7 @@ public:
|
||||
static std::vector<std::unique_ptr<statements::raw::parsed_statement>> parse_statements(std::string_view queries, dialect d);
|
||||
|
||||
query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc,
|
||||
memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm);
|
||||
memory_config mcfg, cql_config& cql_cfg, const utils::loading_cache_config& auth_prep_cache_cfg, lang::manager& langm);
|
||||
|
||||
~query_processor();
|
||||
|
||||
@@ -474,7 +481,7 @@ public:
|
||||
::shared_ptr<statements::batch_statement> stmt,
|
||||
service::query_state& query_state,
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
|
||||
std::unordered_map<prepared_cache_key_type, statements::prepared_statement::checked_weak_ptr> pending_authorization_entries) {
|
||||
return execute_batch_without_checking_exception_message(
|
||||
std::move(stmt),
|
||||
query_state,
|
||||
@@ -490,7 +497,7 @@ public:
|
||||
::shared_ptr<statements::batch_statement>,
|
||||
service::query_state& query_state,
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries);
|
||||
std::unordered_map<prepared_cache_key_type, statements::prepared_statement::checked_weak_ptr> pending_authorization_entries);
|
||||
|
||||
future<service::broadcast_tables::query_result>
|
||||
execute_broadcast_table_query(const service::broadcast_tables::query&);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -23,113 +23,15 @@ namespace cql3 {
|
||||
|
||||
namespace restrictions {
|
||||
|
||||
/// A set of discrete values.
|
||||
using value_list = std::vector<managed_bytes>; // Sorted and deduped using value comparator.
|
||||
|
||||
/// General set of values. Empty set and single-element sets are always value_list. interval is
|
||||
/// never singular and never has start > end. Universal set is a interval with both bounds null.
|
||||
using value_set = std::variant<value_list, interval<managed_bytes>>;
|
||||
|
||||
// For some boolean expression (say (X = 3) = TRUE, this represents a function that solves for X.
|
||||
// (here, it would return 3). The expression is obtained by equating some factors of the WHERE
|
||||
// clause to TRUE.
|
||||
using solve_for_t = std::function<value_set (const query_options&)>;
|
||||
|
||||
struct on_row {
|
||||
bool operator==(const on_row&) const = default;
|
||||
};
|
||||
|
||||
struct on_column {
|
||||
const column_definition* column;
|
||||
|
||||
bool operator==(const on_column&) const = default;
|
||||
};
|
||||
|
||||
// Placeholder type indicating we're solving for the partition key token.
|
||||
struct on_partition_key_token {
|
||||
const ::schema* schema;
|
||||
|
||||
bool operator==(const on_partition_key_token&) const = default;
|
||||
};
|
||||
|
||||
struct on_clustering_key_prefix {
|
||||
std::vector<const column_definition*> columns;
|
||||
|
||||
bool operator==(const on_clustering_key_prefix&) const = default;
|
||||
};
|
||||
|
||||
// A predicate on a column or a combination of columns. The WHERE clause analyzer
|
||||
// will attempt to convert predicates (that return true or false for a particular row)
|
||||
// to solvers (that return the set of column values that satisfy the predicate) when possible.
|
||||
struct predicate {
|
||||
// A function that returns the set of values that satisfy the filter. Can be unset,
|
||||
// in which case the filter must be interpreted.
|
||||
solve_for_t solve_for;
|
||||
// The original filter for this column.
|
||||
expr::expression filter;
|
||||
// What column the predicate can be solved for
|
||||
std::variant<
|
||||
on_row, // cannot determine, so predicate is on entire row
|
||||
on_column, // solving for a single column: e.g. c1 = 3
|
||||
on_partition_key_token, // solving for the token, e.g. token(pk1, pk2) >= :var
|
||||
on_clustering_key_prefix // solving for a clustering key prefix: e.g. (ck1, ck2) >= (3, 4)
|
||||
> on;
|
||||
// Whether the returned value_set will resolve to a single value.
|
||||
bool is_singleton = false;
|
||||
// Whether the returned value_set follows CQL comparison semantics
|
||||
bool comparable = true;
|
||||
bool is_multi_column = false;
|
||||
bool is_not_null_single_column = false;
|
||||
bool equality = false; // operator is EQ
|
||||
bool is_in = false; // operator is IN
|
||||
bool is_slice = false; // operator is LT/LTE/GT/GTE
|
||||
bool is_upper_bound = false; // operator is LT/LTE
|
||||
bool is_lower_bound = false; // operator is GT/GTE
|
||||
expr::comparison_order order = expr::comparison_order::cql;
|
||||
std::optional<expr::oper_t> op; // the binary operator, if any
|
||||
bool is_subscript = false; // whether the LHS is a subscript (map element access)
|
||||
};
|
||||
|
||||
///In some cases checking if columns have indexes is undesired of even
|
||||
///impossible, because e.g. the query runs on a pseudo-table, which does not
|
||||
///have an index-manager, or even a table object.
|
||||
using check_indexes = bool_class<class check_indexes_tag>;
|
||||
|
||||
// A function that returns the partition key ranges for a query. It is the solver of
|
||||
// WHERE clause fragments such as WHERE token(pk) > 1 or WHERE pk1 IN :list1 AND pk2 IN :list2.
|
||||
using get_partition_key_ranges_fn_t = std::function<dht::partition_range_vector (const query_options&)>;
|
||||
|
||||
// A function that returns the clustering key ranges for a query. It is the solver of
|
||||
// WHERE clause fragments such as WHERE ck > 1 or WHERE (ck1, ck2) > (1, 2).
|
||||
using get_clustering_bounds_fn_t = std::function<std::vector<query::clustering_range> (const query_options& options)>;
|
||||
|
||||
// A function that returns a singleton value, usable for a key (e.g. bytes_opt)
|
||||
using get_singleton_value_fn_t = std::function<bytes_opt (const query_options&)>;
|
||||
|
||||
struct no_partition_range_restrictions {
|
||||
};
|
||||
|
||||
struct token_range_restrictions {
|
||||
predicate token_restrictions;
|
||||
};
|
||||
|
||||
struct single_column_partition_range_restrictions {
|
||||
std::vector<predicate> per_column_restrictions;
|
||||
};
|
||||
|
||||
using partition_range_restrictions = std::variant<
|
||||
no_partition_range_restrictions,
|
||||
token_range_restrictions,
|
||||
single_column_partition_range_restrictions>;
|
||||
|
||||
// A map of per-column predicate vectors, ordered by schema position.
|
||||
using single_column_predicate_vectors = std::map<const column_definition*, std::vector<predicate>, expr::schema_pos_column_definition_comparator>;
|
||||
|
||||
/**
|
||||
* The restrictions corresponding to the relations specified on the where-clause of CQL query.
|
||||
*/
|
||||
class statement_restrictions {
|
||||
struct private_tag {}; // Tag for private constructor
|
||||
private:
|
||||
schema_ptr _schema;
|
||||
|
||||
@@ -179,7 +81,7 @@ private:
|
||||
bool _has_queriable_regular_index = false, _has_queriable_pk_index = false, _has_queriable_ck_index = false;
|
||||
bool _has_multi_column; ///< True iff _clustering_columns_restrictions has a multi-column restriction.
|
||||
|
||||
std::vector<expr::expression> _where; ///< The entire WHERE clause (factorized).
|
||||
std::optional<expr::expression> _where; ///< The entire WHERE clause.
|
||||
|
||||
/// Parts of _where defining the clustering slice.
|
||||
///
|
||||
@@ -194,7 +96,7 @@ private:
|
||||
/// 4.4 elements other than the last have only EQ or IN atoms
|
||||
/// 4.5 the last element has only EQ, IN, or is_slice() atoms
|
||||
/// 5. if multi-column, then each element is a binary_operator
|
||||
std::vector<predicate> _clustering_prefix_restrictions;
|
||||
std::vector<expr::expression> _clustering_prefix_restrictions;
|
||||
|
||||
/// Like _clustering_prefix_restrictions, but for the indexing table (if this is an index-reading statement).
|
||||
/// Recall that the index-table CK is (token, PK, CK) of the base table for a global index and (indexed column,
|
||||
@@ -203,7 +105,7 @@ private:
|
||||
/// Elements are conjunctions of single-column binary operators with the same LHS.
|
||||
/// Element order follows the indexing-table clustering key.
|
||||
/// In case of a global index the first element's (token restriction) RHS is a dummy value, it is filled later.
|
||||
std::optional<std::vector<predicate>> _idx_tbl_ck_prefix;
|
||||
std::optional<std::vector<expr::expression>> _idx_tbl_ck_prefix;
|
||||
|
||||
/// Parts of _where defining the partition range.
|
||||
///
|
||||
@@ -211,25 +113,16 @@ private:
|
||||
/// binary_operators on token. If single-column restrictions define the partition range, each element holds
|
||||
/// restrictions for one partition column. Each partition column has a corresponding element, but the elements
|
||||
/// are in arbitrary order.
|
||||
partition_range_restrictions _partition_range_restrictions;
|
||||
std::vector<expr::expression> _partition_range_restrictions;
|
||||
|
||||
bool _partition_range_is_simple; ///< False iff _partition_range_restrictions imply a Cartesian product.
|
||||
|
||||
|
||||
check_indexes _check_indexes = check_indexes::yes;
|
||||
/// Columns that appear on the LHS of an EQ restriction (not IN).
|
||||
/// For multi-column EQ like (ck1, ck2) = (1, 2), all columns in the tuple are included.
|
||||
std::unordered_set<const column_definition*> _columns_with_eq;
|
||||
std::vector<const column_definition*> _column_defs_for_filtering;
|
||||
schema_ptr _view_schema;
|
||||
std::optional<secondary_index::index> _idx_opt;
|
||||
expr::expression _idx_restrictions = expr::conjunction({});
|
||||
get_partition_key_ranges_fn_t _get_partition_key_ranges_fn;
|
||||
get_clustering_bounds_fn_t _get_clustering_bounds_fn;
|
||||
get_clustering_bounds_fn_t _get_global_index_clustering_ranges_fn;
|
||||
get_clustering_bounds_fn_t _get_global_index_token_clustering_ranges_fn;
|
||||
get_clustering_bounds_fn_t _get_local_index_clustering_ranges_fn;
|
||||
get_singleton_value_fn_t _value_for_index_partition_key_fn;
|
||||
public:
|
||||
/**
|
||||
* Creates a new empty <code>StatementRestrictions</code>.
|
||||
@@ -237,10 +130,9 @@ public:
|
||||
* @param cfm the column family meta data
|
||||
* @return a new empty <code>StatementRestrictions</code>.
|
||||
*/
|
||||
statement_restrictions(private_tag, schema_ptr schema, bool allow_filtering);
|
||||
statement_restrictions(schema_ptr schema, bool allow_filtering);
|
||||
|
||||
public:
|
||||
friend shared_ptr<const statement_restrictions> analyze_statement_restrictions(
|
||||
friend statement_restrictions analyze_statement_restrictions(
|
||||
data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
statements::statement_type type,
|
||||
@@ -250,15 +142,9 @@ public:
|
||||
bool for_view,
|
||||
bool allow_filtering,
|
||||
check_indexes do_check_indexes);
|
||||
friend shared_ptr<const statement_restrictions> make_trivial_statement_restrictions(
|
||||
schema_ptr schema,
|
||||
bool allow_filtering);
|
||||
|
||||
// Important: objects of this class captures `this` extensively and so must remain non-copyable.
|
||||
statement_restrictions(const statement_restrictions&) = delete;
|
||||
statement_restrictions& operator=(const statement_restrictions&) = delete;
|
||||
statement_restrictions(private_tag,
|
||||
data_dictionary::database db,
|
||||
private:
|
||||
statement_restrictions(data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
statements::statement_type type,
|
||||
const expr::expression& where_clause,
|
||||
@@ -325,7 +211,10 @@ public:
|
||||
|
||||
bool has_token_restrictions() const;
|
||||
|
||||
// Checks whether the given column has an EQ restriction (not IN).
|
||||
// Checks whether the given column has an EQ restriction.
|
||||
// EQ restriction is `col = ...` or `(col, col2) = ...`
|
||||
// IN restriction is NOT an EQ restriction, this function will not look for IN restrictions.
|
||||
// Uses column_defintion::operator== for comparison, columns with the same name but different schema will not be equal.
|
||||
bool has_eq_restriction_on_column(const column_definition&) const;
|
||||
|
||||
/**
|
||||
@@ -335,6 +224,12 @@ public:
|
||||
*/
|
||||
std::vector<const column_definition*> get_column_defs_for_filtering(data_dictionary::database db) const;
|
||||
|
||||
/**
|
||||
* Gives a score that the index has - index with the highest score will be chosen
|
||||
* in find_idx()
|
||||
*/
|
||||
int score(const secondary_index::index& index) const;
|
||||
|
||||
/**
|
||||
* Determines the index to be used with the restriction.
|
||||
* @param db - the data_dictionary::database context (for extracting index manager)
|
||||
@@ -355,8 +250,18 @@ public:
|
||||
|
||||
size_t partition_key_restrictions_size() const;
|
||||
|
||||
bool parition_key_restrictions_have_supporting_index(const secondary_index::secondary_index_manager& index_manager, expr::allow_local_index allow_local) const;
|
||||
|
||||
size_t clustering_columns_restrictions_size() const;
|
||||
|
||||
bool clustering_columns_restrictions_have_supporting_index(
|
||||
const secondary_index::secondary_index_manager& index_manager,
|
||||
expr::allow_local_index allow_local) const;
|
||||
|
||||
bool multi_column_clustering_restrictions_are_supported_by(const secondary_index::index& index) const;
|
||||
|
||||
bounds_slice get_clustering_slice() const;
|
||||
|
||||
/**
|
||||
* Checks if the clustering key has some unrestricted components.
|
||||
* @return <code>true</code> if the clustering key has some unrestricted components, <code>false</code> otherwise.
|
||||
@@ -374,6 +279,15 @@ public:
|
||||
|
||||
schema_ptr get_view_schema() const { return _view_schema; }
|
||||
private:
|
||||
std::pair<std::optional<secondary_index::index>, expr::expression> do_find_idx(const secondary_index::secondary_index_manager& sim) const;
|
||||
void add_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering, bool for_view);
|
||||
void add_is_not_restriction(const expr::binary_operator& restr, schema_ptr schema, bool for_view);
|
||||
void add_single_column_parition_key_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering, bool for_view);
|
||||
void add_token_partition_key_restriction(const expr::binary_operator& restr);
|
||||
void add_single_column_clustering_key_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering);
|
||||
void add_multi_column_clustering_key_restriction(const expr::binary_operator& restr);
|
||||
void add_single_column_nonprimary_key_restriction(const expr::binary_operator& restr);
|
||||
|
||||
void process_partition_key_restrictions(bool for_view, bool allow_filtering, statements::statement_type type);
|
||||
|
||||
/**
|
||||
@@ -401,17 +315,7 @@ private:
|
||||
void add_clustering_restrictions_to_idx_ck_prefix(const schema& idx_tbl_schema);
|
||||
|
||||
unsigned int num_clustering_prefix_columns_that_need_not_be_filtered() const;
|
||||
void calculate_column_defs_for_filtering_and_erase_restrictions_used_for_index(
|
||||
data_dictionary::database db,
|
||||
const single_column_predicate_vectors& sc_pk_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_ck_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_nonpk_pred_vectors);
|
||||
get_partition_key_ranges_fn_t build_partition_key_ranges_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_clustering_bounds_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_global_index_clustering_ranges_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_global_index_token_clustering_ranges_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_local_index_clustering_ranges_fn() const;
|
||||
get_singleton_value_fn_t build_value_for_index_partition_key_fn() const;
|
||||
void calculate_column_defs_for_filtering_and_erase_restrictions_used_for_index(data_dictionary::database db);
|
||||
public:
|
||||
/**
|
||||
* Returns the specified range of the partition key.
|
||||
@@ -485,10 +389,7 @@ public:
|
||||
private:
|
||||
/// Prepares internal data for evaluating index-table queries. Must be called before
|
||||
/// get_local_index_clustering_ranges().
|
||||
void prepare_indexed_local(const schema& idx_tbl_schema,
|
||||
const single_column_predicate_vectors& sc_pk_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_ck_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_nonpk_pred_vectors);
|
||||
void prepare_indexed_local(const schema& idx_tbl_schema);
|
||||
|
||||
/// Prepares internal data for evaluating index-table queries. Must be called before
|
||||
/// get_global_index_clustering_ranges() or get_global_index_token_clustering_ranges().
|
||||
@@ -497,18 +398,15 @@ private:
|
||||
public:
|
||||
/// Calculates clustering ranges for querying a global-index table.
|
||||
std::vector<query::clustering_range> get_global_index_clustering_ranges(
|
||||
const query_options& options) const;
|
||||
const query_options& options, const schema& idx_tbl_schema) const;
|
||||
|
||||
/// Calculates clustering ranges for querying a global-index table for queries with token restrictions present.
|
||||
std::vector<query::clustering_range> get_global_index_token_clustering_ranges(
|
||||
const query_options& options) const;
|
||||
const query_options& options, const schema& idx_tbl_schema) const;
|
||||
|
||||
/// Calculates clustering ranges for querying a local-index table.
|
||||
std::vector<query::clustering_range> get_local_index_clustering_ranges(
|
||||
const query_options& options) const;
|
||||
|
||||
/// Finds the value of partition key of the index table
|
||||
bytes_opt value_for_index_partition_key(const query_options&) const;
|
||||
const query_options& options, const schema& idx_tbl_schema) const;
|
||||
|
||||
sstring to_string() const;
|
||||
|
||||
@@ -518,7 +416,7 @@ public:
|
||||
bool is_empty() const;
|
||||
};
|
||||
|
||||
shared_ptr<const statement_restrictions> analyze_statement_restrictions(
|
||||
statement_restrictions analyze_statement_restrictions(
|
||||
data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
statements::statement_type type,
|
||||
@@ -529,14 +427,23 @@ shared_ptr<const statement_restrictions> analyze_statement_restrictions(
|
||||
bool allow_filtering,
|
||||
check_indexes do_check_indexes);
|
||||
|
||||
shared_ptr<const statement_restrictions> make_trivial_statement_restrictions(
|
||||
schema_ptr schema,
|
||||
bool allow_filtering);
|
||||
|
||||
// Extracts all binary operators which have the given column on their left hand side.
|
||||
// Extracts only single-column restrictions.
|
||||
// Does not include multi-column restrictions.
|
||||
// Does not include token() restrictions.
|
||||
// Does not include boolean constant restrictions.
|
||||
// For example "WHERE c = 1 AND (a, c) = (2, 1) AND token(p) < 2 AND FALSE" will return {"c = 1"}.
|
||||
std::vector<expr::expression> extract_single_column_restrictions_for_column(const expr::expression&, const column_definition&);
|
||||
|
||||
|
||||
// Checks whether this expression is empty - doesn't restrict anything
|
||||
bool is_empty_restriction(const expr::expression&);
|
||||
|
||||
// Finds the value of the given column in the expression
|
||||
// In case of multiple possible values, calls on_internal_error
|
||||
bytes_opt value_for(const column_definition&, const expr::expression&, const query_options&);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -90,20 +90,6 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
|
||||
auto& current_rf_per_dc = ks.metadata()->strategy_options();
|
||||
auto new_rf_per_dc = _attrs->get_replication_options();
|
||||
new_rf_per_dc.erase(ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY);
|
||||
// Check if multi-RF change is allowed: all DC changes must be 0->N or N->0.
|
||||
auto all_changes_are_0_N = [&] {
|
||||
for (const auto& [dc, new_rf] : new_rf_per_dc) {
|
||||
auto old_rf_val = size_t(0);
|
||||
if (auto it = current_rf_per_dc.find(dc); it != current_rf_per_dc.end()) {
|
||||
old_rf_val = locator::get_replication_factor(it->second);
|
||||
}
|
||||
auto new_rf_val = locator::get_replication_factor(new_rf);
|
||||
if (old_rf_val != new_rf_val && old_rf_val != 0 && new_rf_val != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
unsigned total_abs_rfs_diff = 0;
|
||||
for (const auto& [new_dc, new_rf] : new_rf_per_dc) {
|
||||
auto old_rf = locator::replication_strategy_config_option(sstring("0"));
|
||||
@@ -117,9 +103,7 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
|
||||
// first we need to report non-existing DCs, then if RFs aren't changed by too much.
|
||||
continue;
|
||||
}
|
||||
if (total_abs_rfs_diff += get_abs_rf_diff(old_rf, new_rf); total_abs_rfs_diff >= 2 &&
|
||||
!(qp.proxy().features().keyspace_multi_rf_change && locator::uses_rack_list_exclusively(current_rf_per_dc)
|
||||
&& locator::uses_rack_list_exclusively(new_ks->strategy_options()) && all_changes_are_0_N())) {
|
||||
if (total_abs_rfs_diff += get_abs_rf_diff(old_rf, new_rf); total_abs_rfs_diff >= 2) {
|
||||
throw exceptions::invalid_request_exception("Only one DC's RF can be changed at a time and not by more than 1");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include <seastar/core/execution_stage.hh>
|
||||
#include "cas_request.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "tracing/trace_state.hh"
|
||||
#include "utils/unique_view.hh"
|
||||
|
||||
@@ -89,10 +89,6 @@ public:
|
||||
|
||||
const std::vector<single_statement>& statements() const { return _statements; }
|
||||
|
||||
audit::audit_info_ptr audit_info() const {
|
||||
return audit::audit::create_audit_info(audit::statement_category::DML, sstring(), sstring(), true);
|
||||
}
|
||||
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual uint32_t get_bound_terms() const override;
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#include "cql3/expr/evaluate.hh"
|
||||
#include "cql3/query_options.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "cql3/values.hh"
|
||||
#include "timeout_config.hh"
|
||||
#include "service/broadcast_tables/experimental/lang.hh"
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "auth/service.hh"
|
||||
#include "cql3/statements/prepared_statement.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "unimplemented.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "transport/event.hh"
|
||||
|
||||
@@ -411,10 +411,10 @@ bool ks_prop_defs::get_durable_writes() const {
|
||||
|
||||
lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(sstring ks_name, const locator::token_metadata& tm, const gms::feature_service& feat, const db::config& cfg) {
|
||||
auto sc = get_replication_strategy_class().value();
|
||||
// if tablets options have not been specified, but tablets are globally enabled, set the value to 0. The strategy will
|
||||
// validate it and throw an error if it does not support tablets.
|
||||
// if tablets options have not been specified, but tablets are globally enabled, set the value to 0 for NetworkTopologyStrategy (N.T.S.) only
|
||||
auto enable_tablets = feat.tablets && cfg.enable_tablets_by_default();
|
||||
std::optional<unsigned> default_initial_tablets = enable_tablets ? std::optional<unsigned>(0) : std::nullopt;
|
||||
std::optional<unsigned> default_initial_tablets = enable_tablets && locator::abstract_replication_strategy::to_qualified_class_name(sc) == "org.apache.cassandra.locator.NetworkTopologyStrategy"
|
||||
? std::optional<unsigned>(0) : std::nullopt;
|
||||
auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
|
||||
bool uses_tablets = initial_tablets.has_value();
|
||||
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
|
||||
@@ -440,7 +440,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
|
||||
sc = old->strategy_name();
|
||||
options = old_options;
|
||||
}
|
||||
return data_dictionary::keyspace_metadata::new_keyspace(old->name(), *sc, options, initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options(), {}, old->next_strategy_options_opt());
|
||||
return data_dictionary::keyspace_metadata::new_keyspace(old->name(), *sc, options, initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -626,7 +626,7 @@ modification_statement::prepare(data_dictionary::database db, prepare_context& c
|
||||
// Since this cache is only meaningful for LWT queries, just clear the ids
|
||||
// if it's not a conditional statement so that the AST nodes don't
|
||||
// participate in the caching mechanism later.
|
||||
if (!prepared_stmt->has_conditions() && prepared_stmt->_restrictions) {
|
||||
if (!prepared_stmt->has_conditions() && prepared_stmt->_restrictions.has_value()) {
|
||||
ctx.clear_pk_function_calls_cache();
|
||||
}
|
||||
prepared_stmt->_may_use_token_aware_routing = ctx.get_partition_key_bind_indexes(*schema).size() != 0;
|
||||
|
||||
@@ -94,7 +94,7 @@ private:
|
||||
std::optional<bool> _is_raw_counter_shard_write;
|
||||
|
||||
protected:
|
||||
shared_ptr<const restrictions::statement_restrictions> _restrictions;
|
||||
std::optional<restrictions::statement_restrictions> _restrictions;
|
||||
public:
|
||||
typedef std::optional<std::unordered_map<sstring, bytes_opt>> json_cache_opt;
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ public:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
|
||||
@@ -109,7 +109,7 @@ public:
|
||||
std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats, const cql_config& cfg, bool for_view);
|
||||
private:
|
||||
std::vector<selection::prepared_selector> maybe_jsonize_select_clause(std::vector<selection::prepared_selector> select, data_dictionary::database db, schema_ptr schema);
|
||||
::shared_ptr<const restrictions::statement_restrictions> prepare_restrictions(
|
||||
::shared_ptr<restrictions::statement_restrictions> prepare_restrictions(
|
||||
data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
prepare_context& ctx,
|
||||
|
||||
@@ -1027,7 +1027,7 @@ view_indexed_table_select_statement::prepare(data_dictionary::database db,
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
@@ -1139,7 +1139,7 @@ lw_shared_ptr<const service::pager::paging_state> view_indexed_table_select_stat
|
||||
auto& last_base_pk = last_pos.partition;
|
||||
auto* last_base_ck = last_pos.position.has_key() ? &last_pos.position.key() : nullptr;
|
||||
|
||||
bytes_opt indexed_column_value = _restrictions->value_for_index_partition_key(options);
|
||||
bytes_opt indexed_column_value = restrictions::value_for(*cdef, _used_index_restrictions, options);
|
||||
|
||||
auto index_pk = [&]() {
|
||||
if (_index.metadata().local()) {
|
||||
@@ -1350,7 +1350,12 @@ dht::partition_range_vector view_indexed_table_select_statement::get_partition_r
|
||||
dht::partition_range_vector view_indexed_table_select_statement::get_partition_ranges_for_global_index_posting_list(const query_options& options) const {
|
||||
dht::partition_range_vector partition_ranges;
|
||||
|
||||
bytes_opt value = _restrictions->value_for_index_partition_key(options);
|
||||
const column_definition* cdef = _schema->get_column_definition(to_bytes(_index.target_column()));
|
||||
if (!cdef) {
|
||||
throw exceptions::invalid_request_exception("Indexed column not found in schema");
|
||||
}
|
||||
|
||||
bytes_opt value = restrictions::value_for(*cdef, _used_index_restrictions, options);
|
||||
if (value) {
|
||||
auto pk = partition_key::from_single_value(*_view_schema, *value);
|
||||
auto dk = dht::decorate_key(*_view_schema, pk);
|
||||
@@ -1369,11 +1374,11 @@ query::partition_slice view_indexed_table_select_statement::get_partition_slice_
|
||||
// Only EQ restrictions on base partition key can be used in an index view query
|
||||
if (pk_restrictions_is_single && _restrictions->partition_key_restrictions_is_all_eq()) {
|
||||
partition_slice_builder.with_ranges(
|
||||
_restrictions->get_global_index_clustering_ranges(options));
|
||||
_restrictions->get_global_index_clustering_ranges(options, *_view_schema));
|
||||
} else if (_restrictions->has_token_restrictions()) {
|
||||
// Restrictions like token(p1, p2) < 0 have all partition key components restricted, but require special handling.
|
||||
partition_slice_builder.with_ranges(
|
||||
_restrictions->get_global_index_token_clustering_ranges(options));
|
||||
_restrictions->get_global_index_token_clustering_ranges(options, *_view_schema));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1384,7 +1389,7 @@ query::partition_slice view_indexed_table_select_statement::get_partition_slice_
|
||||
partition_slice_builder partition_slice_builder{*_view_schema};
|
||||
|
||||
partition_slice_builder.with_ranges(
|
||||
_restrictions->get_local_index_clustering_ranges(options));
|
||||
_restrictions->get_local_index_clustering_ranges(options, *_view_schema));
|
||||
|
||||
return partition_slice_builder.build();
|
||||
}
|
||||
@@ -1602,7 +1607,7 @@ public:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
@@ -1640,7 +1645,7 @@ private:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const select_statement::parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
parallelized_select_statement::ordering_comparator_type ordering_comparator,
|
||||
@@ -2071,7 +2076,7 @@ static select_statement::ordering_comparator_type get_similarity_ordering_compar
|
||||
|
||||
::shared_ptr<cql3::statements::select_statement> vector_indexed_table_select_statement::prepare(data_dictionary::database db, schema_ptr schema,
|
||||
uint32_t bound_terms, lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
|
||||
std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<attributes> attrs) {
|
||||
|
||||
@@ -2584,7 +2589,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
|
||||
return make_unique<prepared_statement>(audit_info(), std::move(stmt), ctx, std::move(partition_key_bind_indices), std::move(warnings));
|
||||
}
|
||||
|
||||
::shared_ptr<const restrictions::statement_restrictions>
|
||||
::shared_ptr<restrictions::statement_restrictions>
|
||||
select_statement::prepare_restrictions(data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
prepare_context& ctx,
|
||||
@@ -2594,8 +2599,8 @@ select_statement::prepare_restrictions(data_dictionary::database db,
|
||||
restrictions::check_indexes do_check_indexes)
|
||||
{
|
||||
try {
|
||||
return restrictions::analyze_statement_restrictions(db, schema, statement_type::SELECT, _where_clause, ctx,
|
||||
selection->contains_only_static_columns(), for_view, allow_filtering, do_check_indexes);
|
||||
return ::make_shared<restrictions::statement_restrictions>(restrictions::analyze_statement_restrictions(db, schema, statement_type::SELECT, _where_clause, ctx,
|
||||
selection->contains_only_static_columns(), for_view, allow_filtering, do_check_indexes));
|
||||
} catch (const exceptions::unrecognized_entity_exception& e) {
|
||||
if (contains_alias(e.entity)) {
|
||||
throw exceptions::invalid_request_exception(format("Aliases aren't allowed in the WHERE clause (name: '{}')", e.entity));
|
||||
|
||||
@@ -200,7 +200,7 @@ public:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
@@ -372,7 +372,7 @@ public:
|
||||
|
||||
static ::shared_ptr<cql3::statements::select_statement> prepare(data_dictionary::database db, schema_ptr schema, uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
|
||||
std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<cql3::attributes> attrs);
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "cql3/cql_statement.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "raw/parsed_statement.hh"
|
||||
#include "service/qos/qos_common.hh"
|
||||
#include "service/query_state.hh"
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "cql3/cql_statement.hh"
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "unimplemented.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include <optional>
|
||||
#include "validation.hh"
|
||||
|
||||
@@ -66,7 +66,7 @@ public:
|
||||
: update_statement(std::move(audit_info), statement_type::INSERT, bound_terms, s, std::move(attrs), stats)
|
||||
, _value(std::move(v))
|
||||
, _default_unset(default_unset) {
|
||||
_restrictions = cql3::restrictions::make_trivial_statement_restrictions(s, false);
|
||||
_restrictions = restrictions::statement_restrictions(s, false);
|
||||
}
|
||||
private:
|
||||
virtual void execute_operations_for_key(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const json_cache_opt& json_cache) const override;
|
||||
|
||||
@@ -224,12 +224,10 @@ keyspace_metadata::keyspace_metadata(std::string_view name,
|
||||
bool durable_writes,
|
||||
std::vector<schema_ptr> cf_defs,
|
||||
user_types_metadata user_types,
|
||||
storage_options storage_opts,
|
||||
std::optional<locator::replication_strategy_config_options> next_options)
|
||||
storage_options storage_opts)
|
||||
: _name{name}
|
||||
, _strategy_name{locator::abstract_replication_strategy::to_qualified_class_name(strategy_name.empty() ? "NetworkTopologyStrategy" : strategy_name)}
|
||||
, _strategy_options{std::move(strategy_options)}
|
||||
, _next_strategy_options{std::move(next_options)}
|
||||
, _initial_tablets(initial_tablets)
|
||||
, _durable_writes{durable_writes}
|
||||
, _user_types{std::move(user_types)}
|
||||
@@ -275,15 +273,14 @@ keyspace_metadata::new_keyspace(std::string_view name,
|
||||
std::optional<consistency_config_option> consistency_option,
|
||||
bool durables_writes,
|
||||
storage_options storage_opts,
|
||||
std::vector<schema_ptr> cf_defs,
|
||||
std::optional<locator::replication_strategy_config_options> next_options)
|
||||
std::vector<schema_ptr> cf_defs)
|
||||
{
|
||||
return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, initial_tablets, consistency_option, durables_writes, cf_defs, user_types_metadata{}, storage_opts, next_options);
|
||||
return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, initial_tablets, consistency_option, durables_writes, cf_defs, user_types_metadata{}, storage_opts);
|
||||
}
|
||||
|
||||
lw_shared_ptr<keyspace_metadata>
|
||||
keyspace_metadata::new_keyspace(const keyspace_metadata& ksm) {
|
||||
return new_keyspace(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(), ksm.get_storage_options(), {}, ksm.next_strategy_options_opt());
|
||||
return new_keyspace(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(), ksm.get_storage_options());
|
||||
}
|
||||
|
||||
void keyspace_metadata::add_user_type(const user_type ut) {
|
||||
@@ -652,8 +649,8 @@ struct fmt::formatter<data_dictionary::user_types_metadata> {
|
||||
};
|
||||
|
||||
auto fmt::formatter<data_dictionary::keyspace_metadata>::format(const data_dictionary::keyspace_metadata& m, fmt::format_context& ctx) const -> decltype(ctx.out()) {
|
||||
fmt::format_to(ctx.out(), "KSMetaData{{name={}, strategyClass={}, strategyOptions={}, nextStrategyOptions={}, cfMetaData={}, durable_writes={}, tablets=",
|
||||
m.name(), m.strategy_name(), m.strategy_options(), m.next_strategy_options_opt(), m.cf_meta_data(), m.durable_writes());
|
||||
fmt::format_to(ctx.out(), "KSMetaData{{name={}, strategyClass={}, strategyOptions={}, cfMetaData={}, durable_writes={}, tablets=",
|
||||
m.name(), m.strategy_name(), m.strategy_options(), m.cf_meta_data(), m.durable_writes());
|
||||
if (m.initial_tablets()) {
|
||||
if (auto initial_tablets = m.initial_tablets().value()) {
|
||||
fmt::format_to(ctx.out(), "{{\"initial\":{}}}", initial_tablets);
|
||||
|
||||
@@ -28,9 +28,7 @@ namespace data_dictionary {
|
||||
class keyspace_metadata final {
|
||||
sstring _name;
|
||||
sstring _strategy_name;
|
||||
// If _next_strategy_options has value, there is ongoing rf change of this keyspace.
|
||||
locator::replication_strategy_config_options _strategy_options;
|
||||
std::optional<locator::replication_strategy_config_options> _next_strategy_options;
|
||||
std::optional<unsigned> _initial_tablets;
|
||||
std::unordered_map<sstring, schema_ptr> _cf_meta_data;
|
||||
bool _durable_writes;
|
||||
@@ -46,8 +44,7 @@ public:
|
||||
bool durable_writes,
|
||||
std::vector<schema_ptr> cf_defs = std::vector<schema_ptr>{},
|
||||
user_types_metadata user_types = user_types_metadata{},
|
||||
storage_options storage_opts = storage_options{},
|
||||
std::optional<locator::replication_strategy_config_options> next_options = std::nullopt);
|
||||
storage_options storage_opts = storage_options{});
|
||||
static lw_shared_ptr<keyspace_metadata>
|
||||
new_keyspace(std::string_view name,
|
||||
std::string_view strategy_name,
|
||||
@@ -56,8 +53,7 @@ public:
|
||||
std::optional<consistency_config_option> consistency_option,
|
||||
bool durables_writes = true,
|
||||
storage_options storage_opts = {},
|
||||
std::vector<schema_ptr> cf_defs = {},
|
||||
std::optional<locator::replication_strategy_config_options> next_options = std::nullopt);
|
||||
std::vector<schema_ptr> cf_defs = {});
|
||||
static lw_shared_ptr<keyspace_metadata>
|
||||
new_keyspace(const keyspace_metadata& ksm);
|
||||
void validate(const gms::feature_service&, const locator::topology&) const;
|
||||
@@ -70,18 +66,6 @@ public:
|
||||
const locator::replication_strategy_config_options& strategy_options() const {
|
||||
return _strategy_options;
|
||||
}
|
||||
void set_strategy_options(const locator::replication_strategy_config_options& options) {
|
||||
_strategy_options = options;
|
||||
}
|
||||
const std::optional<locator::replication_strategy_config_options>& next_strategy_options_opt() const {
|
||||
return _next_strategy_options;
|
||||
}
|
||||
void set_next_strategy_options(const locator::replication_strategy_config_options& options) {
|
||||
_next_strategy_options = options;
|
||||
}
|
||||
void clear_next_strategy_options() {
|
||||
_next_strategy_options = std::nullopt;
|
||||
}
|
||||
locator::replication_strategy_config_options strategy_options_v1() const;
|
||||
std::optional<unsigned> initial_tablets() const {
|
||||
return _initial_tablets;
|
||||
|
||||
18
db/config.cc
18
db/config.cc
@@ -330,14 +330,14 @@ const config_type& config_type_for<std::vector<db::config::error_injection_at_st
|
||||
}
|
||||
|
||||
template <>
|
||||
const config_type& config_type_for<enum_option<netw::dict_training_loop::when>>() {
|
||||
const config_type& config_type_for<enum_option<netw::dict_training_when>>() {
|
||||
static config_type ct(
|
||||
"dictionary training conditions", printable_to_json<enum_option<netw::dict_training_loop::when>>);
|
||||
"dictionary training conditions", printable_to_json<enum_option<netw::dict_training_when>>);
|
||||
return ct;
|
||||
}
|
||||
|
||||
template <>
|
||||
const config_type& config_type_for<netw::advanced_rpc_compressor::tracker::algo_config>() {
|
||||
const config_type& config_type_for<netw::algo_config>() {
|
||||
static config_type ct(
|
||||
"advanced rpc compressor config", printable_vector_to_json<enum_option<netw::compression_algorithm>>);
|
||||
return ct;
|
||||
@@ -530,9 +530,9 @@ struct convert<db::config::error_injection_at_startup> {
|
||||
|
||||
|
||||
template <>
|
||||
class convert<enum_option<netw::dict_training_loop::when>> {
|
||||
class convert<enum_option<netw::dict_training_when>> {
|
||||
public:
|
||||
static bool decode(const Node& node, enum_option<netw::dict_training_loop::when>& rhs) {
|
||||
static bool decode(const Node& node, enum_option<netw::dict_training_when>& rhs) {
|
||||
std::string name;
|
||||
if (!convert<std::string>::decode(node, name)) {
|
||||
return false;
|
||||
@@ -1110,7 +1110,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"Specifies RPC compression algorithms supported by this node. ")
|
||||
, internode_compression_enable_advanced(this, "internode_compression_enable_advanced", liveness::MustRestart, value_status::Used, false,
|
||||
"Enables the new implementation of RPC compression. If disabled, Scylla will fall back to the old implementation.")
|
||||
, rpc_dict_training_when(this, "rpc_dict_training_when", liveness::LiveUpdate, value_status::Used, netw::dict_training_loop::when::type::NEVER,
|
||||
, rpc_dict_training_when(this, "rpc_dict_training_when", liveness::LiveUpdate, value_status::Used, netw::dict_training_when::type::NEVER,
|
||||
"Specifies when RPC compression dictionary training is performed by this node.\n"
|
||||
"* `never` disables it unconditionally.\n"
|
||||
"* `when_leader` enables it only whenever the node is the Raft leader.\n"
|
||||
@@ -2025,8 +2025,8 @@ template struct utils::config_file::named_value<enum_option<db::experimental_fea
|
||||
template struct utils::config_file::named_value<enum_option<db::replication_strategy_restriction_t>>;
|
||||
template struct utils::config_file::named_value<enum_option<db::consistency_level_restriction_t>>;
|
||||
template struct utils::config_file::named_value<enum_option<db::tablets_mode_t>>;
|
||||
template struct utils::config_file::named_value<enum_option<netw::dict_training_loop::when>>;
|
||||
template struct utils::config_file::named_value<netw::advanced_rpc_compressor::tracker::algo_config>;
|
||||
template struct utils::config_file::named_value<enum_option<netw::dict_training_when>>;
|
||||
template struct utils::config_file::named_value<netw::algo_config>;
|
||||
template struct utils::config_file::named_value<std::vector<enum_option<db::experimental_features_t>>>;
|
||||
template struct utils::config_file::named_value<std::vector<enum_option<db::replication_strategy_restriction_t>>>;
|
||||
template struct utils::config_file::named_value<std::vector<enum_option<db::consistency_level_restriction_t>>>;
|
||||
@@ -2094,7 +2094,7 @@ future<gms::inet_address> resolve(const config_file::named_value<sstring>& addre
|
||||
}
|
||||
}
|
||||
|
||||
co_return coroutine::exception(std::move(ex));
|
||||
co_return seastar::coroutine::exception(std::move(ex));
|
||||
}
|
||||
|
||||
static std::vector<seastar::metrics::relabel_config> get_relable_from_yaml(const YAML::Node& yaml, const std::string& name) {
|
||||
|
||||
14
db/config.hh
14
db/config.hh
@@ -9,6 +9,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <filesystem>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <seastar/core/sstring.hh>
|
||||
@@ -16,15 +17,14 @@
|
||||
#include <seastar/util/program-options.hh>
|
||||
#include <seastar/util/log.hh>
|
||||
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/replication_strategy_type.hh"
|
||||
#include "seastarx.hh"
|
||||
#include "utils/config_file.hh"
|
||||
#include "utils/enum_option.hh"
|
||||
#include "gms/inet_address.hh"
|
||||
#include "db/hints/host_filter.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "message/dict_trainer.hh"
|
||||
#include "message/advanced_rpc_compressor.hh"
|
||||
#include "message/rpc_compression_types.hh"
|
||||
#include "db/consistency_level_type.hh"
|
||||
#include "db/tri_mode_restriction.hh"
|
||||
#include "sstables/compressor.hh"
|
||||
@@ -325,9 +325,9 @@ public:
|
||||
named_value<uint32_t> internode_compression_zstd_min_message_size;
|
||||
named_value<uint32_t> internode_compression_zstd_max_message_size;
|
||||
named_value<bool> internode_compression_checksumming;
|
||||
named_value<netw::advanced_rpc_compressor::tracker::algo_config> internode_compression_algorithms;
|
||||
named_value<netw::algo_config> internode_compression_algorithms;
|
||||
named_value<bool> internode_compression_enable_advanced;
|
||||
named_value<enum_option<netw::dict_training_loop::when>> rpc_dict_training_when;
|
||||
named_value<enum_option<netw::dict_training_when>> rpc_dict_training_when;
|
||||
named_value<uint32_t> rpc_dict_training_min_time_seconds;
|
||||
named_value<uint64_t> rpc_dict_training_min_bytes;
|
||||
named_value<bool> inter_dc_tcp_nodelay;
|
||||
@@ -739,8 +739,8 @@ extern template struct utils::config_file::named_value<enum_option<db::experimen
|
||||
extern template struct utils::config_file::named_value<enum_option<db::replication_strategy_restriction_t>>;
|
||||
extern template struct utils::config_file::named_value<enum_option<db::consistency_level_restriction_t>>;
|
||||
extern template struct utils::config_file::named_value<enum_option<db::tablets_mode_t>>;
|
||||
extern template struct utils::config_file::named_value<enum_option<netw::dict_training_loop::when>>;
|
||||
extern template struct utils::config_file::named_value<netw::advanced_rpc_compressor::tracker::algo_config>;
|
||||
extern template struct utils::config_file::named_value<enum_option<netw::dict_training_when>>;
|
||||
extern template struct utils::config_file::named_value<netw::algo_config>;
|
||||
extern template struct utils::config_file::named_value<std::vector<enum_option<db::experimental_features_t>>>;
|
||||
extern template struct utils::config_file::named_value<std::vector<enum_option<db::replication_strategy_restriction_t>>>;
|
||||
extern template struct utils::config_file::named_value<std::vector<enum_option<db::consistency_level_restriction_t>>>;
|
||||
|
||||
@@ -277,7 +277,7 @@ filter_for_query(consistency_level cl,
|
||||
|
||||
host_id_vector_replica_set selected_endpoints;
|
||||
|
||||
// Preselect endpoints based on client preference. If the endpoints
|
||||
// Pre-select endpoints based on client preference. If the endpoints
|
||||
// selected this way aren't enough to satisfy CL requirements select the
|
||||
// remaining ones according to the load-balancing strategy as before.
|
||||
if (!preferred_endpoints.empty()) {
|
||||
|
||||
@@ -33,11 +33,6 @@ enum class schema_feature {
|
||||
|
||||
// Per-table tablet options
|
||||
TABLET_OPTIONS,
|
||||
|
||||
// When enabled, `system_schema.keyspaces` will keep three replication values:
|
||||
// the initial, the current, and the target replication factor,
|
||||
// which reflect the phases of the multi RF change.
|
||||
KEYSPACE_MULTI_RF_CHANGE,
|
||||
};
|
||||
|
||||
using schema_features = enum_set<super_enum<schema_feature,
|
||||
@@ -48,8 +43,7 @@ using schema_features = enum_set<super_enum<schema_feature,
|
||||
schema_feature::TABLE_DIGEST_INSENSITIVE_TO_EXPIRY,
|
||||
schema_feature::GROUP0_SCHEMA_VERSIONING,
|
||||
schema_feature::IN_MEMORY_TABLES,
|
||||
schema_feature::TABLET_OPTIONS,
|
||||
schema_feature::KEYSPACE_MULTI_RF_CHANGE
|
||||
schema_feature::TABLET_OPTIONS
|
||||
>>;
|
||||
|
||||
}
|
||||
|
||||
@@ -216,7 +216,6 @@ schema_ptr keyspaces() {
|
||||
{"durable_writes", boolean_type},
|
||||
{"replication", map_type_impl::get_instance(utf8_type, utf8_type, false)},
|
||||
{"replication_v2", map_type_impl::get_instance(utf8_type, utf8_type, false)}, // with rack list RF
|
||||
{"next_replication", map_type_impl::get_instance(utf8_type, utf8_type, false)}, // target rack list RF for this RF change
|
||||
},
|
||||
// static columns
|
||||
{},
|
||||
@@ -1179,14 +1178,6 @@ utils::chunked_vector<mutation> make_create_keyspace_mutations(schema_features f
|
||||
// If the maps are different, the upgrade must be already done.
|
||||
store_map(m, ckey, "replication_v2", timestamp, cql3::statements::to_flattened_map(map));
|
||||
}
|
||||
if (features.contains<schema_feature::KEYSPACE_MULTI_RF_CHANGE>()) {
|
||||
const auto& next_map_opt = keyspace->next_strategy_options_opt();
|
||||
if (next_map_opt) {
|
||||
auto next_map = *next_map_opt;
|
||||
next_map["class"] = keyspace->strategy_name();
|
||||
store_map(m, ckey, "next_replication", timestamp, cql3::statements::to_flattened_map(next_map));
|
||||
}
|
||||
}
|
||||
|
||||
if (features.contains<schema_feature::SCYLLA_KEYSPACES>()) {
|
||||
schema_ptr scylla_keyspaces_s = scylla_keyspaces();
|
||||
@@ -1260,7 +1251,6 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
|
||||
// (or screw up shared pointers)
|
||||
const auto& replication = row.get_nonnull<map_type_impl::native_type>("replication");
|
||||
const auto& replication_v2 = row.get<map_type_impl::native_type>("replication_v2");
|
||||
const auto& next_replication = row.get<map_type_impl::native_type>("next_replication");
|
||||
|
||||
cql3::statements::property_definitions::map_type flat_strategy_options;
|
||||
for (auto& p : replication_v2 ? *replication_v2 : replication) {
|
||||
@@ -1269,17 +1259,6 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
|
||||
auto strategy_options = cql3::statements::from_flattened_map(flat_strategy_options);
|
||||
auto strategy_name = std::get<sstring>(strategy_options["class"]);
|
||||
strategy_options.erase("class");
|
||||
|
||||
std::optional<cql3::statements::property_definitions::extended_map_type> next_strategy_options = std::nullopt;
|
||||
if (next_replication) {
|
||||
cql3::statements::property_definitions::map_type flat_next_replication;
|
||||
for (auto& p : *next_replication) {
|
||||
flat_next_replication.emplace(value_cast<sstring>(p.first), value_cast<sstring>(p.second));
|
||||
}
|
||||
next_strategy_options = cql3::statements::from_flattened_map(flat_next_replication);
|
||||
next_strategy_options->erase("class");
|
||||
}
|
||||
|
||||
bool durable_writes = row.get_nonnull<bool>("durable_writes");
|
||||
|
||||
data_dictionary::storage_options storage_opts;
|
||||
@@ -1305,7 +1284,7 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
|
||||
}
|
||||
}
|
||||
}
|
||||
co_return keyspace_metadata::new_keyspace(keyspace_name, strategy_name, strategy_options, initial_tablets, consistency, durable_writes, storage_opts, {}, next_strategy_options);
|
||||
co_return keyspace_metadata::new_keyspace(keyspace_name, strategy_name, strategy_options, initial_tablets, consistency, durable_writes, storage_opts);
|
||||
}
|
||||
|
||||
template<typename V>
|
||||
|
||||
@@ -300,7 +300,6 @@ schema_ptr system_keyspace::topology() {
|
||||
.with_column("upgrade_state", utf8_type, column_kind::static_column)
|
||||
.with_column("global_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.with_column("paused_rf_change_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.with_column("ongoing_rf_changes", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.set_comment("Current state of topology change machine")
|
||||
.with_hash_version()
|
||||
.build();
|
||||
@@ -3351,12 +3350,6 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("ongoing_rf_changes")) {
|
||||
for (auto&& v : deserialize_set_column(*topology(), some_row, "ongoing_rf_changes")) {
|
||||
ret.ongoing_rf_changes.insert(value_cast<utils::UUID>(v));
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("enabled_features")) {
|
||||
ret.enabled_features = decode_features(deserialize_set_column(*topology(), some_row, "enabled_features"));
|
||||
}
|
||||
|
||||
@@ -15,10 +15,11 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "db/view/view_build_status.hh"
|
||||
#include "gms/gossiper.hh"
|
||||
#include "gms/inet_address.hh"
|
||||
#include "gms/generation-number.hh"
|
||||
#include "gms/loaded_endpoint_state.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "query/query-result-set.hh"
|
||||
#include "db_clock.hh"
|
||||
#include "mutation_query.hh"
|
||||
#include "system_keyspace_view_types.hh"
|
||||
@@ -36,6 +37,10 @@ namespace netw {
|
||||
class shared_dict;
|
||||
};
|
||||
|
||||
namespace query {
|
||||
class result_set;
|
||||
}
|
||||
|
||||
namespace sstables {
|
||||
struct entry_descriptor;
|
||||
class generation_type;
|
||||
|
||||
@@ -29,6 +29,8 @@
|
||||
|
||||
#include "db/config.hh"
|
||||
#include "db/view/base_info.hh"
|
||||
#include "gms/gossiper.hh"
|
||||
#include "query/query-result-set.hh"
|
||||
#include "db/view/view_build_status.hh"
|
||||
#include "db/view/view_consumer.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
@@ -1584,11 +1586,9 @@ future<stop_iteration> view_update_builder::on_results() {
|
||||
|
||||
auto tombstone = std::max(_update_partition_tombstone, _update_current_tombstone);
|
||||
if (tombstone && _existing && !_existing->is_end_of_partition()) {
|
||||
if (_existing->is_range_tombstone_change()) {
|
||||
_existing_current_tombstone = _existing->as_range_tombstone_change().tombstone();
|
||||
} else if (_existing->is_clustering_row()) {
|
||||
// We don't care if it's a range tombstone, as we're only looking for existing entries that get deleted
|
||||
if (_existing->is_clustering_row()) {
|
||||
auto existing = clustering_row(*_schema, _existing->as_clustering_row());
|
||||
existing.apply(std::max(_existing_partition_tombstone, _existing_current_tombstone));
|
||||
auto update = clustering_row(existing.key(), row_tombstone(std::move(tombstone)), row_marker(), ::row());
|
||||
generate_update(std::move(update), { std::move(existing) });
|
||||
} else if (_existing->is_static_row()) {
|
||||
@@ -1599,10 +1599,9 @@ future<stop_iteration> view_update_builder::on_results() {
|
||||
return should_stop_updates() ? stop() : advance_existings();
|
||||
}
|
||||
|
||||
// If we have updates and it's a range tombstone, it removes nothing pre-exisiting, so we can ignore it
|
||||
if (_update && !_update->is_end_of_partition()) {
|
||||
if (_update->is_range_tombstone_change()) {
|
||||
_update_current_tombstone = _update->as_range_tombstone_change().tombstone();
|
||||
} else if (_update->is_clustering_row()) {
|
||||
if (_update->is_clustering_row()) {
|
||||
_update->mutate_as_clustering_row(*_schema, [&] (clustering_row& cr) mutable {
|
||||
cr.apply(std::max(_update_partition_tombstone, _update_current_tombstone));
|
||||
});
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/coroutine/parallel_for_each.hh>
|
||||
#include <seastar/core/on_internal_error.hh>
|
||||
#include "gms/gossiper.hh"
|
||||
#include "db/view/view_building_coordinator.hh"
|
||||
#include "db/view/view_build_status.hh"
|
||||
#include "locator/tablets.hh"
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
#include "raft/raft.hh"
|
||||
#include "raft/raft_fwd.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
#include "service/raft/raft_group0.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
|
||||
@@ -21,6 +21,8 @@
|
||||
#include "dht/token.hh"
|
||||
#include "replica/database.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
#include "service/raft/raft_group0.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
#include <flat_set>
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "raft/raft.hh"
|
||||
#include "raft/raft_fwd.hh"
|
||||
#include <seastar/core/gate.hh>
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "sstables/shared_sstable.hh"
|
||||
|
||||
@@ -240,9 +240,6 @@ future<> view_update_generator::process_staging_sstables(lw_shared_ptr<replica::
|
||||
_progress_tracker->on_sstable_registration(sst);
|
||||
}
|
||||
|
||||
utils::get_local_injector().inject("view_update_generator_pause_before_processing",
|
||||
utils::wait_for_message(std::chrono::minutes(5))).get();
|
||||
|
||||
// Generate view updates from staging sstables
|
||||
auto start_time = db_clock::now();
|
||||
auto [result, input_size] = generate_updates_from_staging_sstables(table, sstables);
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "cdc/metadata.hh"
|
||||
#include "db/config.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "query/query-result-set.hh"
|
||||
#include "db/virtual_table.hh"
|
||||
#include "partition_slice_builder.hh"
|
||||
#include "db/virtual_tables.hh"
|
||||
|
||||
@@ -271,7 +271,7 @@ The json structure is as follows:
|
||||
}
|
||||
|
||||
The `manifest` member contains the following attributes:
|
||||
- `version` - representing the version of the manifest itself. It is incremented when members are added or removed from the manifest.
|
||||
- `version` - respresenting the version of the manifest itself. It is incremented when members are added or removed from the manifest.
|
||||
- `scope` - the scope of metadata stored in this manifest file. The following scopes are supported:
|
||||
- `node` - the manifest describes all SSTables owned by this node in this snapshot.
|
||||
|
||||
|
||||
@@ -12,9 +12,7 @@ Schema:
|
||||
CREATE TABLE system_schema.keyspaces (
|
||||
keyspace_name text PRIMARY KEY,
|
||||
durable_writes boolean,
|
||||
replication frozen<map<text, text>>,
|
||||
replication_v2 frozen<map<text, text>>,
|
||||
next_replication frozen<map<text, text>>
|
||||
replication frozen<map<text, text>>
|
||||
)
|
||||
```
|
||||
|
||||
@@ -33,8 +31,6 @@ Columns:
|
||||
stored as a flattened map of the extended options map (see below).
|
||||
|
||||
For `SimpleStrategy` there is a single option `"replication_factor"` specifying the replication factor.
|
||||
* `next_replication` - the target replication factor for the keyspace during rf change.
|
||||
If there is no ongoing rf change, `next_replication` value is not set.
|
||||
|
||||
Extended options map used by NetworkTopologyStrategy is a map where values can be either strings or lists of strings.
|
||||
|
||||
|
||||
@@ -146,25 +146,6 @@ AWS Security Token Service (STS) or the EC2 Instance Metadata Service.
|
||||
- When set, these values are used by the S3 client to sign requests.
|
||||
- If not set, requests are sent unsigned, which may not be accepted by all servers.
|
||||
|
||||
.. _admin-oci-object-storage:
|
||||
|
||||
Using Oracle OCI Object Storage
|
||||
=================================
|
||||
|
||||
Oracle Cloud Infrastructure (OCI) Object Storage is compatible with the Amazon
|
||||
S3 API, so it works with ScyllaDB without additional configuration.
|
||||
|
||||
To use OCI Object Storage, follow the same configuration as for AWS S3, and
|
||||
specify your OCI S3-compatible endpoint.
|
||||
|
||||
Example:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
object_storage_endpoints:
|
||||
- name: https://idedxcgnkfkt.compat.objectstorage.us-ashburn-1.oci.customer-oci.com:443
|
||||
aws_region: us-ashburn-1
|
||||
|
||||
.. _admin-compression:
|
||||
|
||||
Compression
|
||||
|
||||
@@ -45,7 +45,7 @@ Example:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
nodetool removenode 675ed9f4-6564-6dbd-ca08-43fddce952de
|
||||
nodetool removenode 675ed9f4-6564-6dbd-can8-43fddce952gy
|
||||
|
||||
To only mark the node as permanently down without doing actual removal, use :doc:`nodetool excludenode </operating-scylla/nodetool-commands/excludenode>`:
|
||||
|
||||
@@ -79,6 +79,6 @@ Example:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
nodetool removenode --ignore-dead-nodes 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1db0-aac8-43fddce9123e 675ed9f4-6564-6dbd-ca08-43fddce952de
|
||||
nodetool removenode --ignore-dead-nodes 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e 675ed9f4-6564-6dbd-can8-43fddce952gy
|
||||
|
||||
.. include:: nodetool-index.rst
|
||||
|
||||
@@ -231,46 +231,6 @@ Add New DC
|
||||
|
||||
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
|
||||
|
||||
If the keyspace uses rack list replication, update the replication factor in one ``ALTER KEYSPACE`` statement, under the following rules:
|
||||
* Existing datacenters must keep their current replication factor.
|
||||
* A new datacenter can be assigned a replication factor (**0 to N**).
|
||||
* An existing datacenter can be removed (**N to 0**).
|
||||
|
||||
.. warning::
|
||||
|
||||
While adding a new datacenter and altering keyspaces, do **not** perform any reads or writes that involve the new datacenter.
|
||||
In particular, avoid using global consistency levels (such as ``ALL``, ``EACH_QUORUM``) that would include the new datacenter in the operation.
|
||||
Use ``LOCAL_*`` consistency levels (e.g., ``LOCAL_QUORUM``, ``LOCAL_ONE``) until the new datacenter is fully operational.
|
||||
|
||||
Before
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE mykeyspace4;
|
||||
|
||||
CREATE KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
The following is **not** allowed because it changes the replication factor of ``<existing_dc>`` (adds ``<existing_rack4>``) and adds ``<new_dc>`` in the same statement:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>', '<existing_rack4>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
Add all the nodes to the new datacenter and then:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
After
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE mykeyspace4;
|
||||
CREATE KEYSPACE mykeyspace4 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
You can abort the keyspace alteration using :doc:`Task manager </operating-scylla/admin-tools/task-manager>`.
|
||||
|
||||
#. If any vnode keyspace was altered, run ``nodetool rebuild`` on each node in the new datacenter, specifying the existing datacenter name in the rebuild command.
|
||||
|
||||
For example:
|
||||
|
||||
@@ -74,7 +74,7 @@ Procedure
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
UJ 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
UJ 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
Nodes in the cluster finished streaming data to the new node:
|
||||
|
||||
@@ -86,7 +86,7 @@ Procedure
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
#. When the new node status is Up Normal (UN), run the :doc:`nodetool cleanup </operating-scylla/nodetool-commands/cleanup>` command on all nodes in the cluster except for the new node that has just been added. Cleanup removes keys that were streamed to the newly added node and are no longer owned by the node.
|
||||
|
||||
|
||||
@@ -192,7 +192,7 @@ Adding new nodes
|
||||
-- Address Load Tokens Owns Host ID Rack
|
||||
UN 192.168.1.10 500 MB 256 33.3% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
|
||||
UN 192.168.1.11 500 MB 256 33.3% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
|
||||
UN 192.168.1.12 500 MB 256 33.3% 675ed9f4-6564-6dbd-ca08-43fddce952de RACK2
|
||||
UN 192.168.1.12 500 MB 256 33.3% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
|
||||
UJ 192.168.2.10 250 MB 256 ? a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
|
||||
|
||||
**Example output after bootstrap completes:**
|
||||
@@ -205,7 +205,7 @@ Adding new nodes
|
||||
-- Address Load Tokens Owns Host ID Rack
|
||||
UN 192.168.1.10 400 MB 256 25.0% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
|
||||
UN 192.168.1.11 400 MB 256 25.0% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
|
||||
UN 192.168.1.12 400 MB 256 25.0% 675ed9f4-6564-6dbd-ca08-43fddce952de RACK2
|
||||
UN 192.168.1.12 400 MB 256 25.0% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
|
||||
UN 192.168.2.10 400 MB 256 25.0% a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
|
||||
|
||||
#. For tablets-enabled clusters, wait for tablet load balancing to complete.
|
||||
|
||||
@@ -163,5 +163,5 @@ This example shows how to install and configure a three-node cluster using Gossi
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c 43
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e 44
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de 45
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy 45
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ Prerequisites
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-lac8-23fddce9123e B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
Datacenter: ASIA-DC
|
||||
Status=Up/Down
|
||||
@@ -102,34 +102,6 @@ Procedure
|
||||
|
||||
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
|
||||
|
||||
If the keyspace uses rack list replication, update the replication factor in one ``ALTER KEYSPACE`` statement, under the following rules:
|
||||
* Existing datacenters must keep their current replication factor.
|
||||
* An existing datacenter can be removed (**N to 0**).
|
||||
* A new datacenter can be assigned a replication factor (**0 to N**).
|
||||
|
||||
.. warning::
|
||||
|
||||
While removing a datacenter and altering keyspaces, do **not** perform any reads or writes that involve the datacenter being removed.
|
||||
In particular, avoid using global consistency levels (such as ``ALL``, ``EACH_QUORUM``) that would include the decommissioned datacenter in the operation.
|
||||
Use ``LOCAL_*`` consistency levels (e.g., ``LOCAL_QUORUM``, ``LOCAL_ONE``) until the datacenter is fully decommissioned.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> DESCRIBE nba4
|
||||
cqlsh> CREATE KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4', 'RAC5'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
|
||||
|
||||
The following is **not** allowed because it changes the replication factor of ``EUROPE-DC`` (adds ``RAC9``) and removes ``ASIA-DC`` in the same statement:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> ALTER KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8', 'RAC9']} AND tablets = { 'enabled': true };
|
||||
|
||||
Remove all replicas from the decommissioned datacenter:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> ALTER KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
|
||||
|
||||
.. note::
|
||||
|
||||
If table audit is enabled, the ``audit`` keyspace is automatically created with ``NetworkTopologyStrategy``.
|
||||
@@ -141,10 +113,6 @@ Procedure
|
||||
|
||||
Failure to do so will result in decommission errors such as "zero replica after the removal".
|
||||
|
||||
.. warning::
|
||||
|
||||
Removal of replicas from a datacenter cannot be aborted. To get back to the previous replication, wait until the ALTER KEYSPACE finishes and then add the replicas back by running another ALTER KEYSPACE statement.
|
||||
|
||||
#. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
|
||||
Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.
|
||||
|
||||
@@ -165,7 +133,7 @@ Procedure
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
Datacenter: EUROPE-DC
|
||||
Status=Up/Down
|
||||
|
||||
@@ -18,7 +18,7 @@ Removing a Running Node
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
#. If the node status is **Up Normal (UN)**, run the :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` command
|
||||
to remove the node you are connected to. Using ``nodetool decommission`` is the recommended method for cluster scale-down operations. It prevents data loss
|
||||
@@ -75,7 +75,7 @@ command providing the Host ID of the node you are removing. See :doc:`nodetool r
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
nodetool removenode 675ed9f4-6564-6dbd-ca08-43fddce952de
|
||||
nodetool removenode 675ed9f4-6564-6dbd-can8-43fddce952gy
|
||||
|
||||
The ``nodetool removenode`` command notifies other nodes that the token range it owns needs to be moved and
|
||||
the nodes should redistribute the data using streaming. Using the command does not guarantee the consistency of the rebalanced data if
|
||||
|
||||
@@ -23,7 +23,7 @@ Prerequisites
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
DN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
Login to one of the nodes in the cluster with (UN) status, collect the following info from the node:
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ Down (DN), and the node can be replaced.
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
Remove the Data
|
||||
==================
|
||||
@@ -72,7 +72,7 @@ Procedure
|
||||
|
||||
For example (using the Host ID of the failed node from above):
|
||||
|
||||
``replace_node_first_boot: 675ed9f4-6564-6dbd-ca08-43fddce952de``
|
||||
``replace_node_first_boot: 675ed9f4-6564-6dbd-can8-43fddce952gy``
|
||||
|
||||
#. Start the new node.
|
||||
|
||||
@@ -90,7 +90,7 @@ Procedure
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
``192.168.1.203`` is the dead node.
|
||||
|
||||
@@ -121,7 +121,7 @@ Procedure
|
||||
/192.168.1.203
|
||||
generation:1553759866
|
||||
heartbeat:2147483647
|
||||
HOST_ID:675ed9f4-6564-6dbd-ca08-43fddce952de
|
||||
HOST_ID:675ed9f4-6564-6dbd-can8-43fddce952gy
|
||||
STATUS:shutdown,true
|
||||
RELEASE_VERSION:3.0.8
|
||||
X3:3
|
||||
@@ -178,7 +178,7 @@ In this case, the node's data will be cleaned after restart. To remedy this, you
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
echo 'replace_node_first_boot: 675ed9f4-6564-6dbd-ca08-43fddce952de' | sudo tee --append /etc/scylla/scylla.yaml
|
||||
echo 'replace_node_first_boot: 675ed9f4-6564-6dbd-can8-43fddce952gy' | sudo tee --append /etc/scylla/scylla.yaml
|
||||
|
||||
#. Run the following command to re-setup RAID
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <seastar/core/seastar.hh>
|
||||
#include <seastar/core/smp.hh>
|
||||
#include "db/schema_features.hh"
|
||||
#include "utils/log.hh"
|
||||
#include "gms/feature.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
@@ -180,7 +179,6 @@ db::schema_features feature_service::cluster_schema_features() const {
|
||||
f.set<db::schema_feature::GROUP0_SCHEMA_VERSIONING>();
|
||||
f.set_if<db::schema_feature::IN_MEMORY_TABLES>(bool(in_memory_tables));
|
||||
f.set_if<db::schema_feature::TABLET_OPTIONS>(bool(tablet_options));
|
||||
f.set_if<db::schema_feature::KEYSPACE_MULTI_RF_CHANGE>(bool(keyspace_multi_rf_change));
|
||||
return f;
|
||||
}
|
||||
|
||||
|
||||
@@ -182,7 +182,6 @@ public:
|
||||
gms::feature writetime_ttl_individual_element { *this, "WRITETIME_TTL_INDIVIDUAL_ELEMENT"sv };
|
||||
gms::feature arbitrary_tablet_boundaries { *this, "ARBITRARY_TABLET_BOUNDARIES"sv };
|
||||
gms::feature large_data_virtual_tables { *this, "LARGE_DATA_VIRTUAL_TABLES"sv };
|
||||
gms::feature keyspace_multi_rf_change { *this, "KEYSPACE_MULTI_RF_CHANGE"sv };
|
||||
public:
|
||||
|
||||
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "locator/token_metadata.hh"
|
||||
#include "locator/types.hh"
|
||||
#include "gms/gossip_address_map.hh"
|
||||
#include "gms/loaded_endpoint_state.hh"
|
||||
|
||||
namespace gms {
|
||||
|
||||
@@ -71,11 +72,6 @@ struct gossip_config {
|
||||
utils::updateable_value<utils::UUID> recovery_leader;
|
||||
};
|
||||
|
||||
struct loaded_endpoint_state {
|
||||
gms::inet_address endpoint;
|
||||
std::optional<locator::endpoint_dc_rack> opt_dc_rack;
|
||||
};
|
||||
|
||||
/**
|
||||
* This module is responsible for Gossiping information for the local endpoint. This abstraction
|
||||
* maintains the list of live and dead endpoints. Periodically i.e. every 1 second this module
|
||||
|
||||
23
gms/loaded_endpoint_state.hh
Normal file
23
gms/loaded_endpoint_state.hh
Normal file
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <optional>
|
||||
|
||||
#include "gms/inet_address.hh"
|
||||
#include "locator/types.hh"
|
||||
|
||||
namespace gms {
|
||||
|
||||
// Plain aggregate describing one endpoint: its address plus, optionally,
// the datacenter/rack it belongs to.
struct loaded_endpoint_state {
|
||||
// Address of the endpoint. The unqualified name resolves to
// gms::inet_address because this struct lives in namespace gms.
inet_address endpoint;
|
||||
// DC/rack of the endpoint; std::nullopt when no DC/rack is attached
// (presumably because it is not known yet -- confirm with callers).
std::optional<locator::endpoint_dc_rack> opt_dc_rack;
|
||||
};
|
||||
|
||||
} // namespace gms
|
||||
@@ -11,7 +11,7 @@
|
||||
#include "query/query_id.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "tasks/types.hh"
|
||||
#include "service/session.hh"
|
||||
#include "service/session_id.hh"
|
||||
|
||||
namespace utils {
|
||||
class UUID final {
|
||||
@@ -43,4 +43,3 @@ class host_id final {
|
||||
};
|
||||
|
||||
} // namespace locator
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#include "utils/UUID_gen.hh"
|
||||
#include "types/types.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
#include "utils/rjson.hh"
|
||||
#include <ranges>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
@@ -284,14 +284,3 @@ future<> instance_cache::stop() {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
template <>
|
||||
struct equal_to<seastar::scheduling_group> {
|
||||
bool operator()(seastar::scheduling_group& sg1, seastar::scheduling_group& sg2) const noexcept {
|
||||
return sg1 == sg2;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "utils/sequenced_set.hh"
|
||||
#include "utils/simple_hashers.hh"
|
||||
#include "tablets.hh"
|
||||
#include "locator/replication_strategy_type.hh"
|
||||
#include "data_dictionary/consistency_config_options.hh"
|
||||
|
||||
// forward declaration since replica/database.hh includes this file
|
||||
@@ -38,13 +39,6 @@ extern logging::logger rslogger;
|
||||
using inet_address = gms::inet_address;
|
||||
using token = dht::token;
|
||||
|
||||
enum class replication_strategy_type {
|
||||
simple,
|
||||
local,
|
||||
network_topology,
|
||||
everywhere_topology,
|
||||
};
|
||||
|
||||
using replication_strategy_config_option = std::variant<sstring, rack_list>;
|
||||
using replication_strategy_config_options = std::map<sstring, replication_strategy_config_option>;
|
||||
|
||||
|
||||
@@ -381,10 +381,6 @@ public:
|
||||
return _nodes.at(node)._du.capacity;
|
||||
}
|
||||
|
||||
bool has_node(host_id node) const {
|
||||
return _nodes.contains(node);
|
||||
}
|
||||
|
||||
shard_id get_shard_count(host_id node) const {
|
||||
if (!_nodes.contains(node)) {
|
||||
return 0;
|
||||
|
||||
20
locator/replication_strategy_type.hh
Normal file
20
locator/replication_strategy_type.hh
Normal file
@@ -0,0 +1,20 @@
|
||||
/*
|
||||
* Copyright (C) 2015-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace locator {
|
||||
|
||||
// Identifies a kind of replication strategy.
// NOTE(review): each enumerator presumably corresponds to one concrete
// replication strategy implementation -- confirm against the strategy classes.
enum class replication_strategy_type {
|
||||
simple,
|
||||
local,
|
||||
network_topology,
|
||||
everywhere_topology,
|
||||
};
|
||||
|
||||
} // namespace locator
|
||||
@@ -12,7 +12,7 @@
|
||||
#include "locator/token_metadata_fwd.hh"
|
||||
#include "utils/small_vector.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "service/session.hh"
|
||||
#include "service/session_id.hh"
|
||||
#include "dht/i_partitioner_fwd.hh"
|
||||
#include "dht/token-sharding.hh"
|
||||
#include "dht/ring_position.hh"
|
||||
@@ -21,10 +21,9 @@
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "raft/raft.hh"
|
||||
#include "raft/raft_fwd.hh"
|
||||
|
||||
#include <ranges>
|
||||
#include <seastar/core/reactor.hh>
|
||||
#include <seastar/util/log.hh>
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include <seastar/util/noncopyable_function.hh>
|
||||
@@ -153,27 +152,19 @@ struct hash<locator::range_based_tablet_id> {
|
||||
|
||||
namespace locator {
|
||||
|
||||
/// Returns a copy of the replica set with the following modifications:
|
||||
/// - If both old_replica and new_replica are set, old_replica is substituted
|
||||
/// with new_replica. If old_replica is not found in rs, the set is returned as-is.
|
||||
/// - If only old_replica is set, it is removed from the result.
|
||||
/// - If only new_replica is set, it is appended to the result.
|
||||
/// Creates a new replica set with old_replica replaced by new_replica.
|
||||
/// If there is no old_replica, the set is returned unchanged.
|
||||
inline
|
||||
tablet_replica_set replace_replica(const tablet_replica_set& rs, std::optional<tablet_replica> old_replica, std::optional<tablet_replica> new_replica) {
|
||||
tablet_replica_set replace_replica(const tablet_replica_set& rs, tablet_replica old_replica, tablet_replica new_replica) {
|
||||
tablet_replica_set result;
|
||||
result.reserve(rs.size());
|
||||
for (auto&& r : rs) {
|
||||
if (old_replica.has_value() && r == old_replica.value()) {
|
||||
if (new_replica.has_value()) {
|
||||
result.push_back(new_replica.value());
|
||||
}
|
||||
if (r == old_replica) {
|
||||
result.push_back(new_replica);
|
||||
} else {
|
||||
result.push_back(r);
|
||||
}
|
||||
}
|
||||
if (!old_replica.has_value() && new_replica.has_value()) {
|
||||
result.push_back(new_replica.value());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -391,8 +382,8 @@ bool is_post_cleanup(tablet_replica replica, const tablet_info& tinfo, const tab
|
||||
struct tablet_migration_info {
|
||||
locator::tablet_transition_kind kind;
|
||||
locator::global_tablet_id tablet;
|
||||
std::optional<locator::tablet_replica> src;
|
||||
std::optional<locator::tablet_replica> dst;
|
||||
locator::tablet_replica src;
|
||||
locator::tablet_replica dst;
|
||||
};
|
||||
|
||||
class tablet_map;
|
||||
|
||||
2
main.cc
2
main.cc
@@ -942,7 +942,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
|
||||
auto background_reclaim_scheduling_group = create_scheduling_group("background_reclaim", "bgre", 50).get();
|
||||
|
||||
// Maintenance supergroup -- the collection of background low-prio activities
|
||||
// Maintenance supergroup -- the collection of background low-prio activities
|
||||
auto maintenance_supergroup = create_scheduling_supergroup(200).get();
|
||||
auto bandwidth_updater = io_throughput_updater("maintenance supergroup", maintenance_supergroup,
|
||||
cfg->maintenance_io_throughput_mb_per_sec.is_set() ? cfg->maintenance_io_throughput_mb_per_sec : cfg->stream_io_throughput_mb_per_sec);
|
||||
|
||||
@@ -11,9 +11,10 @@
|
||||
#include <seastar/core/condition-variable.hh>
|
||||
#include <seastar/rpc/rpc_types.hh>
|
||||
#include <utility>
|
||||
|
||||
#include "rpc_compression_types.hh"
|
||||
#include "utils/refcounted.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
#include "utils/enum_option.hh"
|
||||
#include "shared_dict.hh"
|
||||
|
||||
namespace netw {
|
||||
@@ -28,103 +29,6 @@ class dict_sampler;
|
||||
using dict_ptr = lw_shared_ptr<foreign_ptr<lw_shared_ptr<shared_dict>>>;
|
||||
class control_protocol_frame;
|
||||
|
||||
// An enum wrapper, describing supported RPC compression algorithms.
|
||||
// Always contains a valid value -- the constructors won't allow
|
||||
// an invalid/unknown enum variant to be constructed.
|
||||
struct compression_algorithm {
|
||||
using underlying = uint8_t;
|
||||
enum class type : underlying {
|
||||
RAW,
|
||||
LZ4,
|
||||
ZSTD,
|
||||
COUNT,
|
||||
} _value;
|
||||
// Construct from an integer.
|
||||
// Used to deserialize the algorithm from the first byte of the frame.
|
||||
constexpr compression_algorithm(underlying x) {
|
||||
if (x < std::to_underlying(type::RAW) || x >= std::to_underlying(type::COUNT)) {
|
||||
throw std::runtime_error(fmt::format("Invalid value {} for enum compression_algorithm", static_cast<int>(x)));
|
||||
}
|
||||
_value = static_cast<type>(x);
|
||||
}
|
||||
// Construct from `type`. Makes sure that `type` has a valid value.
|
||||
constexpr compression_algorithm(type x) : compression_algorithm(std::to_underlying(x)) {}
|
||||
|
||||
// These names are used in multiple places:
|
||||
// RPC negotiation, in metric labels, and config.
|
||||
static constexpr std::string_view names[] = {
|
||||
"raw",
|
||||
"lz4",
|
||||
"zstd",
|
||||
};
|
||||
static_assert(std::size(names) == static_cast<int>(compression_algorithm::type::COUNT));
|
||||
|
||||
// Implements enum_option.
|
||||
static auto map() {
|
||||
std::unordered_map<std::string, type> ret;
|
||||
for (size_t i = 0; i < std::size(names); ++i) {
|
||||
ret.insert(std::make_pair<std::string, type>(std::string(names[i]), compression_algorithm(i).get()));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
constexpr std::string_view name() const noexcept { return names[idx()]; }
|
||||
constexpr underlying idx() const noexcept { return std::to_underlying(_value); }
|
||||
constexpr type get() const noexcept { return _value; }
|
||||
constexpr static size_t count() { return static_cast<size_t>(type::COUNT); };
|
||||
bool operator<=>(const compression_algorithm &) const = default;
|
||||
};
|
||||
|
||||
|
||||
// Represents a set of compression algorithms.
|
||||
// Backed by a bitset.
|
||||
// Used for convenience during algorithm negotiations.
|
||||
class compression_algorithm_set {
|
||||
uint8_t _bitset;
|
||||
static_assert(std::numeric_limits<decltype(_bitset)>::digits > compression_algorithm::count());
|
||||
constexpr compression_algorithm_set(uint8_t v) noexcept : _bitset(v) {}
|
||||
public:
|
||||
// Returns a set containing the given algorithm and all algorithms weaker (smaller in the enum order)
|
||||
// than it.
|
||||
constexpr static compression_algorithm_set this_or_lighter(compression_algorithm algo) noexcept {
|
||||
auto x = 1 << (algo.idx());
|
||||
return {x + (x - 1)};
|
||||
}
|
||||
// Returns the strongest (greatest in the enum order) algorithm in the set.
|
||||
constexpr compression_algorithm heaviest() const {
|
||||
return {std::bit_width(_bitset) - 1};
|
||||
}
|
||||
// The usual set operations.
|
||||
constexpr static compression_algorithm_set singleton(compression_algorithm algo) noexcept {
|
||||
return {1 << algo.idx()};
|
||||
}
|
||||
constexpr compression_algorithm_set intersection(compression_algorithm_set o) const noexcept {
|
||||
return {_bitset & o._bitset};
|
||||
}
|
||||
constexpr compression_algorithm_set difference(compression_algorithm_set o) const noexcept {
|
||||
return {_bitset &~ o._bitset};
|
||||
}
|
||||
constexpr compression_algorithm_set sum(compression_algorithm_set o) const noexcept {
|
||||
return {_bitset | o._bitset};
|
||||
}
|
||||
constexpr bool contains(compression_algorithm algo) const noexcept {
|
||||
return _bitset & (1 << algo.idx());
|
||||
}
|
||||
constexpr bool operator==(const compression_algorithm_set&) const = default;
|
||||
// Returns the contained bitset. Used for serialization.
|
||||
constexpr uint8_t value() const noexcept {
|
||||
return _bitset;
|
||||
}
|
||||
// Reconstructs the set from the output of `value()`. Used for deserialization.
|
||||
constexpr static compression_algorithm_set from_value(uint8_t bitset) {
|
||||
compression_algorithm_set x = bitset;
|
||||
x.heaviest(); // This is a validation check. It will throw if the bitset contains some illegal/unknown bits.
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
using algo_config = std::vector<enum_option<compression_algorithm>>;
|
||||
|
||||
// See docs/dev/advanced_rpc_compression.md,
|
||||
// section `Negotiation` for more information about the protocol.
|
||||
struct control_protocol {
|
||||
@@ -248,7 +152,7 @@ struct per_algorithm_stats {
|
||||
// prevent a misuse of the API (dangling references).
|
||||
class advanced_rpc_compressor::tracker : public utils::refcounted {
|
||||
public:
|
||||
using algo_config = algo_config;
|
||||
using algo_config = netw::algo_config;
|
||||
struct config {
|
||||
utils::updateable_value<uint32_t> zstd_min_msg_size{0};
|
||||
utils::updateable_value<uint32_t> zstd_max_msg_size{std::numeric_limits<uint32_t>::max()};
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "shared_dict.hh"
|
||||
#include "advanced_rpc_compressor.hh"
|
||||
#include "rpc_compression_types.hh"
|
||||
|
||||
namespace netw {
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "rpc_compression_types.hh"
|
||||
#include "utils/reservoir_sampling.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
#include <seastar/core/future.hh>
|
||||
@@ -88,28 +89,7 @@ class dict_training_loop {
|
||||
seastar::semaphore _pause{0};
|
||||
seastar::abort_source _pause_as;
|
||||
public:
|
||||
struct when {
|
||||
enum class type {
|
||||
NEVER,
|
||||
WHEN_LEADER,
|
||||
ALWAYS,
|
||||
COUNT,
|
||||
};
|
||||
static constexpr std::string_view names[] = {
|
||||
"never",
|
||||
"when_leader",
|
||||
"always",
|
||||
};
|
||||
static_assert(std::size(names) == static_cast<size_t>(type::COUNT));
|
||||
// Implements enum_option.
|
||||
static std::unordered_map<std::string, type> map() {
|
||||
std::unordered_map<std::string, type> ret;
|
||||
for (size_t i = 0; i < std::size(names); ++i) {
|
||||
ret.insert({std::string(names[i]), type(i)});
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
using when = netw::dict_training_when;
|
||||
void pause();
|
||||
void unpause();
|
||||
void cancel() noexcept;
|
||||
|
||||
@@ -54,11 +54,11 @@ dictionary_service::dictionary_service(
|
||||
void dictionary_service::maybe_toggle_dict_training() {
|
||||
auto when = _rpc_dict_training_when();
|
||||
netw::dict_trainer_logger.debug("dictionary_service::maybe_toggle_dict_training(), called, _is_leader={}, when={}", _is_leader, when);
|
||||
if (when == netw::dict_training_loop::when::type::NEVER) {
|
||||
if (when == netw::dict_training_when::type::NEVER) {
|
||||
_training_fiber.pause();
|
||||
} else if (when == netw::dict_training_loop::when::type::ALWAYS) {
|
||||
} else if (when == netw::dict_training_when::type::ALWAYS) {
|
||||
_training_fiber.unpause();
|
||||
} else if (when == netw::dict_training_loop::when::type::WHEN_LEADER) {
|
||||
} else if (when == netw::dict_training_when::type::WHEN_LEADER) {
|
||||
_is_leader ? _training_fiber.unpause() : _training_fiber.pause();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -40,7 +40,7 @@ namespace gms {
|
||||
class dictionary_service {
|
||||
db::system_keyspace& _sys_ks;
|
||||
locator::host_id _our_host_id;
|
||||
utils::updateable_value<enum_option<netw::dict_training_loop::when>> _rpc_dict_training_when;
|
||||
utils::updateable_value<enum_option<netw::dict_training_when>> _rpc_dict_training_when;
|
||||
service::raft_group0_client& _raft_group0_client;
|
||||
abort_source& _as;
|
||||
netw::dict_training_loop _training_fiber;
|
||||
@@ -48,7 +48,7 @@ class dictionary_service {
|
||||
|
||||
bool _is_leader = false;
|
||||
utils::observer<bool> _leadership_observer;
|
||||
utils::observer<enum_option<netw::dict_training_loop::when>> _when_observer;
|
||||
utils::observer<enum_option<netw::dict_training_when>> _when_observer;
|
||||
std::optional<std::any> _feature_observer;
|
||||
|
||||
void maybe_toggle_dict_training();
|
||||
@@ -61,7 +61,7 @@ public:
|
||||
locator::host_id our_host_id = Uninitialized();
|
||||
utils::updateable_value<uint32_t> rpc_dict_training_min_time_seconds = Uninitialized();
|
||||
utils::updateable_value<uint64_t> rpc_dict_training_min_bytes = Uninitialized();
|
||||
utils::updateable_value<enum_option<netw::dict_training_loop::when>> rpc_dict_training_when = Uninitialized();
|
||||
utils::updateable_value<enum_option<netw::dict_training_when>> rpc_dict_training_when = Uninitialized();
|
||||
};
|
||||
// Note: the training fiber will start as soon as the relevant cluster feature is enabled.
|
||||
dictionary_service(
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include <seastar/coroutine/all.hh>
|
||||
|
||||
#include "message/messaging_service.hh"
|
||||
#include "message/advanced_rpc_compressor.hh"
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include "gms/gossiper.hh"
|
||||
#include "service/storage_service.hh"
|
||||
|
||||
@@ -19,11 +19,11 @@
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "streaming/stream_fwd.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "service/session.hh"
|
||||
#include "service/session_id.hh"
|
||||
#include "service/maintenance_mode.hh"
|
||||
#include "gms/gossip_address_map.hh"
|
||||
#include "gms/generation-number.hh"
|
||||
#include "tasks/types.hh"
|
||||
#include "message/advanced_rpc_compressor.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
|
||||
#include <list>
|
||||
@@ -120,6 +120,8 @@ namespace qos {
|
||||
|
||||
namespace netw {
|
||||
|
||||
class walltime_compressor_tracker;
|
||||
|
||||
/* All verb handler identifiers */
|
||||
enum class messaging_verb : int32_t {
|
||||
CLIENT_ID = 0,
|
||||
|
||||
155
message/rpc_compression_types.hh
Normal file
155
message/rpc_compression_types.hh
Normal file
@@ -0,0 +1,155 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <bit>
|
||||
#include <compare>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "utils/enum_option.hh"
|
||||
|
||||
namespace netw {
|
||||
|
||||
// An enum wrapper, describing supported RPC compression algorithms.
|
||||
// Always contains a valid value -- the constructors won't allow
|
||||
// an invalid/unknown enum variant to be constructed.
|
||||
struct compression_algorithm {
|
||||
// Integer type the enum is stored as (a single byte).
using underlying = uint8_t;
|
||||
// Supported algorithms. COUNT is not an algorithm: it is the number of real
// enumerators and the exclusive upper bound accepted by the constructors.
enum class type : underlying {
|
||||
RAW,
|
||||
LZ4,
|
||||
ZSTD,
|
||||
COUNT,
|
||||
} _value;
|
||||
|
||||
// Construct from an integer.
|
||||
// Used to deserialize the algorithm from the first byte of the frame.
|
||||
constexpr compression_algorithm(underlying x) {
|
||||
// `underlying` is unsigned and RAW is the first enumerator (0), so in
// practice only the upper-bound test can fail.
if (x < std::to_underlying(type::RAW) || x >= std::to_underlying(type::COUNT)) {
|
||||
// Reject anything that is not a real enumerator; the message names the value.
throw std::runtime_error(std::string("Invalid value ") + std::to_string(unsigned(x)) + " for enum compression_algorithm");
|
||||
}
|
||||
_value = static_cast<type>(x);
|
||||
}
|
||||
|
||||
// Construct from `type`. Makes sure that `type` has a valid value.
|
||||
constexpr compression_algorithm(type x) : compression_algorithm(std::to_underlying(x)) {}
|
||||
|
||||
// These names are used in multiple places:
|
||||
// RPC negotiation, in metric labels, and config.
|
||||
static constexpr std::string_view names[] = {
|
||||
"raw",
|
||||
"lz4",
|
||||
"zstd",
|
||||
};
|
||||
// Keeps `names` in sync with the enum: one name per real enumerator.
static_assert(std::size(names) == static_cast<int>(compression_algorithm::type::COUNT));
|
||||
|
||||
// Implements enum_option.
|
||||
// Builds the name -> enumerator table from `names`; every index goes through
// the validating integer constructor above.
static auto map() {
|
||||
std::unordered_map<std::string, type> ret;
|
||||
for (size_t i = 0; i < std::size(names); ++i) {
|
||||
ret.insert(std::make_pair(std::string(names[i]), compression_algorithm(i).get()));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Name of this algorithm, looked up in `names`.
constexpr std::string_view name() const noexcept { return names[idx()]; }
|
||||
// Numeric value of the algorithm; doubles as its index into `names`.
constexpr underlying idx() const noexcept { return std::to_underlying(_value); }
|
||||
// The wrapped enumerator.
constexpr type get() const noexcept { return _value; }
|
||||
// Number of supported algorithms (the value of COUNT).
constexpr static size_t count() { return static_cast<size_t>(type::COUNT); }
|
||||
// NOTE(review): defaulted three-way comparison declared with return type
// `bool` rather than `auto` or a comparison category -- confirm this is intended.
bool operator<=>(const compression_algorithm&) const = default;
|
||||
};
|
||||
|
||||
// Represents a set of compression algorithms.
|
||||
// Backed by a bitset.
|
||||
// Used for convenience during algorithm negotiations.
|
||||
class compression_algorithm_set {
|
||||
// Bit i is set iff the algorithm whose idx() is i belongs to the set.
uint8_t _bitset;
|
||||
// The bitset must have more bits than there are algorithms.
static_assert(std::numeric_limits<decltype(_bitset)>::digits > compression_algorithm::count());
|
||||
// Private, non-validating constructor from raw bits; external code goes
// through the static factories below.
constexpr compression_algorithm_set(uint8_t v) noexcept : _bitset(v) {}
|
||||
public:
|
||||
// Returns a set containing the given algorithm and all algorithms weaker (smaller in the enum order)
|
||||
// than it.
|
||||
constexpr static compression_algorithm_set this_or_lighter(compression_algorithm algo) noexcept {
|
||||
// x has only the bit of `algo` set; x - 1 has every lower bit set.
auto x = 1 << algo.idx();
|
||||
return {uint8_t(x + (x - 1))};
|
||||
}
|
||||
|
||||
// Returns the strongest (greatest in the enum order) algorithm in the set.
|
||||
// The index of the highest set bit is passed to the validating
// compression_algorithm constructor, so this throws if that bit does not
// name a known algorithm. For an empty set, bit_width(0) - 1 wraps to 255
// in `underlying`, which is rejected as well.
constexpr compression_algorithm heaviest() const {
|
||||
return {compression_algorithm::underlying(std::bit_width(_bitset) - 1)};
|
||||
}
|
||||
|
||||
// The usual set operations.
|
||||
// Set containing exactly `algo`.
constexpr static compression_algorithm_set singleton(compression_algorithm algo) noexcept {
|
||||
return {uint8_t(1 << algo.idx())};
|
||||
}
|
||||
// Algorithms present in both sets.
constexpr compression_algorithm_set intersection(compression_algorithm_set o) const noexcept {
|
||||
return {uint8_t(_bitset & o._bitset)};
|
||||
}
|
||||
// Algorithms present in this set but not in `o`.
constexpr compression_algorithm_set difference(compression_algorithm_set o) const noexcept {
|
||||
return {uint8_t(_bitset &~ o._bitset)};
|
||||
}
|
||||
// Union: algorithms present in either set.
constexpr compression_algorithm_set sum(compression_algorithm_set o) const noexcept {
|
||||
return {uint8_t(_bitset | o._bitset)};
|
||||
}
|
||||
// True iff `algo` is a member of the set.
constexpr bool contains(compression_algorithm algo) const noexcept {
|
||||
return _bitset & (1 << algo.idx());
|
||||
}
|
||||
constexpr bool operator==(const compression_algorithm_set&) const = default;
|
||||
|
||||
// Returns the contained bitset. Used for serialization.
|
||||
constexpr uint8_t value() const noexcept {
|
||||
return _bitset;
|
||||
}
|
||||
|
||||
// Reconstructs the set from the output of `value()`. Used for deserialization.
|
||||
constexpr static compression_algorithm_set from_value(uint8_t bitset) {
|
||||
compression_algorithm_set x = bitset;
|
||||
// The result of heaviest() is discarded; it is called only for its check.
x.heaviest(); // Validation: throws on illegal/unknown bits.
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
using algo_config = std::vector<enum_option<compression_algorithm>>;
|
||||
|
||||
// Option type selecting when dictionary training runs.
// dictionary_service::maybe_toggle_dict_training() pauses the training fiber
// for NEVER, unpauses it for ALWAYS, and for WHEN_LEADER unpauses it only
// while the node is the leader.
struct dict_training_when {
|
||||
// COUNT is not an option: it is the number of real enumerators.
enum class type {
|
||||
NEVER,
|
||||
WHEN_LEADER,
|
||||
ALWAYS,
|
||||
COUNT,
|
||||
};
|
||||
|
||||
// Textual names, indexed by the numeric value of the enumerator.
static constexpr std::string_view names[] = {
|
||||
"never",
|
||||
"when_leader",
|
||||
"always",
|
||||
};
|
||||
// Keeps `names` in sync with the enum: one name per real enumerator.
static_assert(std::size(names) == static_cast<size_t>(type::COUNT));
|
||||
|
||||
// Implements enum_option.
|
||||
// Builds the name -> enumerator table by casting each index of `names`
// to `type`.
static std::unordered_map<std::string, type> map() {
|
||||
std::unordered_map<std::string, type> ret;
|
||||
for (size_t i = 0; i < std::size(names); ++i) {
|
||||
ret.insert({std::string(names[i]), type(i)});
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace netw
|
||||
@@ -16,8 +16,6 @@ Usage:
|
||||
import argparse, os, sys
|
||||
from typing import Sequence
|
||||
|
||||
from test.pylib.driver_utils import safe_driver_shutdown
|
||||
|
||||
def read_statements(path: str) -> list[tuple[int, str]]:
|
||||
stms: list[tuple[int, str]] = []
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
@@ -58,7 +56,7 @@ def exec_statements(statements: list[tuple[int, str]], socket_path: str, timeout
|
||||
print(f"ERROR executing statement from file line {lineno}: {s}\n{e}", file=sys.stderr)
|
||||
return 1
|
||||
finally:
|
||||
safe_driver_shutdown(cluster)
|
||||
cluster.shutdown()
|
||||
return 0
|
||||
|
||||
def main(argv: Sequence[str]) -> int:
|
||||
|
||||
27
raft/raft_fwd.hh
Normal file
27
raft/raft_fwd.hh
Normal file
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
// Lightweight forward-declaration header for commonly used raft types.
|
||||
// Include this instead of raft/raft.hh when only the basic ID/index types
|
||||
// are needed (e.g. in other header files), to avoid pulling in the full
|
||||
// raft machinery (futures, abort_source, bytes_ostream, etc.).
|
||||
|
||||
#include "internal.hh"
|
||||
|
||||
namespace raft {
|
||||
|
||||
using server_id = internal::tagged_id<struct server_id_tag>;
|
||||
using group_id = internal::tagged_id<struct group_id_tag>;
|
||||
using term_t = internal::tagged_uint64<struct term_tag>;
|
||||
using index_t = internal::tagged_uint64<struct index_tag>;
|
||||
using read_id = internal::tagged_uint64<struct read_id_tag>;
|
||||
|
||||
class server;
|
||||
|
||||
} // namespace raft
|
||||
@@ -269,10 +269,6 @@ public:
|
||||
// Gets the view a sstable currently belongs to.
|
||||
compaction::compaction_group_view& view_for_sstable(const sstables::shared_sstable& sst) const;
|
||||
utils::small_vector<compaction::compaction_group_view*, 3> all_views() const;
|
||||
// Returns true iff v is the repaired view of this compaction group.
|
||||
bool is_repaired_view(const compaction::compaction_group_view* v) const noexcept;
|
||||
// Returns an sstable set containing only repaired sstables (those classified as repaired).
|
||||
lw_shared_ptr<sstables::sstable_set> make_repaired_sstable_set() const;
|
||||
|
||||
seastar::condition_variable& get_staging_done_condition() noexcept {
|
||||
return _staging_done_condition;
|
||||
@@ -408,8 +404,6 @@ public:
|
||||
|
||||
// Make an sstable set spanning all sstables in the storage_group
|
||||
lw_shared_ptr<const sstables::sstable_set> make_sstable_set() const;
|
||||
// Like make_sstable_set(), but restricted to repaired sstables only across all compaction groups.
|
||||
lw_shared_ptr<const sstables::sstable_set> make_repaired_sstable_set() const;
|
||||
|
||||
future<utils::chunked_vector<logstor::segment_snapshot>> take_logstor_snapshot() const;
|
||||
|
||||
|
||||
@@ -1006,7 +1006,7 @@ future<database::keyspace_change_per_shard> database::prepare_update_keyspace_on
|
||||
co_await modify_keyspace_on_all_shards(sharded_db, [&] (replica::database& db) -> future<> {
|
||||
auto& ks = db.find_keyspace(ksm.name());
|
||||
auto new_ksm = ::make_lw_shared<keyspace_metadata>(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(),
|
||||
ks.metadata()->cf_meta_data() | std::views::values | std::ranges::to<std::vector>(), ks.metadata()->user_types(), ksm.get_storage_options(), ksm.next_strategy_options_opt());
|
||||
ks.metadata()->cf_meta_data() | std::views::values | std::ranges::to<std::vector>(), ks.metadata()->user_types(), ksm.get_storage_options());
|
||||
|
||||
auto change = co_await db.prepare_update_keyspace(ks, new_ksm, pending_token_metadata.local());
|
||||
changes[this_shard_id()] = make_foreign(std::make_unique<keyspace_change>(std::move(change)));
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
@@ -113,6 +112,10 @@ namespace gms {
|
||||
class feature_service;
|
||||
}
|
||||
|
||||
namespace locator {
|
||||
class abstract_replication_strategy;
|
||||
}
|
||||
|
||||
namespace alternator {
|
||||
class table_stats;
|
||||
}
|
||||
@@ -757,10 +760,6 @@ private:
|
||||
// groups during tablet split with overlapping token range, and we need to include them all in a single
|
||||
// sstable set to allow safe tombstone gc.
|
||||
lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc(const compaction_group&) const;
|
||||
// Like sstable_set_for_tombstone_gc(), but restricted to repaired sstables only across all compaction
|
||||
// groups of the same tablet (storage group). Used by the tombstone_gc=repair optimization to avoid
|
||||
// scanning unrepaired sstables when looking for GC-blocking shadows.
|
||||
lw_shared_ptr<const sstables::sstable_set> make_repaired_sstable_set_for_tombstone_gc(const compaction_group&) const;
|
||||
|
||||
bool cache_enabled() const {
|
||||
return _config.enable_cache && _schema->caching_options().enabled();
|
||||
|
||||
@@ -69,6 +69,13 @@ struct segment_descriptor : public log_heap_hook<segment_descriptor_hist_options
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace replica::logstor
|
||||
|
||||
template<>
|
||||
size_t hist_key<replica::logstor::segment_descriptor>(const replica::logstor::segment_descriptor& desc);
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
using segment_descriptor_hist = log_heap<segment_descriptor, segment_descriptor_hist_options>;
|
||||
|
||||
struct segment_set {
|
||||
|
||||
@@ -1203,35 +1203,11 @@ future<utils::chunked_vector<logstor::segment_snapshot>> storage_group::take_log
|
||||
co_return std::move(snp);
|
||||
}
|
||||
|
||||
lw_shared_ptr<const sstables::sstable_set> storage_group::make_repaired_sstable_set() const {
|
||||
if (_split_ready_groups.empty() && _merging_groups.empty()) {
|
||||
return _main_cg->make_repaired_sstable_set();
|
||||
}
|
||||
const auto& schema = _main_cg->_t.schema();
|
||||
std::vector<lw_shared_ptr<sstables::sstable_set>> underlying;
|
||||
underlying.reserve(1 + _merging_groups.size() + _split_ready_groups.size());
|
||||
underlying.emplace_back(_main_cg->make_repaired_sstable_set());
|
||||
for (const auto& cg : _merging_groups) {
|
||||
if (!cg->empty()) {
|
||||
underlying.emplace_back(cg->make_repaired_sstable_set());
|
||||
}
|
||||
}
|
||||
for (const auto& cg : _split_ready_groups) {
|
||||
underlying.emplace_back(cg->make_repaired_sstable_set());
|
||||
}
|
||||
return make_lw_shared(sstables::make_compound_sstable_set(schema, std::move(underlying)));
|
||||
}
|
||||
|
||||
lw_shared_ptr<const sstables::sstable_set> table::sstable_set_for_tombstone_gc(const compaction_group& cg) const {
|
||||
auto& sg = storage_group_for_id(cg.group_id());
|
||||
return sg.make_sstable_set();
|
||||
}
|
||||
|
||||
lw_shared_ptr<const sstables::sstable_set> table::make_repaired_sstable_set_for_tombstone_gc(const compaction_group& cg) const {
|
||||
auto& sg = storage_group_for_id(cg.group_id());
|
||||
return sg.make_repaired_sstable_set();
|
||||
}
|
||||
|
||||
bool tablet_storage_group_manager::all_storage_groups_split() {
|
||||
auto& tmap = tablet_map();
|
||||
if (_split_ready_seq_number == tmap.resize_decision().sequence_number) {
|
||||
@@ -3024,47 +3000,9 @@ public:
|
||||
// Produces this view's maintenance sstable set by handing the compaction
// group's maintenance sstables to make_sstable_set_for_this_view(), together
// with a callback that yields a copy of the group's maintenance sstable set.
future<lw_shared_ptr<const sstables::sstable_set>> maintenance_sstable_set() const override {
    auto copy_group_maintenance_set = [this] {
        return *_cg.make_maintenance_sstable_set();
    };
    return make_sstable_set_for_this_view(_cg.maintenance_sstables(), std::move(copy_group_maintenance_set));
}
|
||||
private:
|
||||
// Returns true when tombstone GC is restricted to the repaired set:
|
||||
// tombstone_gc=repair mode and this view is the repaired view.
|
||||
//
|
||||
// The optimization is safe for materialized view tables as well as base tables.
|
||||
// The key invariant for MV: MV tablet repair calls flush_hints() before
|
||||
// take_storage_snapshot(). flush_hints() creates a sync point that covers BOTH
|
||||
// _hints_manager (base mutations) AND _hints_for_views_manager (view mutations).
|
||||
// It waits until all pending hints — including any D_view hint stored in
|
||||
// _hints_for_views_manager while the target node was down — have been replayed
|
||||
// to the target node. Only then is take_storage_snapshot() called, which flushes
|
||||
// the MV memtable and captures D_view in the repairing sstable. After repair
|
||||
// completes, D_view is in the repaired set.
|
||||
//
|
||||
// If a subsequent base repair later replays a D_base hint that causes another
|
||||
// D_view write (same key and timestamp), it is a no-op duplicate: the original
|
||||
// D_view already in the repaired set still prevents T_mv from being purged.
|
||||
//
|
||||
// USING TIMESTAMP with timestamps predating (gc_before + propagation_delay) is
|
||||
// explicitly UB and excluded from the safety argument.
|
||||
bool is_tombstone_gc_repaired_only() const noexcept {
|
||||
return _cg.is_repaired_view(this) &&
|
||||
_t.schema()->tombstone_gc_options().mode() == tombstone_gc_mode::repair;
|
||||
}
|
||||
public:
|
||||
// Selects the sstable set that tombstone GC must check for shadowed data.
//
// In tombstone_gc=repair mode, on the repaired view, only repaired sstables
// are checked. The repair ordering guarantee is what makes this sound: by the
// time a tombstone becomes GC-eligible (repair_time committed to Raft), any
// data it shadows has already moved from repairing to repaired, and
// unrepaired data always carries timestamps newer than any GC-eligible
// tombstone (legitimate writes; USING TIMESTAMP abuse is UB).
//
// No other tombstone_gc mode offers that invariant, so every other case uses
// the full storage-group set.
lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc() const override {
    return is_tombstone_gc_repaired_only()
            ? _t.make_repaired_sstable_set_for_tombstone_gc(_cg)
            : _t.sstable_set_for_tombstone_gc(_cg);
}
|
||||
bool skip_memtable_for_tombstone_gc() const noexcept override {
|
||||
return is_tombstone_gc_repaired_only();
|
||||
}
|
||||
// Forwards to compaction::get_fully_expired_sstables() with this view as the
// context, the candidate `sstables`, and `query_time`, and returns its result.
std::unordered_set<sstables::shared_sstable> fully_expired_sstables(
        const std::vector<sstables::shared_sstable>& sstables,
        gc_clock::time_point query_time) const override {
    return compaction::get_fully_expired_sstables(*this, sstables, query_time);
}
|
||||
@@ -5481,21 +5419,6 @@ compaction::compaction_group_view& compaction_group::view_for_unrepaired_data()
|
||||
return *_unrepaired_view;
|
||||
}
|
||||
|
||||
// True when `v` is this group's repaired view (compared by pointer identity).
bool compaction_group::is_repaired_view(const compaction::compaction_group_view* v) const noexcept {
    const auto* repaired = _repaired_view.get();
    return repaired == v;
}
|
||||
|
||||
// Builds a new sstable set holding only this group's repaired main sstables.
//
// The set is created via make_main_sstable_set(). Every sstable listed by
// _main_sstables->all() that repair::is_repaired() accepts for the current
// get_sstables_repaired_at() value is then inserted into it.
lw_shared_ptr<sstables::sstable_set> compaction_group::make_repaired_sstable_set() const {
    auto repaired = make_lw_shared<sstables::sstable_set>(make_main_sstable_set());
    auto repaired_at = get_sstables_repaired_at();

    // Keep the handle returned by all() bound for the whole loop.
    const auto& main_sstables = _main_sstables->all();
    for (auto& candidate : *main_sstables) {
        if (!repair::is_repaired(repaired_at, candidate)) {
            continue;
        }
        repaired->insert(candidate);
    }
    return repaired;
}
|
||||
|
||||
compaction::compaction_group_view& compaction_group::view_for_sstable(const sstables::shared_sstable& sst) const {
|
||||
switch (_repair_sstable_classifier(sst, get_sstables_repaired_at())) {
|
||||
case repair_sstable_classification::unrepaired: return *_unrepaired_view;
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
#include "mutation/mutation.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "service/session.hh"
|
||||
#include "service/session_id.hh"
|
||||
#include "locator/tablets.hh"
|
||||
|
||||
namespace replica {
|
||||
|
||||
@@ -493,7 +493,7 @@ std::unique_ptr<service::pager::query_pager> service::pager::query_pagers::pager
|
||||
// If partition row limit is applied to paging, we still need to fall back
|
||||
// to filtering the results to avoid extraneous rows on page breaks.
|
||||
if (!filtering_restrictions && cmd->slice.partition_row_limit() < query::max_rows_if_set) {
|
||||
filtering_restrictions = cql3::restrictions::make_trivial_statement_restrictions(s, true);
|
||||
filtering_restrictions = ::make_shared<cql3::restrictions::statement_restrictions>(s, true);
|
||||
}
|
||||
if (filtering_restrictions) {
|
||||
return std::make_unique<filtering_query_pager>(proxy, std::move(s), std::move(selection), state,
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "service/paxos/paxos_state.hh"
|
||||
#include "service/query_state.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "replica/database.hh"
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
#pragma once
|
||||
#include <unordered_set>
|
||||
#include "service/raft/group0_fwd.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
@@ -9,7 +9,11 @@
|
||||
#pragma once
|
||||
|
||||
#include <iosfwd>
|
||||
#include "raft/raft.hh"
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
#include <seastar/core/timer.hh>
|
||||
#include <seastar/core/lowres_clock.hh>
|
||||
#include "raft/raft_fwd.hh"
|
||||
#include "gms/inet_address.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "utils/UUID.hh"
|
||||
#include "service/session_id.hh"
|
||||
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/shared_future.hh>
|
||||
@@ -19,12 +19,6 @@
|
||||
|
||||
namespace service {
|
||||
|
||||
using session_id = utils::tagged_uuid<struct session_id_tag>;
|
||||
|
||||
// We want it to be different than a default-constructed session_id to catch mistakes.
|
||||
constexpr session_id default_session_id = session_id(
|
||||
utils::UUID(0x81e7fc5a8d4411ee, 0x8577325096b39f47)); // timeuuid 2023-11-27 16:46:27.182089.0 UTC
|
||||
|
||||
/// Session is used to track execution of work related to some greater task, identified by session_id.
|
||||
/// Work can enter the session using enter(), and is considered to be part of the session
|
||||
/// as long as the guard returned by enter() is alive.
|
||||
|
||||
21
service/session_id.hh
Normal file
21
service/session_id.hh
Normal file
@@ -0,0 +1,21 @@
|
||||
/*
|
||||
* Copyright (C) 2023-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "utils/UUID.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
using session_id = utils::tagged_uuid<struct session_id_tag>;
|
||||
|
||||
// We want it to be different than a default-constructed session_id to catch mistakes.
|
||||
constexpr session_id default_session_id = session_id(
|
||||
utils::UUID(0x81e7fc5a8d4411ee, 0x8577325096b39f47)); // timeuuid 2023-11-27 16:46:27.182089.0 UTC
|
||||
|
||||
} // namespace service
|
||||
@@ -38,7 +38,6 @@
|
||||
#include "replica/exceptions.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "dht/token_range_endpoints.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/cas_shard.hh"
|
||||
#include "service/storage_proxy_fwd.hh"
|
||||
|
||||
|
||||
@@ -1342,11 +1342,6 @@ future<bool> storage_service::ongoing_rf_change(const group0_guard& guard, sstri
|
||||
co_return true;
|
||||
}
|
||||
}
|
||||
for (auto request_id : _topology_state_machine._topology.ongoing_rf_changes) {
|
||||
if (co_await ongoing_ks_rf_change(request_id)) {
|
||||
co_return true;
|
||||
}
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
|
||||
@@ -2431,7 +2426,7 @@ storage_service::prepare_replacement_info(std::unordered_set<gms::inet_address>
|
||||
}
|
||||
|
||||
future<std::map<gms::inet_address, float>> storage_service::get_ownership() {
|
||||
return run_with_no_api_lock([] (storage_service& ss) {
|
||||
return run_with_no_api_lock([this] (storage_service& ss) {
|
||||
const auto& tm = ss.get_token_metadata();
|
||||
auto token_map = dht::token::describe_ownership(tm.sorted_tokens());
|
||||
// describeOwnership returns tokens in an unspecified order, let's re-order them
|
||||
@@ -2439,7 +2434,7 @@ future<std::map<gms::inet_address, float>> storage_service::get_ownership() {
|
||||
for (auto entry : token_map) {
|
||||
locator::host_id id = tm.get_endpoint(entry.first).value();
|
||||
auto token_ownership = entry.second;
|
||||
ownership[ss._address_map.get(id)] += token_ownership;
|
||||
ownership[_address_map.get(id)] += token_ownership;
|
||||
}
|
||||
return ownership;
|
||||
});
|
||||
@@ -2848,8 +2843,12 @@ future<> storage_service::raft_removenode(locator::host_id host_id, locator::hos
|
||||
}
|
||||
|
||||
future<> storage_service::mark_excluded(const std::vector<locator::host_id>& hosts) {
|
||||
// Callers forward to shard 0 via run_with_no_api_lock (group0 is only set on shard 0).
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
if (this_shard_id() != 0) {
|
||||
// group0 is only set on shard 0.
|
||||
co_return co_await container().invoke_on(0, [&] (auto& ss) {
|
||||
return ss.mark_excluded(hosts);
|
||||
});
|
||||
}
|
||||
|
||||
while (true) {
|
||||
auto guard = co_await _group0->client().start_operation(_group0_as, raft_timeout{});
|
||||
@@ -3094,8 +3093,8 @@ future<sstring> storage_service::wait_for_topology_request_completion(utils::UUI
|
||||
}
|
||||
|
||||
future<> storage_service::abort_topology_request(utils::UUID request_id) {
|
||||
co_await container().invoke_on(0, [request_id] (storage_service& ss) {
|
||||
return ss._topology_state_machine.abort_request(*ss._group0, ss._group0_as, ss._feature_service, request_id);
|
||||
co_await container().invoke_on(0, [request_id, this] (storage_service& ss) {
|
||||
return _topology_state_machine.abort_request(*ss._group0, ss._group0_as, ss._feature_service, request_id);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -3108,13 +3107,13 @@ future<> storage_service::wait_for_topology_not_busy() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::abort_rf_change(utils::UUID request_id) {
|
||||
future<> storage_service::abort_paused_rf_change(utils::UUID request_id) {
|
||||
auto holder = _async_gate.hold();
|
||||
|
||||
if (this_shard_id() != 0) {
|
||||
// group0 is only set on shard 0.
|
||||
co_return co_await container().invoke_on(0, [&] (auto& ss) {
|
||||
return ss.abort_rf_change(request_id);
|
||||
return ss.abort_paused_rf_change(request_id);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -3125,81 +3124,20 @@ future<> storage_service::abort_rf_change(utils::UUID request_id) {
|
||||
while (true) {
|
||||
auto guard = co_await _group0->client().start_operation(_group0_as, raft_timeout{});
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
if (std::ranges::contains(_topology_state_machine._topology.paused_rf_change_requests, request_id)) { // keyspace_rf_change_kind::conversion_to_rack_list
|
||||
updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
|
||||
.resume_rf_change_request(_topology_state_machine._topology.paused_rf_change_requests, request_id).build()));
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(request_id)
|
||||
.done("Aborted by user request")
|
||||
.build()));
|
||||
} else if (std::ranges::contains(_topology_state_machine._topology.ongoing_rf_changes, request_id)) { // keyspace_rf_change_kind::multi_rf_change
|
||||
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id);
|
||||
if (!req_entry.error.empty()) {
|
||||
slogger.warn("RF change request with id '{}' was already aborted", request_id);
|
||||
co_return;
|
||||
}
|
||||
sstring ks_name = *req_entry.new_keyspace_rf_change_ks_name;
|
||||
if (!_db.local().has_keyspace(ks_name)) {
|
||||
co_return;
|
||||
}
|
||||
auto& ks = _db.local().find_keyspace(ks_name);
|
||||
// Check the tablet maps: if any tablet still has a missing replica
|
||||
// (i.e., needs extending), we can abort. Otherwise, we're in the
|
||||
// replica removal phase and aborting would require a rollback.
|
||||
auto next_replication = ks.metadata()->next_strategy_options_opt().value()
|
||||
| std::views::transform([] (const auto& pair) {
|
||||
return std::make_pair(pair.first, std::get<locator::rack_list>(pair.second));
|
||||
}) | std::ranges::to<std::unordered_map<sstring, std::vector<sstring>>>();
|
||||
|
||||
const auto& tm = *get_token_metadata_ptr();
|
||||
bool has_missing_replica = false;
|
||||
auto all_tables = ks.metadata()->tables();
|
||||
auto all_views = ks.metadata()->views()
|
||||
| std::views::transform([] (const auto& view) { return schema_ptr(view); })
|
||||
| std::ranges::to<std::vector<schema_ptr>>();
|
||||
all_tables.insert(all_tables.end(), all_views.begin(), all_views.end());
|
||||
for (const auto& table : all_tables) {
|
||||
if (!tm.tablets().has_tablet_map(table->id()) || !tm.tablets().is_base_table(table->id())) {
|
||||
continue;
|
||||
}
|
||||
const auto& tmap = tm.tablets().get_tablet_map(table->id());
|
||||
for (const auto& ti : tmap.tablets()) {
|
||||
std::unordered_map<sstring, std::vector<sstring>> dc_to_racks;
|
||||
for (const auto& r : ti.replicas) {
|
||||
const auto& node_dc_rack = tm.get_topology().get_node(r.host).dc_rack();
|
||||
dc_to_racks[node_dc_rack.dc].push_back(node_dc_rack.rack);
|
||||
}
|
||||
auto diff = subtract_replication(next_replication, dc_to_racks);
|
||||
if (!diff.empty()) {
|
||||
has_missing_replica = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (has_missing_replica) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (has_missing_replica) {
|
||||
auto ks_md = make_lw_shared<data_dictionary::keyspace_metadata>(*ks.metadata());
|
||||
ks_md->set_next_strategy_options(ks_md->strategy_options());
|
||||
auto schema_muts = prepare_keyspace_update_announcement(_db.local(), ks_md, guard.write_timestamp());
|
||||
for (auto& m : schema_muts) {
|
||||
updates.push_back(canonical_mutation(m));
|
||||
}
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(request_id)
|
||||
.abort("Aborted by user request")
|
||||
.build()));
|
||||
} else {
|
||||
slogger.warn("RF change request with id '{}' is ongoing, but it started removing replicas, so it can't be aborted", request_id);
|
||||
co_return;
|
||||
}
|
||||
} else {
|
||||
slogger.warn("RF change request with id '{}' can't be aborted", request_id);
|
||||
bool found = std::ranges::contains(_topology_state_machine._topology.paused_rf_change_requests, request_id);
|
||||
if (!found) {
|
||||
slogger.warn("RF change request with id '{}' is not paused, so it can't be aborted", request_id);
|
||||
co_return;
|
||||
}
|
||||
|
||||
mixed_change change{std::move(updates)};
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
|
||||
.resume_rf_change_request(_topology_state_machine._topology.paused_rf_change_requests, request_id).build()));
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(request_id)
|
||||
.done("Aborted by user request")
|
||||
.build()));
|
||||
|
||||
topology_change change{std::move(updates)};
|
||||
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard,
|
||||
format("aborting rf change request {}", request_id));
|
||||
|
||||
@@ -3957,8 +3895,11 @@ future<> storage_service::update_tablet_metadata(const locator::tablet_metadata_
|
||||
}
|
||||
|
||||
future<> storage_service::prepare_for_tablets_migration(const sstring& ks_name) {
|
||||
// Called via run_with_no_api_lock (forwards to shard 0).
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
if (this_shard_id() != 0) {
|
||||
co_return co_await container().invoke_on(0, [&] (auto& ss) {
|
||||
return ss.prepare_for_tablets_migration(ks_name);
|
||||
});
|
||||
}
|
||||
|
||||
while (true) {
|
||||
auto guard = co_await _group0->client().start_operation(_group0_as);
|
||||
@@ -4098,8 +4039,11 @@ future<> storage_service::prepare_for_tablets_migration(const sstring& ks_name)
|
||||
}
|
||||
|
||||
future<> storage_service::set_node_intended_storage_mode(intended_storage_mode mode) {
|
||||
// Called via run_with_no_api_lock (forwards to shard 0).
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
if (this_shard_id() != 0) {
|
||||
co_return co_await container().invoke_on(0, [mode] (auto& ss) {
|
||||
return ss.set_node_intended_storage_mode(mode);
|
||||
});
|
||||
}
|
||||
|
||||
auto& raft_server = _group0->group0_server();
|
||||
auto holder = _group0->hold_group0_gate();
|
||||
@@ -4195,8 +4139,11 @@ storage_service::migration_status storage_service::get_tablets_migration_status(
|
||||
}
|
||||
|
||||
future<storage_service::keyspace_migration_status> storage_service::get_tablets_migration_status_with_node_details(const sstring& ks_name) {
|
||||
// Called via run_with_no_api_lock (forwards to shard 0).
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
if (this_shard_id() != 0) {
|
||||
co_return co_await container().invoke_on(0, [&ks_name] (auto& ss) {
|
||||
return ss.get_tablets_migration_status_with_node_details(ks_name);
|
||||
});
|
||||
}
|
||||
|
||||
keyspace_migration_status result;
|
||||
result.keyspace = ks_name;
|
||||
@@ -4257,8 +4204,11 @@ future<storage_service::keyspace_migration_status> storage_service::get_tablets_
|
||||
}
|
||||
|
||||
future<> storage_service::finalize_tablets_migration(const sstring& ks_name) {
|
||||
// Called via run_with_no_api_lock (forwards to shard 0).
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
if (this_shard_id() != 0) {
|
||||
co_return co_await container().invoke_on(0, [&ks_name] (auto& ss) {
|
||||
return ss.finalize_tablets_migration(ks_name);
|
||||
});
|
||||
}
|
||||
|
||||
slogger.info("Finalizing vnodes-to-tablets migration for keyspace '{}'", ks_name);
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include "absl-flat_hash_map.hh"
|
||||
#include "gms/endpoint_state.hh"
|
||||
#include "gms/gossip_address_map.hh"
|
||||
#include "gms/i_endpoint_state_change_subscriber.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "service/client_routes.hh"
|
||||
@@ -40,11 +41,9 @@
|
||||
#include <seastar/core/shared_ptr.hh>
|
||||
#include "cdc/generation_id.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "raft/raft.hh"
|
||||
#include "raft/raft_fwd.hh"
|
||||
#include "node_ops/id.hh"
|
||||
#include "raft/server.hh"
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "service/tablet_allocator.hh"
|
||||
#include "service/tablet_operation.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
#include "utils/UUID.hh"
|
||||
@@ -115,6 +114,10 @@ class tablet_mutation_builder;
|
||||
|
||||
namespace auth { class cache; }
|
||||
|
||||
namespace service {
|
||||
class tablet_allocator;
|
||||
}
|
||||
|
||||
namespace utils {
|
||||
class disk_space_monitor;
|
||||
}
|
||||
@@ -780,19 +783,13 @@ private:
|
||||
*/
|
||||
future<> stream_ranges(std::unordered_map<sstring, std::unordered_multimap<dht::token_range, locator::host_id>> ranges_to_stream_by_keyspace);
|
||||
|
||||
// REST handlers are gated at the registration site (see gated() in
|
||||
// api/storage_service.cc) so stop() drains in-flight requests before
|
||||
// teardown. run_with_api_lock_internal and run_with_no_api_lock hold
|
||||
// _async_gate on shard 0 as well, because REST requests arriving on
|
||||
// any shard are forwarded there for execution.
|
||||
template <typename Func>
|
||||
auto run_with_api_lock_internal(storage_service& ss, Func&& func, sstring& operation) {
|
||||
auto holder = ss._async_gate.hold();
|
||||
if (!ss._operation_in_progress.empty()) {
|
||||
throw std::runtime_error(format("Operation {} is in progress, try again", ss._operation_in_progress));
|
||||
}
|
||||
ss._operation_in_progress = std::move(operation);
|
||||
return func(ss).finally([&ss, holder = std::move(holder)] {
|
||||
return func(ss).finally([&ss] {
|
||||
ss._operation_in_progress = sstring();
|
||||
});
|
||||
}
|
||||
@@ -800,10 +797,6 @@ private:
|
||||
public:
|
||||
int32_t get_exception_count();
|
||||
|
||||
auto hold_async_gate() {
|
||||
return _async_gate.hold();
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
auto run_with_api_lock(sstring operation, Func&& func) {
|
||||
return container().invoke_on(0, [operation = std::move(operation),
|
||||
@@ -814,10 +807,8 @@ public:
|
||||
|
||||
template <typename Func>
|
||||
auto run_with_no_api_lock(Func&& func) {
|
||||
return container().invoke_on(0, [func = std::forward<Func>(func)] (storage_service& ss) mutable
|
||||
-> futurize_t<std::invoke_result_t<Func, storage_service&>> {
|
||||
auto holder = ss._async_gate.hold();
|
||||
co_return co_await futurize_invoke(func, ss);
|
||||
return container().invoke_on(0, [func = std::forward<Func>(func)] (storage_service& ss) mutable {
|
||||
return func(ss);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -987,7 +978,7 @@ public:
|
||||
|
||||
future<> wait_for_topology_not_busy();
|
||||
|
||||
future<> abort_rf_change(utils::UUID request_id);
|
||||
future<> abort_paused_rf_change(utils::UUID request_id);
|
||||
|
||||
private:
|
||||
semaphore _do_sample_sstables_concurrency_limiter{1};
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user