mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-24 02:20:37 +00:00
Compare commits
15 Commits
master
...
ykaul/comp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
339f1ae1a0 | ||
|
|
07d69aa8fa | ||
|
|
c50bfb995b | ||
|
|
e7dbccbdcd | ||
|
|
faa2f8ba76 | ||
|
|
7aca42aa31 | ||
|
|
92e0597807 | ||
|
|
0798c112d0 | ||
|
|
9650390482 | ||
|
|
a1e8ef8d6e | ||
|
|
ea00cfad3d | ||
|
|
0fd89d77b3 | ||
|
|
361a717d89 | ||
|
|
9df4fc3e2f | ||
|
|
d1b4fd5683 |
4
.github/CODEOWNERS
vendored
4
.github/CODEOWNERS
vendored
@@ -32,8 +32,8 @@ counters* @nuivall
|
||||
tests/counter_test* @nuivall
|
||||
|
||||
# DOCS
|
||||
/docs/ @annastuchlik @tzach
|
||||
/docs/alternator/ @annastuchlik @tzach @nyh
|
||||
docs/* @annastuchlik @tzach
|
||||
docs/alternator @annastuchlik @tzach @nyh
|
||||
|
||||
# GOSSIP
|
||||
gms/* @tgrabiec @asias @kbr-scylla
|
||||
|
||||
@@ -234,11 +234,15 @@ generate_scylla_version()
|
||||
|
||||
option(Scylla_USE_PRECOMPILED_HEADER "Use precompiled header for Scylla" ON)
|
||||
add_library(scylla-precompiled-header STATIC exported_templates.cc)
|
||||
target_include_directories(scylla-precompiled-header PRIVATE
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}"
|
||||
"${scylla_gen_build_dir}")
|
||||
target_link_libraries(scylla-precompiled-header PRIVATE
|
||||
absl::headers
|
||||
absl::btree
|
||||
absl::hash
|
||||
absl::raw_hash_set
|
||||
idl
|
||||
Seastar::seastar
|
||||
Snappy::snappy
|
||||
systemd
|
||||
|
||||
@@ -1892,7 +1892,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
}
|
||||
if (vector_index_updates->Size() > 1) {
|
||||
// VectorIndexUpdates mirrors GlobalSecondaryIndexUpdates.
|
||||
// Since DynamoDB artificially limits the latter to just a
|
||||
// Since DynamoDB artifically limits the latter to just a
|
||||
// single operation (one Create or one Delete), we also
|
||||
// place the same artificial limit on VectorIndexUpdates,
|
||||
// and throw the same LimitExceeded error if the client
|
||||
|
||||
@@ -1354,7 +1354,7 @@ static future<executor::request_return_type> query_vector(
|
||||
std::unordered_set<std::string> used_attribute_values;
|
||||
// Parse the Select parameter and determine which attributes to return.
|
||||
// For a vector index, the default Select is ALL_ATTRIBUTES (full items).
|
||||
// ALL_PROJECTED_ATTRIBUTES is significantly more efficient because it
|
||||
// ALL_PROJECTED_ATTRIBUTES is significantly more efficent because it
|
||||
// returns what the vector store returned without looking up additional
|
||||
// base-table data. Currently only the primary key attributes are projected
|
||||
// but in the future we'll implement projecting additional attributes into
|
||||
|
||||
@@ -167,8 +167,46 @@ static schema_ptr get_schema_from_arn(service::storage_proxy& proxy, const strea
|
||||
}
|
||||
}
|
||||
|
||||
// ShardId. Must be between 28 and 65 characters inclusive.
|
||||
// UUID is 36 bytes as string (including dashes).
|
||||
// Prepend a version/type marker (`S`) -> 37
|
||||
class stream_shard_id : public utils::UUID {
|
||||
public:
|
||||
using UUID = utils::UUID;
|
||||
static constexpr char marker = 'S';
|
||||
|
||||
stream_shard_id() = default;
|
||||
stream_shard_id(const UUID& uuid)
|
||||
: UUID(uuid)
|
||||
{}
|
||||
stream_shard_id(const table_id& tid)
|
||||
: UUID(tid.uuid())
|
||||
{}
|
||||
stream_shard_id(std::string_view v)
|
||||
: UUID(v.substr(1))
|
||||
{
|
||||
if (v[0] != marker) {
|
||||
throw std::invalid_argument(std::string(v));
|
||||
}
|
||||
}
|
||||
friend std::ostream& operator<<(std::ostream& os, const stream_shard_id& arn) {
|
||||
const UUID& uuid = arn;
|
||||
return os << marker << uuid;
|
||||
}
|
||||
friend std::istream& operator>>(std::istream& is, stream_shard_id& arn) {
|
||||
std::string s;
|
||||
is >> s;
|
||||
arn = stream_shard_id(s);
|
||||
return is;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace alternator
|
||||
|
||||
// Specialization of rapidjson's TypeHelper for alternator::stream_shard_id.
// It adds nothing of its own: all behavior is inherited from
// from_string_helper instantiated for the same value/target types.
// NOTE(review): from_string_helper is defined outside this view; presumably
// it converts between JSON string values and the type via its stream
// operators - confirm against its definition.
template<typename ValueType>
|
||||
struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_shard_id>
|
||||
: public from_string_helper<ValueType, alternator::stream_shard_id>
|
||||
{};
|
||||
template<typename ValueType>
|
||||
struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_arn>
|
||||
: public from_string_helper<ValueType, alternator::stream_arn>
|
||||
@@ -180,8 +218,7 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
|
||||
_stats.api_operations.list_streams++;
|
||||
|
||||
auto limit = rjson::get_opt<int>(request, "Limit").value_or(100);
|
||||
auto streams_start = rjson::get_opt<stream_arn>(request, "ExclusiveStartStreamArn");
|
||||
|
||||
auto streams_start = rjson::get_opt<stream_shard_id>(request, "ExclusiveStartStreamArn");
|
||||
auto table = find_table(_proxy, request);
|
||||
auto db = _proxy.data_dictionary();
|
||||
|
||||
@@ -207,34 +244,34 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
|
||||
cfs = db.get_tables();
|
||||
}
|
||||
|
||||
// We need to sort the tables to ensure a stable order for paging.
|
||||
// We sort by keyspace and table name, which will also allow us to skip to
|
||||
// the right position by ExclusiveStartStreamArn.
|
||||
auto cmp = [](std::string_view ks1, std::string_view cf1, std::string_view ks2, std::string_view cf2) {
|
||||
return ks1 == ks2 ? cf1 < cf2 : ks1 < ks2;
|
||||
};
|
||||
// # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
|
||||
// generate duplicates in a paged listing here. Can obviously miss things if they
|
||||
// are added between paged calls and end up with a "smaller" UUID/ARN, but that
|
||||
// is to be expected.
|
||||
if (std::cmp_less(limit, cfs.size()) || streams_start) {
|
||||
std::sort(cfs.begin(), cfs.end(),
|
||||
[&cmp](const data_dictionary::table& t1, const data_dictionary::table& t2) {
|
||||
return cmp(t1.schema()->ks_name(), t1.schema()->cf_name(),
|
||||
t2.schema()->ks_name(), t2.schema()->cf_name());
|
||||
});
|
||||
std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
|
||||
return t1.schema()->id().uuid() < t2.schema()->id().uuid();
|
||||
});
|
||||
}
|
||||
|
||||
auto i = cfs.begin();
|
||||
auto e = cfs.end();
|
||||
|
||||
if (streams_start) {
|
||||
i = std::upper_bound(i, e, *streams_start,
|
||||
[&cmp](const stream_arn& arn, const data_dictionary::table& t) {
|
||||
return cmp(arn.keyspace_name(), arn.table_name(),
|
||||
t.schema()->ks_name(), t.schema()->cf_name());
|
||||
});
|
||||
i = std::find_if(i, e, [&](const data_dictionary::table& t) {
|
||||
return t.schema()->id().uuid() == streams_start
|
||||
&& cdc::get_base_table(db.real_database(), *t.schema())
|
||||
&& is_alternator_keyspace(t.schema()->ks_name())
|
||||
;
|
||||
});
|
||||
if (i != e) {
|
||||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
auto ret = rjson::empty_object();
|
||||
auto streams = rjson::empty_array();
|
||||
std::optional<stream_arn> last;
|
||||
std::optional<stream_shard_id> last;
|
||||
|
||||
for (;limit > 0 && i != e; ++i) {
|
||||
auto s = i->schema();
|
||||
@@ -245,24 +282,19 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
|
||||
}
|
||||
if (cdc::is_log_for_some_table(db.real_database(), ks_name, cf_name)) {
|
||||
rjson::value new_entry = rjson::empty_object();
|
||||
|
||||
last = i->schema()->id();
|
||||
auto arn = stream_arn{ i->schema(), cdc::get_base_table(db.real_database(), *i->schema()) };
|
||||
rjson::add(new_entry, "StreamArn", arn);
|
||||
rjson::add(new_entry, "StreamLabel", rjson::from_string(stream_label(*s)));
|
||||
rjson::add(new_entry, "TableName", rjson::from_string(cdc::base_name(s->cf_name())));
|
||||
rjson::push_back(streams, std::move(new_entry));
|
||||
last = std::move(arn);
|
||||
--limit;
|
||||
}
|
||||
}
|
||||
|
||||
rjson::add(ret, "Streams", std::move(streams));
|
||||
|
||||
// Only emit LastEvaluatedStreamArn when we stopped because we hit the
|
||||
// limit (limit == 0), meaning there may be more streams to list.
|
||||
// If we exhausted all tables naturally (limit > 0), there are no more
|
||||
// streams, so we must not emit a cookie.
|
||||
if (last && limit == 0) {
|
||||
if (last) {
|
||||
rjson::add(ret, "LastEvaluatedStreamArn", *last);
|
||||
}
|
||||
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
|
||||
@@ -582,7 +614,7 @@ void stream_id_range::prepare_for_iterating()
|
||||
// the function returns `stream_id_range` that will allow iteration over children Streams shards for the Streams shard `parent`
|
||||
// a child Streams shard is defined as a Streams shard that touches token range that was previously covered by `parent` Streams shard
|
||||
// Streams shard contains a token, that represents end of the token range for that Streams shard (inclusive)
|
||||
// beginning of the token range is defined by previous Streams shard's token + 1
|
||||
// begginning of the token range is defined by previous Streams shard's token + 1
|
||||
// NOTE: With vnodes, ranges of Streams' shards wrap, while with tablets the biggest allowed token number is always a range end.
|
||||
// NOTE: both streams generation are guaranteed to cover whole range and be non-empty
|
||||
// NOTE: it's possible to get more than one stream shard with the same token value (thus some of those stream shards will be empty) -
|
||||
|
||||
@@ -856,9 +856,7 @@ rest_exclude_node(sharded<service::storage_service>& ss, std::unique_ptr<http::r
|
||||
}
|
||||
|
||||
apilog.info("exclude_node: hosts={}", hosts);
|
||||
co_await ss.local().run_with_no_api_lock([hosts = std::move(hosts)] (service::storage_service& ss) {
|
||||
return ss.mark_excluded(hosts);
|
||||
});
|
||||
co_await ss.local().mark_excluded(hosts);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1733,9 +1731,7 @@ rest_create_vnode_tablet_migration(http_context& ctx, sharded<service::storage_s
|
||||
throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
|
||||
}
|
||||
auto keyspace = validate_keyspace(ctx, req);
|
||||
co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
|
||||
return ss.prepare_for_tablets_migration(keyspace);
|
||||
});
|
||||
co_await ss.local().prepare_for_tablets_migration(keyspace);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1747,9 +1743,7 @@ rest_get_vnode_tablet_migration(http_context& ctx, sharded<service::storage_serv
|
||||
throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
|
||||
}
|
||||
auto keyspace = validate_keyspace(ctx, req);
|
||||
auto status = co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
|
||||
return ss.get_tablets_migration_status_with_node_details(keyspace);
|
||||
});
|
||||
auto status = co_await ss.local().get_tablets_migration_status_with_node_details(keyspace);
|
||||
|
||||
ss::vnode_tablet_migration_status result;
|
||||
result.keyspace = status.keyspace;
|
||||
@@ -1774,9 +1768,7 @@ rest_set_vnode_tablet_migration_node_storage_mode(http_context& ctx, sharded<ser
|
||||
}
|
||||
auto mode_str = req->get_query_param("intended_mode");
|
||||
auto mode = service::intended_storage_mode_from_string(mode_str);
|
||||
co_await ss.local().run_with_no_api_lock([mode] (service::storage_service& ss) {
|
||||
return ss.set_node_intended_storage_mode(mode);
|
||||
});
|
||||
co_await ss.local().set_node_intended_storage_mode(mode);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1790,9 +1782,7 @@ rest_finalize_vnode_tablet_migration(http_context& ctx, sharded<service::storage
|
||||
auto keyspace = validate_keyspace(ctx, req);
|
||||
validate_keyspace(ctx, keyspace);
|
||||
|
||||
co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
|
||||
return ss.finalize_tablets_migration(keyspace);
|
||||
});
|
||||
co_await ss.local().finalize_tablets_migration(keyspace);
|
||||
co_return json_void();
|
||||
}
|
||||
|
||||
@@ -1869,106 +1859,90 @@ rest_bind(FuncType func, BindArgs&... args) {
|
||||
return std::bind_front(func, std::ref(args)...);
|
||||
}
|
||||
|
||||
// Hold the storage_service async gate for the duration of async REST
|
||||
// handlers so stop() drains in-flight requests before teardown.
|
||||
// Synchronous handlers don't yield and need no gate.
|
||||
// Asynchronous handler overload: returns a handler that takes a hold on the
// local storage_service async gate and keeps it for as long as the wrapped
// handler's future is pending.
static seastar::httpd::future_json_function
gated(sharded<service::storage_service>& ss, seastar::httpd::future_json_function fn) {
    return [&ss, handler = std::move(fn)](std::unique_ptr<http::request> request) -> future<json::json_return_type> {
        // The guard lives in the coroutine frame, so it is dropped only after
        // the wrapped handler has produced its result (or failed).
        auto gate_guard = ss.local().hold_async_gate();
        co_return co_await handler(std::move(request));
    };
}

// Synchronous handler overload: the handler is handed back untouched and the
// storage_service argument is deliberately unused.
static seastar::httpd::json_request_function
gated(sharded<service::storage_service>&, seastar::httpd::json_request_function fn) {
    return fn;
}
|
||||
|
||||
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
|
||||
ss::get_token_endpoint.set(r, gated(ss, rest_bind(rest_get_token_endpoint, ctx, ss)));
|
||||
ss::get_release_version.set(r, gated(ss, rest_bind(rest_get_release_version, ss)));
|
||||
ss::get_scylla_release_version.set(r, gated(ss, rest_bind(rest_get_scylla_release_version, ss)));
|
||||
ss::get_schema_version.set(r, gated(ss, rest_bind(rest_get_schema_version, ss)));
|
||||
ss::get_range_to_endpoint_map.set(r, gated(ss, rest_bind(rest_get_range_to_endpoint_map, ctx, ss)));
|
||||
ss::get_pending_range_to_endpoint_map.set(r, gated(ss, rest_bind(rest_get_pending_range_to_endpoint_map, ctx)));
|
||||
ss::describe_ring.set(r, gated(ss, rest_bind(rest_describe_ring, ctx, ss)));
|
||||
ss::get_current_generation_number.set(r, gated(ss, rest_bind(rest_get_current_generation_number, ss)));
|
||||
ss::get_natural_endpoints.set(r, gated(ss, rest_bind(rest_get_natural_endpoints, ctx, ss)));
|
||||
ss::get_natural_endpoints_v2.set(r, gated(ss, rest_bind(rest_get_natural_endpoints_v2, ctx, ss)));
|
||||
ss::cdc_streams_check_and_repair.set(r, gated(ss, rest_bind(rest_cdc_streams_check_and_repair, ss)));
|
||||
ss::cleanup_all.set(r, gated(ss, rest_bind(rest_cleanup_all, ctx, ss)));
|
||||
ss::reset_cleanup_needed.set(r, gated(ss, rest_bind(rest_reset_cleanup_needed, ctx, ss)));
|
||||
ss::force_flush.set(r, gated(ss, rest_bind(rest_force_flush, ctx)));
|
||||
ss::force_keyspace_flush.set(r, gated(ss, rest_bind(rest_force_keyspace_flush, ctx)));
|
||||
ss::decommission.set(r, gated(ss, rest_bind(rest_decommission, ss, ssc)));
|
||||
ss::logstor_compaction.set(r, gated(ss, rest_bind(rest_logstor_compaction, ctx)));
|
||||
ss::logstor_flush.set(r, gated(ss, rest_bind(rest_logstor_flush, ctx)));
|
||||
ss::move.set(r, gated(ss, rest_bind(rest_move, ss)));
|
||||
ss::remove_node.set(r, gated(ss, rest_bind(rest_remove_node, ss)));
|
||||
ss::exclude_node.set(r, gated(ss, rest_bind(rest_exclude_node, ss)));
|
||||
ss::get_removal_status.set(r, gated(ss, rest_bind(rest_get_removal_status, ss)));
|
||||
ss::force_remove_completion.set(r, gated(ss, rest_bind(rest_force_remove_completion, ss)));
|
||||
ss::set_logging_level.set(r, gated(ss, rest_bind(rest_set_logging_level)));
|
||||
ss::get_logging_levels.set(r, gated(ss, rest_bind(rest_get_logging_levels)));
|
||||
ss::get_operation_mode.set(r, gated(ss, rest_bind(rest_get_operation_mode, ss)));
|
||||
ss::is_starting.set(r, gated(ss, rest_bind(rest_is_starting, ss)));
|
||||
ss::get_drain_progress.set(r, gated(ss, rest_bind(rest_get_drain_progress, ss)));
|
||||
ss::drain.set(r, gated(ss, rest_bind(rest_drain, ss)));
|
||||
ss::stop_gossiping.set(r, gated(ss, rest_bind(rest_stop_gossiping, ss)));
|
||||
ss::start_gossiping.set(r, gated(ss, rest_bind(rest_start_gossiping, ss)));
|
||||
ss::is_gossip_running.set(r, gated(ss, rest_bind(rest_is_gossip_running, ss)));
|
||||
ss::stop_daemon.set(r, gated(ss, rest_bind(rest_stop_daemon)));
|
||||
ss::is_initialized.set(r, gated(ss, rest_bind(rest_is_initialized, ss)));
|
||||
ss::join_ring.set(r, gated(ss, rest_bind(rest_join_ring)));
|
||||
ss::is_joined.set(r, gated(ss, rest_bind(rest_is_joined, ss)));
|
||||
ss::is_incremental_backups_enabled.set(r, gated(ss, rest_bind(rest_is_incremental_backups_enabled, ctx)));
|
||||
ss::set_incremental_backups_enabled.set(r, gated(ss, rest_bind(rest_set_incremental_backups_enabled, ctx)));
|
||||
ss::rebuild.set(r, gated(ss, rest_bind(rest_rebuild, ss)));
|
||||
ss::bulk_load.set(r, gated(ss, rest_bind(rest_bulk_load)));
|
||||
ss::bulk_load_async.set(r, gated(ss, rest_bind(rest_bulk_load_async)));
|
||||
ss::reschedule_failed_deletions.set(r, gated(ss, rest_bind(rest_reschedule_failed_deletions)));
|
||||
ss::sample_key_range.set(r, gated(ss, rest_bind(rest_sample_key_range)));
|
||||
ss::reset_local_schema.set(r, gated(ss, rest_bind(rest_reset_local_schema, ss)));
|
||||
ss::set_trace_probability.set(r, gated(ss, rest_bind(rest_set_trace_probability)));
|
||||
ss::get_trace_probability.set(r, gated(ss, rest_bind(rest_get_trace_probability)));
|
||||
ss::get_slow_query_info.set(r, gated(ss, rest_bind(rest_get_slow_query_info)));
|
||||
ss::set_slow_query.set(r, gated(ss, rest_bind(rest_set_slow_query)));
|
||||
ss::deliver_hints.set(r, gated(ss, rest_bind(rest_deliver_hints)));
|
||||
ss::get_cluster_name.set(r, gated(ss, rest_bind(rest_get_cluster_name, ss)));
|
||||
ss::get_partitioner_name.set(r, gated(ss, rest_bind(rest_get_partitioner_name, ss)));
|
||||
ss::get_tombstone_warn_threshold.set(r, gated(ss, rest_bind(rest_get_tombstone_warn_threshold)));
|
||||
ss::set_tombstone_warn_threshold.set(r, gated(ss, rest_bind(rest_set_tombstone_warn_threshold)));
|
||||
ss::get_tombstone_failure_threshold.set(r, gated(ss, rest_bind(rest_get_tombstone_failure_threshold)));
|
||||
ss::set_tombstone_failure_threshold.set(r, gated(ss, rest_bind(rest_set_tombstone_failure_threshold)));
|
||||
ss::get_batch_size_failure_threshold.set(r, gated(ss, rest_bind(rest_get_batch_size_failure_threshold)));
|
||||
ss::set_batch_size_failure_threshold.set(r, gated(ss, rest_bind(rest_set_batch_size_failure_threshold)));
|
||||
ss::set_hinted_handoff_throttle_in_kb.set(r, gated(ss, rest_bind(rest_set_hinted_handoff_throttle_in_kb)));
|
||||
ss::get_exceptions.set(r, gated(ss, rest_bind(rest_get_exceptions, ss)));
|
||||
ss::get_total_hints_in_progress.set(r, gated(ss, rest_bind(rest_get_total_hints_in_progress)));
|
||||
ss::get_total_hints.set(r, gated(ss, rest_bind(rest_get_total_hints)));
|
||||
ss::get_ownership.set(r, gated(ss, rest_bind(rest_get_ownership, ctx, ss)));
|
||||
ss::get_effective_ownership.set(r, gated(ss, rest_bind(rest_get_effective_ownership, ctx, ss)));
|
||||
ss::retrain_dict.set(r, gated(ss, rest_bind(rest_retrain_dict, ctx, ss, group0_client)));
|
||||
ss::estimate_compression_ratios.set(r, gated(ss, rest_bind(rest_estimate_compression_ratios, ctx, ss)));
|
||||
ss::sstable_info.set(r, gated(ss, rest_bind(rest_sstable_info, ctx)));
|
||||
ss::logstor_info.set(r, gated(ss, rest_bind(rest_logstor_info, ctx)));
|
||||
ss::reload_raft_topology_state.set(r, gated(ss, rest_bind(rest_reload_raft_topology_state, ss, group0_client)));
|
||||
ss::upgrade_to_raft_topology.set(r, gated(ss, rest_bind(rest_upgrade_to_raft_topology, ss)));
|
||||
ss::raft_topology_upgrade_status.set(r, gated(ss, rest_bind(rest_raft_topology_upgrade_status, ss)));
|
||||
ss::raft_topology_get_cmd_status.set(r, gated(ss, rest_bind(rest_raft_topology_get_cmd_status, ss)));
|
||||
ss::move_tablet.set(r, gated(ss, rest_bind(rest_move_tablet, ctx, ss)));
|
||||
ss::add_tablet_replica.set(r, gated(ss, rest_bind(rest_add_tablet_replica, ctx, ss)));
|
||||
ss::del_tablet_replica.set(r, gated(ss, rest_bind(rest_del_tablet_replica, ctx, ss)));
|
||||
ss::repair_tablet.set(r, gated(ss, rest_bind(rest_repair_tablet, ctx, ss)));
|
||||
ss::tablet_balancing_enable.set(r, gated(ss, rest_bind(rest_tablet_balancing_enable, ss)));
|
||||
ss::create_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_create_vnode_tablet_migration, ctx, ss)));
|
||||
ss::get_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_get_vnode_tablet_migration, ctx, ss)));
|
||||
ss::set_vnode_tablet_migration_node_storage_mode.set(r, gated(ss, rest_bind(rest_set_vnode_tablet_migration_node_storage_mode, ctx, ss)));
|
||||
ss::finalize_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_finalize_vnode_tablet_migration, ctx, ss)));
|
||||
ss::quiesce_topology.set(r, gated(ss, rest_bind(rest_quiesce_topology, ss)));
|
||||
sp::get_schema_versions.set(r, gated(ss, rest_bind(rest_get_schema_versions, ss)));
|
||||
ss::drop_quarantined_sstables.set(r, gated(ss, rest_bind(rest_drop_quarantined_sstables, ctx, ss)));
|
||||
ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
|
||||
ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
|
||||
ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
|
||||
ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
|
||||
ss::get_range_to_endpoint_map.set(r, rest_bind(rest_get_range_to_endpoint_map, ctx, ss));
|
||||
ss::get_pending_range_to_endpoint_map.set(r, rest_bind(rest_get_pending_range_to_endpoint_map, ctx));
|
||||
ss::describe_ring.set(r, rest_bind(rest_describe_ring, ctx, ss));
|
||||
ss::get_current_generation_number.set(r, rest_bind(rest_get_current_generation_number, ss));
|
||||
ss::get_natural_endpoints.set(r, rest_bind(rest_get_natural_endpoints, ctx, ss));
|
||||
ss::get_natural_endpoints_v2.set(r, rest_bind(rest_get_natural_endpoints_v2, ctx, ss));
|
||||
ss::cdc_streams_check_and_repair.set(r, rest_bind(rest_cdc_streams_check_and_repair, ss));
|
||||
ss::cleanup_all.set(r, rest_bind(rest_cleanup_all, ctx, ss));
|
||||
ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
|
||||
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
|
||||
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
|
||||
ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
|
||||
ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
|
||||
ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
|
||||
ss::move.set(r, rest_bind(rest_move, ss));
|
||||
ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
|
||||
ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
|
||||
ss::get_removal_status.set(r, rest_bind(rest_get_removal_status, ss));
|
||||
ss::force_remove_completion.set(r, rest_bind(rest_force_remove_completion, ss));
|
||||
ss::set_logging_level.set(r, rest_bind(rest_set_logging_level));
|
||||
ss::get_logging_levels.set(r, rest_bind(rest_get_logging_levels));
|
||||
ss::get_operation_mode.set(r, rest_bind(rest_get_operation_mode, ss));
|
||||
ss::is_starting.set(r, rest_bind(rest_is_starting, ss));
|
||||
ss::get_drain_progress.set(r, rest_bind(rest_get_drain_progress, ss));
|
||||
ss::drain.set(r, rest_bind(rest_drain, ss));
|
||||
ss::stop_gossiping.set(r, rest_bind(rest_stop_gossiping, ss));
|
||||
ss::start_gossiping.set(r, rest_bind(rest_start_gossiping, ss));
|
||||
ss::is_gossip_running.set(r, rest_bind(rest_is_gossip_running, ss));
|
||||
ss::stop_daemon.set(r, rest_bind(rest_stop_daemon));
|
||||
ss::is_initialized.set(r, rest_bind(rest_is_initialized, ss));
|
||||
ss::join_ring.set(r, rest_bind(rest_join_ring));
|
||||
ss::is_joined.set(r, rest_bind(rest_is_joined, ss));
|
||||
ss::is_incremental_backups_enabled.set(r, rest_bind(rest_is_incremental_backups_enabled, ctx));
|
||||
ss::set_incremental_backups_enabled.set(r, rest_bind(rest_set_incremental_backups_enabled, ctx));
|
||||
ss::rebuild.set(r, rest_bind(rest_rebuild, ss));
|
||||
ss::bulk_load.set(r, rest_bind(rest_bulk_load));
|
||||
ss::bulk_load_async.set(r, rest_bind(rest_bulk_load_async));
|
||||
ss::reschedule_failed_deletions.set(r, rest_bind(rest_reschedule_failed_deletions));
|
||||
ss::sample_key_range.set(r, rest_bind(rest_sample_key_range));
|
||||
ss::reset_local_schema.set(r, rest_bind(rest_reset_local_schema, ss));
|
||||
ss::set_trace_probability.set(r, rest_bind(rest_set_trace_probability));
|
||||
ss::get_trace_probability.set(r, rest_bind(rest_get_trace_probability));
|
||||
ss::get_slow_query_info.set(r, rest_bind(rest_get_slow_query_info));
|
||||
ss::set_slow_query.set(r, rest_bind(rest_set_slow_query));
|
||||
ss::deliver_hints.set(r, rest_bind(rest_deliver_hints));
|
||||
ss::get_cluster_name.set(r, rest_bind(rest_get_cluster_name, ss));
|
||||
ss::get_partitioner_name.set(r, rest_bind(rest_get_partitioner_name, ss));
|
||||
ss::get_tombstone_warn_threshold.set(r, rest_bind(rest_get_tombstone_warn_threshold));
|
||||
ss::set_tombstone_warn_threshold.set(r, rest_bind(rest_set_tombstone_warn_threshold));
|
||||
ss::get_tombstone_failure_threshold.set(r, rest_bind(rest_get_tombstone_failure_threshold));
|
||||
ss::set_tombstone_failure_threshold.set(r, rest_bind(rest_set_tombstone_failure_threshold));
|
||||
ss::get_batch_size_failure_threshold.set(r, rest_bind(rest_get_batch_size_failure_threshold));
|
||||
ss::set_batch_size_failure_threshold.set(r, rest_bind(rest_set_batch_size_failure_threshold));
|
||||
ss::set_hinted_handoff_throttle_in_kb.set(r, rest_bind(rest_set_hinted_handoff_throttle_in_kb));
|
||||
ss::get_exceptions.set(r, rest_bind(rest_get_exceptions, ss));
|
||||
ss::get_total_hints_in_progress.set(r, rest_bind(rest_get_total_hints_in_progress));
|
||||
ss::get_total_hints.set(r, rest_bind(rest_get_total_hints));
|
||||
ss::get_ownership.set(r, rest_bind(rest_get_ownership, ctx, ss));
|
||||
ss::get_effective_ownership.set(r, rest_bind(rest_get_effective_ownership, ctx, ss));
|
||||
ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
|
||||
ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
|
||||
ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
|
||||
ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
|
||||
ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
|
||||
ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
|
||||
ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
|
||||
ss::raft_topology_get_cmd_status.set(r, rest_bind(rest_raft_topology_get_cmd_status, ss));
|
||||
ss::move_tablet.set(r, rest_bind(rest_move_tablet, ctx, ss));
|
||||
ss::add_tablet_replica.set(r, rest_bind(rest_add_tablet_replica, ctx, ss));
|
||||
ss::del_tablet_replica.set(r, rest_bind(rest_del_tablet_replica, ctx, ss));
|
||||
ss::repair_tablet.set(r, rest_bind(rest_repair_tablet, ctx, ss));
|
||||
ss::tablet_balancing_enable.set(r, rest_bind(rest_tablet_balancing_enable, ss));
|
||||
ss::create_vnode_tablet_migration.set(r, rest_bind(rest_create_vnode_tablet_migration, ctx, ss));
|
||||
ss::get_vnode_tablet_migration.set(r, rest_bind(rest_get_vnode_tablet_migration, ctx, ss));
|
||||
ss::set_vnode_tablet_migration_node_storage_mode.set(r, rest_bind(rest_set_vnode_tablet_migration_node_storage_mode, ctx, ss));
|
||||
ss::finalize_vnode_tablet_migration.set(r, rest_bind(rest_finalize_vnode_tablet_migration, ctx, ss));
|
||||
ss::quiesce_topology.set(r, rest_bind(rest_quiesce_topology, ss));
|
||||
sp::get_schema_versions.set(r, rest_bind(rest_get_schema_versions, ss));
|
||||
ss::drop_quarantined_sstables.set(r, rest_bind(rest_drop_quarantined_sstables, ctx, ss));
|
||||
}
|
||||
|
||||
void unset_storage_service(http_context& ctx, routes& r) {
|
||||
|
||||
@@ -113,8 +113,8 @@ static category_set parse_audit_categories(const sstring& data) {
|
||||
return result;
|
||||
}
|
||||
|
||||
static audit::audited_tables_t parse_audit_tables(const sstring& data) {
|
||||
audit::audited_tables_t result;
|
||||
static std::map<sstring, std::set<sstring>> parse_audit_tables(const sstring& data) {
|
||||
std::map<sstring, std::set<sstring>> result;
|
||||
if (!data.empty()) {
|
||||
std::vector<sstring> tokens;
|
||||
boost::split(tokens, data, boost::is_any_of(","));
|
||||
@@ -139,8 +139,8 @@ static audit::audited_tables_t parse_audit_tables(const sstring& data) {
|
||||
return result;
|
||||
}
|
||||
|
||||
static audit::audited_keyspaces_t parse_audit_keyspaces(const sstring& data) {
|
||||
audit::audited_keyspaces_t result;
|
||||
static std::set<sstring> parse_audit_keyspaces(const sstring& data) {
|
||||
std::set<sstring> result;
|
||||
if (!data.empty()) {
|
||||
std::vector<sstring> tokens;
|
||||
boost::split(tokens, data, boost::is_any_of(","));
|
||||
@@ -156,8 +156,8 @@ audit::audit(locator::shared_token_metadata& token_metadata,
|
||||
cql3::query_processor& qp,
|
||||
service::migration_manager& mm,
|
||||
std::set<sstring>&& audit_modes,
|
||||
audited_keyspaces_t&& audited_keyspaces,
|
||||
audited_tables_t&& audited_tables,
|
||||
std::set<sstring>&& audited_keyspaces,
|
||||
std::map<sstring, std::set<sstring>>&& audited_tables,
|
||||
category_set&& audited_categories,
|
||||
const db::config& cfg)
|
||||
: _token_metadata(token_metadata)
|
||||
@@ -165,8 +165,8 @@ audit::audit(locator::shared_token_metadata& token_metadata,
|
||||
, _audited_tables(std::move(audited_tables))
|
||||
, _audited_categories(std::move(audited_categories))
|
||||
, _cfg(cfg)
|
||||
, _cfg_keyspaces_observer(cfg.audit_keyspaces.observe([this] (sstring const& new_value){ update_config<audited_keyspaces_t>(new_value, parse_audit_keyspaces, _audited_keyspaces); }))
|
||||
, _cfg_tables_observer(cfg.audit_tables.observe([this] (sstring const& new_value){ update_config<audited_tables_t>(new_value, parse_audit_tables, _audited_tables); }))
|
||||
, _cfg_keyspaces_observer(cfg.audit_keyspaces.observe([this] (sstring const& new_value){ update_config<std::set<sstring>>(new_value, parse_audit_keyspaces, _audited_keyspaces); }))
|
||||
, _cfg_tables_observer(cfg.audit_tables.observe([this] (sstring const& new_value){ update_config<std::map<sstring, std::set<sstring>>>(new_value, parse_audit_tables, _audited_tables); }))
|
||||
, _cfg_categories_observer(cfg.audit_categories.observe([this] (sstring const& new_value){ update_config<category_set>(new_value, parse_audit_categories, _audited_categories); }))
|
||||
{
|
||||
_storage_helper_ptr = create_storage_helper(std::move(audit_modes), qp, mm);
|
||||
@@ -181,8 +181,8 @@ future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token
|
||||
return make_ready_future<>();
|
||||
}
|
||||
category_set audited_categories = parse_audit_categories(cfg.audit_categories());
|
||||
audit::audited_tables_t audited_tables = parse_audit_tables(cfg.audit_tables());
|
||||
audit::audited_keyspaces_t audited_keyspaces = parse_audit_keyspaces(cfg.audit_keyspaces());
|
||||
std::map<sstring, std::set<sstring>> audited_tables = parse_audit_tables(cfg.audit_tables());
|
||||
std::set<sstring> audited_keyspaces = parse_audit_keyspaces(cfg.audit_keyspaces());
|
||||
|
||||
logger.info("Audit is enabled. Auditing to: \"{}\", with the following categories: \"{}\", keyspaces: \"{}\", and tables: \"{}\"",
|
||||
cfg.audit(), cfg.audit_categories(), cfg.audit_keyspaces(), cfg.audit_tables());
|
||||
@@ -304,7 +304,7 @@ future<> inspect_login(const sstring& username, socket_address client_ip, bool e
|
||||
return audit::local_audit_instance().log_login(username, client_ip, error);
|
||||
}
|
||||
|
||||
bool audit::should_log_table(std::string_view keyspace, std::string_view name) const {
|
||||
bool audit::should_log_table(const sstring& keyspace, const sstring& name) const {
|
||||
auto keyspace_it = _audited_tables.find(keyspace);
|
||||
return keyspace_it != _audited_tables.cend() && keyspace_it->second.find(name) != keyspace_it->second.cend();
|
||||
}
|
||||
@@ -319,8 +319,8 @@ bool audit::will_log(statement_category cat, std::string_view keyspace, std::str
|
||||
// so it is logged whenever the category matches.
|
||||
return _audited_categories.contains(cat)
|
||||
&& (keyspace.empty()
|
||||
|| _audited_keyspaces.find(keyspace) != _audited_keyspaces.cend()
|
||||
|| should_log_table(keyspace, table)
|
||||
|| _audited_keyspaces.find(sstring(keyspace)) != _audited_keyspaces.cend()
|
||||
|| should_log_table(sstring(keyspace), sstring(table))
|
||||
|| cat == statement_category::AUTH
|
||||
|| cat == statement_category::ADMIN
|
||||
|| cat == statement_category::DCL);
|
||||
|
||||
@@ -129,15 +129,10 @@ public:
|
||||
class storage_helper;
|
||||
|
||||
class audit final : public seastar::async_sharded_service<audit> {
|
||||
public:
|
||||
// Transparent comparator (std::less<>) enables heterogeneous lookup with
|
||||
// string_view keys.
|
||||
using audited_keyspaces_t = std::set<sstring, std::less<>>;
|
||||
using audited_tables_t = std::map<sstring, std::set<sstring, std::less<>>, std::less<>>;
|
||||
private:
|
||||
locator::shared_token_metadata& _token_metadata;
|
||||
audited_keyspaces_t _audited_keyspaces;
|
||||
audited_tables_t _audited_tables;
|
||||
std::set<sstring> _audited_keyspaces;
|
||||
// Maps keyspace name to set of table names in that keyspace
|
||||
std::map<sstring, std::set<sstring>> _audited_tables;
|
||||
category_set _audited_categories;
|
||||
|
||||
std::unique_ptr<storage_helper> _storage_helper_ptr;
|
||||
@@ -150,7 +145,7 @@ private:
|
||||
template<class T>
|
||||
void update_config(const sstring & new_value, std::function<T(const sstring&)> parse_func, T& cfg_parameter);
|
||||
|
||||
bool should_log_table(std::string_view keyspace, std::string_view name) const;
|
||||
bool should_log_table(const sstring& keyspace, const sstring& name) const;
|
||||
public:
|
||||
static seastar::sharded<audit>& audit_instance() {
|
||||
// FIXME: leaked intentionally to avoid shutdown problems, see #293
|
||||
@@ -169,8 +164,8 @@ public:
|
||||
cql3::query_processor& qp,
|
||||
service::migration_manager& mm,
|
||||
std::set<sstring>&& audit_modes,
|
||||
audited_keyspaces_t&& audited_keyspaces,
|
||||
audited_tables_t&& audited_tables,
|
||||
std::set<sstring>&& audited_keyspaces,
|
||||
std::map<sstring, std::set<sstring>>&& audited_tables,
|
||||
category_set&& audited_categories,
|
||||
const db::config& cfg);
|
||||
~audit();
|
||||
|
||||
@@ -1625,7 +1625,7 @@ struct process_change_visitor {
|
||||
if (_enable_updating_state) {
|
||||
if (_request_options.alternator && _alternator_schema_has_no_clustering_key && _clustering_row_states.empty()) {
|
||||
// Alternator's table can be with or without clustering key. If the clustering key exists,
|
||||
// delete request will be `clustered_row_delete` and will be handled there.
|
||||
// delete request will be `clustered_row_delete` and will be hanlded there.
|
||||
// If the clustering key doesn't exist, delete request will be `partition_delete` and will be handled here.
|
||||
// The no-clustering-key case is slightly tricky, because insert of such item is handled by `clustered_row_cells`
|
||||
// and has some value as clustering_key (the value currently seems to be empty bytes object).
|
||||
@@ -1933,7 +1933,7 @@ public:
|
||||
if (_options.alternator && !_alternator_clustering_keys_to_ignore.empty()) {
|
||||
// we filter mutations for Alternator's changes here.
|
||||
// We do it per mutation object (user might submit a batch of those in one go
|
||||
// and some might be split because of different timestamps),
|
||||
// and some might be splitted because of different timestamps),
|
||||
// ignore key set is cleared afterwards.
|
||||
// If single mutation object contains two separate changes to the same row
|
||||
// and at least one of them is ignored, all of them will be ignored.
|
||||
|
||||
@@ -240,7 +240,7 @@ static max_purgeable get_max_purgeable_timestamp(const compaction_group_view& ta
|
||||
// and if the memtable also contains the key we're calculating max purgeable timestamp for.
|
||||
// First condition helps to not penalize the common scenario where memtable only contains
|
||||
// newer data.
|
||||
if (!table_s.skip_memtable_for_tombstone_gc() && memtable_min_timestamp <= compacting_max_timestamp && table_s.memtable_has_key(dk)) {
|
||||
if (memtable_min_timestamp <= compacting_max_timestamp && table_s.memtable_has_key(dk)) {
|
||||
timestamp = memtable_min_timestamp;
|
||||
source = max_purgeable::timestamp_source::memtable_possibly_shadowing_data;
|
||||
}
|
||||
|
||||
@@ -39,9 +39,6 @@ public:
|
||||
virtual future<lw_shared_ptr<const sstables::sstable_set>> main_sstable_set() const = 0;
|
||||
virtual future<lw_shared_ptr<const sstables::sstable_set>> maintenance_sstable_set() const = 0;
|
||||
virtual lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc() const = 0;
|
||||
// Returns true when tombstone GC considers only the repaired sstable set, meaning the
|
||||
// memtable does not need to be consulted (its data is always newer than any GC-eligible tombstone).
|
||||
virtual bool skip_memtable_for_tombstone_gc() const noexcept = 0;
|
||||
virtual std::unordered_set<sstables::shared_sstable> fully_expired_sstables(const std::vector<sstables::shared_sstable>& sstables, gc_clock::time_point compaction_time) const = 0;
|
||||
virtual const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept = 0;
|
||||
virtual compaction_strategy& get_compaction_strategy() const noexcept = 0;
|
||||
|
||||
@@ -406,11 +406,7 @@ commitlog_total_space_in_mb: -1
|
||||
# In short, `ms` needs more CPU during sstable writes,
|
||||
# but should behave better during reads,
|
||||
# although it might behave worse for very long clustering keys.
|
||||
#
|
||||
# `ms` sstable format works even better with `column_index_size_in_kb` set to 1,
|
||||
# so keep those two settings in sync (either both set, or both unset).
|
||||
sstable_format: ms
|
||||
column_index_size_in_kb: 1
|
||||
|
||||
# Auto-scaling of the promoted index prevents running out of memory
|
||||
# when the promoted index grows too large (due to partitions with many rows
|
||||
|
||||
19
configure.py
19
configure.py
@@ -2769,6 +2769,25 @@ def write_build_file(f,
|
||||
f.write('build {}: rust_source {}\n'.format(cc, src))
|
||||
obj = cc.replace('.cc', '.o')
|
||||
compiles[obj] = cc
|
||||
# Sources shared between scylla (compiled with PCH) and small tests
|
||||
# (with custom deps and partial link sets) must not use the PCH,
|
||||
# because -fpch-instantiate-templates injects symbol references that
|
||||
# the small test link sets cannot satisfy.
|
||||
small_test_srcs = set()
|
||||
for test_binary, test_deps in deps.items():
|
||||
if not test_binary.startswith('test/'):
|
||||
continue
|
||||
# Only exclude PCH for tests with truly small/partial link sets.
|
||||
# Tests that include scylla_core or similar large dep sets link
|
||||
# against enough objects to satisfy PCH-injected symbol refs.
|
||||
if len(test_deps) > 50:
|
||||
continue
|
||||
for src in test_deps:
|
||||
if src.endswith('.cc'):
|
||||
small_test_srcs.add(src)
|
||||
for src in small_test_srcs:
|
||||
obj = '$builddir/' + mode + '/' + src.replace('.cc', '.o')
|
||||
compiles_with_pch.discard(obj)
|
||||
for obj in compiles:
|
||||
src = compiles[obj]
|
||||
seastar_dep = f'$builddir/{mode}/seastar/libseastar.{seastar_lib_ext}'
|
||||
|
||||
84
cql3/prepared_cache_key_type.hh
Normal file
84
cql3/prepared_cache_key_type.hh
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (C) 2017-present ScyllaDB
|
||||
*
|
||||
* Modified by ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.1 and Apache-2.0)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "bytes.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "cql3/dialect.hh"
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
typedef bytes cql_prepared_id_type;
|
||||
|
||||
/// \brief The key of the prepared statements cache
|
||||
///
|
||||
/// TODO: consolidate prepared_cache_key_type and the nested cache_key_type
|
||||
/// the latter was introduced for unifying the CQL and Thrift prepared
|
||||
/// statements so that they can be stored in the same cache.
|
||||
class prepared_cache_key_type {
|
||||
public:
|
||||
// derive from cql_prepared_id_type so we can customize the formatter of
|
||||
// cache_key_type
|
||||
struct cache_key_type : public cql_prepared_id_type {
|
||||
cache_key_type(cql_prepared_id_type&& id, cql3::dialect d) : cql_prepared_id_type(std::move(id)), dialect(d) {}
|
||||
cql3::dialect dialect; // Not part of hash, but we don't expect collisions because of that
|
||||
bool operator==(const cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
private:
|
||||
cache_key_type _key;
|
||||
|
||||
public:
|
||||
explicit prepared_cache_key_type(cql_prepared_id_type cql_id, dialect d) : _key(std::move(cql_id), d) {}
|
||||
|
||||
cache_key_type& key() { return _key; }
|
||||
const cache_key_type& key() const { return _key; }
|
||||
|
||||
static const cql_prepared_id_type& cql_id(const prepared_cache_key_type& key) {
|
||||
return key.key();
|
||||
}
|
||||
|
||||
bool operator==(const prepared_cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type::cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type::cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k.key());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// for prepared_statements_cache log printouts
|
||||
template <> struct fmt::formatter<cql3::prepared_cache_key_type::cache_key_type> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const cql3::prepared_cache_key_type::cache_key_type& p, fmt::format_context& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "{{cql_id: {}, dialect: {}}}", static_cast<const cql3::cql_prepared_id_type&>(p), p.dialect);
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct fmt::formatter<cql3::prepared_cache_key_type> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const cql3::prepared_cache_key_type& p, fmt::format_context& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "{}", p.key());
|
||||
}
|
||||
};
|
||||
@@ -12,6 +12,7 @@
|
||||
|
||||
#include "utils/loading_cache.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "cql3/prepared_cache_key_type.hh"
|
||||
#include "cql3/statements/prepared_statement.hh"
|
||||
#include "cql3/column_specification.hh"
|
||||
#include "cql3/dialect.hh"
|
||||
@@ -27,39 +28,6 @@ struct prepared_cache_entry_size {
|
||||
}
|
||||
};
|
||||
|
||||
typedef bytes cql_prepared_id_type;
|
||||
|
||||
/// \brief The key of the prepared statements cache
|
||||
///
|
||||
/// TODO: consolidate prepared_cache_key_type and the nested cache_key_type
|
||||
/// the latter was introduced for unifying the CQL and Thrift prepared
|
||||
/// statements so that they can be stored in the same cache.
|
||||
class prepared_cache_key_type {
|
||||
public:
|
||||
// derive from cql_prepared_id_type so we can customize the formatter of
|
||||
// cache_key_type
|
||||
struct cache_key_type : public cql_prepared_id_type {
|
||||
cache_key_type(cql_prepared_id_type&& id, cql3::dialect d) : cql_prepared_id_type(std::move(id)), dialect(d) {}
|
||||
cql3::dialect dialect; // Not part of hash, but we don't expect collisions because of that
|
||||
bool operator==(const cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
private:
|
||||
cache_key_type _key;
|
||||
|
||||
public:
|
||||
explicit prepared_cache_key_type(cql_prepared_id_type cql_id, dialect d) : _key(std::move(cql_id), d) {}
|
||||
|
||||
cache_key_type& key() { return _key; }
|
||||
const cache_key_type& key() const { return _key; }
|
||||
|
||||
static const cql_prepared_id_type& cql_id(const prepared_cache_key_type& key) {
|
||||
return key.key();
|
||||
}
|
||||
|
||||
bool operator==(const prepared_cache_key_type& other) const = default;
|
||||
};
|
||||
|
||||
class prepared_statements_cache {
|
||||
public:
|
||||
struct stats {
|
||||
@@ -164,35 +132,3 @@ public:
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type::cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type::cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct hash<cql3::prepared_cache_key_type> final {
|
||||
size_t operator()(const cql3::prepared_cache_key_type& k) const {
|
||||
return std::hash<cql3::cql_prepared_id_type>()(k.key());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// for prepared_statements_cache log printouts
|
||||
template <> struct fmt::formatter<cql3::prepared_cache_key_type::cache_key_type> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const cql3::prepared_cache_key_type::cache_key_type& p, fmt::format_context& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "{{cql_id: {}, dialect: {}}}", static_cast<const cql3::cql_prepared_id_type&>(p), p.dialect);
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct fmt::formatter<cql3::prepared_cache_key_type> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const cql3::prepared_cache_key_type& p, fmt::format_context& ctx) const {
|
||||
return fmt::format_to(ctx.out(), "{}", p.key());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -17,6 +17,9 @@
|
||||
#include <seastar/coroutine/as_future.hh>
|
||||
#include <seastar/coroutine/try_future.hh>
|
||||
|
||||
#include "cql3/prepared_statements_cache.hh"
|
||||
#include "cql3/authorized_prepared_statements_cache.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/mapreduce_service.hh"
|
||||
@@ -77,7 +80,7 @@ static service::query_state query_state_for_internal_call() {
|
||||
return {service::client_state::for_internal_calls(), empty_service_permit()};
|
||||
}
|
||||
|
||||
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm)
|
||||
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, const utils::loading_cache_config& auth_prep_cache_cfg, lang::manager& langm)
|
||||
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
|
||||
, _proxy(proxy)
|
||||
, _db(db)
|
||||
@@ -86,7 +89,7 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
|
||||
, _mcfg(mcfg)
|
||||
, _cql_config(cql_cfg)
|
||||
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
|
||||
, _authorized_prepared_cache(std::move(auth_prep_cache_cfg), authorized_prepared_statements_cache_log)
|
||||
, _authorized_prepared_cache(auth_prep_cache_cfg, authorized_prepared_statements_cache_log)
|
||||
, _auth_prepared_cache_cfg_cb([this] (uint32_t) { (void) _authorized_prepared_cache_config_action.trigger_later(); })
|
||||
, _authorized_prepared_cache_config_action([this] { update_authorized_prepared_cache_config(); return make_ready_future<>(); })
|
||||
, _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
|
||||
@@ -1074,7 +1077,7 @@ query_processor::execute_batch_without_checking_exception_message(
|
||||
::shared_ptr<statements::batch_statement> batch,
|
||||
service::query_state& query_state,
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
|
||||
std::unordered_map<prepared_cache_key_type, statements::prepared_statement::checked_weak_ptr> pending_authorization_entries) {
|
||||
auto access_future = co_await coroutine::as_future(batch->check_access(*this, query_state.get_client_state()));
|
||||
bool failed = access_future.failed();
|
||||
co_await audit::inspect(batch, query_state, options, failed);
|
||||
|
||||
@@ -22,13 +22,14 @@
|
||||
#include "cql3/statements/prepared_statement.hh"
|
||||
#include "cql3/cql_statement.hh"
|
||||
#include "cql3/dialect.hh"
|
||||
#include "cql3/query_options.hh"
|
||||
#include "cql3/stats.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "service/migration_listener.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "service/client_state.hh"
|
||||
#include "service/broadcast_tables/experimental/query_result.hh"
|
||||
#include "vector_search/vector_store_client.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/observable.hh"
|
||||
#include "utils/rolling_max_tracker.hh"
|
||||
@@ -41,6 +42,9 @@
|
||||
|
||||
|
||||
namespace lang { class manager; }
|
||||
namespace vector_search {
|
||||
class vector_store_client;
|
||||
}
|
||||
namespace service {
|
||||
class migration_manager;
|
||||
class query_state;
|
||||
@@ -58,6 +62,9 @@ struct query;
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
class prepared_statements_cache;
|
||||
class authorized_prepared_statements_cache;
|
||||
|
||||
namespace statements {
|
||||
class batch_statement;
|
||||
class schema_altering_statement;
|
||||
@@ -184,7 +191,7 @@ public:
|
||||
static std::vector<std::unique_ptr<statements::raw::parsed_statement>> parse_statements(std::string_view queries, dialect d);
|
||||
|
||||
query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc,
|
||||
memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm);
|
||||
memory_config mcfg, cql_config& cql_cfg, const utils::loading_cache_config& auth_prep_cache_cfg, lang::manager& langm);
|
||||
|
||||
~query_processor();
|
||||
|
||||
@@ -474,7 +481,7 @@ public:
|
||||
::shared_ptr<statements::batch_statement> stmt,
|
||||
service::query_state& query_state,
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
|
||||
std::unordered_map<prepared_cache_key_type, statements::prepared_statement::checked_weak_ptr> pending_authorization_entries) {
|
||||
return execute_batch_without_checking_exception_message(
|
||||
std::move(stmt),
|
||||
query_state,
|
||||
@@ -490,7 +497,7 @@ public:
|
||||
::shared_ptr<statements::batch_statement>,
|
||||
service::query_state& query_state,
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries);
|
||||
std::unordered_map<prepared_cache_key_type, statements::prepared_statement::checked_weak_ptr> pending_authorization_entries);
|
||||
|
||||
future<service::broadcast_tables::query_result>
|
||||
execute_broadcast_table_query(const service::broadcast_tables::query&);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -23,113 +23,15 @@ namespace cql3 {
|
||||
|
||||
namespace restrictions {
|
||||
|
||||
/// A set of discrete values.
|
||||
using value_list = std::vector<managed_bytes>; // Sorted and deduped using value comparator.
|
||||
|
||||
/// General set of values. Empty set and single-element sets are always value_list. interval is
|
||||
/// never singular and never has start > end. Universal set is an interval with both bounds null.
|
||||
using value_set = std::variant<value_list, interval<managed_bytes>>;
|
||||
|
||||
// For some boolean expression (say, (X = 3) = TRUE), this represents a function that solves for X.
|
||||
// (here, it would return 3). The expression is obtained by equating some factors of the WHERE
|
||||
// clause to TRUE.
|
||||
using solve_for_t = std::function<value_set (const query_options&)>;
|
||||
|
||||
struct on_row {
|
||||
bool operator==(const on_row&) const = default;
|
||||
};
|
||||
|
||||
struct on_column {
|
||||
const column_definition* column;
|
||||
|
||||
bool operator==(const on_column&) const = default;
|
||||
};
|
||||
|
||||
// Placeholder type indicating we're solving for the partition key token.
|
||||
struct on_partition_key_token {
|
||||
const ::schema* schema;
|
||||
|
||||
bool operator==(const on_partition_key_token&) const = default;
|
||||
};
|
||||
|
||||
struct on_clustering_key_prefix {
|
||||
std::vector<const column_definition*> columns;
|
||||
|
||||
bool operator==(const on_clustering_key_prefix&) const = default;
|
||||
};
|
||||
|
||||
// A predicate on a column or a combination of columns. The WHERE clause analyzer
|
||||
// will attempt to convert predicates (that return true or false for a particular row)
|
||||
// to solvers (that return the set of column values that satisfy the predicate) when possible.
|
||||
struct predicate {
|
||||
// A function that returns the set of values that satisfy the filter. Can be unset,
|
||||
// in which case the filter must be interpreted.
|
||||
solve_for_t solve_for;
|
||||
// The original filter for this column.
|
||||
expr::expression filter;
|
||||
// What column the predicate can be solved for
|
||||
std::variant<
|
||||
on_row, // cannot determine, so predicate is on entire row
|
||||
on_column, // solving for a single column: e.g. c1 = 3
|
||||
on_partition_key_token, // solving for the token, e.g. token(pk1, pk2) >= :var
|
||||
on_clustering_key_prefix // solving for a clustering key prefix: e.g. (ck1, ck2) >= (3, 4)
|
||||
> on;
|
||||
// Whether the returned value_set will resolve to a single value.
|
||||
bool is_singleton = false;
|
||||
// Whether the returned value_set follows CQL comparison semantics
|
||||
bool comparable = true;
|
||||
bool is_multi_column = false;
|
||||
bool is_not_null_single_column = false;
|
||||
bool equality = false; // operator is EQ
|
||||
bool is_in = false; // operator is IN
|
||||
bool is_slice = false; // operator is LT/LTE/GT/GTE
|
||||
bool is_upper_bound = false; // operator is LT/LTE
|
||||
bool is_lower_bound = false; // operator is GT/GTE
|
||||
expr::comparison_order order = expr::comparison_order::cql;
|
||||
std::optional<expr::oper_t> op; // the binary operator, if any
|
||||
bool is_subscript = false; // whether the LHS is a subscript (map element access)
|
||||
};
|
||||
|
||||
///In some cases checking if columns have indexes is undesired or even
|
||||
///impossible, because e.g. the query runs on a pseudo-table, which does not
|
||||
///have an index-manager, or even a table object.
|
||||
using check_indexes = bool_class<class check_indexes_tag>;
|
||||
|
||||
// A function that returns the partition key ranges for a query. It is the solver of
|
||||
// WHERE clause fragments such as WHERE token(pk) > 1 or WHERE pk1 IN :list1 AND pk2 IN :list2.
|
||||
using get_partition_key_ranges_fn_t = std::function<dht::partition_range_vector (const query_options&)>;
|
||||
|
||||
// A function that returns the clustering key ranges for a query. It is the solver of
|
||||
// WHERE clause fragments such as WHERE ck > 1 or WHERE (ck1, ck2) > (1, 2).
|
||||
using get_clustering_bounds_fn_t = std::function<std::vector<query::clustering_range> (const query_options& options)>;
|
||||
|
||||
// A function that returns a singleton value, usable for a key (e.g. bytes_opt)
|
||||
using get_singleton_value_fn_t = std::function<bytes_opt (const query_options&)>;
|
||||
|
||||
struct no_partition_range_restrictions {
|
||||
};
|
||||
|
||||
struct token_range_restrictions {
|
||||
predicate token_restrictions;
|
||||
};
|
||||
|
||||
struct single_column_partition_range_restrictions {
|
||||
std::vector<predicate> per_column_restrictions;
|
||||
};
|
||||
|
||||
using partition_range_restrictions = std::variant<
|
||||
no_partition_range_restrictions,
|
||||
token_range_restrictions,
|
||||
single_column_partition_range_restrictions>;
|
||||
|
||||
// A map of per-column predicate vectors, ordered by schema position.
|
||||
using single_column_predicate_vectors = std::map<const column_definition*, std::vector<predicate>, expr::schema_pos_column_definition_comparator>;
|
||||
|
||||
/**
|
||||
* The restrictions corresponding to the relations specified on the where-clause of CQL query.
|
||||
*/
|
||||
class statement_restrictions {
|
||||
struct private_tag {}; // Tag for private constructor
|
||||
private:
|
||||
schema_ptr _schema;
|
||||
|
||||
@@ -179,7 +81,7 @@ private:
|
||||
bool _has_queriable_regular_index = false, _has_queriable_pk_index = false, _has_queriable_ck_index = false;
|
||||
bool _has_multi_column; ///< True iff _clustering_columns_restrictions has a multi-column restriction.
|
||||
|
||||
std::vector<expr::expression> _where; ///< The entire WHERE clause (factorized).
|
||||
std::optional<expr::expression> _where; ///< The entire WHERE clause.
|
||||
|
||||
/// Parts of _where defining the clustering slice.
|
||||
///
|
||||
@@ -194,7 +96,7 @@ private:
|
||||
/// 4.4 elements other than the last have only EQ or IN atoms
|
||||
/// 4.5 the last element has only EQ, IN, or is_slice() atoms
|
||||
/// 5. if multi-column, then each element is a binary_operator
|
||||
std::vector<predicate> _clustering_prefix_restrictions;
|
||||
std::vector<expr::expression> _clustering_prefix_restrictions;
|
||||
|
||||
/// Like _clustering_prefix_restrictions, but for the indexing table (if this is an index-reading statement).
|
||||
/// Recall that the index-table CK is (token, PK, CK) of the base table for a global index and (indexed column,
|
||||
@@ -203,7 +105,7 @@ private:
|
||||
/// Elements are conjunctions of single-column binary operators with the same LHS.
|
||||
/// Element order follows the indexing-table clustering key.
|
||||
/// In case of a global index the first element's (token restriction) RHS is a dummy value, it is filled later.
|
||||
std::optional<std::vector<predicate>> _idx_tbl_ck_prefix;
|
||||
std::optional<std::vector<expr::expression>> _idx_tbl_ck_prefix;
|
||||
|
||||
/// Parts of _where defining the partition range.
|
||||
///
|
||||
@@ -211,25 +113,16 @@ private:
|
||||
/// binary_operators on token. If single-column restrictions define the partition range, each element holds
|
||||
/// restrictions for one partition column. Each partition column has a corresponding element, but the elements
|
||||
/// are in arbitrary order.
|
||||
partition_range_restrictions _partition_range_restrictions;
|
||||
std::vector<expr::expression> _partition_range_restrictions;
|
||||
|
||||
bool _partition_range_is_simple; ///< False iff _partition_range_restrictions imply a Cartesian product.
|
||||
|
||||
|
||||
check_indexes _check_indexes = check_indexes::yes;
|
||||
/// Columns that appear on the LHS of an EQ restriction (not IN).
|
||||
/// For multi-column EQ like (ck1, ck2) = (1, 2), all columns in the tuple are included.
|
||||
std::unordered_set<const column_definition*> _columns_with_eq;
|
||||
std::vector<const column_definition*> _column_defs_for_filtering;
|
||||
schema_ptr _view_schema;
|
||||
std::optional<secondary_index::index> _idx_opt;
|
||||
expr::expression _idx_restrictions = expr::conjunction({});
|
||||
get_partition_key_ranges_fn_t _get_partition_key_ranges_fn;
|
||||
get_clustering_bounds_fn_t _get_clustering_bounds_fn;
|
||||
get_clustering_bounds_fn_t _get_global_index_clustering_ranges_fn;
|
||||
get_clustering_bounds_fn_t _get_global_index_token_clustering_ranges_fn;
|
||||
get_clustering_bounds_fn_t _get_local_index_clustering_ranges_fn;
|
||||
get_singleton_value_fn_t _value_for_index_partition_key_fn;
|
||||
public:
|
||||
/**
|
||||
* Creates a new empty <code>StatementRestrictions</code>.
|
||||
@@ -237,10 +130,9 @@ public:
|
||||
* @param cfm the column family meta data
|
||||
* @return a new empty <code>StatementRestrictions</code>.
|
||||
*/
|
||||
statement_restrictions(private_tag, schema_ptr schema, bool allow_filtering);
|
||||
statement_restrictions(schema_ptr schema, bool allow_filtering);
|
||||
|
||||
public:
|
||||
friend shared_ptr<const statement_restrictions> analyze_statement_restrictions(
|
||||
friend statement_restrictions analyze_statement_restrictions(
|
||||
data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
statements::statement_type type,
|
||||
@@ -250,15 +142,9 @@ public:
|
||||
bool for_view,
|
||||
bool allow_filtering,
|
||||
check_indexes do_check_indexes);
|
||||
friend shared_ptr<const statement_restrictions> make_trivial_statement_restrictions(
|
||||
schema_ptr schema,
|
||||
bool allow_filtering);
|
||||
|
||||
// Important: objects of this class capture `this` extensively and so must remain non-copyable.
|
||||
statement_restrictions(const statement_restrictions&) = delete;
|
||||
statement_restrictions& operator=(const statement_restrictions&) = delete;
|
||||
statement_restrictions(private_tag,
|
||||
data_dictionary::database db,
|
||||
private:
|
||||
statement_restrictions(data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
statements::statement_type type,
|
||||
const expr::expression& where_clause,
|
||||
@@ -325,7 +211,10 @@ public:
|
||||
|
||||
bool has_token_restrictions() const;
|
||||
|
||||
// Checks whether the given column has an EQ restriction (not IN).
|
||||
// Checks whether the given column has an EQ restriction.
|
||||
// EQ restriction is `col = ...` or `(col, col2) = ...`
|
||||
// IN restriction is NOT an EQ restriction, this function will not look for IN restrictions.
|
||||
// Uses column_definition::operator== for comparison, columns with the same name but different schema will not be equal.
|
||||
bool has_eq_restriction_on_column(const column_definition&) const;
|
||||
|
||||
/**
|
||||
@@ -335,6 +224,12 @@ public:
|
||||
*/
|
||||
std::vector<const column_definition*> get_column_defs_for_filtering(data_dictionary::database db) const;
|
||||
|
||||
/**
|
||||
* Gives a score that the index has - index with the highest score will be chosen
|
||||
* in find_idx()
|
||||
*/
|
||||
int score(const secondary_index::index& index) const;
|
||||
|
||||
/**
|
||||
* Determines the index to be used with the restriction.
|
||||
* @param db - the data_dictionary::database context (for extracting index manager)
|
||||
@@ -355,8 +250,18 @@ public:
|
||||
|
||||
size_t partition_key_restrictions_size() const;
|
||||
|
||||
bool parition_key_restrictions_have_supporting_index(const secondary_index::secondary_index_manager& index_manager, expr::allow_local_index allow_local) const;
|
||||
|
||||
size_t clustering_columns_restrictions_size() const;
|
||||
|
||||
bool clustering_columns_restrictions_have_supporting_index(
|
||||
const secondary_index::secondary_index_manager& index_manager,
|
||||
expr::allow_local_index allow_local) const;
|
||||
|
||||
bool multi_column_clustering_restrictions_are_supported_by(const secondary_index::index& index) const;
|
||||
|
||||
bounds_slice get_clustering_slice() const;
|
||||
|
||||
/**
|
||||
* Checks if the clustering key has some unrestricted components.
|
||||
* @return <code>true</code> if the clustering key has some unrestricted components, <code>false</code> otherwise.
|
||||
@@ -374,6 +279,15 @@ public:
|
||||
|
||||
schema_ptr get_view_schema() const { return _view_schema; }
|
||||
private:
|
||||
std::pair<std::optional<secondary_index::index>, expr::expression> do_find_idx(const secondary_index::secondary_index_manager& sim) const;
|
||||
void add_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering, bool for_view);
|
||||
void add_is_not_restriction(const expr::binary_operator& restr, schema_ptr schema, bool for_view);
|
||||
void add_single_column_parition_key_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering, bool for_view);
|
||||
void add_token_partition_key_restriction(const expr::binary_operator& restr);
|
||||
void add_single_column_clustering_key_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering);
|
||||
void add_multi_column_clustering_key_restriction(const expr::binary_operator& restr);
|
||||
void add_single_column_nonprimary_key_restriction(const expr::binary_operator& restr);
|
||||
|
||||
void process_partition_key_restrictions(bool for_view, bool allow_filtering, statements::statement_type type);
|
||||
|
||||
/**
|
||||
@@ -401,17 +315,7 @@ private:
|
||||
void add_clustering_restrictions_to_idx_ck_prefix(const schema& idx_tbl_schema);
|
||||
|
||||
unsigned int num_clustering_prefix_columns_that_need_not_be_filtered() const;
|
||||
void calculate_column_defs_for_filtering_and_erase_restrictions_used_for_index(
|
||||
data_dictionary::database db,
|
||||
const single_column_predicate_vectors& sc_pk_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_ck_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_nonpk_pred_vectors);
|
||||
get_partition_key_ranges_fn_t build_partition_key_ranges_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_clustering_bounds_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_global_index_clustering_ranges_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_global_index_token_clustering_ranges_fn() const;
|
||||
get_clustering_bounds_fn_t build_get_local_index_clustering_ranges_fn() const;
|
||||
get_singleton_value_fn_t build_value_for_index_partition_key_fn() const;
|
||||
void calculate_column_defs_for_filtering_and_erase_restrictions_used_for_index(data_dictionary::database db);
|
||||
public:
|
||||
/**
|
||||
* Returns the specified range of the partition key.
|
||||
@@ -485,10 +389,7 @@ public:
|
||||
private:
|
||||
/// Prepares internal data for evaluating index-table queries. Must be called before
|
||||
/// get_local_index_clustering_ranges().
|
||||
void prepare_indexed_local(const schema& idx_tbl_schema,
|
||||
const single_column_predicate_vectors& sc_pk_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_ck_pred_vectors,
|
||||
const single_column_predicate_vectors& sc_nonpk_pred_vectors);
|
||||
void prepare_indexed_local(const schema& idx_tbl_schema);
|
||||
|
||||
/// Prepares internal data for evaluating index-table queries. Must be called before
|
||||
/// get_global_index_clustering_ranges() or get_global_index_token_clustering_ranges().
|
||||
@@ -497,18 +398,15 @@ private:
|
||||
public:
|
||||
/// Calculates clustering ranges for querying a global-index table.
|
||||
std::vector<query::clustering_range> get_global_index_clustering_ranges(
|
||||
const query_options& options) const;
|
||||
const query_options& options, const schema& idx_tbl_schema) const;
|
||||
|
||||
/// Calculates clustering ranges for querying a global-index table for queries with token restrictions present.
|
||||
std::vector<query::clustering_range> get_global_index_token_clustering_ranges(
|
||||
const query_options& options) const;
|
||||
const query_options& options, const schema& idx_tbl_schema) const;
|
||||
|
||||
/// Calculates clustering ranges for querying a local-index table.
|
||||
std::vector<query::clustering_range> get_local_index_clustering_ranges(
|
||||
const query_options& options) const;
|
||||
|
||||
/// Finds the value of partition key of the index table
|
||||
bytes_opt value_for_index_partition_key(const query_options&) const;
|
||||
const query_options& options, const schema& idx_tbl_schema) const;
|
||||
|
||||
sstring to_string() const;
|
||||
|
||||
@@ -518,7 +416,7 @@ public:
|
||||
bool is_empty() const;
|
||||
};
|
||||
|
||||
shared_ptr<const statement_restrictions> analyze_statement_restrictions(
|
||||
statement_restrictions analyze_statement_restrictions(
|
||||
data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
statements::statement_type type,
|
||||
@@ -529,14 +427,23 @@ shared_ptr<const statement_restrictions> analyze_statement_restrictions(
|
||||
bool allow_filtering,
|
||||
check_indexes do_check_indexes);
|
||||
|
||||
shared_ptr<const statement_restrictions> make_trivial_statement_restrictions(
|
||||
schema_ptr schema,
|
||||
bool allow_filtering);
|
||||
|
||||
// Extracts all binary operators which have the given column on their left hand side.
|
||||
// Extracts only single-column restrictions.
|
||||
// Does not include multi-column restrictions.
|
||||
// Does not include token() restrictions.
|
||||
// Does not include boolean constant restrictions.
|
||||
// For example "WHERE c = 1 AND (a, c) = (2, 1) AND token(p) < 2 AND FALSE" will return {"c = 1"}.
|
||||
std::vector<expr::expression> extract_single_column_restrictions_for_column(const expr::expression&, const column_definition&);
|
||||
|
||||
|
||||
// Checks whether this expression is empty - doesn't restrict anything
|
||||
bool is_empty_restriction(const expr::expression&);
|
||||
|
||||
// Finds the value of the given column in the expression
|
||||
// In case of multiple possible values, calls on_internal_error
|
||||
bytes_opt value_for(const column_definition&, const expr::expression&, const query_options&);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -90,20 +90,6 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
|
||||
auto& current_rf_per_dc = ks.metadata()->strategy_options();
|
||||
auto new_rf_per_dc = _attrs->get_replication_options();
|
||||
new_rf_per_dc.erase(ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY);
|
||||
// Check if multi-RF change is allowed: all DC changes must be 0->N or N->0.
|
||||
auto all_changes_are_0_N = [&] {
|
||||
for (const auto& [dc, new_rf] : new_rf_per_dc) {
|
||||
auto old_rf_val = size_t(0);
|
||||
if (auto it = current_rf_per_dc.find(dc); it != current_rf_per_dc.end()) {
|
||||
old_rf_val = locator::get_replication_factor(it->second);
|
||||
}
|
||||
auto new_rf_val = locator::get_replication_factor(new_rf);
|
||||
if (old_rf_val != new_rf_val && old_rf_val != 0 && new_rf_val != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
unsigned total_abs_rfs_diff = 0;
|
||||
for (const auto& [new_dc, new_rf] : new_rf_per_dc) {
|
||||
auto old_rf = locator::replication_strategy_config_option(sstring("0"));
|
||||
@@ -117,9 +103,7 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
|
||||
// first we need to report non-existing DCs, then if RFs aren't changed by too much.
|
||||
continue;
|
||||
}
|
||||
if (total_abs_rfs_diff += get_abs_rf_diff(old_rf, new_rf); total_abs_rfs_diff >= 2 &&
|
||||
!(qp.proxy().features().keyspace_multi_rf_change && locator::uses_rack_list_exclusively(current_rf_per_dc)
|
||||
&& locator::uses_rack_list_exclusively(new_ks->strategy_options()) && all_changes_are_0_N())) {
|
||||
if (total_abs_rfs_diff += get_abs_rf_diff(old_rf, new_rf); total_abs_rfs_diff >= 2) {
|
||||
throw exceptions::invalid_request_exception("Only one DC's RF can be changed at a time and not by more than 1");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include <seastar/core/execution_stage.hh>
|
||||
#include "cas_request.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "tracing/trace_state.hh"
|
||||
#include "utils/unique_view.hh"
|
||||
|
||||
@@ -89,10 +89,6 @@ public:
|
||||
|
||||
const std::vector<single_statement>& statements() const { return _statements; }
|
||||
|
||||
audit::audit_info_ptr audit_info() const {
|
||||
return audit::audit::create_audit_info(audit::statement_category::DML, sstring(), sstring(), true);
|
||||
}
|
||||
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual uint32_t get_bound_terms() const override;
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#include "cql3/expr/evaluate.hh"
|
||||
#include "cql3/query_options.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "cql3/values.hh"
|
||||
#include "timeout_config.hh"
|
||||
#include "service/broadcast_tables/experimental/lang.hh"
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "auth/service.hh"
|
||||
#include "cql3/statements/prepared_statement.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "unimplemented.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "transport/event.hh"
|
||||
|
||||
@@ -411,10 +411,10 @@ bool ks_prop_defs::get_durable_writes() const {
|
||||
|
||||
lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(sstring ks_name, const locator::token_metadata& tm, const gms::feature_service& feat, const db::config& cfg) {
|
||||
auto sc = get_replication_strategy_class().value();
|
||||
// if tablets options have not been specified, but tablets are globally enabled, set the value to 0. The strategy will
|
||||
// validate it and throw an error if it does not support tablets.
|
||||
// if tablets options have not been specified, but tablets are globally enabled, set the value to 0 for N.T.S. only
|
||||
auto enable_tablets = feat.tablets && cfg.enable_tablets_by_default();
|
||||
std::optional<unsigned> default_initial_tablets = enable_tablets ? std::optional<unsigned>(0) : std::nullopt;
|
||||
std::optional<unsigned> default_initial_tablets = enable_tablets && locator::abstract_replication_strategy::to_qualified_class_name(sc) == "org.apache.cassandra.locator.NetworkTopologyStrategy"
|
||||
? std::optional<unsigned>(0) : std::nullopt;
|
||||
auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
|
||||
bool uses_tablets = initial_tablets.has_value();
|
||||
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
|
||||
@@ -440,7 +440,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
|
||||
sc = old->strategy_name();
|
||||
options = old_options;
|
||||
}
|
||||
return data_dictionary::keyspace_metadata::new_keyspace(old->name(), *sc, options, initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options(), {}, old->next_strategy_options_opt());
|
||||
return data_dictionary::keyspace_metadata::new_keyspace(old->name(), *sc, options, initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -626,7 +626,7 @@ modification_statement::prepare(data_dictionary::database db, prepare_context& c
|
||||
// Since this cache is only meaningful for LWT queries, just clear the ids
|
||||
// if it's not a conditional statement so that the AST nodes don't
|
||||
// participate in the caching mechanism later.
|
||||
if (!prepared_stmt->has_conditions() && prepared_stmt->_restrictions) {
|
||||
if (!prepared_stmt->has_conditions() && prepared_stmt->_restrictions.has_value()) {
|
||||
ctx.clear_pk_function_calls_cache();
|
||||
}
|
||||
prepared_stmt->_may_use_token_aware_routing = ctx.get_partition_key_bind_indexes(*schema).size() != 0;
|
||||
|
||||
@@ -94,7 +94,7 @@ private:
|
||||
std::optional<bool> _is_raw_counter_shard_write;
|
||||
|
||||
protected:
|
||||
shared_ptr<const restrictions::statement_restrictions> _restrictions;
|
||||
std::optional<restrictions::statement_restrictions> _restrictions;
|
||||
public:
|
||||
typedef std::optional<std::unordered_map<sstring, bytes_opt>> json_cache_opt;
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ public:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
|
||||
@@ -109,7 +109,7 @@ public:
|
||||
std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats, const cql_config& cfg, bool for_view);
|
||||
private:
|
||||
std::vector<selection::prepared_selector> maybe_jsonize_select_clause(std::vector<selection::prepared_selector> select, data_dictionary::database db, schema_ptr schema);
|
||||
::shared_ptr<const restrictions::statement_restrictions> prepare_restrictions(
|
||||
::shared_ptr<restrictions::statement_restrictions> prepare_restrictions(
|
||||
data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
prepare_context& ctx,
|
||||
|
||||
@@ -1027,7 +1027,7 @@ view_indexed_table_select_statement::prepare(data_dictionary::database db,
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
@@ -1139,7 +1139,7 @@ lw_shared_ptr<const service::pager::paging_state> view_indexed_table_select_stat
|
||||
auto& last_base_pk = last_pos.partition;
|
||||
auto* last_base_ck = last_pos.position.has_key() ? &last_pos.position.key() : nullptr;
|
||||
|
||||
bytes_opt indexed_column_value = _restrictions->value_for_index_partition_key(options);
|
||||
bytes_opt indexed_column_value = restrictions::value_for(*cdef, _used_index_restrictions, options);
|
||||
|
||||
auto index_pk = [&]() {
|
||||
if (_index.metadata().local()) {
|
||||
@@ -1350,7 +1350,12 @@ dht::partition_range_vector view_indexed_table_select_statement::get_partition_r
|
||||
dht::partition_range_vector view_indexed_table_select_statement::get_partition_ranges_for_global_index_posting_list(const query_options& options) const {
|
||||
dht::partition_range_vector partition_ranges;
|
||||
|
||||
bytes_opt value = _restrictions->value_for_index_partition_key(options);
|
||||
const column_definition* cdef = _schema->get_column_definition(to_bytes(_index.target_column()));
|
||||
if (!cdef) {
|
||||
throw exceptions::invalid_request_exception("Indexed column not found in schema");
|
||||
}
|
||||
|
||||
bytes_opt value = restrictions::value_for(*cdef, _used_index_restrictions, options);
|
||||
if (value) {
|
||||
auto pk = partition_key::from_single_value(*_view_schema, *value);
|
||||
auto dk = dht::decorate_key(*_view_schema, pk);
|
||||
@@ -1369,11 +1374,11 @@ query::partition_slice view_indexed_table_select_statement::get_partition_slice_
|
||||
// Only EQ restrictions on base partition key can be used in an index view query
|
||||
if (pk_restrictions_is_single && _restrictions->partition_key_restrictions_is_all_eq()) {
|
||||
partition_slice_builder.with_ranges(
|
||||
_restrictions->get_global_index_clustering_ranges(options));
|
||||
_restrictions->get_global_index_clustering_ranges(options, *_view_schema));
|
||||
} else if (_restrictions->has_token_restrictions()) {
|
||||
// Restrictions like token(p1, p2) < 0 have all partition key components restricted, but require special handling.
|
||||
partition_slice_builder.with_ranges(
|
||||
_restrictions->get_global_index_token_clustering_ranges(options));
|
||||
_restrictions->get_global_index_token_clustering_ranges(options, *_view_schema));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1384,7 +1389,7 @@ query::partition_slice view_indexed_table_select_statement::get_partition_slice_
|
||||
partition_slice_builder partition_slice_builder{*_view_schema};
|
||||
|
||||
partition_slice_builder.with_ranges(
|
||||
_restrictions->get_local_index_clustering_ranges(options));
|
||||
_restrictions->get_local_index_clustering_ranges(options, *_view_schema));
|
||||
|
||||
return partition_slice_builder.build();
|
||||
}
|
||||
@@ -1602,7 +1607,7 @@ public:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
@@ -1640,7 +1645,7 @@ private:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const select_statement::parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
parallelized_select_statement::ordering_comparator_type ordering_comparator,
|
||||
@@ -2071,7 +2076,7 @@ static select_statement::ordering_comparator_type get_similarity_ordering_compar
|
||||
|
||||
::shared_ptr<cql3::statements::select_statement> vector_indexed_table_select_statement::prepare(data_dictionary::database db, schema_ptr schema,
|
||||
uint32_t bound_terms, lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
|
||||
std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<attributes> attrs) {
|
||||
|
||||
@@ -2584,7 +2589,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
|
||||
return make_unique<prepared_statement>(audit_info(), std::move(stmt), ctx, std::move(partition_key_bind_indices), std::move(warnings));
|
||||
}
|
||||
|
||||
::shared_ptr<const restrictions::statement_restrictions>
|
||||
::shared_ptr<restrictions::statement_restrictions>
|
||||
select_statement::prepare_restrictions(data_dictionary::database db,
|
||||
schema_ptr schema,
|
||||
prepare_context& ctx,
|
||||
@@ -2594,8 +2599,8 @@ select_statement::prepare_restrictions(data_dictionary::database db,
|
||||
restrictions::check_indexes do_check_indexes)
|
||||
{
|
||||
try {
|
||||
return restrictions::analyze_statement_restrictions(db, schema, statement_type::SELECT, _where_clause, ctx,
|
||||
selection->contains_only_static_columns(), for_view, allow_filtering, do_check_indexes);
|
||||
return ::make_shared<restrictions::statement_restrictions>(restrictions::analyze_statement_restrictions(db, schema, statement_type::SELECT, _where_clause, ctx,
|
||||
selection->contains_only_static_columns(), for_view, allow_filtering, do_check_indexes));
|
||||
} catch (const exceptions::unrecognized_entity_exception& e) {
|
||||
if (contains_alias(e.entity)) {
|
||||
throw exceptions::invalid_request_exception(format("Aliases aren't allowed in the WHERE clause (name: '{}')", e.entity));
|
||||
|
||||
@@ -200,7 +200,7 @@ public:
|
||||
uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions,
|
||||
::shared_ptr<std::vector<size_t>> group_by_cell_indices,
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
@@ -372,7 +372,7 @@ public:
|
||||
|
||||
static ::shared_ptr<cql3::statements::select_statement> prepare(data_dictionary::database db, schema_ptr schema, uint32_t bound_terms,
|
||||
lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<const restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
|
||||
std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<cql3::attributes> attrs);
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "cql3/cql_statement.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "raw/parsed_statement.hh"
|
||||
#include "service/qos/qos_common.hh"
|
||||
#include "service/query_state.hh"
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "cql3/cql_statement.hh"
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "unimplemented.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include <optional>
|
||||
#include "validation.hh"
|
||||
|
||||
@@ -66,7 +66,7 @@ public:
|
||||
: update_statement(std::move(audit_info), statement_type::INSERT, bound_terms, s, std::move(attrs), stats)
|
||||
, _value(std::move(v))
|
||||
, _default_unset(default_unset) {
|
||||
_restrictions = cql3::restrictions::make_trivial_statement_restrictions(s, false);
|
||||
_restrictions = restrictions::statement_restrictions(s, false);
|
||||
}
|
||||
private:
|
||||
virtual void execute_operations_for_key(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const json_cache_opt& json_cache) const override;
|
||||
|
||||
@@ -224,12 +224,10 @@ keyspace_metadata::keyspace_metadata(std::string_view name,
|
||||
bool durable_writes,
|
||||
std::vector<schema_ptr> cf_defs,
|
||||
user_types_metadata user_types,
|
||||
storage_options storage_opts,
|
||||
std::optional<locator::replication_strategy_config_options> next_options)
|
||||
storage_options storage_opts)
|
||||
: _name{name}
|
||||
, _strategy_name{locator::abstract_replication_strategy::to_qualified_class_name(strategy_name.empty() ? "NetworkTopologyStrategy" : strategy_name)}
|
||||
, _strategy_options{std::move(strategy_options)}
|
||||
, _next_strategy_options{std::move(next_options)}
|
||||
, _initial_tablets(initial_tablets)
|
||||
, _durable_writes{durable_writes}
|
||||
, _user_types{std::move(user_types)}
|
||||
@@ -275,15 +273,14 @@ keyspace_metadata::new_keyspace(std::string_view name,
|
||||
std::optional<consistency_config_option> consistency_option,
|
||||
bool durables_writes,
|
||||
storage_options storage_opts,
|
||||
std::vector<schema_ptr> cf_defs,
|
||||
std::optional<locator::replication_strategy_config_options> next_options)
|
||||
std::vector<schema_ptr> cf_defs)
|
||||
{
|
||||
return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, initial_tablets, consistency_option, durables_writes, cf_defs, user_types_metadata{}, storage_opts, next_options);
|
||||
return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, initial_tablets, consistency_option, durables_writes, cf_defs, user_types_metadata{}, storage_opts);
|
||||
}
|
||||
|
||||
lw_shared_ptr<keyspace_metadata>
|
||||
keyspace_metadata::new_keyspace(const keyspace_metadata& ksm) {
|
||||
return new_keyspace(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(), ksm.get_storage_options(), {}, ksm.next_strategy_options_opt());
|
||||
return new_keyspace(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(), ksm.get_storage_options());
|
||||
}
|
||||
|
||||
void keyspace_metadata::add_user_type(const user_type ut) {
|
||||
@@ -652,8 +649,8 @@ struct fmt::formatter<data_dictionary::user_types_metadata> {
|
||||
};
|
||||
|
||||
auto fmt::formatter<data_dictionary::keyspace_metadata>::format(const data_dictionary::keyspace_metadata& m, fmt::format_context& ctx) const -> decltype(ctx.out()) {
|
||||
fmt::format_to(ctx.out(), "KSMetaData{{name={}, strategyClass={}, strategyOptions={}, nextStrategyOptions={}, cfMetaData={}, durable_writes={}, tablets=",
|
||||
m.name(), m.strategy_name(), m.strategy_options(), m.next_strategy_options_opt(), m.cf_meta_data(), m.durable_writes());
|
||||
fmt::format_to(ctx.out(), "KSMetaData{{name={}, strategyClass={}, strategyOptions={}, cfMetaData={}, durable_writes={}, tablets=",
|
||||
m.name(), m.strategy_name(), m.strategy_options(), m.cf_meta_data(), m.durable_writes());
|
||||
if (m.initial_tablets()) {
|
||||
if (auto initial_tablets = m.initial_tablets().value()) {
|
||||
fmt::format_to(ctx.out(), "{{\"initial\":{}}}", initial_tablets);
|
||||
|
||||
@@ -28,9 +28,7 @@ namespace data_dictionary {
|
||||
class keyspace_metadata final {
|
||||
sstring _name;
|
||||
sstring _strategy_name;
|
||||
// If _next_strategy_options has value, there is ongoing rf change of this keyspace.
|
||||
locator::replication_strategy_config_options _strategy_options;
|
||||
std::optional<locator::replication_strategy_config_options> _next_strategy_options;
|
||||
std::optional<unsigned> _initial_tablets;
|
||||
std::unordered_map<sstring, schema_ptr> _cf_meta_data;
|
||||
bool _durable_writes;
|
||||
@@ -46,8 +44,7 @@ public:
|
||||
bool durable_writes,
|
||||
std::vector<schema_ptr> cf_defs = std::vector<schema_ptr>{},
|
||||
user_types_metadata user_types = user_types_metadata{},
|
||||
storage_options storage_opts = storage_options{},
|
||||
std::optional<locator::replication_strategy_config_options> next_options = std::nullopt);
|
||||
storage_options storage_opts = storage_options{});
|
||||
static lw_shared_ptr<keyspace_metadata>
|
||||
new_keyspace(std::string_view name,
|
||||
std::string_view strategy_name,
|
||||
@@ -56,8 +53,7 @@ public:
|
||||
std::optional<consistency_config_option> consistency_option,
|
||||
bool durables_writes = true,
|
||||
storage_options storage_opts = {},
|
||||
std::vector<schema_ptr> cf_defs = {},
|
||||
std::optional<locator::replication_strategy_config_options> next_options = std::nullopt);
|
||||
std::vector<schema_ptr> cf_defs = {});
|
||||
static lw_shared_ptr<keyspace_metadata>
|
||||
new_keyspace(const keyspace_metadata& ksm);
|
||||
void validate(const gms::feature_service&, const locator::topology&) const;
|
||||
@@ -70,18 +66,6 @@ public:
|
||||
const locator::replication_strategy_config_options& strategy_options() const {
|
||||
return _strategy_options;
|
||||
}
|
||||
void set_strategy_options(const locator::replication_strategy_config_options& options) {
|
||||
_strategy_options = options;
|
||||
}
|
||||
const std::optional<locator::replication_strategy_config_options>& next_strategy_options_opt() const {
|
||||
return _next_strategy_options;
|
||||
}
|
||||
void set_next_strategy_options(const locator::replication_strategy_config_options& options) {
|
||||
_next_strategy_options = options;
|
||||
}
|
||||
void clear_next_strategy_options() {
|
||||
_next_strategy_options = std::nullopt;
|
||||
}
|
||||
locator::replication_strategy_config_options strategy_options_v1() const;
|
||||
std::optional<unsigned> initial_tablets() const {
|
||||
return _initial_tablets;
|
||||
|
||||
18
db/config.cc
18
db/config.cc
@@ -330,14 +330,14 @@ const config_type& config_type_for<std::vector<db::config::error_injection_at_st
|
||||
}
|
||||
|
||||
template <>
|
||||
const config_type& config_type_for<enum_option<netw::dict_training_loop::when>>() {
|
||||
const config_type& config_type_for<enum_option<netw::dict_training_when>>() {
|
||||
static config_type ct(
|
||||
"dictionary training conditions", printable_to_json<enum_option<netw::dict_training_loop::when>>);
|
||||
"dictionary training conditions", printable_to_json<enum_option<netw::dict_training_when>>);
|
||||
return ct;
|
||||
}
|
||||
|
||||
template <>
|
||||
const config_type& config_type_for<netw::advanced_rpc_compressor::tracker::algo_config>() {
|
||||
const config_type& config_type_for<netw::algo_config>() {
|
||||
static config_type ct(
|
||||
"advanced rpc compressor config", printable_vector_to_json<enum_option<netw::compression_algorithm>>);
|
||||
return ct;
|
||||
@@ -530,9 +530,9 @@ struct convert<db::config::error_injection_at_startup> {
|
||||
|
||||
|
||||
template <>
|
||||
class convert<enum_option<netw::dict_training_loop::when>> {
|
||||
class convert<enum_option<netw::dict_training_when>> {
|
||||
public:
|
||||
static bool decode(const Node& node, enum_option<netw::dict_training_loop::when>& rhs) {
|
||||
static bool decode(const Node& node, enum_option<netw::dict_training_when>& rhs) {
|
||||
std::string name;
|
||||
if (!convert<std::string>::decode(node, name)) {
|
||||
return false;
|
||||
@@ -1110,7 +1110,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"Specifies RPC compression algorithms supported by this node. ")
|
||||
, internode_compression_enable_advanced(this, "internode_compression_enable_advanced", liveness::MustRestart, value_status::Used, false,
|
||||
"Enables the new implementation of RPC compression. If disabled, Scylla will fall back to the old implementation.")
|
||||
, rpc_dict_training_when(this, "rpc_dict_training_when", liveness::LiveUpdate, value_status::Used, netw::dict_training_loop::when::type::NEVER,
|
||||
, rpc_dict_training_when(this, "rpc_dict_training_when", liveness::LiveUpdate, value_status::Used, netw::dict_training_when::type::NEVER,
|
||||
"Specifies when RPC compression dictionary training is performed by this node.\n"
|
||||
"* `never` disables it unconditionally.\n"
|
||||
"* `when_leader` enables it only whenever the node is the Raft leader.\n"
|
||||
@@ -2025,8 +2025,8 @@ template struct utils::config_file::named_value<enum_option<db::experimental_fea
|
||||
template struct utils::config_file::named_value<enum_option<db::replication_strategy_restriction_t>>;
|
||||
template struct utils::config_file::named_value<enum_option<db::consistency_level_restriction_t>>;
|
||||
template struct utils::config_file::named_value<enum_option<db::tablets_mode_t>>;
|
||||
template struct utils::config_file::named_value<enum_option<netw::dict_training_loop::when>>;
|
||||
template struct utils::config_file::named_value<netw::advanced_rpc_compressor::tracker::algo_config>;
|
||||
template struct utils::config_file::named_value<enum_option<netw::dict_training_when>>;
|
||||
template struct utils::config_file::named_value<netw::algo_config>;
|
||||
template struct utils::config_file::named_value<std::vector<enum_option<db::experimental_features_t>>>;
|
||||
template struct utils::config_file::named_value<std::vector<enum_option<db::replication_strategy_restriction_t>>>;
|
||||
template struct utils::config_file::named_value<std::vector<enum_option<db::consistency_level_restriction_t>>>;
|
||||
@@ -2094,7 +2094,7 @@ future<gms::inet_address> resolve(const config_file::named_value<sstring>& addre
|
||||
}
|
||||
}
|
||||
|
||||
co_return coroutine::exception(std::move(ex));
|
||||
co_return seastar::coroutine::exception(std::move(ex));
|
||||
}
|
||||
|
||||
static std::vector<seastar::metrics::relabel_config> get_relable_from_yaml(const YAML::Node& yaml, const std::string& name) {
|
||||
|
||||
14
db/config.hh
14
db/config.hh
@@ -9,6 +9,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <filesystem>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <seastar/core/sstring.hh>
|
||||
@@ -16,15 +17,14 @@
|
||||
#include <seastar/util/program-options.hh>
|
||||
#include <seastar/util/log.hh>
|
||||
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/replication_strategy_type.hh"
|
||||
#include "seastarx.hh"
|
||||
#include "utils/config_file.hh"
|
||||
#include "utils/enum_option.hh"
|
||||
#include "gms/inet_address.hh"
|
||||
#include "db/hints/host_filter.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "message/dict_trainer.hh"
|
||||
#include "message/advanced_rpc_compressor.hh"
|
||||
#include "message/rpc_compression_types.hh"
|
||||
#include "db/consistency_level_type.hh"
|
||||
#include "db/tri_mode_restriction.hh"
|
||||
#include "sstables/compressor.hh"
|
||||
@@ -325,9 +325,9 @@ public:
|
||||
named_value<uint32_t> internode_compression_zstd_min_message_size;
|
||||
named_value<uint32_t> internode_compression_zstd_max_message_size;
|
||||
named_value<bool> internode_compression_checksumming;
|
||||
named_value<netw::advanced_rpc_compressor::tracker::algo_config> internode_compression_algorithms;
|
||||
named_value<netw::algo_config> internode_compression_algorithms;
|
||||
named_value<bool> internode_compression_enable_advanced;
|
||||
named_value<enum_option<netw::dict_training_loop::when>> rpc_dict_training_when;
|
||||
named_value<enum_option<netw::dict_training_when>> rpc_dict_training_when;
|
||||
named_value<uint32_t> rpc_dict_training_min_time_seconds;
|
||||
named_value<uint64_t> rpc_dict_training_min_bytes;
|
||||
named_value<bool> inter_dc_tcp_nodelay;
|
||||
@@ -739,8 +739,8 @@ extern template struct utils::config_file::named_value<enum_option<db::experimen
|
||||
extern template struct utils::config_file::named_value<enum_option<db::replication_strategy_restriction_t>>;
|
||||
extern template struct utils::config_file::named_value<enum_option<db::consistency_level_restriction_t>>;
|
||||
extern template struct utils::config_file::named_value<enum_option<db::tablets_mode_t>>;
|
||||
extern template struct utils::config_file::named_value<enum_option<netw::dict_training_loop::when>>;
|
||||
extern template struct utils::config_file::named_value<netw::advanced_rpc_compressor::tracker::algo_config>;
|
||||
extern template struct utils::config_file::named_value<enum_option<netw::dict_training_when>>;
|
||||
extern template struct utils::config_file::named_value<netw::algo_config>;
|
||||
extern template struct utils::config_file::named_value<std::vector<enum_option<db::experimental_features_t>>>;
|
||||
extern template struct utils::config_file::named_value<std::vector<enum_option<db::replication_strategy_restriction_t>>>;
|
||||
extern template struct utils::config_file::named_value<std::vector<enum_option<db::consistency_level_restriction_t>>>;
|
||||
|
||||
@@ -277,7 +277,7 @@ filter_for_query(consistency_level cl,
|
||||
|
||||
host_id_vector_replica_set selected_endpoints;
|
||||
|
||||
// Preselect endpoints based on client preference. If the endpoints
|
||||
// Pre-select endpoints based on client preference. If the endpoints
|
||||
// selected this way aren't enough to satisfy CL requirements select the
|
||||
// remaining ones according to the load-balancing strategy as before.
|
||||
if (!preferred_endpoints.empty()) {
|
||||
|
||||
@@ -33,11 +33,6 @@ enum class schema_feature {
|
||||
|
||||
// Per-table tablet options
|
||||
TABLET_OPTIONS,
|
||||
|
||||
// When enabled, `system_schema.keyspaces` will keep three replication values:
|
||||
// the initial, the current, and the target replication factor,
|
||||
// which reflect the phases of the multi RF change.
|
||||
KEYSPACE_MULTI_RF_CHANGE,
|
||||
};
|
||||
|
||||
using schema_features = enum_set<super_enum<schema_feature,
|
||||
@@ -48,8 +43,7 @@ using schema_features = enum_set<super_enum<schema_feature,
|
||||
schema_feature::TABLE_DIGEST_INSENSITIVE_TO_EXPIRY,
|
||||
schema_feature::GROUP0_SCHEMA_VERSIONING,
|
||||
schema_feature::IN_MEMORY_TABLES,
|
||||
schema_feature::TABLET_OPTIONS,
|
||||
schema_feature::KEYSPACE_MULTI_RF_CHANGE
|
||||
schema_feature::TABLET_OPTIONS
|
||||
>>;
|
||||
|
||||
}
|
||||
|
||||
@@ -216,7 +216,6 @@ schema_ptr keyspaces() {
|
||||
{"durable_writes", boolean_type},
|
||||
{"replication", map_type_impl::get_instance(utf8_type, utf8_type, false)},
|
||||
{"replication_v2", map_type_impl::get_instance(utf8_type, utf8_type, false)}, // with rack list RF
|
||||
{"next_replication", map_type_impl::get_instance(utf8_type, utf8_type, false)}, // target rack list RF for this RF change
|
||||
},
|
||||
// static columns
|
||||
{},
|
||||
@@ -1179,14 +1178,6 @@ utils::chunked_vector<mutation> make_create_keyspace_mutations(schema_features f
|
||||
// If the maps are different, the upgrade must be already done.
|
||||
store_map(m, ckey, "replication_v2", timestamp, cql3::statements::to_flattened_map(map));
|
||||
}
|
||||
if (features.contains<schema_feature::KEYSPACE_MULTI_RF_CHANGE>()) {
|
||||
const auto& next_map_opt = keyspace->next_strategy_options_opt();
|
||||
if (next_map_opt) {
|
||||
auto next_map = *next_map_opt;
|
||||
next_map["class"] = keyspace->strategy_name();
|
||||
store_map(m, ckey, "next_replication", timestamp, cql3::statements::to_flattened_map(next_map));
|
||||
}
|
||||
}
|
||||
|
||||
if (features.contains<schema_feature::SCYLLA_KEYSPACES>()) {
|
||||
schema_ptr scylla_keyspaces_s = scylla_keyspaces();
|
||||
@@ -1260,7 +1251,6 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
|
||||
// (or screw up shared pointers)
|
||||
const auto& replication = row.get_nonnull<map_type_impl::native_type>("replication");
|
||||
const auto& replication_v2 = row.get<map_type_impl::native_type>("replication_v2");
|
||||
const auto& next_replication = row.get<map_type_impl::native_type>("next_replication");
|
||||
|
||||
cql3::statements::property_definitions::map_type flat_strategy_options;
|
||||
for (auto& p : replication_v2 ? *replication_v2 : replication) {
|
||||
@@ -1269,17 +1259,6 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
|
||||
auto strategy_options = cql3::statements::from_flattened_map(flat_strategy_options);
|
||||
auto strategy_name = std::get<sstring>(strategy_options["class"]);
|
||||
strategy_options.erase("class");
|
||||
|
||||
std::optional<cql3::statements::property_definitions::extended_map_type> next_strategy_options = std::nullopt;
|
||||
if (next_replication) {
|
||||
cql3::statements::property_definitions::map_type flat_next_replication;
|
||||
for (auto& p : *next_replication) {
|
||||
flat_next_replication.emplace(value_cast<sstring>(p.first), value_cast<sstring>(p.second));
|
||||
}
|
||||
next_strategy_options = cql3::statements::from_flattened_map(flat_next_replication);
|
||||
next_strategy_options->erase("class");
|
||||
}
|
||||
|
||||
bool durable_writes = row.get_nonnull<bool>("durable_writes");
|
||||
|
||||
data_dictionary::storage_options storage_opts;
|
||||
@@ -1305,7 +1284,7 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
|
||||
}
|
||||
}
|
||||
}
|
||||
co_return keyspace_metadata::new_keyspace(keyspace_name, strategy_name, strategy_options, initial_tablets, consistency, durable_writes, storage_opts, {}, next_strategy_options);
|
||||
co_return keyspace_metadata::new_keyspace(keyspace_name, strategy_name, strategy_options, initial_tablets, consistency, durable_writes, storage_opts);
|
||||
}
|
||||
|
||||
template<typename V>
|
||||
|
||||
@@ -300,7 +300,6 @@ schema_ptr system_keyspace::topology() {
|
||||
.with_column("upgrade_state", utf8_type, column_kind::static_column)
|
||||
.with_column("global_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.with_column("paused_rf_change_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.with_column("ongoing_rf_changes", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.set_comment("Current state of topology change machine")
|
||||
.with_hash_version()
|
||||
.build();
|
||||
@@ -3351,12 +3350,6 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("ongoing_rf_changes")) {
|
||||
for (auto&& v : deserialize_set_column(*topology(), some_row, "ongoing_rf_changes")) {
|
||||
ret.ongoing_rf_changes.insert(value_cast<utils::UUID>(v));
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("enabled_features")) {
|
||||
ret.enabled_features = decode_features(deserialize_set_column(*topology(), some_row, "enabled_features"));
|
||||
}
|
||||
|
||||
@@ -15,10 +15,11 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "db/view/view_build_status.hh"
|
||||
#include "gms/gossiper.hh"
|
||||
#include "gms/inet_address.hh"
|
||||
#include "gms/generation-number.hh"
|
||||
#include "gms/loaded_endpoint_state.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "query/query-result-set.hh"
|
||||
#include "db_clock.hh"
|
||||
#include "mutation_query.hh"
|
||||
#include "system_keyspace_view_types.hh"
|
||||
@@ -36,6 +37,10 @@ namespace netw {
|
||||
class shared_dict;
|
||||
};
|
||||
|
||||
namespace query {
|
||||
class result_set;
|
||||
}
|
||||
|
||||
namespace sstables {
|
||||
struct entry_descriptor;
|
||||
class generation_type;
|
||||
|
||||
@@ -29,6 +29,8 @@
|
||||
|
||||
#include "db/config.hh"
|
||||
#include "db/view/base_info.hh"
|
||||
#include "gms/gossiper.hh"
|
||||
#include "query/query-result-set.hh"
|
||||
#include "db/view/view_build_status.hh"
|
||||
#include "db/view/view_consumer.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
@@ -1584,11 +1586,9 @@ future<stop_iteration> view_update_builder::on_results() {
|
||||
|
||||
auto tombstone = std::max(_update_partition_tombstone, _update_current_tombstone);
|
||||
if (tombstone && _existing && !_existing->is_end_of_partition()) {
|
||||
if (_existing->is_range_tombstone_change()) {
|
||||
_existing_current_tombstone = _existing->as_range_tombstone_change().tombstone();
|
||||
} else if (_existing->is_clustering_row()) {
|
||||
// We don't care if it's a range tombstone, as we're only looking for existing entries that get deleted
|
||||
if (_existing->is_clustering_row()) {
|
||||
auto existing = clustering_row(*_schema, _existing->as_clustering_row());
|
||||
existing.apply(std::max(_existing_partition_tombstone, _existing_current_tombstone));
|
||||
auto update = clustering_row(existing.key(), row_tombstone(std::move(tombstone)), row_marker(), ::row());
|
||||
generate_update(std::move(update), { std::move(existing) });
|
||||
} else if (_existing->is_static_row()) {
|
||||
@@ -1599,10 +1599,9 @@ future<stop_iteration> view_update_builder::on_results() {
|
||||
return should_stop_updates() ? stop() : advance_existings();
|
||||
}
|
||||
|
||||
// If we have updates and it's a range tombstone, it removes nothing pre-existing, so we can ignore it
|
||||
if (_update && !_update->is_end_of_partition()) {
|
||||
if (_update->is_range_tombstone_change()) {
|
||||
_update_current_tombstone = _update->as_range_tombstone_change().tombstone();
|
||||
} else if (_update->is_clustering_row()) {
|
||||
if (_update->is_clustering_row()) {
|
||||
_update->mutate_as_clustering_row(*_schema, [&] (clustering_row& cr) mutable {
|
||||
cr.apply(std::max(_update_partition_tombstone, _update_current_tombstone));
|
||||
});
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/coroutine/parallel_for_each.hh>
|
||||
#include <seastar/core/on_internal_error.hh>
|
||||
#include "gms/gossiper.hh"
|
||||
#include "db/view/view_building_coordinator.hh"
|
||||
#include "db/view/view_build_status.hh"
|
||||
#include "locator/tablets.hh"
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
#include "raft/raft.hh"
|
||||
#include "raft/raft_fwd.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
#include "service/raft/raft_group0.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
|
||||
@@ -21,6 +21,8 @@
|
||||
#include "dht/token.hh"
|
||||
#include "replica/database.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
#include "service/raft/raft_group0.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
#include <flat_set>
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "raft/raft.hh"
|
||||
#include "raft/raft_fwd.hh"
|
||||
#include <seastar/core/gate.hh>
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "sstables/shared_sstable.hh"
|
||||
|
||||
@@ -240,9 +240,6 @@ future<> view_update_generator::process_staging_sstables(lw_shared_ptr<replica::
|
||||
_progress_tracker->on_sstable_registration(sst);
|
||||
}
|
||||
|
||||
utils::get_local_injector().inject("view_update_generator_pause_before_processing",
|
||||
utils::wait_for_message(std::chrono::minutes(5))).get();
|
||||
|
||||
// Generate view updates from staging sstables
|
||||
auto start_time = db_clock::now();
|
||||
auto [result, input_size] = generate_updates_from_staging_sstables(table, sstables);
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "cdc/metadata.hh"
|
||||
#include "db/config.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "query/query-result-set.hh"
|
||||
#include "db/virtual_table.hh"
|
||||
#include "partition_slice_builder.hh"
|
||||
#include "db/virtual_tables.hh"
|
||||
|
||||
@@ -271,7 +271,7 @@ The json structure is as follows:
|
||||
}
|
||||
|
||||
The `manifest` member contains the following attributes:
|
||||
- `version` - representing the version of the manifest itself. It is incremented when members are added or removed from the manifest.
|
||||
- `version` - respresenting the version of the manifest itself. It is incremented when members are added or removed from the manifest.
|
||||
- `scope` - the scope of metadata stored in this manifest file. The following scopes are supported:
|
||||
- `node` - the manifest describes all SSTables owned by this node in this snapshot.
|
||||
|
||||
|
||||
@@ -12,9 +12,7 @@ Schema:
|
||||
CREATE TABLE system_schema.keyspaces (
|
||||
keyspace_name text PRIMARY KEY,
|
||||
durable_writes boolean,
|
||||
replication frozen<map<text, text>>,
|
||||
replication_v2 frozen<map<text, text>>,
|
||||
next_replication frozen<map<text, text>>
|
||||
replication frozen<map<text, text>>
|
||||
)
|
||||
```
|
||||
|
||||
@@ -33,8 +31,6 @@ Columns:
|
||||
stored as a flattened map of the extended options map (see below).
|
||||
|
||||
For `SimpleStrategy` there is a single option `"replication_factor"` specifying the replication factor.
|
||||
* `next_replication` - the target replication factor for the keyspace during an RF change.
|
||||
If there is no ongoing RF change, the `next_replication` value is not set.
|
||||
|
||||
Extended options map used by NetworkTopologyStrategy is a map where values can be either strings or lists of strings.
|
||||
|
||||
|
||||
@@ -146,25 +146,6 @@ AWS Security Token Service (STS) or the EC2 Instance Metadata Service.
|
||||
- When set, these values are used by the S3 client to sign requests.
|
||||
- If not set, requests are sent unsigned, which may not be accepted by all servers.
|
||||
|
||||
.. _admin-oci-object-storage:
|
||||
|
||||
Using Oracle OCI Object Storage
|
||||
=================================
|
||||
|
||||
Oracle Cloud Infrastructure (OCI) Object Storage is compatible with the Amazon
|
||||
S3 API, so it works with ScyllaDB without additional configuration.
|
||||
|
||||
To use OCI Object Storage, follow the same configuration as for AWS S3, and
|
||||
specify your OCI S3-compatible endpoint.
|
||||
|
||||
Example:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
object_storage_endpoints:
|
||||
- name: https://idedxcgnkfkt.compat.objectstorage.us-ashburn-1.oci.customer-oci.com:443
|
||||
aws_region: us-ashburn-1
|
||||
|
||||
.. _admin-compression:
|
||||
|
||||
Compression
|
||||
|
||||
@@ -231,46 +231,6 @@ Add New DC
|
||||
|
||||
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
|
||||
|
||||
If the keyspace uses rack list replication, update the replication factor in one ``ALTER KEYSPACE`` statement, under the following rules:
|
||||
* Existing datacenters must keep their current replication factor.
|
||||
* A new datacenter can be assigned a replication factor (**0 to N**).
|
||||
* An existing datacenter can be removed (**N to 0**).
|
||||
|
||||
.. warning::
|
||||
|
||||
While adding a new datacenter and altering keyspaces, do **not** perform any reads or writes that involve the new datacenter.
|
||||
In particular, avoid using global consistency levels (such as ``ALL``, ``EACH_QUORUM``) that would include the new datacenter in the operation.
|
||||
Use ``LOCAL_*`` consistency levels (e.g., ``LOCAL_QUORUM``, ``LOCAL_ONE``) until the new datacenter is fully operational.
|
||||
|
||||
Before
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE mykeyspace4;
|
||||
|
||||
CREATE KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
The following is **not** allowed because it changes the replication factor of ``<existing_dc>`` (adds ``<existing_rack4>``) and adds ``<new_dc>`` in the same statement:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>', '<existing_rack4>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
Add all the nodes to the new datacenter and then:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
After
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE mykeyspace4;
|
||||
CREATE KEYSPACE mykeyspace4 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
You can abort the keyspace alteration using :doc:`Task manager </operating-scylla/admin-tools/task-manager>`.
|
||||
|
||||
#. If any vnode keyspace was altered, run ``nodetool rebuild`` on each node in the new datacenter, specifying the existing datacenter name in the rebuild command.
|
||||
|
||||
For example:
|
||||
|
||||
@@ -102,34 +102,6 @@ Procedure
|
||||
|
||||
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
|
||||
|
||||
If the keyspace uses rack list replication, update the replication factor in one ``ALTER KEYSPACE`` statement, under the following rules:
|
||||
* Existing datacenters must keep their current replication factor.
|
||||
* An existing datacenter can be removed (**N to 0**).
|
||||
* A new datacenter can be assigned a replication factor (**0 to N**).
|
||||
|
||||
.. warning::
|
||||
|
||||
While removing a datacenter and altering keyspaces, do **not** perform any reads or writes that involve the datacenter being removed.
|
||||
In particular, avoid using global consistency levels (such as ``ALL``, ``EACH_QUORUM``) that would include the decommissioned datacenter in the operation.
|
||||
Use ``LOCAL_*`` consistency levels (e.g., ``LOCAL_QUORUM``, ``LOCAL_ONE``) until the datacenter is fully decommissioned.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> DESCRIBE nba4
|
||||
cqlsh> CREATE KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4', 'RAC5'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
|
||||
|
||||
The following is **not** allowed because it changes the replication factor of ``EUROPE-DC`` (adds ``RAC9``) and removes ``ASIA-DC`` in the same statement:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> ALTER KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8', 'RAC9']} AND tablets = { 'enabled': true };
|
||||
|
||||
Remove all replicas from the decommissioned datacenter:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> ALTER KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
|
||||
|
||||
.. note::
|
||||
|
||||
If table audit is enabled, the ``audit`` keyspace is automatically created with ``NetworkTopologyStrategy``.
|
||||
@@ -141,10 +113,6 @@ Procedure
|
||||
|
||||
Failure to do so will result in decommission errors such as "zero replica after the removal".
|
||||
|
||||
.. warning::
|
||||
|
||||
Removal of replicas from a datacenter cannot be aborted. To get back to the previous replication, wait until the ALTER KEYSPACE finishes and then add the replicas back by running another ALTER KEYSPACE statement.
|
||||
|
||||
#. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
|
||||
Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <seastar/core/seastar.hh>
|
||||
#include <seastar/core/smp.hh>
|
||||
#include "db/schema_features.hh"
|
||||
#include "utils/log.hh"
|
||||
#include "gms/feature.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
@@ -180,7 +179,6 @@ db::schema_features feature_service::cluster_schema_features() const {
|
||||
f.set<db::schema_feature::GROUP0_SCHEMA_VERSIONING>();
|
||||
f.set_if<db::schema_feature::IN_MEMORY_TABLES>(bool(in_memory_tables));
|
||||
f.set_if<db::schema_feature::TABLET_OPTIONS>(bool(tablet_options));
|
||||
f.set_if<db::schema_feature::KEYSPACE_MULTI_RF_CHANGE>(bool(keyspace_multi_rf_change));
|
||||
return f;
|
||||
}
|
||||
|
||||
|
||||
@@ -182,7 +182,6 @@ public:
|
||||
gms::feature writetime_ttl_individual_element { *this, "WRITETIME_TTL_INDIVIDUAL_ELEMENT"sv };
|
||||
gms::feature arbitrary_tablet_boundaries { *this, "ARBITRARY_TABLET_BOUNDARIES"sv };
|
||||
gms::feature large_data_virtual_tables { *this, "LARGE_DATA_VIRTUAL_TABLES"sv };
|
||||
gms::feature keyspace_multi_rf_change { *this, "KEYSPACE_MULTI_RF_CHANGE"sv };
|
||||
public:
|
||||
|
||||
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "locator/token_metadata.hh"
|
||||
#include "locator/types.hh"
|
||||
#include "gms/gossip_address_map.hh"
|
||||
#include "gms/loaded_endpoint_state.hh"
|
||||
|
||||
namespace gms {
|
||||
|
||||
@@ -71,11 +72,6 @@ struct gossip_config {
|
||||
utils::updateable_value<utils::UUID> recovery_leader;
|
||||
};
|
||||
|
||||
struct loaded_endpoint_state {
|
||||
gms::inet_address endpoint;
|
||||
std::optional<locator::endpoint_dc_rack> opt_dc_rack;
|
||||
};
|
||||
|
||||
/**
|
||||
* This module is responsible for Gossiping information for the local endpoint. This abstraction
|
||||
* maintains the list of live and dead endpoints. Periodically i.e. every 1 second this module
|
||||
|
||||
23
gms/loaded_endpoint_state.hh
Normal file
23
gms/loaded_endpoint_state.hh
Normal file
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <optional>
|
||||
|
||||
#include "gms/inet_address.hh"
|
||||
#include "locator/types.hh"
|
||||
|
||||
namespace gms {
|
||||
|
||||
/// Pairs an endpoint's address with its (optional) datacenter/rack placement.
struct loaded_endpoint_state {
|
||||
/// Address of the endpoint this state describes.
inet_address endpoint;
|
||||
/// Datacenter/rack of the endpoint; an optional, so it may be unset.
std::optional<locator::endpoint_dc_rack> opt_dc_rack;
|
||||
};
|
||||
|
||||
} // namespace gms
|
||||
@@ -11,7 +11,7 @@
|
||||
#include "query/query_id.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "tasks/types.hh"
|
||||
#include "service/session.hh"
|
||||
#include "service/session_id.hh"
|
||||
|
||||
namespace utils {
|
||||
class UUID final {
|
||||
@@ -43,4 +43,3 @@ class host_id final {
|
||||
};
|
||||
|
||||
} // namespace locator
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#include "utils/UUID_gen.hh"
|
||||
#include "types/types.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
#include "utils/rjson.hh"
|
||||
#include <ranges>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
@@ -284,14 +284,3 @@ future<> instance_cache::stop() {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
template <>
|
||||
struct equal_to<seastar::scheduling_group> {
|
||||
bool operator()(seastar::scheduling_group& sg1, seastar::scheduling_group& sg2) const noexcept {
|
||||
return sg1 == sg2;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "utils/sequenced_set.hh"
|
||||
#include "utils/simple_hashers.hh"
|
||||
#include "tablets.hh"
|
||||
#include "locator/replication_strategy_type.hh"
|
||||
#include "data_dictionary/consistency_config_options.hh"
|
||||
|
||||
// forward declaration since replica/database.hh includes this file
|
||||
@@ -38,13 +39,6 @@ extern logging::logger rslogger;
|
||||
using inet_address = gms::inet_address;
|
||||
using token = dht::token;
|
||||
|
||||
enum class replication_strategy_type {
|
||||
simple,
|
||||
local,
|
||||
network_topology,
|
||||
everywhere_topology,
|
||||
};
|
||||
|
||||
using replication_strategy_config_option = std::variant<sstring, rack_list>;
|
||||
using replication_strategy_config_options = std::map<sstring, replication_strategy_config_option>;
|
||||
|
||||
|
||||
@@ -381,10 +381,6 @@ public:
|
||||
return _nodes.at(node)._du.capacity;
|
||||
}
|
||||
|
||||
bool has_node(host_id node) const {
|
||||
return _nodes.contains(node);
|
||||
}
|
||||
|
||||
shard_id get_shard_count(host_id node) const {
|
||||
if (!_nodes.contains(node)) {
|
||||
return 0;
|
||||
|
||||
20
locator/replication_strategy_type.hh
Normal file
20
locator/replication_strategy_type.hh
Normal file
@@ -0,0 +1,20 @@
|
||||
/*
|
||||
* Copyright (C) 2015-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace locator {
|
||||
|
||||
/// Enumerates the kinds of replication strategy: simple, local,
/// network-topology and everywhere-topology.
enum class replication_strategy_type {
|
||||
simple,
|
||||
local,
|
||||
network_topology,
|
||||
everywhere_topology,
|
||||
};
|
||||
|
||||
} // namespace locator
|
||||
@@ -12,7 +12,7 @@
|
||||
#include "locator/token_metadata_fwd.hh"
|
||||
#include "utils/small_vector.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "service/session.hh"
|
||||
#include "service/session_id.hh"
|
||||
#include "dht/i_partitioner_fwd.hh"
|
||||
#include "dht/token-sharding.hh"
|
||||
#include "dht/ring_position.hh"
|
||||
@@ -21,10 +21,9 @@
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "raft/raft.hh"
|
||||
#include "raft/raft_fwd.hh"
|
||||
|
||||
#include <ranges>
|
||||
#include <seastar/core/reactor.hh>
|
||||
#include <seastar/util/log.hh>
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include <seastar/util/noncopyable_function.hh>
|
||||
@@ -153,27 +152,19 @@ struct hash<locator::range_based_tablet_id> {
|
||||
|
||||
namespace locator {
|
||||
|
||||
/// Returns a copy of the replica set with the following modifications:
|
||||
/// - If both old_replica and new_replica are set, old_replica is substituted
|
||||
/// with new_replica. If old_replica is not found in rs, the set is returned as-is.
|
||||
/// - If only old_replica is set, it is removed from the result.
|
||||
/// - If only new_replica is set, it is appended to the result.
|
||||
/// Creates a new replica set with old_replica replaced by new_replica.
|
||||
/// If there is no old_replica, the set is returned unchanged.
|
||||
inline
|
||||
tablet_replica_set replace_replica(const tablet_replica_set& rs, std::optional<tablet_replica> old_replica, std::optional<tablet_replica> new_replica) {
|
||||
tablet_replica_set replace_replica(const tablet_replica_set& rs, tablet_replica old_replica, tablet_replica new_replica) {
|
||||
tablet_replica_set result;
|
||||
result.reserve(rs.size());
|
||||
for (auto&& r : rs) {
|
||||
if (old_replica.has_value() && r == old_replica.value()) {
|
||||
if (new_replica.has_value()) {
|
||||
result.push_back(new_replica.value());
|
||||
}
|
||||
if (r == old_replica) {
|
||||
result.push_back(new_replica);
|
||||
} else {
|
||||
result.push_back(r);
|
||||
}
|
||||
}
|
||||
if (!old_replica.has_value() && new_replica.has_value()) {
|
||||
result.push_back(new_replica.value());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -391,8 +382,8 @@ bool is_post_cleanup(tablet_replica replica, const tablet_info& tinfo, const tab
|
||||
struct tablet_migration_info {
|
||||
locator::tablet_transition_kind kind;
|
||||
locator::global_tablet_id tablet;
|
||||
std::optional<locator::tablet_replica> src;
|
||||
std::optional<locator::tablet_replica> dst;
|
||||
locator::tablet_replica src;
|
||||
locator::tablet_replica dst;
|
||||
};
|
||||
|
||||
class tablet_map;
|
||||
|
||||
2
main.cc
2
main.cc
@@ -942,7 +942,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
|
||||
auto background_reclaim_scheduling_group = create_scheduling_group("background_reclaim", "bgre", 50).get();
|
||||
|
||||
// Maintenance supergroup -- the collection of background low-prio activities
|
||||
// Maintenance supergroup -- the collection of background low-prio activites
|
||||
auto maintenance_supergroup = create_scheduling_supergroup(200).get();
|
||||
auto bandwidth_updater = io_throughput_updater("maintenance supergroup", maintenance_supergroup,
|
||||
cfg->maintenance_io_throughput_mb_per_sec.is_set() ? cfg->maintenance_io_throughput_mb_per_sec : cfg->stream_io_throughput_mb_per_sec);
|
||||
|
||||
@@ -11,9 +11,10 @@
|
||||
#include <seastar/core/condition-variable.hh>
|
||||
#include <seastar/rpc/rpc_types.hh>
|
||||
#include <utility>
|
||||
|
||||
#include "rpc_compression_types.hh"
|
||||
#include "utils/refcounted.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
#include "utils/enum_option.hh"
|
||||
#include "shared_dict.hh"
|
||||
|
||||
namespace netw {
|
||||
@@ -28,103 +29,6 @@ class dict_sampler;
|
||||
using dict_ptr = lw_shared_ptr<foreign_ptr<lw_shared_ptr<shared_dict>>>;
|
||||
class control_protocol_frame;
|
||||
|
||||
// An enum wrapper, describing supported RPC compression algorithms.
|
||||
// Always contains a valid value —- the constructors won't allow
|
||||
// an invalid/unknown enum variant to be constructed.
|
||||
struct compression_algorithm {
|
||||
using underlying = uint8_t;
|
||||
enum class type : underlying {
|
||||
RAW,
|
||||
LZ4,
|
||||
ZSTD,
|
||||
COUNT,
|
||||
} _value;
|
||||
// Construct from an integer.
|
||||
// Used to deserialize the algorithm from the first byte of the frame.
|
||||
constexpr compression_algorithm(underlying x) {
|
||||
if (x < std::to_underlying(type::RAW) || x >= std::to_underlying(type::COUNT)) {
|
||||
throw std::runtime_error(fmt::format("Invalid value {} for enum compression_algorithm", static_cast<int>(x)));
|
||||
}
|
||||
_value = static_cast<type>(x);
|
||||
}
|
||||
// Construct from `type`. Makes sure that `type` has a valid value.
|
||||
constexpr compression_algorithm(type x) : compression_algorithm(std::to_underlying(x)) {}
|
||||
|
||||
// These names are used in multiple places:
|
||||
// RPC negotiation, in metric labels, and config.
|
||||
static constexpr std::string_view names[] = {
|
||||
"raw",
|
||||
"lz4",
|
||||
"zstd",
|
||||
};
|
||||
static_assert(std::size(names) == static_cast<int>(compression_algorithm::type::COUNT));
|
||||
|
||||
// Implements enum_option.
|
||||
static auto map() {
|
||||
std::unordered_map<std::string, type> ret;
|
||||
for (size_t i = 0; i < std::size(names); ++i) {
|
||||
ret.insert(std::make_pair<std::string, type>(std::string(names[i]), compression_algorithm(i).get()));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
constexpr std::string_view name() const noexcept { return names[idx()]; }
|
||||
constexpr underlying idx() const noexcept { return std::to_underlying(_value); }
|
||||
constexpr type get() const noexcept { return _value; }
|
||||
constexpr static size_t count() { return static_cast<size_t>(type::COUNT); };
|
||||
bool operator<=>(const compression_algorithm &) const = default;
|
||||
};
|
||||
|
||||
|
||||
// Represents a set of compression algorithms.
|
||||
// Backed by a bitset.
|
||||
// Used for convenience during algorithm negotiations.
|
||||
class compression_algorithm_set {
|
||||
uint8_t _bitset;
|
||||
static_assert(std::numeric_limits<decltype(_bitset)>::digits > compression_algorithm::count());
|
||||
constexpr compression_algorithm_set(uint8_t v) noexcept : _bitset(v) {}
|
||||
public:
|
||||
// Returns a set containing the given algorithm and all algorithms weaker (smaller in the enum order)
|
||||
// than it.
|
||||
constexpr static compression_algorithm_set this_or_lighter(compression_algorithm algo) noexcept {
|
||||
auto x = 1 << (algo.idx());
|
||||
return {x + (x - 1)};
|
||||
}
|
||||
// Returns the strongest (greatest in the enum order) algorithm in the set.
|
||||
constexpr compression_algorithm heaviest() const {
|
||||
return {std::bit_width(_bitset) - 1};
|
||||
}
|
||||
// The usual set operations.
|
||||
constexpr static compression_algorithm_set singleton(compression_algorithm algo) noexcept {
|
||||
return {1 << algo.idx()};
|
||||
}
|
||||
constexpr compression_algorithm_set intersection(compression_algorithm_set o) const noexcept {
|
||||
return {_bitset & o._bitset};
|
||||
}
|
||||
constexpr compression_algorithm_set difference(compression_algorithm_set o) const noexcept {
|
||||
return {_bitset &~ o._bitset};
|
||||
}
|
||||
constexpr compression_algorithm_set sum(compression_algorithm_set o) const noexcept {
|
||||
return {_bitset | o._bitset};
|
||||
}
|
||||
constexpr bool contains(compression_algorithm algo) const noexcept {
|
||||
return _bitset & (1 << algo.idx());
|
||||
}
|
||||
constexpr bool operator==(const compression_algorithm_set&) const = default;
|
||||
// Returns the contained bitset. Used for serialization.
|
||||
constexpr uint8_t value() const noexcept {
|
||||
return _bitset;
|
||||
}
|
||||
// Reconstructs the set from the output of `value()`. Used for deserialization.
|
||||
constexpr static compression_algorithm_set from_value(uint8_t bitset) {
|
||||
compression_algorithm_set x = bitset;
|
||||
x.heaviest(); // This is a validation check. It will throw if the bitset contains some illegal/unknown bits.
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
using algo_config = std::vector<enum_option<compression_algorithm>>;
|
||||
|
||||
// See docs/dev/advanced_rpc_compression.md,
|
||||
// section `Negotiation` for more information about the protocol.
|
||||
struct control_protocol {
|
||||
@@ -248,7 +152,7 @@ struct per_algorithm_stats {
|
||||
// prevent a misuse of the API (dangling references).
|
||||
class advanced_rpc_compressor::tracker : public utils::refcounted {
|
||||
public:
|
||||
using algo_config = algo_config;
|
||||
using algo_config = netw::algo_config;
|
||||
struct config {
|
||||
utils::updateable_value<uint32_t> zstd_min_msg_size{0};
|
||||
utils::updateable_value<uint32_t> zstd_max_msg_size{std::numeric_limits<uint32_t>::max()};
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "shared_dict.hh"
|
||||
#include "advanced_rpc_compressor.hh"
|
||||
#include "rpc_compression_types.hh"
|
||||
|
||||
namespace netw {
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "rpc_compression_types.hh"
|
||||
#include "utils/reservoir_sampling.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
#include <seastar/core/future.hh>
|
||||
@@ -88,28 +89,7 @@ class dict_training_loop {
|
||||
seastar::semaphore _pause{0};
|
||||
seastar::abort_source _pause_as;
|
||||
public:
|
||||
struct when {
|
||||
enum class type {
|
||||
NEVER,
|
||||
WHEN_LEADER,
|
||||
ALWAYS,
|
||||
COUNT,
|
||||
};
|
||||
static constexpr std::string_view names[] = {
|
||||
"never",
|
||||
"when_leader",
|
||||
"always",
|
||||
};
|
||||
static_assert(std::size(names) == static_cast<size_t>(type::COUNT));
|
||||
// Implements enum_option.
|
||||
static std::unordered_map<std::string, type> map() {
|
||||
std::unordered_map<std::string, type> ret;
|
||||
for (size_t i = 0; i < std::size(names); ++i) {
|
||||
ret.insert({std::string(names[i]), type(i)});
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
using when = netw::dict_training_when;
|
||||
void pause();
|
||||
void unpause();
|
||||
void cancel() noexcept;
|
||||
|
||||
@@ -54,11 +54,11 @@ dictionary_service::dictionary_service(
|
||||
void dictionary_service::maybe_toggle_dict_training() {
|
||||
auto when = _rpc_dict_training_when();
|
||||
netw::dict_trainer_logger.debug("dictionary_service::maybe_toggle_dict_training(), called, _is_leader={}, when={}", _is_leader, when);
|
||||
if (when == netw::dict_training_loop::when::type::NEVER) {
|
||||
if (when == netw::dict_training_when::type::NEVER) {
|
||||
_training_fiber.pause();
|
||||
} else if (when == netw::dict_training_loop::when::type::ALWAYS) {
|
||||
} else if (when == netw::dict_training_when::type::ALWAYS) {
|
||||
_training_fiber.unpause();
|
||||
} else if (when == netw::dict_training_loop::when::type::WHEN_LEADER) {
|
||||
} else if (when == netw::dict_training_when::type::WHEN_LEADER) {
|
||||
_is_leader ? _training_fiber.unpause() : _training_fiber.pause();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -40,7 +40,7 @@ namespace gms {
|
||||
class dictionary_service {
|
||||
db::system_keyspace& _sys_ks;
|
||||
locator::host_id _our_host_id;
|
||||
utils::updateable_value<enum_option<netw::dict_training_loop::when>> _rpc_dict_training_when;
|
||||
utils::updateable_value<enum_option<netw::dict_training_when>> _rpc_dict_training_when;
|
||||
service::raft_group0_client& _raft_group0_client;
|
||||
abort_source& _as;
|
||||
netw::dict_training_loop _training_fiber;
|
||||
@@ -48,7 +48,7 @@ class dictionary_service {
|
||||
|
||||
bool _is_leader = false;
|
||||
utils::observer<bool> _leadership_observer;
|
||||
utils::observer<enum_option<netw::dict_training_loop::when>> _when_observer;
|
||||
utils::observer<enum_option<netw::dict_training_when>> _when_observer;
|
||||
std::optional<std::any> _feature_observer;
|
||||
|
||||
void maybe_toggle_dict_training();
|
||||
@@ -61,7 +61,7 @@ public:
|
||||
locator::host_id our_host_id = Uninitialized();
|
||||
utils::updateable_value<uint32_t> rpc_dict_training_min_time_seconds = Uninitialized();
|
||||
utils::updateable_value<uint64_t> rpc_dict_training_min_bytes = Uninitialized();
|
||||
utils::updateable_value<enum_option<netw::dict_training_loop::when>> rpc_dict_training_when = Uninitialized();
|
||||
utils::updateable_value<enum_option<netw::dict_training_when>> rpc_dict_training_when = Uninitialized();
|
||||
};
|
||||
// Note: the training fiber will start as soon as the relevant cluster feature is enabled.
|
||||
dictionary_service(
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include <seastar/coroutine/all.hh>
|
||||
|
||||
#include "message/messaging_service.hh"
|
||||
#include "message/advanced_rpc_compressor.hh"
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include "gms/gossiper.hh"
|
||||
#include "service/storage_service.hh"
|
||||
|
||||
@@ -19,11 +19,11 @@
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "streaming/stream_fwd.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "service/session.hh"
|
||||
#include "service/session_id.hh"
|
||||
#include "service/maintenance_mode.hh"
|
||||
#include "gms/gossip_address_map.hh"
|
||||
#include "gms/generation-number.hh"
|
||||
#include "tasks/types.hh"
|
||||
#include "message/advanced_rpc_compressor.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
|
||||
#include <list>
|
||||
@@ -120,6 +120,8 @@ namespace qos {
|
||||
|
||||
namespace netw {
|
||||
|
||||
class walltime_compressor_tracker;
|
||||
|
||||
/* All verb handler identifiers */
|
||||
enum class messaging_verb : int32_t {
|
||||
CLIENT_ID = 0,
|
||||
|
||||
155
message/rpc_compression_types.hh
Normal file
155
message/rpc_compression_types.hh
Normal file
@@ -0,0 +1,155 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <bit>
|
||||
#include <compare>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "utils/enum_option.hh"
|
||||
|
||||
namespace netw {
|
||||
|
||||
// An enum wrapper, describing supported RPC compression algorithms.
|
||||
// Always contains a valid value -- the constructors won't allow
|
||||
// an invalid/unknown enum variant to be constructed.
|
||||
struct compression_algorithm {
|
||||
using underlying = uint8_t;
|
||||
enum class type : underlying {
|
||||
RAW,
|
||||
LZ4,
|
||||
ZSTD,
|
||||
COUNT,
|
||||
} _value;
|
||||
|
||||
// Construct from an integer.
|
||||
// Used to deserialize the algorithm from the first byte of the frame.
|
||||
constexpr compression_algorithm(underlying x) {
|
||||
if (x < std::to_underlying(type::RAW) || x >= std::to_underlying(type::COUNT)) {
|
||||
throw std::runtime_error(std::string("Invalid value ") + std::to_string(unsigned(x)) + " for enum compression_algorithm");
|
||||
}
|
||||
_value = static_cast<type>(x);
|
||||
}
|
||||
|
||||
// Construct from `type`. Makes sure that `type` has a valid value.
|
||||
constexpr compression_algorithm(type x) : compression_algorithm(std::to_underlying(x)) {}
|
||||
|
||||
// These names are used in multiple places:
|
||||
// RPC negotiation, in metric labels, and config.
|
||||
static constexpr std::string_view names[] = {
|
||||
"raw",
|
||||
"lz4",
|
||||
"zstd",
|
||||
};
|
||||
static_assert(std::size(names) == static_cast<int>(compression_algorithm::type::COUNT));
|
||||
|
||||
// Implements enum_option.
|
||||
static auto map() {
|
||||
std::unordered_map<std::string, type> ret;
|
||||
for (size_t i = 0; i < std::size(names); ++i) {
|
||||
ret.insert(std::make_pair(std::string(names[i]), compression_algorithm(i).get()));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
constexpr std::string_view name() const noexcept { return names[idx()]; }
|
||||
constexpr underlying idx() const noexcept { return std::to_underlying(_value); }
|
||||
constexpr type get() const noexcept { return _value; }
|
||||
constexpr static size_t count() { return static_cast<size_t>(type::COUNT); }
|
||||
bool operator<=>(const compression_algorithm&) const = default;
|
||||
};
|
||||
|
||||
// Represents a set of compression algorithms.
|
||||
// Backed by a bitset.
|
||||
// Used for convenience during algorithm negotiations.
|
||||
class compression_algorithm_set {
|
||||
uint8_t _bitset;
|
||||
static_assert(std::numeric_limits<decltype(_bitset)>::digits > compression_algorithm::count());
|
||||
constexpr compression_algorithm_set(uint8_t v) noexcept : _bitset(v) {}
|
||||
public:
|
||||
// Returns a set containing the given algorithm and all algorithms weaker (smaller in the enum order)
|
||||
// than it.
|
||||
constexpr static compression_algorithm_set this_or_lighter(compression_algorithm algo) noexcept {
|
||||
auto x = 1 << algo.idx();
|
||||
return {uint8_t(x + (x - 1))};
|
||||
}
|
||||
|
||||
// Returns the strongest (greatest in the enum order) algorithm in the set.
|
||||
constexpr compression_algorithm heaviest() const {
|
||||
return {compression_algorithm::underlying(std::bit_width(_bitset) - 1)};
|
||||
}
|
||||
|
||||
// The usual set operations.
|
||||
constexpr static compression_algorithm_set singleton(compression_algorithm algo) noexcept {
|
||||
return {uint8_t(1 << algo.idx())};
|
||||
}
|
||||
constexpr compression_algorithm_set intersection(compression_algorithm_set o) const noexcept {
|
||||
return {uint8_t(_bitset & o._bitset)};
|
||||
}
|
||||
constexpr compression_algorithm_set difference(compression_algorithm_set o) const noexcept {
|
||||
return {uint8_t(_bitset &~ o._bitset)};
|
||||
}
|
||||
constexpr compression_algorithm_set sum(compression_algorithm_set o) const noexcept {
|
||||
return {uint8_t(_bitset | o._bitset)};
|
||||
}
|
||||
constexpr bool contains(compression_algorithm algo) const noexcept {
|
||||
return _bitset & (1 << algo.idx());
|
||||
}
|
||||
constexpr bool operator==(const compression_algorithm_set&) const = default;
|
||||
|
||||
// Returns the contained bitset. Used for serialization.
|
||||
constexpr uint8_t value() const noexcept {
|
||||
return _bitset;
|
||||
}
|
||||
|
||||
// Reconstructs the set from the output of `value()`. Used for deserialization.
|
||||
constexpr static compression_algorithm_set from_value(uint8_t bitset) {
|
||||
compression_algorithm_set x = bitset;
|
||||
x.heaviest(); // Validation: throws on illegal/unknown bits.
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
using algo_config = std::vector<enum_option<compression_algorithm>>;
|
||||
|
||||
// Enumerates the settings that control when RPC dictionary training runs,
// together with their textual names.
struct dict_training_when {
    // COUNT is a sentinel equal to the number of real settings.
    enum class type {
        NEVER,
        WHEN_LEADER,
        ALWAYS,
        COUNT,
    };

    // Textual names, indexed by the numeric value of the matching enumerator.
    static constexpr std::string_view names[] = {
        "never",
        "when_leader",
        "always",
    };
    // Every real enumerator must have exactly one name.
    static_assert(std::size(names) == static_cast<size_t>(type::COUNT));

    // Implements enum_option.
    // Returns a mapping from each textual name to its enumerator.
    static std::unordered_map<std::string, type> map() {
        std::unordered_map<std::string, type> by_name;
        size_t ordinal = 0;
        for (const std::string_view label : names) {
            // The position of a name in `names` is the numeric value of its enumerator.
            by_name.emplace(std::string(label), static_cast<type>(ordinal));
            ++ordinal;
        }
        return by_name;
    }
};
|
||||
|
||||
} // namespace netw
|
||||
@@ -16,8 +16,6 @@ Usage:
|
||||
import argparse, os, sys
|
||||
from typing import Sequence
|
||||
|
||||
from test.pylib.driver_utils import safe_driver_shutdown
|
||||
|
||||
def read_statements(path: str) -> list[tuple[int, str]]:
|
||||
stms: list[tuple[int, str]] = []
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
@@ -58,7 +56,7 @@ def exec_statements(statements: list[tuple[int, str]], socket_path: str, timeout
|
||||
print(f"ERROR executing statement from file line {lineno}: {s}\n{e}", file=sys.stderr)
|
||||
return 1
|
||||
finally:
|
||||
safe_driver_shutdown(cluster)
|
||||
cluster.shutdown()
|
||||
return 0
|
||||
|
||||
def main(argv: Sequence[str]) -> int:
|
||||
|
||||
27
raft/raft_fwd.hh
Normal file
27
raft/raft_fwd.hh
Normal file
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
// Lightweight forward-declaration header for commonly used raft types.
|
||||
// Include this instead of raft/raft.hh when only the basic ID/index types
|
||||
// are needed (e.g. in other header files), to avoid pulling in the full
|
||||
// raft machinery (futures, abort_source, bytes_ostream, etc.).
|
||||
|
||||
#include "internal.hh"
|
||||
|
||||
namespace raft {
|
||||
|
||||
using server_id = internal::tagged_id<struct server_id_tag>;
|
||||
using group_id = internal::tagged_id<struct group_id_tag>;
|
||||
using term_t = internal::tagged_uint64<struct term_tag>;
|
||||
using index_t = internal::tagged_uint64<struct index_tag>;
|
||||
using read_id = internal::tagged_uint64<struct read_id_tag>;
|
||||
|
||||
class server;
|
||||
|
||||
} // namespace raft
|
||||
@@ -269,10 +269,6 @@ public:
|
||||
// Gets the view a sstable currently belongs to.
|
||||
compaction::compaction_group_view& view_for_sstable(const sstables::shared_sstable& sst) const;
|
||||
utils::small_vector<compaction::compaction_group_view*, 3> all_views() const;
|
||||
// Returns true iff v is the repaired view of this compaction group.
|
||||
bool is_repaired_view(const compaction::compaction_group_view* v) const noexcept;
|
||||
// Returns an sstable set containing only repaired sstables (those classified as repaired).
|
||||
lw_shared_ptr<sstables::sstable_set> make_repaired_sstable_set() const;
|
||||
|
||||
seastar::condition_variable& get_staging_done_condition() noexcept {
|
||||
return _staging_done_condition;
|
||||
@@ -408,8 +404,6 @@ public:
|
||||
|
||||
// Make an sstable set spanning all sstables in the storage_group
|
||||
lw_shared_ptr<const sstables::sstable_set> make_sstable_set() const;
|
||||
// Like make_sstable_set(), but restricted to repaired sstables only across all compaction groups.
|
||||
lw_shared_ptr<const sstables::sstable_set> make_repaired_sstable_set() const;
|
||||
|
||||
future<utils::chunked_vector<logstor::segment_snapshot>> take_logstor_snapshot() const;
|
||||
|
||||
|
||||
@@ -1006,7 +1006,7 @@ future<database::keyspace_change_per_shard> database::prepare_update_keyspace_on
|
||||
co_await modify_keyspace_on_all_shards(sharded_db, [&] (replica::database& db) -> future<> {
|
||||
auto& ks = db.find_keyspace(ksm.name());
|
||||
auto new_ksm = ::make_lw_shared<keyspace_metadata>(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(),
|
||||
ks.metadata()->cf_meta_data() | std::views::values | std::ranges::to<std::vector>(), ks.metadata()->user_types(), ksm.get_storage_options(), ksm.next_strategy_options_opt());
|
||||
ks.metadata()->cf_meta_data() | std::views::values | std::ranges::to<std::vector>(), ks.metadata()->user_types(), ksm.get_storage_options());
|
||||
|
||||
auto change = co_await db.prepare_update_keyspace(ks, new_ksm, pending_token_metadata.local());
|
||||
changes[this_shard_id()] = make_foreign(std::make_unique<keyspace_change>(std::move(change)));
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
@@ -113,6 +112,10 @@ namespace gms {
|
||||
class feature_service;
|
||||
}
|
||||
|
||||
namespace locator {
|
||||
class abstract_replication_strategy;
|
||||
}
|
||||
|
||||
namespace alternator {
|
||||
class table_stats;
|
||||
}
|
||||
@@ -757,10 +760,6 @@ private:
|
||||
// groups during tablet split with overlapping token range, and we need to include them all in a single
|
||||
// sstable set to allow safe tombstone gc.
|
||||
lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc(const compaction_group&) const;
|
||||
// Like sstable_set_for_tombstone_gc(), but restricted to repaired sstables only across all compaction
|
||||
// groups of the same tablet (storage group). Used by the tombstone_gc=repair optimization to avoid
|
||||
// scanning unrepaired sstables when looking for GC-blocking shadows.
|
||||
lw_shared_ptr<const sstables::sstable_set> make_repaired_sstable_set_for_tombstone_gc(const compaction_group&) const;
|
||||
|
||||
bool cache_enabled() const {
|
||||
return _config.enable_cache && _schema->caching_options().enabled();
|
||||
|
||||
@@ -69,6 +69,13 @@ struct segment_descriptor : public log_heap_hook<segment_descriptor_hist_options
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace replica::logstor
|
||||
|
||||
template<>
|
||||
size_t hist_key<replica::logstor::segment_descriptor>(const replica::logstor::segment_descriptor& desc);
|
||||
|
||||
namespace replica::logstor {
|
||||
|
||||
using segment_descriptor_hist = log_heap<segment_descriptor, segment_descriptor_hist_options>;
|
||||
|
||||
struct segment_set {
|
||||
|
||||
@@ -1203,35 +1203,11 @@ future<utils::chunked_vector<logstor::segment_snapshot>> storage_group::take_log
|
||||
co_return std::move(snp);
|
||||
}
|
||||
|
||||
lw_shared_ptr<const sstables::sstable_set> storage_group::make_repaired_sstable_set() const {
|
||||
if (_split_ready_groups.empty() && _merging_groups.empty()) {
|
||||
return _main_cg->make_repaired_sstable_set();
|
||||
}
|
||||
const auto& schema = _main_cg->_t.schema();
|
||||
std::vector<lw_shared_ptr<sstables::sstable_set>> underlying;
|
||||
underlying.reserve(1 + _merging_groups.size() + _split_ready_groups.size());
|
||||
underlying.emplace_back(_main_cg->make_repaired_sstable_set());
|
||||
for (const auto& cg : _merging_groups) {
|
||||
if (!cg->empty()) {
|
||||
underlying.emplace_back(cg->make_repaired_sstable_set());
|
||||
}
|
||||
}
|
||||
for (const auto& cg : _split_ready_groups) {
|
||||
underlying.emplace_back(cg->make_repaired_sstable_set());
|
||||
}
|
||||
return make_lw_shared(sstables::make_compound_sstable_set(schema, std::move(underlying)));
|
||||
}
|
||||
|
||||
// Returns the sstable set to consult for tombstone GC on behalf of `cg`:
// the set produced by make_sstable_set() on the storage group that is looked
// up by `cg`'s group id.
lw_shared_ptr<const sstables::sstable_set> table::sstable_set_for_tombstone_gc(const compaction_group& cg) const {
    return storage_group_for_id(cg.group_id()).make_sstable_set();
}
|
||||
|
||||
lw_shared_ptr<const sstables::sstable_set> table::make_repaired_sstable_set_for_tombstone_gc(const compaction_group& cg) const {
|
||||
auto& sg = storage_group_for_id(cg.group_id());
|
||||
return sg.make_repaired_sstable_set();
|
||||
}
|
||||
|
||||
bool tablet_storage_group_manager::all_storage_groups_split() {
|
||||
auto& tmap = tablet_map();
|
||||
if (_split_ready_seq_number == tmap.resize_decision().sequence_number) {
|
||||
@@ -3024,47 +3000,9 @@ public:
|
||||
future<lw_shared_ptr<const sstables::sstable_set>> maintenance_sstable_set() const override {
|
||||
return make_sstable_set_for_this_view(_cg.maintenance_sstables(), [this] { return *_cg.make_maintenance_sstable_set(); });
|
||||
}
|
||||
private:
|
||||
// Returns true when tombstone GC is restricted to the repaired set:
|
||||
// tombstone_gc=repair mode and this view is the repaired view.
|
||||
//
|
||||
// The optimization is safe for materialized view tables as well as base tables.
|
||||
// The key invariant for MV: MV tablet repair calls flush_hints() before
|
||||
// take_storage_snapshot(). flush_hints() creates a sync point that covers BOTH
|
||||
// _hints_manager (base mutations) AND _hints_for_views_manager (view mutations).
|
||||
// It waits until all pending hints — including any D_view hint stored in
|
||||
// _hints_for_views_manager while the target node was down — have been replayed
|
||||
// to the target node. Only then is take_storage_snapshot() called, which flushes
|
||||
// the MV memtable and captures D_view in the repairing sstable. After repair
|
||||
// completes, D_view is in the repaired set.
|
||||
//
|
||||
// If a subsequent base repair later replays a D_base hint that causes another
|
||||
// D_view write (same key and timestamp), it is a no-op duplicate: the original
|
||||
// D_view already in the repaired set still prevents T_mv from being purged.
|
||||
//
|
||||
// USING TIMESTAMP with timestamps predating (gc_before + propagation_delay) is
|
||||
// explicitly UB and excluded from the safety argument.
|
||||
bool is_tombstone_gc_repaired_only() const noexcept {
|
||||
return _cg.is_repaired_view(this) &&
|
||||
_t.schema()->tombstone_gc_options().mode() == tombstone_gc_mode::repair;
|
||||
}
|
||||
public:
|
||||
lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc() const override {
|
||||
// Optimization: when tombstone_gc=repair and this is the repaired view, only check
|
||||
// repaired sstables. The repair ordering guarantee ensures that by the time a tombstone
|
||||
// becomes GC-eligible (repair_time committed to Raft), any data it shadows has already
|
||||
// been promoted from repairing to repaired. Unrepaired data always has timestamps newer
|
||||
// than any GC-eligible tombstone (legitimate writes; USING TIMESTAMP abuse is UB).
|
||||
// For all other tombstone_gc modes this invariant does not hold, so we fall through to
|
||||
// the full storage-group set.
|
||||
if (is_tombstone_gc_repaired_only()) {
|
||||
return _t.make_repaired_sstable_set_for_tombstone_gc(_cg);
|
||||
}
|
||||
return _t.sstable_set_for_tombstone_gc(_cg);
|
||||
}
|
||||
bool skip_memtable_for_tombstone_gc() const noexcept override {
|
||||
return is_tombstone_gc_repaired_only();
|
||||
}
|
||||
std::unordered_set<sstables::shared_sstable> fully_expired_sstables(const std::vector<sstables::shared_sstable>& sstables, gc_clock::time_point query_time) const override {
|
||||
return compaction::get_fully_expired_sstables(*this, sstables, query_time);
|
||||
}
|
||||
@@ -5481,21 +5419,6 @@ compaction::compaction_group_view& compaction_group::view_for_unrepaired_data()
|
||||
return *_unrepaired_view;
|
||||
}
|
||||
|
||||
bool compaction_group::is_repaired_view(const compaction::compaction_group_view* v) const noexcept {
|
||||
return v == _repaired_view.get();
|
||||
}
|
||||
|
||||
lw_shared_ptr<sstables::sstable_set> compaction_group::make_repaired_sstable_set() const {
|
||||
auto set = make_lw_shared<sstables::sstable_set>(make_main_sstable_set());
|
||||
auto sstables_repaired_at = get_sstables_repaired_at();
|
||||
for (auto& sst : *_main_sstables->all()) {
|
||||
if (repair::is_repaired(sstables_repaired_at, sst)) {
|
||||
set->insert(sst);
|
||||
}
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
compaction::compaction_group_view& compaction_group::view_for_sstable(const sstables::shared_sstable& sst) const {
|
||||
switch (_repair_sstable_classifier(sst, get_sstables_repaired_at())) {
|
||||
case repair_sstable_classification::unrepaired: return *_unrepaired_view;
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
#include "mutation/mutation.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "service/session.hh"
|
||||
#include "service/session_id.hh"
|
||||
#include "locator/tablets.hh"
|
||||
|
||||
namespace replica {
|
||||
|
||||
@@ -493,7 +493,7 @@ std::unique_ptr<service::pager::query_pager> service::pager::query_pagers::pager
|
||||
// If partition row limit is applied to paging, we still need to fall back
|
||||
// to filtering the results to avoid extraneous rows on page breaks.
|
||||
if (!filtering_restrictions && cmd->slice.partition_row_limit() < query::max_rows_if_set) {
|
||||
filtering_restrictions = cql3::restrictions::make_trivial_statement_restrictions(s, true);
|
||||
filtering_restrictions = ::make_shared<cql3::restrictions::statement_restrictions>(s, true);
|
||||
}
|
||||
if (filtering_restrictions) {
|
||||
return std::make_unique<filtering_query_pager>(proxy, std::move(s), std::move(selection), state,
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "service/paxos/paxos_state.hh"
|
||||
#include "service/query_state.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "replica/database.hh"
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
#pragma once
|
||||
#include <unordered_set>
|
||||
#include "service/raft/group0_fwd.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
@@ -9,7 +9,11 @@
|
||||
#pragma once
|
||||
|
||||
#include <iosfwd>
|
||||
#include "raft/raft.hh"
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
#include <seastar/core/timer.hh>
|
||||
#include <seastar/core/lowres_clock.hh>
|
||||
#include "raft/raft_fwd.hh"
|
||||
#include "gms/inet_address.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "utils/UUID.hh"
|
||||
#include "service/session_id.hh"
|
||||
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/shared_future.hh>
|
||||
@@ -19,12 +19,6 @@
|
||||
|
||||
namespace service {
|
||||
|
||||
using session_id = utils::tagged_uuid<struct session_id_tag>;
|
||||
|
||||
// We want it be different than default-constructed session_id to catch mistakes.
|
||||
constexpr session_id default_session_id = session_id(
|
||||
utils::UUID(0x81e7fc5a8d4411ee, 0x8577325096b39f47)); // timeuuid 2023-11-27 16:46:27.182089.0 UTC
|
||||
|
||||
/// Session is used to track execution of work related to some greater task, identified by session_id.
|
||||
/// Work can enter the session using enter(), and is considered to be part of the session
|
||||
/// as long as the guard returned by enter() is alive.
|
||||
|
||||
21
service/session_id.hh
Normal file
21
service/session_id.hh
Normal file
@@ -0,0 +1,21 @@
|
||||
/*
|
||||
* Copyright (C) 2023-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "utils/UUID.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
using session_id = utils::tagged_uuid<struct session_id_tag>;
|
||||
|
||||
// We want it to be different than a default-constructed session_id to catch mistakes.
|
||||
constexpr session_id default_session_id = session_id(
|
||||
utils::UUID(0x81e7fc5a8d4411ee, 0x8577325096b39f47)); // timeuuid 2023-11-27 16:46:27.182089.0 UTC
|
||||
|
||||
} // namespace service
|
||||
@@ -38,7 +38,6 @@
|
||||
#include "replica/exceptions.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "dht/token_range_endpoints.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/cas_shard.hh"
|
||||
#include "service/storage_proxy_fwd.hh"
|
||||
|
||||
|
||||
@@ -1342,11 +1342,6 @@ future<bool> storage_service::ongoing_rf_change(const group0_guard& guard, sstri
|
||||
co_return true;
|
||||
}
|
||||
}
|
||||
for (auto request_id : _topology_state_machine._topology.ongoing_rf_changes) {
|
||||
if (co_await ongoing_ks_rf_change(request_id)) {
|
||||
co_return true;
|
||||
}
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
|
||||
@@ -2431,7 +2426,7 @@ storage_service::prepare_replacement_info(std::unordered_set<gms::inet_address>
|
||||
}
|
||||
|
||||
future<std::map<gms::inet_address, float>> storage_service::get_ownership() {
|
||||
return run_with_no_api_lock([] (storage_service& ss) {
|
||||
return run_with_no_api_lock([this] (storage_service& ss) {
|
||||
const auto& tm = ss.get_token_metadata();
|
||||
auto token_map = dht::token::describe_ownership(tm.sorted_tokens());
|
||||
// describeOwnership returns tokens in an unspecified order, let's re-order them
|
||||
@@ -2439,7 +2434,7 @@ future<std::map<gms::inet_address, float>> storage_service::get_ownership() {
|
||||
for (auto entry : token_map) {
|
||||
locator::host_id id = tm.get_endpoint(entry.first).value();
|
||||
auto token_ownership = entry.second;
|
||||
ownership[ss._address_map.get(id)] += token_ownership;
|
||||
ownership[_address_map.get(id)] += token_ownership;
|
||||
}
|
||||
return ownership;
|
||||
});
|
||||
@@ -2848,8 +2843,12 @@ future<> storage_service::raft_removenode(locator::host_id host_id, locator::hos
|
||||
}
|
||||
|
||||
future<> storage_service::mark_excluded(const std::vector<locator::host_id>& hosts) {
|
||||
// Callers forward to shard 0 via run_with_no_api_lock (group0 is only set on shard 0).
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
if (this_shard_id() != 0) {
|
||||
// group0 is only set on shard 0.
|
||||
co_return co_await container().invoke_on(0, [&] (auto& ss) {
|
||||
return ss.mark_excluded(hosts);
|
||||
});
|
||||
}
|
||||
|
||||
while (true) {
|
||||
auto guard = co_await _group0->client().start_operation(_group0_as, raft_timeout{});
|
||||
@@ -3094,8 +3093,8 @@ future<sstring> storage_service::wait_for_topology_request_completion(utils::UUI
|
||||
}
|
||||
|
||||
future<> storage_service::abort_topology_request(utils::UUID request_id) {
|
||||
co_await container().invoke_on(0, [request_id] (storage_service& ss) {
|
||||
return ss._topology_state_machine.abort_request(*ss._group0, ss._group0_as, ss._feature_service, request_id);
|
||||
co_await container().invoke_on(0, [request_id, this] (storage_service& ss) {
|
||||
return _topology_state_machine.abort_request(*ss._group0, ss._group0_as, ss._feature_service, request_id);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -3108,13 +3107,13 @@ future<> storage_service::wait_for_topology_not_busy() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::abort_rf_change(utils::UUID request_id) {
|
||||
future<> storage_service::abort_paused_rf_change(utils::UUID request_id) {
|
||||
auto holder = _async_gate.hold();
|
||||
|
||||
if (this_shard_id() != 0) {
|
||||
// group0 is only set on shard 0.
|
||||
co_return co_await container().invoke_on(0, [&] (auto& ss) {
|
||||
return ss.abort_rf_change(request_id);
|
||||
return ss.abort_paused_rf_change(request_id);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -3125,81 +3124,20 @@ future<> storage_service::abort_rf_change(utils::UUID request_id) {
|
||||
while (true) {
|
||||
auto guard = co_await _group0->client().start_operation(_group0_as, raft_timeout{});
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
if (std::ranges::contains(_topology_state_machine._topology.paused_rf_change_requests, request_id)) { // keyspace_rf_change_kind::conversion_to_rack_list
|
||||
updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
|
||||
.resume_rf_change_request(_topology_state_machine._topology.paused_rf_change_requests, request_id).build()));
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(request_id)
|
||||
.done("Aborted by user request")
|
||||
.build()));
|
||||
} else if (std::ranges::contains(_topology_state_machine._topology.ongoing_rf_changes, request_id)) { // keyspace_rf_change_kind::multi_rf_change
|
||||
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id);
|
||||
if (!req_entry.error.empty()) {
|
||||
slogger.warn("RF change request with id '{}' was already aborted", request_id);
|
||||
co_return;
|
||||
}
|
||||
sstring ks_name = *req_entry.new_keyspace_rf_change_ks_name;
|
||||
if (!_db.local().has_keyspace(ks_name)) {
|
||||
co_return;
|
||||
}
|
||||
auto& ks = _db.local().find_keyspace(ks_name);
|
||||
// Check the tablet maps: if any tablet still has a missing replica
|
||||
// (i.e., needs extending), we can abort. Otherwise, we're in the
|
||||
// replica removal phase and aborting would require a rollback.
|
||||
auto next_replication = ks.metadata()->next_strategy_options_opt().value()
|
||||
| std::views::transform([] (const auto& pair) {
|
||||
return std::make_pair(pair.first, std::get<locator::rack_list>(pair.second));
|
||||
}) | std::ranges::to<std::unordered_map<sstring, std::vector<sstring>>>();
|
||||
|
||||
const auto& tm = *get_token_metadata_ptr();
|
||||
bool has_missing_replica = false;
|
||||
auto all_tables = ks.metadata()->tables();
|
||||
auto all_views = ks.metadata()->views()
|
||||
| std::views::transform([] (const auto& view) { return schema_ptr(view); })
|
||||
| std::ranges::to<std::vector<schema_ptr>>();
|
||||
all_tables.insert(all_tables.end(), all_views.begin(), all_views.end());
|
||||
for (const auto& table : all_tables) {
|
||||
if (!tm.tablets().has_tablet_map(table->id()) || !tm.tablets().is_base_table(table->id())) {
|
||||
continue;
|
||||
}
|
||||
const auto& tmap = tm.tablets().get_tablet_map(table->id());
|
||||
for (const auto& ti : tmap.tablets()) {
|
||||
std::unordered_map<sstring, std::vector<sstring>> dc_to_racks;
|
||||
for (const auto& r : ti.replicas) {
|
||||
const auto& node_dc_rack = tm.get_topology().get_node(r.host).dc_rack();
|
||||
dc_to_racks[node_dc_rack.dc].push_back(node_dc_rack.rack);
|
||||
}
|
||||
auto diff = subtract_replication(next_replication, dc_to_racks);
|
||||
if (!diff.empty()) {
|
||||
has_missing_replica = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (has_missing_replica) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (has_missing_replica) {
|
||||
auto ks_md = make_lw_shared<data_dictionary::keyspace_metadata>(*ks.metadata());
|
||||
ks_md->set_next_strategy_options(ks_md->strategy_options());
|
||||
auto schema_muts = prepare_keyspace_update_announcement(_db.local(), ks_md, guard.write_timestamp());
|
||||
for (auto& m : schema_muts) {
|
||||
updates.push_back(canonical_mutation(m));
|
||||
}
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(request_id)
|
||||
.abort("Aborted by user request")
|
||||
.build()));
|
||||
} else {
|
||||
slogger.warn("RF change request with id '{}' is ongoing, but it started removing replicas, so it can't be aborted", request_id);
|
||||
co_return;
|
||||
}
|
||||
} else {
|
||||
slogger.warn("RF change request with id '{}' can't be aborted", request_id);
|
||||
bool found = std::ranges::contains(_topology_state_machine._topology.paused_rf_change_requests, request_id);
|
||||
if (!found) {
|
||||
slogger.warn("RF change request with id '{}' is not paused, so it can't be aborted", request_id);
|
||||
co_return;
|
||||
}
|
||||
|
||||
mixed_change change{std::move(updates)};
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
|
||||
.resume_rf_change_request(_topology_state_machine._topology.paused_rf_change_requests, request_id).build()));
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(request_id)
|
||||
.done("Aborted by user request")
|
||||
.build()));
|
||||
|
||||
topology_change change{std::move(updates)};
|
||||
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard,
|
||||
format("aborting rf change request {}", request_id));
|
||||
|
||||
@@ -3957,8 +3895,11 @@ future<> storage_service::update_tablet_metadata(const locator::tablet_metadata_
|
||||
}
|
||||
|
||||
future<> storage_service::prepare_for_tablets_migration(const sstring& ks_name) {
|
||||
// Called via run_with_no_api_lock (forwards to shard 0).
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
if (this_shard_id() != 0) {
|
||||
co_return co_await container().invoke_on(0, [&] (auto& ss) {
|
||||
return ss.prepare_for_tablets_migration(ks_name);
|
||||
});
|
||||
}
|
||||
|
||||
while (true) {
|
||||
auto guard = co_await _group0->client().start_operation(_group0_as);
|
||||
@@ -4098,8 +4039,11 @@ future<> storage_service::prepare_for_tablets_migration(const sstring& ks_name)
|
||||
}
|
||||
|
||||
future<> storage_service::set_node_intended_storage_mode(intended_storage_mode mode) {
|
||||
// Called via run_with_no_api_lock (forwards to shard 0).
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
if (this_shard_id() != 0) {
|
||||
co_return co_await container().invoke_on(0, [mode] (auto& ss) {
|
||||
return ss.set_node_intended_storage_mode(mode);
|
||||
});
|
||||
}
|
||||
|
||||
auto& raft_server = _group0->group0_server();
|
||||
auto holder = _group0->hold_group0_gate();
|
||||
@@ -4195,8 +4139,11 @@ storage_service::migration_status storage_service::get_tablets_migration_status(
|
||||
}
|
||||
|
||||
future<storage_service::keyspace_migration_status> storage_service::get_tablets_migration_status_with_node_details(const sstring& ks_name) {
|
||||
// Called via run_with_no_api_lock (forwards to shard 0).
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
if (this_shard_id() != 0) {
|
||||
co_return co_await container().invoke_on(0, [&ks_name] (auto& ss) {
|
||||
return ss.get_tablets_migration_status_with_node_details(ks_name);
|
||||
});
|
||||
}
|
||||
|
||||
keyspace_migration_status result;
|
||||
result.keyspace = ks_name;
|
||||
@@ -4257,8 +4204,11 @@ future<storage_service::keyspace_migration_status> storage_service::get_tablets_
|
||||
}
|
||||
|
||||
future<> storage_service::finalize_tablets_migration(const sstring& ks_name) {
|
||||
// Called via run_with_no_api_lock (forwards to shard 0).
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
if (this_shard_id() != 0) {
|
||||
co_return co_await container().invoke_on(0, [&ks_name] (auto& ss) {
|
||||
return ss.finalize_tablets_migration(ks_name);
|
||||
});
|
||||
}
|
||||
|
||||
slogger.info("Finalizing vnodes-to-tablets migration for keyspace '{}'", ks_name);
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include "absl-flat_hash_map.hh"
|
||||
#include "gms/endpoint_state.hh"
|
||||
#include "gms/gossip_address_map.hh"
|
||||
#include "gms/i_endpoint_state_change_subscriber.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "service/client_routes.hh"
|
||||
@@ -40,11 +41,9 @@
|
||||
#include <seastar/core/shared_ptr.hh>
|
||||
#include "cdc/generation_id.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "raft/raft.hh"
|
||||
#include "raft/raft_fwd.hh"
|
||||
#include "node_ops/id.hh"
|
||||
#include "raft/server.hh"
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "service/tablet_allocator.hh"
|
||||
#include "service/tablet_operation.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
#include "utils/UUID.hh"
|
||||
@@ -115,6 +114,10 @@ class tablet_mutation_builder;
|
||||
|
||||
namespace auth { class cache; }
|
||||
|
||||
namespace service {
|
||||
class tablet_allocator;
|
||||
}
|
||||
|
||||
namespace utils {
|
||||
class disk_space_monitor;
|
||||
}
|
||||
@@ -780,19 +783,13 @@ private:
|
||||
*/
|
||||
future<> stream_ranges(std::unordered_map<sstring, std::unordered_multimap<dht::token_range, locator::host_id>> ranges_to_stream_by_keyspace);
|
||||
|
||||
// REST handlers are gated at the registration site (see gated() in
|
||||
// api/storage_service.cc) so stop() drains in-flight requests before
|
||||
// teardown. run_with_api_lock_internal and run_with_no_api_lock hold
|
||||
// _async_gate on shard 0 as well, because REST requests arriving on
|
||||
// any shard are forwarded there for execution.
|
||||
template <typename Func>
|
||||
auto run_with_api_lock_internal(storage_service& ss, Func&& func, sstring& operation) {
|
||||
auto holder = ss._async_gate.hold();
|
||||
if (!ss._operation_in_progress.empty()) {
|
||||
throw std::runtime_error(format("Operation {} is in progress, try again", ss._operation_in_progress));
|
||||
}
|
||||
ss._operation_in_progress = std::move(operation);
|
||||
return func(ss).finally([&ss, holder = std::move(holder)] {
|
||||
return func(ss).finally([&ss] {
|
||||
ss._operation_in_progress = sstring();
|
||||
});
|
||||
}
|
||||
@@ -800,10 +797,6 @@ private:
|
||||
public:
|
||||
int32_t get_exception_count();
|
||||
|
||||
auto hold_async_gate() {
|
||||
return _async_gate.hold();
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
auto run_with_api_lock(sstring operation, Func&& func) {
|
||||
return container().invoke_on(0, [operation = std::move(operation),
|
||||
@@ -814,10 +807,8 @@ public:
|
||||
|
||||
template <typename Func>
|
||||
auto run_with_no_api_lock(Func&& func) {
|
||||
return container().invoke_on(0, [func = std::forward<Func>(func)] (storage_service& ss) mutable
|
||||
-> futurize_t<std::invoke_result_t<Func, storage_service&>> {
|
||||
auto holder = ss._async_gate.hold();
|
||||
co_return co_await futurize_invoke(func, ss);
|
||||
return container().invoke_on(0, [func = std::forward<Func>(func)] (storage_service& ss) mutable {
|
||||
return func(ss);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -987,7 +978,7 @@ public:
|
||||
|
||||
future<> wait_for_topology_not_busy();
|
||||
|
||||
future<> abort_rf_change(utils::UUID request_id);
|
||||
future<> abort_paused_rf_change(utils::UUID request_id);
|
||||
|
||||
private:
|
||||
semaphore _do_sample_sstables_concurrency_limiter{1};
|
||||
|
||||
@@ -154,7 +154,7 @@ auto coordinator::create_operation_ctx(const schema& schema, const dht::token& t
|
||||
co_await utils::get_local_injector().inject("sc_coordinator_wait_before_acquire_server",
|
||||
utils::wait_for_message(5min));
|
||||
|
||||
auto raft_server = co_await _groups_manager.acquire_server(schema.id(), raft_info.group_id, as);
|
||||
auto raft_server = co_await _groups_manager.acquire_server(raft_info.group_id, as);
|
||||
|
||||
co_return operation_ctx {
|
||||
.erm = std::move(erm),
|
||||
|
||||
@@ -332,27 +332,11 @@ void groups_manager::update(token_metadata_ptr new_tm) {
|
||||
schedule_raft_groups_deletion(false);
|
||||
}
|
||||
|
||||
future<raft_server> groups_manager::acquire_server(table_id table_id, raft::group_id group_id, abort_source& as) {
|
||||
future<raft_server> groups_manager::acquire_server(raft::group_id group_id, abort_source& as) {
|
||||
if (!_features.strongly_consistent_tables) {
|
||||
on_internal_error(logger, "strongly consistent tables are not enabled on this shard");
|
||||
}
|
||||
|
||||
// A concurrent DROP TABLE may have already removed the table from database
|
||||
// registries and erased the raft group from _raft_groups via
|
||||
// schedule_raft_group_deletion. The schema.table() in create_operation_ctx()
|
||||
// might not fail though in this case because someone might be holding
|
||||
// lw_shared_ptr<table>, so that the table is dropped but the table object
|
||||
// is still alive.
|
||||
//
|
||||
// Check that the table still exists in the database to turn the
|
||||
// fatal on_internal_error below into a clean no_such_column_family
|
||||
// exception.
|
||||
//
|
||||
// When the table does exist, we proceed to acquire state.gate->hold().
|
||||
// This prevents schedule_raft_group_deletion (which co_awaits gate::close)
|
||||
// from erasing the group until the DML operation completes.
|
||||
_db.find_column_family(table_id);
|
||||
|
||||
const auto it = _raft_groups.find(group_id);
|
||||
if (it == _raft_groups.end()) {
|
||||
on_internal_error(logger, format("raft group {} not found", group_id));
|
||||
|
||||
@@ -11,7 +11,10 @@
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "message/messaging_service.hh"
|
||||
#include "service/raft/raft_group_registry.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
|
||||
namespace cql3 {
|
||||
class query_processor;
|
||||
}
|
||||
|
||||
namespace db {
|
||||
class system_keyspace;
|
||||
@@ -110,7 +113,7 @@ public:
|
||||
void update(locator::token_metadata_ptr new_tm);
|
||||
|
||||
// The raft_server instance is used to submit write commands and perform read_barrier() before reads.
|
||||
future<raft_server> acquire_server(table_id table_id, raft::group_id group_id, abort_source& as);
|
||||
future<raft_server> acquire_server(raft::group_id group_id, abort_source& as);
|
||||
|
||||
// Called during node boot. Waits for all raft::server instances corresponding
|
||||
// to the latest group0 state to start.
|
||||
|
||||
@@ -31,7 +31,6 @@
|
||||
#include <ranges>
|
||||
#include <utility>
|
||||
#include <fmt/ranges.h>
|
||||
#include <seastar/core/on_internal_error.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include <seastar/coroutine/switch_to.hh>
|
||||
#include <absl/container/flat_hash_map.h>
|
||||
@@ -534,38 +533,6 @@ struct hash<migration_tablet_set> {
|
||||
|
||||
namespace service {
|
||||
|
||||
// Subtract right from left. The result contains only keys from left.
|
||||
std::unordered_map<sstring, std::vector<sstring>> subtract_replication(const std::unordered_map<sstring, std::vector<sstring>>& left, const std::unordered_map<sstring, std::vector<sstring>>& right) {
|
||||
std::unordered_map<sstring, std::vector<sstring>> res;
|
||||
for (const auto& [dc, rf_value] : left) {
|
||||
auto it = right.find(dc);
|
||||
if (it == right.end()) {
|
||||
res[dc] = rf_value;
|
||||
} else {
|
||||
std::vector<sstring> diff = rf_value | std::views::filter([&] (const sstring& rack) {
|
||||
return std::find(it->second.begin(), it->second.end(), rack) == it->second.end();
|
||||
}) | std::ranges::to<std::vector<sstring>>();
|
||||
if (!diff.empty()) {
|
||||
res[dc] = diff;
|
||||
}
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
bool rf_count_per_dc_equals(const locator::replication_strategy_config_options& current, const locator::replication_strategy_config_options& next) {
|
||||
if (current.size() != next.size()) {
|
||||
return false;
|
||||
}
|
||||
for (const auto& [dc, current_rf_value] : current) {
|
||||
auto it = next.find(dc);
|
||||
if (it == next.end() || get_replication_factor(it->second) != get_replication_factor(current_rf_value)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// The algorithm aims to equalize tablet count on each shard.
|
||||
/// This goal is based on the assumption that every shard has similar processing power and space capacity,
|
||||
/// and that each tablet has equal consumption of those resources. So by equalizing tablet count per shard we
|
||||
@@ -1083,22 +1050,17 @@ public:
|
||||
return _topology != nullptr && _sys_ks != nullptr && !_topology->paused_rf_change_requests.empty();
|
||||
}
|
||||
|
||||
bool ongoing_rf_change() const {
|
||||
return _topology != nullptr && _sys_ks != nullptr && !_topology->ongoing_rf_changes.empty();
|
||||
}
|
||||
|
||||
future<migration_plan> make_plan() {
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
migration_plan plan;
|
||||
|
||||
auto rack_list_colocation = ongoing_rack_list_colocation();
|
||||
auto rf_change_prep = co_await prepare_per_rack_rf_change_plan(plan);
|
||||
|
||||
// Prepare plans for each DC separately and combine them to be executed in parallel.
|
||||
for (auto&& dc : topo.get_datacenters()) {
|
||||
if (_db.get_config().rf_rack_valid_keyspaces() || _db.get_config().enforce_rack_list() || rack_list_colocation || !rf_change_prep.actions.empty()) {
|
||||
if (_db.get_config().rf_rack_valid_keyspaces() || _db.get_config().enforce_rack_list() || rack_list_colocation) {
|
||||
for (auto rack : topo.get_datacenter_racks().at(dc) | std::views::keys) {
|
||||
auto rack_plan = co_await make_plan(dc, rack, rf_change_prep.actions[{dc, rack}]);
|
||||
auto rack_plan = co_await make_plan(dc, rack);
|
||||
auto level = rack_plan.empty() ? seastar::log_level::debug : seastar::log_level::info;
|
||||
lblogger.log(level, "Plan for {}/{}: {}", dc, rack, plan_summary(rack_plan));
|
||||
plan.merge(std::move(rack_plan));
|
||||
@@ -1488,387 +1450,6 @@ public:
|
||||
co_return std::move(plan);
|
||||
}
|
||||
|
||||
enum class rf_change_state {
|
||||
ready, // RF change is ready (succeed or failed).
|
||||
needs_extending,
|
||||
needs_shrinking,
|
||||
};
|
||||
|
||||
using process_views = bool_class<struct process_views_tag>;
|
||||
struct rf_change_action {
|
||||
sstring keyspace;
|
||||
rf_change_state state;
|
||||
process_views pv = process_views::no;
|
||||
};
|
||||
using rf_change_actions = std::unordered_map<locator::endpoint_dc_rack, std::vector<rf_change_action>>;
|
||||
struct rf_change_preparation {
|
||||
rf_change_actions actions;
|
||||
};
|
||||
|
||||
// Determines which dc+rack combinations need RF change actions for a given keyspace,
|
||||
// by comparing current tablet replicas against the target replication configuration.
|
||||
// Scans in priority order: extend tables, extend views, shrink views, shrink tables.
|
||||
// Returns the first non-empty set of per-rack actions; colocated tables are skipped.
|
||||
// An empty result means all tablets already match the target configuration.
|
||||
future<rf_change_preparation> determine_rf_change_actions_per_rack(const sstring& ks_name, const std::vector<schema_ptr>& tables, const std::vector<schema_ptr>& views, const locator::replication_strategy_config_options& next) {
|
||||
auto add_entry = [&ks_name] (rf_change_preparation& prep, const sstring& dc, const sstring& rack, rf_change_state state, process_views pv) {
|
||||
locator::endpoint_dc_rack key{dc, rack};
|
||||
auto& actions = prep.actions[key];
|
||||
if (std::none_of(actions.begin(), actions.end(), [&](const rf_change_action& a) { return a.keyspace == ks_name; })) {
|
||||
actions.push_back(rf_change_action{.keyspace = ks_name, .state = state, .pv = pv});
|
||||
}
|
||||
};
|
||||
|
||||
auto next_replication = next | std::views::transform([] (const auto& pair) {
|
||||
return std::make_pair(pair.first, std::get<rack_list>(pair.second));
|
||||
}) | std::ranges::to<std::unordered_map<sstring, std::vector<sstring>>>();
|
||||
|
||||
auto scan_tables = [&] (const std::vector<schema_ptr>& table_list, rf_change_state target_state, process_views pv) -> future<rf_change_preparation> {
|
||||
rf_change_preparation prep;
|
||||
for (const auto& table : table_list) {
|
||||
if (!_tm->tablets().is_base_table(table->id())) {
|
||||
continue;
|
||||
}
|
||||
const auto& tmap = _tm->tablets().get_tablet_map(table->id());
|
||||
for (const tablet_info& ti : tmap.tablets()) {
|
||||
std::unordered_map<sstring, std::vector<sstring>> dc_to_racks;
|
||||
for (const auto& r : ti.replicas) {
|
||||
const auto& node_dc_rack = _tm->get_topology().get_node(r.host).dc_rack();
|
||||
dc_to_racks[node_dc_rack.dc].push_back(node_dc_rack.rack);
|
||||
}
|
||||
|
||||
auto diff = (target_state == rf_change_state::needs_extending ?
|
||||
subtract_replication(next_replication, dc_to_racks) : subtract_replication(dc_to_racks, next_replication))
|
||||
| std::views::filter([] (const auto& pair) {
|
||||
return !pair.second.empty();
|
||||
}
|
||||
) | std::ranges::to<std::unordered_map<sstring, std::vector<sstring>>>();
|
||||
|
||||
for (const auto& [dc, racks] : diff) {
|
||||
for (const auto& rack : racks) {
|
||||
add_entry(prep, dc, rack, target_state, pv);
|
||||
}
|
||||
}
|
||||
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
}
|
||||
co_return prep;
|
||||
};
|
||||
|
||||
// Extend base tables.
|
||||
if (auto prep = co_await scan_tables(tables, rf_change_state::needs_extending, process_views::no); !prep.actions.empty()) {
|
||||
co_return prep;
|
||||
}
|
||||
|
||||
if (utils::get_local_injector().enter("determine_rf_change_actions_per_rack_throw")) {
|
||||
lblogger.info("determine_rf_change_actions_per_rack_throw: entered");
|
||||
throw std::runtime_error("determine_rf_change_actions_per_rack_throw injection");
|
||||
}
|
||||
|
||||
// Extend views.
|
||||
if (auto prep = co_await scan_tables(views, rf_change_state::needs_extending, process_views::yes); !prep.actions.empty()) {
|
||||
co_return prep;
|
||||
}
|
||||
|
||||
// Shrink views.
|
||||
if (auto prep = co_await scan_tables(views, rf_change_state::needs_shrinking, process_views::yes); !prep.actions.empty()) {
|
||||
co_return prep;
|
||||
}
|
||||
|
||||
// Shrink base tables.
|
||||
if (auto prep = co_await scan_tables(tables, rf_change_state::needs_shrinking, process_views::no); !prep.actions.empty()) {
|
||||
co_return prep;
|
||||
}
|
||||
|
||||
co_return rf_change_preparation{};
|
||||
}
|
||||
|
||||
future<rf_change_preparation> prepare_per_rack_rf_change_plan(migration_plan& mplan) {
|
||||
lblogger.debug("In prepare_per_rack_rf_change_plan");
|
||||
|
||||
rf_change_preparation res;
|
||||
keyspace_rf_change_plan plan;
|
||||
if (!ongoing_rf_change()) {
|
||||
co_return res;
|
||||
}
|
||||
|
||||
for (const auto& request_id : _topology->ongoing_rf_changes) {
|
||||
auto req_entry = co_await _sys_ks->get_topology_request_entry(request_id);
|
||||
sstring ks_name = *req_entry.new_keyspace_rf_change_ks_name;
|
||||
|
||||
if (!_db.has_keyspace(ks_name)) {
|
||||
if (!plan.completion) {
|
||||
plan.completion = rf_change_completion_info{
|
||||
.request_id = request_id,
|
||||
.ks_name = ks_name,
|
||||
.error = format("Keyspace {} not found", ks_name),
|
||||
.saved_ks_props = req_entry.new_keyspace_rf_change_data.value(),
|
||||
};
|
||||
}
|
||||
continue;
|
||||
}
|
||||
auto& ks = _db.find_keyspace(ks_name);
|
||||
if (!ks.metadata()->next_strategy_options_opt()) {
|
||||
on_internal_error(lblogger, format("There is an ongoing rf change request {} for keyspace {}, "
|
||||
"but the keyspace does not have next replication settings", request_id, ks_name));
|
||||
}
|
||||
|
||||
auto tables = ks.metadata()->tables();
|
||||
auto views = ks.metadata()->views() | std::views::transform([] (const auto& view) { return schema_ptr(view); }) | std::ranges::to<std::vector<schema_ptr>>();
|
||||
auto rf_change_prep = co_await determine_rf_change_actions_per_rack(ks_name, tables, views, *ks.metadata()->next_strategy_options_opt());
|
||||
if (rf_change_prep.actions.empty()) {
|
||||
if (!plan.completion) {
|
||||
plan.completion = rf_change_completion_info{
|
||||
.request_id = request_id,
|
||||
.ks_name = ks_name,
|
||||
.error = req_entry.error,
|
||||
.saved_ks_props = req_entry.new_keyspace_rf_change_data.value()
|
||||
};
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if any extending action targets a dc+rack with no available nodes.
|
||||
// If so, the RF change can never complete and should be aborted.
|
||||
sstring error_msg = "";
|
||||
const auto& topo = _tm->get_topology();
|
||||
const auto& dc_rack_nodes = topo.get_datacenter_rack_nodes();
|
||||
for (const auto& [dc_rack, actions] : rf_change_prep.actions) {
|
||||
bool needs_extending = std::ranges::any_of(actions, [] (const rf_change_action& a) {
|
||||
return a.state == rf_change_state::needs_extending;
|
||||
});
|
||||
if (!needs_extending) {
|
||||
break;
|
||||
}
|
||||
bool has_live_node = false;
|
||||
bool has_down_node = false;
|
||||
auto dc_it = dc_rack_nodes.find(dc_rack.dc);
|
||||
if (dc_it != dc_rack_nodes.end()) {
|
||||
auto rack_it = dc_it->second.find(dc_rack.rack);
|
||||
if (rack_it != dc_it->second.end()) {
|
||||
for (const auto& node_ref : rack_it->second) {
|
||||
const auto& node = node_ref.get();
|
||||
if (_skiplist.contains(node.host_id())) {
|
||||
has_down_node = true;
|
||||
break;
|
||||
}
|
||||
if (!node.is_excluded()) {
|
||||
has_live_node = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (has_down_node) {
|
||||
lblogger.warn("RF change for keyspace {} requires extending to {}/{} but there are down nodes there; aborting",
|
||||
ks_name, dc_rack.dc, dc_rack.rack);
|
||||
error_msg = format("RF change aborted: there are down nodes in required rack {}/{}", dc_rack.dc, dc_rack.rack);
|
||||
break;
|
||||
}
|
||||
if (!has_live_node) {
|
||||
lblogger.warn("RF change for keyspace {} requires extending to {}/{} but no available nodes exist there; aborting",
|
||||
ks_name, dc_rack.dc, dc_rack.rack);
|
||||
error_msg = format("RF change aborted: no available nodes in required rack {}/{}", dc_rack.dc, dc_rack.rack);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!error_msg.empty()) {
|
||||
plan.aborts.push_back(rf_change_abort_info{
|
||||
.request_id = request_id,
|
||||
.ks_name = ks_name,
|
||||
.error = error_msg,
|
||||
.current_replication = ks.metadata()->strategy_options(),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
for (auto& [dc_rack, actions] : rf_change_prep.actions) {
|
||||
auto& dst = res.actions[dc_rack];
|
||||
dst.insert(dst.end(), std::make_move_iterator(actions.begin()), std::make_move_iterator(actions.end()));
|
||||
}
|
||||
}
|
||||
mplan.set_rf_change_plan(std::move(plan));
|
||||
co_return res;
|
||||
}
|
||||
|
||||
future<migration_plan> make_rf_change_plan(node_load_map& nodes, std::vector<rf_change_action> actions, sstring dc, sstring rack) {
|
||||
lblogger.debug("In make_rf_change_plan");
|
||||
|
||||
migration_plan mplan;
|
||||
keyspace_rf_change_plan plan;
|
||||
|
||||
auto nodes_by_load_dst = nodes | std::views::filter([&] (const auto& host_load) {
|
||||
auto& [host, load] = host_load;
|
||||
auto& node = *load.node;
|
||||
return node.dc_rack().dc == dc && node.dc_rack().rack == rack;
|
||||
}) | std::views::keys | std::ranges::to<std::vector<host_id>>();
|
||||
|
||||
bool has_extending = std::ranges::any_of(actions, [] (const rf_change_action& a) {
|
||||
return a.state == rf_change_state::needs_extending;
|
||||
});
|
||||
if (has_extending) {
|
||||
// Check that all normal, non-excluded nodes in the target dc/rack are present in the
|
||||
// balanced node set. If any such node is missing, extending cannot safely proceed.
|
||||
const auto& topo = _tm->get_topology();
|
||||
const auto& dc_rack_nodes = topo.get_datacenter_rack_nodes();
|
||||
bool missing_node = false;
|
||||
auto dc_it = dc_rack_nodes.find(dc);
|
||||
if (dc_it != dc_rack_nodes.end()) {
|
||||
auto rack_it = dc_it->second.find(rack);
|
||||
if (rack_it != dc_it->second.end()) {
|
||||
for (const auto& node_ref : rack_it->second) {
|
||||
const auto& node = node_ref.get();
|
||||
if (node.is_normal() && !node.is_excluded() && !nodes.contains(node.host_id())) {
|
||||
missing_node = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (missing_node || nodes_by_load_dst.empty()) {
|
||||
lblogger.warn("Not all non-excluded nodes are available for RF change extending plan in dc {}, rack {}", dc, rack);
|
||||
// Filter out extending actions since not all nodes are available.
|
||||
// Shrinking actions can still proceed without target nodes.
|
||||
std::erase_if(actions, [] (const rf_change_action& a) {
|
||||
return a.state == rf_change_state::needs_extending;
|
||||
});
|
||||
if (actions.empty()) {
|
||||
co_return mplan;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto nodes_cmp = nodes_by_load_cmp(nodes);
|
||||
auto nodes_dst_cmp = [&] (const host_id& a, const host_id& b) {
|
||||
return nodes_cmp(b, a);
|
||||
};
|
||||
|
||||
// Ascending load heap of candidate target nodes.
|
||||
std::make_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
locator::endpoint_dc_rack location{dc, rack};
|
||||
|
||||
for (const auto& action : actions) {
|
||||
const auto& ks_name = action.keyspace;
|
||||
const auto& rf_change_state = action.state;
|
||||
|
||||
auto& ks = _db.find_keyspace(ks_name);
|
||||
auto table_list = action.pv
|
||||
? ks.metadata()->views() | std::views::transform([] (const auto& view) { return schema_ptr(view); }) | std::ranges::to<std::vector<schema_ptr>>()
|
||||
: ks.metadata()->tables();
|
||||
for (const auto& table_or_mv : table_list) {
|
||||
const auto& tmap = _tm->tablets().get_tablet_map(table_or_mv->id());
|
||||
co_await tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& ti) -> future<> {
|
||||
if (!_tm->tablets().is_base_table(table_or_mv->id())) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
auto gid = locator::global_tablet_id{table_or_mv->id(), tid};
|
||||
|
||||
auto it = std::find_if(ti.replicas.begin(), ti.replicas.end(), [&] (const tablet_replica& r) {
|
||||
return topo.get_node(r.host).dc_rack() == location;
|
||||
});
|
||||
|
||||
auto replica = it != ti.replicas.end() ? std::optional<tablet_replica>{*it} : std::nullopt;
|
||||
|
||||
auto* tti = tmap.get_tablet_transition_info(tid);
|
||||
bool pending_replica_in_this_rack = false;
|
||||
bool leaving_replica_in_this_rack = false;
|
||||
if (tti) {
|
||||
auto leaving_replica = get_leaving_replica(ti, *tti);
|
||||
leaving_replica_in_this_rack = leaving_replica.has_value() && topo.get_node(leaving_replica->host).dc_rack() == location;
|
||||
pending_replica_in_this_rack = tti->pending_replica.has_value() && topo.get_node(tti->pending_replica->host).dc_rack() == location;
|
||||
}
|
||||
|
||||
if ((rf_change_state == rf_change_state::needs_extending && (replica && !leaving_replica_in_this_rack)) ||
|
||||
(rf_change_state == rf_change_state::needs_shrinking && (!replica && !pending_replica_in_this_rack))) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Skip tablet that is in transitions.
|
||||
if (tti) {
|
||||
lblogger.debug("Skipped rf change extending for tablet={} which is already in transition={} stage={}", gid, tti->transition, tti->stage);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Skip tablet that is about to be in transition.
|
||||
if (_scheduled_tablets.contains(gid)) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
migration_tablet_set source_tablets {
|
||||
.tablet_s = gid, // Ignore the merge co-location.
|
||||
};
|
||||
if (rf_change_state == rf_change_state::needs_extending) {
|
||||
// Pick the least loaded node as target.
|
||||
std::pop_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
auto target = nodes_by_load_dst.back();
|
||||
|
||||
lblogger.debug("target node: {}, avg_load={}", target, nodes[target].avg_load);
|
||||
|
||||
auto dst = global_shard_id {target, _load_sketch->get_least_loaded_shard(target)};
|
||||
|
||||
lblogger.trace("target shard: {}, tablets={}, load={}", dst.shard,
|
||||
nodes[target].shards[dst.shard].tablet_count,
|
||||
nodes[target].shard_load(dst.shard, _target_tablet_size));
|
||||
|
||||
tablet_replica pending_replica{
|
||||
.host = target,
|
||||
.shard = dst.shard,
|
||||
};
|
||||
auto next = ti.replicas;
|
||||
next.push_back(pending_replica);
|
||||
tablet_migration_info mig{
|
||||
.kind = locator::tablet_transition_kind::rebuild_v2,
|
||||
.tablet = gid,
|
||||
.src = std::nullopt,
|
||||
.dst = pending_replica,
|
||||
};
|
||||
auto mig_streaming_info = get_migration_streaming_info(topo, ti, mig);
|
||||
pick(*_load_sketch, dst.host, dst.shard, source_tablets);
|
||||
if (can_accept_load(nodes, mig_streaming_info)) {
|
||||
lblogger.debug("Starting rebuild_v2 transition to {}.{} of tablet {}; new_replica = {}", dc, rack, gid, pending_replica);
|
||||
apply_load(nodes, mig_streaming_info);
|
||||
mark_as_scheduled(mig);
|
||||
mplan.add(std::move(mig));
|
||||
}
|
||||
increase_node_load(nodes, dst, source_tablets);
|
||||
std::push_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
} else {
|
||||
auto next = ti.replicas | std::views::filter([&] (const tablet_replica& r) {
|
||||
return r != *replica;
|
||||
}) | std::ranges::to<tablet_replica_set>();
|
||||
tablet_migration_info mig{
|
||||
.kind = locator::tablet_transition_kind::rebuild_v2,
|
||||
.tablet = gid,
|
||||
.src = *replica,
|
||||
.dst = std::nullopt,
|
||||
};
|
||||
auto mig_streaming_info = get_migration_streaming_info(topo, ti, mig);
|
||||
// The node being shrunk may be excluded/down and lack complete tablet stats.
|
||||
// Since we're removing a replica (not placing one), accurate load data isn't needed.
|
||||
if (_load_sketch->has_node(replica->host)) {
|
||||
unload(*_load_sketch, replica->host, replica->shard, source_tablets);
|
||||
}
|
||||
if (can_accept_load(nodes, mig_streaming_info)) {
|
||||
apply_load(nodes, mig_streaming_info);
|
||||
mark_as_scheduled(mig);
|
||||
mplan.add(std::move(mig));
|
||||
}
|
||||
if (nodes.contains(replica->host)) {
|
||||
decrease_node_load(nodes, *replica, source_tablets);
|
||||
}
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
}
|
||||
mplan.set_rf_change_plan(std::move(plan));
|
||||
co_return mplan;
|
||||
}
|
||||
|
||||
// Returns true if a table has replicas of all its sibling tablets co-located.
|
||||
// This is used for determining whether merge can be finalized, since co-location
|
||||
// is a strict requirement for sibling tablets to be merged.
|
||||
@@ -3077,13 +2658,14 @@ public:
|
||||
src_shard.dusage->used -= tablet_sizes;
|
||||
}
|
||||
|
||||
void increase_node_load(node_load_map& nodes, tablet_replica replica, const migration_tablet_set& tablet_set) {
|
||||
// Adjusts the load of the source and destination (host:shard) that were picked for the migration.
|
||||
void update_node_load_on_migration(node_load_map& nodes, tablet_replica src, tablet_replica dst, const migration_tablet_set& tablet_set) {
|
||||
auto tablet_count = tablet_set.tablets().size();
|
||||
auto tablet_sizes = tablet_set.tablet_set_disk_size;
|
||||
auto table = tablet_set.tablets().front().table;
|
||||
|
||||
auto& dst_node = nodes[replica.host];
|
||||
auto& dst_shard = dst_node.shards[replica.shard];
|
||||
auto& dst_node = nodes[dst.host];
|
||||
auto& dst_shard = dst_node.shards[dst.shard];
|
||||
dst_shard.tablet_count += tablet_count;
|
||||
dst_shard.tablet_count_per_table[table] += tablet_count;
|
||||
dst_shard.tablet_sizes_per_table[table] += tablet_sizes;
|
||||
@@ -3093,15 +2675,9 @@ public:
|
||||
dst_node.tablet_count += tablet_count;
|
||||
dst_node.dusage->used += tablet_sizes;
|
||||
dst_node.update();
|
||||
}
|
||||
|
||||
void decrease_node_load(node_load_map& nodes, tablet_replica replica, const migration_tablet_set& tablet_set) {
|
||||
auto tablet_count = tablet_set.tablets().size();
|
||||
auto tablet_sizes = tablet_set.tablet_set_disk_size;
|
||||
auto table = tablet_set.tablets().front().table;
|
||||
|
||||
auto& src_node = nodes[replica.host];
|
||||
auto& src_shard = src_node.shards[replica.shard];
|
||||
auto& src_node = nodes[src.host];
|
||||
auto& src_shard = src_node.shards[src.shard];
|
||||
src_shard.tablet_count -= tablet_count;
|
||||
src_shard.tablet_count_per_table[table] -= tablet_count;
|
||||
src_shard.tablet_sizes_per_table[table] -= tablet_sizes;
|
||||
@@ -3117,12 +2693,6 @@ public:
|
||||
src_node.update();
|
||||
}
|
||||
|
||||
// Adjusts the load of the source and destination (host:shard) that were picked for the migration.
|
||||
void update_node_load_on_migration(node_load_map& nodes, tablet_replica src, tablet_replica dst, const migration_tablet_set& tablet_set) {
|
||||
increase_node_load(nodes, dst, tablet_set);
|
||||
decrease_node_load(nodes, src, tablet_set);
|
||||
}
|
||||
|
||||
static void unload(locator::load_sketch& sketch, host_id host, shard_id shard, const migration_tablet_set& tablet_set) {
|
||||
sketch.unload(host, shard, tablet_set.tablets().size(), tablet_set.tablet_set_disk_size);
|
||||
}
|
||||
@@ -4073,7 +3643,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
future<migration_plan> make_plan(dc_name dc, std::optional<sstring> rack = std::nullopt, std::vector<rf_change_action> rf_change_actions = {}) {
|
||||
future<migration_plan> make_plan(dc_name dc, std::optional<sstring> rack = std::nullopt) {
|
||||
migration_plan plan;
|
||||
|
||||
if (utils::get_local_injector().enter("tablet_migration_bypass")) {
|
||||
@@ -4191,6 +3761,12 @@ public:
|
||||
});
|
||||
}
|
||||
|
||||
if (nodes.empty()) {
|
||||
lblogger.debug("No nodes to balance.");
|
||||
_current_stats->stop_balance++;
|
||||
co_return plan;
|
||||
}
|
||||
|
||||
// Detect finished drain.
|
||||
|
||||
for (auto i = nodes_to_drain.begin(); i != nodes_to_drain.end();) {
|
||||
@@ -4265,6 +3841,7 @@ public:
|
||||
}
|
||||
lblogger.debug("No candidate nodes");
|
||||
_current_stats->stop_no_candidates++;
|
||||
co_return plan;
|
||||
}
|
||||
|
||||
// We want to saturate the target node so we migrate several tablets in parallel, one for each shard
|
||||
@@ -4426,7 +4003,7 @@ public:
|
||||
|
||||
print_node_stats(nodes, only_active::yes);
|
||||
|
||||
if (has_dest_nodes && (!nodes_to_drain.empty() || (_tm->tablets().balancing_enabled() && (shuffle || !is_balanced(min_load, max_load)))) && !nodes.empty()) {
|
||||
if (!nodes_to_drain.empty() || (_tm->tablets().balancing_enabled() && (shuffle || !is_balanced(min_load, max_load)))) {
|
||||
host_id target = *min_load_node;
|
||||
lblogger.info("target node: {}, avg_load: {}, max: {}", target, min_load, max_load);
|
||||
plan.merge(co_await make_internode_plan(nodes, nodes_to_drain, target));
|
||||
@@ -4438,10 +4015,6 @@ public:
|
||||
plan.merge(co_await make_intranode_plan(nodes, nodes_to_drain));
|
||||
}
|
||||
|
||||
if (!rf_change_actions.empty() && rack.has_value()) {
|
||||
plan.merge(co_await make_rf_change_plan(nodes, rf_change_actions, dc, rack.value()));
|
||||
}
|
||||
|
||||
if (_tm->tablets().balancing_enabled() && plan.empty() && !ongoing_rack_list_colocation()) {
|
||||
auto dc_merge_plan = co_await make_merge_colocation_plan(nodes);
|
||||
auto level = dc_merge_plan.tablet_migration_count() > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
|
||||
@@ -8,10 +8,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "replica/database_fwd.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "tablet_allocator_fwd.hh"
|
||||
#include "locator/token_metadata_fwd.hh"
|
||||
#include <seastar/core/metrics.hh>
|
||||
@@ -183,34 +181,6 @@ struct tablet_rack_list_colocation_plan {
|
||||
}
|
||||
};
|
||||
|
||||
// Data needed to complete a keyspace RF change request.
// Consumed by generate_rf_change_completion_update().
struct rf_change_completion_info {
|
||||
// Id of the RF change request being completed.
utils::UUID request_id;
|
||||
// Name of the keyspace the request applies to.
sstring ks_name;
|
||||
// Empty when the change succeeded; otherwise the error recorded when the request is marked done.
sstring error;
|
||||
// Keyspace properties saved with the request; applied to the keyspace only when `error` is empty.
std::unordered_map<sstring, sstring> saved_ks_props;
|
||||
};
|
||||
|
||||
// Describes one keyspace RF change request that is to be aborted.
// NOTE(review): the consumer of this struct is not visible here; field notes below are assumptions to confirm.
struct rf_change_abort_info {
|
||||
// Id of the RF change request.
utils::UUID request_id;
|
||||
// Name of the keyspace the request applies to.
sstring ks_name;
|
||||
// Presumably the reason for the abort -- confirm against the code that fills it.
sstring error;
|
||||
// Presumably the keyspace's replication options at the time of the abort -- confirm.
locator::replication_strategy_config_options current_replication;
|
||||
};
|
||||
|
||||
struct keyspace_rf_change_plan {
|
||||
std::optional<rf_change_completion_info> completion;
|
||||
std::vector<rf_change_abort_info> aborts;
|
||||
|
||||
size_t size() const { return (completion ? 1 : 0) + aborts.size(); };
|
||||
|
||||
void merge(keyspace_rf_change_plan&& other) {
|
||||
if (!completion) {
|
||||
completion = std::move(other.completion);
|
||||
}
|
||||
std::move(other.aborts.begin(), other.aborts.end(), std::back_inserter(aborts));
|
||||
}
|
||||
};
|
||||
|
||||
class migration_plan {
|
||||
public:
|
||||
using migrations_vector = utils::chunked_vector<tablet_migration_info>;
|
||||
@@ -219,22 +189,19 @@ private:
|
||||
table_resize_plan _resize_plan;
|
||||
tablet_repair_plan _repair_plan;
|
||||
tablet_rack_list_colocation_plan _rack_list_colocation_plan;
|
||||
keyspace_rf_change_plan _rf_change_plan;
|
||||
bool _has_nodes_to_drain = false;
|
||||
std::vector<drain_failure> _drain_failures;
|
||||
public:
|
||||
/// Returns true iff there are decommissioning nodes which own some tablet replicas.
|
||||
bool has_nodes_to_drain() const { return _has_nodes_to_drain; }
|
||||
bool requires_schema_changes() const { return _rf_change_plan.size() > 0; }
|
||||
|
||||
const migrations_vector& migrations() const { return _migrations; }
|
||||
bool empty() const { return !size(); }
|
||||
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size() + _rack_list_colocation_plan.size() + _drain_failures.size() + _rf_change_plan.size(); }
|
||||
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size() + _rack_list_colocation_plan.size() + _drain_failures.size(); }
|
||||
size_t tablet_migration_count() const { return _migrations.size(); }
|
||||
size_t resize_decision_count() const { return _resize_plan.size(); }
|
||||
size_t tablet_repair_count() const { return _repair_plan.size(); }
|
||||
size_t tablet_rack_list_colocation_count() const { return _rack_list_colocation_plan.size(); }
|
||||
size_t keyspace_rf_change_count() const { return _rf_change_plan.size(); }
|
||||
const std::vector<drain_failure>& drain_failures() const { return _drain_failures; }
|
||||
|
||||
void add(tablet_migration_info info) {
|
||||
@@ -258,7 +225,6 @@ public:
|
||||
_resize_plan.merge(std::move(other._resize_plan));
|
||||
_repair_plan.merge(std::move(other._repair_plan));
|
||||
_rack_list_colocation_plan.merge(std::move(other._rack_list_colocation_plan));
|
||||
_rf_change_plan.merge(std::move(other._rf_change_plan));
|
||||
}
|
||||
|
||||
void set_has_nodes_to_drain(bool b) {
|
||||
@@ -283,12 +249,6 @@ public:
|
||||
_rack_list_colocation_plan = std::move(rack_list_colocation_plan);
|
||||
}
|
||||
|
||||
const keyspace_rf_change_plan& rf_change_plan() const { return _rf_change_plan; }
|
||||
|
||||
void set_rf_change_plan(keyspace_rf_change_plan rf_change_plan) {
|
||||
_rf_change_plan = std::move(rf_change_plan);
|
||||
}
|
||||
|
||||
future<std::unordered_set<locator::global_tablet_id>> get_migration_tablet_ids() const;
|
||||
};
|
||||
|
||||
@@ -357,9 +317,6 @@ future<bool> requires_rack_list_colocation(
|
||||
db::system_keyspace* sys_ks,
|
||||
utils::UUID request_id);
|
||||
|
||||
bool rf_count_per_dc_equals(const locator::replication_strategy_config_options& current, const locator::replication_strategy_config_options& next);
|
||||
std::unordered_map<sstring, std::vector<sstring>> subtract_replication(const std::unordered_map<sstring, std::vector<sstring>>& left, const std::unordered_map<sstring, std::vector<sstring>>& right);
|
||||
|
||||
}
|
||||
|
||||
template <>
|
||||
|
||||
@@ -452,7 +452,7 @@ future<std::optional<tasks::task_status>> global_topology_request_virtual_task::
|
||||
}
|
||||
|
||||
future<> global_topology_request_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hint) noexcept {
|
||||
return _ss.abort_rf_change(id.uuid());
|
||||
return _ss.abort_paused_rf_change(id.uuid());
|
||||
}
|
||||
|
||||
future<std::vector<tasks::task_stats>> global_topology_request_virtual_task::get_stats() {
|
||||
|
||||
@@ -414,20 +414,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
}
|
||||
};
|
||||
|
||||
// Submits `updates` to group 0 as a single mixed_change command, using `reason`
// as the description of the change.
//
// A group0_concurrent_modification raised while doing so is logged and then
// rethrown to the caller.
future<> update_topology_state_with_mixed_change(
        group0_guard guard, utils::chunked_vector<canonical_mutation>&& updates, const sstring& reason) {
    try {
        rtlogger.info("updating topology state with mixed change: {}", reason);
        // Trace the mutations first: they are moved into the command right below.
        rtlogger.trace("update_topology_state mutations: {}", updates);

        mixed_change payload{std::move(updates)};
        group0_command command = _group0.client().prepare_command(std::move(payload), guard, reason);
        co_await _group0.client().add_entry(std::move(command), std::move(guard), _as);
    } catch (group0_concurrent_modification&) {
        rtlogger.info("race while changing state: {}. Retrying", reason);
        throw;
    }
}
|
||||
|
||||
raft::server_id parse_replaced_node(const std::optional<request_param>& req_param) const {
|
||||
return service::topology::parse_replaced_node(req_param);
|
||||
}
|
||||
@@ -975,63 +961,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
}
|
||||
}
|
||||
|
||||
// Procedure selected for a keyspace RF change request; see choose_keyspace_rf_change_kind().
enum class keyspace_rf_change_kind {
|
||||
// Fallback: selected when the keyspace has no tables/views or no other kind applies.
default_rf_change,
|
||||
||||
// Selected when some DC switches from an sstring-valued option to a rack list
// and requires_rack_list_colocation() reports that colocation is needed.
conversion_to_rack_list,
|
||||
// Selected when the keyspace_multi_rf_change feature is enabled, old and new options
// use rack lists exclusively, per-DC RF counts differ, and no DC changes between
// two different non-zero RFs.
multi_rf_change
|
||||
};
|
||||
|
||||
// Picks the procedure used to carry out a keyspace RF change from the options of
// old_ks_md to the options of new_ks_md.
//
// Rules, checked in this order:
//  1. tables_with_mvs is empty                                   -> default_rf_change
//  2. some DC goes from an sstring-valued option to a rack list,
//     and requires_rack_list_colocation() returns true for req_id -> conversion_to_rack_list
//  3. the keyspace_multi_rf_change feature is enabled, both option sets use rack
//     lists exclusively, rf_count_per_dc_equals() is false for them, and no DC
//     changes between two different non-zero RFs                  -> multi_rf_change
//  4. anything else                                               -> default_rf_change
future<keyspace_rf_change_kind> choose_keyspace_rf_change_kind(utils::UUID req_id,
        lw_shared_ptr<keyspace_metadata> old_ks_md,
        lw_shared_ptr<keyspace_metadata> new_ks_md,
        const std::vector<schema_ptr>& tables_with_mvs) {
    const auto& new_opts = new_ks_md->strategy_options();
    const auto& old_opts = old_ks_md->strategy_options();

    if (tables_with_mvs.empty()) {
        co_return keyspace_rf_change_kind::default_rf_change;
    }

    // True when a DC present in the old options with an sstring value holds a rack_list in the new ones.
    const auto converts_to_rack_list = [&] {
        for (const auto& [dc, new_value] : new_opts) {
            if (!std::holds_alternative<locator::rack_list>(new_value)) {
                continue;
            }
            const auto old_pos = old_opts.find(dc);
            if (old_pos != old_opts.end() && std::holds_alternative<sstring>(old_pos->second)) {
                return true;
            }
        }
        return false;
    };

    // Replication factor configured for `dc` in `opts`; 0 when the DC is not listed.
    const auto rf_in = [] (const auto& opts, const sstring& dc) -> size_t {
        const auto pos = opts.find(dc);
        return pos == opts.end() ? 0 : locator::get_replication_factor(pos->second);
    };

    // True when every DC whose RF changes has RF 0 on one side of the change.
    const auto only_zero_to_n_changes = [&] {
        std::set<sstring> dcs;
        for (const auto& dc : old_opts | std::views::keys) {
            dcs.insert(dc);
        }
        for (const auto& dc : new_opts | std::views::keys) {
            dcs.insert(dc);
        }
        for (const auto& dc : dcs) {
            const size_t old_rf = rf_in(old_opts, dc);
            const size_t new_rf = rf_in(new_opts, dc);
            if (old_rf != new_rf && old_rf != 0 && new_rf != 0) {
                return false;
            }
        }
        return true;
    };

    // The colocation query is issued only when a rack list conversion was detected.
    if (converts_to_rack_list()
            && co_await requires_rack_list_colocation(_db, get_token_metadata_ptr(), &_sys_ks, req_id)) {
        co_return keyspace_rf_change_kind::conversion_to_rack_list;
    }

    if (_feature_service.keyspace_multi_rf_change
            && locator::uses_rack_list_exclusively(old_opts)
            && locator::uses_rack_list_exclusively(new_opts)
            && !rf_count_per_dc_equals(old_opts, new_opts)
            && only_zero_to_n_changes()) {
        co_return keyspace_rf_change_kind::multi_rf_change;
    }

    co_return keyspace_rf_change_kind::default_rf_change;
}
|
||||
|
||||
// Precondition: there is no node request and no ongoing topology transition
|
||||
// (checked under the guard we're holding).
|
||||
future<> handle_global_request(group0_guard guard) {
|
||||
@@ -1087,18 +1016,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
saved_ks_props = *req_entry.new_keyspace_rf_change_data;
|
||||
}
|
||||
|
||||
auto tbuilder_with_request_drop = [&] () {
|
||||
topology_mutation_builder tbuilder(guard.write_timestamp());
|
||||
tbuilder.set_transition_state(topology::transition_state::tablet_migration)
|
||||
.set_version(_topo_sm._topology.version + 1)
|
||||
.del_global_topology_request()
|
||||
.del_global_topology_request_id()
|
||||
.drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id);
|
||||
return tbuilder;
|
||||
};
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
sstring error;
|
||||
bool needs_colocation = false;
|
||||
if (_db.has_keyspace(ks_name)) {
|
||||
try {
|
||||
auto& ks = _db.find_keyspace(ks_name);
|
||||
@@ -1110,93 +1030,82 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
size_t unimportant_init_tablet_count = 2; // must be a power of 2
|
||||
locator::tablet_map new_tablet_map{unimportant_init_tablet_count};
|
||||
|
||||
auto tables_with_mvs = ks.metadata()->tables();
|
||||
auto views = ks.metadata()->views();
|
||||
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
|
||||
auto rf_change_kind = co_await choose_keyspace_rf_change_kind(req_id, ks.metadata(), ks_md, tables_with_mvs);
|
||||
switch (rf_change_kind) {
|
||||
case keyspace_rf_change_kind::default_rf_change: {
|
||||
if (!tables_with_mvs.empty()) {
|
||||
auto table = tables_with_mvs.front();
|
||||
auto tablet_count = tmptr->tablets().get_tablet_map(table->id()).tablet_count();
|
||||
locator::replication_strategy_params params{ks_md->strategy_options(), tablet_count, ks.metadata()->consistency_option()};
|
||||
auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params, tmptr->get_topology());
|
||||
auto schedule_migrations = [&] () -> future<> {
|
||||
auto tables_with_mvs = ks.metadata()->tables();
|
||||
auto views = ks.metadata()->views();
|
||||
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
|
||||
if (!tables_with_mvs.empty()) {
|
||||
auto table = tables_with_mvs.front();
|
||||
auto tablet_count = tmptr->tablets().get_tablet_map(table->id()).tablet_count();
|
||||
locator::replication_strategy_params params{ks_md->strategy_options(), tablet_count, ks.metadata()->consistency_option()};
|
||||
auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params, tmptr->get_topology());
|
||||
|
||||
for (const auto& table_or_mv : tables_with_mvs) {
|
||||
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
|
||||
// Apply the transition only on base tables.
|
||||
// If this table has a base table then the transition will be applied on the base table, and
|
||||
// the base table will coordinate the transition for the entire group.
|
||||
continue;
|
||||
auto check_needs_colocation = [&] () -> future<bool> {
|
||||
const auto& new_replication_strategy_config = new_strategy->get_config_options();
|
||||
const auto& old_replication_strategy_config = ks.metadata()->strategy_options();
|
||||
bool rack_list_conversion = false;
|
||||
for (const auto& [dc, rf_value] : new_replication_strategy_config) {
|
||||
if (std::holds_alternative<locator::rack_list>(rf_value)) {
|
||||
auto it = old_replication_strategy_config.find(dc);
|
||||
if (it != old_replication_strategy_config.end() && std::holds_alternative<sstring>(it->second)) {
|
||||
rack_list_conversion = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto old_tablets = co_await tmptr->tablets().get_tablet_map(table_or_mv->id()).clone_gently();
|
||||
new_tablet_map = co_await new_strategy->maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, co_await old_tablets.clone_gently());
|
||||
|
||||
replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
|
||||
co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> {
|
||||
auto last_token = new_tablet_map.get_last_token(tablet_id);
|
||||
auto old_tablet_info = old_tablets.get_tablet_info(last_token);
|
||||
auto abandoning_replicas = locator::substract_sets(old_tablet_info.replicas, tablet_info.replicas);
|
||||
auto new_replicas = locator::substract_sets(tablet_info.replicas, old_tablet_info.replicas);
|
||||
if (abandoning_replicas.size() + new_replicas.size() > 1) {
|
||||
throw std::runtime_error(fmt::format("Invalid state of a tablet {} of a table {}.{}. Expected replication factor: {}, but the tablet has replicas only on {}. "
|
||||
"Try again later or use the \"Fixing invalid replica state with RF change\" procedure to fix the problem.", tablet_id, ks_name, table_or_mv->cf_name(),
|
||||
ks.get_replication_strategy().get_replication_factor(*tmptr), old_tablet_info.replicas));
|
||||
}
|
||||
|
||||
updates.emplace_back(co_await make_canonical_mutation_gently(
|
||||
replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id())
|
||||
.set_new_replicas(last_token, tablet_info.replicas)
|
||||
.set_stage(last_token, locator::tablet_transition_stage::allow_write_both_read_old)
|
||||
.set_transition(last_token, locator::choose_rebuild_transition_kind(_feature_service))
|
||||
.build()
|
||||
));
|
||||
|
||||
// Calculate abandoning replica and abort view building tasks on them
|
||||
if (!abandoning_replicas.empty()) {
|
||||
if (abandoning_replicas.size() != 1) {
|
||||
on_internal_error(rtlogger, fmt::format("Keyspace RF abandons {} replicas for table {} and tablet id {}", abandoning_replicas.size(), table_or_mv->id(), tablet_id));
|
||||
}
|
||||
_vb_coordinator->abort_tasks(updates, guard, table_or_mv->id(), *abandoning_replicas.begin(), last_token);
|
||||
}
|
||||
|
||||
co_await coroutine::maybe_yield();
|
||||
});
|
||||
}
|
||||
co_return rack_list_conversion ? co_await requires_rack_list_colocation(_db, tmptr, &_sys_ks, req_id) : false;
|
||||
};
|
||||
if (needs_colocation = co_await check_needs_colocation(); needs_colocation) {
|
||||
co_return;
|
||||
}
|
||||
auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
|
||||
for (auto& m: schema_muts) {
|
||||
updates.emplace_back(m);
|
||||
}
|
||||
for (const auto& table_or_mv : tables_with_mvs) {
|
||||
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
|
||||
// Apply the transition only on base tables.
|
||||
// If this table has a base table then the transition will be applied on the base table, and
|
||||
// the base table will coordinate the transition for the entire group.
|
||||
continue;
|
||||
}
|
||||
auto old_tablets = co_await tmptr->tablets().get_tablet_map(table_or_mv->id()).clone_gently();
|
||||
new_tablet_map = co_await new_strategy->maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, co_await old_tablets.clone_gently());
|
||||
|
||||
updates.push_back(canonical_mutation(tbuilder_with_request_drop().build()));
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
|
||||
.done()
|
||||
.build()));
|
||||
break;
|
||||
}
|
||||
case keyspace_rf_change_kind::conversion_to_rack_list: {
|
||||
rtlogger.info("keyspace_rf_change for keyspace {} postponed for colocation", ks_name);
|
||||
topology_mutation_builder tbuilder = tbuilder_with_request_drop();
|
||||
tbuilder.pause_rf_change_request(req_id);
|
||||
updates.push_back(canonical_mutation(tbuilder.build()));
|
||||
break;
|
||||
}
|
||||
case keyspace_rf_change_kind::multi_rf_change: {
|
||||
rtlogger.info("keyspace_rf_change for keyspace {} will use multi-rf change procedure", ks_name);
|
||||
ks_md->set_next_strategy_options(ks_md->strategy_options());
|
||||
ks_md->set_strategy_options(ks.metadata()->strategy_options()); // start from the old strategy
|
||||
auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
|
||||
for (auto& m: schema_muts) {
|
||||
updates.emplace_back(m);
|
||||
}
|
||||
replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
|
||||
co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> {
|
||||
auto last_token = new_tablet_map.get_last_token(tablet_id);
|
||||
auto old_tablet_info = old_tablets.get_tablet_info(last_token);
|
||||
auto abandoning_replicas = locator::substract_sets(old_tablet_info.replicas, tablet_info.replicas);
|
||||
auto new_replicas = locator::substract_sets(tablet_info.replicas, old_tablet_info.replicas);
|
||||
if (abandoning_replicas.size() + new_replicas.size() > 1) {
|
||||
throw std::runtime_error(fmt::format("Invalid state of a tablet {} of a table {}.{}. Expected replication factor: {}, but the tablet has replicas only on {}. "
|
||||
"Try again later or use the \"Fixing invalid replica state with RF change\" procedure to fix the problem.", tablet_id, ks_name, table_or_mv->cf_name(),
|
||||
ks.get_replication_strategy().get_replication_factor(*tmptr), old_tablet_info.replicas));
|
||||
}
|
||||
|
||||
topology_mutation_builder tbuilder = tbuilder_with_request_drop();
|
||||
tbuilder.start_rf_change_migrations(req_id);
|
||||
updates.push_back(canonical_mutation(tbuilder.build()));
|
||||
break;
|
||||
updates.emplace_back(co_await make_canonical_mutation_gently(
|
||||
replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id())
|
||||
.set_new_replicas(last_token, tablet_info.replicas)
|
||||
.set_stage(last_token, locator::tablet_transition_stage::allow_write_both_read_old)
|
||||
.set_transition(last_token, locator::choose_rebuild_transition_kind(_feature_service))
|
||||
.build()
|
||||
));
|
||||
|
||||
// Calculate abandoning replica and abort view building tasks on them
|
||||
if (!abandoning_replicas.empty()) {
|
||||
if (abandoning_replicas.size() != 1) {
|
||||
on_internal_error(rtlogger, fmt::format("Keyspace RF abandons {} replicas for table {} and tablet id {}", abandoning_replicas.size(), table_or_mv->id(), tablet_id));
|
||||
}
|
||||
_vb_coordinator->abort_tasks(updates, guard, table_or_mv->id(), *abandoning_replicas.begin(), last_token);
|
||||
}
|
||||
|
||||
co_await coroutine::maybe_yield();
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
|
||||
for (auto& m: schema_muts) {
|
||||
updates.emplace_back(m);
|
||||
}
|
||||
};
|
||||
co_await schedule_migrations();
|
||||
} catch (const std::exception& e) {
|
||||
error = e.what();
|
||||
rtlogger.error("Couldn't process global_topology_request::keyspace_rf_change, desired new ks opts: {}, error: {}",
|
||||
@@ -1207,12 +1116,22 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
error = "Can't ALTER keyspace " + ks_name + ", keyspace doesn't exist";
|
||||
}
|
||||
|
||||
if (error != "") {
|
||||
updates.push_back(canonical_mutation(tbuilder_with_request_drop().build()));
|
||||
bool pause_request = needs_colocation && error.empty();
|
||||
topology_mutation_builder tbuilder(guard.write_timestamp());
|
||||
tbuilder.set_transition_state(topology::transition_state::tablet_migration)
|
||||
.set_version(_topo_sm._topology.version + 1)
|
||||
.del_global_topology_request()
|
||||
.del_global_topology_request_id()
|
||||
.drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id);
|
||||
if (pause_request) {
|
||||
rtlogger.info("keyspace_rf_change for keyspace {} postponed for colocation", ks_name);
|
||||
tbuilder.pause_rf_change_request(req_id);
|
||||
} else {
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
|
||||
.done(error)
|
||||
.build()));
|
||||
}
|
||||
updates.push_back(canonical_mutation(tbuilder.build()));
|
||||
|
||||
sstring reason = seastar::format("ALTER tablets KEYSPACE called with options: {}", saved_ks_props);
|
||||
rtlogger.trace("do update {} reason {}", updates, reason);
|
||||
@@ -1696,83 +1615,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
.build());
|
||||
}
|
||||
|
||||
// Updates keyspace properties; removes system_schema.keyspaces::next_replication;
|
||||
// finishes RF change request; Removes request from system.topology::ongoing_rf_changes.
|
||||
void generate_rf_change_completion_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const rf_change_completion_info& completion) {
|
||||
if (rtlogger.is_enabled(seastar::log_level::debug)) {
|
||||
sstring props_str;
|
||||
for (const auto& [key, value] : completion.saved_ks_props) {
|
||||
props_str += fmt::format(" {}={};", key, value);
|
||||
}
|
||||
rtlogger.debug("generate_rf_change_completion_update: request_id={}, ks_name={}, error='{}', saved_ks_props:{}",
|
||||
completion.request_id, completion.ks_name, completion.error, props_str);
|
||||
}
|
||||
sstring error = completion.error;
|
||||
if (_db.has_keyspace(completion.ks_name)) {
|
||||
auto& ks = _db.find_keyspace(completion.ks_name);
|
||||
if (error.empty()) {
|
||||
cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{completion.saved_ks_props.begin(), completion.saved_ks_props.end()}};
|
||||
new_ks_props.validate();
|
||||
auto ks_md = new_ks_props.as_ks_metadata_update(ks.metadata(), *get_token_metadata_ptr(), _db.features(), _db.get_config());
|
||||
ks_md->clear_next_strategy_options();
|
||||
|
||||
auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
|
||||
for (auto& m: schema_muts) {
|
||||
out.emplace_back(m);
|
||||
}
|
||||
} else {
|
||||
auto ks_md = make_lw_shared<data_dictionary::keyspace_metadata>(*ks.metadata());
|
||||
ks_md->clear_next_strategy_options();
|
||||
|
||||
auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
|
||||
for (auto& m: schema_muts) {
|
||||
out.emplace_back(m);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out.emplace_back(topology_mutation_builder(guard.write_timestamp())
|
||||
.finish_rf_change_migrations(_topo_sm._topology.ongoing_rf_changes, completion.request_id)
|
||||
.build());
|
||||
|
||||
out.push_back(canonical_mutation(topology_request_tracking_mutation_builder(completion.request_id)
|
||||
.done(error)
|
||||
.build()));
|
||||
}
|
||||
|
||||
// Sets next_replication to current_replication and sets error on the topology request.
|
||||
// Similar to storage_service::abort_rf_change for the ongoing_rf_changes case.
|
||||
void generate_rf_change_abort_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const rf_change_abort_info& abort_info) {
|
||||
rtlogger.debug("generate_rf_change_abort_update: request_id={}, ks_name={}, error='{}'", abort_info.request_id, abort_info.ks_name, abort_info.error);
|
||||
|
||||
if (!_db.has_keyspace(abort_info.ks_name)) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto& ks = _db.find_keyspace(abort_info.ks_name);
|
||||
auto ks_md = make_lw_shared<data_dictionary::keyspace_metadata>(*ks.metadata());
|
||||
ks_md->set_next_strategy_options(abort_info.current_replication);
|
||||
|
||||
auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
|
||||
for (auto& m : schema_muts) {
|
||||
out.emplace_back(m);
|
||||
}
|
||||
|
||||
out.push_back(canonical_mutation(topology_request_tracking_mutation_builder(abort_info.request_id)
|
||||
.abort(abort_info.error)
|
||||
.build()));
|
||||
}
|
||||
|
||||
// Appends to `out` the mutations for every RF change abort in the plan, followed by the
// mutations for the plan's completion when one is present.
future<> generate_rf_change_updates(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const keyspace_rf_change_plan& rf_change_plan) {
    for (const auto& pending_abort : rf_change_plan.aborts) {
        // co_await maybe_yield() before generating each abort update.
        co_await coroutine::maybe_yield();
        generate_rf_change_abort_update(out, guard, pending_abort);
    }

    const auto& completion = rf_change_plan.completion;
    if (!completion.has_value()) {
        co_return;
    }
    generate_rf_change_completion_update(out, guard, *completion);
}
|
||||
|
||||
future<> generate_migration_updates(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const migration_plan& plan) {
|
||||
if (plan.resize_plan().finalize_resize.empty() || plan.has_nodes_to_drain()) {
|
||||
// schedule tablet migration only if there are no pending resize finalisations or if the node is draining.
|
||||
@@ -1795,8 +1637,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
if (auto request_to_resume = plan.rack_list_colocation_plan().request_to_resume(); request_to_resume) {
|
||||
generate_rf_change_resume_update(out, guard, request_to_resume);
|
||||
}
|
||||
|
||||
co_await generate_rf_change_updates(out, guard, plan.rf_change_plan());
|
||||
}
|
||||
|
||||
auto sched_time = db_clock::now();
|
||||
@@ -2385,11 +2225,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
}
|
||||
|
||||
bool has_nodes_to_drain = false;
|
||||
bool requires_schema_changes = false;
|
||||
if (!preempt) {
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
has_nodes_to_drain = plan.has_nodes_to_drain();
|
||||
requires_schema_changes = plan.requires_schema_changes();
|
||||
if (!drain || plan.has_nodes_to_drain()) {
|
||||
co_await generate_migration_updates(updates, guard, plan);
|
||||
}
|
||||
@@ -2405,11 +2243,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
topology_mutation_builder(guard.write_timestamp())
|
||||
.set_version(_topo_sm._topology.version + 1)
|
||||
.build());
|
||||
if (requires_schema_changes) {
|
||||
co_await update_topology_state_with_mixed_change(std::move(guard), std::move(updates), format("Tablet migration"));
|
||||
} else {
|
||||
co_await update_topology_state(std::move(guard), std::move(updates), format("Tablet migration"));
|
||||
}
|
||||
co_await update_topology_state(std::move(guard), std::move(updates), format("Tablet migration"));
|
||||
}
|
||||
|
||||
if (needs_barrier) {
|
||||
@@ -4300,11 +4134,7 @@ future<std::optional<group0_guard>> topology_coordinator::maybe_start_tablet_mig
|
||||
.set_version(_topo_sm._topology.version + 1)
|
||||
.build());
|
||||
|
||||
if (plan.requires_schema_changes()) {
|
||||
co_await update_topology_state_with_mixed_change(std::move(guard), std::move(updates), "Starting tablet migration");
|
||||
} else {
|
||||
co_await update_topology_state(std::move(guard), std::move(updates), "Starting tablet migration");
|
||||
}
|
||||
co_await update_topology_state(std::move(guard), std::move(updates), "Starting tablet migration");
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
#include <seastar/core/metrics.hh>
|
||||
|
||||
#include "utils/log.hh"
|
||||
#include "raft/raft.hh"
|
||||
#include "raft/raft_fwd.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
#include "service/topology_state_machine.hh"
|
||||
#include "db/view/view_building_state.hh"
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user