Compare commits

..

2 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
9c401e260a Initial setup: Fix configure.py for Ubuntu/Debian platform
Temporarily add Ubuntu/Debian support in kmiplib() function to allow
configuration to proceed on Ubuntu systems.

Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
2025-12-16 11:18:56 +00:00
copilot-swe-agent[bot]
1824b04e2a Initial plan 2025-12-16 10:58:17 +00:00
207 changed files with 1303 additions and 7264 deletions

View File

@@ -1,14 +0,0 @@
name: Call Jira release creation for new milestone
on:
milestone:
types: [created]
jobs:
sync-milestone-to-jira:
uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
with:
# Comma-separated list of Jira project keys
jira_project_keys: "SCYLLADB,CUSTOMER"
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,13 +0,0 @@
name: validate_pr_author_email
on:
pull_request_target:
types:
- opened
- synchronize
- reopened
jobs:
validate_pr_author_email:
uses: scylladb/github-automation/.github/workflows/validate_pr_author_email.yml@main

View File

@@ -169,7 +169,7 @@ future<> controller::request_stop_server() {
});
}
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> controller::get_client_data() {
future<utils::chunked_vector<client_data>> controller::get_client_data() {
return _server.local().get_client_data();
}

View File

@@ -93,7 +93,7 @@ public:
// This virtual function is called (on each shard separately) when the
// virtual table "system.clients" is read. It is expected to generate a
// list of clients connected to this server (on this shard).
virtual future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data() override;
virtual future<utils::chunked_vector<client_data>> get_client_data() override;
};
}

View File

@@ -708,12 +708,8 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
// As long as the system_clients_entry object is alive, this request will
// be visible in the "system.clients" virtual table. When requested, this
// entry will be formatted by server::ongoing_request::make_client_data().
auto user_agent_header = co_await _connection_options_keys_and_values.get_or_load(req->get_header("User-Agent"), [] (const client_options_cache_key_type&) {
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
});
auto system_clients_entry = _ongoing_requests.emplace(
req->get_client_address(), std::move(user_agent_header),
req->get_client_address(), req->get_header("User-Agent"),
username, current_scheduling_group(),
req->get_protocol_name() == "https");
@@ -989,10 +985,10 @@ client_data server::ongoing_request::make_client_data() const {
return cd;
}
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> server::get_client_data() {
utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>> ret;
future<utils::chunked_vector<client_data>> server::get_client_data() {
utils::chunked_vector<client_data> ret;
co_await _ongoing_requests.for_each_gently([&ret] (const ongoing_request& r) {
ret.emplace_back(make_foreign(std::make_unique<client_data>(r.make_client_data())));
ret.emplace_back(r.make_client_data());
});
co_return ret;
}

View File

@@ -55,7 +55,6 @@ class server : public peering_sharded_service<server> {
// though it isn't really relevant for Alternator which defines its own
// timeouts separately. We can create this object only once.
updateable_timeout_config _timeout_config;
client_options_cache_type _connection_options_keys_and_values;
alternator_callbacks_map _callbacks;
@@ -89,7 +88,7 @@ class server : public peering_sharded_service<server> {
// is called when reading the "system.clients" virtual table.
struct ongoing_request {
socket_address _client_address;
client_options_cache_entry_type _user_agent;
sstring _user_agent;
sstring _username;
scheduling_group _scheduling_group;
bool _is_https;
@@ -108,7 +107,7 @@ public:
// table "system.clients" is read. It is expected to generate a list of
// clients connected to this server (on this shard). This function is
// called by alternator::controller::get_client_data().
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data();
future<utils::chunked_vector<client_data>> get_client_data();
private:
void set_routes(seastar::httpd::routes& r);
// If verification succeeds, returns the authenticated user's username

View File

@@ -31,7 +31,6 @@ set(swagger_files
api-doc/column_family.json
api-doc/commitlog.json
api-doc/compaction_manager.json
api-doc/client_routes.json
api-doc/config.json
api-doc/cql_server_test.json
api-doc/endpoint_snitch_info.json
@@ -69,7 +68,6 @@ target_sources(api
PRIVATE
api.cc
cache_service.cc
client_routes.cc
collectd.cc
column_family.cc
commitlog.cc

View File

@@ -1,23 +0,0 @@
, "client_routes_entry": {
"id": "client_routes_entry",
"summary": "An entry storing client routes",
"properties": {
"connection_id": {"type": "string"},
"host_id": {"type": "string", "format": "uuid"},
"address": {"type": "string"},
"port": {"type": "integer"},
"tls_port": {"type": "integer"},
"alternator_port": {"type": "integer"},
"alternator_https_port": {"type": "integer"}
},
"required": ["connection_id", "host_id", "address"]
}
, "client_routes_key": {
"id": "client_routes_key",
"summary": "A key of client_routes_entry",
"properties": {
"connection_id": {"type": "string"},
"host_id": {"type": "string", "format": "uuid"}
}
}

View File

@@ -1,74 +0,0 @@
, "/v2/client-routes":{
"get": {
"description":"List all client route entries",
"operationId":"get_client_routes",
"tags":["client_routes"],
"produces":[
"application/json"
],
"parameters":[],
"responses":{
"200":{
"schema":{
"type":"array",
"items":{ "$ref":"#/definitions/client_routes_entry" }
}
},
"default":{
"description":"unexpected error",
"schema":{"$ref":"#/definitions/ErrorModel"}
}
}
},
"post": {
"description":"Upsert one or more client route entries",
"operationId":"set_client_routes",
"tags":["client_routes"],
"parameters":[
{
"name":"body",
"in":"body",
"required":true,
"schema":{
"type":"array",
"items":{ "$ref":"#/definitions/client_routes_entry" }
}
}
],
"responses":{
"200":{ "description": "OK" },
"default":{
"description":"unexpected error",
"schema":{ "$ref":"#/definitions/ErrorModel" }
}
}
},
"delete": {
"description":"Delete one or more client route entries",
"operationId":"delete_client_routes",
"tags":["client_routes"],
"parameters":[
{
"name":"body",
"in":"body",
"required":true,
"schema":{
"type":"array",
"items":{ "$ref":"#/definitions/client_routes_key" }
}
}
],
"responses":{
"200":{
"description": "OK"
},
"default":{
"description":"unexpected error",
"schema":{
"$ref":"#/definitions/ErrorModel"
}
}
}
}
}

View File

@@ -37,7 +37,6 @@
#include "raft.hh"
#include "gms/gossip_address_map.hh"
#include "service_levels.hh"
#include "client_routes.hh"
logging::logger apilog("api");
@@ -68,11 +67,9 @@ future<> set_server_init(http_context& ctx) {
rb02->set_api_doc(r);
rb02->register_api_file(r, "swagger20_header");
rb02->register_api_file(r, "metrics");
rb02->register_api_file(r, "client_routes");
rb->register_function(r, "system",
"The system related API");
rb02->add_definitions_file(r, "metrics");
rb02->add_definitions_file(r, "client_routes");
set_system(ctx, r);
rb->register_function(r, "error_injection",
"The error injection API");
@@ -132,16 +129,6 @@ future<> unset_server_storage_service(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_storage_service(ctx, r); });
}
// Installs the client-routes REST handlers on the API HTTP server.
// `ctx` and `cr` are captured by reference by the installer callback.
future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr) {
    auto install = [&ctx, &cr] (routes& r) {
        set_client_routes(ctx, r, cr);
    };
    return ctx.http_server.set_routes(std::move(install));
}
// Removes the client-routes REST handlers from the API HTTP server.
future<> unset_server_client_routes(http_context& ctx) {
    auto uninstall = [&ctx] (routes& r) {
        unset_client_routes(ctx, r);
    };
    return ctx.http_server.set_routes(std::move(uninstall));
}
// Installs the load-meter REST handlers on the API HTTP server.
// `ctx` and `lm` are captured by reference by the installer callback.
future<> set_load_meter(http_context& ctx, service::load_meter& lm) {
    auto install = [&ctx, &lm] (routes& r) {
        set_load_meter(ctx, r, lm);
    };
    return ctx.http_server.set_routes(std::move(install));
}

View File

@@ -29,7 +29,6 @@ class storage_proxy;
class storage_service;
class raft_group0_client;
class raft_group_registry;
class client_routes_service;
} // namespace service
@@ -100,8 +99,6 @@ future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snit
future<> unset_server_snitch(http_context& ctx);
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
future<> unset_server_storage_service(http_context& ctx);
future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr);
future<> unset_server_client_routes(http_context& ctx);
future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader);
future<> unset_server_sstables_loader(http_context& ctx);
future<> set_server_view_builder(http_context& ctx, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g);

View File

@@ -1,176 +0,0 @@
/*
* Copyright (C) 2025-present ScyllaDB
*
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include <seastar/http/short_streams.hh>
#include "client_routes.hh"
#include "api/api.hh"
#include "service/storage_service.hh"
#include "service/client_routes.hh"
#include "utils/rjson.hh"
#include "api/api-doc/client_routes.json.hh"
using namespace seastar::httpd;
using namespace std::chrono_literals;
using namespace json;
extern logging::logger apilog;
namespace api {
// Rejects a client-routes API call while the `client_routes` feature flag of
// the local feature service is still off.
//
// Returns normally when the flag is set. Otherwise logs a warning naming
// `endpoint_name` and throws std::runtime_error.
static void validate_client_routes_endpoint(sharded<service::client_routes_service>& cr, sstring endpoint_name) {
    if (cr.local().get_feature_service().client_routes) {
        return;
    }
    apilog.warn("{}: called before the cluster feature was enabled", endpoint_name);
    auto message = fmt::format("{} requires all nodes to support the CLIENT_ROUTES cluster feature", endpoint_name);
    throw std::runtime_error(message);
}
// Extracts the mandatory string member `name` from the JSON value `v`.
//
// Throws bad_param_exception when the member is absent or is not a string.
static sstring parse_string(const char* name, rapidjson::Value const& v) {
    const auto member = v.FindMember(name);
    const bool present = member != v.MemberEnd();
    if (!present) {
        throw bad_param_exception(fmt::format("Missing '{}'", name));
    }
    const auto& field = member->value;
    if (field.IsString()) {
        // Build from pointer + length so the exact stored length is used.
        return sstring(field.GetString(), field.GetStringLength());
    }
    throw bad_param_exception(fmt::format("'{}' must be a string", name));
}
// Extracts the optional port member `name` from the JSON value `v`.
//
// Returns std::nullopt when the member is absent. Throws bad_param_exception
// when the member is not an integer or lies outside [1, 65535].
static std::optional<uint32_t> parse_port(const char* name, rapidjson::Value const& v) {
    const auto member = v.FindMember(name);
    if (member == v.MemberEnd()) {
        return std::nullopt;
    }
    const auto& field = member->value;
    if (!field.IsInt()) {
        throw bad_param_exception(fmt::format("'{}' must be an integer", name));
    }
    const int port = field.GetInt();
    const bool in_range = port >= 1 && port <= 65535;
    if (!in_range) {
        throw bad_param_exception(fmt::format("'{}' value={} is outside the allowed port range", name, port));
    }
    return port;
}
// Converts an upsert request body into a list of client route entries.
//
// `root` must be a JSON array of objects. Each object needs the string members
// "connection_id", "host_id" and "address", plus at least one of the optional
// port members. Throws bad_param_exception on any violation.
static std::vector<service::client_routes_service::client_route_entry> parse_set_client_array(const rapidjson::Document& root) {
    using entry_type = service::client_routes_service::client_route_entry;

    if (!root.IsArray()) {
        throw bad_param_exception("Body must be a JSON array");
    }

    std::vector<entry_type> entries;
    entries.reserve(root.GetArray().Size());
    for (const auto& item : root.GetArray()) {
        if (!item.IsObject()) {
            throw bad_param_exception("Each element must be object");
        }

        const auto cql_port = parse_port("port", item);
        const auto cql_tls_port = parse_port("tls_port", item);
        const auto alt_port = parse_port("alternator_port", item);
        const auto alt_https_port = parse_port("alternator_https_port", item);

        const bool any_port = cql_port.has_value() || cql_tls_port.has_value()
                || alt_port.has_value() || alt_https_port.has_value();
        if (!any_port) {
            throw bad_param_exception("At least one port field ('port', 'tls_port', 'alternator_port', 'alternator_https_port') must be specified");
        }

        entries.emplace_back(
            parse_string("connection_id", item),
            utils::UUID{parse_string("host_id", item)},
            parse_string("address", item),
            cql_port,
            cql_tls_port,
            alt_port,
            alt_https_port
        );
    }
    return entries;
}
// Handler for the set_client_routes operation: upserts the client route
// entries listed in the request body.
//
// Throws std::runtime_error (via validate_client_routes_endpoint) when the
// cluster feature is not enabled, and bad_param_exception (via
// parse_set_client_array) when the body is not a valid array of entries.
static
future<json::json_return_type>
rest_set_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
    // Pass the API operation name, as the delete/get handlers do, so that the
    // log line and error text name the endpoint rather than this C++ function.
    validate_client_routes_endpoint(cr, "set_client_routes");

    rapidjson::Document root;
    auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
    // NOTE(review): there is no explicit HasParseError() check; presumably a
    // failed parse leaves `root` a non-array, which parse_set_client_array()
    // rejects with "Body must be a JSON array" - confirm.
    root.Parse(content.c_str());

    co_await cr.local().set_client_routes(parse_set_client_array(root));
    co_return seastar::json::json_void();
}
// Converts a delete request body into a list of client route keys.
//
// `root` must be a JSON array whose elements are objects carrying the string
// members "connection_id" and "host_id". Throws bad_param_exception when the
// body is not an array, when an element is not an object, or (via
// parse_string) when a member is missing or not a string.
static std::vector<service::client_routes_service::client_route_key> parse_delete_client_array(const rapidjson::Document& root) {
    if (!root.IsArray()) {
        throw bad_param_exception("Body must be a JSON array");
    }
    std::vector<service::client_routes_service::client_route_key> v;
    v.reserve(root.GetArray().Size());
    for (const auto& element : root.GetArray()) {
        // parse_string() calls FindMember(), which is only valid on object
        // values. Reject anything else up front as a bad parameter, matching
        // the check done by parse_set_client_array().
        if (!element.IsObject()) {
            throw bad_param_exception("Each element must be object");
        }
        v.emplace_back(
            parse_string("connection_id", element),
            utils::UUID{parse_string("host_id", element)}
        );
    }
    return v;
}
// Handler for the delete_client_routes operation: removes the client route
// entries whose keys are listed in the request body.
static
future<json::json_return_type>
rest_delete_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
    validate_client_routes_endpoint(cr, "delete_client_routes");

    // Read the whole body first, then parse it as one JSON document.
    auto body = co_await util::read_entire_stream_contiguous(*req->content_stream);
    rapidjson::Document doc;
    doc.Parse(body.c_str());

    auto keys = parse_delete_client_array(doc);
    co_await cr.local().delete_client_routes(std::move(keys));
    co_return seastar::json::json_void();
}
// Handler for the get_client_routes operation: streams every stored client
// route entry as a JSON array. The entries are fetched on shard 0.
static
future<json::json_return_type>
rest_get_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
    validate_client_routes_endpoint(cr, "get_client_routes");

    using route_entry = service::client_routes_service::client_route_entry;

    co_return co_await cr.invoke_on(0, [] (service::client_routes_service& svc) -> future<json::json_return_type> {
        // Maps one stored entry to its JSON form; optional ports are only
        // assigned when they hold a value.
        auto to_json = [] (const route_entry& entry) {
            seastar::httpd::client_routes_json::client_routes_entry out;
            out.connection_id = entry.connection_id;
            out.host_id = fmt::to_string(entry.host_id);
            out.address = entry.address;
            if (entry.port.has_value()) {
                out.port = entry.port.value();
            }
            if (entry.tls_port.has_value()) {
                out.tls_port = entry.tls_port.value();
            }
            if (entry.alternator_port.has_value()) {
                out.alternator_port = entry.alternator_port.value();
            }
            if (entry.alternator_https_port.has_value()) {
                out.alternator_https_port = entry.alternator_https_port.value();
            }
            return out;
        };
        co_return json::json_return_type(stream_range_as_array(co_await svc.get_client_routes(), std::move(to_json)));
    });
}
// Binds the three client-routes operations (set, delete, get) on `r` to their
// handlers. The handlers capture `ctx` and `cr` by reference.
void set_client_routes(http_context& ctx, routes& r, sharded<service::client_routes_service>& cr) {
    namespace cr_json = seastar::httpd::client_routes_json;
    using request_ptr = std::unique_ptr<seastar::http::request>;

    cr_json::set_client_routes.set(r, [&ctx, &cr] (request_ptr req) {
        return rest_set_client_routes(ctx, cr, std::move(req));
    });
    cr_json::delete_client_routes.set(r, [&ctx, &cr] (request_ptr req) {
        return rest_delete_client_routes(ctx, cr, std::move(req));
    });
    cr_json::get_client_routes.set(r, [&ctx, &cr] (request_ptr req) {
        return rest_get_client_routes(ctx, cr, std::move(req));
    });
}
// Unbinds the three client-routes operations from `r`, in the same order in
// which set_client_routes() binds them.
void unset_client_routes(http_context& ctx, routes& r) {
    namespace cr_json = seastar::httpd::client_routes_json;

    cr_json::set_client_routes.unset(r);
    cr_json::delete_client_routes.unset(r);
    cr_json::get_client_routes.unset(r);
}
}

View File

@@ -1,20 +0,0 @@
/*
* Copyright (C) 2025-present ScyllaDB
*
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <seastar/core/sharded.hh>
#include <seastar/json/json_elements.hh>
#include "api/api_init.hh"
namespace api {
// Registers the client-routes REST handlers (set, delete and get) on `r`.
// The registered handlers hold references to `ctx` and `cr`.
void set_client_routes(http_context& ctx, httpd::routes& r, sharded<service::client_routes_service>& cr);
// Removes the client-routes REST handlers from `r`.
void unset_client_routes(http_context& ctx, httpd::routes& r);
}

View File

@@ -547,13 +547,17 @@ void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_build
vp.insert(b.second);
}
}
std::vector<sstring> res;
replica::database& db = vb.local().get_db();
auto uuid = validate_table(db, ks, cf_name);
replica::column_family& cf = db.find_column_family(uuid);
co_return cf.get_index_manager().list_indexes()
| std::views::transform([] (const auto& i) { return i.metadata().name(); })
| std::views::filter([&vp] (const auto& n) { return vp.contains(secondary_index::index_table_name(n)); })
| std::ranges::to<std::vector>();
res.reserve(cf.get_index_manager().list_indexes().size());
for (auto&& i : cf.get_index_manager().list_indexes()) {
if (vp.contains(secondary_index::index_table_name(i.metadata().name()))) {
res.emplace_back(i.metadata().name());
}
}
co_return res;
});
}

View File

@@ -10,9 +10,7 @@
#include <seastar/net/inet_address.hh>
#include <seastar/core/sstring.hh>
#include "seastarx.hh"
#include "utils/loading_shared_values.hh"
#include <list>
#include <optional>
enum class client_type {
@@ -29,20 +27,6 @@ enum class client_connection_stage {
ready,
};
// We implement a keys cache using a map-like utils::loading_shared_values container by storing empty values.
struct options_cache_value_type {};
using client_options_cache_type = utils::loading_shared_values<sstring, options_cache_value_type>;
using client_options_cache_entry_type = client_options_cache_type::entry_ptr;
using client_options_cache_key_type = client_options_cache_type::key_type;
// This struct represents a single OPTION key-value pair from the client's connection options.
// Both key and value are represented by corresponding "references" to their cached values.
// Each "reference" is effectively a lw_shared_ptr value.
struct client_option_key_value_cached_entry {
// Entry pointer into the options cache for the option's name.
client_options_cache_entry_type key;
// Entry pointer into the options cache for the option's value.
client_options_cache_entry_type value;
};
sstring to_string(client_connection_stage ct);
// Representation of a row in `system.clients'. std::optionals are for nullable cells.
@@ -53,8 +37,8 @@ struct client_data {
client_connection_stage connection_stage = client_connection_stage::established;
int32_t shard_id; /// ID of server-side shard which is processing the connection.
std::optional<client_options_cache_entry_type> driver_name;
std::optional<client_options_cache_entry_type> driver_version;
std::optional<sstring> driver_name;
std::optional<sstring> driver_version;
std::optional<sstring> hostname;
std::optional<int32_t> protocol_version;
std::optional<sstring> ssl_cipher_suite;
@@ -62,7 +46,6 @@ struct client_data {
std::optional<sstring> ssl_protocol;
std::optional<sstring> username;
std::optional<sstring> scheduling_group_name;
std::list<client_option_key_value_cached_entry> client_options;
sstring stage_str() const { return to_string(connection_stage); }
sstring client_type_str() const { return to_string(ct); }

View File

@@ -125,6 +125,10 @@ if(target_arch)
add_compile_options("-march=${target_arch}")
endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
add_compile_options("SHELL:-Xclang -fexperimental-assignment-tracking=disabled")
endif()
function(maybe_limit_stack_usage_in_KB stack_usage_threshold_in_KB config)
math(EXPR _stack_usage_threshold_in_bytes "${stack_usage_threshold_in_KB} * 1024")
set(_stack_usage_threshold_flag "-Wstack-usage=${_stack_usage_threshold_in_bytes}")

View File

@@ -12,7 +12,6 @@
#include <seastar/core/condition-variable.hh>
#include "schema/schema_fwd.hh"
#include "sstables/open_info.hh"
#include "compaction_descriptor.hh"
class reader_permit;
@@ -45,7 +44,7 @@ public:
virtual compaction_strategy_state& get_compaction_strategy_state() noexcept = 0;
virtual reader_permit make_compaction_reader_permit() const = 0;
virtual sstables::sstables_manager& get_sstables_manager() noexcept = 0;
virtual sstables::shared_sstable make_sstable(sstables::sstable_state) const = 0;
virtual sstables::shared_sstable make_sstable() const = 0;
virtual sstables::sstable_writer_config configure_writer(sstring origin) const = 0;
virtual api::timestamp_type min_memtable_timestamp() const = 0;
virtual api::timestamp_type min_memtable_live_timestamp() const = 0;

View File

@@ -416,9 +416,7 @@ future<compaction_result> compaction_task_executor::compact_sstables(compaction_
descriptor.enable_garbage_collection(co_await sstable_set_for_tombstone_gc(t));
}
descriptor.creator = [&t] (shard_id) {
// All compaction types going through this path will work on normal input sstables only.
// Off-strategy, for example, waits until the sstables move out of staging state.
return t.make_sstable(sstables::sstable_state::normal);
return t.make_sstable();
};
descriptor.replacer = [this, &t, &on_replace, offstrategy] (compaction_completion_desc desc) {
t.get_compaction_strategy().notify_completion(t, desc.old_sstables, desc.new_sstables);
@@ -1849,10 +1847,6 @@ protected:
throw make_compaction_stopped_exception();
}
}, false);
if (utils::get_local_injector().is_enabled("split_sstable_force_stop_exception")) {
throw make_compaction_stopped_exception();
}
co_return co_await do_rewrite_sstable(std::move(sst));
}
};
@@ -2290,16 +2284,12 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
}
future<std::vector<sstables::shared_sstable>>
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
co_return std::vector<sstables::shared_sstable>{sst};
}
// Throw an error if split cannot be performed due to e.g. out of space prevention.
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
if (is_disabled()) {
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
"reason might be out of space prevention", sst->get_filename()))));
if (!can_proceed(&t)) {
co_return std::vector<sstables::shared_sstable>{sst};
}
std::vector<sstables::shared_sstable> ret;
@@ -2307,11 +2297,8 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
compaction_progress_monitor monitor;
compaction_data info = create_compaction_data();
compaction_descriptor desc = split_compaction_task_executor::make_descriptor(sst, opt);
desc.creator = [&t, sst] (shard_id _) {
// NOTE: preserves the sstable state, since we want the output to be on the same state as the original.
// For example, if base table has views, it's important that sstable produced by repair will be
// in the staging state.
return t.make_sstable(sst->state());
desc.creator = [&t] (shard_id _) {
return t.make_sstable();
};
desc.replacer = [&] (compaction_completion_desc d) {
std::move(d.new_sstables.begin(), d.new_sstables.end(), std::back_inserter(ret));

View File

@@ -376,8 +376,7 @@ public:
// Splits a single SSTable by segregating all its data according to the classifier.
// If SSTable doesn't need split, the same input SSTable is returned as output.
// If SSTable needs split, then output SSTables are returned and the input SSTable is deleted.
// Exception is thrown if the input sstable cannot be split due to e.g. out of space prevention.
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt);
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt);
// Run a custom job for a given table, defined by a function
// it completes when future returned by job is ready or returns immediately

View File

@@ -1158,7 +1158,6 @@ scylla_core = (['message/messaging_service.cc',
'locator/topology.cc',
'locator/util.cc',
'service/client_state.cc',
'service/client_routes.cc',
'service/storage_service.cc',
'service/session.cc',
'service/task_manager_module.cc',
@@ -1319,8 +1318,6 @@ api = ['api/api.cc',
'api/storage_proxy.cc',
Json2Code('api/api-doc/cache_service.json'),
'api/cache_service.cc',
Json2Code('api/api-doc/client_routes.json'),
'api/client_routes.cc',
Json2Code('api/api-doc/collectd.json'),
'api/collectd.cc',
Json2Code('api/api-doc/endpoint_snitch_info.json'),
@@ -1698,18 +1695,6 @@ deps['test/vector_search/vector_store_client_test'] = ['test/vector_search/vect
deps['test/vector_search/load_balancer_test'] = ['test/vector_search/load_balancer_test.cc'] + scylla_tests_dependencies
deps['test/vector_search/client_test'] = ['test/vector_search/client_test.cc'] + scylla_tests_dependencies
boost_tests_prefixes = ["test/boost/", "test/vector_search/", "test/raft/", "test/manual/", "test/ldap/"]
# We need to link these files to all Boost tests to make sure that
# we can execute `--list_json_content` on them. That will produce
# a similar result as calling `--list_content={HRF,DOT}`.
# Unfortunately, to be able to do that, we're forced to link the
# relevant code by hand.
for key in deps.keys():
for prefix in boost_tests_prefixes:
if key.startswith(prefix):
deps[key] += ["test/lib/boost_tree_lister_injector.cc", "test/lib/boost_test_tree_lister.cc"]
wasm_deps = {}
wasm_deps['wasm/return_input.wat'] = 'test/resource/wasm/rust/return_input.rs'
@@ -2207,6 +2192,8 @@ def kmiplib():
for id in os_ids:
if id in { 'centos', 'fedora', 'rhel' }:
return 'rhel84'
elif id in { 'ubuntu', 'debian' }:
return 'ubuntu' # Temporarily use a placeholder for Ubuntu/Debian
print('Could not resolve libkmip.a for platform {}'.format(os_ids))
sys.exit(1)
@@ -2263,6 +2250,15 @@ def get_extra_cxxflags(mode, mode_config, cxx, debuginfo):
if debuginfo and mode_config['can_have_debug_info']:
cxxflags += ['-g', '-gz']
if 'clang' in cxx:
# Since AssignmentTracking was enabled by default in clang
# (llvm/llvm-project@de6da6ad55d3ca945195d1cb109cb8efdf40a52a)
# coroutine frame debugging info (`coro_frame_ty`) is broken.
#
# It seems that we aren't losing much by disabling AssignmentTracking,
# so for now we choose to disable it to get `coro_frame_ty` back.
cxxflags.append('-Xclang -fexperimental-assignment-tracking=disabled')
return cxxflags

View File

@@ -64,10 +64,6 @@ bool query_processor::topology_global_queue_empty() {
return remote().first.get().ss.topology_global_queue_empty();
}
// Forwards to ongoing_rf_change() on the `ss` member reached through remote(),
// passing the group0 guard and the keyspace name `ks` (moved into the call).
// The call is kept as a single expression so the object returned by remote()
// stays alive until the forwarded call has been made.
future<bool> query_processor::ongoing_rf_change(const service::group0_guard& guard, sstring ks) {
return remote().first.get().ss.ongoing_rf_change(guard, std::move(ks));
}
// Builds a query_state from the client state returned by
// client_state::for_internal_calls() and an empty service permit.
static service::query_state query_state_for_internal_call() {
return {service::client_state::for_internal_calls(), empty_service_permit()};
}

View File

@@ -474,7 +474,6 @@ public:
void reset_cache();
bool topology_global_queue_empty();
future<bool> ongoing_rf_change(const service::group0_guard& guard, sstring ks);
query_options make_internal_options(
const statements::prepared_statement::checked_weak_ptr& p,

View File

@@ -19,7 +19,6 @@
#include "locator/abstract_replication_strategy.hh"
#include "mutation/canonical_mutation.hh"
#include "prepared_statement.hh"
#include "seastar/coroutine/exception.hh"
#include "service/migration_manager.hh"
#include "service/storage_proxy.hh"
#include "service/topology_mutation.hh"
@@ -139,7 +138,6 @@ bool cql3::statements::alter_keyspace_statement::changes_tablets(query_processor
future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>
cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_processor& qp, service::query_state& state, const query_options& options, service::group0_batch& mc) const {
using namespace cql_transport;
bool unknown_keyspace = false;
try {
event::schema_change::target_type target_type = event::schema_change::target_type::KEYSPACE;
auto ks = qp.db().find_keyspace(_name);
@@ -160,12 +158,8 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
// when in reality nothing or only schema is being changed
if (changes_tablets(qp)) {
if (!qp.proxy().features().topology_global_request_queue && !qp.topology_global_queue_empty()) {
co_await coroutine::return_exception(
exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
}
if (qp.proxy().features().rack_list_rf && co_await qp.ongoing_rf_change(mc.guard(),_name)) {
co_await coroutine::return_exception(
exceptions::invalid_request_exception(format("Another RF change for this keyspace {} ongoing, please retry.", _name)));
return make_exception_future<std::tuple<::shared_ptr<::cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(
exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
}
qp.db().real_database().validate_keyspace_update(*ks_md_update);
@@ -248,15 +242,10 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
target_type,
keyspace());
mc.add_mutations(std::move(muts), "CQL alter keyspace");
co_return std::make_tuple(std::move(ret), warnings);
return make_ready_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(std::make_tuple(std::move(ret), warnings));
} catch (data_dictionary::no_such_keyspace& e) {
unknown_keyspace = true;
return make_exception_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(exceptions::invalid_request_exception("Unknown keyspace " + _name));
}
if (unknown_keyspace) {
co_await coroutine::return_exception(
exceptions::invalid_request_exception("Unknown keyspace " + _name));
}
std::unreachable();
}
std::unique_ptr<cql3::statements::prepared_statement>

View File

@@ -61,7 +61,7 @@ expand_to_racks(const locator::token_metadata& tm,
// Handle ALTER:
// ([]|0) -> numeric is allowed, there are no existing replicas
// numeric -> numeric' is not supported unless numeric == numeric'. User should convert RF to rack list of equal count first.
// numeric -> numeric' is not supported. User should convert RF to rack list of equal count first.
// rack_list -> len(rack_list) is allowed (no-op)
// rack_list -> numeric is not allowed
if (old_options.contains(dc)) {
@@ -75,8 +75,6 @@ expand_to_racks(const locator::token_metadata& tm,
"Cannot change replication factor for '{}' from {} to numeric {}, use rack list instead",
dc, old_rf_val, data.count()));
}
} else if (old_rf.count() == data.count()) {
return rf;
} else if (old_rf.count() > 0) {
throw exceptions::configuration_exception(fmt::format(
"Cannot change replication factor for '{}' from {} to {}, only rack list is allowed",
@@ -155,8 +153,6 @@ static locator::replication_strategy_config_options prepare_options(
}
// Validate options.
bool numeric_to_rack_list_transition = false;
bool rf_change = false;
for (auto&& [dc, opt] : options) {
locator::replication_factor_data rf(opt);
@@ -166,7 +162,6 @@ static locator::replication_strategy_config_options prepare_options(
old_rf = locator::replication_factor_data(i->second);
}
rf_change = rf_change || (old_rf && old_rf->count() != rf.count()) || (!old_rf && rf.count() != 0);
if (!rf.is_rack_based()) {
if (old_rf && old_rf->is_rack_based() && rf.count() != 0) {
if (old_rf->count() != rf.count()) {
@@ -192,11 +187,12 @@ static locator::replication_strategy_config_options prepare_options(
throw exceptions::configuration_exception(fmt::format(
"Rack list for '{}' contains duplicate entries", dc));
}
numeric_to_rack_list_transition = numeric_to_rack_list_transition || (old_rf && !old_rf->is_rack_based() && old_rf->count() != 0);
}
if (numeric_to_rack_list_transition && rf_change) {
throw exceptions::configuration_exception("Cannot change replication factor from numeric to rack list and rf value at the same time");
if (old_rf && !old_rf->is_rack_based() && old_rf->count() != 0) {
// FIXME: Allow this if replicas already conform to the given rack list.
// FIXME: Implement automatic colocation to allow transition to rack list.
throw exceptions::configuration_exception(fmt::format(
"Cannot change replication factor from numeric to rack list for '{}'", dc));
}
}
if (!rf && options.empty() && old_options.empty()) {
@@ -416,7 +412,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(s
? std::optional<unsigned>(0) : std::nullopt;
auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
bool uses_tablets = initial_tablets.has_value();
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
bool rack_list_enabled = feat.rack_list_rf;
auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
return data_dictionary::keyspace_metadata::new_keyspace(ks_name, sc,
std::move(options), initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
@@ -432,7 +428,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
throw exceptions::invalid_request_exception("Cannot alter replication strategy vnode/tablets flavor");
}
auto sc = get_replication_strategy_class();
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
bool rack_list_enabled = feat.rack_list_rf;
if (sc) {
options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
} else {

View File

@@ -248,7 +248,7 @@ future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
// which is larger than the segment ID of the RP of the last written hint.
cfg.base_segment_id = _last_written_rp.base_id();
return commitlog::create_commitlog(std::move(cfg)).then([this] (this auto, commitlog l) -> future<commitlog> {
return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) -> future<commitlog> {
// add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
// When this happens we want to refill _sender's segments only if it has finished with the segments it had before.
if (_sender.have_segments()) {

View File

@@ -135,5 +135,5 @@ const std::string db::object_storage_endpoint_param::gs_type = "gs";
// fmt formatter for db::object_storage_endpoint_param: writes the fixed prefix
// "object_storage_endpoint_param" followed by the result of e.to_json_string()
// into the format context's output iterator and returns that iterator.
auto fmt::formatter<db::object_storage_endpoint_param>::format(const db::object_storage_endpoint_param& e, fmt::format_context& ctx) const
-> decltype(ctx.out()) {
return fmt::format_to(ctx.out(), "object_storage_endpoint_param{}", e.to_json_string());
// NOTE(review): the statement below is unreachable after the return above; it looks
// like the other side of a diff pair - confirm which variant is meant to stay.
// In fmt format strings "{{" and "}}" are escaped literal braces, so this variant
// would emit "{}" verbatim instead of interpolating e.to_json_string().
return fmt::format_to(ctx.out(), "object_storage_endpoint_param{{}}", e.to_json_string());
}

View File

@@ -110,7 +110,6 @@ namespace {
system_keyspace::v3::CDC_LOCAL,
system_keyspace::DICTS,
system_keyspace::VIEW_BUILDING_TASKS,
system_keyspace::CLIENT_ROUTES,
};
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
props.enable_schema_commitlog();
@@ -138,7 +137,6 @@ namespace {
system_keyspace::ROLE_PERMISSIONS,
system_keyspace::DICTS,
system_keyspace::VIEW_BUILDING_TASKS,
system_keyspace::CLIENT_ROUTES,
};
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
props.is_group0_table = true;
@@ -311,7 +309,6 @@ schema_ptr system_keyspace::topology() {
.with_column("tablet_balancing_enabled", boolean_type, column_kind::static_column)
.with_column("upgrade_state", utf8_type, column_kind::static_column)
.with_column("global_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
.with_column("paused_rf_change_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
.set_comment("Current state of topology change machine")
.with_hash_version()
.build();
@@ -1418,23 +1415,6 @@ schema_ptr system_keyspace::view_building_tasks() {
return schema;
}
// Returns the schema of the system table named by CLIENT_ROUTES.
// The schema is built once per thread (static thread_local, initialized by an
// immediately-invoked lambda) and the same schema_ptr is returned on every
// subsequent call.
schema_ptr system_keyspace::client_routes() {
static thread_local auto schema = [] {
// The table id is generated from the NAME and CLIENT_ROUTES constants.
auto id = generate_legacy_id(NAME, CLIENT_ROUTES);
return schema_builder(NAME, CLIENT_ROUTES, std::make_optional(id))
// Primary key: connection_id is the partition key, host_id the clustering key.
.with_column("connection_id", utf8_type, column_kind::partition_key)
.with_column("host_id", uuid_type, column_kind::clustering_key)
// Regular columns: a textual address and four integer port columns.
.with_column("address", utf8_type)
.with_column("port", int32_type)
.with_column("tls_port", int32_type)
.with_column("alternator_port", int32_type)
.with_column("alternator_https_port", int32_type)
.with_hash_version()
.build();
}();
return schema;
}
future<system_keyspace::local_info> system_keyspace::load_local_info() {
auto msg = co_await execute_cql(format("SELECT host_id, cluster_name, data_center, rack FROM system.{} WHERE key=?", LOCAL), sstring(LOCAL));
@@ -2362,7 +2342,7 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
v3::cdc_local(),
raft(), raft_snapshots(), raft_snapshot_config(), group0_history(), discovery(),
topology(), cdc_generations_v3(), topology_requests(), service_levels_v2(), view_build_status_v2(),
dicts(), view_building_tasks(), client_routes(), cdc_streams_state(), cdc_streams_history()
dicts(), view_building_tasks(), cdc_streams_state(), cdc_streams_history()
});
if (cfg.check_experimental(db::experimental_features_t::feature::BROADCAST_TABLES)) {
@@ -3157,10 +3137,7 @@ static bool must_have_tokens(service::node_state nst) {
// A decommissioning node doesn't have tokens at the end, they are
// removed during transition to the left_token_ring state.
case service::node_state::decommissioning: return false;
// A removing node might or might not have tokens depending on whether
// REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled. To support both
// cases, we allow removing nodes to not have tokens.
case service::node_state::removing: return false;
case service::node_state::removing: return true;
case service::node_state::rebuilding: return true;
case service::node_state::normal: return true;
case service::node_state::left: return false;
@@ -3400,12 +3377,6 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
}
}
if (some_row.has("paused_rf_change_requests")) {
for (auto&& v : deserialize_set_column(*topology(), some_row, "paused_rf_change_requests")) {
ret.paused_rf_change_requests.insert(value_cast<utils::UUID>(v));
}
}
if (some_row.has("enabled_features")) {
ret.enabled_features = decode_features(deserialize_set_column(*topology(), some_row, "enabled_features"));
}
@@ -3617,43 +3588,35 @@ system_keyspace::topology_requests_entry system_keyspace::topology_request_row_t
return entry;
}
// Loads the topology request entry with the given id.
// This is the "must exist" wrapper around get_topology_request_entry_opt():
// when that lookup yields an empty optional, the condition is reported through
// on_internal_error() instead of being returned to the caller.
future<system_keyspace::topology_requests_entry> system_keyspace::get_topology_request_entry(utils::UUID id) {
auto r = co_await get_topology_request_entry_opt(id);
if (!r) {
on_internal_error(slogger, format("no entry for request id {}", id));
}
// Move the entry out of the optional to avoid copying it.
co_return std::move(*r);
}
future<std::optional<system_keyspace::topology_requests_entry>> system_keyspace::get_topology_request_entry_opt(utils::UUID id) {
future<system_keyspace::topology_requests_entry> system_keyspace::get_topology_request_entry(utils::UUID id, bool require_entry) {
auto rs = co_await execute_cql(
format("SELECT * FROM system.{} WHERE id = {}", TOPOLOGY_REQUESTS, id));
if (!rs || rs->empty()) {
co_return std::nullopt;
if (require_entry) {
on_internal_error(slogger, format("no entry for request id {}", id));
} else {
co_return topology_requests_entry{
.id = utils::null_uuid()
};
}
}
const auto& row = rs->one();
co_return topology_request_row_to_entry(id, row);
}
future<system_keyspace::topology_requests_entries> system_keyspace::get_topology_request_entries(std::vector<std::variant<service::topology_request, service::global_topology_request>> request_types, db_clock::time_point end_time_limit) {
sstring request_types_str = "";
bool first = true;
for (const auto& rt : request_types) {
if (!std::exchange(first, false)) {
request_types_str += ", ";
}
request_types_str += std::visit([] (auto&& arg) { return fmt::format("'{}'", arg); }, rt);
}
future<system_keyspace::topology_requests_entries> system_keyspace::get_node_ops_request_entries(db_clock::time_point end_time_limit) {
// Running requests.
auto rs_running = co_await execute_cql(
format("SELECT * FROM system.{} WHERE done = false AND request_type IN ({}) ALLOW FILTERING", TOPOLOGY_REQUESTS, request_types_str));
format("SELECT * FROM system.{} WHERE done = false AND request_type IN ('{}', '{}', '{}', '{}', '{}') ALLOW FILTERING", TOPOLOGY_REQUESTS,
service::topology_request::join, service::topology_request::replace, service::topology_request::rebuild, service::topology_request::leave, service::topology_request::remove));
// Requests which finished after end_time_limit.
auto rs_done = co_await execute_cql(
format("SELECT * FROM system.{} WHERE end_time > {} AND request_type IN ({}) ALLOW FILTERING", TOPOLOGY_REQUESTS, end_time_limit.time_since_epoch().count(), request_types_str));
format("SELECT * FROM system.{} WHERE end_time > {} AND request_type IN ('{}', '{}', '{}', '{}', '{}') ALLOW FILTERING", TOPOLOGY_REQUESTS, end_time_limit.time_since_epoch().count(),
service::topology_request::join, service::topology_request::replace, service::topology_request::rebuild, service::topology_request::leave, service::topology_request::remove));
topology_requests_entries m;
for (const auto& row: *rs_done) {
@@ -3671,16 +3634,6 @@ future<system_keyspace::topology_requests_entries> system_keyspace::get_topology
co_return m;
}
// Returns the topology request entries for node operations.
// Delegates to get_topology_request_entries() with a fixed list of request
// types (join, replace, rebuild, leave, remove) and passes end_time_limit
// through unchanged.
future<system_keyspace::topology_requests_entries> system_keyspace::get_node_ops_request_entries(db_clock::time_point end_time_limit) {
return get_topology_request_entries({
service::topology_request::join,
service::topology_request::replace,
service::topology_request::rebuild,
service::topology_request::leave,
service::topology_request::remove
}, end_time_limit);
}
future<mutation> system_keyspace::get_insert_dict_mutation(
std::string_view name,
bytes data,

View File

@@ -199,8 +199,6 @@ public:
static constexpr auto VIEW_BUILD_STATUS_V2 = "view_build_status_v2";
static constexpr auto DICTS = "dicts";
static constexpr auto VIEW_BUILDING_TASKS = "view_building_tasks";
static constexpr auto CLIENT_ROUTES = "client_routes";
static constexpr auto VERSIONS = "versions";
// auth
static constexpr auto ROLES = "roles";
@@ -278,7 +276,6 @@ public:
static schema_ptr view_build_status_v2();
static schema_ptr dicts();
static schema_ptr view_building_tasks();
static schema_ptr client_routes();
// auth
static schema_ptr roles();
@@ -670,9 +667,7 @@ public:
future<service::topology_request_state> get_topology_request_state(utils::UUID id, bool require_entry);
topology_requests_entry topology_request_row_to_entry(utils::UUID id, const cql3::untyped_result_set_row& row);
future<topology_requests_entry> get_topology_request_entry(utils::UUID id);
future<std::optional<topology_requests_entry>> get_topology_request_entry_opt(utils::UUID id);
future<system_keyspace::topology_requests_entries> get_topology_request_entries(std::vector<std::variant<service::topology_request, service::global_topology_request>> request_types, db_clock::time_point end_time_limit);
future<topology_requests_entry> get_topology_request_entry(utils::UUID id, bool require_entry);
future<topology_requests_entries> get_node_ops_request_entries(db_clock::time_point end_time_limit);
public:

View File

@@ -198,7 +198,6 @@ future<> view_building_worker::register_staging_sstable_tasks(std::vector<sstabl
future<> view_building_worker::run_staging_sstables_registrator() {
while (!_as.abort_requested()) {
bool sleep = false;
try {
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
co_await create_staging_sstable_tasks();
@@ -215,14 +214,6 @@ future<> view_building_worker::run_staging_sstables_registrator() {
vbw_logger.warn("Got group0_concurrent_modification while creating staging sstable tasks");
} catch (raft::request_aborted&) {
vbw_logger.warn("Got raft::request_aborted while creating staging sstable tasks");
} catch (...) {
vbw_logger.error("Exception while creating staging sstable tasks: {}", std::current_exception());
sleep = true;
}
if (sleep) {
vbw_logger.debug("Sleeping after exception.");
co_await seastar::sleep_abortable(1s, _as).handle_exception([] (auto x) { return make_ready_future<>(); });
}
}
}
@@ -426,12 +417,9 @@ future<> view_building_worker::check_for_aborted_tasks() {
auto my_host_id = vbw._db.get_token_metadata().get_topology().my_host_id();
auto my_replica = locator::tablet_replica{my_host_id, this_shard_id()};
auto it = vbw._state._batch->tasks.begin();
while (it != vbw._state._batch->tasks.end()) {
auto id = it->first;
auto task_opt = building_state.get_task(it->second.base_id, my_replica, id);
++it; // Advance the iterator before potentially removing the entry from the map.
auto tasks_map = vbw._state._batch->tasks; // Potentially, we'll remove elements from the map, so we need a copy to iterate over it
for (auto& [id, t]: tasks_map) {
auto task_opt = building_state.get_task(t.base_id, my_replica, id);
if (!task_opt || task_opt->get().aborted) {
co_await vbw._state._batch->abort_task(id);
}
@@ -461,7 +449,7 @@ static std::unordered_set<table_id> get_ids_of_all_views(replica::database& db,
}) | std::ranges::to<std::unordered_set>();;
}
// If `state::processing_base_table` is different from the `view_building_state::currently_processed_base_table`,
// If `state::processing_base_table` is different from the `view_building_state::currently_processed_base_table`,
// clear the state, save and flush new base table
future<> view_building_worker::state::update_processing_base_table(replica::database& db, const view_building_state& building_state, abort_source& as) {
if (processing_base_table != building_state.currently_processed_base_table) {
@@ -583,6 +571,8 @@ future<> view_building_worker::batch::do_work() {
break;
}
}
_vbw.local()._vb_state_machine.event.broadcast();
}
future<> view_building_worker::do_build_range(table_id base_id, std::vector<table_id> views_ids, dht::token last_token, abort_source& as) {
@@ -784,15 +774,13 @@ future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_
tasks.insert({id, *task_opt});
}
#ifdef SEASTAR_DEBUG
{
auto& some_task = tasks.begin()->second;
for (auto& [_, t]: tasks) {
SCYLLA_ASSERT(t.base_id == some_task.base_id);
SCYLLA_ASSERT(t.last_token == some_task.last_token);
SCYLLA_ASSERT(t.replica == some_task.replica);
SCYLLA_ASSERT(t.type == some_task.type);
SCYLLA_ASSERT(t.replica.shard == this_shard_id());
}
auto& some_task = tasks.begin()->second;
for (auto& [_, t]: tasks) {
SCYLLA_ASSERT(t.base_id == some_task.base_id);
SCYLLA_ASSERT(t.last_token == some_task.last_token);
SCYLLA_ASSERT(t.replica == some_task.replica);
SCYLLA_ASSERT(t.type == some_task.type);
SCYLLA_ASSERT(t.replica.shard == this_shard_id());
}
#endif
@@ -823,6 +811,25 @@ future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_
co_return collect_completed_tasks();
}
}
}

View File

@@ -605,8 +605,8 @@ public:
}
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::VERSIONS);
return schema_builder(system_keyspace::NAME, system_keyspace::VERSIONS, std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, "versions");
return schema_builder(system_keyspace::NAME, "versions", std::make_optional(id))
.with_column("key", utf8_type, column_kind::partition_key)
.with_column("version", utf8_type)
.with_column("build_mode", utf8_type)
@@ -749,7 +749,6 @@ class clients_table : public streaming_virtual_table {
.with_column("ssl_protocol", utf8_type)
.with_column("username", utf8_type)
.with_column("scheduling_group", utf8_type)
.with_column("client_options", map_type_impl::get_instance(utf8_type, utf8_type, false))
.with_hash_version()
.build();
}
@@ -767,7 +766,7 @@ class clients_table : public streaming_virtual_table {
future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
// Collect
using client_data_vec = utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>;
using client_data_vec = utils::chunked_vector<client_data>;
using shard_client_data = std::vector<client_data_vec>;
std::vector<foreign_ptr<std::unique_ptr<shard_client_data>>> cd_vec;
cd_vec.resize(smp::count);
@@ -807,13 +806,13 @@ class clients_table : public streaming_virtual_table {
for (unsigned i = 0; i < smp::count; i++) {
for (auto&& ps_cdc : *cd_vec[i]) {
for (auto&& cd : ps_cdc) {
if (cd_map.contains(cd->ip)) {
cd_map[cd->ip].emplace_back(std::move(cd));
if (cd_map.contains(cd.ip)) {
cd_map[cd.ip].emplace_back(std::move(cd));
} else {
dht::decorated_key key = make_partition_key(cd->ip);
dht::decorated_key key = make_partition_key(cd.ip);
if (this_shard_owns(key) && contains_key(qr.partition_range(), key)) {
ips.insert(decorated_ip{std::move(key), cd->ip});
cd_map[cd->ip].emplace_back(std::move(cd));
ips.insert(decorated_ip{std::move(key), cd.ip});
cd_map[cd.ip].emplace_back(std::move(cd));
}
}
co_await coroutine::maybe_yield();
@@ -826,58 +825,39 @@ class clients_table : public streaming_virtual_table {
co_await result.emit_partition_start(dip.key);
auto& clients = cd_map[dip.ip];
std::ranges::sort(clients, [] (const foreign_ptr<std::unique_ptr<client_data>>& a, const foreign_ptr<std::unique_ptr<client_data>>& b) {
return a->port < b->port || a->client_type_str() < b->client_type_str();
std::ranges::sort(clients, [] (const client_data& a, const client_data& b) {
return a.port < b.port || a.client_type_str() < b.client_type_str();
});
for (const auto& cd : clients) {
clustering_row cr(make_clustering_key(cd->port, cd->client_type_str()));
set_cell(cr.cells(), "shard_id", cd->shard_id);
set_cell(cr.cells(), "connection_stage", cd->stage_str());
if (cd->driver_name) {
set_cell(cr.cells(), "driver_name", cd->driver_name->key());
clustering_row cr(make_clustering_key(cd.port, cd.client_type_str()));
set_cell(cr.cells(), "shard_id", cd.shard_id);
set_cell(cr.cells(), "connection_stage", cd.stage_str());
if (cd.driver_name) {
set_cell(cr.cells(), "driver_name", *cd.driver_name);
}
if (cd->driver_version) {
set_cell(cr.cells(), "driver_version", cd->driver_version->key());
if (cd.driver_version) {
set_cell(cr.cells(), "driver_version", *cd.driver_version);
}
if (cd->hostname) {
set_cell(cr.cells(), "hostname", *cd->hostname);
if (cd.hostname) {
set_cell(cr.cells(), "hostname", *cd.hostname);
}
if (cd->protocol_version) {
set_cell(cr.cells(), "protocol_version", *cd->protocol_version);
if (cd.protocol_version) {
set_cell(cr.cells(), "protocol_version", *cd.protocol_version);
}
if (cd->ssl_cipher_suite) {
set_cell(cr.cells(), "ssl_cipher_suite", *cd->ssl_cipher_suite);
if (cd.ssl_cipher_suite) {
set_cell(cr.cells(), "ssl_cipher_suite", *cd.ssl_cipher_suite);
}
if (cd->ssl_enabled) {
set_cell(cr.cells(), "ssl_enabled", *cd->ssl_enabled);
if (cd.ssl_enabled) {
set_cell(cr.cells(), "ssl_enabled", *cd.ssl_enabled);
}
if (cd->ssl_protocol) {
set_cell(cr.cells(), "ssl_protocol", *cd->ssl_protocol);
if (cd.ssl_protocol) {
set_cell(cr.cells(), "ssl_protocol", *cd.ssl_protocol);
}
set_cell(cr.cells(), "username", cd->username ? *cd->username : sstring("anonymous"));
if (cd->scheduling_group_name) {
set_cell(cr.cells(), "scheduling_group", *cd->scheduling_group_name);
set_cell(cr.cells(), "username", cd.username ? *cd.username : sstring("anonymous"));
if (cd.scheduling_group_name) {
set_cell(cr.cells(), "scheduling_group", *cd.scheduling_group_name);
}
auto map_type = map_type_impl::get_instance(
utf8_type,
utf8_type,
false
);
auto prepare_client_options = [] (const auto& client_options) {
map_type_impl::native_type tmp;
for (auto& co: client_options) {
auto map_element = std::make_pair(data_value(co.key.key()), data_value(co.value.key()));
tmp.push_back(std::move(map_element));
}
return tmp;
};
set_cell(cr.cells(), "client_options",
make_map_value(map_type, prepare_client_options(cd->client_options)));
co_await result.emit_row(std::move(cr));
}
co_await result.emit_partition_end();

View File

@@ -1,17 +1,17 @@
# Alternator: DynamoDB API in ScyllaDB
# Alternator: DynamoDB API in Scylla
## Introduction
Alternator is a ScyllaDB feature adding compatibility with Amazon DynamoDB(TM).
Alternator is a Scylla feature adding compatibility with Amazon DynamoDB(TM).
DynamoDB's API uses JSON-encoded requests and responses which are sent over
an HTTP or HTTPS transport. It is described in detail in Amazon's [DynamoDB
API Reference](https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/).
Our goal is that any application written to use Amazon DynamoDB could
be run, unmodified, against ScyllaDB with Alternator enabled. Alternator's
be run, unmodified, against Scylla with Alternator enabled. Alternator's
compatibility with DynamoDB is fairly complete, but users should be aware
of some differences and some unimplemented features. The extent of
Alternator's compatibility with DynamoDB is described in the
[ScyllaDB Alternator for DynamoDB users](compatibility.md) document,
[Scylla Alternator for DynamoDB users](compatibility.md) document,
which is updated as the work on Alternator progresses and compatibility
continues to improve.
@@ -19,8 +19,8 @@ Alternator also adds several features and APIs that are not available in
DynamoDB. These are described in [Alternator-specific APIs](new-apis.md).
## Running Alternator
By default, ScyllaDB does not listen for DynamoDB API requests. To enable
this API in ScyllaDB you must set at least two configuration options,
By default, Scylla does not listen for DynamoDB API requests. To enable
this API in Scylla you must set at least two configuration options,
**alternator_port** and **alternator_write_isolation**. For example in the
YAML configuration file:
```yaml
@@ -30,7 +30,7 @@ alternator_write_isolation: only_rmw_uses_lwt # or always, forbid or unsafe
or, equivalently, via command-line arguments: `--alternator-port=8000
--alternator-write-isolation=only_rmw_uses_lwt`.
The **alternator_port** option determines on which port ScyllaDB listens for
The **alternator_port** option determines on which port Scylla listens for
DynamoDB API requests. By default, it listens on this port on all network
interfaces. To listen only on a specific interface, configure also the
**alternator_address** option.
@@ -41,12 +41,12 @@ Alternator has four different choices
for the implementation of writes, each with different advantages. You should
carefully consider which of the options makes more sense for your intended
use case and configure alternator_write_isolation accordingly. There is
currently no default for this option: Trying to run ScyllaDB with an Alternator
currently no default for this option: Trying to run Scylla with an Alternator
port selected but without configuring write isolation will result in an error message,
asking you to set it.
In addition to (or instead of) serving HTTP requests on alternator_port,
ScyllaDB can accept DynamoDB API requests over HTTPS (encrypted), on the port
Scylla can accept DynamoDB API requests over HTTPS (encrypted), on the port
specified by **alternator_https_port**. As usual for HTTPS servers, the
operator must specify certificate and key files. By default these should
be placed in `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key`, but
@@ -54,7 +54,7 @@ these default locations can overridden by specifying
`--alternator-encryption-options keyfile="..."` and
`--alternator-encryption-options certificate="..."`.
By default, ScyllaDB saves a snapshot of deleted tables. But Alternator does
By default, Scylla saves a snapshot of deleted tables. But Alternator does
not offer an API to restore these snapshots, so these snapshots are not useful
and waste disk space - deleting a table does not recover any disk space.
It is therefore recommended to disable this automatic-snapshotting feature
@@ -73,11 +73,11 @@ itself. Instructions, code and examples for doing this can be found in the
This section provides only a very brief introduction to Alternator's
design. A much more detailed document about the features of the DynamoDB
API and how they are, or could be, implemented in ScyllaDB can be found in:
API and how they are, or could be, implemented in Scylla can be found in:
<https://docs.google.com/document/d/1i4yjF5OSAazAY_-T8CBce9-2ykW4twx_E_Nt2zDoOVs>
Almost all of Alternator's source code (except some initialization code)
can be found in the alternator/ subdirectory of ScyllaDB's source code.
can be found in the alternator/ subdirectory of Scylla's source code.
Extensive functional tests can be found in the test/alternator
subdirectory. These tests are written in Python, and can be run against
both Alternator and Amazon's DynamoDB; This allows verifying that
@@ -85,15 +85,15 @@ Alternator's behavior matches the one observed on DynamoDB.
See test/alternator/README.md for more information about the tests and
how to run them.
With Alternator enabled on port 8000 (for example), every ScyllaDB node
With Alternator enabled on port 8000 (for example), every Scylla node
listens for DynamoDB API requests on this port. These requests, in
JSON format over HTTP, are parsed and result in calls to internal Scylla
C++ functions - there is no CQL generation or parsing involved.
In ScyllaDB terminology, the node receiving the request acts as the
In Scylla terminology, the node receiving the request acts as the
*coordinator*, and often passes the request on to one or more other nodes -
*replicas* which hold copies of the requested data.
Alternator tables are stored as ScyllaDB tables, each in a separate keyspace.
Alternator tables are stored as Scylla tables, each in a separate keyspace.
Each keyspace is initialized when the corresponding Alternator table is
created (with a CreateTable request). The replication factor (RF) for this
keyspace is chosen at that point, depending on the size of the cluster:
@@ -101,19 +101,19 @@ RF=3 is used on clusters with three or more nodes, and RF=1 is used for
smaller clusters. Such smaller clusters are, of course, only recommended
for tests because of the risk of data loss.
Each table in Alternator is stored as a ScyllaDB table in a separate
Each table in Alternator is stored as a Scylla table in a separate
keyspace. The DynamoDB key columns (hash and sort key) have known types,
and become partition and clustering key columns of the ScyllaDB table.
and become partition and clustering key columns of the Scylla table.
All other attributes may be different for each row, so are stored in one
map column in ScyllaDB, and not as separate columns.
map column in Scylla, and not as separate columns.
DynamoDB supports two consistency levels for reads, "eventual consistency"
and "strong consistency". These two modes are implemented using ScyllaDB's CL
and "strong consistency". These two modes are implemented using Scylla's CL
(consistency level) feature: All writes are done using the `LOCAL_QUORUM`
consistency level, then strongly-consistent reads are done with
`LOCAL_QUORUM`, while eventually-consistent reads are with just `LOCAL_ONE`.
In ScyllaDB (and its inspiration, Cassandra), high write performance is
In Scylla (and its inspiration, Cassandra), high write performance is
achieved by ensuring that writes do not require reads from disk.
The DynamoDB API, however, provides many types of requests that need a read
before the write (a.k.a. RMW requests - read-modify-write). For example,
@@ -121,7 +121,7 @@ a request may copy an existing attribute, increment an attribute,
be conditional on some expression involving existing values of attribute,
or request that the previous values of attributes be returned. These
read-modify-write transactions should be _isolated_ from each other, so
by default Alternator implements every write operation using ScyllaDB's
by default Alternator implements every write operation using Scylla's
LWT (lightweight transactions). This default can be overridden on a per-table
basis, by tagging the table as explained above in the "write isolation
policies" section.

View File

@@ -1,6 +1,6 @@
# ScyllaDB Alternator for DynamoDB users
ScyllaDB supports the DynamoDB API (this feature is codenamed "Alternator").
Scylla supports the DynamoDB API (this feature is codenamed "Alternator").
Our goal is to support any application written for Amazon DynamoDB.
Nevertheless, there are a few differences between DynamoDB and Scylla, and
a few DynamoDB features that have not yet been implemented in Scylla.
@@ -8,16 +8,16 @@ The purpose of this document is to inform users of these differences.
## Provisioning
The most obvious difference between DynamoDB and ScyllaDB is that while
DynamoDB is a shared cloud service, ScyllaDB is a dedicated service running
The most obvious difference between DynamoDB and Scylla is that while
DynamoDB is a shared cloud service, Scylla is a dedicated service running
on your private cluster. Whereas DynamoDB allows you to "provision" the
number of requests per second you'll need - or at an extra cost not even
provision that - ScyllaDB requires you to provision your cluster. You need
provision that - Scylla requires you to provision your cluster. You need
to reason about the number and size of your nodes - not the throughput.
Moreover, DynamoDB's per-table provisioning (`BillingMode=PROVISIONED`) is
not yet supported by Scylla. The BillingMode and ProvisionedThroughput options
on a table need to be valid but are ignored, and ScyllaDB behaves like DynamoDB's
on a table need to be valid but are ignored, and Scylla behaves like DynamoDB's
`BillingMode=PAY_PER_REQUEST`: All requests are accepted without a per-table
throughput cap.
@@ -33,7 +33,7 @@ Instructions for doing this can be found in:
## Write isolation policies
ScyllaDB was designed to optimize the performance of pure write operations -
Scylla was designed to optimize the performance of pure write operations -
writes which do not need to read the previous value of the item.
In CQL, writes which do need the previous value of the item must explicitly
use the slower LWT ("LightWeight Transaction") feature to be correctly
@@ -79,11 +79,11 @@ a _higher_ timestamp - and this will be the "last write" that wins.
To avoid or mitigate this write reordering issue, users may consider
one or more of the following:
1. Use NTP to keep the clocks on the different ScyllaDB nodes synchronized.
1. Use NTP to keep the clocks on the different Scylla nodes synchronized.
If the delay between the two writes is longer than NTP's accuracy,
they will not be reordered.
2. If an application wants to ensure that two specific writes are not
reordered, it should send both requests to the same ScyllaDB node.
reordered, it should send both requests to the same Scylla node.
Care should be taken when using a load balancer - which might redirect
two requests to two different nodes.
3. Consider using the `always_use_lwt` write isolation policy.
@@ -210,7 +210,7 @@ CREATE SERVICE_LEVEL IF NOT EXISTS oltp WITH SHARES = 1000;
ATTACH SERVICE_LEVEL olap TO alice;
ATTACH SERVICE_LEVEL oltp TO bob;
```
Note that `alternator_enforce_authorization` has to be enabled in ScyllaDB configuration.
Note that `alternator_enforce_authorization` has to be enabled in Scylla configuration.
See the [Authorization](#authorization) section to learn more about roles and authorization.
See [Workload Prioritization](../features/workload-prioritization)
@@ -218,11 +218,11 @@ to read about Workload Prioritization in detail.
## Metrics
ScyllaDB has an advanced and extensive monitoring framework for inspecting
and graphing hundreds of different metrics of ScyllaDB's usage and performance.
ScyllaDB's monitoring stack, based on Grafana and Prometheus, is described in
Scylla has an advanced and extensive monitoring framework for inspecting
and graphing hundreds of different metrics of Scylla's usage and performance.
Scylla's monitoring stack, based on Grafana and Prometheus, is described in
<https://docs.scylladb.com/operating-scylla/monitoring/>.
This monitoring stack is different from DynamoDB's offering - but ScyllaDB's
This monitoring stack is different from DynamoDB's offering - but Scylla's
is significantly more powerful and gives the user better insights on
the internals of the database and its performance.
@@ -248,7 +248,7 @@ data in different partition order. Applications mustn't rely on that
undocumented order.
Note that inside each partition, the individual items will be sorted the same
in DynamoDB and ScyllaDB - determined by the _sort key_ defined for that table.
in DynamoDB and Scylla - determined by the _sort key_ defined for that table.
---
@@ -274,7 +274,7 @@ is different, or can be configured in Alternator:
## Experimental API features
Some DynamoDB API features are supported by Alternator, but considered
**experimental** in this release. An experimental feature in ScyllaDB is a
**experimental** in this release. An experimental feature in Scylla is a
feature whose functionality is complete, or mostly complete, but it is not
as thoroughly tested or optimized as regular features. Also, an experimental
feature's implementation is still subject to change and upgrades may not be
@@ -351,8 +351,8 @@ they should be easy to detect. Here is a list of these unimplemented features:
* The on-demand backup APIs are not supported: CreateBackup, DescribeBackup,
DeleteBackup, ListBackups, RestoreTableFromBackup.
For now, users can use ScyllaDB's existing backup solutions such as snapshots
or ScyllaDB Manager.
For now, users can use Scylla's existing backup solutions such as snapshots
or Scylla Manager.
<https://github.com/scylladb/scylla/issues/5063>
* Continuous backup (the ability to restore any point in time) is also not
@@ -370,7 +370,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
<https://github.com/scylladb/scylla/issues/5068>
* DAX (DynamoDB Accelerator), an in-memory cache for DynamoDB, is not
available for Alternator. Anyway, it should not be necessary - ScyllaDB's
available for Alternator. Anyway, it should not be necessary - Scylla's
internal cache is already rather advanced and there is no need to place
another cache in front of it. We wrote more about this here:
<https://www.scylladb.com/2017/07/31/database-caches-not-good/>
@@ -384,7 +384,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
* The PartiQL syntax (SQL-like SELECT/UPDATE/INSERT/DELETE expressions)
and the operations ExecuteStatement, BatchExecuteStatement and
ExecuteTransaction are not yet supported.
A user that is interested in an SQL-like syntax can consider using ScyllaDB's
A user that is interested in an SQL-like syntax can consider using Scylla's
CQL protocol instead.
This feature was added to DynamoDB in November 2020.
<https://github.com/scylladb/scylla/issues/8787>
@@ -393,7 +393,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
which is different from AWS's. In particular, the operations
DescribeContributorInsights, ListContributorInsights and
UpdateContributorInsights that configure Amazon's "CloudWatch Contributor
Insights" are not yet supported. ScyllaDB has different ways to retrieve the
Insights" are not yet supported. Scylla has different ways to retrieve the
same information, such as which items were accessed most often.
<https://github.com/scylladb/scylla/issues/8788>

View File

@@ -11,7 +11,7 @@ This section will guide you through the steps for setting up the cluster:
<https://hub.docker.com/r/scylladb/scylla/>, but add to every `docker run`
command a `-p 8000:8000` before the image name and
`--alternator-port=8000 --alternator-write-isolation=always` at the end.
The "alternator-port" option specifies on which port ScyllaDB will listen for
The "alternator-port" option specifies on which port Scylla will listen for
the (unencrypted) DynamoDB API, and the "alternator-write-isolation" chooses
whether or not Alternator will use LWT for every write.
For example,
@@ -24,10 +24,10 @@ This section will guide you through the steps for setting up the cluster:
By default, ScyllaDB run in this way will not have authentication or
authorization enabled, and any DynamoDB API request will be honored without
requiring them to be signed appropriately. See the
[ScyllaDB Alternator for DynamoDB users](compatibility.md#authentication-and-authorization)
[Scylla Alternator for DynamoDB users](compatibility.md#authentication-and-authorization)
document on how to configure authentication and authorization.
## Testing ScyllaDB's DynamoDB API support:
## Testing Scylla's DynamoDB API support:
### Running AWS Tic Tac Toe demo app to test the cluster:
1. Follow the instructions on the [AWS github page](https://github.com/awsdocs/amazon-dynamodb-developer-guide/blob/master/doc_source/TicTacToe.Phase1.md)
2. Enjoy your tic-tac-toe game :-)

View File

@@ -2,9 +2,9 @@
Alternator's primary goal is to be compatible with Amazon DynamoDB(TM)
and its APIs, so that any application written to use Amazon DynamoDB could
be run, unmodified, against ScyllaDB with Alternator enabled. The extent of
be run, unmodified, against Scylla with Alternator enabled. The extent of
Alternator's compatibility with DynamoDB is described in the
[ScyllaDB Alternator for DynamoDB users](compatibility.md) document.
[Scylla Alternator for DynamoDB users](compatibility.md) document.
But Alternator also adds several features and APIs that are not available in
DynamoDB. These Alternator-specific APIs are documented here.
@@ -15,7 +15,7 @@ _conditional_ update or an update based on the old value of an attribute.
The read and the write should be treated as a single transaction - protected
(_isolated_) from other parallel writes to the same item.
Alternator could do this isolation by using ScyllaDB's LWT (lightweight
Alternator could do this isolation by using Scylla's LWT (lightweight
transactions) for every write operation, but this significantly slows
down writes, and is not necessary for workloads which don't use read-modify-write
(RMW) updates.
@@ -41,7 +41,7 @@ isolation policy for a specific table can be overridden by tagging the table
which need a read before the write. An attempt to use such statements
(e.g., UpdateItem with a ConditionExpression) will result in an error.
In this mode, the remaining write requests which are allowed - pure writes
without a read - are performed using standard ScyllaDB writes, not LWT,
without a read - are performed using standard Scylla writes, not LWT,
so they are significantly faster than they would have been in the
`always_use_lwt`, but their isolation is still correct.
@@ -65,19 +65,19 @@ isolation policy for a specific table can be overridden by tagging the table
read-modify-write updates. This mode is not recommended for any use case,
and will likely be removed in the future.
## Accessing system tables from ScyllaDB
ScyllaDB exposes lots of useful information via its internal system tables,
## Accessing system tables from Scylla
Scylla exposes lots of useful information via its internal system tables,
which can be found in system keyspaces: 'system', 'system\_auth', etc.
In order to access these tables via the Alternator interface,
Scan and Query requests can use a special table name:
`.scylla.alternator.KEYSPACE_NAME.TABLE_NAME`
which will return results fetched from corresponding ScyllaDB table.
which will return results fetched from corresponding Scylla table.
This interface can be used only to fetch data from system tables.
Attempts to read regular tables via the virtual interface will result
in an error.
Example: in order to query the contents of ScyllaDB's `system.large_rows`,
Example: in order to query the contents of Scylla's `system.large_rows`,
pass `TableName='.scylla.alternator.system.large_rows'` to a Query/Scan
request.
@@ -113,14 +113,14 @@ connection (either active or idle), not necessarily an active request as
in Alternator.
## Service discovery
As explained in [ScyllaDB Alternator for DynamoDB users](compatibility.md),
As explained in [Scylla Alternator for DynamoDB users](compatibility.md),
Alternator requires a load-balancer or a client-side load-balancing library
to distribute requests between all ScyllaDB nodes. This load-balancer needs
to be able to _discover_ the ScyllaDB nodes. Alternator provides two special
to distribute requests between all Scylla nodes. This load-balancer needs
to be able to _discover_ the Scylla nodes. Alternator provides two special
requests, `/` and `/localnodes`, to help with this service discovery, which
we will now explain.
Some setups know exactly which ScyllaDB nodes were brought up, so all that
Some setups know exactly which Scylla nodes were brought up, so all that
remains is to periodically verify that each node is still functional. The
easiest way to do this is to make an HTTP (or HTTPS) GET request to the node,
with URL `/`. This is a trivial GET request and does **not** need to be
@@ -133,10 +133,10 @@ $ curl http://localhost:8000/
healthy: localhost:8000
```
In other setups, the load balancer might not know which ScyllaDB nodes exist.
For example, it may be possible to add or remove ScyllaDB nodes without a
In other setups, the load balancer might not know which Scylla nodes exist.
For example, it may be possible to add or remove Scylla nodes without a
client-side load balancer knowing. For these setups we have the `/localnodes`
request that can be used to discover which ScyllaDB nodes exist: A load balancer
request that can be used to discover which Scylla nodes exist: A load balancer
that already knows at least one live node can discover the rest by sending
a `/localnodes` request to the known node. It's again an unauthenticated
HTTP (or HTTPS) GET request:
@@ -160,7 +160,7 @@ list the nodes in a specific _data center_ or _rack_. These options are
useful for certain use cases:
* A `dc` option (e.g., `/localnodes?dc=dc1`) can be passed to list the
nodes in a specific ScyllaDB data center, not the data center of the node
nodes in a specific Scylla data center, not the data center of the node
being contacted. This is useful when a client knows of _some_ Scylla
node belonging to an unknown DC, but wants to list the nodes in _its_
DC, which it knows by name.
@@ -191,7 +191,7 @@ tells them to.
If you want to influence whether a specific Alternator table is created with tablets or vnodes,
you can do this by specifying the `system:initial_tablets` tag
(in earlier versions of ScyllaDB the tag was `experimental:initial_tablets`)
(in earlier versions of Scylla the tag was `experimental:initial_tablets`)
in the CreateTable operation. The value of this tag can be:
* Any valid integer as the value of this tag enables tablets.

View File

@@ -365,7 +365,7 @@ Modifying a keyspace with tablets enabled is possible and doesn't require any sp
- The replication factor (RF) can be increased or decreased by at most 1 at a time. To reach the desired RF value, modify the RF repeatedly.
- The ``ALTER`` statement rejects the ``replication_factor`` tag. List the DCs explicitly when altering a keyspace. See :ref:`NetworkTopologyStrategy <replication-strategy>`.
- An RF change cannot be requested while another RF change is pending for the same keyspace. Attempting to execute an ``ALTER`` statement in this scenario will fail with an explicit error. Wait for the ongoing RF change to complete before issuing another ``ALTER`` statement.
- If there's any other ongoing global topology operation, executing the ``ALTER`` statement will fail (with an explicit and specific error) and needs to be repeated.
- The ``ALTER`` statement may take longer than the regular query timeout, and even if it times out, it will continue to execute in the background.
- The replication strategy cannot be modified, as keyspaces with tablets only support ``NetworkTopologyStrategy``.
- The ``ALTER`` statement will fail if it would make the keyspace :term:`RF-rack-invalid <RF-rack-valid keyspace>`.
@@ -1043,8 +1043,6 @@ The following modes are available:
* - ``immediate``
- Tombstone GC is immediately performed. There is no wait time or repair requirement. This mode is useful for a table that uses the TWCS compaction strategy with no user deletes. After data is expired after TTL, ScyllaDB can perform compaction to drop the expired data immediately.
.. warning:: The ``repair`` mode is not supported for :term:`Colocated Tables <Colocated Table>` in this version.
.. _cql-per-table-tablet-options:
Per-table tablet options

View File

@@ -102,7 +102,6 @@ Additional Information
To learn more about TTL, and see a hands-on example, check out `this lesson <https://university.scylladb.com/courses/data-modeling/lessons/advanced-data-modeling/topic/expiring-data-with-ttl-time-to-live/>`_ on ScyllaDB University.
* `Video: Managing data expiration with Time-To-Live <https://www.youtube.com/watch?v=SXkbu7mFHeA>`_
* :doc:`Apache Cassandra Query Language (CQL) Reference </cql/index>`
* :doc:`KB Article:How to Change gc_grace_seconds for a Table </kb/gc-grace-seconds/>`
* :doc:`KB Article:Time to Live (TTL) and Compaction </kb/ttl-facts/>`

View File

@@ -2,11 +2,8 @@
## What is ScyllaDB?
ScyllaDB is a high-performance NoSQL database optimized for speed and scalability.
It is designed to efficiently handle large volumes of data with minimal latency,
making it ideal for data-intensive applications.
ScyllaDB is distributed under the [ScyllaDB Source Available License](https://github.com/scylladb/scylladb/blob/master/LICENSE-ScyllaDB-Source-Available.md).
ScyllaDB is a high-performance NoSQL database system, fully compatible with Apache Cassandra.
ScyllaDB is released under the GNU Affero General Public License version 3 and the Apache License. ScyllaDB is free and open-source software.
> [ScyllaDB](http://www.scylladb.com/)

View File

@@ -74,8 +74,6 @@ The keys and values are:
as an indicator to which shard client wants to connect. The desired shard number
is calculated as: `desired_shard_no = client_port % SCYLLA_NR_SHARDS`.
Its value is a decimal representation of type `uint16_t`, by default `19142`.
- `CLIENT_OPTIONS` is a string containing a JSON object representation that
contains CQL Driver configuration, e.g. load balancing policy, retry policy, timeouts, etc.
Currently, one `SCYLLA_SHARDING_ALGORITHM` is defined,
`biased-token-round-robin`. To apply the algorithm,
@@ -238,26 +236,3 @@ the same mechanism for other protocol versions, such as CQLv4.
The feature is identified by the `SCYLLA_USE_METADATA_ID` key, which is meant to be sent
in the SUPPORTED message.
## Sending the CLIENT_ROUTES_CHANGE event
This extension allows a driver to update its connections when the
`system.client_routes` table is modified.
In some network topologies a specific mapping of addresses and ports is required (e.g.
to support Private Link). This mapping can change dynamically even when no nodes are
added or removed. The driver must adapt to those changes; otherwise connectivity can be
lost.
The extension is implemented as a new `EVENT` type: `CLIENT_ROUTES_CHANGE`. The event
body consists of:
- [string] change
- [string list] connection_ids
- [string list] host_ids
There is only one change value: `UPDATE_NODES`, which means at least one client route
was inserted, updated, or deleted.
Events already have a subscription mechanism similar to protocol extensions (that is,
the driver only receives the events it explicitly subscribed to), so no additional
`cql_protocol_extension` key is introduced for this feature.

View File

@@ -86,7 +86,6 @@ stateDiagram-v2
de_left_token_ring --> [*]
}
state removing {
re_left_token_ring : left_token_ring
re_tablet_draining : tablet_draining
re_tablet_migration : tablet_migration
re_write_both_read_old : write_both_read_old
@@ -99,8 +98,7 @@ stateDiagram-v2
re_tablet_draining --> re_write_both_read_old
re_write_both_read_old --> re_write_both_read_new: streaming completed
re_write_both_read_old --> re_rollback_to_normal: rollback
re_write_both_read_new --> re_left_token_ring
re_left_token_ring --> [*]
re_write_both_read_new --> [*]
}
rebuilding --> normal: streaming completed
decommissioning --> left: operation succeeded
@@ -124,10 +122,9 @@ Note that these are not all states, as there are other states specific to tablet
Writes to vnodes-based tables are going to both new and old replicas (new replicas means calculated according
to modified token ring), reads are using old replicas.
- `write_both_read_new` - as above, but reads are using new replicas.
- `left_token_ring` - the decommissioning or removing node left the token ring, but we still need to wait until other
nodes observe it and stop sending writes to this node. For decommission, we tell the node to shut down,
then remove it from group 0. For removenode, the node is already down, so we skip the shutdown step.
We also use this state to rollback a failed bootstrap or decommission.
- `left_token_ring` - the decommissioning node left the token ring, but we still need to wait until other
nodes observe it and stop sending writes to this node. Then, we tell the node to shut down and remove
it from group 0. We also use this state to rollback a failed bootstrap or decommission.
- `rollback_to_normal` - the decommission or removenode operation failed. Rollback the operation by
moving the node we tried to decommission/remove back to the normal state.
- `lock` - the topology stays in this state until externally changed (to null state), preventing topology
@@ -144,9 +141,7 @@ reads that started before this point exist in the system. Finally we remove the
transitioning state.
Decommission, removenode and replace work similarly, except they don't go through
`commit_cdc_generation`. Both decommission and removenode go through the
`left_token_ring` state to run a global barrier ensuring all nodes are aware
of the topology change before the operation completes.
`commit_cdc_generation`.
The state machine may also go only through the `commit_cdc_generation` state
after getting a request from the user to create a new CDC generation if the

View File

@@ -41,12 +41,12 @@ Unless the task was aborted, the worker will eventually reply that the task was
it temporarily saves a list of ids of finished tasks and removes those tasks from group0 state (permanently marking them as finished) in 200ms intervals. (*)
This batching of removing finished tasks is done in order to reduce number of generated group0 operations.
On the other hand, view building tasks can also be aborted due to 2 main reasons:
On the other hand, view building tasks can also be aborted due to 2 main reasons:
- a keyspace/view was dropped
- tablet operations (see [tablet operations section](#tablet-operations))
In the first case we simply delete relevant view building tasks as they are no longer needed.
But if a task needs to be aborted due to tablet operation, we're firstly setting the `aborted` flag to true. We need to do this because we need the task information
to create new adjusted tasks (if the operation succeeded) or rollback them (if the operation failed).
But if a task needs to be aborted due to tablet operation, we're firstly setting the `aborted` flag to true. We need to do this because we need the task informations
to created a new adjusted tasks (if the operation succeeded) or rollback them (if the operation failed).
Once a task is aborted by setting the flag, this cannot be revoked, so rolling back a task means creating its duplicate and removing the original task.
(*) - Because there is a time gap between when the coordinator learns that a task is finished (from the RPC response) and when the task is marked as completed,

View File

@@ -17,7 +17,6 @@ This document highlights ScyllaDB's key data modeling features.
Workload Prioritization </features/workload-prioritization>
Backup and Restore </features/backup-and-restore>
Incremental Repair </features/incremental-repair/>
Vector Search </features/vector-search/>
.. panel-box::
:title: ScyllaDB Features
@@ -44,5 +43,3 @@ This document highlights ScyllaDB's key data modeling features.
* :doc:`Incremental Repair </features/incremental-repair/>` provides a much more
efficient and lightweight approach to maintaining data consistency by
repairing only the data that has changed since the last repair.
* :doc:`Vector Search in ScyllaDB </features/vector-search/>` enables
similarity-based queries on vector embeddings.

View File

@@ -1,55 +0,0 @@
=================================
Vector Search in ScyllaDB
=================================
.. note::
This feature is currently available only in `ScyllaDB Cloud <https://cloud.docs.scylladb.com/>`_.
What Is Vector Search
-------------------------
Vector Search enables similarity-based queries over high-dimensional data,
such as text, images, audio, or user behavior. Instead of searching for exact
matches, it allows applications to find items that are semantically similar to
a given input.
To do this, Vector Search works on vector embeddings, which are numerical
representations of data that capture semantic meaning. This enables queries
such as:
* “Find documents similar to this paragraph”
* “Find products similar to what the user just viewed”
* “Find previous tickets related to this support request”
Rather than relying on exact values or keywords, Vector Search returns results
based on distance or similarity between vectors. This capability is
increasingly used in modern workloads such as AI-powered search, recommendation
systems, and retrieval-augmented generation (RAG).
Why Vector Search Matters
------------------------------------
Many applications already rely on ScyllaDB for high throughput, low and
predictable latency, and large-scale data storage.
Vector Search complements these strengths by enabling new classes of workloads,
including:
* Semantic search over text or documents
* Recommendations based on user or item similarity
* AI and ML applications, including RAG pipelines
* Anomaly and pattern detection
With Vector Search, ScyllaDB can serve as the similarity search backend for
AI-driven applications.
Availability
--------------
Vector Search is currently available only in ScyllaDB Cloud, the fully managed
ScyllaDB service.
👉 For details on using Vector Search, refer to the
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/index.html>`_.

View File

@@ -20,10 +20,7 @@ You can run your ScyllaDB workloads on AWS, GCE, and Azure using a ScyllaDB imag
Amazon Web Services (AWS)
-----------------------------
The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`,
:ref:`i4i <system-requirements-i4i-instances>`, :ref:`i7i <system-requirements-i7i-instances>`,
:ref:`i7ie <system-requirements-i7ie-instances>`, :ref:`i8g<system-requirements-i8g-instances>`,
and :ref:`i8ge <system-requirements-i8ge-instances>`.
The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`, :ref:`i4i <system-requirements-i4i-instances>`, :ref:`i7i <system-requirements-i7i-instances>`, and :ref:`i7ie <system-requirements-i7ie-instances>`.
.. note::
@@ -198,118 +195,6 @@ All i7i instances have the following specs:
See `Amazon EC2 I7i Instances <https://aws.amazon.com/ec2/instance-types/i7i/>`_ for details.
.. _system-requirements-i8g-instances:
i8g instances
^^^^^^^^^^^^^^
The following i8g instances are supported.
.. list-table::
:widths: 30 20 20 30
:header-rows: 1
* - Model
- vCPU
- Mem (GiB)
- Storage (GB)
* - i8g.large
- 2
- 16
- 1 x 468 GB
* - i8g.xlarge
- 4
- 32
- 1 x 937 GB
* - i8g.2xlarge
- 8
- 64
- 1 x 1,875 GB
* - i8g.4xlarge
- 16
- 128
- 1 x 3,750 GB
* - i8g.8xlarge
- 32
- 256
- 2 x 3,750 GB
* - i8g.12xlarge
- 48
- 384
- 3 x 3,750 GB
* - i8g.16xlarge
- 64
- 512
- 4 x 3,750 GB
All i8g instances have the following specs:
* Powered by AWS Graviton4 processors
* 3rd generation AWS Nitro SSD storage
* DDR5-5600 memory for improved throughput
* Up to 100 Gbps of networking bandwidth and up to 60 Gbps of bandwidth to
Amazon Elastic Block Store (EBS)
* Instance sizes offer up to 45 TB of total local NVMe instance storage
See `Amazon EC2 I8g Instances <https://aws.amazon.com/ec2/instance-types/i8g/>`_ for details.
.. _system-requirements-i8ge-instances:
i8ge instances
^^^^^^^^^^^^^^
The following i8ge instances are supported.
.. list-table::
:widths: 30 20 20 30
:header-rows: 1
* - Model
- vCPU
- Mem (GiB)
- Storage (GB)
* - i8ge.large
- 2
- 16
- 1 x 1,250 GB
* - i8ge.xlarge
- 4
- 32
- 1 x 2,500 GB
* - i8ge.2xlarge
- 8
- 64
- 2 x 2,500 GB
* - i8ge.3xlarge
- 12
- 96
- 1 x 7,500 GB
* - i8ge.6xlarge
- 24
- 192
- 2 x 7,500 GB
* - i8ge.12xlarge
- 48
- 384
- 4 x 7,500 GB
* - i8ge.18xlarge
- 72
- 576
- 6 x 7,500 GB
All i8ge instances have the following specs:
* Powered by AWS Graviton4 processors
* 3rd generation AWS Nitro SSD storage
* DDR5-5600 memory for improved throughput
* Up to 300 Gbps of networking bandwidth and up to 60 Gbps of bandwidth to
Amazon Elastic Block Store (EBS)
* Instance sizes offer up to 120 TB of total local NVMe instance storage
See `Amazon EC2 I8g Instances <https://aws.amazon.com/ec2/instance-types/i8g/>`_ for details.
Im4gn and Is4gen instances
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ScyllaDB supports Arm-based Im4gn and Is4gen instances. See `Amazon EC2 Im4gn and Is4gen instances <https://aws.amazon.com/ec2/instance-types/i4g/>`_ for specification details.

View File

@@ -25,7 +25,8 @@ Getting Started
:id: "getting-started"
:class: my-panel
* :doc:`Install ScyllaDB </getting-started/install-scylla/index/>`
* `Install ScyllaDB (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/#core>`_ - Links to the ScyllaDB Download Center
* :doc:`Configure ScyllaDB </getting-started/system-configuration/>`
* :doc:`Run ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`
* :doc:`Create a ScyllaDB Cluster - Single Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster/>`

View File

@@ -3,7 +3,8 @@
ScyllaDB Housekeeping and how to disable it
============================================
It is always recommended to run the latest stable version of ScyllaDB.
It is always recommended to run the latest version of ScyllaDB.
The latest stable release version is always available from the `Download Center <https://www.scylladb.com/download/>`_.
When you install ScyllaDB, it installs by default two services: **scylla-housekeeping-restart** and **scylla-housekeeping-daily**. These services check for the latest ScyllaDB version and prompt the user if they are using a version that is older than what is publicly available.
Information about your ScyllaDB deployment, including the ScyllaDB version currently used, as well as unique user and server identifiers, are collected by a centralized service.

View File

@@ -9,8 +9,6 @@ Running ``cluster repair`` on a **single node** synchronizes all data on all nod
To synchronize all data in clusters that have both tablets-based and vnodes-based keyspaces, run :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair/>` on **all**
of the nodes in the cluster, and :doc:`nodetool cluster repair </operating-scylla/nodetool-commands/cluster/repair/>` on **any** of the nodes in the cluster.
.. warning:: :term:`Colocated Tables <Colocated Table>` cannot be synchronized using cluster repair in this version.
To check if a keyspace enables tablets, use:
.. code-block:: cql

95
docs/poetry.lock generated
View File

@@ -2,35 +2,36 @@
[[package]]
name = "alabaster"
version = "1.0.0"
version = "0.7.16"
description = "A light, configurable Sphinx theme"
optional = false
python-versions = ">=3.10"
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b"},
{file = "alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e"},
{file = "alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92"},
{file = "alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65"},
]
[[package]]
name = "anyio"
version = "4.12.0"
version = "4.11.0"
description = "High-level concurrency and networking framework on top of asyncio or Trio"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb"},
{file = "anyio-4.12.0.tar.gz", hash = "sha256:73c693b567b0c55130c104d0b43a9baf3aa6a31fc6110116509f27bf75e21ec0"},
{file = "anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc"},
{file = "anyio-4.11.0.tar.gz", hash = "sha256:82a8d0b81e318cc5ce71a5f1f8b5c4e63619620b63141ef8c995fa0db95a57c4"},
]
[package.dependencies]
exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
idna = ">=2.8"
sniffio = ">=1.1"
typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""}
[package.extras]
trio = ["trio (>=0.31.0) ; python_version < \"3.10\"", "trio (>=0.32.0) ; python_version >= \"3.10\""]
trio = ["trio (>=0.31.0)"]
[[package]]
name = "babel"
@@ -49,14 +50,14 @@ dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)"
[[package]]
name = "beartype"
version = "0.22.8"
version = "0.22.6"
description = "Unbearably fast near-real-time pure-Python runtime-static type-checker."
optional = false
python-versions = ">=3.10"
groups = ["main"]
files = [
{file = "beartype-0.22.8-py3-none-any.whl", hash = "sha256:b832882d04e41a4097bab9f63e6992bc6de58c414ee84cba9b45b67314f5ab2e"},
{file = "beartype-0.22.8.tar.gz", hash = "sha256:b19b21c9359722ee3f7cc433f063b3e13997b27ae8226551ea5062e621f61165"},
{file = "beartype-0.22.6-py3-none-any.whl", hash = "sha256:0584bc46a2ea2a871509679278cda992eadde676c01356ab0ac77421f3c9a093"},
{file = "beartype-0.22.6.tar.gz", hash = "sha256:97fbda69c20b48c5780ac2ca60ce3c1bb9af29b3a1a0216898ffabdd523e48f4"},
]
[package.extras]
@@ -69,18 +70,18 @@ test-tox-coverage = ["coverage (>=5.5)"]
[[package]]
name = "beautifulsoup4"
version = "4.14.3"
version = "4.14.2"
description = "Screen-scraping library"
optional = false
python-versions = ">=3.7.0"
groups = ["main"]
files = [
{file = "beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb"},
{file = "beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86"},
{file = "beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515"},
{file = "beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e"},
]
[package.dependencies]
soupsieve = ">=1.6.1"
soupsieve = ">1.2"
typing-extensions = ">=4.0.0"
[package.extras]
@@ -801,6 +802,18 @@ files = [
{file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"},
]
[[package]]
name = "sniffio"
version = "1.3.1"
description = "Sniff out which async library your code is running under"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
{file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
]
[[package]]
name = "snowballstemmer"
version = "3.0.1"
@@ -827,18 +840,18 @@ files = [
[[package]]
name = "sphinx"
version = "8.1.3"
version = "7.4.7"
description = "Python documentation generator"
optional = false
python-versions = ">=3.10"
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2"},
{file = "sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927"},
{file = "sphinx-7.4.7-py3-none-any.whl", hash = "sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239"},
{file = "sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe"},
]
[package.dependencies]
alabaster = ">=0.7.14"
alabaster = ">=0.7.14,<0.8.0"
babel = ">=2.13"
colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\""}
docutils = ">=0.20,<0.22"
@@ -848,17 +861,17 @@ packaging = ">=23.0"
Pygments = ">=2.17"
requests = ">=2.30.0"
snowballstemmer = ">=2.2"
sphinxcontrib-applehelp = ">=1.0.7"
sphinxcontrib-devhelp = ">=1.0.6"
sphinxcontrib-htmlhelp = ">=2.0.6"
sphinxcontrib-jsmath = ">=1.0.1"
sphinxcontrib-qthelp = ">=1.0.6"
sphinxcontrib-applehelp = "*"
sphinxcontrib-devhelp = "*"
sphinxcontrib-htmlhelp = ">=2.0.0"
sphinxcontrib-jsmath = "*"
sphinxcontrib-qthelp = "*"
sphinxcontrib-serializinghtml = ">=1.1.9"
tomli = {version = ">=2", markers = "python_version < \"3.11\""}
[package.extras]
docs = ["sphinxcontrib-websupport"]
lint = ["flake8 (>=6.0)", "mypy (==1.11.1)", "pyright (==1.1.384)", "pytest (>=6.0)", "ruff (==0.6.9)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-Pillow (==10.2.0.20240822)", "types-Pygments (==2.18.0.20240506)", "types-colorama (==0.4.15.20240311)", "types-defusedxml (==0.7.0.20240218)", "types-docutils (==0.21.0.20241005)", "types-requests (==2.32.0.20240914)", "types-urllib3 (==1.26.25.14)"]
lint = ["flake8 (>=6.0)", "importlib-metadata (>=6.0)", "mypy (==1.10.1)", "pytest (>=6.0)", "ruff (==0.5.2)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-docutils (==0.21.0.20240711)", "types-requests (>=2.30.0)"]
test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=8.0)", "setuptools (>=70.0)", "typing_extensions (>=4.9)"]
[[package]]
@@ -988,14 +1001,13 @@ test = ["tox"]
[[package]]
name = "sphinx-scylladb-markdown"
version = "0.1.4"
version = "0.1.3"
description = "Sphinx extension for ScyllaDB documentation with enhanced Markdown support through MystParser and recommonmark."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "sphinx_scylladb_markdown-0.1.4-py3-none-any.whl", hash = "sha256:598753e01cf159d4698eb1a707958828446e21749038d3d42c5b9c7e86eda6e4"},
{file = "sphinx_scylladb_markdown-0.1.4.tar.gz", hash = "sha256:9db3ae0dcf7c3519262da65e48c7f9e4db0ad1ce9c5f874864ea218f4cbc4c68"},
{file = "sphinx_scylladb_markdown-0.1.3-py3-none-any.whl", hash = "sha256:f20160b4aadf4c8cf95637f0a544121954b792914ab6ec05b67cae75e20a5566"},
]
[package.dependencies]
@@ -1047,25 +1059,24 @@ dev = ["build", "flake8", "pre-commit", "pytest", "sphinx", "sphinx-last-updated
[[package]]
name = "sphinx-substitution-extensions"
version = "2025.11.17"
version = "2025.1.2"
description = "Extensions for Sphinx which allow for substitutions."
optional = false
python-versions = ">=3.10"
groups = ["main"]
files = [
{file = "sphinx_substitution_extensions-2025.11.17-py2.py3-none-any.whl", hash = "sha256:ac18455bdc8324b337b0fe7498c1c0d0b1cb65c74d131459be4dea9edb6abbef"},
{file = "sphinx_substitution_extensions-2025.11.17.tar.gz", hash = "sha256:aae17f8db9efc3d454a304373ae3df763f8739e05e0b98d5381db46f6d250b27"},
{file = "sphinx_substitution_extensions-2025.1.2-py2.py3-none-any.whl", hash = "sha256:ff14f40e4393bd7434a196badb8d47983355d9755af884b902e3023fb456b958"},
{file = "sphinx_substitution_extensions-2025.1.2.tar.gz", hash = "sha256:53b8d394d5098a09aef36bc687fa310aeb28466319d2c750e996e46400fb2474"},
]
[package.dependencies]
beartype = ">=0.18.5"
docutils = ">=0.19"
myst-parser = ">=4.0.0"
sphinx = ">=8.1.0"
sphinx = ">=7.3.5"
[package.extras]
dev = ["actionlint-py (==1.7.8.24)", "check-manifest (==0.51)", "deptry (==0.24.0)", "doc8 (==2.0.0)", "doccmd (==2025.11.8.1)", "docformatter (==1.7.7)", "interrogate (==1.7.0)", "mypy-strict-kwargs (==2025.4.3)", "mypy[faster-cache] (==1.18.2)", "pre-commit (==4.4.0)", "pylint[spelling] (==4.0.3)", "pyproject-fmt (==2.11.1)", "pyright (==1.1.407)", "pyroma (==5.0)", "pytest (==9.0.1)", "pytest-cov (==7.0.0)", "ruff (==0.14.5)", "shellcheck-py (==0.11.0.1)", "shfmt-py (==3.12.0.2)", "sphinx-lint (==1.0.1)", "sphinx-toolbox (==4.0.0)", "types-docutils (==0.22.2.20251006)", "vulture (==2.14)", "yamlfix (==1.19.0)"]
release = ["check-wheel-contents (==0.6.3)"]
dev = ["actionlint-py (==1.7.5.21)", "check-manifest (==0.50)", "deptry (==0.21.2)", "doc8 (==1.1.2)", "doccmd (==2024.12.26)", "docformatter (==1.7.5)", "interrogate (==1.7.0)", "mypy-strict-kwargs (==2024.12.25)", "mypy[faster-cache] (==1.14.1)", "myst-parser (==4.0.0)", "pre-commit (==4.0.1)", "pyenchant (==3.3.0rc1)", "pylint (==3.3.3)", "pyproject-fmt (==2.5.0)", "pyright (==1.1.391)", "pyroma (==4.2)", "pytest (==8.3.4)", "pytest-cov (==6.0.0)", "ruff (==0.8.4)", "shellcheck-py (==0.10.0.1)", "shfmt-py (==3.7.0.1)", "sphinx-toolbox (==3.8.1)", "sphinx[test] (==8.1.3)", "types-docutils (==0.21.0.20241128)", "vulture (==2.14)", "yamlfix (==1.17.0)"]
release = ["check-wheel-contents (==0.6.1)"]
[[package]]
name = "sphinx-tabs"
@@ -1352,21 +1363,21 @@ files = [
[[package]]
name = "urllib3"
version = "2.6.2"
version = "2.5.0"
description = "HTTP library with thread-safe connection pooling, file post, and more."
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"},
{file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"},
{file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"},
{file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"},
]
[package.extras]
brotli = ["brotli (>=1.2.0) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=1.2.0.0) ; platform_python_implementation != \"CPython\""]
brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""]
h2 = ["h2 (>=4,<5)"]
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
zstd = ["backports-zstd (>=1.0.0) ; python_version < \"3.14\""]
zstd = ["zstandard (>=0.18.0)"]
[[package]]
name = "uvicorn"
@@ -1592,4 +1603,4 @@ files = [
[metadata]
lock-version = "2.1"
python-versions = "^3.10"
content-hash = "9a17caa38b3c88f3fe3d1a60fdb73a96aa12ff1e30ecb00e2f9249e7ba9f859c"
content-hash = "0ae673106f45d3465cbdabbf511e165ca44feadd34d7753f2e68093afaa95c79"

View File

@@ -12,10 +12,10 @@ redirects_cli ="^0.1.3"
sphinx-scylladb-theme = "^1.8.10"
sphinx-sitemap = "^2.6.0"
sphinx-autobuild = "^2024.4.19"
Sphinx = "^8.0.0"
Sphinx = "^7.3.7"
sphinx-multiversion-scylla = "^0.3.4"
sphinxcontrib-datatemplates = "^0.9.2"
sphinx-scylladb-markdown = "^0.1.4"
sphinx-scylladb-markdown = "^0.1.2"
sphinx_collapse ="^0.1.3"
[build-system]

View File

@@ -202,7 +202,3 @@ Glossary
The name comes from two basic operations, multiply (MU) and rotate (R), used in its inner loop.
The MurmurHash3 version used in ScyllaDB originated from `Apache Cassandra <https://commons.apache.org/proper/commons-codec/apidocs/org/apache/commons/codec/digest/MurmurHash3.html>`_, and is **not** identical to the `official MurmurHash3 calculation <https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/MurmurHash.java#L31-L33>`_. More `here <https://github.com/russss/murmur3-cassandra>`_.
Colocated Table
An internal table of a special type in a :doc:`tablets </architecture/tablets>` enabled keyspace that is colocated with another base table, meaning it always has the same tablet replicas as the base table.
Current types of colocated tables include CDC log tables, local indexes, and materialized views that have the same partition key as their base table.

View File

@@ -816,6 +816,7 @@ public:
future<data_sink> wrap_sink(const sstables::sstable& sst, sstables::component_type type, data_sink sink) override {
switch (type) {
case sstables::component_type::Scylla:
case sstables::component_type::TemporaryScylla:
case sstables::component_type::TemporaryTOC:
case sstables::component_type::TOC:
co_return sink;
@@ -844,6 +845,7 @@ public:
sstables::component_type type,
data_source src) override {
switch (type) {
case sstables::component_type::TemporaryScylla:
case sstables::component_type::Scylla:
case sstables::component_type::TemporaryTOC:
case sstables::component_type::TOC:

View File

@@ -176,8 +176,6 @@ public:
gms::feature rack_list_rf { *this, "RACK_LIST_RF"sv };
gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
public:
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;

15
main.cc
View File

@@ -23,7 +23,6 @@
#include <seastar/core/future.hh>
#include <seastar/core/signal.hh>
#include <seastar/core/timer.hh>
#include "service/client_routes.hh"
#include "service/qos/raft_service_level_distributed_data_accessor.hh"
#include "db/view/view_building_state.hh"
#include "tasks/task_manager.hh"
@@ -1796,13 +1795,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
auth_cache.stop().get();
});
checkpoint(stop_signal, "initializing client routes service");
static sharded<service::client_routes_service> client_routes;
client_routes.start(std::ref(stop_signal.as_sharded_abort_source()), std::ref(feature_service), std::ref(group0_client), std::ref(qp), std::ref(lifecycle_notifier)).get();
auto stop_client_routes = defer_verbose_shutdown("client_routes", [&] {
client_routes.stop().get();
});
checkpoint(stop_signal, "initializing storage service");
debug::the_storage_service = &ss;
ss.start(std::ref(stop_signal.as_sharded_abort_source()),
@@ -1811,7 +1803,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
std::ref(messaging), std::ref(repair),
std::ref(stream_manager), std::ref(lifecycle_notifier), std::ref(bm), std::ref(snitch),
std::ref(tablet_allocator), std::ref(cdc_generation_service), std::ref(view_builder), std::ref(view_building_worker), std::ref(qp), std::ref(sl_controller),
std::ref(auth_cache), std::ref(client_routes),
std::ref(auth_cache),
std::ref(tsm), std::ref(vbsm), std::ref(task_manager), std::ref(gossip_address_map),
compression_dict_updated_callback,
only_on_shard0(&*disk_space_monitor_shard0)
@@ -2199,11 +2191,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
});
}).get();
api::set_server_client_routes(ctx, client_routes).get();
auto stop_cr_api = defer_verbose_shutdown("client routes API", [&ctx] {
api::unset_server_client_routes(ctx).get();
});
checkpoint(stop_signal, "join cluster");
// Allow abort during join_cluster since bootstrap or replace
// can take a long time.

View File

@@ -45,9 +45,7 @@ mutation_partition::mutation_partition(const schema& s, const mutation_partition
: _tombstone(x._tombstone)
, _static_row(s, column_kind::static_column, x._static_row)
, _static_row_continuous(x._static_row_continuous)
, _rows(use_single_row_storage(s) ?
rows_storage_type(std::optional<deletable_row>{}) :
rows_storage_type(rows_type{}))
, _rows()
, _row_tombstones(x._row_tombstones)
#ifdef SEASTAR_DEBUG
, _schema_version(s.version())
@@ -56,30 +54,10 @@ mutation_partition::mutation_partition(const schema& s, const mutation_partition
#ifdef SEASTAR_DEBUG
SCYLLA_ASSERT(x._schema_version == _schema_version);
#endif
if (use_single_row_storage(s)) {
// Copy single row if it exists
if (x.uses_single_row_storage()) {
const auto& x_row = x.get_single_row_storage();
if (x_row) {
get_single_row_storage() = deletable_row(s, *x_row);
}
} else if (!x.get_rows_storage().empty()) {
// Converting from multi-row to single-row - take the first row
// This shouldn't normally happen as schema doesn't change this way
on_internal_error(mplog, "mutation_partition: cannot convert multi-row partition to single-row");
}
} else {
// Multi-row storage
if (x.uses_single_row_storage()) {
// Converting from single-row to multi-row - this shouldn't normally happen
on_internal_error(mplog, "mutation_partition: cannot convert single-row partition to multi-row");
} else {
auto cloner = [&s] (const rows_entry* x) -> rows_entry* {
return current_allocator().construct<rows_entry>(s, *x);
};
get_rows_storage().clone_from(x.get_rows_storage(), cloner, current_deleter<rows_entry>());
}
}
auto cloner = [&s] (const rows_entry* x) -> rows_entry* {
return current_allocator().construct<rows_entry>(s, *x);
};
_rows.clone_from(x._rows, cloner, current_deleter<rows_entry>());
}
mutation_partition::mutation_partition(const mutation_partition& x, const schema& schema,
@@ -87,9 +65,7 @@ mutation_partition::mutation_partition(const mutation_partition& x, const schema
: _tombstone(x._tombstone)
, _static_row(schema, column_kind::static_column, x._static_row)
, _static_row_continuous(x._static_row_continuous)
, _rows(use_single_row_storage(schema) ?
rows_storage_type(std::optional<deletable_row>{}) :
rows_storage_type(rows_type{}))
, _rows()
, _row_tombstones(x._row_tombstones, range_tombstone_list::copy_comparator_only())
#ifdef SEASTAR_DEBUG
, _schema_version(schema.version())
@@ -98,37 +74,19 @@ mutation_partition::mutation_partition(const mutation_partition& x, const schema
#ifdef SEASTAR_DEBUG
SCYLLA_ASSERT(x._schema_version == _schema_version);
#endif
if (use_single_row_storage(schema)) {
// Single-row storage: just copy the row if it exists
if (x.uses_single_row_storage()) {
const auto& x_row = x.get_single_row_storage();
if (x_row) {
get_single_row_storage() = deletable_row(schema, *x_row);
try {
for(auto&& r : ck_ranges) {
for (const rows_entry& e : x.range(schema, r)) {
auto ce = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(schema, e));
_rows.insert_before_hint(_rows.end(), std::move(ce), rows_entry::tri_compare(schema));
}
} else {
// Filtering from multi-row - shouldn't happen with consistent schema
on_internal_error(mplog, "mutation_partition: filtering from multi-row to single-row storage");
}
} else {
// Multi-row storage with filtering
if (x.uses_single_row_storage()) {
on_internal_error(mplog, "mutation_partition: filtering from single-row to multi-row storage");
} else {
try {
for(auto&& r : ck_ranges) {
for (const rows_entry& e : x.range(schema, r)) {
auto ce = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(schema, e));
get_rows_storage().insert_before_hint(get_rows_storage().end(), std::move(ce), rows_entry::tri_compare(schema));
}
for (auto&& rt : x._row_tombstones.slice(schema, r)) {
_row_tombstones.apply(schema, rt.tombstone());
}
}
} catch (...) {
get_rows_storage().clear_and_dispose(current_deleter<rows_entry>());
throw;
for (auto&& rt : x._row_tombstones.slice(schema, r)) {
_row_tombstones.apply(schema, rt.tombstone());
}
}
} catch (...) {
_rows.clear_and_dispose(current_deleter<rows_entry>());
throw;
}
}
@@ -146,20 +104,14 @@ mutation_partition::mutation_partition(mutation_partition&& x, const schema& sch
#ifdef SEASTAR_DEBUG
SCYLLA_ASSERT(x._schema_version == _schema_version);
#endif
if (use_single_row_storage(schema)) {
// Single-row storage: no filtering needed, row either exists or doesn't
// The move constructor has already moved the row if it exists
} else {
// Multi-row storage: filter the rows
if (!uses_single_row_storage()) {
auto deleter = current_deleter<rows_entry>();
auto it = get_rows_storage().begin();
for (auto&& range : ck_ranges.ranges()) {
get_rows_storage().erase_and_dispose(it, lower_bound(schema, range), deleter);
it = upper_bound(schema, range);
}
get_rows_storage().erase_and_dispose(it, get_rows_storage().end(), deleter);
{
auto deleter = current_deleter<rows_entry>();
auto it = _rows.begin();
for (auto&& range : ck_ranges.ranges()) {
_rows.erase_and_dispose(it, lower_bound(schema, range), deleter);
it = upper_bound(schema, range);
}
_rows.erase_and_dispose(it, _rows.end(), deleter);
}
{
for (auto&& range : ck_ranges.ranges()) {
@@ -175,11 +127,7 @@ mutation_partition::mutation_partition(mutation_partition&& x, const schema& sch
}
mutation_partition::~mutation_partition() {
if (uses_single_row_storage()) {
// Single-row storage: optional destructor handles cleanup
} else {
get_rows_storage().clear_and_dispose(current_deleter<rows_entry>());
}
_rows.clear_and_dispose(current_deleter<rows_entry>());
}
mutation_partition&
@@ -193,14 +141,10 @@ mutation_partition::operator=(mutation_partition&& x) noexcept {
void mutation_partition::ensure_last_dummy(const schema& s) {
check_schema(s);
if (uses_single_row_storage()) {
// Single-row storage doesn't use dummy entries
return;
}
if (get_rows_storage().empty() || !get_rows_storage().rbegin()->is_last_dummy()) {
if (_rows.empty() || !_rows.rbegin()->is_last_dummy()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::yes));
get_rows_storage().insert_before(get_rows_storage().end(), std::move(e));
_rows.insert_before(_rows.end(), std::move(e));
}
}
@@ -475,18 +419,9 @@ mutation_partition::tombstone_for_row(const schema& schema, const clustering_key
check_schema(schema);
row_tombstone t = row_tombstone(range_tombstone_for_row(schema, key));
if (use_single_row_storage(schema)) {
// Single-row storage: check if the single row exists and has tombstone
const auto& row_opt = get_single_row_storage();
if (row_opt) {
t.apply(row_opt->deleted_at(), row_opt->marker());
}
} else {
// Multi-row storage: search in B-tree
auto j = get_rows_storage().find(key, rows_entry::tri_compare(schema));
if (j != get_rows_storage().end()) {
t.apply(j->row().deleted_at(), j->row().marker());
}
auto j = _rows.find(key, rows_entry::tri_compare(schema));
if (j != _rows.end()) {
t.apply(j->row().deleted_at(), j->row().marker());
}
return t;
@@ -569,178 +504,97 @@ void mutation_partition::apply_insert(const schema& s, clustering_key_view key,
clustered_row(s, key).apply(row_marker(created_at, ttl, expiry));
}
void mutation_partition::insert_row(const schema& s, const clustering_key& key, deletable_row&& row) {
if (use_single_row_storage(s)) {
// Single-row storage: just set the row
get_single_row_storage() = std::move(row);
} else {
// Multi-row storage: insert into B-tree
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(key, std::move(row)));
get_rows_storage().insert_before_hint(get_rows_storage().end(), std::move(e), rows_entry::tri_compare(s));
}
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(key, std::move(row)));
_rows.insert_before_hint(_rows.end(), std::move(e), rows_entry::tri_compare(s));
}
void mutation_partition::insert_row(const schema& s, const clustering_key& key, const deletable_row& row) {
check_schema(s);
if (use_single_row_storage(s)) {
// Single-row storage: just copy the row
get_single_row_storage() = row;
} else {
// Multi-row storage: insert into B-tree
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, key, row));
get_rows_storage().insert_before_hint(get_rows_storage().end(), std::move(e), rows_entry::tri_compare(s));
}
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, key, row));
_rows.insert_before_hint(_rows.end(), std::move(e), rows_entry::tri_compare(s));
}
const row*
mutation_partition::find_row(const schema& s, const clustering_key& key) const {
check_schema(s);
if (use_single_row_storage(s)) {
// Single-row storage: return the single row's cells if it exists
const auto& row_opt = get_single_row_storage();
if (row_opt) {
return &row_opt->cells();
}
auto i = _rows.find(key, rows_entry::tri_compare(s));
if (i == _rows.end()) {
return nullptr;
} else {
// Multi-row storage: search in B-tree
auto i = get_rows_storage().find(key, rows_entry::tri_compare(s));
if (i == get_rows_storage().end()) {
return nullptr;
}
return &i->row().cells();
}
return &i->row().cells();
}
deletable_row&
mutation_partition::clustered_row(const schema& s, clustering_key&& key) {
check_schema(s);
check_row_key(s, key, is_dummy::no);
if (use_single_row_storage(s)) {
// Single-row storage: create row if it doesn't exist
auto& row_opt = get_single_row_storage();
if (!row_opt) {
row_opt = deletable_row();
}
return *row_opt;
} else {
// Multi-row storage: find or insert in B-tree
auto i = get_rows_storage().find(key, rows_entry::tri_compare(s));
if (i == get_rows_storage().end()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(std::move(key)));
i = get_rows_storage().insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
}
return i->row();
auto i = _rows.find(key, rows_entry::tri_compare(s));
if (i == _rows.end()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(std::move(key)));
i = _rows.insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
}
return i->row();
}
deletable_row&
mutation_partition::clustered_row(const schema& s, const clustering_key& key) {
check_schema(s);
check_row_key(s, key, is_dummy::no);
if (use_single_row_storage(s)) {
// Single-row storage: create row if it doesn't exist
auto& row_opt = get_single_row_storage();
if (!row_opt) {
row_opt = deletable_row();
}
return *row_opt;
} else {
// Multi-row storage: find or insert in B-tree
auto i = get_rows_storage().find(key, rows_entry::tri_compare(s));
if (i == get_rows_storage().end()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(key));
i = get_rows_storage().insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
}
return i->row();
auto i = _rows.find(key, rows_entry::tri_compare(s));
if (i == _rows.end()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(key));
i = _rows.insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
}
return i->row();
}
deletable_row&
mutation_partition::clustered_row(const schema& s, clustering_key_view key) {
check_schema(s);
check_row_key(s, key, is_dummy::no);
if (use_single_row_storage(s)) {
// Single-row storage: create row if it doesn't exist
auto& row_opt = get_single_row_storage();
if (!row_opt) {
row_opt = deletable_row();
}
return *row_opt;
} else {
// Multi-row storage: find or insert in B-tree
auto i = get_rows_storage().find(key, rows_entry::tri_compare(s));
if (i == get_rows_storage().end()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(key));
i = get_rows_storage().insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
}
return i->row();
auto i = _rows.find(key, rows_entry::tri_compare(s));
if (i == _rows.end()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(key));
i = _rows.insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
}
return i->row();
}
rows_entry&
mutation_partition::clustered_rows_entry(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
check_schema(s);
check_row_key(s, pos, dummy);
if (use_single_row_storage(s)) {
// Single-row storage doesn't use rows_entry - this shouldn't be called
on_internal_error(mplog, "mutation_partition::clustered_rows_entry() called with single-row storage");
}
auto i = get_rows_storage().find(pos, rows_entry::tri_compare(s));
if (i == get_rows_storage().end()) {
auto i = _rows.find(pos, rows_entry::tri_compare(s));
if (i == _rows.end()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, pos, dummy, continuous));
i = get_rows_storage().insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
i = _rows.insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
}
return *i;
}
deletable_row&
mutation_partition::clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
if (use_single_row_storage(s)) {
// Single-row storage: ignore dummy/continuous flags, just get/create the row
check_row_key(s, pos, dummy);
auto& row_opt = get_single_row_storage();
if (!row_opt) {
row_opt = deletable_row();
}
return *row_opt;
} else {
return clustered_rows_entry(s, pos, dummy, continuous).row();
}
return clustered_rows_entry(s, pos, dummy, continuous).row();
}
deletable_row&
mutation_partition::append_clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
check_schema(s);
check_row_key(s, pos, dummy);
if (use_single_row_storage(s)) {
// Single-row storage: just create/get the row
auto& row_opt = get_single_row_storage();
if (!row_opt) {
row_opt = deletable_row();
}
return *row_opt;
}
const auto cmp = rows_entry::tri_compare(s);
auto i = get_rows_storage().end();
if (!get_rows_storage().empty() && (cmp(*std::prev(i), pos) >= 0)) {
auto i = _rows.end();
if (!_rows.empty() && (cmp(*std::prev(i), pos) >= 0)) {
on_internal_error(mplog, format("mutation_partition::append_clustered_row(): cannot append clustering row with key {} to the partition"
", last clustering row is equal or greater: {}", pos, std::prev(i)->position()));
}
auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(s, pos, dummy, continuous));
i = get_rows_storage().insert_before_hint(i, std::move(e), cmp).first;
i = _rows.insert_before_hint(i, std::move(e), cmp).first;
return i->row();
}
@@ -748,33 +602,19 @@ mutation_partition::append_clustered_row(const schema& s, position_in_partition_
mutation_partition::rows_type::const_iterator
mutation_partition::lower_bound(const schema& schema, const query::clustering_range& r) const {
check_schema(schema);
if (use_single_row_storage(schema)) {
// Single-row storage: always return end iterator (empty range)
static const rows_type empty_rows;
return empty_rows.end();
}
if (!r.start()) {
return std::cbegin(get_rows_storage());
return std::cbegin(_rows);
}
return get_rows_storage().lower_bound(position_in_partition_view::for_range_start(r), rows_entry::tri_compare(schema));
return _rows.lower_bound(position_in_partition_view::for_range_start(r), rows_entry::tri_compare(schema));
}
mutation_partition::rows_type::const_iterator
mutation_partition::upper_bound(const schema& schema, const query::clustering_range& r) const {
check_schema(schema);
if (use_single_row_storage(schema)) {
// Single-row storage: always return end iterator (empty range)
static const rows_type empty_rows;
return empty_rows.end();
}
if (!r.end()) {
return std::cend(get_rows_storage());
return std::cend(_rows);
}
return get_rows_storage().lower_bound(position_in_partition_view::for_range_end(r), rows_entry::tri_compare(schema));
return _rows.lower_bound(position_in_partition_view::for_range_end(r), rows_entry::tri_compare(schema));
}
std::ranges::subrange<mutation_partition::rows_type::const_iterator>
@@ -785,32 +625,17 @@ mutation_partition::range(const schema& schema, const query::clustering_range& r
std::ranges::subrange<mutation_partition::rows_type::iterator>
mutation_partition::range(const schema& schema, const query::clustering_range& r) {
if (use_single_row_storage(schema)) {
// Single-row storage: return empty range (rows_entry iteration not applicable)
static rows_type empty_rows;
return std::ranges::subrange(empty_rows.begin(), empty_rows.end());
}
return unconst(get_rows_storage(), static_cast<const mutation_partition*>(this)->range(schema, r));
return unconst(_rows, static_cast<const mutation_partition*>(this)->range(schema, r));
}
mutation_partition::rows_type::iterator
mutation_partition::lower_bound(const schema& schema, const query::clustering_range& r) {
if (use_single_row_storage(schema)) {
// Single-row storage: return end iterator (empty range)
static rows_type empty_rows;
return empty_rows.end();
}
return unconst(get_rows_storage(), static_cast<const mutation_partition*>(this)->lower_bound(schema, r));
return unconst(_rows, static_cast<const mutation_partition*>(this)->lower_bound(schema, r));
}
mutation_partition::rows_type::iterator
mutation_partition::upper_bound(const schema& schema, const query::clustering_range& r) {
if (use_single_row_storage(schema)) {
// Single-row storage: return end iterator (empty range)
static rows_type empty_rows;
return empty_rows.end();
}
return unconst(get_rows_storage(), static_cast<const mutation_partition*>(this)->upper_bound(schema, r));
return unconst(_rows, static_cast<const mutation_partition*>(this)->upper_bound(schema, r));
}
template<typename Func>
@@ -1552,15 +1377,7 @@ bool mutation_partition::empty() const
if (_tombstone.timestamp != api::missing_timestamp) {
return false;
}
if (_static_row.size() || !_row_tombstones.empty()) {
return false;
}
if (uses_single_row_storage()) {
return !get_single_row_storage().has_value();
} else {
return get_rows_storage().empty();
}
return !_static_row.size() && _rows.empty() && _row_tombstones.empty();
}
bool
@@ -1605,11 +1422,7 @@ mutation_partition::live_row_count(const schema& s, gc_clock::time_point query_t
uint64_t
mutation_partition::row_count() const {
if (uses_single_row_storage()) {
return get_single_row_storage().has_value() ? 1 : 0;
} else {
return get_rows_storage().calculate_size();
}
return _rows.calculate_size();
}
rows_entry::rows_entry(rows_entry&& o) noexcept
@@ -2406,22 +2219,15 @@ public:
mutation_partition::mutation_partition(mutation_partition::incomplete_tag, const schema& s, tombstone t)
: _tombstone(t)
, _static_row_continuous(!s.has_static_columns())
, _rows(use_single_row_storage(s) ?
rows_storage_type(std::optional<deletable_row>{}) :
rows_storage_type(rows_type{}))
, _rows()
, _row_tombstones(s)
#ifdef SEASTAR_DEBUG
, _schema_version(s.version())
#endif
{
if (use_single_row_storage(s)) {
// Single-row storage: no dummy entries needed, leave row as empty optional
} else {
// Multi-row storage: add last dummy entry for discontinuous partition
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::no));
get_rows_storage().insert_before(get_rows_storage().end(), std::move(e));
}
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::no));
_rows.insert_before(_rows.end(), std::move(e));
}
bool mutation_partition::is_fully_continuous() const {

View File

@@ -9,7 +9,6 @@
#pragma once
#include <iosfwd>
#include <variant>
#include <boost/intrusive/parent_from_member.hpp>
#include <seastar/util/optimized_optional.hh>
@@ -1189,12 +1188,6 @@ inline void check_row_key(const schema& s, position_in_partition_view pos, is_du
}
}
// Returns true if the schema has no clustering keys, meaning partitions can have at most one row.
// When true, mutation_partition uses std::optional<deletable_row> instead of full rows_type container.
inline bool use_single_row_storage(const schema& s) {
return s.clustering_key_size() == 0;
}
// Represents a set of writes made to a single partition.
//
// The object is schema-dependent. Each instance is governed by some
@@ -1235,45 +1228,20 @@ inline bool use_single_row_storage(const schema& s) {
class mutation_partition final {
public:
using rows_type = rows_entry::container_type;
using rows_storage_type = std::variant<rows_type, std::optional<deletable_row>>;
friend class size_calculator;
private:
tombstone _tombstone;
lazy_row _static_row;
bool _static_row_continuous = true;
rows_storage_type _rows;
rows_type _rows;
// Contains only strict prefixes so that we don't have to lookup full keys
// in both _row_tombstones and _rows.
// Note: empty when using single-row storage (std::optional<deletable_row> variant)
range_tombstone_list _row_tombstones;
#ifdef SEASTAR_DEBUG
table_schema_version _schema_version;
#endif
friend class converting_mutation_partition_applier;
// Returns true if this partition uses single-row storage
bool uses_single_row_storage() const {
return std::holds_alternative<std::optional<deletable_row>>(_rows);
}
// Get reference to rows container (multi-row storage)
rows_type& get_rows_storage() {
return std::get<rows_type>(_rows);
}
const rows_type& get_rows_storage() const {
return std::get<rows_type>(_rows);
}
// Get reference to single row storage
std::optional<deletable_row>& get_single_row_storage() {
return std::get<std::optional<deletable_row>>(_rows);
}
const std::optional<deletable_row>& get_single_row_storage() const {
return std::get<std::optional<deletable_row>>(_rows);
}
public:
struct copy_comparators_only {};
struct incomplete_tag {};
@@ -1283,14 +1251,14 @@ public:
return mutation_partition(incomplete_tag(), s, t);
}
mutation_partition(const schema& s)
: _rows(use_single_row_storage(s) ? rows_storage_type(std::optional<deletable_row>{}) : rows_storage_type(rows_type{}))
: _rows()
, _row_tombstones(s)
#ifdef SEASTAR_DEBUG
, _schema_version(s.version())
#endif
{ }
mutation_partition(mutation_partition& other, copy_comparators_only)
: _rows(other._rows.index() == 0 ? rows_storage_type(rows_type{}) : rows_storage_type(std::optional<deletable_row>{}))
: _rows()
, _row_tombstones(other._row_tombstones, range_tombstone_list::copy_comparator_only())
#ifdef SEASTAR_DEBUG
, _schema_version(other._schema_version)
@@ -1301,8 +1269,6 @@ public:
mutation_partition(const mutation_partition&, const schema&, query::clustering_key_filter_ranges);
mutation_partition(mutation_partition&&, const schema&, query::clustering_key_filter_ranges);
~mutation_partition();
// Returns the mutation_partition containing the given rows_type.
// Can only be used when the mutation_partition uses multi-row storage.
static mutation_partition& container_of(rows_type&);
mutation_partition& operator=(mutation_partition&& x) noexcept;
bool equal(const schema&, const mutation_partition&) const;
@@ -1496,31 +1462,9 @@ public:
const lazy_row& static_row() const { return _static_row; }
// return a set of rows_entry where each entry represents a CQL row sharing the same clustering key.
// For single-row storage (clustering_key_size() == 0), returns an empty container.
// Callers should check uses_single_row_storage() and use get_single_row() for single-row case.
const rows_type& clustered_rows() const noexcept {
if (uses_single_row_storage()) {
static const rows_type empty_rows;
return empty_rows;
}
return get_rows_storage();
}
utils::immutable_collection<rows_type> clustered_rows() noexcept {
return const_cast<const mutation_partition*>(this)->clustered_rows();
}
rows_type& mutable_clustered_rows() noexcept {
// Should only be called when NOT using single-row storage
return get_rows_storage();
}
// Access the single row when using single-row storage (clustering_key_size() == 0)
const std::optional<deletable_row>& get_single_row() const {
return get_single_row_storage();
}
std::optional<deletable_row>& get_single_row() {
return get_single_row_storage();
}
const rows_type& clustered_rows() const noexcept { return _rows; }
utils::immutable_collection<rows_type> clustered_rows() noexcept { return _rows; }
rows_type& mutable_clustered_rows() noexcept { return _rows; }
const range_tombstone_list& row_tombstones() const noexcept { return _row_tombstones; }
utils::immutable_collection<range_tombstone_list> row_tombstones() noexcept { return _row_tombstones; }
@@ -1538,14 +1482,8 @@ public:
rows_type::iterator upper_bound(const schema& schema, const query::clustering_range& r);
std::ranges::subrange<rows_type::iterator> range(const schema& schema, const query::clustering_range& r);
// Returns an iterator range of rows_entry, with only non-dummy entries.
// For single-row storage, returns an empty range.
auto non_dummy_rows() const {
if (uses_single_row_storage()) {
static const rows_type empty_rows;
return std::ranges::subrange(empty_rows.begin(), empty_rows.end())
| std::views::filter([] (const rows_entry& e) { return bool(!e.dummy()); });
}
return std::ranges::subrange(get_rows_storage().begin(), get_rows_storage().end())
return std::ranges::subrange(_rows.begin(), _rows.end())
| std::views::filter([] (const rows_entry& e) { return bool(!e.dummy()); });
}
void accept(const schema&, mutation_partition_visitor&) const;
@@ -1579,21 +1517,7 @@ private:
inline
mutation_partition& mutation_partition::container_of(rows_type& rows) {
// This method can only be called when using multi-row storage (rows_type variant alternative).
// With std::variant, when rows_type is the active alternative (index 0), it's stored at the beginning of the variant.
// We can use pointer arithmetic to get back to the mutation_partition.
// Calculate offset from rows_type to the containing variant
// The rows reference should be the active rows_type inside the variant
static_assert(std::is_same_v<std::variant_alternative_t<0, rows_storage_type>, rows_type>,
"rows_type must be the first alternative in rows_storage_type");
// Get address of the variant containing this rows_type
// When rows_type is active (index 0), it's at offset 0 in the variant's storage
rows_storage_type* variant_ptr = reinterpret_cast<rows_storage_type*>(&rows);
// Now get the mutation_partition from the variant
return *boost::intrusive::get_parent_from_member(variant_ptr, &mutation_partition::_rows);
return *boost::intrusive::get_parent_from_member(&rows, &mutation_partition::_rows);
}
bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tombstone tomb = tombstone(),

View File

@@ -56,16 +56,33 @@ static tasks::task_manager::task_state get_state(const db::system_keyspace::topo
}
}
static future<db::system_keyspace::topology_requests_entries> get_entries(db::system_keyspace& sys_ks, std::chrono::seconds ttl) {
return sys_ks.get_node_ops_request_entries(db_clock::now() - ttl);
// Collects the request id of every entry listed in topology.requests.
// Each entry's key (request.first) is looked up with topology.find() and the
// request_id stored in the found element is added to the result set.
static std::set<tasks::task_id> get_pending_ids(service::topology& topology) {
    std::set<tasks::task_id> pending;
    for (auto& pending_request : topology.requests) {
        auto node = topology.find(pending_request.first);
        pending.emplace(node->second.request_id);
    }
    return pending;
}
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
auto entry_opt = co_await _ss._sys_ks.local().get_topology_request_entry_opt(id.uuid());
if (!entry_opt) {
// Builds the map of node-ops request entries from two sources:
//  - started requests, as returned by get_node_ops_request_entries() for the
//    cutoff time (now - ttl);
//  - pending requests, i.e. the ids reported by get_pending_ids(); each of them
//    gets a default-constructed entry unless the map already holds one for that id.
static future<db::system_keyspace::topology_requests_entries> get_entries(db::system_keyspace& sys_ks, service::topology& topology, std::chrono::seconds ttl) {
    // Started requests.
    const auto cutoff = db_clock::now() - ttl;
    auto result = co_await sys_ks.get_node_ops_request_entries(cutoff);

    // Pending requests. try_emplace leaves an already present (started) entry untouched.
    const auto pending = get_pending_ids(topology);
    for (const auto& task : pending) {
        result.try_emplace(task.uuid(), db::system_keyspace::topology_requests_entry{});
    }

    co_return result;
}
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status_helper(tasks::task_id id, tasks::virtual_task_hint hint) const {
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry(id.uuid(), false);
auto started = entry.id;
service::topology& topology = _ss._topology_state_machine._topology;
if (!started && !get_pending_ids(topology).contains(id)) {
co_return std::nullopt;
}
auto& entry = *entry_opt;
co_return tasks::task_status{
.task_id = id,
.type = request_type_to_task_type(entry.request_type),
@@ -84,7 +101,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
.entity = "",
.progress_units = "",
.progress = tasks::task_manager::task::progress{},
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
.children = started ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{}
};
}
@@ -106,22 +123,26 @@ future<std::optional<tasks::virtual_task_hint>> node_ops_virtual_task::contains(
}
}
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(task_id.uuid());
co_return entry && std::holds_alternative<service::topology_request>(entry->request_type) ? empty_hint : std::nullopt;
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry(task_id.uuid(), false);
co_return bool(entry.id) && std::holds_alternative<service::topology_request>(entry.request_type) ? empty_hint : std::nullopt;
}
// Always answers "not abortable"; the hint argument is ignored.
future<tasks::is_abortable> node_ops_virtual_task::is_abortable(tasks::virtual_task_hint) const {
    const auto verdict = tasks::is_abortable::no;
    return make_ready_future<tasks::is_abortable>(verdict);
}
// Thin forwarding wrapper: the status lookup is done by get_status_helper(),
// which receives the id and takes over the hint.
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
    auto status = get_status_helper(id, std::move(hint));
    return status;
}
future<std::optional<tasks::task_status>> node_ops_virtual_task::wait(tasks::task_id id, tasks::virtual_task_hint hint) {
auto entry = co_await get_status(id, hint);
auto entry = co_await get_status_helper(id, hint);
if (!entry) {
co_return std::nullopt;
}
co_await _ss.wait_for_topology_request_completion(id.uuid(), false);
co_return co_await get_status(id, std::move(hint));
co_return co_await get_status_helper(id, std::move(hint));
}
future<> node_ops_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hint) noexcept {
@@ -130,7 +151,8 @@ future<> node_ops_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hin
future<std::vector<tasks::task_stats>> node_ops_virtual_task::get_stats() {
db::system_keyspace& sys_ks = _ss._sys_ks.local();
co_return std::ranges::to<std::vector<tasks::task_stats>>(co_await get_entries(sys_ks, get_task_manager().get_user_task_ttl())
service::topology& topology = _ss._topology_state_machine._topology;
co_return std::ranges::to<std::vector<tasks::task_stats>>(co_await get_entries(sys_ks, topology, get_task_manager().get_user_task_ttl())
| std::views::transform([] (const auto& e) {
auto id = e.first;
auto& entry = e.second;

View File

@@ -39,6 +39,8 @@ public:
virtual future<std::optional<tasks::task_status>> wait(tasks::task_id id, tasks::virtual_task_hint hint) override;
virtual future<> abort(tasks::task_id id, tasks::virtual_task_hint hint) noexcept override;
virtual future<std::vector<tasks::task_stats>> get_stats() override;
private:
future<std::optional<tasks::task_status>> get_status_helper(tasks::task_id id, tasks::virtual_task_hint hint) const;
};
class streaming_task_impl : public tasks::task_manager::task::impl {

View File

@@ -176,7 +176,7 @@ void fsm::become_leader() {
_last_election_time = _clock.now();
_ping_leader = false;
// a new leader needs to commit at least one entry to make sure that
// a new leader needs to commit at least one entry to make sure that
// all existing entries in its log are committed as well. Also it should
// send append entries RPC as soon as possible to establish its leadership
// (3.4). Do both of those by committing a dummy entry.

View File

@@ -1195,8 +1195,6 @@ private:
rlogger.info("{}", msg);
throw std::runtime_error(msg);
}
co_await utils::get_local_injector().inject("incremental_repair_prepare_wait", utils::wait_for_message(60s));
auto reenablers_and_holders = co_await table.get_compaction_reenablers_and_lock_holders_for_repair(_db.local(), _frozen_topology_guard, _range);
for (auto& lock_holder : reenablers_and_holders.lock_holders) {
_rs._repair_compaction_locks[_frozen_topology_guard].push_back(std::move(lock_holder));

View File

@@ -84,10 +84,6 @@ class compaction_group {
seastar::named_gate _async_gate;
// Gates flushes.
seastar::named_gate _flush_gate;
// Gates sstable being added to the group.
// This prevents the group from being considered empty when sstables are being added.
// Crucial for tablet split which ACKs split for a table when all pre-split groups are empty.
seastar::named_gate _sstable_add_gate;
bool _tombstone_gc_enabled = true;
std::optional<compaction::compaction_backlog_tracker> _backlog_tracker;
repair_classifier_func _repair_sstable_classifier;
@@ -252,10 +248,6 @@ public:
return _flush_gate;
}
// Gives access to the gate member _sstable_add_gate of this group.
seastar::named_gate& sstable_add_gate() noexcept {
    auto& gate = _sstable_add_gate;
    return gate;
}
compaction::compaction_manager& get_compaction_manager() noexcept;
const compaction::compaction_manager& get_compaction_manager() const noexcept;
@@ -442,7 +434,7 @@ public:
virtual bool all_storage_groups_split() = 0;
virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
virtual future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) = 0;
virtual future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) = 0;
virtual dht::token_range get_token_range_after_split(const dht::token&) const noexcept = 0;
virtual lw_shared_ptr<sstables::sstable_set> make_sstable_set() const = 0;

View File

@@ -2793,7 +2793,6 @@ future<> database::flush_all_tables() {
});
_all_tables_flushed_at = db_clock::now();
co_await _commitlog->wait_for_pending_deletes();
dblog.info("Forcing new commitlog segment and flushing all tables complete");
}
future<db_clock::time_point> database::get_all_tables_flushed_at(sharded<database>& sharded_db) {

View File

@@ -604,28 +604,9 @@ public:
data_dictionary::table as_data_dictionary() const;
// The usage of these functions is restricted to preexisting sstables that aren't being
// moved anywhere, so should never be used in the context of file streaming and intra
// node migration. The only user today is distributed loader, which populates the
// sstables for each column family on boot.
future<> add_sstable_and_update_cache(sstables::shared_sstable sst,
sstables::offstrategy offstrategy = sstables::offstrategy::no);
future<> add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>& ssts);
// Restricted to new sstables produced by external processes such as repair.
// The sstable might undergo split if table is in split mode.
// If no need for split, the input sstable will only be attached to the sstable set.
// If split happens, the output sstables will be attached and the input sstable unlinked.
// On failure, the input sstable is unlinked and exception propagated to the caller.
// The on_add callback will be called on all sstables to be added into the set.
[[nodiscard]] future<std::vector<sstables::shared_sstable>>
add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
std::function<future<>(sstables::shared_sstable)> on_add,
sstables::offstrategy offstrategy = sstables::offstrategy::no);
[[nodiscard]] future<std::vector<sstables::shared_sstable>>
add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> new_ssts,
std::function<future<>(sstables::shared_sstable)> on_add);
future<> move_sstables_from_staging(std::vector<sstables::shared_sstable>);
sstables::shared_sstable make_sstable();
void set_truncation_time(db_clock::time_point truncated_at) noexcept {
@@ -743,9 +724,7 @@ private:
return _config.enable_cache && _schema->caching_options().enabled();
}
void update_stats_for_new_sstable(const sstables::shared_sstable& sst) noexcept;
// This function can throw even if the sstable was added into the set. When the sstable was successfully
// added, the sstable ptr @sst will be set to nullptr. Allowing caller to optionally discard the sstable.
future<> do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable& sst, sstables::offstrategy, bool trigger_compaction);
future<> do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable sst, sstables::offstrategy, bool trigger_compaction);
future<> do_add_sstable_and_update_cache(sstables::shared_sstable sst, sstables::offstrategy offstrategy, bool trigger_compaction);
// Helpers which add sstable on behalf of a compaction group and refreshes compound set.
void add_sstable(compaction_group& cg, sstables::shared_sstable sstable);
@@ -1379,8 +1358,7 @@ public:
// Clones storage of a given tablet. Memtable is flushed first to guarantee that the
// snapshot (list of sstables) will include all the data written up to the time it was taken.
// If leave_unsealed is set, all the destination sstables will be left unsealed.
future<utils::chunked_vector<sstables::entry_descriptor>> clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed);
future<utils::chunked_vector<sstables::entry_descriptor>> clone_tablet_storage(locator::tablet_id tid);
friend class compaction_group;
friend class compaction::compaction_task_impl;

View File

@@ -721,7 +721,7 @@ public:
bool all_storage_groups_split() override { return true; }
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override { return make_ready_future(); }
future<> maybe_split_compaction_group_of(size_t idx) override { return make_ready_future(); }
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) override {
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) override {
return make_ready_future<std::vector<sstables::shared_sstable>>(std::vector<sstables::shared_sstable>{sst});
}
dht::token_range get_token_range_after_split(const dht::token&) const noexcept override { return dht::token_range(); }
@@ -879,7 +879,7 @@ public:
bool all_storage_groups_split() override;
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
future<> maybe_split_compaction_group_of(size_t idx) override;
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) override;
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) override;
dht::token_range get_token_range_after_split(const dht::token& token) const noexcept override {
return tablet_map().get_token_range_after_split(token);
}
@@ -1130,8 +1130,7 @@ future<> tablet_storage_group_manager::maybe_split_compaction_group_of(size_t id
}
future<std::vector<sstables::shared_sstable>>
tablet_storage_group_manager::maybe_split_new_sstable(const sstables::shared_sstable& sst) {
co_await utils::get_local_injector().inject("maybe_split_new_sstable_wait", utils::wait_for_message(120s));
tablet_storage_group_manager::maybe_split_sstable(const sstables::shared_sstable& sst) {
if (!tablet_map().needs_split()) {
co_return std::vector<sstables::shared_sstable>{sst};
}
@@ -1139,7 +1138,8 @@ tablet_storage_group_manager::maybe_split_new_sstable(const sstables::shared_sst
auto& cg = compaction_group_for_sstable(sst);
auto holder = cg.async_gate().hold();
auto& view = cg.view_for_sstable(sst);
co_return co_await _t.get_compaction_manager().maybe_split_new_sstable(sst, view, co_await split_compaction_options());
auto lock_holder = co_await _t.get_compaction_manager().get_incremental_repair_read_lock(view, "maybe_split_sstable");
co_return co_await _t.get_compaction_manager().maybe_split_sstable(sst, view, co_await split_compaction_options());
}
future<> table::maybe_split_compaction_group_of(locator::tablet_id tablet_id) {
@@ -1149,7 +1149,7 @@ future<> table::maybe_split_compaction_group_of(locator::tablet_id tablet_id) {
future<std::vector<sstables::shared_sstable>> table::maybe_split_new_sstable(const sstables::shared_sstable& sst) {
auto holder = async_gate().hold();
co_return co_await _sg_manager->maybe_split_new_sstable(sst);
co_return co_await _sg_manager->maybe_split_sstable(sst);
}
dht::token_range table::get_token_range_after_split(const dht::token& token) const noexcept {
@@ -1330,7 +1330,7 @@ future<utils::chunked_vector<sstables::shared_sstable>> table::take_sstable_set_
}
future<utils::chunked_vector<sstables::entry_descriptor>>
table::clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed) {
table::clone_tablet_storage(locator::tablet_id tid) {
utils::chunked_vector<sstables::entry_descriptor> ret;
auto holder = async_gate().hold();
@@ -1342,7 +1342,7 @@ table::clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed) {
// by compaction while we are waiting for the lock.
auto deletion_guard = co_await get_sstable_list_permit();
co_await sg.make_sstable_set()->for_each_sstable_gently([&] (const sstables::shared_sstable& sst) -> future<> {
ret.push_back(co_await sst->clone(calculate_generation_for_new_table(), leave_unsealed));
ret.push_back(co_await sst->clone(calculate_generation_for_new_table()));
});
co_return ret;
}
@@ -1354,10 +1354,10 @@ void table::update_stats_for_new_sstable(const sstables::shared_sstable& sst) no
}
future<>
table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable& sst, sstables::offstrategy offstrategy,
table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable sst, sstables::offstrategy offstrategy,
bool trigger_compaction) {
auto permit = co_await seastar::get_units(_sstable_set_mutation_sem, 1);
co_return co_await get_row_cache().invalidate(row_cache::external_updater([&] () mutable noexcept {
co_return co_await get_row_cache().invalidate(row_cache::external_updater([&] () noexcept {
// FIXME: this is not really noexcept, but we need to provide strong exception guarantees.
// atomically load all opened sstables into column family.
if (!offstrategy) {
@@ -1369,8 +1369,6 @@ table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_ss
if (trigger_compaction) {
try_trigger_compaction(cg);
}
// Resetting sstable ptr to inform the caller the sstable has been loaded successfully.
sst = nullptr;
}), dht::partition_range::make({sst->get_first_decorated_key(), true}, {sst->get_last_decorated_key(), true}), [sst, schema = _schema] (const dht::decorated_key& key) {
return sst->filter_has_key(sstables::key::from_partition_key(*schema, key.key()));
});
@@ -1378,10 +1376,12 @@ table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_ss
future<>
table::do_add_sstable_and_update_cache(sstables::shared_sstable new_sst, sstables::offstrategy offstrategy, bool trigger_compaction) {
auto& cg = compaction_group_for_sstable(new_sst);
// Hold gate to make sure the compaction group is alive.
auto holder = cg.async_gate().hold();
co_await do_add_sstable_and_update_cache(cg, new_sst, offstrategy, trigger_compaction);
for (auto sst : co_await maybe_split_new_sstable(new_sst)) {
auto& cg = compaction_group_for_sstable(sst);
// Hold gate to make sure the compaction group is alive.
auto holder = cg.async_gate().hold();
co_await do_add_sstable_and_update_cache(cg, std::move(sst), offstrategy, trigger_compaction);
}
}
future<>
@@ -1399,85 +1399,6 @@ table::add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>
trigger_compaction();
}
// Adds a new sstable to the table, passing it through maybe_split_new_sstable() first.
//
// new_sst      - the input sstable.
// on_add       - awaited for every sstable returned by maybe_split_new_sstable(),
//                right before that sstable is attached.
// offstrategy  - forwarded to do_add_sstable_and_update_cache(); compaction is
//                triggered only when it equals sstables::offstrategy::no.
//
// Returns the sstables returned by maybe_split_new_sstable().
// If anything throws, every sstable whose pointer is still non-null at that point
// (the input, or outputs not attached yet) is unlinked and the exception is propagated.
future<std::vector<sstables::shared_sstable>>
table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
std::function<future<>(sstables::shared_sstable)> on_add,
sstables::offstrategy offstrategy) {
// ret is what the caller gets back; ssts is a working copy whose entries are
// reset to nullptr once attached, so the error path knows what is left to unlink.
std::vector<sstables::shared_sstable> ret, ssts;
std::exception_ptr ex;
try {
bool trigger_compaction = offstrategy == sstables::offstrategy::no;
auto& cg = compaction_group_for_sstable(new_sst);
// This prevents compaction group from being considered empty until the holder is released.
// Helpful for tablet split, where split is acked for a table when all pre-split groups are empty.
auto sstable_add_holder = cg.sstable_add_gate().hold();
ret = ssts = co_await maybe_split_new_sstable(new_sst);
// On successful split, the input sstable is unlinked. Clearing the pointer
// keeps the error path below from unlinking it again.
new_sst = nullptr;
for (auto& sst : ssts) {
auto& cg = compaction_group_for_sstable(sst);
// Hold gate to make sure compaction group is alive.
auto holder = cg.async_gate().hold();
co_await on_add(sst);
// If do_add_sstable_and_update_cache() throws after sstable has been loaded, the pointer
// sst passed by reference will be set to nullptr, so it won't be unlinked in the exception
// handler below.
co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
sst = nullptr;
}
} catch (...) {
// Cleanup is done outside the handler, after the exception has been captured.
ex = std::current_exception();
}
if (ex) {
// On failed split, the input sstable is unlinked here.
if (new_sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
co_await new_sst->unlink();
}
// On failure after a successful split, sstables not attached yet will be unlinked.
co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
if (sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
co_await sst->unlink();
}
});
co_await coroutine::return_exception_ptr(std::move(ex));
}
co_return std::move(ret);
}
// Adds several new sstables, one at a time and in order, by calling
// add_new_sstable_and_update_cache() on each with the given on_add callback.
//
// Returns the concatenation of the vectors returned by the individual calls.
// If a call throws, the inputs that were not handed over yet are unlinked and
// the exception is propagated.
future<std::vector<sstables::shared_sstable>>
table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> new_ssts,
std::function<future<>(sstables::shared_sstable)> on_add) {
std::exception_ptr ex;
std::vector<sstables::shared_sstable> ret;
// We rely on add_new_sstable_and_update_cache() to unlink the sstable fed into it,
// so the exception handling below will only have to unlink sstables not processed yet.
try {
for (auto& sst: new_ssts) {
// std::exchange nulls the slot in new_ssts before the call, which marks
// this input as handed over for the cleanup below.
auto ssts = co_await add_new_sstable_and_update_cache(std::exchange(sst, nullptr), on_add);
std::ranges::move(ssts, std::back_inserter(ret));
}
} catch (...) {
ex = std::current_exception();
}
if (ex) {
// Only entries that are still non-null (never handed over) are unlinked here.
co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
if (sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
co_await sst->unlink();
}
});
co_await coroutine::return_exception_ptr(std::move(ex));
}
co_return std::move(ret);
}
future<>
table::update_cache(compaction_group& cg, lw_shared_ptr<memtable> m, std::vector<sstables::shared_sstable> ssts) {
auto permit = co_await seastar::get_units(_sstable_set_mutation_sem, 1);
@@ -2691,8 +2612,8 @@ public:
sstables::sstables_manager& get_sstables_manager() noexcept override {
return _t.get_sstables_manager();
}
sstables::shared_sstable make_sstable(sstables::sstable_state state) const override {
return _t.make_sstable(state);
sstables::shared_sstable make_sstable() const override {
return _t.make_sstable();
}
sstables::sstable_writer_config configure_writer(sstring origin) const override {
auto cfg = _t.get_sstables_manager().configure_writer(std::move(origin));
@@ -2810,7 +2731,6 @@ future<> compaction_group::stop(sstring reason) noexcept {
auto flush_future = co_await seastar::coroutine::as_future(flush());
co_await _flush_gate.close();
co_await _sstable_add_gate.close();
// FIXME: indentation
_compaction_disabler_for_views.clear();
co_await utils::get_local_injector().inject("compaction_group_stop_wait", utils::wait_for_message(60s));
@@ -2824,7 +2744,7 @@ future<> compaction_group::stop(sstring reason) noexcept {
}
bool compaction_group::empty() const noexcept {
return _memtables->empty() && live_sstable_count() == 0 && _sstable_add_gate.get_count() == 0;
return _memtables->empty() && live_sstable_count() == 0;
}
const schema_ptr& compaction_group::schema() const {
@@ -3280,7 +3200,7 @@ db::replay_position table::highest_flushed_replay_position() const {
}
struct manifest_json : public json::json_base {
json::json_chunked_list<std::string_view> files;
json::json_chunked_list<sstring> files;
manifest_json() {
register_params();
@@ -3304,7 +3224,7 @@ table::seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets)
manifest_json manifest;
for (const auto& fsp : file_sets) {
for (auto& rf : *fsp) {
manifest.files.push(std::string_view(rf));
manifest.files.push(std::move(rf));
}
}
auto streamer = json::stream_object(std::move(manifest));
@@ -3465,15 +3385,16 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
continue;
}
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
while (auto de = lister.get().get()) {
auto snapshot_name = de->name;
lister::scan_dir(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>(), [datadir, &all_snapshots] (fs::path snapshots_dir, directory_entry de) {
auto snapshot_name = de.name;
all_snapshots.emplace(snapshot_name, snapshot_details());
auto details = get_snapshot_details(snapshots_dir / fs::path(snapshot_name), datadir).get();
auto& sd = all_snapshots.at(snapshot_name);
sd.total += details.total;
sd.live += details.live;
}
return get_snapshot_details(snapshots_dir / fs::path(snapshot_name), datadir).then([&all_snapshots, snapshot_name] (auto details) {
auto& sd = all_snapshots.at(snapshot_name);
sd.total += details.total;
sd.live += details.live;
return make_ready_future<>();
});
}).get();
}
return all_snapshots;
});
@@ -3481,61 +3402,38 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_dir, fs::path datadir) {
table::snapshot_details details{};
std::optional<fs::path> staging_dir = snapshot_dir / sstables::staging_dir;
if (!co_await file_exists(staging_dir->native())) {
staging_dir.reset();
}
auto lister = directory_lister(snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
while (auto de = co_await lister.get()) {
const auto& name = de->name;
// FIXME: optimize stat calls by keeping the base directory open and use statat instead, here and below.
// See https://github.com/scylladb/seastar/pull/3163
auto sd = co_await io_check(file_stat, (snapshot_dir / name).native(), follow_symlink::no);
co_await lister::scan_dir(snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>(), [datadir, &details] (fs::path snapshot_dir, directory_entry de) -> future<> {
auto sd = co_await io_check(file_stat, (snapshot_dir / de.name).native(), follow_symlink::no);
auto size = sd.allocated_size;
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
//
// All the others should just generate an exception: there is something wrong, so don't blindly
// add it to the size.
if (name != "manifest.json" && name != "schema.cql") {
if (de.name != "manifest.json" && de.name != "schema.cql") {
details.total += size;
if (sd.number_of_links == 1) {
// File exists only in the snapshot directory.
details.live += size;
continue;
}
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
// So check the datadir for the file too.
} else {
continue;
size = 0;
}
auto exists_in_dir = [&] (fs::path path) -> future<bool> {
try {
try {
// File exists in the main SSTable directory. Snapshots are not contributing to size
auto psd = co_await io_check(file_stat, path.native(), follow_symlink::no);
auto psd = co_await io_check(file_stat, (datadir / de.name).native(), follow_symlink::no);
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
(datadir / name).native(), psd.device_id, psd.inode_number, psd.size,
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
co_return false;
(datadir / de.name).native(), psd.device_id, psd.inode_number, psd.size,
(snapshot_dir / de.name).native(), sd.device_id, sd.inode_number, sd.size);
details.live += size;
}
co_return true;
} catch (std::system_error& e) {
} catch (std::system_error& e) {
if (e.code() != std::error_code(ENOENT, std::system_category())) {
throw;
}
co_return false;
}
};
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
if ((!staging_dir || !co_await exists_in_dir(*staging_dir / name)) &&
!co_await exists_in_dir(datadir / name)) {
details.live += size;
}
}
});
co_return details;
}

View File

@@ -593,7 +593,7 @@ private:
v3_columns _v3_columns;
mutable schema_registry_entry* _registry_entry = nullptr;
std::unique_ptr<::view_info> _view_info;
mutable schema_ptr _cdc_schema;
schema_ptr _cdc_schema;
const std::array<column_count_type, 3> _offsets;
@@ -957,7 +957,6 @@ public:
friend bool operator==(const schema&, const schema&);
const column_mapping& get_column_mapping() const;
friend class schema_registry_entry;
friend class schema_registry;
// May be called from different shard
schema_registry_entry* registry_entry() const noexcept;
// Returns true iff this schema version was synced with on current node.

View File

@@ -78,8 +78,10 @@ void schema_registry::attach_table(schema_registry_entry& e) noexcept {
}
schema_ptr schema_registry::learn(schema_ptr s) {
auto learned_cdc_schema = s->cdc_schema() ? learn(s->cdc_schema()) : nullptr;
s->_cdc_schema = learned_cdc_schema;
auto learned_cdc_schema = s->cdc_schema() ? local_schema_registry().learn(s->cdc_schema()) : nullptr;
if (learned_cdc_schema != s->cdc_schema()) {
s = s->make_with_cdc(learned_cdc_schema);
}
if (s->registry_entry()) {
return s;
}
@@ -90,9 +92,7 @@ schema_ptr schema_registry::learn(schema_ptr s) {
e.load(s);
attach_table(e);
}
auto loaded_s = e.get_schema();
loaded_s->_cdc_schema = learned_cdc_schema;
return loaded_s;
return e.get_schema();
}
slogger.debug("Learning about version {} of {}.{}", s->version(), s->ks_name(), s->cf_name());
auto e_ptr = make_lw_shared<schema_registry_entry>(s->version(), *this);

View File

@@ -390,11 +390,9 @@ dark_green = (195, 215, 195)
light_red = (255, 200, 200)
light_green = (200, 255, 200)
light_gray = (240, 240, 240)
scylla_blue = (87, 209, 229)
tablet_colors = {
(Tablet.STATE_NORMAL, None): GRAY,
(Tablet.STATE_NORMAL, 'repair'): scylla_blue,
(Tablet.STATE_JOINING, 'allow_write_both_read_old'): dark_green,
(Tablet.STATE_LEAVING, 'allow_write_both_read_old'): dark_red,
(Tablet.STATE_JOINING, 'write_both_read_old'): dark_green,
@@ -534,8 +532,6 @@ def update_from_cql(initial=False):
state = (Tablet.STATE_JOINING, tablet.stage)
elif replica in leaving:
state = (Tablet.STATE_LEAVING, tablet.stage)
elif tablet.stage == 'repair':
state = (Tablet.STATE_NORMAL, tablet.stage)
else:
state = (Tablet.STATE_NORMAL, None)

View File

@@ -3,7 +3,6 @@ target_sources(service
PRIVATE
broadcast_tables/experimental/lang.cc
client_state.cc
client_routes.cc
mapreduce_service.cc
migration_manager.cc
misc_services.cc

View File

@@ -1,137 +0,0 @@
/*
* Copyright (C) 2025-present ScyllaDB
*
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "service/client_routes.hh"
#include "cql3/query_processor.hh"
#include "cql3/untyped_result_set.hh"
#include "mutation/mutation.hh"
#include "service/endpoint_lifecycle_subscriber.hh"
#include "db/system_keyspace.hh"
static logging::logger crlogger("client_routes");
// Returns the query state passed to the internal CQL statements built in this
// file. The client_state and query_state objects are thread_local; the
// timeout_config they are built from is a function-local static whose seven
// initializers are all 10 seconds.
service::query_state& client_routes_query_state() {
    using namespace std::chrono_literals;
    const auto internal_timeout = 10s;
    static timeout_config timeouts{
        internal_timeout, internal_timeout, internal_timeout, internal_timeout,
        internal_timeout, internal_timeout, internal_timeout };
    static thread_local service::client_state internal_client(service::client_state::internal_tag{}, timeouts);
    static thread_local service::query_state internal_query_state(internal_client, empty_service_permit());
    return internal_query_state;
}
// Builds the mutation that deletes the row identified by `key`
// (connection_id, host_id) from the client routes system table, using `ts`
// as the mutation timestamp. Reports an internal error unless the statement
// yields exactly one mutation.
future<mutation> service::client_routes_service::make_remove_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_key& key) {
    static const sstring delete_stmt = format("DELETE FROM {}.{} WHERE connection_id = ? and host_id = ?", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
    auto mutations = co_await _qp.get_mutations_internal(
            delete_stmt, client_routes_query_state(), ts, {key.connection_id, key.host_id});
    const auto produced = mutations.size();
    if (produced != 1) {
        on_internal_error(crlogger, fmt::format("expected 1 mutation got {}", produced));
    }
    co_return std::move(mutations[0]);
}
// Builds the mutation that inserts `route` into the client routes system
// table, using `ts` as the mutation timestamp. All seven columns are bound
// from the entry; the port columns are optional in the entry. Reports an
// internal error unless the statement yields exactly one mutation.
future<mutation> service::client_routes_service::make_update_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_entry& route) {
    static const sstring insert_stmt = format("INSERT INTO {}.{} (connection_id, host_id, address, port, tls_port, alternator_port, alternator_https_port) VALUES (?, ?, ?, ?, ?, ?, ?)", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
    auto mutations = co_await _qp.get_mutations_internal(insert_stmt, client_routes_query_state(), ts, {
        route.connection_id,
        route.host_id,
        route.address,
        route.port,
        route.tls_port,
        route.alternator_port,
        route.alternator_https_port
    });
    const auto produced = mutations.size();
    if (produced != 1) {
        on_internal_error(crlogger, fmt::format("expected 1 mutation got {}", produced));
    }
    co_return std::move(mutations[0]);
}
// Reads every row of the client routes system table through an internal
// (cached) query and converts each row into a client_route_entry.
// connection_id, host_id and address are read as mandatory columns; the four
// port columns are read as optionals.
future<std::vector<service::client_routes_service::client_route_entry>> service::client_routes_service::get_client_routes() const {
    static const sstring select_all = format("SELECT * from {}.{}", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
    auto rows = co_await _qp.execute_internal(select_all, cql3::query_processor::cache_internal::yes);

    std::vector<service::client_routes_service::client_route_entry> routes;
    routes.reserve(rows->size());
    for (const auto& r : *rows) {
        routes.emplace_back(
            r.get_as<sstring>("connection_id"),
            r.get_as<utils::UUID>("host_id"),
            r.get_as<sstring>("address"),
            r.get_opt<int32_t>("port"),
            r.get_opt<int32_t>("tls_port"),
            r.get_opt<int32_t>("alternator_port"),
            r.get_opt<int32_t>("alternator_https_port")
        );
    }
    co_return routes;
}
// Invokes the lifecycle notifier's notify_client_routes_change() on every
// instance of this sharded service. The key set is captured by reference, so
// it must stay alive until the returned future resolves.
seastar::future<> service::client_routes_service::notify_client_routes_change(const client_route_keys& client_route_keys) {
    co_await container().invoke_on_all(
        [&client_route_keys] (service::client_routes_service& local_instance) {
            return local_instance._lifecycle_notifier.notify_client_routes_change(client_route_keys);
        });
}
// Single attempt at storing `route_entries`: starts a group0 operation,
// builds one insert mutation per entry stamped with the guard's write
// timestamp, and submits them as a single write_mutations command.
seastar::future<> service::client_routes_service::set_client_routes_inner(const std::vector<service::client_routes_service::client_route_entry>& route_entries) {
    auto guard = co_await _group0_client.start_operation(_abort_source, service::raft_timeout{});

    utils::chunked_vector<canonical_mutation> batch;
    for (const auto& route : route_entries) {
        auto insert_mut = co_await make_update_client_route_mutation(guard.write_timestamp(), route);
        batch.emplace_back(std::move(insert_mut));
    }

    service::write_mutations change{std::move(batch)};
    auto command = _group0_client.prepare_command(std::move(change), guard, "insert client routes");
    co_await _group0_client.add_entry(std::move(command), std::move(guard), _abort_source);
}
// Single attempt at removing the routes named by `route_keys`: starts a
// group0 operation, builds one delete mutation per key stamped with the
// guard's write timestamp, and submits them as a single write_mutations
// command.
seastar::future<> service::client_routes_service::delete_client_routes_inner(const std::vector<service::client_routes_service::client_route_key>& route_keys) {
    auto guard = co_await _group0_client.start_operation(_abort_source, service::raft_timeout{});

    utils::chunked_vector<canonical_mutation> batch;
    for (const auto& key : route_keys) {
        auto delete_mut = co_await make_remove_client_route_mutation(guard.write_timestamp(), key);
        batch.emplace_back(std::move(delete_mut));
    }

    service::write_mutations change{std::move(batch)};
    auto command = _group0_client.prepare_command(std::move(change), guard, "delete client routes");
    co_await _group0_client.add_entry(std::move(command), std::move(guard), _abort_source);
}
// Public entry point for storing client routes. The work is forwarded to the
// instance on shard 0 and wrapped in with_retry(); the entries are moved
// into the retried callable so they remain available for every attempt.
seastar::future<> service::client_routes_service::set_client_routes(std::vector<service::client_routes_service::client_route_entry> route_entries) {
    return container().invoke_on(0, [entries = std::move(route_entries)] (service::client_routes_service& shard0) mutable -> future<> {
        auto attempt = [&shard0, entries = std::move(entries)] {
            return shard0.set_client_routes_inner(entries);
        };
        return shard0.with_retry(std::move(attempt));
    });
}
// Public entry point for removing client routes. The work is forwarded to
// the instance on shard 0 and wrapped in with_retry(); the keys are moved
// into the retried callable so they remain available for every attempt.
seastar::future<> service::client_routes_service::delete_client_routes(std::vector<service::client_routes_service::client_route_key> route_keys) {
    return container().invoke_on(0, [keys = std::move(route_keys)] (service::client_routes_service& shard0) mutable -> future<> {
        auto attempt = [&shard0, keys = std::move(keys)] {
            return shard0.delete_client_routes_inner(keys);
        };
        return shard0.with_retry(std::move(attempt));
    });
}
// Awaits func(), repeating the call whenever it fails with
// group0_concurrent_modification. The counter starts at 10 and is logged
// before each decision, so func is attempted at most 11 times; the last
// conflict is rethrown. Any other exception propagates immediately.
template <typename Func>
seastar::future<> service::client_routes_service::with_retry(Func func) const {
    for (int retries = 10;; --retries) {
        try {
            co_await func();
            co_return;
        } catch (const ::service::group0_concurrent_modification&) {
            crlogger.warn("Failed to set client routes due to guard conflict, retries={}", retries);
            if (retries == 0) {
                // Retry budget exhausted: surface the conflict to the caller.
                throw;
            }
        }
    }
}

View File

@@ -1,88 +0,0 @@
/*
* Copyright (C) 2025-present ScyllaDB
*
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <seastar/core/abort_source.hh>
#include <seastar/core/sharded.hh>
#include "gms/feature_service.hh"
#include "mutation/mutation.hh"
#include "service/raft/raft_group0_client.hh"
namespace service {
class endpoint_lifecycle_notifier;
// Sharded service that reads, writes and announces changes to client route
// entries. It only holds references to its collaborators; none are owned.
class client_routes_service : public seastar::peering_sharded_service<client_routes_service> {
public:
    // Identifies one route: a connection id paired with a host id.
    struct client_route_key {
        sstring connection_id;
        utils::UUID host_id;

        // Orders keys by connection_id first and by host_id second.
        bool operator<(const client_route_key& other) const {
            if (connection_id == other.connection_id) {
                return host_id < other.host_id;
            }
            return connection_id < other.connection_id;
        }
    };
    using client_route_keys = std::set<client_route_key>;

    // A full route record: the key fields plus an address and optional ports.
    struct client_route_entry {
        sstring connection_id;
        utils::UUID host_id;
        sstring address;
        // At least one of the ports should be specified
        std::optional<int32_t> port;
        std::optional<int32_t> tls_port;
        std::optional<int32_t> alternator_port;
        std::optional<int32_t> alternator_https_port;
    };

    client_routes_service(
        abort_source& abort_source,
        gms::feature_service& feature_service,
        service::raft_group0_client& group0_client,
        cql3::query_processor& qp,
        endpoint_lifecycle_notifier& elc_notif
    )
        : _abort_source(abort_source)
        , _feature_service(feature_service)
        , _group0_client(group0_client)
        , _qp(qp)
        , _lifecycle_notifier(elc_notif) { }

    // Accessor for the feature service passed to the constructor.
    gms::feature_service& get_feature_service() noexcept {
        return _feature_service;
    }

    // mutations
    // Build the mutation removing the route identified by `key` / storing
    // `entry`, stamped with timestamp `ts`.
    future<mutation> make_remove_client_route_mutation(api::timestamp_type ts, const client_route_key& key);
    future<mutation> make_update_client_route_mutation(api::timestamp_type ts, const client_route_entry& entry);

    // Returns the currently stored route entries.
    future<std::vector<client_route_entry>> get_client_routes() const;
    // Store / remove the given routes.
    seastar::future<> set_client_routes(std::vector<client_route_entry> route_entries);
    seastar::future<> delete_client_routes(std::vector<client_route_key> route_keys);

    // notifications
    // Announces that the routes identified by `client_route_keys` changed.
    seastar::future<> notify_client_routes_change(const client_route_keys& client_route_keys);

private:
    // Implementation helpers behind set_client_routes / delete_client_routes.
    seastar::future<> set_client_routes_inner(const std::vector<client_route_entry>& route_entries);
    seastar::future<> delete_client_routes_inner(const std::vector<client_route_key>& route_keys);
    // Runs `func`, possibly more than once; defined out of line.
    template <typename Func>
    seastar::future<> with_retry(Func func) const;

    abort_source& _abort_source;
    gms::feature_service& _feature_service;
    service::raft_group0_client& _group0_client;
    cql3::query_processor& _qp;
    endpoint_lifecycle_notifier& _lifecycle_notifier;
};
}

View File

@@ -224,13 +224,7 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
ks + " can be granted only SELECT or DESCRIBE permissions to a non-superuser.");
}
static const std::unordered_set<auth::resource> vector_search_system_resources = {
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
};
if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||
(cmd.permission == auth::permission::SELECT && vector_search_system_resources.contains(cmd.resource))) {
if (cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) {
co_return co_await ensure_has_permission<auth::command_desc_with_permission_set>({auth::permission_set::of<auth::permission::SELECT, auth::permission::VECTOR_SEARCH_INDEXING>(), cmd.resource});
@@ -350,17 +344,3 @@ void service::client_state::update_per_service_level_params(qos::service_level_o
_workload_type = slo.workload;
}
// Appends one (key, value) pair to _client_options for every entry of
// `client_options`. Both the key and the value string are resolved through
// `keys_and_values_cache`; on a miss the loader supplies a default-constructed
// options_cache_value_type.
future<> service::client_state::set_client_options(
        client_options_cache_type& keys_and_values_cache,
        const std::unordered_map<sstring, sstring>& client_options) {
    for (const auto& option : client_options) {
        const auto& option_name = option.first;
        const auto& option_value = option.second;

        auto name_entry = co_await keys_and_values_cache.get_or_load(option_name, [] (const client_options_cache_key_type&) {
            return make_ready_future<options_cache_value_type>(options_cache_value_type{});
        });
        auto value_entry = co_await keys_and_values_cache.get_or_load(option_value, [] (const client_options_cache_key_type&) {
            return make_ready_future<options_cache_value_type>(options_cache_value_type{});
        });

        _client_options.emplace_back(std::move(name_entry), std::move(value_entry));
    }
}

View File

@@ -18,7 +18,6 @@
#include "auth/authenticated_user.hh"
#include "auth/authenticator.hh"
#include "auth/permission.hh"
#include "client_data.hh"
#include "transport/cql_protocol_extension.hh"
#include "service/qos/service_level_controller.hh"
@@ -103,8 +102,7 @@ private:
private volatile String keyspace;
#endif
std::optional<auth::authenticated_user> _user;
std::optional<client_options_cache_entry_type> _driver_name, _driver_version;
std::list<client_option_key_value_cached_entry> _client_options;
std::optional<sstring> _driver_name, _driver_version;
auth_state _auth_state = auth_state::UNINITIALIZED;
bool _control_connection = false;
@@ -153,33 +151,18 @@ public:
return _control_connection = true;
}
std::optional<client_options_cache_entry_type> get_driver_name() const {
std::optional<sstring> get_driver_name() const {
return _driver_name;
}
future<> set_driver_name(client_options_cache_type& keys_and_values_cache, const sstring& driver_name) {
_driver_name = co_await keys_and_values_cache.get_or_load(driver_name, [] (const client_options_cache_key_type&) {
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
});
void set_driver_name(sstring driver_name) {
_driver_name = std::move(driver_name);
}
const auto& get_client_options() const {
return _client_options;
}
future<> set_client_options(
client_options_cache_type& keys_and_values_cache,
const std::unordered_map<sstring, sstring>& client_options);
std::optional<client_options_cache_entry_type> get_driver_version() const {
std::optional<sstring> get_driver_version() const {
return _driver_version;
}
future<> set_driver_version(
client_options_cache_type& keys_and_values_cache,
const sstring& driver_version)
{
_driver_version = co_await keys_and_values_cache.get_or_load(driver_version, [] (const client_options_cache_key_type&) {
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
});
void set_driver_version(sstring driver_version) {
_driver_version = std::move(driver_version);
}
client_state(external_tag,

View File

@@ -13,7 +13,6 @@
#include "gms/inet_address.hh"
#include "locator/host_id.hh"
#include "utils/atomic_vector.hh"
#include "service/client_routes.hh"
namespace service {
@@ -66,7 +65,6 @@ public:
* @param endpoint the endpoint marked DOWN.
*/
virtual void on_down(const gms::inet_address& endpoint, locator::host_id host_id) {}
virtual void on_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {}
};
class endpoint_lifecycle_notifier {
@@ -81,8 +79,6 @@ public:
future<> notify_released(locator::host_id host_id);
future<> notify_up(gms::inet_address endpoint, locator::host_id host_id);
future<> notify_joined(gms::inet_address endpoint, locator::host_id host_id);
future<> notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys);
};
}

View File

@@ -163,11 +163,7 @@ public:
void before_drop_column_family(const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
void before_drop_keyspace(const sstring& keyspace_name, utils::chunked_vector<mutation>&, api::timestamp_type);
// Called when creating a tablet map for a new table.
// When in the context of a notification callback, call `before_allocate_tablet_map_in_notification`,
// and otherwise call 'before_allocate_tablet_map'.
void before_allocate_tablet_map(const locator::tablet_map&, const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
void before_allocate_tablet_map_in_notification(const locator::tablet_map&, const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
};
}

View File

@@ -648,13 +648,6 @@ void migration_notifier::before_allocate_tablet_map(const locator::tablet_map& m
});
}
// Calls on_before_allocate_tablet_map() on every registered listener,
// iterating the listener list with thread_for_each_nested(). The arguments
// are forwarded unchanged, so listeners may append to `mutations`.
void migration_notifier::before_allocate_tablet_map_in_notification(const locator::tablet_map& map,
        const schema& s, utils::chunked_vector<mutation>& mutations, api::timestamp_type ts) {
    auto notify_one = [&map, &s, &mutations, ts] (migration_listener* subscriber) {
        subscriber->on_before_allocate_tablet_map(map, s, mutations, ts);
    };
    _listeners.thread_for_each_nested(notify_one);
}
utils::chunked_vector<mutation> prepare_keyspace_update_announcement(replica::database& db, lw_shared_ptr<keyspace_metadata> ksm, api::timestamp_type ts) {
db.validate_keyspace_update(*ksm);
mlogger.info("Update Keyspace: {}", ksm);

View File

@@ -640,16 +640,6 @@ future<scheduling_group> service_level_controller::auth_integration::get_user_sc
}
}
// Synchronous lookup of a user's scheduling group using only the cached
// effective service level. A missing user, or a user without a name, maps to
// the default scheduling group. Otherwise the group is chosen by the cached
// level's shares_name, falling back to default_service_level_name when no
// cached level or no shares_name is available.
scheduling_group service_level_controller::auth_integration::get_user_cached_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
    if (!usr || !usr->name) {
        return _sl_controller.get_default_scheduling_group();
    }

    auto cached_level = find_cached_effective_service_level(*usr->name);
    const bool has_shares_name = cached_level && cached_level->shares_name;
    auto& level_name = has_shares_name ? *cached_level->shares_name : default_service_level_name;
    return _sl_controller.get_scheduling_group(level_name);
}
future<scheduling_group> service_level_controller::get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
// Special case:
// -------------
@@ -666,11 +656,6 @@ future<scheduling_group> service_level_controller::get_user_scheduling_group(con
return _auth_integration->get_user_scheduling_group(usr);
}
// Cache-only counterpart of get_user_scheduling_group(): asserts that the
// auth integration is set and delegates the lookup to it.
scheduling_group service_level_controller::get_cached_user_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
    SCYLLA_ASSERT(_auth_integration != nullptr);
    auto& integration = *_auth_integration;
    return integration.get_user_cached_scheduling_group(usr);
}
std::optional<sstring> service_level_controller::get_active_service_level() {
unsigned sched_idx = internal::scheduling_group_index(current_scheduling_group());
if (_sl_lookup[sched_idx].first) {
@@ -789,10 +774,6 @@ future<service_levels_info> service_level_controller::get_distributed_service_le
return _sl_data_accessor ? _sl_data_accessor->get_service_level(service_level_name) : make_ready_future<service_levels_info>();
}
// Reports whether the effective service level cache may be used: false when
// no data accessor is set, otherwise whatever the accessor answers.
bool service_level_controller::can_use_effective_service_level_cache() const{
    if (!_sl_data_accessor) {
        return false;
    }
    return _sl_data_accessor->can_use_effective_service_level_cache();
}
future<bool> service_level_controller::validate_before_service_level_add() {
assert(this_shard_id() == global_controller);
if (_global_controller_db->deleted_scheduling_groups.size() > 0) {

View File

@@ -154,10 +154,7 @@ public:
/// Synchronous version of `find_effective_service_level` that only checks the cache.
std::optional<service_level_options> find_cached_effective_service_level(const sstring& role_name);
/// Execute a function within the service level context of a user, get_user_scheduling_group - async version
/// get_user_cached_scheduling_group - sync version (used for v2 servers).
future<scheduling_group> get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
scheduling_group get_user_cached_scheduling_group(const std::optional<auth::authenticated_user>& usr);
template <typename Func, typename Ret = std::invoke_result_t<Func>>
requires std::invocable<Func>
@@ -342,12 +339,6 @@ public:
* @return if the user is authenticated the user's scheduling group. otherwise get_scheduling_group("default")
*/
future<scheduling_group> get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
/**
* Get the scheduling group of a specific user for the service level cache
* @param user - the user for determining the service level
* @return if the user is authenticated the user's scheduling group. otherwise get_scheduling_group("default")
*/
scheduling_group get_cached_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
/**
* @return the name of the currently active service level if such exists or an empty
* optional if no active service level.
@@ -409,13 +400,6 @@ public:
future<service_levels_info> get_distributed_service_levels(qos::query_context ctx);
future<service_levels_info> get_distributed_service_level(sstring service_level_name);
/*
* Returns whether effective service level cache can be populated and used.
* This is equivalent to checking whether auth + raft have been migrated to raft.
*/
bool can_use_effective_service_level_cache() const;
/**
* Returns the service level options **in effect** for a user having the given
* collection of roles.

View File

@@ -124,40 +124,8 @@ bool should_flush_system_topology_after_applying(const mutation& mut, const data
return false;
}
// If `mut` targets the client routes system table, records one key per
// clustered row into `client_routes_update`. The connection id is decoded
// from the first partition key component and the host id from the first
// clustering key component. Mutations for other tables, and keys with no
// components, are ignored.
static void collect_client_routes_update(const mutation& mut, client_routes_service::client_route_keys& client_routes_update) {
    auto routes_schema = db::system_keyspace::client_routes();
    if (mut.column_family_id() != routes_schema->id()) {
        return;
    }

    const auto partition_key_parts = mut.decorated_key()._key.explode(*routes_schema);
    if (partition_key_parts.empty()) {
        return;
    }
    auto connection_id = value_cast<sstring>(utf8_type->deserialize_value(partition_key_parts[0]));

    for (const rows_entry& row : mut.partition().clustered_rows()) {
        const auto clustering_key_parts = row.key().explode(*routes_schema);
        if (!clustering_key_parts.empty()) {
            auto host_id = value_cast<utils::UUID>(uuid_type->deserialize_value(clustering_key_parts[0]));
            client_routes_update.emplace(connection_id, host_id);
        }
    }
}
// Forwards the collected route keys to storage_service, but only when at
// least one key was collected; an empty set results in no notification.
static future<> notify_client_route_change_if_needed(storage_service& storage_service, const client_routes_service::client_route_keys& client_routes_update) {
    if (client_routes_update.size() == 0) {
        co_return;
    }
    slogger.trace("write_mutations_to_database: notify_client_routes_change routes_update.size()={}", client_routes_update.size());
    co_await storage_service.notify_client_routes_change(client_routes_update);
}
future<> write_mutations_to_database(storage_service& storage_service, storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms) {
future<> write_mutations_to_database(storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms) {
utils::chunked_vector<frozen_mutation_and_schema> mutations;
client_routes_service::client_route_keys client_routes_update;
mutations.reserve(cms.size());
bool need_system_topology_flush = false;
try {
@@ -165,12 +133,7 @@ future<> write_mutations_to_database(storage_service& storage_service, storage_p
auto& tbl = proxy.local_db().find_column_family(cm.column_family_id());
auto& s = tbl.schema();
auto mut = co_await to_mutation_gently(cm, s);
need_system_topology_flush = need_system_topology_flush || should_flush_system_topology_after_applying(mut, proxy.data_dictionary());
if (proxy.data_dictionary().has_schema(db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES)) {
collect_client_routes_update(mut, client_routes_update);
}
mutations.emplace_back(co_await freeze_gently(mut), s);
}
} catch (replica::no_such_column_family& e) {
@@ -184,8 +147,6 @@ future<> write_mutations_to_database(storage_service& storage_service, storage_p
slogger.trace("write_mutations_to_database: flushing {}.{}", db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
co_await proxy.get_db().local().flush(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
}
co_await notify_client_route_change_if_needed(storage_service, client_routes_update);
}
group0_state_machine::modules_to_reload group0_state_machine::get_modules_to_reload(const utils::chunked_vector<canonical_mutation>& mutations) {
@@ -290,7 +251,7 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
[&] (topology_change& chng) -> future<> {
auto modules_to_reload = get_modules_to_reload(chng.mutations);
auto tablet_keys = replica::get_tablet_metadata_change_hint(chng.mutations);
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(chng.mutations));
co_await write_mutations_to_database(_sp, cmd.creator_addr, std::move(chng.mutations));
co_await _ss.topology_transition({.tablets_hint = std::move(tablet_keys)});
co_await reload_modules(std::move(modules_to_reload));
},
@@ -302,7 +263,7 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
},
[&] (write_mutations& muts) -> future<> {
auto modules_to_reload = get_modules_to_reload(muts.mutations);
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(muts.mutations));
co_await write_mutations_to_database(_sp, cmd.creator_addr, std::move(muts.mutations));
co_await reload_modules(std::move(modules_to_reload));
}
), cmd.change);
@@ -432,7 +393,6 @@ future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
future<> group0_state_machine::transfer_snapshot(raft::server_id from_id, raft::snapshot_descriptor snp) {
try {
co_await utils::get_local_injector().inject("block_group0_transfer_snapshot", utils::wait_for_message(300s));
// Note that this may bring newer state than the group0 state machine raft's
// log, so some raft entries may be double applied, but since the state
// machine is idempotent it is not a problem.
@@ -491,23 +451,11 @@ future<> group0_state_machine::transfer_snapshot(raft::server_id from_id, raft::
co_await _sp.get_db().local().flush(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
}
client_routes_service::client_route_keys client_routes_update;
if (raft_snp) {
if (_sp.data_dictionary().has_schema(db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES)) {
auto s_client_routes = db::system_keyspace::client_routes();
for (auto& canonical_mut : raft_snp->mutations) {
if (canonical_mut.column_family_id() == s_client_routes->id()) {
auto mut = co_await to_mutation_gently(canonical_mut, s_client_routes);
slogger.trace("transfer snapshot: raft snapshot includes client_routes mutation");
collect_client_routes_update(mut, client_routes_update);
}
}
}
co_await mutate_locally(std::move(raft_snp->mutations), _sp);
}
co_await _ss.auth_cache().load_all();
co_await notify_client_route_change_if_needed(_ss, client_routes_update);
co_await _sp.mutate_locally({std::move(history_mut)}, nullptr);
} catch (const abort_requested_exception&) {

View File

@@ -130,6 +130,6 @@ public:
bool should_flush_system_topology_after_applying(const mutation& mut, const data_dictionary::database db);
// Used to write data to topology and other tables except schema tables.
future<> write_mutations_to_database(storage_service& storage_service, storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms);
future<> write_mutations_to_database(storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms);
} // end of namespace service

View File

@@ -254,10 +254,6 @@ public:
group0_batch(const group0_batch&) = delete;
group0_batch(group0_batch&&) = default;
// Read-only access to the guard held by this batch, obtained via
// _guard.value().
// NOTE(review): presumably _guard is a std::optional, in which case this
// throws when no guard is held - confirm against the member declaration.
const group0_guard& guard() const {
    return _guard.value();
}
// Gets timestamp which should be used when building mutations.
api::timestamp_type write_timestamp() const;
utils::UUID new_group0_state_id() const;

View File

@@ -1114,7 +1114,7 @@ private:
// only for a truncate which is still waiting.
if (_topology_state_machine._topology.global_request) {
utils::UUID ongoing_global_request_id = _topology_state_machine._topology.global_request_id.value();
const auto topology_requests_entry = co_await _sys_ks.local().get_topology_request_entry(ongoing_global_request_id);
const auto topology_requests_entry = co_await _sys_ks.local().get_topology_request_entry(ongoing_global_request_id, true);
auto global_request = std::get<service::global_topology_request>(topology_requests_entry.request_type);
if (global_request == global_topology_request::truncate_table) {
std::optional<topology::transition_state>& tstate = _topology_state_machine._topology.tstate;

View File

@@ -205,7 +205,6 @@ storage_service::storage_service(abort_source& abort_source,
cql3::query_processor& qp,
sharded<qos::service_level_controller>& sl_controller,
auth::cache& auth_cache,
sharded<client_routes_service>& client_routes,
topology_state_machine& topology_state_machine,
db::view::view_building_state_machine& view_building_state_machine,
tasks::task_manager& tm,
@@ -225,13 +224,11 @@ storage_service::storage_service(abort_source& abort_source,
, _snitch(snitch)
, _sl_controller(sl_controller)
, _auth_cache(auth_cache)
, _client_routes(client_routes)
, _group0(nullptr)
, _async_gate("storage_service")
, _node_ops_abort_thread(node_ops_abort_thread())
, _node_ops_module(make_shared<node_ops::task_manager_module>(tm, *this))
, _tablets_module(make_shared<service::task_manager_module>(tm, *this))
, _global_topology_requests_module(make_shared<service::topo::task_manager_module>(tm))
, _address_map(address_map)
, _shared_token_metadata(stm)
, _erm_factory(erm_factory)
@@ -255,11 +252,9 @@ storage_service::storage_service(abort_source& abort_source,
{
tm.register_module(_node_ops_module->get_name(), _node_ops_module);
tm.register_module(_tablets_module->get_name(), _tablets_module);
tm.register_module(_global_topology_requests_module->get_name(), _global_topology_requests_module);
if (this_shard_id() == 0) {
_node_ops_module->make_virtual_task<node_ops::node_ops_virtual_task>(*this);
_tablets_module->make_virtual_task<service::tablet_virtual_task>(*this);
_global_topology_requests_module->make_virtual_task<service::topo::global_topology_request_virtual_task>(*this);
}
register_metrics();
@@ -588,16 +583,12 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
}
break;
case node_state::decommissioning:
[[fallthrough]];
case node_state::removing:
// A decommissioning or removing node loses its tokens when topology moves to left_token_ring.
// A decommissioning node loses its tokens when topology moves to left_token_ring.
if (_topology_state_machine._topology.tstate == topology::transition_state::left_token_ring) {
if (rs.state == node_state::removing && !_feature_service.removenode_with_left_token_ring) {
on_internal_error(
rtlogger, "removenode operation can only enter the left_token_ring state when REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled");
}
break;
}
[[fallthrough]];
case node_state::removing:
if (_topology_state_machine._topology.tstate == topology::transition_state::rollback_to_normal) {
// no need for double writes anymore since op failed
co_await process_normal_node(id, host_id, ip, rs);
@@ -1384,34 +1375,6 @@ public:
}
};
// Reports whether a keyspace_rf_change request for keyspace `ks` is known to
// the topology state machine. The current global request (if any), the
// paused RF change requests and the queued global requests are examined in
// that order; each candidate's entry is loaded from the system keyspace.
// `guard` is not used in the body.
future<bool> storage_service::ongoing_rf_change(const group0_guard& guard, sstring ks) const {
    // True when the stored request `request_id` is an RF change of `ks`.
    auto is_rf_change_of_ks = [&] (utils::UUID request_id) -> future<bool> {
        auto entry = co_await _sys_ks.local().get_topology_request_entry(request_id);
        const auto* global_req = std::get_if<global_topology_request>(&entry.request_type);
        if (!global_req || *global_req != global_topology_request::keyspace_rf_change) {
            co_return false;
        }
        const auto& target_ks = entry.new_keyspace_rf_change_ks_name;
        co_return target_ks.has_value() && target_ks.value() == ks;
    };

    const auto& topo = _topology_state_machine._topology;

    if (topo.global_request_id.has_value()) {
        auto current_id = topo.global_request_id.value();
        if (co_await is_rf_change_of_ks(current_id)) {
            co_return true;
        }
    }

    for (auto paused_id : topo.paused_rf_change_requests) {
        if (co_await is_rf_change_of_ks(paused_id)) {
            co_return true;
        }
        co_await coroutine::maybe_yield();
    }

    for (auto queued_id : topo.global_requests_queue) {
        if (co_await is_rf_change_of_ks(queued_id)) {
            co_return true;
        }
        co_await coroutine::maybe_yield();
    }

    co_return false;
}
future<> storage_service::raft_initialize_discovery_leader(const join_node_request_params& params) {
if (params.replaced_id.has_value()) {
throw std::runtime_error(::format("Cannot perform a replace operation because this is the first node in the cluster"));
@@ -1457,7 +1420,7 @@ future<> storage_service::raft_initialize_discovery_leader(const join_node_reque
_migration_manager.local().get_group0_client().get_history_gc_duration(), "bootstrap: adding myself as the first node to the topology");
auto mutation_creator_addr = _sys_ks.local().local_db().get_token_metadata().get_topology().my_address();
co_await write_mutations_to_database(*this, _qp.proxy(), mutation_creator_addr, std::move(change.mutations));
co_await write_mutations_to_database(_qp.proxy(), mutation_creator_addr, std::move(change.mutations));
co_await _qp.proxy().mutate_locally({history_append}, nullptr);
}
@@ -3480,7 +3443,6 @@ future<> storage_service::stop() {
_listeners.clear();
co_await _tablets_module->stop();
co_await _node_ops_module->stop();
co_await _global_topology_requests_module->stop();
co_await _async_gate.close();
co_await std::move(_node_ops_abort_thread);
_tablet_split_monitor_event.signal();
@@ -5063,50 +5025,6 @@ future<> storage_service::wait_for_topology_not_busy() {
}
}
// Aborts a paused RF change request identified by `request_id`.
//
// The call is forwarded to shard 0 when invoked elsewhere. It throws
// std::runtime_error if the RACK_LIST_RF feature is not enabled. If the request
// is not in the paused list, a warning is logged and the function returns
// without making changes. Otherwise a group0 command is submitted that updates
// the paused list via resume_rf_change_request() and marks the request as done
// with the message "Aborted by user request". The whole attempt is retried on
// group0_concurrent_modification.
future<> storage_service::abort_paused_rf_change(utils::UUID request_id) {
// Keep _async_gate held for the duration of the operation.
auto holder = _async_gate.hold();
if (this_shard_id() != 0) {
// group0 is only set on shard 0.
co_return co_await container().invoke_on(0, [&] (auto& ss) {
return ss.abort_paused_rf_change(request_id);
});
}
if (!_feature_service.rack_list_rf) {
throw std::runtime_error("The RACK_LIST_RF feature is not enabled on the cluster yet");
}
while (true) {
// A fresh guard is taken on every attempt; the paused list is re-checked under it.
auto guard = co_await _group0->client().start_operation(_group0_as, raft_timeout{});
bool found = std::ranges::contains(_topology_state_machine._topology.paused_rf_change_requests, request_id);
if (!found) {
slogger.warn("RF change request with id '{}' is not paused, so it can't be aborted", request_id);
co_return;
}
// Two mutations go into one command: the topology update and the request-tracking update.
utils::chunked_vector<canonical_mutation> updates;
updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
.resume_rf_change_request(_topology_state_machine._topology.paused_rf_change_requests, request_id).build()));
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(request_id)
.done("Aborted by user request")
.build()));
topology_change change{std::move(updates)};
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard,
format("aborting rf change request {}", request_id));
try {
co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), _group0_as, raft_timeout{});
} catch (group0_concurrent_modification&) {
// Another group0 change was applied concurrently; start over with a new guard.
slogger.info("aborting request {}: concurrent modification, retrying.", request_id);
continue;
}
break;
}
}
// Returns a reference to the semaphore member _do_sample_sstables_concurrency_limiter.
semaphore& storage_service::get_do_sample_sstables_concurrency_limiter() {
return _do_sample_sstables_concurrency_limiter;
}
@@ -5310,7 +5228,7 @@ future<> storage_service::raft_check_and_repair_cdc_streams() {
request_id = _topology_state_machine._topology.global_request_id.value();
} else if (!_topology_state_machine._topology.global_requests_queue.empty()) {
request_id = _topology_state_machine._topology.global_requests_queue[0];
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id);
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id, true);
curr_req = std::get<global_topology_request>(req_entry.request_type);
} else {
request_id = utils::UUID{};
@@ -6526,19 +6444,14 @@ future<> storage_service::clone_locally_tablet_storage(locator::global_tablet_id
leaving.host, pending.host));
}
// All sstables cloned locally will be left unsealed, until they're loaded into the table.
// This is to guarantee no unsplit sstables will be left sealed on disk, which could
// cause problems if unsplit sstables are found after split was ACKed to coordinator.
bool leave_unsealed = true;
auto d = co_await smp::submit_to(leaving.shard, [this, tablet, leave_unsealed] () -> future<utils::chunked_vector<sstables::entry_descriptor>> {
auto d = co_await smp::submit_to(leaving.shard, [this, tablet] () -> future<utils::chunked_vector<sstables::entry_descriptor>> {
auto& table = _db.local().find_column_family(tablet.table);
auto op = table.stream_in_progress();
co_return co_await table.clone_tablet_storage(tablet.tablet, leave_unsealed);
co_return co_await table.clone_tablet_storage(tablet.tablet);
});
rtlogger.debug("Cloned storage of tablet {} from leaving replica {}, {} sstables were found", tablet, leaving, d.size());
auto load_sstable = [leave_unsealed] (const dht::sharder& sharder, replica::table& t, sstables::entry_descriptor d) -> future<sstables::shared_sstable> {
auto load_sstable = [] (const dht::sharder& sharder, replica::table& t, sstables::entry_descriptor d) -> future<sstables::shared_sstable> {
auto& mng = t.get_sstables_manager();
auto sst = mng.make_sstable(t.schema(), t.get_storage_options(), d.generation, d.state.value_or(sstables::sstable_state::normal),
d.version, d.format, db_clock::now(), default_io_error_handler_gen());
@@ -6546,8 +6459,7 @@ future<> storage_service::clone_locally_tablet_storage(locator::global_tablet_id
// will still point to leaving replica at this stage in migration. If node goes down,
// SSTables will be loaded at pending replica and migration is retried, so correctness
// wise, we're good.
auto cfg = sstables::sstable_open_config{ .current_shard_as_sstable_owner = true,
.unsealed_sstable = leave_unsealed };
auto cfg = sstables::sstable_open_config{ .current_shard_as_sstable_owner = true };
co_await sst->load(sharder, cfg);
co_return sst;
};
@@ -6555,23 +6467,16 @@ future<> storage_service::clone_locally_tablet_storage(locator::global_tablet_id
co_await smp::submit_to(pending.shard, [this, tablet, load_sstable, d = std::move(d)] () mutable -> future<> {
// Loads cloned sstables from leaving replica into pending one.
auto& table = _db.local().find_column_family(tablet.table);
auto& sstm = table.get_sstables_manager();
auto op = table.stream_in_progress();
dht::auto_refreshing_sharder sharder(table.shared_from_this());
std::unordered_set<sstables::shared_sstable> ssts;
std::vector<sstables::shared_sstable> ssts;
ssts.reserve(d.size());
for (auto&& sst_desc : d) {
ssts.insert(co_await load_sstable(sharder, table, std::move(sst_desc)));
ssts.push_back(co_await load_sstable(sharder, table, std::move(sst_desc)));
}
auto on_add = [&ssts, &sstm] (sstables::shared_sstable loading_sst) -> future<> {
if (ssts.contains(loading_sst)) {
auto cfg = sstm.configure_writer(loading_sst->get_origin());
co_await loading_sst->seal_sstable(cfg.backup);
}
co_return;
};
auto loaded_ssts = co_await table.add_new_sstables_and_update_cache(std::vector(ssts.begin(), ssts.end()), on_add);
_view_building_worker.local().load_sstables(tablet.table, loaded_ssts);
co_await table.add_sstables_and_update_cache(ssts);
_view_building_worker.local().load_sstables(tablet.table, ssts);
});
rtlogger.debug("Successfully loaded storage of tablet {} into pending replica {}", tablet, pending);
}
@@ -7797,9 +7702,6 @@ void storage_service::init_messaging_service() {
additional_tables.push_back(db::system_keyspace::cdc_streams_state()->id());
additional_tables.push_back(db::system_keyspace::cdc_streams_history()->id());
}
if (ss._feature_service.client_routes) {
additional_tables.push_back(db::system_keyspace::client_routes()->id());
}
}
for (const auto& table : boost::join(params.tables, additional_tables)) {
@@ -8139,18 +8041,6 @@ future<> endpoint_lifecycle_notifier::notify_joined(gms::inet_address endpoint,
});
}
// Calls on_client_routes_change(client_route_keys) on every registered subscriber.
// The iteration runs inside seastar::async. An exception thrown by a subscriber is
// logged as a warning and does not stop the remaining subscribers from being called.
future<> endpoint_lifecycle_notifier::notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {
co_await seastar::async([this, &client_route_keys] {
_subscribers.thread_for_each([&client_route_keys] (endpoint_lifecycle_subscriber* subscriber) {
try {
subscriber->on_client_routes_change(client_route_keys);
} catch (...) {
// Best effort: report the failure and continue.
slogger.warn("Client routes notification failed: {}", std::current_exception());
}
});
});
}
future<> storage_service::notify_joined(inet_address endpoint, locator::host_id hid) {
co_await utils::get_local_injector().inject(
"storage_service_notify_joined_sleep", std::chrono::milliseconds{500});
@@ -8175,10 +8065,6 @@ future<> storage_service::notify_cql_change(inet_address endpoint, locator::host
}
}
// Forwards the client-routes change notification to the local instance of _client_routes.
future<> storage_service::notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {
co_await _client_routes.local().notify_client_routes_change(client_route_keys);
}
// Returns true if `node` is present in _normal_state_handled_on_boot.
bool storage_service::is_normal_state_handled_on_boot(locator::host_id node) {
return _normal_state_handled_on_boot.contains(node);
}

View File

@@ -17,10 +17,8 @@
#include "gms/endpoint_state.hh"
#include "gms/i_endpoint_state_change_subscriber.hh"
#include "schema/schema_fwd.hh"
#include "service/client_routes.hh"
#include "service/endpoint_lifecycle_subscriber.hh"
#include "service/qos/service_level_controller.hh"
#include "service/task_manager_module.hh"
#include "service/topology_guard.hh"
#include "locator/abstract_replication_strategy.hh"
#include "locator/snitch_base.hh"
@@ -50,7 +48,6 @@
#include "service/tablet_allocator.hh"
#include "service/tablet_operation.hh"
#include "mutation/timestamp.hh"
#include "utils/UUID.hh"
#include "utils/user_provided_param.hh"
#include "utils/sequenced_set.hh"
#include "service/topology_coordinator.hh"
@@ -205,7 +202,6 @@ private:
sharded<locator::snitch_ptr>& _snitch;
sharded<qos::service_level_controller>& _sl_controller;
auth::cache& _auth_cache;
sharded<client_routes_service>& _client_routes;
// Engaged on shard 0 before `join_cluster`.
service::raft_group0* _group0;
@@ -229,7 +225,6 @@ private:
future<> _node_ops_abort_thread;
shared_ptr<node_ops::task_manager_module> _node_ops_module;
shared_ptr<service::task_manager_module> _tablets_module;
shared_ptr<service::topo::task_manager_module> _global_topology_requests_module;
gms::gossip_address_map& _address_map;
void node_ops_insert(node_ops_id, gms::inet_address coordinator, std::list<inet_address> ignore_nodes,
std::function<future<>()> abort_func);
@@ -274,7 +269,6 @@ public:
cql3::query_processor& qp,
sharded<qos::service_level_controller>& sl_controller,
auth::cache& auth_cache,
sharded<client_routes_service>& _client_routes,
topology_state_machine& topology_state_machine,
db::view::view_building_state_machine& view_building_state_machine,
tasks::task_manager& tm,
@@ -937,7 +931,6 @@ public:
// Returns true when the topology holds no value in `global_request`.
// NOTE(review): only `global_request` is inspected here, not `global_requests_queue` — confirm this is intended.
bool topology_global_queue_empty() const {
return !_topology_state_machine._topology.global_request.has_value();
}
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
future<> initialize_done_topology_upgrade_state();
private:
@@ -1075,8 +1068,6 @@ public:
future<sstring> wait_for_topology_request_completion(utils::UUID id, bool require_entry = true);
future<> wait_for_topology_not_busy();
future<> abort_paused_rf_change(utils::UUID request_id);
private:
semaphore _do_sample_sstables_concurrency_limiter{1};
// To avoid overly-large RPC messages, `do_sample_sstables` is broken up into several rounds.
@@ -1147,14 +1138,11 @@ public:
future<std::vector<std::byte>> train_dict(utils::chunked_vector<temporary_buffer<char>> sample);
future<> publish_new_sstable_dict(table_id, std::span<const std::byte>, service::raft_group0_client&);
void set_train_dict_callback(decltype(_train_dict));
seastar::future<> notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys);
friend class join_node_rpc_handshaker;
friend class node_ops::node_ops_virtual_task;
friend class tasks::task_manager;
friend class tablet_virtual_task;
friend class topo::global_topology_request_virtual_task;
};
}

View File

@@ -6,16 +6,12 @@
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "cql3/statements/ks_prop_defs.hh"
#include "db/system_keyspace.hh"
#include "locator/tablets.hh"
#include "locator/topology.hh"
#include "replica/tablets.hh"
#include "locator/tablet_replication_strategy.hh"
#include "replica/database.hh"
#include "service/migration_listener.hh"
#include "service/tablet_allocator.hh"
#include "utils/UUID.hh"
#include "utils/assert.hh"
#include "utils/error_injection.hh"
#include "utils/stall_free.hh"
@@ -26,7 +22,6 @@
#include "replica/database.hh"
#include "gms/feature_service.hh"
#include <iterator>
#include <ranges>
#include <utility>
#include <fmt/ranges.h>
#include <seastar/coroutine/maybe_yield.hh>
@@ -242,147 +237,6 @@ struct migration_candidate {
migration_badness badness;
};
// A tablet replica selected as the source of a rack-list colocation move.
struct colocation_source {
// The tablet the replica belongs to.
locator::global_tablet_id gid;
// The replica that is to be moved.
locator::tablet_replica replica;
};
// Sequence of colocation sources.
using colocation_source_set = utils::chunked_vector<colocation_source>;
// Colocation sources grouped by the (dc, rack) they should be moved to.
using colocation_sources_by_destination_rack = std::unordered_map<endpoint_dc_rack, colocation_source_set>;
// Outcome of scanning paused RF change requests for rack-list colocation work.
struct rack_list_colocation_state {
    // Replicas to move, keyed by the destination (dc, rack).
    colocation_sources_by_destination_rack dst_dc_rack_to_tablets;
    // Ids of the requests that contributed sources for each destination.
    std::unordered_map<endpoint_dc_rack, std::unordered_set<utils::UUID>> dst_to_requests;
    // The first request recorded through maybe_set_request_to_resume();
    // left default-constructed until then.
    utils::UUID request_to_resume;

    // Records `id` only if no request has been recorded yet; later calls are no-ops.
    void maybe_set_request_to_resume(const utils::UUID& id) {
        if (request_to_resume) {
            return;
        }
        request_to_resume = id;
    }
};
// Scans the given paused RF change requests and computes which tablet replicas
// have to be moved so that replicas match the rack lists of the new replication
// settings saved with each request.
//
// For every request the result either gains colocation sources (grouped by
// destination dc/rack, together with the requesting id) or, when nothing needs
// to move, the request is offered as `request_to_resume` (only the first such
// request is kept). Tablets that are in transition or listed in
// `already_planned_migrations` produce no sources.
future<rack_list_colocation_state> find_required_rack_list_colocations(
replica::database& db,
token_metadata_ptr tmptr,
db::system_keyspace* sys_ks,
const std::unordered_set<utils::UUID>& paused_rf_change_requests,
const std::unordered_set<locator::global_tablet_id>& already_planned_migrations) {
rack_list_colocation_state state;
// Resolves a host to its topology node; a missing node is an internal error.
auto get_node = [&] (locator::host_id host) -> const locator::node& {
auto* node = tmptr->get_topology().find_node(host);
if (!node) {
on_internal_error(lblogger, format("Node {} not found in topology", host));
}
return *node;
};
for (const auto& request_id : paused_rf_change_requests) {
auto req_entry = co_await sys_ks->get_topology_request_entry(request_id);
// NOTE(review): the optional is dereferenced without a check — presumably always set for these requests; confirm.
sstring ks_name = *req_entry.new_keyspace_rf_change_ks_name;
if (!db.has_keyspace(ks_name)) {
// The keyspace no longer exists, so there is nothing to colocate for this request.
state.maybe_set_request_to_resume(request_id);
continue;
}
auto& ks = db.find_keyspace(ks_name);
// Rebuild the target keyspace metadata from the properties saved with the request.
std::unordered_map<sstring, sstring> saved_ks_props = *req_entry.new_keyspace_rf_change_data;
cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{saved_ks_props.begin(), saved_ks_props.end()}};
new_ks_props.validate();
auto ks_md = new_ks_props.as_ks_metadata_update(ks.metadata(), *tmptr, db.features(), db.get_config());
// Consider both tables and views of the keyspace.
auto tables_with_mvs = ks.metadata()->tables();
auto views = ks.metadata()->views();
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
if (tables_with_mvs.empty()) {
state.maybe_set_request_to_resume(request_id);
continue;
}
bool no_changes_needed = true;
for (const auto& table_or_mv : tables_with_mvs) {
// Only tables reported as base tables by the tablet metadata are examined.
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
continue;
}
const auto& tmap = tmptr->tablets().get_tablet_map(table_or_mv->id());
const auto& new_replication_strategy_config = ks_md->strategy_options();
for (auto& [dc, rf_value] : new_replication_strategy_config) {
// Only DCs whose replication value is a rack list are relevant here.
if (!std::holds_alternative<rack_list>(rf_value)) {
continue;
}
auto racks = std::get<rack_list>(rf_value) | std::ranges::to<std::unordered_set<sstring>>();
co_await tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& ti) -> future<> {
auto gid = locator::global_tablet_id{table_or_mv->id(), tid};
// Current replicas in this DC. There might be multiple replicas in the same rack.
auto dc_replicas = ti.replicas | std::views::filter([&] (const tablet_replica& r) {
return get_node(r.host).dc_rack().dc == dc;
}) | std::ranges::to<std::vector<tablet_replica>>();
if (dc_replicas.empty()) {
return make_ready_future<>();
}
// Find replicas that are not in the desired racks (src_replicas)
// and racks that do not have replicas yet (dst_racks).
auto dst_racks = racks;
std::vector<tablet_replica> src_replicas;
for (const auto& r : dc_replicas) {
auto rack = get_node(r.host).dc_rack().rack;
if (dst_racks.find(rack) != dst_racks.end()) {
// There is already a replica in this rack.
dst_racks.erase(rack);
} else {
// This replica is outside the remaining desired racks, so it needs to be moved.
src_replicas.push_back(r);
}
}
// Pair each replica to move with a rack still lacking a replica;
// the zip view ends with the shorter of the two ranges.
auto zipped = std::views::zip(src_replicas, dst_racks);
if (!std::ranges::empty(zipped)) {
// Set before the skips below, so a tablet that is skipped still
// prevents the request from being marked as resumable.
no_changes_needed = false;
}
// Skip a tablet that is in transition.
auto* tti = tmap.get_tablet_transition_info(tid);
if (tti) {
lblogger.debug("Skipped colocation for tablet={} which is already in transition={}", gid, tti->transition);
return make_ready_future<>();
}
// Skip a tablet that is about to be in transition.
if (already_planned_migrations.contains(gid)) {
return make_ready_future<>();
}
// Record every (source replica, destination rack) pair and the request it belongs to.
for (auto src_dst : zipped) {
auto src = std::get<0>(src_dst);
auto dst = std::get<1>(src_dst);
auto endpoint = locator::endpoint_dc_rack{dc, dst};
state.dst_dc_rack_to_tablets[endpoint].emplace_back(colocation_source{{table_or_mv->id(), tid}, src});
state.dst_to_requests[endpoint].insert(request_id);
}
return make_ready_future<>();
});
}
}
if (no_changes_needed) {
state.maybe_set_request_to_resume(request_id);
}
}
co_return state;
}
// Tells whether the RF change request `request_id` still needs rack-list
// colocation: runs the colocation scan for that single request, with no
// planned migrations, and returns true unless the scan reports this very
// request as the one to resume.
future<bool> requires_rack_list_colocation(
        replica::database& db,
        locator::token_metadata_ptr tmptr,
        db::system_keyspace* sys_ks,
        utils::UUID request_id) {
    const std::unordered_set<utils::UUID> only_this_request{request_id};
    const std::unordered_set<locator::global_tablet_id> no_planned_migrations;
    auto scan = co_await find_required_rack_list_colocations(db, tmptr, sys_ks, only_this_request, no_planned_migrations);
    const bool resumable = scan.request_to_resume == request_id;
    co_return !resumable;
}
}
template<>
@@ -804,8 +658,6 @@ class load_balancer {
replica::database& _db;
token_metadata_ptr _tm;
service::topology* _topology;
db::system_keyspace* _sys_ks;
std::optional<locator::load_sketch> _load_sketch;
// Holds the set of tablets already scheduled for transition during plan-making.
std::unordered_set<global_tablet_id> _scheduled_tablets;
@@ -890,10 +742,7 @@ private:
return streaming_infos;
}
public:
load_balancer(replica::database& db, token_metadata_ptr tm,
service::topology* topology,
db::system_keyspace* sys_ks,
locator::load_stats_ptr table_load_stats,
load_balancer(replica::database& db, token_metadata_ptr tm, locator::load_stats_ptr table_load_stats,
load_balancer_stats_manager& stats,
uint64_t target_tablet_size,
unsigned tablets_per_shard_goal,
@@ -902,26 +751,19 @@ public:
, _tablets_per_shard_goal(tablets_per_shard_goal)
, _db(db)
, _tm(std::move(tm))
, _topology(topology)
, _sys_ks(sys_ks)
, _table_load_stats(std::move(table_load_stats))
, _stats(stats)
, _skiplist(std::move(skiplist))
{ }
// True when both _topology and _sys_ks are set and the topology has at least
// one paused RF change request.
bool ongoing_rack_list_colocation() const {
    if (_topology == nullptr || _sys_ks == nullptr) {
        return false;
    }
    const bool has_paused_requests = !_topology->paused_rf_change_requests.empty();
    return has_paused_requests;
}
future<migration_plan> make_plan() {
const locator::topology& topo = _tm->get_topology();
migration_plan plan;
auto rack_list_colocation = ongoing_rack_list_colocation();
if (!utils::get_local_injector().enter("tablet_migration_bypass")) {
// Prepare plans for each DC separately and combine them to be executed in parallel.
for (auto&& dc : topo.get_datacenters()) {
if (_db.get_config().rf_rack_valid_keyspaces() || rack_list_colocation) {
if (_db.get_config().rf_rack_valid_keyspaces()) {
for (auto rack : topo.get_datacenter_racks().at(dc) | std::views::keys) {
auto rack_plan = co_await make_plan(dc, rack);
auto level = rack_plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
@@ -937,10 +779,6 @@ public:
}
}
if (rack_list_colocation) {
plan.merge(co_await make_rack_list_colocation_plan(plan));
}
// Merge table-wide resize decisions, may emit new decisions, revoke or finalize ongoing ones.
// Note : Resize plans should be generated before repair plans to avoid scheduling repairs when there is pending resize finalization
plan.merge_resize_plan(co_await make_resize_plan(plan));
@@ -951,8 +789,8 @@ public:
}
auto level = plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
lblogger.log(level, "Prepared {} migration plans, out of which there were {} tablet migration(s) and {} resize decision(s) and {} tablet repair(s) and {} rack-list colocation(s)",
plan.size(), plan.tablet_migration_count(), plan.resize_decision_count(), plan.tablet_repair_count(), plan.tablet_rack_list_colocation_count());
lblogger.log(level, "Prepared {} migration plans, out of which there were {} tablet migration(s) and {} resize decision(s) and {} tablet repair(s)",
plan.size(), plan.tablet_migration_count(), plan.resize_decision_count(), plan.tablet_repair_count());
co_return std::move(plan);
}
@@ -977,58 +815,6 @@ public:
co_return false;
}
// Makes sure `nodes` holds an initialized node_load entry for `host`.
// An already present host is left untouched. A host missing from the topology
// is an internal error; a zero shard count raises std::runtime_error.
void ensure_node(node_load_map& nodes, host_id host) {
    if (nodes.contains(host)) {
        return;
    }
    auto* topo_node = _tm->get_topology().find_node(host);
    if (topo_node == nullptr) {
        on_internal_error(lblogger, format("Node {} not found in topology", host));
    }

    // The map entry is created first and then filled in, in this order.
    node_load& entry = nodes[host];
    entry.id = host;
    entry.node = topo_node;
    entry.shard_count = topo_node->get_shard_count();
    entry.shards.resize(entry.shard_count);
    if (!entry.shard_count) {
        throw std::runtime_error(format("Shard count of {} not found in topology", host));
    }

    if (_db.features().tablet_load_stats_v2) {
        // Use the reported capacity when load stats carry one for this host.
        if (_table_load_stats && _table_load_stats->capacity.contains(host)) {
            entry.capacity = _table_load_stats->capacity.at(host);
        }
    } else {
        // This way load calculation will hold tablet count.
        entry.capacity = _target_tablet_size * entry.shard_count;
    }
}
// Applies to `nodes` the streaming load of every tablet transition in the
// tablet metadata for which is_streaming() holds. Yields once per transition.
future<> consider_scheduled_load(node_load_map& nodes) {
    const locator::topology& topo = _tm->get_topology();
    for (auto&& [base_table, group] : _tm->tablets().all_table_groups()) {
        const auto& tablet_map = _tm->tablets().get_tablet_map(base_table);
        for (auto&& [tablet, transition] : tablet_map.transitions()) {
            co_await coroutine::maybe_yield();
            if (!is_streaming(&transition)) {
                continue;
            }
            auto& info = tablet_map.get_tablet_info(tablet);
            apply_load(nodes, get_migration_streaming_info(topo, info, transition));
        }
    }
}
// Applies to `nodes` the streaming load of every migration contained in
// `mplan`. Yields once per migration.
future<> consider_planned_load(node_load_map& nodes, const migration_plan& mplan) {
    const locator::topology& topo = _tm->get_topology();
    auto& all_tablets = _tm->tablets();
    for (const tablet_migration_info& planned : mplan.migrations()) {
        co_await coroutine::maybe_yield();
        auto& tablet_map = all_tablets.get_tablet_map(planned.tablet.table);
        auto& info = tablet_map.get_tablet_info(planned.tablet.tablet);
        auto planned_streaming = get_migration_streaming_info(topo, info, planned);
        apply_load(nodes, planned_streaming);
    }
}
future<tablet_repair_plan> make_repair_plan(const migration_plan& mplan) {
lblogger.debug("In make_repair_plan");
@@ -1044,19 +830,53 @@ public:
// Populate the load of the migration that is already in the plan
node_load_map nodes;
// TODO: share code with make_plan()
auto ensure_node = [&] (host_id host) {
if (nodes.contains(host)) {
return;
}
auto* node = topo.find_node(host);
if (!node) {
on_internal_error(lblogger, format("Node {} not found in topology", host));
}
node_load& load = nodes[host];
load.id = host;
load.node = node;
load.shard_count = node->get_shard_count();
load.shards.resize(load.shard_count);
if (!load.shard_count) {
throw std::runtime_error(format("Shard count of {} not found in topology", host));
}
};
// TODO: share code with make_plan()
topo.for_each_node([&] (const locator::node& node) {
bool is_drained = node.get_state() == locator::node::state::being_decommissioned
|| node.get_state() == locator::node::state::being_removed;
if (node.get_state() == locator::node::state::normal || is_drained) {
ensure_node(nodes, node.host_id());
ensure_node(node.host_id());
}
});
// Consider load that is already scheduled
co_await consider_scheduled_load(nodes);
for (auto&& [table, tables] : _tm->tablets().all_table_groups()) {
const auto& tmap = _tm->tablets().get_tablet_map(table);
for (auto&& [tid, trinfo]: tmap.transitions()) {
co_await coroutine::maybe_yield();
if (is_streaming(&trinfo)) {
auto& tinfo = tmap.get_tablet_info(tid);
apply_load(nodes, get_migration_streaming_info(topo, tinfo, trinfo));
}
}
}
// Consider load that is about to be scheduled
co_await consider_planned_load(nodes, mplan);
auto& tablet_meta = _tm->tablets();
for (const tablet_migration_info& tmi : mplan.migrations()) {
co_await coroutine::maybe_yield();
auto& tmap = tablet_meta.get_tablet_map(tmi.tablet.table);
auto& tinfo = tmap.get_tablet_info(tmi.tablet.tablet);
auto streaming_info = get_migration_streaming_info(topo, tinfo, tmi);
apply_load(nodes, streaming_info);
}
struct repair_plan {
locator::global_tablet_id gid;
@@ -1139,109 +959,6 @@ public:
co_return ret;
}
// Builds the migrations needed to colocate tablet replicas with the rack lists
// requested by the paused RF change requests.
//
// `mplan` is the plan prepared so far; its migrations are counted as load and
// its tablets are excluded from colocation. Returns a plan holding the new
// migrations plus a tablet_rack_list_colocation_plan that may name a request
// to resume. Returns an empty plan when there is no ongoing rack-list
// colocation.
future<migration_plan> make_rack_list_colocation_plan(const migration_plan& mplan) {
    lblogger.debug("In make_rack_list_colocation_plan");

    migration_plan plan;
    tablet_rack_list_colocation_plan rack_list_plan;

    if (!ongoing_rack_list_colocation()) {
        co_return plan;
    }

    const locator::topology& topo = _tm->get_topology();
    auto migration_tablet_ids = co_await mplan.get_migration_tablet_ids();
    auto colocation_state = co_await find_required_rack_list_colocations(_db, _tm, _sys_ks,
            _topology->paused_rf_change_requests, std::move(migration_tablet_ids));

    // Only normal, non-excluded nodes are candidates.
    node_load_map nodes;
    topo.for_each_node([&] (const locator::node& node) {
        if (node.get_state() == locator::node::state::normal && !node.is_excluded()) {
            ensure_node(nodes, node.host_id());
        }
    });

    // Consider load that is already scheduled.
    co_await consider_scheduled_load(nodes);

    // Consider load that is about to be scheduled.
    co_await consider_planned_load(nodes, mplan);

    // Tablets for which a colocation migration was added by this call.
    std::unordered_set<global_tablet_id> colocation_tablet_ids;
    for (auto& [dc_rack, colocation_sources] : colocation_state.dst_dc_rack_to_tablets) {
        auto nodes_by_load_dst = nodes | std::views::filter([&] (const auto& host_load) {
            auto& [host, load] = host_load;
            auto& node = *load.node;
            return node.dc_rack() == dc_rack;
        }) | std::views::keys | std::ranges::to<std::vector<host_id>>();

        if (nodes_by_load_dst.empty()) {
            // No candidate in the destination rack: offer one of the requests
            // recorded for this destination as the request to resume.
            lblogger.warn("No target nodes available for RF change colocation plan in dc {}, rack {}", dc_rack.dc, dc_rack.rack);
            if (auto it = colocation_state.dst_to_requests.find(dc_rack); it != colocation_state.dst_to_requests.end()) {
                rack_list_plan.maybe_add_request_to_resume(*it->second.begin());
            }
            continue;
        }

        auto nodes_cmp = nodes_by_load_cmp(nodes);
        auto nodes_dst_cmp = [&] (const host_id& a, const host_id& b) {
            return nodes_cmp(b, a);
        };

        // Ascending load heap of candidate target nodes.
        std::make_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);

        const tablet_metadata& tmeta = _tm->tablets();
        for (colocation_source& source : colocation_sources) {
            if (colocation_tablet_ids.contains(source.gid)) {
                lblogger.debug("Skipped colocation of replica {} of tablet={}, another replica of which is about to be colocated", source.replica, source.gid);
                continue;
            }

            // Pick the least loaded node as target.
            std::pop_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
            auto target = nodes_by_load_dst.back();
            auto& target_info = nodes[target];
            // Put the target back into the heap when this iteration ends,
            // after its load has been updated below.
            auto push_back_target_node = seastar::defer([&] {
                std::push_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
            });

            lblogger.debug("target node: {}, avg_load={}", target, target_info.avg_load);

            auto dst = global_shard_id {target, _load_sketch->get_least_loaded_shard(target)};

            lblogger.trace("target shard: {}, tablets={}, load={}", dst.shard,
                    target_info.shards[dst.shard].tablet_count,
                    target_info.shard_load(dst.shard, _target_tablet_size));

            tablet_transition_kind kind = tablet_transition_kind::migration;
            migration_tablet_set source_tablets {
                .tablet_s = source.gid, // Ignore the merge co-location.
            };
            auto src = source.replica;
            auto mig = get_migration_info(source_tablets, kind, src, dst);
            auto& tmap = tmeta.get_tablet_map(source_tablets.table());
            auto mig_streaming_info = get_migration_streaming_infos(topo, tmap, mig);
            pick(*_load_sketch, dst.host, dst.shard, source_tablets);
            if (can_accept_load(nodes, mig_streaming_info)) {
                apply_load(nodes, mig_streaming_info);
                lblogger.debug("Adding migration: {}", mig);
                mark_as_scheduled(mig);
                for (auto& m : mig) {
                    // Record the tablet id before `m` is moved into the plan,
                    // so that a moved-from object is never read.
                    colocation_tablet_ids.insert(m.tablet);
                    plan.add(std::move(m));
                }
            }
            update_node_load_on_migration(nodes, src, dst, source_tablets);
        }
    }

    if (colocation_state.request_to_resume) {
        rack_list_plan.maybe_add_request_to_resume(colocation_state.request_to_resume);
    }

    plan.set_rack_list_colocation_plan(std::move(rack_list_plan));
    co_return std::move(plan);
}
// Returns true if a table has replicas of all its sibling tablets co-located.
// This is used for determining whether merge can be finalized, since co-location
// is a strict requirement for sibling tablets to be merged.
@@ -1931,10 +1648,6 @@ public:
const auto& table_groups = _tm->tablets().all_table_groups();
auto finalize_decision = [&] {
if (utils::get_local_injector().enter("tablet_resize_finalization_postpone")) {
return;
}
_stats.for_cluster().resizes_finalized++;
resize_plan.finalize_resize.insert(table);
};
@@ -3254,6 +2967,30 @@ public:
node_load_map nodes;
std::unordered_set<host_id> nodes_to_drain;
auto ensure_node = [&] (host_id host) {
if (nodes.contains(host)) {
return;
}
auto* node = topo.find_node(host);
if (!node) {
on_internal_error(lblogger, format("Node {} not found in topology", host));
}
node_load& load = nodes[host];
load.id = host;
load.node = node;
load.shard_count = node->get_shard_count();
load.shards.resize(load.shard_count);
if (!load.shard_count) {
throw std::runtime_error(format("Shard count of {} not found in topology", host));
}
if (!_db.features().tablet_load_stats_v2) {
// This way load calculation will hold tablet count.
load.capacity = _target_tablet_size * load.shard_count;
} else if (_table_load_stats && _table_load_stats->capacity.contains(host)) {
load.capacity = _table_load_stats->capacity.at(host);
}
};
_tm->for_each_token_owner([&] (const locator::node& node) {
if (!node_filter(node)) {
return;
@@ -3262,7 +2999,7 @@ public:
|| node.get_state() == locator::node::state::being_removed;
if (node.get_state() == locator::node::state::normal || is_drained) {
if (is_drained) {
ensure_node(nodes, node.host_id());
ensure_node(node.host_id());
lblogger.info("Will drain node {} ({}) from DC {}", node.host_id(), node.get_state(), dc);
nodes_to_drain.emplace(node.host_id());
nodes[node.host_id()].drained = true;
@@ -3270,7 +3007,7 @@ public:
// Excluded nodes should not be chosen as targets for migration.
lblogger.debug("Ignoring excluded node {}: state={}", node.host_id(), node.get_state());
} else {
ensure_node(nodes, node.host_id());
ensure_node(node.host_id());
}
}
});
@@ -3303,7 +3040,7 @@ public:
r, global_tablet_id{table, tid}));
}
if (node->left() && node_filter(*node)) {
ensure_node(nodes, r.host);
ensure_node(r.host);
nodes_to_drain.insert(r.host);
nodes[r.host].drained = true;
}
@@ -3505,7 +3242,7 @@ public:
plan.merge(co_await make_intranode_plan(nodes, nodes_to_drain));
}
if (_tm->tablets().balancing_enabled() && plan.empty() && !ongoing_rack_list_colocation()) {
if (_tm->tablets().balancing_enabled() && plan.empty()) {
auto dc_merge_plan = co_await make_merge_colocation_plan(dc, nodes);
auto level = dc_merge_plan.tablet_migration_count() > 0 ? seastar::log_level::info : seastar::log_level::debug;
lblogger.log(level, "Prepared {} migrations for co-locating sibling tablets in DC {}", dc_merge_plan.tablet_migration_count(), dc);
@@ -3527,11 +3264,9 @@ class tablet_allocator_impl : public tablet_allocator::impl
locator::load_stats_ptr _load_stats;
private:
load_balancer make_load_balancer(token_metadata_ptr tm,
service::topology* topology,
db::system_keyspace* sys_ks,
locator::load_stats_ptr table_load_stats,
std::unordered_set<host_id> skiplist) {
load_balancer lb(_db, tm, topology, sys_ks, std::move(table_load_stats), _load_balancer_stats,
load_balancer lb(_db, tm, std::move(table_load_stats), _load_balancer_stats,
_db.get_config().target_tablet_size_in_bytes(),
_db.get_config().tablets_per_shard_goal(),
std::move(skiplist));
@@ -3558,8 +3293,8 @@ public:
_stopped = true;
}
future<migration_plan> balance_tablets(token_metadata_ptr tm, service::topology* topology, db::system_keyspace* sys_ks, locator::load_stats_ptr table_load_stats, std::unordered_set<host_id> skiplist) {
auto lb = make_load_balancer(tm, topology, sys_ks, table_load_stats ? table_load_stats : _load_stats, std::move(skiplist));
future<migration_plan> balance_tablets(token_metadata_ptr tm, locator::load_stats_ptr table_load_stats, std::unordered_set<host_id> skiplist) {
auto lb = make_load_balancer(tm, table_load_stats ? table_load_stats : _load_stats, std::move(skiplist));
co_await coroutine::switch_to(_db.get_streaming_scheduling_group());
co_return co_await lb.make_plan();
}
@@ -3579,7 +3314,7 @@ public:
// Allocates new tablets for a table which is not co-located with another table.
tablet_map allocate_tablets_for_new_base_table(const tablet_aware_replication_strategy* tablet_rs, const schema& s) {
auto tm = _db.get_shared_token_metadata().get();
auto lb = make_load_balancer(tm, nullptr, nullptr, nullptr, {});
auto lb = make_load_balancer(tm, nullptr, {});
auto plan = lb.make_sizing_plan(s.shared_from_this(), tablet_rs).get();
auto& table_plan = plan.tables[s.id()];
if (table_plan.target_tablet_count_aligned != table_plan.target_tablet_count) {
@@ -3593,7 +3328,6 @@ public:
// Allocate tablets for multiple new tables, which may be co-located with each other, or co-located with an existing base table.
void allocate_tablets_for_new_tables(const keyspace_metadata& ksm, const std::vector<schema_ptr>& cfms, utils::chunked_vector<mutation>& muts, api::timestamp_type ts) {
utils::get_local_injector().inject("pause_in_allocate_tablets_for_new_table", utils::wait_for_message(std::chrono::minutes(5))).get();
locator::replication_strategy_params params(ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option());
auto tm = _db.get_shared_token_metadata().get();
auto rs = abstract_replication_strategy::create_replication_strategy(ksm.strategy_name(), params, tm->get_topology());
@@ -3635,7 +3369,7 @@ public:
if (s.id() != base_id) {
lblogger.debug("Creating tablets for {}.{} id={} with base={}", s.ks_name(), s.cf_name(), s.id(), base_id);
muts.emplace_back(colocated_tablet_map_to_mutation(s.id(), s.ks_name(), s.cf_name(), base_id, ts));
_db.get_notifier().before_allocate_tablet_map_in_notification(base_map, s, muts, ts);
_db.get_notifier().before_allocate_tablet_map(base_map, s, muts, ts);
}
}
};
@@ -3651,7 +3385,7 @@ public:
muts.emplace_back(std::move(m));
return make_ready_future<>();
}).get();
_db.get_notifier().before_allocate_tablet_map_in_notification(base_map, s, muts, ts);
_db.get_notifier().before_allocate_tablet_map(base_map, s, muts, ts);
create_colocated_tablet_maps(base_map);
}
@@ -3800,8 +3534,8 @@ future<> tablet_allocator::stop() {
return impl().stop();
}
future<migration_plan> tablet_allocator::balance_tablets(locator::token_metadata_ptr tm, service::topology* topology, db::system_keyspace* sys_ks, locator::load_stats_ptr load_stats, std::unordered_set<host_id> skiplist) {
return impl().balance_tablets(std::move(tm), topology, sys_ks, std::move(load_stats), std::move(skiplist));
future<migration_plan> tablet_allocator::balance_tablets(locator::token_metadata_ptr tm, locator::load_stats_ptr load_stats, std::unordered_set<host_id> skiplist) {
return impl().balance_tablets(std::move(tm), std::move(load_stats), std::move(skiplist));
}
void tablet_allocator::set_load_stats(locator::load_stats_ptr load_stats) {

View File

@@ -14,14 +14,8 @@
#include "locator/token_metadata_fwd.hh"
#include <seastar/core/metrics.hh>
namespace db {
class system_keyspace;
}
namespace service {
class topology;
struct load_balancer_dc_stats {
uint64_t calls = 0;
uint64_t migrations_produced = 0;
@@ -139,26 +133,6 @@ struct tablet_repair_plan {
}
};
/// Part of a migration plan that carries at most one RF change request id
/// which should be resumed.
struct tablet_rack_list_colocation_plan {
    // An id that converts to false means "nothing to resume"
    // (presumably the null UUID -- confirm against utils::UUID).
    utils::UUID _request_to_resume;

    /// Returns the stored request id (may be an unset id).
    const utils::UUID& request_to_resume() const noexcept {
        return _request_to_resume;
    }

    /// Number of actions in this plan: 1 if a request id is stored, 0 otherwise.
    size_t size() const {
        if (_request_to_resume) {
            return 1;
        }
        return 0;
    }

    /// Merges `other` into this plan. An id already stored here wins;
    /// otherwise the id from `other` is adopted.
    void merge(tablet_rack_list_colocation_plan&& other) {
        maybe_add_request_to_resume(other._request_to_resume);
    }

    /// Stores `id` only if no request id has been stored yet.
    void maybe_add_request_to_resume(const utils::UUID& id) {
        if (_request_to_resume) {
            return;
        }
        _request_to_resume = id;
    }
};
class migration_plan {
public:
using migrations_vector = utils::chunked_vector<tablet_migration_info>;
@@ -166,19 +140,17 @@ private:
migrations_vector _migrations;
table_resize_plan _resize_plan;
tablet_repair_plan _repair_plan;
tablet_rack_list_colocation_plan _rack_list_colocation_plan;
bool _has_nodes_to_drain = false;
public:
/// Returns true iff there are decommissioning nodes which own some tablet replicas.
bool has_nodes_to_drain() const { return _has_nodes_to_drain; }
const migrations_vector& migrations() const { return _migrations; }
bool empty() const { return _migrations.empty() && !_resize_plan.size() && !_repair_plan.size() && !_rack_list_colocation_plan.size(); }
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size() + _rack_list_colocation_plan.size(); }
bool empty() const { return _migrations.empty() && !_resize_plan.size() && !_repair_plan.size();}
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size(); }
size_t tablet_migration_count() const { return _migrations.size(); }
size_t resize_decision_count() const { return _resize_plan.size(); }
size_t tablet_repair_count() const { return _repair_plan.size(); }
size_t tablet_rack_list_colocation_count() const { return _rack_list_colocation_plan.size(); }
void add(tablet_migration_info info) {
_migrations.emplace_back(std::move(info));
@@ -195,7 +167,6 @@ public:
_has_nodes_to_drain |= other._has_nodes_to_drain;
_resize_plan.merge(std::move(other._resize_plan));
_repair_plan.merge(std::move(other._repair_plan));
_rack_list_colocation_plan.merge(std::move(other._rack_list_colocation_plan));
}
void set_has_nodes_to_drain(bool b) {
@@ -214,12 +185,6 @@ public:
_repair_plan = std::move(repair);
}
const tablet_rack_list_colocation_plan& rack_list_colocation_plan() const { return _rack_list_colocation_plan; }
void set_rack_list_colocation_plan(tablet_rack_list_colocation_plan rack_list_colocation_plan) {
_rack_list_colocation_plan = std::move(rack_list_colocation_plan);
}
future<std::unordered_set<locator::global_tablet_id>> get_migration_tablet_ids() const;
};
@@ -265,7 +230,7 @@ public:
///
/// The algorithm takes care of limiting the streaming load on the system, also by taking active migrations into account.
///
future<migration_plan> balance_tablets(locator::token_metadata_ptr, service::topology*, db::system_keyspace*, locator::load_stats_ptr = {}, std::unordered_set<locator::host_id> = {});
future<migration_plan> balance_tablets(locator::token_metadata_ptr, locator::load_stats_ptr = {}, std::unordered_set<locator::host_id> = {});
void set_load_stats(locator::load_stats_ptr);
@@ -281,12 +246,6 @@ public:
void on_leadership_lost();
};
future<bool> requires_rack_list_colocation(
replica::database& db,
locator::token_metadata_ptr tmptr,
db::system_keyspace* sys_ks,
utils::UUID request_id);
}
template <>

View File

@@ -11,7 +11,6 @@
#include "service/migration_manager.hh"
#include "service/storage_service.hh"
#include "service/task_manager_module.hh"
#include "service/topology_state_machine.hh"
#include "tasks/task_handler.hh"
#include "tasks/virtual_task_hint.hh"
#include <seastar/coroutine/maybe_yield.hh>
@@ -289,116 +288,4 @@ std::set<locator::host_id> task_manager_module::get_nodes() const {
return get_task_manager().get_nodes(_ss);
}
namespace topo {
// Derives the task state from a topology request entry:
//   entry.id unset        -> created
//   entry.done is false   -> running
//   done, empty error     -> done
//   done, non-empty error -> failed
static tasks::task_manager::task_state get_state(const db::system_keyspace::topology_requests_entry& entry) {
    using state = tasks::task_manager::task_state;
    if (!entry.id) {
        return state::created;
    }
    if (!entry.done) {
        return state::running;
    }
    return entry.error.empty() ? state::done : state::failed;
}
// Every task exposed by this virtual task belongs to the global topology change group.
tasks::task_manager::task_group global_topology_request_virtual_task::get_group() const noexcept {
    using group = tasks::task_manager::task_group;
    return group::global_topology_change_group;
}
// Checks whether `task_id` refers to a keyspace RF change request recorded in
// the topology requests table. Returns a default-constructed hint when it
// does, and std::nullopt otherwise.
future<std::optional<tasks::virtual_task_hint>> global_topology_request_virtual_task::contains(tasks::task_id task_id) const {
    if (!task_id.uuid().is_timestamp()) {
        // Ids that are not timestamp UUIDs are rejected without querying the table.
        co_return std::nullopt;
    }

    auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(task_id.uuid());
    if (entry.has_value()) {
        // Only global requests of kind keyspace_rf_change are handled by this virtual task.
        const auto* request = std::get_if<service::global_topology_request>(&entry->request_type);
        if (request && *request == global_topology_request::keyspace_rf_change) {
            co_return std::make_optional<tasks::virtual_task_hint>({});
        }
    }
    co_return std::nullopt;
}
// Tasks of this kind are always reported as abortable; the hint is ignored.
future<tasks::is_abortable> global_topology_request_virtual_task::is_abortable(tasks::virtual_task_hint) const {
    const auto verdict = tasks::is_abortable::yes;
    return make_ready_future<tasks::is_abortable>(verdict);
}
// Builds the task_stats view of a topology request entry.
// Fields that the entry does not provide (table, entity, sequence number,
// shard) are filled with empty/zero values.
static tasks::task_stats get_task_stats(const db::system_keyspace::topology_requests_entry& entry) {
    // Keyspace name is optional in the entry; fall back to an empty name.
    auto keyspace_name = entry.new_keyspace_rf_change_ks_name.value_or("");
    auto type_name = fmt::to_string(entry.request_type);
    const auto state = get_state(entry);

    return tasks::task_stats{
        .task_id = tasks::task_id{entry.id},
        .type = std::move(type_name),
        .kind = tasks::task_kind::cluster,
        .scope = "keyspace",
        .state = state,
        .sequence_number = 0,
        .keyspace = std::move(keyspace_name),
        .table = "",
        .entity = "",
        .shard = 0,
        .start_time = entry.start_time,
        .end_time = entry.end_time,
    };
}
// Returns the status of the topology request `id`, or std::nullopt when no
// entry for it exists. Progress and children are always reported empty, and
// the parent id is always the null id.
future<std::optional<tasks::task_status>> global_topology_request_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
    auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(id.uuid());
    if (!entry) {
        co_return std::nullopt;
    }

    const auto stats = get_task_stats(*entry);
    // Resolved up front so the initializer list below contains no suspension point.
    const auto abortable = co_await is_abortable(std::move(hint));

    co_return tasks::task_status{
        .task_id = stats.task_id,
        .type = stats.type,
        .kind = stats.kind,
        .scope = stats.scope,
        .state = stats.state,
        .is_abortable = abortable,
        .start_time = stats.start_time,
        .end_time = stats.end_time,
        .error = entry->error,
        .parent_id = tasks::task_id::create_null_id(),
        .sequence_number = stats.sequence_number,
        .shard = stats.shard,
        .keyspace = stats.keyspace,
        .table = stats.table,
        .entity = stats.entity,
        .progress_units = "",
        .progress = tasks::task_manager::task::progress{},
        .children = utils::chunked_vector<tasks::task_identity>{},
    };
}
// Waits for the topology request `id` to complete and returns its status
// afterwards. Returns std::nullopt without waiting when the request is unknown.
future<std::optional<tasks::task_status>> global_topology_request_virtual_task::wait(tasks::task_id id, tasks::virtual_task_hint hint) {
    // First lookup only establishes that the request exists.
    const auto initial = co_await get_status(id, hint);
    if (!initial.has_value()) {
        co_return std::nullopt;
    }

    co_await _ss.wait_for_topology_request_completion(id.uuid(), false);

    // Re-read the entry so the caller sees the post-completion state.
    co_return co_await get_status(id, std::move(hint));
}
// Aborts the request identified by `id` by delegating to
// storage_service::abort_paused_rf_change(); the hint is unused.
// NOTE(review): this function is noexcept, so abort_paused_rf_change() is
// presumably expected to report failures through the returned future rather
// than by throwing synchronously -- confirm.
future<> global_topology_request_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hint) noexcept {
return _ss.abort_paused_rf_change(id.uuid());
}
// Lists stats for keyspace RF change requests, using
// "now - user task TTL" as the time bound passed to the system keyspace query.
future<std::vector<tasks::task_stats>> global_topology_request_virtual_task::get_stats() {
    db::system_keyspace& sys_ks = _ss._sys_ks.local();
    const auto entries = co_await sys_ks.get_topology_request_entries(
            {global_topology_request::keyspace_rf_change},
            db_clock::now() - get_task_manager().get_user_task_ttl());

    std::vector<tasks::task_stats> stats;
    stats.reserve(entries.size());
    for (const auto& id_and_entry : entries) {
        stats.push_back(get_task_stats(id_and_entry.second));
    }
    co_return stats;
}
// Constructs the module by forwarding the task manager and the module name
// "global_topology_requests" to the base tasks::task_manager::module.
task_manager_module::task_manager_module(tasks::task_manager& tm) noexcept
: tasks::task_manager::module(tm, "global_topology_requests")
{}
}
}

View File

@@ -54,33 +54,4 @@ public:
std::set<locator::host_id> get_nodes() const override;
};
namespace topo {
class global_topology_request_virtual_task : public tasks::task_manager::virtual_task::impl {
private:
service::storage_service& _ss;
public:
global_topology_request_virtual_task(tasks::task_manager::module_ptr module,
service::storage_service& ss)
: tasks::task_manager::virtual_task::impl(std::move(module))
, _ss(ss)
{}
virtual tasks::task_manager::task_group get_group() const noexcept override;
virtual future<std::optional<tasks::virtual_task_hint>> contains(tasks::task_id task_id) const override;
virtual future<tasks::is_abortable> is_abortable(tasks::virtual_task_hint hint) const override;
virtual future<std::optional<tasks::task_status>> get_status(tasks::task_id id, tasks::virtual_task_hint hint) override;
virtual future<std::optional<tasks::task_status>> wait(tasks::task_id id, tasks::virtual_task_hint hint) override;
virtual future<> abort(tasks::task_id id, tasks::virtual_task_hint hint) noexcept override;
virtual future<std::vector<tasks::task_stats>> get_stats() override;
};
// Task manager module declared in the `topo` namespace; its only member is a
// constructor taking the owning task manager (defined out of line).
class task_manager_module : public tasks::task_manager::module {
public:
task_manager_module(tasks::task_manager& tm) noexcept;
};
}
}

View File

@@ -6,7 +6,6 @@
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include <algorithm>
#include <chrono>
#include <fmt/ranges.h>
@@ -55,7 +54,6 @@
#include "service/topology_state_machine.hh"
#include "db/view/view_building_coordinator.hh"
#include "topology_mutation.hh"
#include "utils/UUID.hh"
#include "utils/assert.hh"
#include "utils/error_injection.hh"
#include "utils/stall_free.hh"
@@ -955,7 +953,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
} else {
assert(_feature_service.topology_global_request_queue);
req_id = _topo_sm._topology.global_requests_queue[0];
req_entry = co_await _sys_ks.get_topology_request_entry(req_id);
req_entry = co_await _sys_ks.get_topology_request_entry(req_id, true);
req = std::get<global_topology_request>(req_entry.request_type);
}
switch (req) {
@@ -999,7 +997,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
utils::chunked_vector<canonical_mutation> updates;
sstring error;
bool needs_colocation = false;
if (_db.has_keyspace(ks_name)) {
try {
auto& ks = _db.find_keyspace(ks_name);
@@ -1007,40 +1004,12 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{saved_ks_props.begin(), saved_ks_props.end()}};
new_ks_props.validate();
auto ks_md = new_ks_props.as_ks_metadata_update(ks.metadata(), *tmptr, _db.features(), _db.get_config());
_db.validate_keyspace_update(*ks_md);
size_t unimportant_init_tablet_count = 2; // must be a power of 2
locator::tablet_map new_tablet_map{unimportant_init_tablet_count};
auto schedule_migrations = [&] () -> future<> {
auto tables_with_mvs = ks.metadata()->tables();
auto views = ks.metadata()->views();
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
if (tables_with_mvs.empty()) {
co_return;
}
auto table = tables_with_mvs.front();
auto tablet_count = tmptr->tablets().get_tablet_map(table->id()).tablet_count();
locator::replication_strategy_params params{ks_md->strategy_options(), tablet_count, ks.metadata()->consistency_option()};
auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params, tmptr->get_topology());
auto check_needs_colocation = [&] () -> future<bool> {
const auto& new_replication_strategy_config = new_strategy->get_config_options();
const auto& old_replication_strategy_config = ks.metadata()->strategy_options();
bool rack_list_conversion = false;
for (const auto& [dc, rf_value] : new_replication_strategy_config) {
if (std::holds_alternative<locator::rack_list>(rf_value)) {
auto it = old_replication_strategy_config.find(dc);
if (it != old_replication_strategy_config.end() && std::holds_alternative<sstring>(it->second)) {
rack_list_conversion = true;
break;
}
}
}
co_return rack_list_conversion ? co_await requires_rack_list_colocation(_db, tmptr, &_sys_ks, req_id) : false;
};
if (needs_colocation = co_await check_needs_colocation(); needs_colocation) {
co_return;
}
for (const auto& table_or_mv : tables_with_mvs) {
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
// Apply the transition only on base tables.
@@ -1049,6 +1018,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
continue;
}
auto old_tablets = co_await tmptr->tablets().get_tablet_map(table_or_mv->id()).clone_gently();
locator::replication_strategy_params params{ks_md->strategy_options(), old_tablets.tablet_count(), ks.metadata()->consistency_option()};
auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params, tmptr->get_topology());
new_tablet_map = co_await new_strategy->maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, co_await old_tablets.clone_gently());
replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
@@ -1075,8 +1046,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
co_await coroutine::maybe_yield();
});
}
};
co_await schedule_migrations();
auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
for (auto& m: schema_muts) {
@@ -1092,22 +1061,16 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
error = "Can't ALTER keyspace " + ks_name + ", keyspace doesn't exist";
}
bool pause_request = needs_colocation && error.empty();
topology_mutation_builder tbuilder(guard.write_timestamp());
tbuilder.set_transition_state(topology::transition_state::tablet_migration)
updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
.set_transition_state(topology::transition_state::tablet_migration)
.set_version(_topo_sm._topology.version + 1)
.del_global_topology_request()
.del_global_topology_request_id()
.drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id);
if (pause_request) {
rtlogger.info("keyspace_rf_change for keyspace {} postponed for colocation", ks_name);
tbuilder.pause_rf_change_request(req_id);
} else {
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
.drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id)
.build()));
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
.done(error)
.build()));
}
updates.push_back(canonical_mutation(tbuilder.build()));
sstring reason = seastar::format("ALTER tablets KEYSPACE called with options: {}", saved_ks_props);
rtlogger.trace("do update {} reason {}", updates, reason);
@@ -1371,14 +1334,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
.build());
}
// Appends to `out` a topology mutation that re-queues the paused RF change
// request `request_to_resume` as a global topology request and removes it
// from the set of paused RF change requests.
void generate_rf_change_resume_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, utils::UUID request_to_resume) {
    rtlogger.debug("Generating RF change resume for request id {}", request_to_resume);

    topology_mutation_builder builder(guard.write_timestamp());
    builder.queue_global_topology_request_id(request_to_resume)
           .resume_rf_change_request(_topo_sm._topology.paused_rf_change_requests, request_to_resume);
    out.emplace_back(builder.build());
}
future<> generate_migration_updates(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const migration_plan& plan) {
if (plan.resize_plan().finalize_resize.empty() || plan.has_nodes_to_drain()) {
// schedule tablet migration only if there are no pending resize finalisations or if the node is draining.
@@ -1386,10 +1341,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
co_await coroutine::maybe_yield();
generate_migration_update(out, guard, mig);
}
if (auto request_to_resume = plan.rack_list_colocation_plan().request_to_resume(); request_to_resume) {
generate_rf_change_resume_update(out, guard, request_to_resume);
}
}
auto sched_time = db_clock::now();
@@ -1880,7 +1831,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
bool has_nodes_to_drain = false;
if (!preempt) {
auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), {}, get_dead_nodes());
has_nodes_to_drain = plan.has_nodes_to_drain();
if (!drain || plan.has_nodes_to_drain()) {
co_await generate_migration_updates(updates, guard, plan);
@@ -2003,7 +1954,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
co_await utils::get_local_injector().inject("tablet_resize_finalization_post_barrier", utils::wait_for_message(std::chrono::minutes(2)));
auto tm = get_token_metadata_ptr();
auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
auto plan = co_await _tablet_allocator.balance_tablets(tm, {}, get_dead_nodes());
utils::chunked_vector<canonical_mutation> updates;
updates.reserve(plan.resize_plan().finalize_resize.size() * 2 + 1);
@@ -2083,7 +2034,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
// We should perform TRUNCATE only if the session is still valid. It could be cleared if a previous truncate
// handler performed the truncate and cleared the session, but crashed before finalizing the request
if (_topo_sm._topology.session) {
const auto topology_requests_entry = co_await _sys_ks.get_topology_request_entry(global_request_id);
const auto topology_requests_entry = co_await _sys_ks.get_topology_request_entry(global_request_id, true);
const table_id& table_id = topology_requests_entry.truncate_table_id;
lw_shared_ptr<replica::table> table = _db.get_tables_metadata().get_table_if_exists(table_id);
@@ -2623,10 +2574,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
co_await _voter_handler.on_node_removed(replaced_node_id, _as);
}
}
utils::get_local_injector().inject("crash_coordinator_before_stream", [] {
rtlogger.info("crash_coordinator_before_stream: aborting");
abort();
});
utils::get_local_injector().inject("crash_coordinator_before_stream", [] { abort(); });
raft_topology_cmd cmd{raft_topology_cmd::command::stream_ranges};
auto state = node.rs->state;
try {
@@ -2675,7 +2623,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
while (utils::get_local_injector().enter("topology_coordinator_pause_after_streaming")) {
co_await sleep_abortable(std::chrono::milliseconds(10), _as);
}
const bool removenode_with_left_token_ring = _feature_service.removenode_with_left_token_ring;
auto node = get_node_to_work_on(std::move(guard));
bool barrier_failed = false;
// In this state writes go to both old and new replicas, but reads start to be served from new replicas
@@ -2730,9 +2677,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
break;
case node_state::removing: {
co_await utils::get_local_injector().inject("delay_node_removal", utils::wait_for_message(std::chrono::minutes(5)));
if (!removenode_with_left_token_ring) {
node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);
}
node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);
}
[[fallthrough]];
case node_state::decommissioning: {
@@ -2740,10 +2685,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
node_state next_state;
utils::chunked_vector<canonical_mutation> muts;
muts.reserve(2);
if (removenode_with_left_token_ring || node.rs->state == node_state::decommissioning) {
// Both decommission and removenode go through left_token_ring state
// to ensure a global barrier is executed before the request is marked as done.
// This ensures all nodes have observed the topology change.
if (node.rs->state == node_state::decommissioning) {
next_state = node.rs->state;
builder.set_transition_state(topology::transition_state::left_token_ring);
} else {
@@ -2818,16 +2760,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
case topology::transition_state::left_token_ring: {
auto node = get_node_to_work_on(std::move(guard));
// Need to be captured as the node variable might become invalid (e.g. moved out) at particular points.
const auto node_rs_state = node.rs->state;
const bool is_removenode = node_rs_state == node_state::removing;
if (is_removenode && !_feature_service.removenode_with_left_token_ring) {
on_internal_error(
rtlogger, "removenode operation can only enter the left_token_ring state when REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled");
}
auto finish_left_token_ring_transition = [&](node_to_work_on& node) -> future<> {
// Remove the node from group0 here - in general, it won't be able to leave on its own
// because we'll ban it as soon as we tell it to shut down.
@@ -2847,16 +2779,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
muts.push_back(builder.build());
co_await remove_view_build_statuses_on_left_node(muts, node.guard, node.id);
co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(node.id.uuid()), muts);
auto str = std::invoke([&]() {
switch (node_rs_state) {
case node_state::decommissioning:
return ::format("finished decommissioning node {}", node.id);
case node_state::removing:
return ::format("finished removing node {}", node.id);
default:
return ::format("finished rollback of {} after {} failure", node.id, node.rs->state);
}
});
auto str = node.rs->state == node_state::decommissioning
? ::format("finished decommissioning node {}", node.id)
: ::format("finished rollback of {} after {} failure", node.id, node.rs->state);
co_await update_topology_state(take_guard(std::move(node)), std::move(muts), std::move(str));
};
@@ -2869,11 +2794,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
}
if (node.id == _raft.id()) {
// Removed node must be dead, so it shouldn't enter here (it can't coordinate its own removal).
if (is_removenode) {
on_internal_error(rtlogger, "removenode operation cannot be coordinated by the removed node itself");
}
// Someone else needs to coordinate the rest of the decommission process,
// because the decommissioning node is going to shut down in the middle of this state.
rtlogger.info("coordinator is decommissioning; giving up leadership");
@@ -2887,13 +2807,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
bool barrier_failed = false;
// Wait until other nodes observe the new token ring and stop sending writes to this node.
auto excluded_nodes = get_excluded_nodes_for_topology_request(node);
try {
// Removed node is added to ignored nodes, so it should be automatically excluded.
if (is_removenode && !excluded_nodes.contains(node.id)) {
on_internal_error(rtlogger, "removenode operation must have the removed node in excluded_nodes");
}
node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), std::move(excluded_nodes)), node.id);
node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), get_excluded_nodes_for_topology_request(node)), node.id);
} catch (term_changed_error&) {
throw;
} catch (group0_concurrent_modification&) {
@@ -2910,17 +2825,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
}
if (barrier_failed) {
// If barrier above failed it means there may be unfinished writes to a decommissioned node,
// or some nodes might not have observed the new topology yet (one purpose of the barrier
// is to make sure all nodes observed the new topology before completing the request).
// If barrier above failed it means there may be unfinished writes to a decommissioned node.
// Let's wait for the ring delay for those writes to complete and new topology to propagate
// before continuing.
co_await sleep_abortable(_ring_delay, _as);
node = retake_node(co_await start_operation(), node.id);
}
// Make decommissioning/removed node a non voter before reporting operation completion below.
// Otherwise the node may see the completion and exit before it is removed from
// Make decommissioning node a non voter before reporting operation completion below.
// Otherwise the decommissioned node may see the completion and exit before it is removed from
// the config at which point the removal from the config will hang if the cluster had only two
// nodes before the decommission.
co_await _voter_handler.on_node_removed(node.id, _as);
@@ -2931,7 +2844,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
co_await update_topology_state(take_guard(std::move(node)), {rtbuilder.build()}, "report request completion in left_token_ring state");
// For decommission/rollback: Tell the node to shut down.
// Tell the node to shut down.
// This is done to improve user experience when there are no failures.
// In the next state (`node_state::left`), the node will be banned by the rest of the cluster,
// so there's no guarantee that it would learn about entering that state even if it was still
@@ -2940,19 +2853,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
// There is the possibility that the node will never get the message
// and decommission will hang on that node.
// This is fine for the rest of the cluster - we will still remove, ban the node and continue.
//
// For removenode: The node is already dead, no need to send shutdown command.
auto node_id = node.id;
bool shutdown_failed = false;
if (!is_removenode) {
try {
node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::barrier);
} catch (...) {
rtlogger.warn("failed to tell node {} to shut down - it may hang."
" It's safe to shut it down manually now. (Exception: {})",
node.id, std::current_exception());
shutdown_failed = true;
}
try {
node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::barrier);
} catch (...) {
rtlogger.warn("failed to tell node {} to shut down - it may hang."
" It's safe to shut it down manually now. (Exception: {})",
node.id, std::current_exception());
shutdown_failed = true;
}
if (shutdown_failed) {
node = retake_node(co_await start_operation(), node_id);
@@ -3549,7 +3458,7 @@ future<bool> topology_coordinator::maybe_start_tablet_migration(group0_guard gua
}
auto tm = get_token_metadata_ptr();
auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
auto plan = co_await _tablet_allocator.balance_tablets(tm, {}, get_dead_nodes());
if (plan.empty()) {
rtlogger.debug("Tablet load balancer did not make any plan");
co_return false;

View File

@@ -256,20 +256,6 @@ topology_mutation_builder& topology_mutation_builder::drop_first_global_topology
}
}
// Adds `id` to the "paused_rf_change_requests" set column (update mode, so
// existing members of the set are kept).
topology_mutation_builder& topology_mutation_builder::pause_rf_change_request(const utils::UUID& id) {
    std::vector<data_value> added{id};
    return apply_set("paused_rf_change_requests", collection_apply_mode::update, std::move(added));
}
// Removes `id` from the "paused_rf_change_requests" set column by overwriting
// the column with `values` minus `id`. Does nothing when `id` is not in `values`.
topology_mutation_builder& topology_mutation_builder::resume_rf_change_request(const std::unordered_set<utils::UUID>& values, const utils::UUID& id) {
    if (!values.contains(id)) {
        return *this;
    }

    // Work on a copy: `values` is the caller's current view of the set.
    auto remaining = values;
    remaining.erase(id);
    return apply_set("paused_rf_change_requests", collection_apply_mode::overwrite,
            remaining | std::views::transform([] (const auto& uuid) { return data_value{uuid}; }));
}
topology_mutation_builder& topology_mutation_builder::set_upgrade_state(topology::upgrade_state_type value) {
return apply_atomic("upgrade_state", ::format("{}", value));
}

View File

@@ -129,8 +129,6 @@ public:
topology_mutation_builder& del_global_topology_request_id();
topology_mutation_builder& queue_global_topology_request_id(const utils::UUID& value);
topology_mutation_builder& drop_first_global_topology_request_id(const std::vector<utils::UUID>&, const utils::UUID&);
topology_mutation_builder& pause_rf_change_request(const utils::UUID&);
topology_mutation_builder& resume_rf_change_request(const std::unordered_set<utils::UUID>&, const utils::UUID&);
topology_node_mutation_builder& with_node(raft::server_id);
canonical_mutation build() { return canonical_mutation{std::move(_m)}; }
};

View File

@@ -180,10 +180,6 @@ struct topology {
// The KS options to be used when executing the scheduled ALTER KS statement
std::optional<std::unordered_map<sstring, sstring>> new_keyspace_rf_change_data;
// The ids of RF change requests that are paused because they require tablet co-location.
// It may happen during altering from numerical RF to rack list.
std::unordered_set<utils::UUID> paused_rf_change_requests;
// The IDs of the committed yet unpublished CDC generations sorted by timestamps.
std::vector<cdc::generation_id_v2> unpublished_cdc_generations;

View File

@@ -27,6 +27,7 @@ enum class component_type {
TemporaryTOC,
TemporaryStatistics,
Scylla,
TemporaryScylla,
Rows,
Partitions,
TemporaryHashes,
@@ -76,6 +77,8 @@ struct fmt::formatter<sstables::component_type> : fmt::formatter<string_view> {
return formatter<string_view>::format("TemporaryStatistics", ctx);
case Scylla:
return formatter<string_view>::format("Scylla", ctx);
case TemporaryScylla:
return formatter<string_view>::format("TemporaryScylla", ctx);
case Partitions:
return formatter<string_view>::format("Partitions", ctx);
case Rows:

View File

@@ -632,6 +632,10 @@ private:
std::unique_ptr<file_writer> close_writer(std::unique_ptr<file_writer>& w);
void close_data_writer();
void close_index_writer();
void close_rows_writer();
void close_partitions_writer();
void ensure_tombstone_is_written() {
if (!_tombstone_written) {
consume(tombstone());
@@ -944,17 +948,16 @@ void writer::init_file_writers() {
_sst._schema->get_compressor_params(),
std::move(compressor)), _sst.get_filename());
}
if (_sst.has_component(component_type::Index)) {
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Index).get();
_index_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), _sst.index_filename());
_index_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, _sst.index_filename());
}
if (_sst.has_component(component_type::Partitions) && _sst.has_component(component_type::Rows)) {
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Rows).get();
_rows_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), component_name(_sst, component_type::Rows));
_rows_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, component_name(_sst, component_type::Rows));
_bti_row_index_writer = trie::bti_row_index_writer(*_rows_writer);
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Partitions).get();
_partitions_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), component_name(_sst, component_type::Partitions));
_partitions_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, component_name(_sst, component_type::Partitions));
_bti_partition_index_writer = trie::bti_partition_index_writer(*_partitions_writer);
}
if (_delayed_filter) {
@@ -982,6 +985,41 @@ void writer::close_data_writer() {
}
}
void writer::close_index_writer() {
if (_index_writer) {
auto writer = close_writer(_index_writer);
auto chksum_wr = static_cast<crc32_digest_file_writer*>(writer.get());
_sst.get_components_digests().index_digest = chksum_wr->full_checksum();
}
}
// Finishes the BTI partition index, closes the Partitions component writer and
// records the checksum it accumulated as the sstable's partitions digest.
// Does nothing when no Partitions writer is open.
void writer::close_partitions_writer() {
    if (!_partitions_writer) {
        return;
    }

    // finish() must run before the file is closed; the footer it returns is
    // kept on the sstable.
    _sst._partitions_db_footer = std::move(*_bti_partition_index_writer).finish(
            _sst.get_version(),
            _first_key.value(),
            _last_key.value());

    auto closed = close_writer(_partitions_writer);
    // init_file_writers() constructs _partitions_writer as a crc32_digest_file_writer.
    auto* digest_writer = static_cast<crc32_digest_file_writer*>(closed.get());
    _sst.get_components_digests().partitions_digest = digest_writer->full_checksum();
}
// Pads and closes the Rows component writer, then records the checksum it
// accumulated as the sstable's rows digest.
// Does nothing when no Rows writer is open.
void writer::close_rows_writer() {
    if (!_rows_writer) {
        return;
    }

    // The Rows file would be empty if the sstable holds only small partitions.
    // Zero-sized files interact badly with object storage (e.g. minio seems to
    // treat a zero-sized upload as a no-op, which breaks some assumptions), so
    // a few bytes of garbage padding are always appended as a workaround.
    uint32_t padding = seastar::cpu_to_be(0x13371337);
    _rows_writer->write(reinterpret_cast<const char*>(&padding), sizeof(padding));

    auto closed = close_writer(_rows_writer);
    // init_file_writers() constructs _rows_writer as a crc32_digest_file_writer.
    auto* digest_writer = static_cast<crc32_digest_file_writer*>(closed.get());
    _sst.get_components_digests().rows_digest = digest_writer->full_checksum();
}
void writer::consume_new_partition(const dht::decorated_key& dk) {
_c_stats.start_offset = _data_writer->offset();
_prev_row_start = _data_writer->offset();
@@ -1630,27 +1668,10 @@ void writer::consume_end_of_stream() {
_collector.add_compression_ratio(_sst._components->compression.compressed_file_length(), _sst._components->compression.uncompressed_file_length());
}
if (_index_writer) {
close_writer(_index_writer);
}
close_index_writer();
if (_partitions_writer) {
_sst._partitions_db_footer = std::move(*_bti_partition_index_writer).finish(
_sst.get_version(),
_first_key.value(),
_last_key.value());
close_writer(_partitions_writer);
}
if (_rows_writer) {
// Append some garbage padding to the file just to ensure that it's never empty.
// (Otherwise it would be empty if the sstable contains only small partitions).
// This is a hack to work around some bad interactions between zero-sized files
// and object storage. (It seems that e.g. minio considers a zero-sized file
// upload to be a no-op, which breaks some assumptions).
uint32_t garbage = seastar::cpu_to_be(0x13371337);
_rows_writer->write(reinterpret_cast<const char*>(&garbage), sizeof(garbage));
close_writer(_rows_writer);
}
close_partitions_writer();
close_rows_writer();
if (_hashes_writer) {
close_writer(_hashes_writer);
@@ -1696,9 +1717,7 @@ void writer::consume_end_of_stream() {
.map = _collector.get_ext_timestamp_stats()
});
_sst.write_scylla_metadata(_shard, std::move(identifier), std::move(ld_stats), std::move(ts_stats));
if (!_cfg.leave_unsealed) {
_sst.seal_sstable(_cfg.backup).get();
}
_sst.seal_sstable(_cfg.backup).get();
}
uint64_t writer::data_file_position_for_tests() const {

View File

@@ -83,8 +83,6 @@ struct sstable_open_config {
bool current_shard_as_sstable_owner = false;
// Do not move the sharding metadata to the sharder, keeping it in the scylla metadata.
bool keep_sharding_metadata = false;
// Allows unsealed sstable to be loaded, since it must read components from temporary TOC instead.
bool unsealed_sstable = false;
};
}

View File

@@ -44,6 +44,7 @@ sstable_version_constants::component_map_t sstable_version_constants::create_com
{ component_type::Filter, "Filter.db" },
{ component_type::Statistics, "Statistics.db" },
{ component_type::Scylla, "Scylla.db" },
{ component_type::TemporaryScylla, "Scylla.db.tmp" },
{ component_type::TemporaryTOC, TEMPORARY_TOC_SUFFIX },
{ component_type::TemporaryStatistics, "Statistics.db.tmp" }
};

View File

@@ -836,14 +836,13 @@ future<std::vector<sstring>> sstable::read_and_parse_toc(file f) {
// This is small enough, and well-defined. Easier to just read it all
// at once
future<> sstable::read_toc(sstable_open_config cfg) noexcept {
future<> sstable::read_toc() noexcept {
if (_recognized_components.size()) {
co_return;
}
try {
auto toc_type = cfg.unsealed_sstable ? component_type::TemporaryTOC : component_type::TOC;
co_await do_read_simple(toc_type, [&] (version_types v, file f) -> future<> {
co_await do_read_simple(component_type::TOC, [&] (version_types v, file f) -> future<> {
auto comps = co_await read_and_parse_toc(f);
for (auto& c: comps) {
// accept trailing newlines
@@ -901,8 +900,8 @@ future<std::unordered_map<component_type, file>> sstable::readable_file_for_all_
co_return std::move(files);
}
future<entry_descriptor> sstable::clone(generation_type new_generation, bool leave_unsealed) const {
co_await _storage->snapshot(*this, _storage->prefix(), storage::absolute_path::yes, new_generation, storage::leave_unsealed(leave_unsealed));
future<entry_descriptor> sstable::clone(generation_type new_generation) const {
co_await _storage->snapshot(*this, _storage->prefix(), storage::absolute_path::yes, new_generation);
co_return entry_descriptor(new_generation, _version, _format, component_type::TOC, _state);
}
@@ -957,16 +956,22 @@ future<file_writer> sstable::make_component_file_writer(component_type c, file_o
});
}
// Opens a sink for component `c` through the storage backend and wraps it in a
// heap-allocated crc32_digest_file_writer (buffer size: sstable_buffer_size),
// so that the caller can later query the checksum of what was written.
future<std::unique_ptr<crc32_digest_file_writer>> sstable::make_digests_component_file_writer(component_type c, file_output_stream_options options, open_flags oflags) noexcept {
    auto sink_ready = _storage->make_component_sink(*this, c, oflags, std::move(options));
    return std::move(sink_ready).then([this, name = component_name(*this, c)] (data_sink sink) mutable {
        return std::make_unique<crc32_digest_file_writer>(std::move(sink), sstable_buffer_size, name);
    });
}
// Stores `origin` on the sstable, calls generate_toc(), and then passes the
// sstable to the storage backend's open().
void sstable::open_sstable(const sstring& origin) {
_origin = origin;
generate_toc();
_storage->open(*this);
}
void sstable::write_toc(file_writer w) {
void sstable::write_toc(std::unique_ptr<crc32_digest_file_writer> w) {
sstlog.debug("Writing TOC file {} ", toc_filename());
do_write_simple(std::move(w), [&] (version_types v, file_writer& w) {
do_write_simple(*w, [&] (version_types v, file_writer& w) {
for (auto&& key : _recognized_components) {
// new line character is appended to the end of each component name.
auto value = sstable_version_constants::get_component_map(v).at(key) + "\n";
@@ -974,6 +979,8 @@ void sstable::write_toc(file_writer w) {
write(v, w, b);
}
});
_components_digests.toc_digest = w->full_checksum();
}
void sstable::write_crc(const checksum& c) {
@@ -990,6 +997,7 @@ void sstable::write_digest(uint32_t full_checksum) {
auto digest = to_sstring<bytes>(full_checksum);
write(v, w, digest);
}, buffer_size);
_components_digests.data_digest = full_checksum;
}
thread_local std::array<std::vector<int>, downsampling::BASE_SAMPLING_LEVEL> downsampling::_sample_pattern_cache;
@@ -1046,7 +1054,7 @@ future<> sstable::read_simple(T& component) {
});
}
void sstable::do_write_simple(file_writer&& writer,
void sstable::do_write_simple(file_writer& writer,
noncopyable_function<void (version_types, file_writer&)> write_component) {
write_component(_version, writer);
_metadata_size_on_disk += writer.offset();
@@ -1061,7 +1069,7 @@ void sstable::do_write_simple(component_type type,
file_output_stream_options options;
options.buffer_size = buffer_size;
auto w = make_component_file_writer(type, std::move(options)).get();
do_write_simple(std::move(w), std::move(write_component));
do_write_simple(w, std::move(write_component));
}
template <component_type Type, typename T>
@@ -1071,10 +1079,30 @@ void sstable::write_simple(const T& component) {
}, sstable_buffer_size);
}
// Writes component `type` through a checksumming writer and returns the
// checksum of the written file.
//
// `write_component` is invoked with the sstable version and the writer and is
// expected to serialize the component; `buffer_size` is the output stream
// buffer size.
uint32_t sstable::do_write_simple_with_digest(component_type type,
        noncopyable_function<void (version_types version, file_writer& writer)> write_component, unsigned buffer_size) {
    auto path = filename(type);
    sstlog.debug("Writing {} file {}", sstable_version_constants::get_component_map(_version).at(type), path);

    file_output_stream_options stream_opts;
    stream_opts.buffer_size = buffer_size;

    auto digest_writer = make_digests_component_file_writer(type, std::move(stream_opts)).get();
    do_write_simple(*digest_writer, std::move(write_component));

    const uint32_t checksum = digest_writer->full_checksum();
    return checksum;
}
// Serializes `component` as the file for component `Type`, using the default
// sstable_buffer_size, and returns the checksum of the written file.
template <component_type Type, typename T>
uint32_t sstable::write_simple_with_digest(const T& component) {
    auto serialize = [&component] (version_types version, file_writer& out) {
        write(version, out, component);
    };
    return do_write_simple_with_digest(Type, std::move(serialize), sstable_buffer_size);
}
template future<> sstable::read_simple<component_type::Filter>(sstables::filter& f);
template void sstable::write_simple<component_type::Filter>(const sstables::filter& f);
template void sstable::write_simple<component_type::Summary>(const sstables::summary_ka&);
template uint32_t sstable::write_simple_with_digest<component_type::Summary>(const sstables::summary_ka&);
future<> sstable::read_compression() {
// FIXME: If there is no compression, we should expect a CRC file to be present.
@@ -1093,7 +1121,8 @@ void sstable::write_compression() {
return;
}
write_simple<component_type::CompressionInfo>(_components->compression);
uint32_t digest = write_simple_with_digest<component_type::CompressionInfo>(_components->compression);
_components_digests.compression_digest = digest;
}
void sstable::validate_partitioner() {
@@ -1318,7 +1347,8 @@ future<> sstable::read_partitions_db_footer() {
}
void sstable::write_statistics() {
write_simple<component_type::Statistics>(_components->statistics);
auto digest = write_simple_with_digest<component_type::Statistics>(_components->statistics);
_components_digests.statistics_digest = digest;
}
void sstable::mark_as_being_repaired(const service::session_id& id) {
@@ -1341,13 +1371,25 @@ int64_t sstable::update_repaired_at(int64_t repaired_at) {
void sstable::rewrite_statistics() {
sstlog.debug("Rewriting statistics component of sstable {}", get_filename());
auto lock = get_units(_mutate_sem, 1).get();
file_output_stream_options options;
options.buffer_size = sstable_buffer_size;
auto w = make_component_file_writer(component_type::TemporaryStatistics, std::move(options),
auto w = make_digests_component_file_writer(component_type::TemporaryStatistics, std::move(options),
open_flags::wo | open_flags::create | open_flags::truncate).get();
write(_version, w, _components->statistics);
w.close();
write(_version, *w, _components->statistics);
w->close();
// When rewriting statistics, we also need to update the scylla component
// because it contains the digest of the statistics component.
if (has_scylla_component()) {
_components_digests.statistics_digest = w->full_checksum();
_components->scylla_metadata->data.set<scylla_metadata_type::ComponentsDigests>(components_digests{_components_digests});
sstlog.debug("Rewriting scylla component of sstable {}", get_filename());
write_simple<component_type::TemporaryScylla>(*_components->scylla_metadata);
// rename() guarantees atomicity when renaming a file into place.
sstable_write_io_check(rename_file, fmt::to_string(filename(component_type::TemporaryScylla)), fmt::to_string(filename(component_type::Scylla))).get();
}
// rename() guarantees atomicity when renaming a file into place.
sstable_write_io_check(rename_file, fmt::to_string(filename(component_type::TemporaryStatistics)), fmt::to_string(filename(component_type::Statistics))).get();
}
@@ -1541,7 +1583,8 @@ void sstable::write_filter() {
auto&& bs = f->bits();
auto filter_ref = sstables::filter_ref(f->num_hashes(), bs.get_storage());
write_simple<component_type::Filter>(filter_ref);
uint32_t digest = write_simple_with_digest<component_type::Filter>(filter_ref);
_components_digests.filter_digest = digest;
}
void sstable::maybe_rebuild_filter_from_index(uint64_t num_partitions) {
@@ -1726,7 +1769,7 @@ void sstable::disable_component_memory_reload() {
}
future<> sstable::load_metadata(sstable_open_config cfg) noexcept {
co_await read_toc(cfg);
co_await read_toc();
// read scylla-meta after toc. Might need it to parse
// rest (hint extensions)
co_await read_scylla_metadata();
@@ -2000,6 +2043,8 @@ sstable::read_scylla_metadata() noexcept {
}
return read_simple<component_type::Scylla>(*_components->scylla_metadata).then([this] {
_features = _components->scylla_metadata->get_features();
_components_digests = _components->scylla_metadata->get_components_digests();
_components->digest = _components_digests.data_digest;
});
});
}
@@ -2089,6 +2134,7 @@ sstable::write_scylla_metadata(shard_id shard, struct run_identifier identifier,
sstable_schema.columns.elements.push_back(sstable_column_description{to_sstable_column_kind(col.kind), {col.name()}, {to_bytes(col.type->name())}});
}
_components->scylla_metadata->data.set<scylla_metadata_type::Schema>(std::move(sstable_schema));
_components->scylla_metadata->data.set<scylla_metadata_type::ComponentsDigests>(components_digests(_components_digests));
write_simple<component_type::Scylla>(*_components->scylla_metadata);
}
@@ -2490,19 +2536,15 @@ std::vector<std::pair<component_type, sstring>> sstable::all_components() const
}
future<> sstable::snapshot(const sstring& dir) const {
auto lock = co_await get_units(_mutate_sem, 1);
co_await _storage->snapshot(*this, dir, storage::absolute_path::yes);
return _storage->snapshot(*this, dir, storage::absolute_path::yes);
}
future<> sstable::change_state(sstable_state to, delayed_commit_changes* delay_commit) {
auto lock = co_await get_units(_mutate_sem, 1);
co_await _storage->change_state(*this, to, _generation, delay_commit);
_state = to;
}
future<> sstable::pick_up_from_upload(sstable_state to, generation_type new_generation) {
// just in case, not really needed as the sstable is not yet in use while in the upload dir
auto lock = co_await get_units(_mutate_sem, 1);
co_await _storage->change_state(*this, to, new_generation, nullptr);
_generation = std::move(new_generation);
_state = to;
@@ -3076,6 +3118,31 @@ void sstable::set_sstable_level(uint32_t new_level) {
s.sstable_level = new_level;
}
// Looks up the digest tracked for component `c` in _components_digests.
// Components for which no digest field exists yield std::nullopt.
std::optional<uint32_t> sstable::get_component_digest(component_type c) const {
    using enum component_type;
    const auto& digests = _components_digests;
    switch (c) {
    case Data:
        return digests.data_digest;
    case Index:
        return digests.index_digest;
    case Partitions:
        return digests.partitions_digest;
    case Rows:
        return digests.rows_digest;
    case Summary:
        return digests.summary_digest;
    case Filter:
        return digests.filter_digest;
    case CompressionInfo:
        return digests.compression_digest;
    case Statistics:
        return digests.statistics_digest;
    case TOC:
        return digests.toc_digest;
    default:
        return std::nullopt;
    }
}
future<> sstable::mutate_sstable_level(uint32_t new_level) {
if (!has_component(component_type::Statistics)) {
return make_ready_future<>();
@@ -3412,9 +3479,6 @@ utils::hashed_key sstable::make_hashed_key(const schema& s, const partition_key&
future<>
sstable::unlink(storage::sync_dir sync) noexcept {
// Serialize with other calls to unlink or potentially ongoing mutations.
auto lock = co_await get_units(_mutate_sem, 1);
_unlinked = true;
_on_delete(*this);
@@ -3961,13 +4025,11 @@ class sstable_stream_sink_impl : public sstable_stream_sink {
shared_sstable _sst;
component_type _type;
bool _last_component;
bool _leave_unsealed;
public:
sstable_stream_sink_impl(shared_sstable sst, component_type type, sstable_stream_sink_cfg cfg)
sstable_stream_sink_impl(shared_sstable sst, component_type type, bool last_component)
: _sst(std::move(sst))
, _type(type)
, _last_component(cfg.last_component)
, _leave_unsealed(cfg.leave_unsealed)
, _last_component(last_component)
{}
private:
future<> load_metadata() const {
@@ -4014,12 +4076,10 @@ public:
co_return co_await make_file_output_stream(std::move(f), stream_options);
}
future<shared_sstable> close() override {
future<shared_sstable> close_and_seal() override {
if (_last_component) {
// If we are the last component in a sequence, we can seal the table.
if (!_leave_unsealed) {
co_await _sst->_storage->seal(*_sst);
}
co_await _sst->_storage->seal(*_sst);
co_return std::move(_sst);
}
_sst = {};
@@ -4036,7 +4096,7 @@ public:
}
};
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr schema, sstables_manager& sstm, const data_dictionary::storage_options& s_opts, sstable_state state, std::string_view component_filename, sstable_stream_sink_cfg cfg) {
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr schema, sstables_manager& sstm, const data_dictionary::storage_options& s_opts, sstable_state state, std::string_view component_filename, bool last_component) {
auto desc = parse_path(component_filename, schema->ks_name(), schema->cf_name());
auto sst = sstm.make_sstable(schema, s_opts, desc.generation, state, desc.version, desc.format);
@@ -4047,7 +4107,7 @@ std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr schema, sstab
type = component_type::TemporaryTOC;
}
return std::make_unique<sstable_stream_sink_impl>(std::move(sst), type, cfg);
return std::make_unique<sstable_stream_sink_impl>(std::move(sst), type, last_component);
}
generation_type

View File

@@ -9,6 +9,7 @@
#pragma once
#include "sstables/writer.hh"
#include "version.hh"
#include "shared_sstable.hh"
#include "open_info.hh"
@@ -109,7 +110,6 @@ struct sstable_writer_config {
size_t promoted_index_auto_scale_threshold;
uint64_t max_sstable_size = std::numeric_limits<uint64_t>::max();
bool backup = false;
bool leave_unsealed = false;
mutation_fragment_stream_validation_level validation_level;
std::optional<db::replay_position> replay_position;
std::optional<int> sstable_level;
@@ -418,8 +418,8 @@ public:
return component_basename(_schema->ks_name(), _schema->cf_name(), _version, _generation, _format, f);
}
component_name get_filename(component_type f = component_type::Data) const {
return component_name(*this, f);
component_name get_filename() const {
return component_name(*this, component_type::Data);
}
component_name toc_filename() const {
@@ -629,9 +629,7 @@ private:
size_t _total_memory_reclaimed{0};
bool _unlinked{false};
// The mutate semaphore is used to serialize operations like rewrite_statistics
// with linking or moving the sstable between directories.
mutable named_semaphore _mutate_sem{1, named_semaphore_exception_factory{"sstable mutate"}};
components_digests _components_digests;
public:
bool has_component(component_type f) const;
sstables_manager& manager() { return _manager; }
@@ -652,12 +650,18 @@ private:
template <component_type Type, typename T>
void write_simple(const T& comp);
void do_write_simple(file_writer&& writer,
void do_write_simple(file_writer& writer,
noncopyable_function<void (version_types, file_writer&)> write_component);
void do_write_simple(component_type type,
noncopyable_function<void (version_types version, file_writer& writer)> write_component,
unsigned buffer_size);
template <component_type Type, typename T>
uint32_t write_simple_with_digest(const T& comp);
uint32_t do_write_simple_with_digest(component_type type,
noncopyable_function<void (version_types version, file_writer& writer)> write_component,
unsigned buffer_size);
void write_crc(const checksum& c);
void write_digest(uint32_t full_checksum);
@@ -668,6 +672,9 @@ private:
future<file_writer> make_component_file_writer(component_type c, file_output_stream_options options,
open_flags oflags = open_flags::wo | open_flags::create | open_flags::exclusive) noexcept;
future<std::unique_ptr<crc32_digest_file_writer>> make_digests_component_file_writer(component_type c, file_output_stream_options options,
open_flags oflags = open_flags::wo | open_flags::create | open_flags::exclusive) noexcept;
void generate_toc();
void open_sstable(const sstring& origin);
@@ -694,11 +701,12 @@ private:
future<> update_info_for_opened_data(sstable_open_config cfg = {});
future<> read_toc(sstable_open_config cfg = {}) noexcept;
future<> read_toc() noexcept;
future<> read_summary() noexcept;
void write_summary() {
write_simple<component_type::Summary>(_components->summary);
uint32_t digest = write_simple_with_digest<component_type::Summary>(_components->summary);
_components_digests.summary_digest = digest;
}
// To be called when we try to load an SSTable that lacks a Summary. Could
@@ -828,7 +836,7 @@ private:
future<> open_or_create_data(open_flags oflags, file_open_options options = {}) noexcept;
// runs in async context (called from storage::open)
void write_toc(file_writer w);
void write_toc(std::unique_ptr<crc32_digest_file_writer> w);
static future<uint32_t> read_digest_from_file(file f);
static future<lw_shared_ptr<checksum>> read_checksum_from_file(file f);
public:
@@ -1018,6 +1026,12 @@ public:
return _components->digest;
}
// Returns a mutable reference to the per-component digests held by this sstable.
components_digests& get_components_digests() {
return _components_digests;
}
std::optional<uint32_t> get_component_digest(component_type c) const;
// Gets ratio of droppable tombstone. A tombstone is considered droppable here
// for cells and tombstones expired before the time point "GC before", which
// is the point before which expiring data can be purged.
@@ -1070,9 +1084,8 @@ public:
future<std::unordered_map<component_type, file>> readable_file_for_all_components() const;
// Clones this sstable with a new generation, under the same location as the original one.
// If leave_unsealed is true, the destination sstable is left unsealed.
// Implementation is underlying storage specific.
future<entry_descriptor> clone(generation_type new_generation, bool leave_unsealed = false) const;
future<entry_descriptor> clone(generation_type new_generation) const;
struct lesser_reclaimed_memory {
// comparator class to be used by the _reclaimed set in sstables manager
@@ -1246,18 +1259,13 @@ public:
// closes this component. If this is the last component in a set (see "last_component" in creating method below)
// the table on disk will be sealed.
// Returns sealed sstable if last, or nullptr otherwise.
virtual future<shared_sstable> close() = 0;
virtual future<shared_sstable> close_and_seal() = 0;
virtual future<> abort() = 0;
};
struct sstable_stream_sink_cfg {
bool last_component = false;
bool leave_unsealed = false;
};
// Creates a sink object which can receive a component file sourced from above source object data.
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr, sstables_manager&, const data_dictionary::storage_options&, sstable_state, std::string_view component_filename, sstable_stream_sink_cfg cfg);
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr, sstables_manager&, const data_dictionary::storage_options&, sstable_state, std::string_view component_filename, bool last_component);
} // namespace sstables

View File

@@ -50,14 +50,7 @@ class filesystem_storage final : public sstables::storage {
std::optional<std::filesystem::path> _temp_dir; // Valid while the sstable is being created, until sealed
private:
struct mark_for_removal_tag {};
struct leave_unsealed_tag {};
enum class link_mode {
default_mode,
mark_for_removal,
leave_unsealed,
};
using mark_for_removal = bool_class<class mark_for_removal_tag>;
template <typename Comp>
requires std::is_same_v<Comp, component_type> || std::is_same_v<Comp, sstring>
@@ -68,9 +61,7 @@ private:
future<> check_create_links_replay(const sstable& sst, const sstring& dst_dir, generation_type dst_gen, const std::vector<std::pair<sstables::component_type, sstring>>& comps) const;
future<> remove_temp_dir();
virtual future<> create_links(const sstable& sst, const std::filesystem::path& dir) const override;
future<> create_links_common(const sstable& sst, sstring dst_dir, generation_type dst_gen, link_mode mode) const;
future<> create_links_common(const sstable& sst, sstring dst_dir, generation_type dst_gen, mark_for_removal_tag) const;
future<> create_links_common(const sstable& sst, const std::filesystem::path& dir, std::optional<generation_type> gen, leave_unsealed_tag) const;
future<> create_links_common(const sstable& sst, sstring dst_dir, generation_type dst_gen, mark_for_removal mark_for_removal) const;
future<> create_links_common(const sstable& sst, const std::filesystem::path& dir, std::optional<generation_type> dst_gen) const;
future<> touch_temp_dir(const sstable& sst);
future<> move(const sstable& sst, sstring new_dir, generation_type generation, delayed_commit_changes* delay) override;
@@ -92,7 +83,7 @@ public:
{}
virtual future<> seal(const sstable& sst) override;
virtual future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen, storage::leave_unsealed) const override;
virtual future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen) const override;
virtual future<> change_state(const sstable& sst, sstable_state state, generation_type generation, delayed_commit_changes* delay) override;
// runs in async context
virtual void open(sstable& sst) override;
@@ -213,13 +204,13 @@ void filesystem_storage::open(sstable& sst) {
open_flags::create |
open_flags::exclusive,
options).get();
auto w = file_writer(output_stream<char>(std::move(sink)), component_name(sst, component_type::TemporaryTOC));
auto w = std::make_unique<crc32_digest_file_writer>(std::move(sink), sst.sstable_buffer_size, component_name(sst, component_type::TemporaryTOC));
bool toc_exists = file_exists(fmt::to_string(sst.filename(component_type::TOC))).get();
if (toc_exists) {
// TOC will exist at this point if write_components() was called with
// the generation of a sstable that exists.
w.close();
w->close();
remove_file(fmt::to_string(sst.filename(component_type::TemporaryTOC))).get();
throw std::runtime_error(format("SSTable write failed due to existence of TOC file for generation {} of {}.{}", sst._generation, sst._schema->ks_name(), sst._schema->cf_name()));
}
@@ -365,13 +356,8 @@ future<> filesystem_storage::check_create_links_replay(const sstable& sst, const
/// \param sst - the sstable to work on
/// \param dst_dir - the destination directory.
/// \param generation - the generation of the destination sstable
/// \param mode - what will be done after all components were linked
/// mark_for_removal - mark the sstable for removal after linking it to the destination dst_dir
/// leave_unsealed - leaves the destination sstable unsealed
future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst_dir, generation_type generation, link_mode mode) const {
// They're mutually exclusive, so we can assume only one is set.
bool mark_for_removal = mode == link_mode::mark_for_removal;
bool leave_unsealed = mode == link_mode::leave_unsealed;
/// \param mark_for_removal - mark the sstable for removal after linking it to the destination dst_dir
future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst_dir, generation_type generation, mark_for_removal mark_for_removal) const {
sstlog.trace("create_links: {} -> {} generation={} mark_for_removal={}", sst.get_filename(), dst_dir, generation, mark_for_removal);
auto comps = sst.all_components();
co_await check_create_links_replay(sst, dst_dir, generation, comps);
@@ -380,11 +366,7 @@ future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst
co_await sst.sstable_write_io_check(idempotent_link_file, fmt::to_string(sst.filename(component_type::TOC)), std::move(dst));
auto dir = opened_directory(dst_dir);
co_await dir.sync(sst._write_error_handler);
co_await parallel_for_each(comps, [this, &sst, &dst_dir, generation, leave_unsealed] (auto p) {
// Skips the linking of TOC file if the destination will be left unsealed.
if (leave_unsealed && p.first == component_type::TOC) {
return make_ready_future<>();
}
co_await parallel_for_each(comps, [this, &sst, &dst_dir, generation] (auto p) {
auto src = filename(sst, _dir.native(), sst._generation, p.second);
auto dst = filename(sst, dst_dir, generation, p.second);
return sst.sstable_write_io_check(idempotent_link_file, std::move(src), std::move(dst));
@@ -397,10 +379,9 @@ future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst
auto src_temp_toc = filename(sst, _dir.native(), sst._generation, component_type::TemporaryTOC);
co_await sst.sstable_write_io_check(rename_file, std::move(dst_temp_toc), std::move(src_temp_toc));
co_await _dir.sync(sst._write_error_handler);
} else if (!leave_unsealed) {
} else {
// Now that the source sstable is linked to dir, remove
// the TemporaryTOC file at the destination.
// This is bypassed if destination will be left unsealed.
co_await sst.sstable_write_io_check(remove_file, std::move(dst_temp_toc));
}
co_await dir.sync(sst._write_error_handler);
@@ -408,23 +389,15 @@ future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst
sstlog.trace("create_links: {} -> {} generation={}: done", sst.get_filename(), dst_dir, generation);
}
future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst_dir, generation_type dst_gen, mark_for_removal_tag) const {
return create_links_common(sst, dst_dir, dst_gen, link_mode::mark_for_removal);
}
future<> filesystem_storage::create_links_common(const sstable& sst, const std::filesystem::path& dir, std::optional<generation_type> gen, leave_unsealed_tag) const {
return create_links_common(sst, dir.native(), gen.value_or(sst._generation), link_mode::leave_unsealed);
}
future<> filesystem_storage::create_links_common(const sstable& sst, const std::filesystem::path& dir, std::optional<generation_type> gen) const {
return create_links_common(sst, dir.native(), gen.value_or(sst._generation), link_mode::default_mode);
return create_links_common(sst, dir.native(), gen.value_or(sst._generation), mark_for_removal::no);
}
future<> filesystem_storage::create_links(const sstable& sst, const std::filesystem::path& dir) const {
return create_links_common(sst, dir.native(), sst._generation, link_mode::default_mode);
return create_links_common(sst, dir.native(), sst._generation, mark_for_removal::no);
}
future<> filesystem_storage::snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen, storage::leave_unsealed leave_unsealed) const {
future<> filesystem_storage::snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen) const {
std::filesystem::path snapshot_dir;
if (abs) {
snapshot_dir = dir;
@@ -432,11 +405,7 @@ future<> filesystem_storage::snapshot(const sstable& sst, sstring dir, absolute_
snapshot_dir = _dir.path() / dir;
}
co_await sst.sstable_touch_directory_io_check(snapshot_dir);
if (leave_unsealed) {
co_await create_links_common(sst, snapshot_dir, std::move(gen), leave_unsealed_tag{});
} else {
co_await create_links_common(sst, snapshot_dir, std::move(gen));
}
co_await create_links_common(sst, snapshot_dir, std::move(gen));
}
future<> filesystem_storage::move(const sstable& sst, sstring new_dir, generation_type new_generation, delayed_commit_changes* delay_commit) {
@@ -444,7 +413,7 @@ future<> filesystem_storage::move(const sstable& sst, sstring new_dir, generatio
sstring old_dir = _dir.native();
sstlog.debug("Moving {} old_generation={} to {} new_generation={} do_sync_dirs={}",
sst.get_filename(), sst._generation, new_dir, new_generation, delay_commit == nullptr);
co_await create_links_common(sst, new_dir, new_generation, mark_for_removal_tag{});
co_await create_links_common(sst, new_dir, new_generation, mark_for_removal::yes);
co_await change_dir(new_dir);
generation_type old_generation = sst._generation;
co_await coroutine::parallel_for_each(sst.all_components(), [&sst, old_generation, old_dir] (auto p) {
@@ -629,7 +598,7 @@ public:
{}
future<> seal(const sstable& sst) override;
future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type>, storage::leave_unsealed) const override;
future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type>) const override;
future<> change_state(const sstable& sst, sstable_state state, generation_type generation, delayed_commit_changes* delay) override;
// runs in async context
void open(sstable& sst) override;
@@ -701,15 +670,10 @@ void object_storage_base::open(sstable& sst) {
sst.manager().sstables_registry().create_entry(owner(), status_creating, sst._state, std::move(desc)).get();
memory_data_sink_buffers bufs;
sst.write_toc(
file_writer(
output_stream<char>(
data_sink(
std::make_unique<memory_data_sink>(bufs)
)
)
)
);
auto out = data_sink(std::make_unique<memory_data_sink>(bufs));
auto w = std::make_unique<crc32_digest_file_writer>(std::move(out), sst.sstable_buffer_size, component_name(sst, component_type::TOC));
sst.write_toc(std::move(w));
put_object(make_object_name(sst, component_type::TOC), std::move(bufs)).get();
}
@@ -846,7 +810,7 @@ future<> object_storage_base::unlink_component(const sstable& sst, component_typ
}
}
future<> object_storage_base::snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen, storage::leave_unsealed) const {
future<> object_storage_base::snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen) const {
// Snapshotting is not implemented for this storage backend: every call
// reports an internal error and none of the arguments are used.
on_internal_error(sstlog, "Snapshotting S3 objects not implemented");
// The co_return makes this function a coroutine returning future<>.
// NOTE(review): on_internal_error() presumably throws or aborts, in which
// case this statement is never executed - confirm.
co_return;
}

View File

@@ -97,10 +97,9 @@ public:
using absolute_path = bool_class<class absolute_path_tag>; // FIXME -- should go away eventually
using sync_dir = bool_class<struct sync_dir_tag>; // meaningful only to filesystem storage
using leave_unsealed = bool_class<struct leave_unsealed_tag>;
virtual future<> seal(const sstable& sst) = 0;
virtual future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen = {}, leave_unsealed lu = leave_unsealed::no) const = 0;
virtual future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen = {}) const = 0;
virtual future<> change_state(const sstable& sst, sstable_state to, generation_type generation, delayed_commit_changes* delay) = 0;
// runs in async context
virtual void open(sstable& sst) = 0;

View File

@@ -547,6 +547,7 @@ enum class scylla_metadata_type : uint32_t {
ExtTimestampStats = 9,
SSTableIdentifier = 10,
Schema = 11,
ComponentsDigests = 12,
};
// UUID is used for uniqueness across nodes, such that an imported sstable
@@ -573,6 +574,24 @@ struct sstable_identifier_type {
auto describe_type(sstable_version_types v, Describer f) { return f(value); }
};
// Component digests stored in scylla metadata to track integrity of individual components.
//
// Holds one 32-bit digest slot per sstable component. Every slot is a
// std::optional, so the digest of any given component may be absent.
// NOTE(review): presumably each value is a CRC32 of the whole component
// file - confirm against the code that fills this structure in.
struct components_digests {
std::optional<uint32_t> data_digest;
std::optional<uint32_t> compression_digest;
std::optional<uint32_t> filter_digest;
std::optional<uint32_t> statistics_digest;
std::optional<uint32_t> summary_digest;
std::optional<uint32_t> index_digest;
std::optional<uint32_t> toc_digest;
std::optional<uint32_t> partitions_digest;
std::optional<uint32_t> rows_digest;
// Passes all nine digest fields to the describer, in declaration order,
// and returns whatever the describer returns. The version argument is
// not used here.
// NOTE(review): the argument order presumably defines the serialized
// layout, so new fields should only be appended - confirm.
template <typename Describer>
auto describe_type(sstable_version_types v, Describer f) {
return f(data_digest,compression_digest, filter_digest, statistics_digest, summary_digest, index_digest, toc_digest, partitions_digest, rows_digest);
}
};
// Types of large data statistics.
//
// Note: For extensibility, never reuse an identifier,
@@ -656,7 +675,8 @@ struct scylla_metadata {
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ScyllaVersion, scylla_version>,
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ExtTimestampStats, ext_timestamp_stats>,
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::SSTableIdentifier, sstable_identifier>,
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Schema, sstable_schema>
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Schema, sstable_schema>,
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ComponentsDigests, components_digests>
> data;
sstable_enabled_features get_features() const {
@@ -691,6 +711,13 @@ struct scylla_metadata {
auto* sid = data.get<scylla_metadata_type::SSTableIdentifier, scylla_metadata::sstable_identifier>();
return sid ? sid->value : sstable_id::create_null_id();
}
// Returns a copy of the ComponentsDigests entry stored in `data`.
// When the metadata carries no such entry, a value-initialized
// components_digests is returned instead (all digest fields empty).
//
// The result is returned as a plain (non-const) value: a top-level const on
// a by-value return only prevents callers from moving out of the temporary.
components_digests get_components_digests() const {
    auto* cd = data.get<scylla_metadata_type::ComponentsDigests, components_digests>();
    return cd ? *cd : components_digests{};
}
// Hands the `data` member to the supplied describer callable and returns
// its result. The version argument is accepted but not used here.
template <typename Describer>
auto describe_type(sstable_version_types v, Describer f) {
    return f(data);
}

View File

@@ -65,7 +65,7 @@ serialized_size(sstable_version_types v, const T& object) {
return size;
}
template <typename ChecksumType>
template <typename ChecksumType, bool calculate_chunk_checksums>
requires ChecksumUtils<ChecksumType>
class checksummed_file_data_sink_impl : public data_sink_impl {
data_sink _out;
@@ -92,7 +92,9 @@ public:
per_chunk_checksum = ChecksumType::checksum(per_chunk_checksum, buf.begin() + offset, size);
_full_checksum = checksum_combine_or_feed<ChecksumType>(_full_checksum, per_chunk_checksum, buf.begin() + offset, size);
_c.checksums.push_back(per_chunk_checksum);
if constexpr (calculate_chunk_checksums) {
_c.checksums.push_back(per_chunk_checksum);
}
}
}
return _out.put(std::move(bufs));
@@ -112,29 +114,29 @@ public:
}
};
template <typename ChecksumType>
template <typename ChecksumType, bool calculate_chunk_checksums>
requires ChecksumUtils<ChecksumType>
class checksummed_file_data_sink : public data_sink {
public:
checksummed_file_data_sink(data_sink out, struct checksum& cinfo, uint32_t& full_file_checksum)
: data_sink(std::make_unique<checksummed_file_data_sink_impl<ChecksumType>>(std::move(out), cinfo, full_file_checksum)) {}
: data_sink(std::make_unique<checksummed_file_data_sink_impl<ChecksumType, calculate_chunk_checksums>>(std::move(out), cinfo, full_file_checksum)) {}
};
template <typename ChecksumType>
template <typename ChecksumType, bool calculate_chunk_checksums>
requires ChecksumUtils<ChecksumType>
inline
output_stream<char> make_checksummed_file_output_stream(data_sink out, struct checksum& cinfo, uint32_t& full_file_checksum) {
return output_stream<char>(checksummed_file_data_sink<ChecksumType>(std::move(out), cinfo, full_file_checksum));
return output_stream<char>(checksummed_file_data_sink<ChecksumType, calculate_chunk_checksums>(std::move(out), cinfo, full_file_checksum));
}
template <typename ChecksumType>
template <typename ChecksumType, bool calculate_chunk_checksums>
requires ChecksumUtils<ChecksumType>
class checksummed_file_writer : public file_writer {
checksum _c;
uint32_t _full_checksum;
public:
checksummed_file_writer(data_sink out, size_t buffer_size, component_name c)
: file_writer(make_checksummed_file_output_stream<ChecksumType>(std::move(out), _c, _full_checksum), std::move(c))
: file_writer(make_checksummed_file_output_stream<ChecksumType, calculate_chunk_checksums>(std::move(out), _c, _full_checksum), std::move(c))
, _c(uint32_t(std::min(size_t(DEFAULT_CHUNK_SIZE), buffer_size)), {})
, _full_checksum(ChecksumType::init_checksum()) {}
@@ -152,8 +154,10 @@ public:
}
};
using adler32_checksummed_file_writer = checksummed_file_writer<adler32_utils>;
using crc32_checksummed_file_writer = checksummed_file_writer<crc32_utils>;
using adler32_checksummed_file_writer = checksummed_file_writer<adler32_utils, true>;
using crc32_checksummed_file_writer = checksummed_file_writer<crc32_utils, true>;
using crc32_digest_file_writer = checksummed_file_writer<crc32_utils, false>;
template <typename T, typename W>
requires Writer<W>

View File

@@ -63,45 +63,30 @@ mutation_reader_consumer make_streaming_consumer(sstring origin,
}
schema_ptr s = reader.schema();
// SSTable will be only sealed when added to the sstable set, so we make sure unsplit sstables aren't
// left sealed on the table directory.
auto cfg = cf->get_sstables_manager().configure_writer(origin);
cfg.leave_unsealed = true;
return sst->write_components(std::move(reader), adjusted_estimated_partitions, s,
cfg, encoding_stats{}).then([sst] {
return sst->open_data();
}).then([cf, sst, offstrategy, origin, repaired_at, sstable_list_to_mark_as_repaired, frozen_guard, cfg] -> future<std::vector<sstables::shared_sstable>> {
auto on_add = [sst, origin, repaired_at, sstable_list_to_mark_as_repaired, frozen_guard, cfg] (sstables::shared_sstable loading_sst) -> future<> {
if (repaired_at && sstables::repair_origin == origin) {
loading_sst->being_repaired = frozen_guard;
if (sstable_list_to_mark_as_repaired) {
sstable_list_to_mark_as_repaired->insert(loading_sst);
}
}).then([cf, sst, offstrategy, origin, repaired_at, sstable_list_to_mark_as_repaired, frozen_guard] -> future<> {
if (repaired_at && sstables::repair_origin == origin) {
sst->being_repaired = frozen_guard;
if (sstable_list_to_mark_as_repaired) {
sstable_list_to_mark_as_repaired->insert(sst);
}
if (loading_sst == sst) {
co_await loading_sst->seal_sstable(cfg.backup);
}
co_return;
};
}
if (offstrategy && sstables::repair_origin == origin) {
sstables::sstlog.debug("Enabled automatic off-strategy trigger for table {}.{}",
cf->schema()->ks_name(), cf->schema()->cf_name());
cf->enable_off_strategy_trigger();
}
co_return co_await cf->add_new_sstable_and_update_cache(sst, on_add, offstrategy);
}).then([cf, s, sst, use_view_update_path, &vb, &vbw] (std::vector<sstables::shared_sstable> new_sstables) mutable -> future<> {
auto& vb_ = vb;
auto new_sstables_ = std::move(new_sstables);
auto table = cf;
co_await cf->add_sstable_and_update_cache(sst, offstrategy);
}).then([cf, s, sst, use_view_update_path, &vb, &vbw]() mutable -> future<> {
if (use_view_update_path == db::view::sstable_destination_decision::staging_managed_by_vbc) {
co_return co_await vbw.local().register_staging_sstable_tasks(new_sstables_, cf->schema()->id());
return vbw.local().register_staging_sstable_tasks({sst}, cf->schema()->id());
} else if (use_view_update_path == db::view::sstable_destination_decision::staging_directly_to_generator) {
co_await coroutine::parallel_for_each(new_sstables_, [&vb_, &table] (sstables::shared_sstable sst) -> future<> {
return vb_.local().register_staging_sstable(sst, table);
});
return vb.local().register_staging_sstable(sst, std::move(cf));
}
co_return;
return make_ready_future<>();
});
};
if (!offstrategy) {

Some files were not shown because too many files have changed in this diff Show More