mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-12 19:02:12 +00:00
Compare commits
2 Commits
copilot/fi
...
copilot/fi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9c401e260a | ||
|
|
1824b04e2a |
@@ -1,14 +0,0 @@
|
||||
name: Call Jira release creation for new milestone
|
||||
|
||||
on:
|
||||
milestone:
|
||||
types: [created]
|
||||
|
||||
jobs:
|
||||
sync-milestone-to-jira:
|
||||
uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
|
||||
with:
|
||||
# Comma-separated list of Jira project keys
|
||||
jira_project_keys: "SCYLLADB,CUSTOMER"
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
@@ -1,13 +0,0 @@
|
||||
name: validate_pr_author_email
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types:
|
||||
- opened
|
||||
- synchronize
|
||||
- reopened
|
||||
|
||||
jobs:
|
||||
validate_pr_author_email:
|
||||
uses: scylladb/github-automation/.github/workflows/validate_pr_author_email.yml@main
|
||||
|
||||
@@ -169,7 +169,7 @@ future<> controller::request_stop_server() {
|
||||
});
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> controller::get_client_data() {
|
||||
future<utils::chunked_vector<client_data>> controller::get_client_data() {
|
||||
return _server.local().get_client_data();
|
||||
}
|
||||
|
||||
|
||||
@@ -93,7 +93,7 @@ public:
|
||||
// This virtual function is called (on each shard separately) when the
|
||||
// virtual table "system.clients" is read. It is expected to generate a
|
||||
// list of clients connected to this server (on this shard).
|
||||
virtual future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data() override;
|
||||
virtual future<utils::chunked_vector<client_data>> get_client_data() override;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -708,12 +708,8 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
|
||||
// As long as the system_clients_entry object is alive, this request will
|
||||
// be visible in the "system.clients" virtual table. When requested, this
|
||||
// entry will be formatted by server::ongoing_request::make_client_data().
|
||||
auto user_agent_header = co_await _connection_options_keys_and_values.get_or_load(req->get_header("User-Agent"), [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
|
||||
auto system_clients_entry = _ongoing_requests.emplace(
|
||||
req->get_client_address(), std::move(user_agent_header),
|
||||
req->get_client_address(), req->get_header("User-Agent"),
|
||||
username, current_scheduling_group(),
|
||||
req->get_protocol_name() == "https");
|
||||
|
||||
@@ -989,10 +985,10 @@ client_data server::ongoing_request::make_client_data() const {
|
||||
return cd;
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> server::get_client_data() {
|
||||
utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>> ret;
|
||||
future<utils::chunked_vector<client_data>> server::get_client_data() {
|
||||
utils::chunked_vector<client_data> ret;
|
||||
co_await _ongoing_requests.for_each_gently([&ret] (const ongoing_request& r) {
|
||||
ret.emplace_back(make_foreign(std::make_unique<client_data>(r.make_client_data())));
|
||||
ret.emplace_back(r.make_client_data());
|
||||
});
|
||||
co_return ret;
|
||||
}
|
||||
|
||||
@@ -55,7 +55,6 @@ class server : public peering_sharded_service<server> {
|
||||
// though it isn't really relevant for Alternator which defines its own
|
||||
// timeouts separately. We can create this object only once.
|
||||
updateable_timeout_config _timeout_config;
|
||||
client_options_cache_type _connection_options_keys_and_values;
|
||||
|
||||
alternator_callbacks_map _callbacks;
|
||||
|
||||
@@ -89,7 +88,7 @@ class server : public peering_sharded_service<server> {
|
||||
// is called when reading the "system.clients" virtual table.
|
||||
struct ongoing_request {
|
||||
socket_address _client_address;
|
||||
client_options_cache_entry_type _user_agent;
|
||||
sstring _user_agent;
|
||||
sstring _username;
|
||||
scheduling_group _scheduling_group;
|
||||
bool _is_https;
|
||||
@@ -108,7 +107,7 @@ public:
|
||||
// table "system.clients" is read. It is expected to generate a list of
|
||||
// clients connected to this server (on this shard). This function is
|
||||
// called by alternator::controller::get_client_data().
|
||||
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data();
|
||||
future<utils::chunked_vector<client_data>> get_client_data();
|
||||
private:
|
||||
void set_routes(seastar::httpd::routes& r);
|
||||
// If verification succeeds, returns the authenticated user's username
|
||||
|
||||
@@ -31,7 +31,6 @@ set(swagger_files
|
||||
api-doc/column_family.json
|
||||
api-doc/commitlog.json
|
||||
api-doc/compaction_manager.json
|
||||
api-doc/client_routes.json
|
||||
api-doc/config.json
|
||||
api-doc/cql_server_test.json
|
||||
api-doc/endpoint_snitch_info.json
|
||||
@@ -69,7 +68,6 @@ target_sources(api
|
||||
PRIVATE
|
||||
api.cc
|
||||
cache_service.cc
|
||||
client_routes.cc
|
||||
collectd.cc
|
||||
column_family.cc
|
||||
commitlog.cc
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
, "client_routes_entry": {
|
||||
"id": "client_routes_entry",
|
||||
"summary": "An entry storing client routes",
|
||||
"properties": {
|
||||
"connection_id": {"type": "string"},
|
||||
"host_id": {"type": "string", "format": "uuid"},
|
||||
"address": {"type": "string"},
|
||||
"port": {"type": "integer"},
|
||||
"tls_port": {"type": "integer"},
|
||||
"alternator_port": {"type": "integer"},
|
||||
"alternator_https_port": {"type": "integer"}
|
||||
},
|
||||
"required": ["connection_id", "host_id", "address"]
|
||||
}
|
||||
, "client_routes_key": {
|
||||
"id": "client_routes_key",
|
||||
"summary": "A key of client_routes_entry",
|
||||
"properties": {
|
||||
"connection_id": {"type": "string"},
|
||||
"host_id": {"type": "string", "format": "uuid"}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,74 +0,0 @@
|
||||
, "/v2/client-routes":{
|
||||
"get": {
|
||||
"description":"List all client route entries",
|
||||
"operationId":"get_client_routes",
|
||||
"tags":["client_routes"],
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[],
|
||||
"responses":{
|
||||
"200":{
|
||||
"schema":{
|
||||
"type":"array",
|
||||
"items":{ "$ref":"#/definitions/client_routes_entry" }
|
||||
}
|
||||
},
|
||||
"default":{
|
||||
"description":"unexpected error",
|
||||
"schema":{"$ref":"#/definitions/ErrorModel"}
|
||||
}
|
||||
}
|
||||
},
|
||||
"post": {
|
||||
"description":"Upsert one or more client route entries",
|
||||
"operationId":"set_client_routes",
|
||||
"tags":["client_routes"],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"body",
|
||||
"in":"body",
|
||||
"required":true,
|
||||
"schema":{
|
||||
"type":"array",
|
||||
"items":{ "$ref":"#/definitions/client_routes_entry" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses":{
|
||||
"200":{ "description": "OK" },
|
||||
"default":{
|
||||
"description":"unexpected error",
|
||||
"schema":{ "$ref":"#/definitions/ErrorModel" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"delete": {
|
||||
"description":"Delete one or more client route entries",
|
||||
"operationId":"delete_client_routes",
|
||||
"tags":["client_routes"],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"body",
|
||||
"in":"body",
|
||||
"required":true,
|
||||
"schema":{
|
||||
"type":"array",
|
||||
"items":{ "$ref":"#/definitions/client_routes_key" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses":{
|
||||
"200":{
|
||||
"description": "OK"
|
||||
},
|
||||
"default":{
|
||||
"description":"unexpected error",
|
||||
"schema":{
|
||||
"$ref":"#/definitions/ErrorModel"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
13
api/api.cc
13
api/api.cc
@@ -37,7 +37,6 @@
|
||||
#include "raft.hh"
|
||||
#include "gms/gossip_address_map.hh"
|
||||
#include "service_levels.hh"
|
||||
#include "client_routes.hh"
|
||||
|
||||
logging::logger apilog("api");
|
||||
|
||||
@@ -68,11 +67,9 @@ future<> set_server_init(http_context& ctx) {
|
||||
rb02->set_api_doc(r);
|
||||
rb02->register_api_file(r, "swagger20_header");
|
||||
rb02->register_api_file(r, "metrics");
|
||||
rb02->register_api_file(r, "client_routes");
|
||||
rb->register_function(r, "system",
|
||||
"The system related API");
|
||||
rb02->add_definitions_file(r, "metrics");
|
||||
rb02->add_definitions_file(r, "client_routes");
|
||||
set_system(ctx, r);
|
||||
rb->register_function(r, "error_injection",
|
||||
"The error injection API");
|
||||
@@ -132,16 +129,6 @@ future<> unset_server_storage_service(http_context& ctx) {
|
||||
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_storage_service(ctx, r); });
|
||||
}
|
||||
|
||||
// Installs the client-routes REST handlers on the API HTTP server.
// The returned future is the one produced by http_server.set_routes().
// Both `ctx` and `cr` are captured by reference by the installer callback.
future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr) {
    auto install_handlers = [&ctx, &cr] (routes& r) {
        set_client_routes(ctx, r, cr);
    };
    return ctx.http_server.set_routes(install_handlers);
}
|
||||
|
||||
// Removes the client-routes REST handlers from the API HTTP server.
// The returned future is the one produced by http_server.set_routes().
future<> unset_server_client_routes(http_context& ctx) {
    auto remove_handlers = [&ctx] (routes& r) {
        unset_client_routes(ctx, r);
    };
    return ctx.http_server.set_routes(remove_handlers);
}
|
||||
|
||||
// Installs the load-meter REST handlers on the API HTTP server by delegating
// to the (ctx, routes, load_meter) overload from inside set_routes().
// `ctx` and `lm` are captured by reference by the installer callback.
future<> set_load_meter(http_context& ctx, service::load_meter& lm) {
    auto install_handlers = [&ctx, &lm] (routes& r) {
        set_load_meter(ctx, r, lm);
    };
    return ctx.http_server.set_routes(install_handlers);
}
|
||||
|
||||
@@ -29,7 +29,6 @@ class storage_proxy;
|
||||
class storage_service;
|
||||
class raft_group0_client;
|
||||
class raft_group_registry;
|
||||
class client_routes_service;
|
||||
|
||||
} // namespace service
|
||||
|
||||
@@ -100,8 +99,6 @@ future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snit
|
||||
future<> unset_server_snitch(http_context& ctx);
|
||||
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
|
||||
future<> unset_server_storage_service(http_context& ctx);
|
||||
future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr);
|
||||
future<> unset_server_client_routes(http_context& ctx);
|
||||
future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader);
|
||||
future<> unset_server_sstables_loader(http_context& ctx);
|
||||
future<> set_server_view_builder(http_context& ctx, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g);
|
||||
|
||||
@@ -1,176 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include <seastar/http/short_streams.hh>
|
||||
|
||||
#include "client_routes.hh"
|
||||
#include "api/api.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/client_routes.hh"
|
||||
#include "utils/rjson.hh"
|
||||
|
||||
|
||||
#include "api/api-doc/client_routes.json.hh"
|
||||
|
||||
using namespace seastar::httpd;
|
||||
using namespace std::chrono_literals;
|
||||
using namespace json;
|
||||
|
||||
extern logging::logger apilog;
|
||||
|
||||
namespace api {
|
||||
|
||||
// Guard shared by the client-routes REST handlers.
//
// Reads the `client_routes` feature flag through this shard's
// client_routes_service. If the flag is set the function returns normally;
// otherwise it logs a warning that names the endpoint and throws
// std::runtime_error.
static void validate_client_routes_endpoint(sharded<service::client_routes_service>& cr, sstring endpoint_name) {
    if (cr.local().get_feature_service().client_routes) {
        return;
    }

    apilog.warn("{}: called before the cluster feature was enabled", endpoint_name);
    throw std::runtime_error(fmt::format("{} requires all nodes to support the CLIENT_ROUTES cluster feature", endpoint_name));
}
|
||||
|
||||
// Extracts the mandatory string member `name` from the JSON value `v`.
//
// Throws bad_param_exception when the member is absent or is not a string.
// The result is built from the member's pointer and length, so the length
// reported by rapidjson (not a NUL terminator) decides where it ends.
static sstring parse_string(const char* name, rapidjson::Value const& v) {
    const auto member = v.FindMember(name);
    if (member == v.MemberEnd()) {
        throw bad_param_exception(fmt::format("Missing '{}'", name));
    }

    const auto& value = member->value;
    if (!value.IsString()) {
        throw bad_param_exception(fmt::format("'{}' must be a string", name));
    }

    return sstring(value.GetString(), value.GetStringLength());
}
|
||||
|
||||
static std::optional<uint32_t> parse_port(const char* name, rapidjson::Value const& v) {
|
||||
const auto it = v.FindMember(name);
|
||||
if (it == v.MemberEnd()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
if (!it->value.IsInt()) {
|
||||
throw bad_param_exception(fmt::format("'{}' must be an integer", name));
|
||||
}
|
||||
auto port = it->value.GetInt();
|
||||
if (port < 1 || port > 65535) {
|
||||
throw bad_param_exception(fmt::format("'{}' value={} is outside the allowed port range", name, port));
|
||||
}
|
||||
return port;
|
||||
}
|
||||
|
||||
// Converts the JSON body of an upsert request into client route entries.
//
// The document must be an array of objects. Each object must carry the
// string members "connection_id", "host_id" and "address", plus at least one
// of the port members "port", "tls_port", "alternator_port",
// "alternator_https_port". Any violation raises bad_param_exception (via this
// function, parse_port() or parse_string()).
static std::vector<service::client_routes_service::client_route_entry> parse_set_client_array(const rapidjson::Document& root) {
    if (!root.IsArray()) {
        throw bad_param_exception("Body must be a JSON array");
    }

    auto elements = root.GetArray();
    std::vector<service::client_routes_service::client_route_entry> entries;
    entries.reserve(elements.Size());

    for (const auto& item : elements) {
        if (!item.IsObject()) {
            throw bad_param_exception("Each element must be object");
        }

        // Ports are validated first, before the string members are looked at.
        const auto port = parse_port("port", item);
        const auto tls_port = parse_port("tls_port", item);
        const auto alternator_port = parse_port("alternator_port", item);
        const auto alternator_https_port = parse_port("alternator_https_port", item);

        const bool any_port_given = port.has_value()
                || tls_port.has_value()
                || alternator_port.has_value()
                || alternator_https_port.has_value();
        if (!any_port_given) {
            throw bad_param_exception("At least one port field ('port', 'tls_port', 'alternator_port', 'alternator_https_port') must be specified");
        }

        entries.emplace_back(
            parse_string("connection_id", item),
            utils::UUID{parse_string("host_id", item)},
            parse_string("address", item),
            port,
            tls_port,
            alternator_port,
            alternator_https_port
        );
    }

    return entries;
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_set_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
|
||||
validate_client_routes_endpoint(cr, "rest_set_client_routes");
|
||||
|
||||
rapidjson::Document root;
|
||||
auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
|
||||
root.Parse(content.c_str());
|
||||
|
||||
co_await cr.local().set_client_routes(parse_set_client_array(root));
|
||||
co_return seastar::json::json_void();
|
||||
}
|
||||
|
||||
// Converts the JSON body of a delete request into client route keys.
//
// The document must be an array of objects, each carrying the string members
// "connection_id" and "host_id". Any violation raises bad_param_exception
// (via this function or parse_string()).
static std::vector<service::client_routes_service::client_route_key> parse_delete_client_array(const rapidjson::Document& root) {
    if (!root.IsArray()) {
        throw bad_param_exception("Body must be a JSON array");
    }

    std::vector<service::client_routes_service::client_route_key> v;
    v.reserve(root.GetArray().Size());
    for (const auto& element : root.GetArray()) {
        // parse_string() performs a member lookup, which is only meaningful on
        // a JSON object. Reject other element kinds up front, exactly as
        // parse_set_client_array() does.
        if (!element.IsObject()) {
            throw bad_param_exception("Each element must be object");
        }

        v.emplace_back(
            parse_string("connection_id", element),
            utils::UUID{parse_string("host_id", element)}
        );
    }

    return v;
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_delete_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
|
||||
validate_client_routes_endpoint(cr, "delete_client_routes");
|
||||
|
||||
rapidjson::Document root;
|
||||
auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
|
||||
root.Parse(content.c_str());
|
||||
|
||||
co_await cr.local().delete_client_routes(parse_delete_client_array(root));
|
||||
co_return seastar::json::json_void();
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_get_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
|
||||
validate_client_routes_endpoint(cr, "get_client_routes");
|
||||
|
||||
co_return co_await cr.invoke_on(0, [] (service::client_routes_service& cr) -> future<json::json_return_type> {
|
||||
co_return json::json_return_type(stream_range_as_array(co_await cr.get_client_routes(), [](const service::client_routes_service::client_route_entry & entry) {
|
||||
seastar::httpd::client_routes_json::client_routes_entry obj;
|
||||
obj.connection_id = entry.connection_id;
|
||||
obj.host_id = fmt::to_string(entry.host_id);
|
||||
obj.address = entry.address;
|
||||
if (entry.port.has_value()) { obj.port = entry.port.value(); }
|
||||
if (entry.tls_port.has_value()) { obj.tls_port = entry.tls_port.value(); }
|
||||
if (entry.alternator_port.has_value()) { obj.alternator_port = entry.alternator_port.value(); }
|
||||
if (entry.alternator_https_port.has_value()) { obj.alternator_https_port = entry.alternator_https_port.value(); }
|
||||
return obj;
|
||||
}));
|
||||
});
|
||||
}
|
||||
|
||||
// Binds the three client-routes operations (set, delete, get) declared in the
// generated client_routes_json namespace to their handlers on `r`.
// The handlers capture `ctx` and `cr` by reference.
void set_client_routes(http_context& ctx, routes& r, sharded<service::client_routes_service>& cr) {
    namespace crj = seastar::httpd::client_routes_json;
    using request_ptr = std::unique_ptr<seastar::http::request>;

    crj::set_client_routes.set(r, [&ctx, &cr] (request_ptr req) {
        return rest_set_client_routes(ctx, cr, std::move(req));
    });
    crj::delete_client_routes.set(r, [&ctx, &cr] (request_ptr req) {
        return rest_delete_client_routes(ctx, cr, std::move(req));
    });
    crj::get_client_routes.set(r, [&ctx, &cr] (request_ptr req) {
        return rest_get_client_routes(ctx, cr, std::move(req));
    });
}
|
||||
|
||||
// Unbinds the three client-routes operations (set, delete, get) from `r`,
// in the same order in which set_client_routes() binds them.
// `ctx` is not used.
void unset_client_routes(http_context& ctx, routes& r) {
    namespace crj = seastar::httpd::client_routes_json;

    crj::set_client_routes.unset(r);
    crj::delete_client_routes.unset(r);
    crj::get_client_routes.unset(r);
}
|
||||
|
||||
}
|
||||
@@ -1,20 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include <seastar/json/json_elements.hh>
|
||||
#include "api/api_init.hh"
|
||||
|
||||
namespace api {
|
||||
|
||||
void set_client_routes(http_context& ctx, httpd::routes& r, sharded<service::client_routes_service>& cr);
|
||||
void unset_client_routes(http_context& ctx, httpd::routes& r);
|
||||
|
||||
}
|
||||
@@ -547,13 +547,17 @@ void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_build
|
||||
vp.insert(b.second);
|
||||
}
|
||||
}
|
||||
std::vector<sstring> res;
|
||||
replica::database& db = vb.local().get_db();
|
||||
auto uuid = validate_table(db, ks, cf_name);
|
||||
replica::column_family& cf = db.find_column_family(uuid);
|
||||
co_return cf.get_index_manager().list_indexes()
|
||||
| std::views::transform([] (const auto& i) { return i.metadata().name(); })
|
||||
| std::views::filter([&vp] (const auto& n) { return vp.contains(secondary_index::index_table_name(n)); })
|
||||
| std::ranges::to<std::vector>();
|
||||
res.reserve(cf.get_index_manager().list_indexes().size());
|
||||
for (auto&& i : cf.get_index_manager().list_indexes()) {
|
||||
if (vp.contains(secondary_index::index_table_name(i.metadata().name()))) {
|
||||
res.emplace_back(i.metadata().name());
|
||||
}
|
||||
}
|
||||
co_return res;
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
@@ -10,9 +10,7 @@
|
||||
#include <seastar/net/inet_address.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include "seastarx.hh"
|
||||
#include "utils/loading_shared_values.hh"
|
||||
|
||||
#include <list>
|
||||
#include <optional>
|
||||
|
||||
enum class client_type {
|
||||
@@ -29,20 +27,6 @@ enum class client_connection_stage {
|
||||
ready,
|
||||
};
|
||||
|
||||
// We implement a keys cache using a map-like utils::loading_shared_values container by storing empty values.
|
||||
struct options_cache_value_type {};
|
||||
using client_options_cache_type = utils::loading_shared_values<sstring, options_cache_value_type>;
|
||||
using client_options_cache_entry_type = client_options_cache_type::entry_ptr;
|
||||
using client_options_cache_key_type = client_options_cache_type::key_type;
|
||||
|
||||
// This struct represents a single OPTION key-value pair from the client's connection options.
|
||||
// Both key and value are represented by corresponding "references" to their cached values.
|
||||
// Each "reference" is effectively a lw_shared_ptr value.
|
||||
struct client_option_key_value_cached_entry {
|
||||
client_options_cache_entry_type key;
|
||||
client_options_cache_entry_type value;
|
||||
};
|
||||
|
||||
sstring to_string(client_connection_stage ct);
|
||||
|
||||
// Representation of a row in `system.clients'. std::optionals are for nullable cells.
|
||||
@@ -53,8 +37,8 @@ struct client_data {
|
||||
client_connection_stage connection_stage = client_connection_stage::established;
|
||||
int32_t shard_id; /// ID of server-side shard which is processing the connection.
|
||||
|
||||
std::optional<client_options_cache_entry_type> driver_name;
|
||||
std::optional<client_options_cache_entry_type> driver_version;
|
||||
std::optional<sstring> driver_name;
|
||||
std::optional<sstring> driver_version;
|
||||
std::optional<sstring> hostname;
|
||||
std::optional<int32_t> protocol_version;
|
||||
std::optional<sstring> ssl_cipher_suite;
|
||||
@@ -62,7 +46,6 @@ struct client_data {
|
||||
std::optional<sstring> ssl_protocol;
|
||||
std::optional<sstring> username;
|
||||
std::optional<sstring> scheduling_group_name;
|
||||
std::list<client_option_key_value_cached_entry> client_options;
|
||||
|
||||
sstring stage_str() const { return to_string(connection_stage); }
|
||||
sstring client_type_str() const { return to_string(ct); }
|
||||
|
||||
@@ -125,6 +125,10 @@ if(target_arch)
|
||||
add_compile_options("-march=${target_arch}")
|
||||
endif()
|
||||
|
||||
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
add_compile_options("SHELL:-Xclang -fexperimental-assignment-tracking=disabled")
|
||||
endif()
|
||||
|
||||
function(maybe_limit_stack_usage_in_KB stack_usage_threshold_in_KB config)
|
||||
math(EXPR _stack_usage_threshold_in_bytes "${stack_usage_threshold_in_KB} * 1024")
|
||||
set(_stack_usage_threshold_flag "-Wstack-usage=${_stack_usage_threshold_in_bytes}")
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
#include <seastar/core/condition-variable.hh>
|
||||
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "sstables/open_info.hh"
|
||||
#include "compaction_descriptor.hh"
|
||||
|
||||
class reader_permit;
|
||||
@@ -45,7 +44,7 @@ public:
|
||||
virtual compaction_strategy_state& get_compaction_strategy_state() noexcept = 0;
|
||||
virtual reader_permit make_compaction_reader_permit() const = 0;
|
||||
virtual sstables::sstables_manager& get_sstables_manager() noexcept = 0;
|
||||
virtual sstables::shared_sstable make_sstable(sstables::sstable_state) const = 0;
|
||||
virtual sstables::shared_sstable make_sstable() const = 0;
|
||||
virtual sstables::sstable_writer_config configure_writer(sstring origin) const = 0;
|
||||
virtual api::timestamp_type min_memtable_timestamp() const = 0;
|
||||
virtual api::timestamp_type min_memtable_live_timestamp() const = 0;
|
||||
|
||||
@@ -416,9 +416,7 @@ future<compaction_result> compaction_task_executor::compact_sstables(compaction_
|
||||
descriptor.enable_garbage_collection(co_await sstable_set_for_tombstone_gc(t));
|
||||
}
|
||||
descriptor.creator = [&t] (shard_id) {
|
||||
// All compaction types going through this path will work on normal input sstables only.
|
||||
// Off-strategy, for example, waits until the sstables move out of staging state.
|
||||
return t.make_sstable(sstables::sstable_state::normal);
|
||||
return t.make_sstable();
|
||||
};
|
||||
descriptor.replacer = [this, &t, &on_replace, offstrategy] (compaction_completion_desc desc) {
|
||||
t.get_compaction_strategy().notify_completion(t, desc.old_sstables, desc.new_sstables);
|
||||
@@ -1849,10 +1847,6 @@ protected:
|
||||
throw make_compaction_stopped_exception();
|
||||
}
|
||||
}, false);
|
||||
if (utils::get_local_injector().is_enabled("split_sstable_force_stop_exception")) {
|
||||
throw make_compaction_stopped_exception();
|
||||
}
|
||||
|
||||
co_return co_await do_rewrite_sstable(std::move(sst));
|
||||
}
|
||||
};
|
||||
@@ -2290,16 +2284,12 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
|
||||
compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
|
||||
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
|
||||
co_return std::vector<sstables::shared_sstable>{sst};
|
||||
}
|
||||
// Throw an error if split cannot be performed due to e.g. out of space prevention.
|
||||
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
|
||||
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
|
||||
if (is_disabled()) {
|
||||
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
|
||||
"reason might be out of space prevention", sst->get_filename()))));
|
||||
if (!can_proceed(&t)) {
|
||||
co_return std::vector<sstables::shared_sstable>{sst};
|
||||
}
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
|
||||
@@ -2307,11 +2297,8 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
|
||||
compaction_progress_monitor monitor;
|
||||
compaction_data info = create_compaction_data();
|
||||
compaction_descriptor desc = split_compaction_task_executor::make_descriptor(sst, opt);
|
||||
desc.creator = [&t, sst] (shard_id _) {
|
||||
// NOTE: preserves the sstable state, since we want the output to be on the same state as the original.
|
||||
// For example, if base table has views, it's important that sstable produced by repair will be
|
||||
// in the staging state.
|
||||
return t.make_sstable(sst->state());
|
||||
desc.creator = [&t] (shard_id _) {
|
||||
return t.make_sstable();
|
||||
};
|
||||
desc.replacer = [&] (compaction_completion_desc d) {
|
||||
std::move(d.new_sstables.begin(), d.new_sstables.end(), std::back_inserter(ret));
|
||||
|
||||
@@ -376,8 +376,7 @@ public:
|
||||
// Splits a single SSTable by segregating all its data according to the classifier.
|
||||
// If SSTable doesn't need split, the same input SSTable is returned as output.
|
||||
// If SSTable needs split, then output SSTables are returned and the input SSTable is deleted.
|
||||
// Exception is thrown if the input sstable cannot be split due to e.g. out of space prevention.
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt);
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt);
|
||||
|
||||
// Run a custom job for a given table, defined by a function
|
||||
// it completes when future returned by job is ready or returns immediately
|
||||
|
||||
26
configure.py
26
configure.py
@@ -1158,7 +1158,6 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'locator/topology.cc',
|
||||
'locator/util.cc',
|
||||
'service/client_state.cc',
|
||||
'service/client_routes.cc',
|
||||
'service/storage_service.cc',
|
||||
'service/session.cc',
|
||||
'service/task_manager_module.cc',
|
||||
@@ -1319,8 +1318,6 @@ api = ['api/api.cc',
|
||||
'api/storage_proxy.cc',
|
||||
Json2Code('api/api-doc/cache_service.json'),
|
||||
'api/cache_service.cc',
|
||||
Json2Code('api/api-doc/client_routes.json'),
|
||||
'api/client_routes.cc',
|
||||
Json2Code('api/api-doc/collectd.json'),
|
||||
'api/collectd.cc',
|
||||
Json2Code('api/api-doc/endpoint_snitch_info.json'),
|
||||
@@ -1698,18 +1695,6 @@ deps['test/vector_search/vector_store_client_test'] = ['test/vector_search/vect
|
||||
deps['test/vector_search/load_balancer_test'] = ['test/vector_search/load_balancer_test.cc'] + scylla_tests_dependencies
|
||||
deps['test/vector_search/client_test'] = ['test/vector_search/client_test.cc'] + scylla_tests_dependencies
|
||||
|
||||
boost_tests_prefixes = ["test/boost/", "test/vector_search/", "test/raft/", "test/manual/", "test/ldap/"]
|
||||
|
||||
# We need to link these files to all Boost tests to make sure that
|
||||
# we can execute `--list_json_content` on them. That will produce
|
||||
# a similar result as calling `--list_content={HRF,DOT}`.
|
||||
# Unfortunately, to be able to do that, we're forced to link the
|
||||
# relevant code by hand.
|
||||
for key in deps.keys():
|
||||
for prefix in boost_tests_prefixes:
|
||||
if key.startswith(prefix):
|
||||
deps[key] += ["test/lib/boost_tree_lister_injector.cc", "test/lib/boost_test_tree_lister.cc"]
|
||||
|
||||
wasm_deps = {}
|
||||
|
||||
wasm_deps['wasm/return_input.wat'] = 'test/resource/wasm/rust/return_input.rs'
|
||||
@@ -2207,6 +2192,8 @@ def kmiplib():
|
||||
for id in os_ids:
|
||||
if id in { 'centos', 'fedora', 'rhel' }:
|
||||
return 'rhel84'
|
||||
elif id in { 'ubuntu', 'debian' }:
|
||||
return 'ubuntu' # Temporarily use a placeholder for Ubuntu/Debian
|
||||
print('Could not resolve libkmip.a for platform {}'.format(os_ids))
|
||||
sys.exit(1)
|
||||
|
||||
@@ -2263,6 +2250,15 @@ def get_extra_cxxflags(mode, mode_config, cxx, debuginfo):
|
||||
if debuginfo and mode_config['can_have_debug_info']:
|
||||
cxxflags += ['-g', '-gz']
|
||||
|
||||
if 'clang' in cxx:
|
||||
# Since AssignmentTracking was enabled by default in clang
|
||||
# (llvm/llvm-project@de6da6ad55d3ca945195d1cb109cb8efdf40a52a)
|
||||
# coroutine frame debugging info (`coro_frame_ty`) is broken.
|
||||
#
|
||||
# It seems that we aren't losing much by disabling AssignmentTracking,
|
||||
# so for now we choose to disable it to get `coro_frame_ty` back.
|
||||
cxxflags.append('-Xclang -fexperimental-assignment-tracking=disabled')
|
||||
|
||||
return cxxflags
|
||||
|
||||
|
||||
|
||||
@@ -64,10 +64,6 @@ bool query_processor::topology_global_queue_empty() {
|
||||
return remote().first.get().ss.topology_global_queue_empty();
|
||||
}
|
||||
|
||||
// Forwards the question "is an RF change ongoing for keyspace `ks`?" to the
// `ss` member reached through remote(). The object returned by remote() is
// kept alive until the call has returned its future.
future<bool> query_processor::ongoing_rf_change(const service::group0_guard& guard, sstring ks) {
    auto remote_access = remote();
    return remote_access.first.get().ss.ongoing_rf_change(guard, std::move(ks));
}
|
||||
|
||||
// Builds the query_state used for internal calls: the client state comes from
// client_state::for_internal_calls() and the permit from
// empty_service_permit().
static service::query_state query_state_for_internal_call() {
    return service::query_state{
        service::client_state::for_internal_calls(),
        empty_service_permit()
    };
}
|
||||
|
||||
@@ -474,7 +474,6 @@ public:
|
||||
void reset_cache();
|
||||
|
||||
bool topology_global_queue_empty();
|
||||
future<bool> ongoing_rf_change(const service::group0_guard& guard, sstring ks);
|
||||
|
||||
query_options make_internal_options(
|
||||
const statements::prepared_statement::checked_weak_ptr& p,
|
||||
|
||||
@@ -19,7 +19,6 @@
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
#include "prepared_statement.hh"
|
||||
#include "seastar/coroutine/exception.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "service/topology_mutation.hh"
|
||||
@@ -139,7 +138,6 @@ bool cql3::statements::alter_keyspace_statement::changes_tablets(query_processor
|
||||
future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>
|
||||
cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_processor& qp, service::query_state& state, const query_options& options, service::group0_batch& mc) const {
|
||||
using namespace cql_transport;
|
||||
bool unknown_keyspace = false;
|
||||
try {
|
||||
event::schema_change::target_type target_type = event::schema_change::target_type::KEYSPACE;
|
||||
auto ks = qp.db().find_keyspace(_name);
|
||||
@@ -160,12 +158,8 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
|
||||
// when in reality nothing or only schema is being changed
|
||||
if (changes_tablets(qp)) {
|
||||
if (!qp.proxy().features().topology_global_request_queue && !qp.topology_global_queue_empty()) {
|
||||
co_await coroutine::return_exception(
|
||||
exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
|
||||
}
|
||||
if (qp.proxy().features().rack_list_rf && co_await qp.ongoing_rf_change(mc.guard(),_name)) {
|
||||
co_await coroutine::return_exception(
|
||||
exceptions::invalid_request_exception(format("Another RF change for this keyspace {} ongoing, please retry.", _name)));
|
||||
return make_exception_future<std::tuple<::shared_ptr<::cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(
|
||||
exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
|
||||
}
|
||||
qp.db().real_database().validate_keyspace_update(*ks_md_update);
|
||||
|
||||
@@ -248,15 +242,10 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
|
||||
target_type,
|
||||
keyspace());
|
||||
mc.add_mutations(std::move(muts), "CQL alter keyspace");
|
||||
co_return std::make_tuple(std::move(ret), warnings);
|
||||
return make_ready_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(std::make_tuple(std::move(ret), warnings));
|
||||
} catch (data_dictionary::no_such_keyspace& e) {
|
||||
unknown_keyspace = true;
|
||||
return make_exception_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(exceptions::invalid_request_exception("Unknown keyspace " + _name));
|
||||
}
|
||||
if (unknown_keyspace) {
|
||||
co_await coroutine::return_exception(
|
||||
exceptions::invalid_request_exception("Unknown keyspace " + _name));
|
||||
}
|
||||
std::unreachable();
|
||||
}
|
||||
|
||||
std::unique_ptr<cql3::statements::prepared_statement>
|
||||
|
||||
@@ -61,7 +61,7 @@ expand_to_racks(const locator::token_metadata& tm,
|
||||
|
||||
// Handle ALTER:
|
||||
// ([]|0) -> numeric is allowed, there are no existing replicas
|
||||
// numeric -> numeric' is not supported unless numeric == numeric'. User should convert RF to rack list of equal count first.
|
||||
// numeric -> numeric' is not supported. User should convert RF to rack list of equal count first.
|
||||
// rack_list -> len(rack_list) is allowed (no-op)
|
||||
// rack_list -> numeric is not allowed
|
||||
if (old_options.contains(dc)) {
|
||||
@@ -75,8 +75,6 @@ expand_to_racks(const locator::token_metadata& tm,
|
||||
"Cannot change replication factor for '{}' from {} to numeric {}, use rack list instead",
|
||||
dc, old_rf_val, data.count()));
|
||||
}
|
||||
} else if (old_rf.count() == data.count()) {
|
||||
return rf;
|
||||
} else if (old_rf.count() > 0) {
|
||||
throw exceptions::configuration_exception(fmt::format(
|
||||
"Cannot change replication factor for '{}' from {} to {}, only rack list is allowed",
|
||||
@@ -155,8 +153,6 @@ static locator::replication_strategy_config_options prepare_options(
|
||||
}
|
||||
|
||||
// Validate options.
|
||||
bool numeric_to_rack_list_transition = false;
|
||||
bool rf_change = false;
|
||||
for (auto&& [dc, opt] : options) {
|
||||
locator::replication_factor_data rf(opt);
|
||||
|
||||
@@ -166,7 +162,6 @@ static locator::replication_strategy_config_options prepare_options(
|
||||
old_rf = locator::replication_factor_data(i->second);
|
||||
}
|
||||
|
||||
rf_change = rf_change || (old_rf && old_rf->count() != rf.count()) || (!old_rf && rf.count() != 0);
|
||||
if (!rf.is_rack_based()) {
|
||||
if (old_rf && old_rf->is_rack_based() && rf.count() != 0) {
|
||||
if (old_rf->count() != rf.count()) {
|
||||
@@ -192,11 +187,12 @@ static locator::replication_strategy_config_options prepare_options(
|
||||
throw exceptions::configuration_exception(fmt::format(
|
||||
"Rack list for '{}' contains duplicate entries", dc));
|
||||
}
|
||||
numeric_to_rack_list_transition = numeric_to_rack_list_transition || (old_rf && !old_rf->is_rack_based() && old_rf->count() != 0);
|
||||
}
|
||||
|
||||
if (numeric_to_rack_list_transition && rf_change) {
|
||||
throw exceptions::configuration_exception("Cannot change replication factor from numeric to rack list and rf value at the same time");
|
||||
if (old_rf && !old_rf->is_rack_based() && old_rf->count() != 0) {
|
||||
// FIXME: Allow this if replicas already conform to the given rack list.
|
||||
// FIXME: Implement automatic colocation to allow transition to rack list.
|
||||
throw exceptions::configuration_exception(fmt::format(
|
||||
"Cannot change replication factor from numeric to rack list for '{}'", dc));
|
||||
}
|
||||
}
|
||||
|
||||
if (!rf && options.empty() && old_options.empty()) {
|
||||
@@ -416,7 +412,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(s
|
||||
? std::optional<unsigned>(0) : std::nullopt;
|
||||
auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
|
||||
bool uses_tablets = initial_tablets.has_value();
|
||||
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
|
||||
bool rack_list_enabled = feat.rack_list_rf;
|
||||
auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
|
||||
return data_dictionary::keyspace_metadata::new_keyspace(ks_name, sc,
|
||||
std::move(options), initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
|
||||
@@ -432,7 +428,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
|
||||
throw exceptions::invalid_request_exception("Cannot alter replication strategy vnode/tablets flavor");
|
||||
}
|
||||
auto sc = get_replication_strategy_class();
|
||||
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
|
||||
bool rack_list_enabled = feat.rack_list_rf;
|
||||
if (sc) {
|
||||
options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
|
||||
} else {
|
||||
|
||||
@@ -248,7 +248,7 @@ future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
|
||||
// which is larger than the segment ID of the RP of the last written hint.
|
||||
cfg.base_segment_id = _last_written_rp.base_id();
|
||||
|
||||
return commitlog::create_commitlog(std::move(cfg)).then([this] (this auto, commitlog l) -> future<commitlog> {
|
||||
return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) -> future<commitlog> {
|
||||
// add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
|
||||
// When this happens we want to refill _sender's segments only if it has finished with the segments it had before.
|
||||
if (_sender.have_segments()) {
|
||||
|
||||
@@ -135,5 +135,5 @@ const std::string db::object_storage_endpoint_param::gs_type = "gs";
|
||||
|
||||
auto fmt::formatter<db::object_storage_endpoint_param>::format(const db::object_storage_endpoint_param& e, fmt::format_context& ctx) const
|
||||
-> decltype(ctx.out()) {
|
||||
return fmt::format_to(ctx.out(), "object_storage_endpoint_param{}", e.to_json_string());
|
||||
return fmt::format_to(ctx.out(), "object_storage_endpoint_param{{}}", e.to_json_string());
|
||||
}
|
||||
|
||||
@@ -110,7 +110,6 @@ namespace {
|
||||
system_keyspace::v3::CDC_LOCAL,
|
||||
system_keyspace::DICTS,
|
||||
system_keyspace::VIEW_BUILDING_TASKS,
|
||||
system_keyspace::CLIENT_ROUTES,
|
||||
};
|
||||
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
|
||||
props.enable_schema_commitlog();
|
||||
@@ -138,7 +137,6 @@ namespace {
|
||||
system_keyspace::ROLE_PERMISSIONS,
|
||||
system_keyspace::DICTS,
|
||||
system_keyspace::VIEW_BUILDING_TASKS,
|
||||
system_keyspace::CLIENT_ROUTES,
|
||||
};
|
||||
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
|
||||
props.is_group0_table = true;
|
||||
@@ -311,7 +309,6 @@ schema_ptr system_keyspace::topology() {
|
||||
.with_column("tablet_balancing_enabled", boolean_type, column_kind::static_column)
|
||||
.with_column("upgrade_state", utf8_type, column_kind::static_column)
|
||||
.with_column("global_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.with_column("paused_rf_change_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.set_comment("Current state of topology change machine")
|
||||
.with_hash_version()
|
||||
.build();
|
||||
@@ -1418,23 +1415,6 @@ schema_ptr system_keyspace::view_building_tasks() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::client_routes() {
|
||||
static thread_local auto schema = [] {
|
||||
auto id = generate_legacy_id(NAME, CLIENT_ROUTES);
|
||||
return schema_builder(NAME, CLIENT_ROUTES, std::make_optional(id))
|
||||
.with_column("connection_id", utf8_type, column_kind::partition_key)
|
||||
.with_column("host_id", uuid_type, column_kind::clustering_key)
|
||||
.with_column("address", utf8_type)
|
||||
.with_column("port", int32_type)
|
||||
.with_column("tls_port", int32_type)
|
||||
.with_column("alternator_port", int32_type)
|
||||
.with_column("alternator_https_port", int32_type)
|
||||
.with_hash_version()
|
||||
.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
future<system_keyspace::local_info> system_keyspace::load_local_info() {
|
||||
auto msg = co_await execute_cql(format("SELECT host_id, cluster_name, data_center, rack FROM system.{} WHERE key=?", LOCAL), sstring(LOCAL));
|
||||
|
||||
@@ -2362,7 +2342,7 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
|
||||
v3::cdc_local(),
|
||||
raft(), raft_snapshots(), raft_snapshot_config(), group0_history(), discovery(),
|
||||
topology(), cdc_generations_v3(), topology_requests(), service_levels_v2(), view_build_status_v2(),
|
||||
dicts(), view_building_tasks(), client_routes(), cdc_streams_state(), cdc_streams_history()
|
||||
dicts(), view_building_tasks(), cdc_streams_state(), cdc_streams_history()
|
||||
});
|
||||
|
||||
if (cfg.check_experimental(db::experimental_features_t::feature::BROADCAST_TABLES)) {
|
||||
@@ -3157,10 +3137,7 @@ static bool must_have_tokens(service::node_state nst) {
|
||||
// A decommissioning node doesn't have tokens at the end, they are
|
||||
// removed during transition to the left_token_ring state.
|
||||
case service::node_state::decommissioning: return false;
|
||||
// A removing node might or might not have tokens depending on whether
|
||||
// REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled. To support both
|
||||
// cases, we allow removing nodes to not have tokens.
|
||||
case service::node_state::removing: return false;
|
||||
case service::node_state::removing: return true;
|
||||
case service::node_state::rebuilding: return true;
|
||||
case service::node_state::normal: return true;
|
||||
case service::node_state::left: return false;
|
||||
@@ -3400,12 +3377,6 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("paused_rf_change_requests")) {
|
||||
for (auto&& v : deserialize_set_column(*topology(), some_row, "paused_rf_change_requests")) {
|
||||
ret.paused_rf_change_requests.insert(value_cast<utils::UUID>(v));
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("enabled_features")) {
|
||||
ret.enabled_features = decode_features(deserialize_set_column(*topology(), some_row, "enabled_features"));
|
||||
}
|
||||
@@ -3617,43 +3588,35 @@ system_keyspace::topology_requests_entry system_keyspace::topology_request_row_t
|
||||
return entry;
|
||||
}
|
||||
|
||||
future<system_keyspace::topology_requests_entry> system_keyspace::get_topology_request_entry(utils::UUID id) {
|
||||
auto r = co_await get_topology_request_entry_opt(id);
|
||||
if (!r) {
|
||||
on_internal_error(slogger, format("no entry for request id {}", id));
|
||||
}
|
||||
co_return std::move(*r);
|
||||
}
|
||||
|
||||
future<std::optional<system_keyspace::topology_requests_entry>> system_keyspace::get_topology_request_entry_opt(utils::UUID id) {
|
||||
future<system_keyspace::topology_requests_entry> system_keyspace::get_topology_request_entry(utils::UUID id, bool require_entry) {
|
||||
auto rs = co_await execute_cql(
|
||||
format("SELECT * FROM system.{} WHERE id = {}", TOPOLOGY_REQUESTS, id));
|
||||
|
||||
if (!rs || rs->empty()) {
|
||||
co_return std::nullopt;
|
||||
if (require_entry) {
|
||||
on_internal_error(slogger, format("no entry for request id {}", id));
|
||||
} else {
|
||||
co_return topology_requests_entry{
|
||||
.id = utils::null_uuid()
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
const auto& row = rs->one();
|
||||
co_return topology_request_row_to_entry(id, row);
|
||||
}
|
||||
|
||||
future<system_keyspace::topology_requests_entries> system_keyspace::get_topology_request_entries(std::vector<std::variant<service::topology_request, service::global_topology_request>> request_types, db_clock::time_point end_time_limit) {
|
||||
sstring request_types_str = "";
|
||||
bool first = true;
|
||||
for (const auto& rt : request_types) {
|
||||
if (!std::exchange(first, false)) {
|
||||
request_types_str += ", ";
|
||||
}
|
||||
request_types_str += std::visit([] (auto&& arg) { return fmt::format("'{}'", arg); }, rt);
|
||||
}
|
||||
|
||||
future<system_keyspace::topology_requests_entries> system_keyspace::get_node_ops_request_entries(db_clock::time_point end_time_limit) {
|
||||
// Running requests.
|
||||
auto rs_running = co_await execute_cql(
|
||||
format("SELECT * FROM system.{} WHERE done = false AND request_type IN ({}) ALLOW FILTERING", TOPOLOGY_REQUESTS, request_types_str));
|
||||
format("SELECT * FROM system.{} WHERE done = false AND request_type IN ('{}', '{}', '{}', '{}', '{}') ALLOW FILTERING", TOPOLOGY_REQUESTS,
|
||||
service::topology_request::join, service::topology_request::replace, service::topology_request::rebuild, service::topology_request::leave, service::topology_request::remove));
|
||||
|
||||
|
||||
// Requests which finished after end_time_limit.
|
||||
auto rs_done = co_await execute_cql(
|
||||
format("SELECT * FROM system.{} WHERE end_time > {} AND request_type IN ({}) ALLOW FILTERING", TOPOLOGY_REQUESTS, end_time_limit.time_since_epoch().count(), request_types_str));
|
||||
format("SELECT * FROM system.{} WHERE end_time > {} AND request_type IN ('{}', '{}', '{}', '{}', '{}') ALLOW FILTERING", TOPOLOGY_REQUESTS, end_time_limit.time_since_epoch().count(),
|
||||
service::topology_request::join, service::topology_request::replace, service::topology_request::rebuild, service::topology_request::leave, service::topology_request::remove));
|
||||
|
||||
topology_requests_entries m;
|
||||
for (const auto& row: *rs_done) {
|
||||
@@ -3671,16 +3634,6 @@ future<system_keyspace::topology_requests_entries> system_keyspace::get_topology
|
||||
co_return m;
|
||||
}
|
||||
|
||||
future<system_keyspace::topology_requests_entries> system_keyspace::get_node_ops_request_entries(db_clock::time_point end_time_limit) {
|
||||
return get_topology_request_entries({
|
||||
service::topology_request::join,
|
||||
service::topology_request::replace,
|
||||
service::topology_request::rebuild,
|
||||
service::topology_request::leave,
|
||||
service::topology_request::remove
|
||||
}, end_time_limit);
|
||||
}
|
||||
|
||||
future<mutation> system_keyspace::get_insert_dict_mutation(
|
||||
std::string_view name,
|
||||
bytes data,
|
||||
|
||||
@@ -199,8 +199,6 @@ public:
|
||||
static constexpr auto VIEW_BUILD_STATUS_V2 = "view_build_status_v2";
|
||||
static constexpr auto DICTS = "dicts";
|
||||
static constexpr auto VIEW_BUILDING_TASKS = "view_building_tasks";
|
||||
static constexpr auto CLIENT_ROUTES = "client_routes";
|
||||
static constexpr auto VERSIONS = "versions";
|
||||
|
||||
// auth
|
||||
static constexpr auto ROLES = "roles";
|
||||
@@ -278,7 +276,6 @@ public:
|
||||
static schema_ptr view_build_status_v2();
|
||||
static schema_ptr dicts();
|
||||
static schema_ptr view_building_tasks();
|
||||
static schema_ptr client_routes();
|
||||
|
||||
// auth
|
||||
static schema_ptr roles();
|
||||
@@ -670,9 +667,7 @@ public:
|
||||
|
||||
future<service::topology_request_state> get_topology_request_state(utils::UUID id, bool require_entry);
|
||||
topology_requests_entry topology_request_row_to_entry(utils::UUID id, const cql3::untyped_result_set_row& row);
|
||||
future<topology_requests_entry> get_topology_request_entry(utils::UUID id);
|
||||
future<std::optional<topology_requests_entry>> get_topology_request_entry_opt(utils::UUID id);
|
||||
future<system_keyspace::topology_requests_entries> get_topology_request_entries(std::vector<std::variant<service::topology_request, service::global_topology_request>> request_types, db_clock::time_point end_time_limit);
|
||||
future<topology_requests_entry> get_topology_request_entry(utils::UUID id, bool require_entry);
|
||||
future<topology_requests_entries> get_node_ops_request_entries(db_clock::time_point end_time_limit);
|
||||
|
||||
public:
|
||||
|
||||
@@ -198,7 +198,6 @@ future<> view_building_worker::register_staging_sstable_tasks(std::vector<sstabl
|
||||
|
||||
future<> view_building_worker::run_staging_sstables_registrator() {
|
||||
while (!_as.abort_requested()) {
|
||||
bool sleep = false;
|
||||
try {
|
||||
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
|
||||
co_await create_staging_sstable_tasks();
|
||||
@@ -215,14 +214,6 @@ future<> view_building_worker::run_staging_sstables_registrator() {
|
||||
vbw_logger.warn("Got group0_concurrent_modification while creating staging sstable tasks");
|
||||
} catch (raft::request_aborted&) {
|
||||
vbw_logger.warn("Got raft::request_aborted while creating staging sstable tasks");
|
||||
} catch (...) {
|
||||
vbw_logger.error("Exception while creating staging sstable tasks: {}", std::current_exception());
|
||||
sleep = true;
|
||||
}
|
||||
|
||||
if (sleep) {
|
||||
vbw_logger.debug("Sleeping after exception.");
|
||||
co_await seastar::sleep_abortable(1s, _as).handle_exception([] (auto x) { return make_ready_future<>(); });
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -426,12 +417,9 @@ future<> view_building_worker::check_for_aborted_tasks() {
|
||||
|
||||
auto my_host_id = vbw._db.get_token_metadata().get_topology().my_host_id();
|
||||
auto my_replica = locator::tablet_replica{my_host_id, this_shard_id()};
|
||||
auto it = vbw._state._batch->tasks.begin();
|
||||
while (it != vbw._state._batch->tasks.end()) {
|
||||
auto id = it->first;
|
||||
auto task_opt = building_state.get_task(it->second.base_id, my_replica, id);
|
||||
|
||||
++it; // Advance the iterator before potentially removing the entry from the map.
|
||||
auto tasks_map = vbw._state._batch->tasks; // Potentially, we'll remove elements from the map, so we need a copy to iterate over it
|
||||
for (auto& [id, t]: tasks_map) {
|
||||
auto task_opt = building_state.get_task(t.base_id, my_replica, id);
|
||||
if (!task_opt || task_opt->get().aborted) {
|
||||
co_await vbw._state._batch->abort_task(id);
|
||||
}
|
||||
@@ -461,7 +449,7 @@ static std::unordered_set<table_id> get_ids_of_all_views(replica::database& db,
|
||||
}) | std::ranges::to<std::unordered_set>();;
|
||||
}
|
||||
|
||||
// If `state::processing_base_table` is different that the `view_building_state::currently_processed_base_table`,
|
||||
// If `state::processing_base_table` is different from the `view_building_state::currently_processed_base_table`,
|
||||
// clear the state, save and flush new base table
|
||||
future<> view_building_worker::state::update_processing_base_table(replica::database& db, const view_building_state& building_state, abort_source& as) {
|
||||
if (processing_base_table != building_state.currently_processed_base_table) {
|
||||
@@ -583,6 +571,8 @@ future<> view_building_worker::batch::do_work() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
_vbw.local()._vb_state_machine.event.broadcast();
|
||||
}
|
||||
|
||||
future<> view_building_worker::do_build_range(table_id base_id, std::vector<table_id> views_ids, dht::token last_token, abort_source& as) {
|
||||
@@ -784,15 +774,13 @@ future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_
|
||||
tasks.insert({id, *task_opt});
|
||||
}
|
||||
#ifdef SEASTAR_DEBUG
|
||||
{
|
||||
auto& some_task = tasks.begin()->second;
|
||||
for (auto& [_, t]: tasks) {
|
||||
SCYLLA_ASSERT(t.base_id == some_task.base_id);
|
||||
SCYLLA_ASSERT(t.last_token == some_task.last_token);
|
||||
SCYLLA_ASSERT(t.replica == some_task.replica);
|
||||
SCYLLA_ASSERT(t.type == some_task.type);
|
||||
SCYLLA_ASSERT(t.replica.shard == this_shard_id());
|
||||
}
|
||||
auto& some_task = tasks.begin()->second;
|
||||
for (auto& [_, t]: tasks) {
|
||||
SCYLLA_ASSERT(t.base_id == some_task.base_id);
|
||||
SCYLLA_ASSERT(t.last_token == some_task.last_token);
|
||||
SCYLLA_ASSERT(t.replica == some_task.replica);
|
||||
SCYLLA_ASSERT(t.type == some_task.type);
|
||||
SCYLLA_ASSERT(t.replica.shard == this_shard_id());
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -823,6 +811,25 @@ future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_
|
||||
co_return collect_completed_tasks();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -605,8 +605,8 @@ public:
|
||||
}
|
||||
|
||||
static schema_ptr build_schema() {
|
||||
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::VERSIONS);
|
||||
return schema_builder(system_keyspace::NAME, system_keyspace::VERSIONS, std::make_optional(id))
|
||||
auto id = generate_legacy_id(system_keyspace::NAME, "versions");
|
||||
return schema_builder(system_keyspace::NAME, "versions", std::make_optional(id))
|
||||
.with_column("key", utf8_type, column_kind::partition_key)
|
||||
.with_column("version", utf8_type)
|
||||
.with_column("build_mode", utf8_type)
|
||||
@@ -749,7 +749,6 @@ class clients_table : public streaming_virtual_table {
|
||||
.with_column("ssl_protocol", utf8_type)
|
||||
.with_column("username", utf8_type)
|
||||
.with_column("scheduling_group", utf8_type)
|
||||
.with_column("client_options", map_type_impl::get_instance(utf8_type, utf8_type, false))
|
||||
.with_hash_version()
|
||||
.build();
|
||||
}
|
||||
@@ -767,7 +766,7 @@ class clients_table : public streaming_virtual_table {
|
||||
|
||||
future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
|
||||
// Collect
|
||||
using client_data_vec = utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>;
|
||||
using client_data_vec = utils::chunked_vector<client_data>;
|
||||
using shard_client_data = std::vector<client_data_vec>;
|
||||
std::vector<foreign_ptr<std::unique_ptr<shard_client_data>>> cd_vec;
|
||||
cd_vec.resize(smp::count);
|
||||
@@ -807,13 +806,13 @@ class clients_table : public streaming_virtual_table {
|
||||
for (unsigned i = 0; i < smp::count; i++) {
|
||||
for (auto&& ps_cdc : *cd_vec[i]) {
|
||||
for (auto&& cd : ps_cdc) {
|
||||
if (cd_map.contains(cd->ip)) {
|
||||
cd_map[cd->ip].emplace_back(std::move(cd));
|
||||
if (cd_map.contains(cd.ip)) {
|
||||
cd_map[cd.ip].emplace_back(std::move(cd));
|
||||
} else {
|
||||
dht::decorated_key key = make_partition_key(cd->ip);
|
||||
dht::decorated_key key = make_partition_key(cd.ip);
|
||||
if (this_shard_owns(key) && contains_key(qr.partition_range(), key)) {
|
||||
ips.insert(decorated_ip{std::move(key), cd->ip});
|
||||
cd_map[cd->ip].emplace_back(std::move(cd));
|
||||
ips.insert(decorated_ip{std::move(key), cd.ip});
|
||||
cd_map[cd.ip].emplace_back(std::move(cd));
|
||||
}
|
||||
}
|
||||
co_await coroutine::maybe_yield();
|
||||
@@ -826,58 +825,39 @@ class clients_table : public streaming_virtual_table {
|
||||
co_await result.emit_partition_start(dip.key);
|
||||
auto& clients = cd_map[dip.ip];
|
||||
|
||||
std::ranges::sort(clients, [] (const foreign_ptr<std::unique_ptr<client_data>>& a, const foreign_ptr<std::unique_ptr<client_data>>& b) {
|
||||
return a->port < b->port || a->client_type_str() < b->client_type_str();
|
||||
std::ranges::sort(clients, [] (const client_data& a, const client_data& b) {
|
||||
return a.port < b.port || a.client_type_str() < b.client_type_str();
|
||||
});
|
||||
|
||||
for (const auto& cd : clients) {
|
||||
clustering_row cr(make_clustering_key(cd->port, cd->client_type_str()));
|
||||
set_cell(cr.cells(), "shard_id", cd->shard_id);
|
||||
set_cell(cr.cells(), "connection_stage", cd->stage_str());
|
||||
if (cd->driver_name) {
|
||||
set_cell(cr.cells(), "driver_name", cd->driver_name->key());
|
||||
clustering_row cr(make_clustering_key(cd.port, cd.client_type_str()));
|
||||
set_cell(cr.cells(), "shard_id", cd.shard_id);
|
||||
set_cell(cr.cells(), "connection_stage", cd.stage_str());
|
||||
if (cd.driver_name) {
|
||||
set_cell(cr.cells(), "driver_name", *cd.driver_name);
|
||||
}
|
||||
if (cd->driver_version) {
|
||||
set_cell(cr.cells(), "driver_version", cd->driver_version->key());
|
||||
if (cd.driver_version) {
|
||||
set_cell(cr.cells(), "driver_version", *cd.driver_version);
|
||||
}
|
||||
if (cd->hostname) {
|
||||
set_cell(cr.cells(), "hostname", *cd->hostname);
|
||||
if (cd.hostname) {
|
||||
set_cell(cr.cells(), "hostname", *cd.hostname);
|
||||
}
|
||||
if (cd->protocol_version) {
|
||||
set_cell(cr.cells(), "protocol_version", *cd->protocol_version);
|
||||
if (cd.protocol_version) {
|
||||
set_cell(cr.cells(), "protocol_version", *cd.protocol_version);
|
||||
}
|
||||
if (cd->ssl_cipher_suite) {
|
||||
set_cell(cr.cells(), "ssl_cipher_suite", *cd->ssl_cipher_suite);
|
||||
if (cd.ssl_cipher_suite) {
|
||||
set_cell(cr.cells(), "ssl_cipher_suite", *cd.ssl_cipher_suite);
|
||||
}
|
||||
if (cd->ssl_enabled) {
|
||||
set_cell(cr.cells(), "ssl_enabled", *cd->ssl_enabled);
|
||||
if (cd.ssl_enabled) {
|
||||
set_cell(cr.cells(), "ssl_enabled", *cd.ssl_enabled);
|
||||
}
|
||||
if (cd->ssl_protocol) {
|
||||
set_cell(cr.cells(), "ssl_protocol", *cd->ssl_protocol);
|
||||
if (cd.ssl_protocol) {
|
||||
set_cell(cr.cells(), "ssl_protocol", *cd.ssl_protocol);
|
||||
}
|
||||
set_cell(cr.cells(), "username", cd->username ? *cd->username : sstring("anonymous"));
|
||||
if (cd->scheduling_group_name) {
|
||||
set_cell(cr.cells(), "scheduling_group", *cd->scheduling_group_name);
|
||||
set_cell(cr.cells(), "username", cd.username ? *cd.username : sstring("anonymous"));
|
||||
if (cd.scheduling_group_name) {
|
||||
set_cell(cr.cells(), "scheduling_group", *cd.scheduling_group_name);
|
||||
}
|
||||
|
||||
auto map_type = map_type_impl::get_instance(
|
||||
utf8_type,
|
||||
utf8_type,
|
||||
false
|
||||
);
|
||||
|
||||
auto prepare_client_options = [] (const auto& client_options) {
|
||||
map_type_impl::native_type tmp;
|
||||
for (auto& co: client_options) {
|
||||
auto map_element = std::make_pair(data_value(co.key.key()), data_value(co.value.key()));
|
||||
tmp.push_back(std::move(map_element));
|
||||
}
|
||||
return tmp;
|
||||
};
|
||||
|
||||
set_cell(cr.cells(), "client_options",
|
||||
make_map_value(map_type, prepare_client_options(cd->client_options)));
|
||||
|
||||
co_await result.emit_row(std::move(cr));
|
||||
}
|
||||
co_await result.emit_partition_end();
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
# Alternator: DynamoDB API in ScyllaDB
|
||||
# Alternator: DynamoDB API in Scylla
|
||||
|
||||
## Introduction
|
||||
Alternator is a ScyllaDB feature adding compatibility with Amazon DynamoDB(TM).
|
||||
Alternator is a Scylla feature adding compatibility with Amazon DynamoDB(TM).
|
||||
DynamoDB's API uses JSON-encoded requests and responses which are sent over
|
||||
an HTTP or HTTPS transport. It is described in detail in Amazon's [DynamoDB
|
||||
API Reference](https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/).
|
||||
|
||||
Our goal is that any application written to use Amazon DynamoDB could
|
||||
be run, unmodified, against ScyllaDB with Alternator enabled. Alternator's
|
||||
be run, unmodified, against Scylla with Alternator enabled. Alternator's
|
||||
compatibility with DynamoDB is fairly complete, but users should be aware
|
||||
of some differences and some unimplemented features. The extent of
|
||||
Alternator's compatibility with DynamoDB is described in the
|
||||
[ScyllaDB Alternator for DynamoDB users](compatibility.md) document,
|
||||
[Scylla Alternator for DynamoDB users](compatibility.md) document,
|
||||
which is updated as the work on Alternator progresses and compatibility
|
||||
continues to improve.
|
||||
|
||||
@@ -19,8 +19,8 @@ Alternator also adds several features and APIs that are not available in
|
||||
DynamoDB. These are described in [Alternator-specific APIs](new-apis.md).
|
||||
|
||||
## Running Alternator
|
||||
By default, ScyllaDB does not listen for DynamoDB API requests. To enable
|
||||
this API in ScyllaDB you must set at least two configuration options,
|
||||
By default, Scylla does not listen for DynamoDB API requests. To enable
|
||||
this API in Scylla you must set at least two configuration options,
|
||||
**alternator_port** and **alternator_write_isolation**. For example in the
|
||||
YAML configuration file:
|
||||
```yaml
|
||||
@@ -30,7 +30,7 @@ alternator_write_isolation: only_rmw_uses_lwt # or always, forbid or unsafe
|
||||
or, equivalently, via command-line arguments: `--alternator-port=8000
|
||||
--alternator-write-isolation=only_rmw_uses_lwt`.
|
||||
|
||||
The **alternator_port** option determines on which port ScyllaDB listens for
|
||||
The **alternator_port** option determines on which port Scylla listens for
|
||||
DynamoDB API requests. By default, it listens on this port on all network
|
||||
interfaces. To listen only on a specific interface, configure also the
|
||||
**alternator_address** option.
|
||||
@@ -41,12 +41,12 @@ Alternator has four different choices
|
||||
for the implementation of writes, each with different advantages. You should
|
||||
carefully consider which of the options makes more sense for your intended
|
||||
use case and configure alternator_write_isolation accordingly. There is
|
||||
currently no default for this option: Trying to run ScyllaDB with an Alternator
|
||||
currently no default for this option: Trying to run Scylla with an Alternator
|
||||
port selected but without configuring write isolation will result in an error message,
|
||||
asking you to set it.
|
||||
|
||||
In addition to (or instead of) serving HTTP requests on alternator_port,
|
||||
ScyllaDB can accept DynamoDB API requests over HTTPS (encrypted), on the port
|
||||
Scylla can accept DynamoDB API requests over HTTPS (encrypted), on the port
|
||||
specified by **alternator_https_port**. As usual for HTTPS servers, the
|
||||
operator must specify certificate and key files. By default these should
|
||||
be placed in `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key`, but
|
||||
@@ -54,7 +54,7 @@ these default locations can be overridden by specifying
|
||||
`--alternator-encryption-options keyfile="..."` and
|
||||
`--alternator-encryption-options certificate="..."`.
|
||||
|
||||
By default, ScyllaDB saves a snapshot of deleted tables. But Alternator does
|
||||
By default, Scylla saves a snapshot of deleted tables. But Alternator does
|
||||
not offer an API to restore these snapshots, so these snapshots are not useful
|
||||
and waste disk space - deleting a table does not recover any disk space.
|
||||
It is therefore recommended to disable this automatic-snapshotting feature
|
||||
@@ -73,11 +73,11 @@ itself. Instructions, code and examples for doing this can be found in the
|
||||
|
||||
This section provides only a very brief introduction to Alternator's
|
||||
design. A much more detailed document about the features of the DynamoDB
|
||||
API and how they are, or could be, implemented in ScyllaDB can be found in:
|
||||
API and how they are, or could be, implemented in Scylla can be found in:
|
||||
<https://docs.google.com/document/d/1i4yjF5OSAazAY_-T8CBce9-2ykW4twx_E_Nt2zDoOVs>
|
||||
|
||||
Almost all of Alternator's source code (except some initialization code)
|
||||
can be found in the alternator/ subdirectory of ScyllaDB's source code.
|
||||
can be found in the alternator/ subdirectory of Scylla's source code.
|
||||
Extensive functional tests can be found in the test/alternator
|
||||
subdirectory. These tests are written in Python, and can be run against
|
||||
both Alternator and Amazon's DynamoDB; This allows verifying that
|
||||
@@ -85,15 +85,15 @@ Alternator's behavior matches the one observed on DynamoDB.
|
||||
See test/alternator/README.md for more information about the tests and
|
||||
how to run them.
|
||||
|
||||
With Alternator enabled on port 8000 (for example), every ScyllaDB node
|
||||
With Alternator enabled on port 8000 (for example), every Scylla node
|
||||
listens for DynamoDB API requests on this port. These requests, in
|
||||
JSON format over HTTP, are parsed and result in calls to internal Scylla
|
||||
C++ functions - there is no CQL generation or parsing involved.
|
||||
In ScyllaDB terminology, the node receiving the request acts as the
|
||||
In Scylla terminology, the node receiving the request acts as the
|
||||
*coordinator*, and often passes the request on to one or more other nodes -
|
||||
*replicas* which hold copies of the requested data.
|
||||
|
||||
Alternator tables are stored as ScyllaDB tables, each in a separate keyspace.
|
||||
Alternator tables are stored as Scylla tables, each in a separate keyspace.
|
||||
Each keyspace is initialized when the corresponding Alternator table is
|
||||
created (with a CreateTable request). The replication factor (RF) for this
|
||||
keyspace is chosen at that point, depending on the size of the cluster:
|
||||
@@ -101,19 +101,19 @@ RF=3 is used on clusters with three or more nodes, and RF=1 is used for
|
||||
smaller clusters. Such smaller clusters are, of course, only recommended
|
||||
for tests because of the risk of data loss.
|
||||
|
||||
Each table in Alternator is stored as a ScyllaDB table in a separate
|
||||
Each table in Alternator is stored as a Scylla table in a separate
|
||||
keyspace. The DynamoDB key columns (hash and sort key) have known types,
|
||||
and become partition and clustering key columns of the ScyllaDB table.
|
||||
and become partition and clustering key columns of the Scylla table.
|
||||
All other attributes may be different for each row, so are stored in one
|
||||
map column in ScyllaDB, and not as separate columns.
|
||||
map column in Scylla, and not as separate columns.
|
||||
|
||||
DynamoDB supports two consistency levels for reads, "eventual consistency"
|
||||
and "strong consistency". These two modes are implemented using ScyllaDB's CL
|
||||
and "strong consistency". These two modes are implemented using Scylla's CL
|
||||
(consistency level) feature: All writes are done using the `LOCAL_QUORUM`
|
||||
consistency level, then strongly-consistent reads are done with
|
||||
`LOCAL_QUORUM`, while eventually-consistent reads are with just `LOCAL_ONE`.
|
||||
|
||||
In ScyllaDB (and its inspiration, Cassandra), high write performance is
|
||||
In Scylla (and its inspiration, Cassandra), high write performance is
|
||||
achieved by ensuring that writes do not require reads from disk.
|
||||
The DynamoDB API, however, provides many types of requests that need a read
|
||||
before the write (a.k.a. RMW requests - read-modify-write). For example,
|
||||
@@ -121,7 +121,7 @@ a request may copy an existing attribute, increment an attribute,
|
||||
be conditional on some expression involving existing values of attribute,
|
||||
or request that the previous values of attributes be returned. These
|
||||
read-modify-write transactions should be _isolated_ from each other, so
|
||||
by default Alternator implements every write operation using ScyllaDB's
|
||||
by default Alternator implements every write operation using Scylla's
|
||||
LWT (lightweight transactions). This default can be overridden on a per-table
|
||||
basis, by tagging the table as explained above in the "write isolation
|
||||
policies" section.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# ScyllaDB Alternator for DynamoDB users
|
||||
|
||||
ScyllaDB supports the DynamoDB API (this feature is codenamed "Alternator").
|
||||
Scylla supports the DynamoDB API (this feature is codenamed "Alternator").
|
||||
Our goal is to support any application written for Amazon DynamoDB.
|
||||
Nevertheless, there are a few differences between DynamoDB and Scylla, and
|
||||
a few DynamoDB features that have not yet been implemented in Scylla.
|
||||
@@ -8,16 +8,16 @@ The purpose of this document is to inform users of these differences.
|
||||
|
||||
## Provisioning
|
||||
|
||||
The most obvious difference between DynamoDB and ScyllaDB is that while
|
||||
DynamoDB is a shared cloud service, ScyllaDB is a dedicated service running
|
||||
The most obvious difference between DynamoDB and Scylla is that while
|
||||
DynamoDB is a shared cloud service, Scylla is a dedicated service running
|
||||
on your private cluster. Whereas DynamoDB allows you to "provision" the
|
||||
number of requests per second you'll need - or at an extra cost not even
|
||||
provision that - ScyllaDB requires you to provision your cluster. You need
|
||||
provision that - Scylla requires you to provision your cluster. You need
|
||||
to reason about the number and size of your nodes - not the throughput.
|
||||
|
||||
Moreover, DynamoDB's per-table provisioning (`BillingMode=PROVISIONED`) is
|
||||
not yet supported by Scylla. The BillingMode and ProvisionedThroughput options
|
||||
on a table need to be valid but are ignored, and ScyllaDB behaves like DynamoDB's
|
||||
on a table need to be valid but are ignored, and Scylla behaves like DynamoDB's
|
||||
`BillingMode=PAY_PER_REQUEST`: All requests are accepted without a per-table
|
||||
throughput cap.
|
||||
|
||||
@@ -33,7 +33,7 @@ Instructions for doing this can be found in:
|
||||
|
||||
## Write isolation policies
|
||||
|
||||
ScyllaDB was designed to optimize the performance of pure write operations -
|
||||
Scylla was designed to optimize the performance of pure write operations -
|
||||
writes which do not need to read the previous value of the item.
|
||||
In CQL, writes which do need the previous value of the item must explicitly
|
||||
use the slower LWT ("LightWeight Transaction") feature to be correctly
|
||||
@@ -79,11 +79,11 @@ a _higher_ timestamp - and this will be the "last write" that wins.
|
||||
To avoid or mitigate this write reordering issue, users may consider
|
||||
one or more of the following:
|
||||
|
||||
1. Use NTP to keep the clocks on the different ScyllaDB nodes synchronized.
|
||||
1. Use NTP to keep the clocks on the different Scylla nodes synchronized.
|
||||
If the delay between the two writes is longer than NTP's accuracy,
|
||||
they will not be reordered.
|
||||
2. If an application wants to ensure that two specific writes are not
|
||||
reordered, it should send both requests to the same ScyllaDB node.
|
||||
reordered, it should send both requests to the same Scylla node.
|
||||
Care should be taken when using a load balancer - which might redirect
|
||||
two requests to two different nodes.
|
||||
3. Consider using the `always_use_lwt` write isolation policy.
|
||||
@@ -210,7 +210,7 @@ CREATE SERVICE_LEVEL IF NOT EXISTS oltp WITH SHARES = 1000;
|
||||
ATTACH SERVICE_LEVEL olap TO alice;
|
||||
ATTACH SERVICE_LEVEL oltp TO bob;
|
||||
```
|
||||
Note that `alternator_enforce_authorization` has to be enabled in ScyllaDB configuration.
|
||||
Note that `alternator_enforce_authorization` has to be enabled in Scylla configuration.
|
||||
|
||||
See [Authorization](##Authorization) section to learn more about roles and authorization.
|
||||
See [Workload Prioritization](../features/workload-prioritization)
|
||||
@@ -218,11 +218,11 @@ to read about Workload Prioritization in detail.
|
||||
|
||||
## Metrics
|
||||
|
||||
ScyllaDB has an advanced and extensive monitoring framework for inspecting
|
||||
and graphing hundreds of different metrics of ScyllaDB's usage and performance.
|
||||
ScyllaDB's monitoring stack, based on Grafana and Prometheus, is described in
|
||||
Scylla has an advanced and extensive monitoring framework for inspecting
|
||||
and graphing hundreds of different metrics of Scylla's usage and performance.
|
||||
Scylla's monitoring stack, based on Grafana and Prometheus, is described in
|
||||
<https://docs.scylladb.com/operating-scylla/monitoring/>.
|
||||
This monitoring stack is different from DynamoDB's offering - but ScyllaDB's
|
||||
This monitoring stack is different from DynamoDB's offering - but Scylla's
|
||||
is significantly more powerful and gives the user better insights on
|
||||
the internals of the database and its performance.
|
||||
|
||||
@@ -248,7 +248,7 @@ data in different partition order. Applications mustn't rely on that
|
||||
undocumented order.
|
||||
|
||||
Note that inside each partition, the individual items will be sorted the same
|
||||
in DynamoDB and ScyllaDB - determined by the _sort key_ defined for that table.
|
||||
in DynamoDB and Scylla - determined by the _sort key_ defined for that table.
|
||||
|
||||
---
|
||||
|
||||
@@ -274,7 +274,7 @@ is different, or can be configured in Alternator:
|
||||
## Experimental API features
|
||||
|
||||
Some DynamoDB API features are supported by Alternator, but considered
|
||||
**experimental** in this release. An experimental feature in ScyllaDB is a
|
||||
**experimental** in this release. An experimental feature in Scylla is a
|
||||
feature whose functionality is complete, or mostly complete, but it is not
|
||||
as thoroughly tested or optimized as regular features. Also, an experimental
|
||||
feature's implementation is still subject to change and upgrades may not be
|
||||
@@ -351,8 +351,8 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
|
||||
* The on-demand backup APIs are not supported: CreateBackup, DescribeBackup,
|
||||
DeleteBackup, ListBackups, RestoreTableFromBackup.
|
||||
For now, users can use ScyllaDB's existing backup solutions such as snapshots
|
||||
or ScyllaDB Manager.
|
||||
For now, users can use Scylla's existing backup solutions such as snapshots
|
||||
or Scylla Manager.
|
||||
<https://github.com/scylladb/scylla/issues/5063>
|
||||
|
||||
* Continuous backup (the ability to restore any point in time) is also not
|
||||
@@ -370,7 +370,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
<https://github.com/scylladb/scylla/issues/5068>
|
||||
|
||||
* DAX (DynamoDB Accelerator), an in-memory cache for DynamoDB, is not
|
||||
available for Alternator. Anyway, it should not be necessary - ScyllaDB's
|
||||
available for Alternator. Anyway, it should not be necessary - Scylla's
|
||||
internal cache is already rather advanced and there is no need to place
|
||||
another cache in front of it. We wrote more about this here:
|
||||
<https://www.scylladb.com/2017/07/31/database-caches-not-good/>
|
||||
@@ -384,7 +384,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
* The PartiQL syntax (SQL-like SELECT/UPDATE/INSERT/DELETE expressions)
|
||||
and the operations ExecuteStatement, BatchExecuteStatement and
|
||||
ExecuteTransaction are not yet supported.
|
||||
A user that is interested in an SQL-like syntax can consider using ScyllaDB's
|
||||
A user that is interested in an SQL-like syntax can consider using Scylla's
|
||||
CQL protocol instead.
|
||||
This feature was added to DynamoDB in November 2020.
|
||||
<https://github.com/scylladb/scylla/issues/8787>
|
||||
@@ -393,7 +393,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
which is different from AWS's. In particular, the operations
|
||||
DescribeContributorInsights, ListContributorInsights and
|
||||
UpdateContributorInsights that configure Amazon's "CloudWatch Contributor
|
||||
Insights" are not yet supported. ScyllaDB has different ways to retrieve the
|
||||
Insights" are not yet supported. Scylla has different ways to retrieve the
|
||||
same information, such as which items were accessed most often.
|
||||
<https://github.com/scylladb/scylla/issues/8788>
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ This section will guide you through the steps for setting up the cluster:
|
||||
<https://hub.docker.com/r/scylladb/scylla/>, but add to every `docker run`
|
||||
command a `-p 8000:8000` before the image name and
|
||||
`--alternator-port=8000 --alternator-write-isolation=always` at the end.
|
||||
The "alternator-port" option specifies on which port ScyllaDB will listen for
|
||||
The "alternator-port" option specifies on which port Scylla will listen for
|
||||
the (unencrypted) DynamoDB API, and the "alternator-write-isolation" chooses
|
||||
whether or not Alternator will use LWT for every write.
|
||||
For example,
|
||||
@@ -24,10 +24,10 @@ This section will guide you through the steps for setting up the cluster:
|
||||
By default, ScyllaDB run in this way will not have authentication or
|
||||
authorization enabled, and any DynamoDB API request will be honored without
|
||||
requiring them to be signed appropriately. See the
|
||||
[ScyllaDB Alternator for DynamoDB users](compatibility.md#authentication-and-authorization)
|
||||
[Scylla Alternator for DynamoDB users](compatibility.md#authentication-and-authorization)
|
||||
document on how to configure authentication and authorization.
|
||||
|
||||
## Testing ScyllaDB's DynamoDB API support:
|
||||
## Testing Scylla's DynamoDB API support:
|
||||
### Running AWS Tic Tac Toe demo app to test the cluster:
|
||||
1. Follow the instructions on the [AWS github page](https://github.com/awsdocs/amazon-dynamodb-developer-guide/blob/master/doc_source/TicTacToe.Phase1.md)
|
||||
2. Enjoy your tic-tac-toe game :-)
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
Alternator's primary goal is to be compatible with Amazon DynamoDB(TM)
|
||||
and its APIs, so that any application written to use Amazon DynamoDB could
|
||||
be run, unmodified, against ScyllaDB with Alternator enabled. The extent of
|
||||
be run, unmodified, against Scylla with Alternator enabled. The extent of
|
||||
Alternator's compatibility with DynamoDB is described in the
|
||||
[ScyllaDB Alternator for DynamoDB users](compatibility.md) document.
|
||||
[Scylla Alternator for DynamoDB users](compatibility.md) document.
|
||||
|
||||
But Alternator also adds several features and APIs that are not available in
|
||||
DynamoDB. These Alternator-specific APIs are documented here.
|
||||
@@ -15,7 +15,7 @@ _conditional_ update or an update based on the old value of an attribute.
|
||||
The read and the write should be treated as a single transaction - protected
|
||||
(_isolated_) from other parallel writes to the same item.
|
||||
|
||||
Alternator could do this isolation by using ScyllaDB's LWT (lightweight
|
||||
Alternator could do this isolation by using Scylla's LWT (lightweight
|
||||
transactions) for every write operation, but this significantly slows
|
||||
down writes, and is not necessary for workloads which don't use read-modify-write
|
||||
(RMW) updates.
|
||||
@@ -41,7 +41,7 @@ isolation policy for a specific table can be overridden by tagging the table
|
||||
which need a read before the write. An attempt to use such statements
|
||||
(e.g., UpdateItem with a ConditionExpression) will result in an error.
|
||||
In this mode, the remaining write requests which are allowed - pure writes
|
||||
without a read - are performed using standard ScyllaDB writes, not LWT,
|
||||
without a read - are performed using standard Scylla writes, not LWT,
|
||||
so they are significantly faster than they would have been in the
|
||||
`always_use_lwt` mode, but their isolation is still correct.
|
||||
|
||||
@@ -65,19 +65,19 @@ isolation policy for a specific table can be overridden by tagging the table
|
||||
read-modify-write updates. This mode is not recommended for any use case,
|
||||
and will likely be removed in the future.
|
||||
|
||||
## Accessing system tables from ScyllaDB
|
||||
ScyllaDB exposes lots of useful information via its internal system tables,
|
||||
## Accessing system tables from Scylla
|
||||
Scylla exposes lots of useful information via its internal system tables,
|
||||
which can be found in system keyspaces: 'system', 'system\_auth', etc.
|
||||
In order to access these tables via the Alternator interface,
|
||||
Scan and Query requests can use a special table name:
|
||||
`.scylla.alternator.KEYSPACE_NAME.TABLE_NAME`
|
||||
which will return results fetched from corresponding ScyllaDB table.
|
||||
which will return results fetched from corresponding Scylla table.
|
||||
|
||||
This interface can be used only to fetch data from system tables.
|
||||
Attempts to read regular tables via the virtual interface will result
|
||||
in an error.
|
||||
|
||||
Example: in order to query the contents of ScyllaDB's `system.large_rows`,
|
||||
Example: in order to query the contents of Scylla's `system.large_rows`,
|
||||
pass `TableName='.scylla.alternator.system.large_rows'` to a Query/Scan
|
||||
request.
|
||||
|
||||
@@ -113,14 +113,14 @@ connection (either active or idle), not necessarily an active request as
|
||||
in Alternator.
|
||||
|
||||
## Service discovery
|
||||
As explained in [ScyllaDB Alternator for DynamoDB users](compatibility.md),
|
||||
As explained in [Scylla Alternator for DynamoDB users](compatibility.md),
|
||||
Alternator requires a load-balancer or a client-side load-balancing library
|
||||
to distribute requests between all ScyllaDB nodes. This load-balancer needs
|
||||
to be able to _discover_ the ScyllaDB nodes. Alternator provides two special
|
||||
to distribute requests between all Scylla nodes. This load-balancer needs
|
||||
to be able to _discover_ the Scylla nodes. Alternator provides two special
|
||||
requests, `/` and `/localnodes`, to help with this service discovery, which
|
||||
we will now explain.
|
||||
|
||||
Some setups know exactly which ScyllaDB nodes were brought up, so all that
|
||||
Some setups know exactly which Scylla nodes were brought up, so all that
|
||||
remains is to periodically verify that each node is still functional. The
|
||||
easiest way to do this is to make an HTTP (or HTTPS) GET request to the node,
|
||||
with URL `/`. This is a trivial GET request and does **not** need to be
|
||||
@@ -133,10 +133,10 @@ $ curl http://localhost:8000/
|
||||
healthy: localhost:8000
|
||||
```
|
||||
|
||||
In other setups, the load balancer might not know which ScyllaDB nodes exist.
|
||||
For example, it may be possible to add or remove ScyllaDB nodes without a
|
||||
In other setups, the load balancer might not know which Scylla nodes exist.
|
||||
For example, it may be possible to add or remove Scylla nodes without a
|
||||
client-side load balancer knowing. For these setups we have the `/localnodes`
|
||||
request that can be used to discover which ScyllaDB nodes exist: A load balancer
|
||||
request that can be used to discover which Scylla nodes exist: A load balancer
|
||||
that already knows at least one live node can discover the rest by sending
|
||||
a `/localnodes` request to the known node. It's again an unauthenticated
|
||||
HTTP (or HTTPS) GET request:
|
||||
@@ -160,7 +160,7 @@ list the nodes in a specific _data center_ or _rack_. These options are
|
||||
useful for certain use cases:
|
||||
|
||||
* A `dc` option (e.g., `/localnodes?dc=dc1`) can be passed to list the
|
||||
nodes in a specific ScyllaDB data center, not the data center of the node
|
||||
nodes in a specific Scylla data center, not the data center of the node
|
||||
being contacted. This is useful when a client knows of _some_ Scylla
|
||||
node belonging to an unknown DC, but wants to list the nodes in _its_
|
||||
DC, which it knows by name.
|
||||
@@ -191,7 +191,7 @@ tells them to.
|
||||
|
||||
If you want to influence whether a specific Alternator table is created with tablets or vnodes,
|
||||
you can do this by specifying the `system:initial_tablets` tag
|
||||
(in earlier versions of ScyllaDB the tag was `experimental:initial_tablets`)
|
||||
(in earlier versions of Scylla the tag was `experimental:initial_tablets`)
|
||||
in the CreateTable operation. The value of this tag can be:
|
||||
|
||||
* Any valid integer as the value of this tag enables tablets.
|
||||
|
||||
@@ -365,7 +365,7 @@ Modifying a keyspace with tablets enabled is possible and doesn't require any sp
|
||||
|
||||
- The replication factor (RF) can be increased or decreased by at most 1 at a time. To reach the desired RF value, modify the RF repeatedly.
|
||||
- The ``ALTER`` statement rejects the ``replication_factor`` tag. List the DCs explicitly when altering a keyspace. See :ref:`NetworkTopologyStrategy <replication-strategy>`.
|
||||
- An RF change cannot be requested while another RF change is pending for the same keyspace. Attempting to execute an ``ALTER`` statement in this scenario will fail with an explicit error. Wait for the ongoing RF change to complete before issuing another ``ALTER`` statement.
|
||||
- If there's any other ongoing global topology operation, executing the ``ALTER`` statement will fail (with an explicit and specific error) and needs to be repeated.
|
||||
- The ``ALTER`` statement may take longer than the regular query timeout, and even if it times out, it will continue to execute in the background.
|
||||
- The replication strategy cannot be modified, as keyspaces with tablets only support ``NetworkTopologyStrategy``.
|
||||
- The ``ALTER`` statement will fail if it would make the keyspace :term:`RF-rack-invalid <RF-rack-valid keyspace>`.
|
||||
@@ -1043,8 +1043,6 @@ The following modes are available:
|
||||
* - ``immediate``
|
||||
- Tombstone GC is immediately performed. There is no wait time or repair requirement. This mode is useful for a table that uses the TWCS compaction strategy with no user deletes. After data is expired after TTL, ScyllaDB can perform compaction to drop the expired data immediately.
|
||||
|
||||
.. warning:: The ``repair`` mode is not supported for :term:`Colocated Tables <Colocated Table>` in this version.
|
||||
|
||||
.. _cql-per-table-tablet-options:
|
||||
|
||||
Per-table tablet options
|
||||
|
||||
@@ -102,7 +102,6 @@ Additional Information
|
||||
|
||||
To learn more about TTL, and see a hands-on example, check out `this lesson <https://university.scylladb.com/courses/data-modeling/lessons/advanced-data-modeling/topic/expiring-data-with-ttl-time-to-live/>`_ on ScyllaDB University.
|
||||
|
||||
* `Video: Managing data expiration with Time-To-Live <https://www.youtube.com/watch?v=SXkbu7mFHeA>`_
|
||||
* :doc:`Apache Cassandra Query Language (CQL) Reference </cql/index>`
|
||||
* :doc:`KB Article:How to Change gc_grace_seconds for a Table </kb/gc-grace-seconds/>`
|
||||
* :doc:`KB Article:Time to Live (TTL) and Compaction </kb/ttl-facts/>`
|
||||
|
||||
@@ -2,11 +2,8 @@
|
||||
|
||||
## What is ScyllaDB?
|
||||
|
||||
ScyllaDB is a high-performance NoSQL database optimized for speed and scalability.
|
||||
It is designed to efficiently handle large volumes of data with minimal latency,
|
||||
making it ideal for data-intensive applications.
|
||||
|
||||
ScyllaDB is distributed under the [ScyllaDB Source Available License](https://github.com/scylladb/scylladb/blob/master/LICENSE-ScyllaDB-Source-Available.md).
|
||||
ScyllaDB is a high-performance NoSQL database system, fully compatible with Apache Cassandra.
|
||||
ScyllaDB is released under the GNU Affero General Public License version 3 and the Apache License. ScyllaDB is free and open-source software.
|
||||
|
||||
> [ScyllaDB](http://www.scylladb.com/)
|
||||
|
||||
|
||||
@@ -74,8 +74,6 @@ The keys and values are:
|
||||
as an indicator to which shard client wants to connect. The desired shard number
|
||||
is calculated as: `desired_shard_no = client_port % SCYLLA_NR_SHARDS`.
|
||||
Its value is a decimal representation of type `uint16_t`, by default `19142`.
|
||||
- `CLIENT_OPTIONS` is a string containing a JSON object representation that
|
||||
contains CQL Driver configuration, e.g. load balancing policy, retry policy, timeouts, etc.
|
||||
|
||||
Currently, one `SCYLLA_SHARDING_ALGORITHM` is defined,
|
||||
`biased-token-round-robin`. To apply the algorithm,
|
||||
@@ -238,26 +236,3 @@ the same mechanism for other protocol versions, such as CQLv4.
|
||||
|
||||
The feature is identified by the `SCYLLA_USE_METADATA_ID` key, which is meant to be sent
|
||||
in the SUPPORTED message.
|
||||
|
||||
## Sending the CLIENT_ROUTES_CHANGE event
|
||||
|
||||
This extension allows a driver to update its connections when the
|
||||
`system.client_routes` table is modified.
|
||||
|
||||
In some network topologies a specific mapping of addresses and ports is required (e.g.
|
||||
to support Private Link). This mapping can change dynamically even when no nodes are
|
||||
added or removed. The driver must adapt to those changes; otherwise connectivity can be
|
||||
lost.
|
||||
|
||||
The extension is implemented as a new `EVENT` type: `CLIENT_ROUTES_CHANGE`. The event
|
||||
body consists of:
|
||||
- [string] change
|
||||
- [string list] connection_ids
|
||||
- [string list] host_ids
|
||||
|
||||
There is only one change value: `UPDATE_NODES`, which means at least one client route
|
||||
was inserted, updated, or deleted.
|
||||
|
||||
Events already have a subscription mechanism similar to protocol extensions (that is,
|
||||
the driver only receives the events it explicitly subscribed to), so no additional
|
||||
`cql_protocol_extension` key is introduced for this feature.
|
||||
|
||||
@@ -86,7 +86,6 @@ stateDiagram-v2
|
||||
de_left_token_ring --> [*]
|
||||
}
|
||||
state removing {
|
||||
re_left_token_ring : left_token_ring
|
||||
re_tablet_draining : tablet_draining
|
||||
re_tablet_migration : tablet_migration
|
||||
re_write_both_read_old : write_both_read_old
|
||||
@@ -99,8 +98,7 @@ stateDiagram-v2
|
||||
re_tablet_draining --> re_write_both_read_old
|
||||
re_write_both_read_old --> re_write_both_read_new: streaming completed
|
||||
re_write_both_read_old --> re_rollback_to_normal: rollback
|
||||
re_write_both_read_new --> re_left_token_ring
|
||||
re_left_token_ring --> [*]
|
||||
re_write_both_read_new --> [*]
|
||||
}
|
||||
rebuilding --> normal: streaming completed
|
||||
decommissioning --> left: operation succeeded
|
||||
@@ -124,10 +122,9 @@ Note that these are not all states, as there are other states specific to tablet
|
||||
Writes to vnodes-based tables are going to both new and old replicas (new replicas means calculated according
|
||||
to modified token ring), reads are using old replicas.
|
||||
- `write_both_read_new` - as above, but reads are using new replicas.
|
||||
- `left_token_ring` - the decommissioning or removing node left the token ring, but we still need to wait until other
|
||||
nodes observe it and stop sending writes to this node. For decommission, we tell the node to shut down,
|
||||
then remove it from group 0. For removenode, the node is already down, so we skip the shutdown step.
|
||||
We also use this state to rollback a failed bootstrap or decommission.
|
||||
- `left_token_ring` - the decommissioning node left the token ring, but we still need to wait until other
|
||||
nodes observe it and stop sending writes to this node. Then, we tell the node to shut down and remove
|
||||
it from group 0. We also use this state to rollback a failed bootstrap or decommission.
|
||||
- `rollback_to_normal` - the decommission or removenode operation failed. Rollback the operation by
|
||||
moving the node we tried to decommission/remove back to the normal state.
|
||||
- `lock` - the topology stays in this state until externally changed (to null state), preventing topology
|
||||
@@ -144,9 +141,7 @@ reads that started before this point exist in the system. Finally we remove the
|
||||
transitioning state.
|
||||
|
||||
Decommission, removenode and replace work similarly, except they don't go through
|
||||
`commit_cdc_generation`. Both decommission and removenode go through the
|
||||
`left_token_ring` state to run a global barrier ensuring all nodes are aware
|
||||
of the topology change before the operation completes.
|
||||
`commit_cdc_generation`.
|
||||
|
||||
The state machine may also go only through the `commit_cdc_generation` state
|
||||
after getting a request from the user to create a new CDC generation if the
|
||||
|
||||
@@ -41,12 +41,12 @@ Unless the task was aborted, the worker will eventually reply that the task was
|
||||
it temporarily saves a list of ids of finished tasks and removes those tasks from group0 state (permanently marking them as finished) in 200ms intervals. (*)
|
||||
This batching of removing finished tasks is done in order to reduce number of generated group0 operations.
|
||||
|
||||
On the other hand, view building tasks can also be aborted due to 2 main reasons:
|
||||
On the other hand, view building tasks can also be aborted due to 2 main reasons:
|
||||
- a keyspace/view was dropped
|
||||
- tablet operations (see [tablet operations section](#tablet-operations))
|
||||
In the first case we simply delete relevant view building tasks as they are no longer needed.
|
||||
But if a task needs to be aborted due to tablet operation, we're firstly setting the `aborted` flag to true. We need to do this because we need the task information
|
||||
to create new adjusted tasks (if the operation succeeded) or rollback them (if the operation failed).
|
||||
But if a task needs to be aborted due to tablet operation, we're firstly setting the `aborted` flag to true. We need to do this because we need the task information
|
||||
to create new adjusted tasks (if the operation succeeded) or rollback them (if the operation failed).
|
||||
Once a task is aborted by setting the flag, this cannot be revoked, so rolling back a task means creating its duplicate and removing the original task.
|
||||
|
||||
(*) - Because there is a time gap between when the coordinator learns that a task is finished (from the RPC response) and when the task is marked as completed,
|
||||
|
||||
@@ -17,7 +17,6 @@ This document highlights ScyllaDB's key data modeling features.
|
||||
Workload Prioritization </features/workload-prioritization>
|
||||
Backup and Restore </features/backup-and-restore>
|
||||
Incremental Repair </features/incremental-repair/>
|
||||
Vector Search </features/vector-search/>
|
||||
|
||||
.. panel-box::
|
||||
:title: ScyllaDB Features
|
||||
@@ -44,5 +43,3 @@ This document highlights ScyllaDB's key data modeling features.
|
||||
* :doc:`Incremental Repair </features/incremental-repair/>` provides a much more
|
||||
efficient and lightweight approach to maintaining data consistency by
|
||||
repairing only the data that has changed since the last repair.
|
||||
* :doc:`Vector Search in ScyllaDB </features/vector-search/>` enables
|
||||
similarity-based queries on vector embeddings.
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
=================================
|
||||
Vector Search in ScyllaDB
|
||||
=================================
|
||||
|
||||
.. note::
|
||||
|
||||
This feature is currently available only in `ScyllaDB Cloud <https://cloud.docs.scylladb.com/>`_.
|
||||
|
||||
What Is Vector Search
|
||||
-------------------------
|
||||
|
||||
Vector Search enables similarity-based queries over high-dimensional data,
|
||||
such as text, images, audio, or user behavior. Instead of searching for exact
|
||||
matches, it allows applications to find items that are semantically similar to
|
||||
a given input.
|
||||
|
||||
To do this, Vector Search works on vector embeddings, which are numerical
|
||||
representations of data that capture semantic meaning. This enables queries
|
||||
such as:
|
||||
|
||||
* “Find documents similar to this paragraph”
|
||||
* “Find products similar to what the user just viewed”
|
||||
* “Find previous tickets related to this support request”
|
||||
|
||||
Rather than relying on exact values or keywords, Vector Search returns results
|
||||
based on distance or similarity between vectors. This capability is
|
||||
increasingly used in modern workloads such as AI-powered search, recommendation
|
||||
systems, and retrieval-augmented generation (RAG).
|
||||
|
||||
Why Vector Search Matters
|
||||
------------------------------------
|
||||
|
||||
Many applications already rely on ScyllaDB for high throughput, low and
|
||||
predictable latency, and large-scale data storage.
|
||||
|
||||
Vector Search complements these strengths by enabling new classes of workloads,
|
||||
including:
|
||||
|
||||
* Semantic search over text or documents
|
||||
* Recommendations based on user or item similarity
|
||||
* AI and ML applications, including RAG pipelines
|
||||
* Anomaly and pattern detection
|
||||
|
||||
With Vector Search, ScyllaDB can serve as the similarity search backend for
|
||||
AI-driven applications.
|
||||
|
||||
Availability
|
||||
--------------
|
||||
|
||||
Vector Search is currently available only in ScyllaDB Cloud, the fully managed
|
||||
ScyllaDB service.
|
||||
|
||||
|
||||
👉 For details on using Vector Search, refer to the
|
||||
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/index.html>`_.
|
||||
@@ -20,10 +20,7 @@ You can run your ScyllaDB workloads on AWS, GCE, and Azure using a ScyllaDB imag
|
||||
Amazon Web Services (AWS)
|
||||
-----------------------------
|
||||
|
||||
The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`,
|
||||
:ref:`i4i <system-requirements-i4i-instances>`, :ref:`i7i <system-requirements-i7i-instances>`,
|
||||
:ref:`i7ie <system-requirements-i7ie-instances>`, :ref:`i8g<system-requirements-i8g-instances>`,
|
||||
and :ref:`i8ge <system-requirements-i8ge-instances>`.
|
||||
The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`, :ref:`i4i <system-requirements-i4i-instances>`, :ref:`i7i <system-requirements-i7i-instances>`, and :ref:`i7ie <system-requirements-i7ie-instances>`.
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -198,118 +195,6 @@ All i7i instances have the following specs:
|
||||
|
||||
See `Amazon EC2 I7i Instances <https://aws.amazon.com/ec2/instance-types/i7i/>`_ for details.
|
||||
|
||||
|
||||
.. _system-requirements-i8g-instances:
|
||||
|
||||
i8g instances
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
The following i8g instances are supported.
|
||||
|
||||
.. list-table::
|
||||
:widths: 30 20 20 30
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- vCPU
|
||||
- Mem (GiB)
|
||||
- Storage (GB)
|
||||
* - i8g.large
|
||||
- 2
|
||||
- 16
|
||||
- 1 x 468 GB
|
||||
* - i8g.xlarge
|
||||
- 4
|
||||
- 32
|
||||
- 1 x 937 GB
|
||||
* - i8g.2xlarge
|
||||
- 8
|
||||
- 64
|
||||
- 1 x 1,875 GB
|
||||
* - i8g.4xlarge
|
||||
- 16
|
||||
- 128
|
||||
- 1 x 3,750 GB
|
||||
* - i8g.8xlarge
|
||||
- 32
|
||||
- 256
|
||||
- 2 x 3,750 GB
|
||||
* - i8g.12xlarge
|
||||
- 48
|
||||
- 384
|
||||
- 3 x 3,750 GB
|
||||
* - i8g.16xlarge
|
||||
- 64
|
||||
- 512
|
||||
- 4 x 3,750 GB
|
||||
|
||||
All i8g instances have the following specs:
|
||||
|
||||
* Powered by AWS Graviton4 processors
|
||||
* 3rd generation AWS Nitro SSD storage
|
||||
* DDR5-5600 memory for improved throughput
|
||||
* Up to 100 Gbps of networking bandwidth and up to 60 Gbps of bandwidth to
|
||||
Amazon Elastic Block Store (EBS)
|
||||
* Instance sizes offer up to 45 TB of total local NVMe instance storage
|
||||
|
||||
See `Amazon EC2 I8g Instances <https://aws.amazon.com/ec2/instance-types/i8g/>`_ for details.
|
||||
|
||||
.. _system-requirements-i8ge-instances:
|
||||
|
||||
i8ge instances
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
The following i8ge instances are supported.
|
||||
|
||||
.. list-table::
|
||||
:widths: 30 20 20 30
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- vCPU
|
||||
- Mem (GiB)
|
||||
- Storage (GB)
|
||||
* - i8ge.large
|
||||
- 2
|
||||
- 16
|
||||
- 1 x 1,250 GB
|
||||
* - i8ge.xlarge
|
||||
- 4
|
||||
- 32
|
||||
- 1 x 2,500 GB
|
||||
* - i8ge.2xlarge
|
||||
- 8
|
||||
- 64
|
||||
- 2 x 2,500 GB
|
||||
* - i8ge.3xlarge
|
||||
- 12
|
||||
- 96
|
||||
- 1 x 7,500 GB
|
||||
* - i8ge.6xlarge
|
||||
- 24
|
||||
- 192
|
||||
- 2 x 7,500 GB
|
||||
* - i8ge.12xlarge
|
||||
- 48
|
||||
- 384
|
||||
- 4 x 7,500 GB
|
||||
* - i8ge.18xlarge
|
||||
- 72
|
||||
- 576
|
||||
- 6 x 7,500 GB
|
||||
|
||||
All i8ge instances have the following specs:
|
||||
|
||||
* Powered by AWS Graviton4 processors
|
||||
* 3rd generation AWS Nitro SSD storage
|
||||
* DDR5-5600 memory for improved throughput
|
||||
* Up to 300 Gbps of networking bandwidth and up to 60 Gbps of bandwidth to
|
||||
Amazon Elastic Block Store (EBS)
|
||||
* Instance sizes offer up to 120 TB of total local NVMe instance storage
|
||||
|
||||
See `Amazon EC2 I8g and I8ge Instances <https://aws.amazon.com/ec2/instance-types/i8g/>`_ for details.
|
||||
|
||||
|
||||
Im4gn and Is4gen instances
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
ScyllaDB supports Arm-based Im4gn and Is4gen instances. See `Amazon EC2 Im4gn and Is4gen instances <https://aws.amazon.com/ec2/instance-types/i4g/>`_ for specification details.
|
||||
|
||||
@@ -25,7 +25,8 @@ Getting Started
|
||||
:id: "getting-started"
|
||||
:class: my-panel
|
||||
|
||||
* :doc:`Install ScyllaDB </getting-started/install-scylla/index/>`
|
||||
* `Install ScyllaDB (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/#core>`_ - Links to the ScyllaDB Download Center
|
||||
|
||||
* :doc:`Configure ScyllaDB </getting-started/system-configuration/>`
|
||||
* :doc:`Run ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`
|
||||
* :doc:`Create a ScyllaDB Cluster - Single Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster/>`
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
ScyllaDB Housekeeping and how to disable it
|
||||
============================================
|
||||
|
||||
It is always recommended to run the latest stable version of ScyllaDB.
|
||||
It is always recommended to run the latest version of ScyllaDB.
|
||||
The latest stable release version is always available from the `Download Center <https://www.scylladb.com/download/>`_.
|
||||
|
||||
When you install ScyllaDB, it installs by default two services: **scylla-housekeeping-restart** and **scylla-housekeeping-daily**. These services check for the latest ScyllaDB version and prompt the user if they are using a version that is older than what is publicly available.
|
||||
Information about your ScyllaDB deployment, including the ScyllaDB version currently used, as well as unique user and server identifiers, is collected by a centralized service.
|
||||
|
||||
@@ -9,8 +9,6 @@ Running ``cluster repair`` on a **single node** synchronizes all data on all nod
|
||||
To synchronize all data in clusters that have both tablets-based and vnodes-based keyspaces, run :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair/>` on **all**
|
||||
of the nodes in the cluster, and :doc:`nodetool cluster repair </operating-scylla/nodetool-commands/cluster/repair/>` on **any** of the nodes in the cluster.
|
||||
|
||||
.. warning:: :term:`Colocated Tables <Colocated Table>` cannot be synchronized using cluster repair in this version.
|
||||
|
||||
To check if a keyspace enables tablets, use:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
95
docs/poetry.lock
generated
95
docs/poetry.lock
generated
@@ -2,35 +2,36 @@
|
||||
|
||||
[[package]]
|
||||
name = "alabaster"
|
||||
version = "1.0.0"
|
||||
version = "0.7.16"
|
||||
description = "A light, configurable Sphinx theme"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b"},
|
||||
{file = "alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e"},
|
||||
{file = "alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92"},
|
||||
{file = "alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyio"
|
||||
version = "4.12.0"
|
||||
version = "4.11.0"
|
||||
description = "High-level concurrency and networking framework on top of asyncio or Trio"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb"},
|
||||
{file = "anyio-4.12.0.tar.gz", hash = "sha256:73c693b567b0c55130c104d0b43a9baf3aa6a31fc6110116509f27bf75e21ec0"},
|
||||
{file = "anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc"},
|
||||
{file = "anyio-4.11.0.tar.gz", hash = "sha256:82a8d0b81e318cc5ce71a5f1f8b5c4e63619620b63141ef8c995fa0db95a57c4"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
|
||||
idna = ">=2.8"
|
||||
sniffio = ">=1.1"
|
||||
typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""}
|
||||
|
||||
[package.extras]
|
||||
trio = ["trio (>=0.31.0) ; python_version < \"3.10\"", "trio (>=0.32.0) ; python_version >= \"3.10\""]
|
||||
trio = ["trio (>=0.31.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "babel"
|
||||
@@ -49,14 +50,14 @@ dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)"
|
||||
|
||||
[[package]]
|
||||
name = "beartype"
|
||||
version = "0.22.8"
|
||||
version = "0.22.6"
|
||||
description = "Unbearably fast near-real-time pure-Python runtime-static type-checker."
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "beartype-0.22.8-py3-none-any.whl", hash = "sha256:b832882d04e41a4097bab9f63e6992bc6de58c414ee84cba9b45b67314f5ab2e"},
|
||||
{file = "beartype-0.22.8.tar.gz", hash = "sha256:b19b21c9359722ee3f7cc433f063b3e13997b27ae8226551ea5062e621f61165"},
|
||||
{file = "beartype-0.22.6-py3-none-any.whl", hash = "sha256:0584bc46a2ea2a871509679278cda992eadde676c01356ab0ac77421f3c9a093"},
|
||||
{file = "beartype-0.22.6.tar.gz", hash = "sha256:97fbda69c20b48c5780ac2ca60ce3c1bb9af29b3a1a0216898ffabdd523e48f4"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
@@ -69,18 +70,18 @@ test-tox-coverage = ["coverage (>=5.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.14.3"
|
||||
version = "4.14.2"
|
||||
description = "Screen-scraping library"
|
||||
optional = false
|
||||
python-versions = ">=3.7.0"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb"},
|
||||
{file = "beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86"},
|
||||
{file = "beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515"},
|
||||
{file = "beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
soupsieve = ">=1.6.1"
|
||||
soupsieve = ">1.2"
|
||||
typing-extensions = ">=4.0.0"
|
||||
|
||||
[package.extras]
|
||||
@@ -801,6 +802,18 @@ files = [
|
||||
{file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sniffio"
|
||||
version = "1.3.1"
|
||||
description = "Sniff out which async library your code is running under"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
|
||||
{file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "snowballstemmer"
|
||||
version = "3.0.1"
|
||||
@@ -827,18 +840,18 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "sphinx"
|
||||
version = "8.1.3"
|
||||
version = "7.4.7"
|
||||
description = "Python documentation generator"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2"},
|
||||
{file = "sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927"},
|
||||
{file = "sphinx-7.4.7-py3-none-any.whl", hash = "sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239"},
|
||||
{file = "sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
alabaster = ">=0.7.14"
|
||||
alabaster = ">=0.7.14,<0.8.0"
|
||||
babel = ">=2.13"
|
||||
colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\""}
|
||||
docutils = ">=0.20,<0.22"
|
||||
@@ -848,17 +861,17 @@ packaging = ">=23.0"
|
||||
Pygments = ">=2.17"
|
||||
requests = ">=2.30.0"
|
||||
snowballstemmer = ">=2.2"
|
||||
sphinxcontrib-applehelp = ">=1.0.7"
|
||||
sphinxcontrib-devhelp = ">=1.0.6"
|
||||
sphinxcontrib-htmlhelp = ">=2.0.6"
|
||||
sphinxcontrib-jsmath = ">=1.0.1"
|
||||
sphinxcontrib-qthelp = ">=1.0.6"
|
||||
sphinxcontrib-applehelp = "*"
|
||||
sphinxcontrib-devhelp = "*"
|
||||
sphinxcontrib-htmlhelp = ">=2.0.0"
|
||||
sphinxcontrib-jsmath = "*"
|
||||
sphinxcontrib-qthelp = "*"
|
||||
sphinxcontrib-serializinghtml = ">=1.1.9"
|
||||
tomli = {version = ">=2", markers = "python_version < \"3.11\""}
|
||||
|
||||
[package.extras]
|
||||
docs = ["sphinxcontrib-websupport"]
|
||||
lint = ["flake8 (>=6.0)", "mypy (==1.11.1)", "pyright (==1.1.384)", "pytest (>=6.0)", "ruff (==0.6.9)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-Pillow (==10.2.0.20240822)", "types-Pygments (==2.18.0.20240506)", "types-colorama (==0.4.15.20240311)", "types-defusedxml (==0.7.0.20240218)", "types-docutils (==0.21.0.20241005)", "types-requests (==2.32.0.20240914)", "types-urllib3 (==1.26.25.14)"]
|
||||
lint = ["flake8 (>=6.0)", "importlib-metadata (>=6.0)", "mypy (==1.10.1)", "pytest (>=6.0)", "ruff (==0.5.2)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-docutils (==0.21.0.20240711)", "types-requests (>=2.30.0)"]
|
||||
test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=8.0)", "setuptools (>=70.0)", "typing_extensions (>=4.9)"]
|
||||
|
||||
[[package]]
|
||||
@@ -988,14 +1001,13 @@ test = ["tox"]
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-scylladb-markdown"
|
||||
version = "0.1.4"
|
||||
version = "0.1.3"
|
||||
description = "Sphinx extension for ScyllaDB documentation with enhanced Markdown support through MystParser and recommonmark."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sphinx_scylladb_markdown-0.1.4-py3-none-any.whl", hash = "sha256:598753e01cf159d4698eb1a707958828446e21749038d3d42c5b9c7e86eda6e4"},
|
||||
{file = "sphinx_scylladb_markdown-0.1.4.tar.gz", hash = "sha256:9db3ae0dcf7c3519262da65e48c7f9e4db0ad1ce9c5f874864ea218f4cbc4c68"},
|
||||
{file = "sphinx_scylladb_markdown-0.1.3-py3-none-any.whl", hash = "sha256:f20160b4aadf4c8cf95637f0a544121954b792914ab6ec05b67cae75e20a5566"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -1047,25 +1059,24 @@ dev = ["build", "flake8", "pre-commit", "pytest", "sphinx", "sphinx-last-updated
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-substitution-extensions"
|
||||
version = "2025.11.17"
|
||||
version = "2025.1.2"
|
||||
description = "Extensions for Sphinx which allow for substitutions."
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sphinx_substitution_extensions-2025.11.17-py2.py3-none-any.whl", hash = "sha256:ac18455bdc8324b337b0fe7498c1c0d0b1cb65c74d131459be4dea9edb6abbef"},
|
||||
{file = "sphinx_substitution_extensions-2025.11.17.tar.gz", hash = "sha256:aae17f8db9efc3d454a304373ae3df763f8739e05e0b98d5381db46f6d250b27"},
|
||||
{file = "sphinx_substitution_extensions-2025.1.2-py2.py3-none-any.whl", hash = "sha256:ff14f40e4393bd7434a196badb8d47983355d9755af884b902e3023fb456b958"},
|
||||
{file = "sphinx_substitution_extensions-2025.1.2.tar.gz", hash = "sha256:53b8d394d5098a09aef36bc687fa310aeb28466319d2c750e996e46400fb2474"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
beartype = ">=0.18.5"
|
||||
docutils = ">=0.19"
|
||||
myst-parser = ">=4.0.0"
|
||||
sphinx = ">=8.1.0"
|
||||
sphinx = ">=7.3.5"
|
||||
|
||||
[package.extras]
|
||||
dev = ["actionlint-py (==1.7.8.24)", "check-manifest (==0.51)", "deptry (==0.24.0)", "doc8 (==2.0.0)", "doccmd (==2025.11.8.1)", "docformatter (==1.7.7)", "interrogate (==1.7.0)", "mypy-strict-kwargs (==2025.4.3)", "mypy[faster-cache] (==1.18.2)", "pre-commit (==4.4.0)", "pylint[spelling] (==4.0.3)", "pyproject-fmt (==2.11.1)", "pyright (==1.1.407)", "pyroma (==5.0)", "pytest (==9.0.1)", "pytest-cov (==7.0.0)", "ruff (==0.14.5)", "shellcheck-py (==0.11.0.1)", "shfmt-py (==3.12.0.2)", "sphinx-lint (==1.0.1)", "sphinx-toolbox (==4.0.0)", "types-docutils (==0.22.2.20251006)", "vulture (==2.14)", "yamlfix (==1.19.0)"]
|
||||
release = ["check-wheel-contents (==0.6.3)"]
|
||||
dev = ["actionlint-py (==1.7.5.21)", "check-manifest (==0.50)", "deptry (==0.21.2)", "doc8 (==1.1.2)", "doccmd (==2024.12.26)", "docformatter (==1.7.5)", "interrogate (==1.7.0)", "mypy-strict-kwargs (==2024.12.25)", "mypy[faster-cache] (==1.14.1)", "myst-parser (==4.0.0)", "pre-commit (==4.0.1)", "pyenchant (==3.3.0rc1)", "pylint (==3.3.3)", "pyproject-fmt (==2.5.0)", "pyright (==1.1.391)", "pyroma (==4.2)", "pytest (==8.3.4)", "pytest-cov (==6.0.0)", "ruff (==0.8.4)", "shellcheck-py (==0.10.0.1)", "shfmt-py (==3.7.0.1)", "sphinx-toolbox (==3.8.1)", "sphinx[test] (==8.1.3)", "types-docutils (==0.21.0.20241128)", "vulture (==2.14)", "yamlfix (==1.17.0)"]
|
||||
release = ["check-wheel-contents (==0.6.1)"]
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-tabs"
|
||||
@@ -1352,21 +1363,21 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "2.6.2"
|
||||
version = "2.5.0"
|
||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"},
|
||||
{file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"},
|
||||
{file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"},
|
||||
{file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
brotli = ["brotli (>=1.2.0) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=1.2.0.0) ; platform_python_implementation != \"CPython\""]
|
||||
brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""]
|
||||
h2 = ["h2 (>=4,<5)"]
|
||||
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
|
||||
zstd = ["backports-zstd (>=1.0.0) ; python_version < \"3.14\""]
|
||||
zstd = ["zstandard (>=0.18.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "uvicorn"
|
||||
@@ -1592,4 +1603,4 @@ files = [
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "9a17caa38b3c88f3fe3d1a60fdb73a96aa12ff1e30ecb00e2f9249e7ba9f859c"
|
||||
content-hash = "0ae673106f45d3465cbdabbf511e165ca44feadd34d7753f2e68093afaa95c79"
|
||||
|
||||
@@ -12,10 +12,10 @@ redirects_cli ="^0.1.3"
|
||||
sphinx-scylladb-theme = "^1.8.10"
|
||||
sphinx-sitemap = "^2.6.0"
|
||||
sphinx-autobuild = "^2024.4.19"
|
||||
Sphinx = "^8.0.0"
|
||||
Sphinx = "^7.3.7"
|
||||
sphinx-multiversion-scylla = "^0.3.4"
|
||||
sphinxcontrib-datatemplates = "^0.9.2"
|
||||
sphinx-scylladb-markdown = "^0.1.4"
|
||||
sphinx-scylladb-markdown = "^0.1.2"
|
||||
sphinx_collapse ="^0.1.3"
|
||||
|
||||
[build-system]
|
||||
|
||||
@@ -202,7 +202,3 @@ Glossary
|
||||
The name comes from two basic operations, multiply (MU) and rotate (R), used in its inner loop.
|
||||
The MurmurHash3 version used in ScyllaDB originated from `Apache Cassandra <https://commons.apache.org/proper/commons-codec/apidocs/org/apache/commons/codec/digest/MurmurHash3.html>`_, and is **not** identical to the `official MurmurHash3 calculation <https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/MurmurHash.java#L31-L33>`_. More `here <https://github.com/russss/murmur3-cassandra>`_.
|
||||
|
||||
Colocated Table
|
||||
An internal table of a special type in a :doc:`tablets </architecture/tablets>` enabled keyspace that is colocated with another base table, meaning it always has the same tablet replicas as the base table.
|
||||
Current types of colocated tables include CDC log tables, local indexes, and materialized views that have the same partition key as their base table.
|
||||
|
||||
|
||||
@@ -816,6 +816,7 @@ public:
|
||||
future<data_sink> wrap_sink(const sstables::sstable& sst, sstables::component_type type, data_sink sink) override {
|
||||
switch (type) {
|
||||
case sstables::component_type::Scylla:
|
||||
case sstables::component_type::TemporaryScylla:
|
||||
case sstables::component_type::TemporaryTOC:
|
||||
case sstables::component_type::TOC:
|
||||
co_return sink;
|
||||
@@ -844,6 +845,7 @@ public:
|
||||
sstables::component_type type,
|
||||
data_source src) override {
|
||||
switch (type) {
|
||||
case sstables::component_type::TemporaryScylla:
|
||||
case sstables::component_type::Scylla:
|
||||
case sstables::component_type::TemporaryTOC:
|
||||
case sstables::component_type::TOC:
|
||||
|
||||
@@ -176,8 +176,6 @@ public:
|
||||
gms::feature rack_list_rf { *this, "RACK_LIST_RF"sv };
|
||||
gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
|
||||
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
|
||||
gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
|
||||
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
|
||||
public:
|
||||
|
||||
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
|
||||
|
||||
15
main.cc
15
main.cc
@@ -23,7 +23,6 @@
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/signal.hh>
|
||||
#include <seastar/core/timer.hh>
|
||||
#include "service/client_routes.hh"
|
||||
#include "service/qos/raft_service_level_distributed_data_accessor.hh"
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "tasks/task_manager.hh"
|
||||
@@ -1796,13 +1795,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
auth_cache.stop().get();
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "initializing client routes service");
|
||||
static sharded<service::client_routes_service> client_routes;
|
||||
client_routes.start(std::ref(stop_signal.as_sharded_abort_source()), std::ref(feature_service), std::ref(group0_client), std::ref(qp), std::ref(lifecycle_notifier)).get();
|
||||
auto stop_client_routes = defer_verbose_shutdown("client_routes", [&] {
|
||||
client_routes.stop().get();
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "initializing storage service");
|
||||
debug::the_storage_service = &ss;
|
||||
ss.start(std::ref(stop_signal.as_sharded_abort_source()),
|
||||
@@ -1811,7 +1803,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
std::ref(messaging), std::ref(repair),
|
||||
std::ref(stream_manager), std::ref(lifecycle_notifier), std::ref(bm), std::ref(snitch),
|
||||
std::ref(tablet_allocator), std::ref(cdc_generation_service), std::ref(view_builder), std::ref(view_building_worker), std::ref(qp), std::ref(sl_controller),
|
||||
std::ref(auth_cache), std::ref(client_routes),
|
||||
std::ref(auth_cache),
|
||||
std::ref(tsm), std::ref(vbsm), std::ref(task_manager), std::ref(gossip_address_map),
|
||||
compression_dict_updated_callback,
|
||||
only_on_shard0(&*disk_space_monitor_shard0)
|
||||
@@ -2199,11 +2191,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
});
|
||||
}).get();
|
||||
|
||||
api::set_server_client_routes(ctx, client_routes).get();
|
||||
auto stop_cr_api = defer_verbose_shutdown("client routes API", [&ctx] {
|
||||
api::unset_server_client_routes(ctx).get();
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "join cluster");
|
||||
// Allow abort during join_cluster since bootstrap or replace
|
||||
// can take a long time.
|
||||
|
||||
@@ -45,9 +45,7 @@ mutation_partition::mutation_partition(const schema& s, const mutation_partition
|
||||
: _tombstone(x._tombstone)
|
||||
, _static_row(s, column_kind::static_column, x._static_row)
|
||||
, _static_row_continuous(x._static_row_continuous)
|
||||
, _rows(use_single_row_storage(s) ?
|
||||
rows_storage_type(std::optional<deletable_row>{}) :
|
||||
rows_storage_type(rows_type{}))
|
||||
, _rows()
|
||||
, _row_tombstones(x._row_tombstones)
|
||||
#ifdef SEASTAR_DEBUG
|
||||
, _schema_version(s.version())
|
||||
@@ -56,30 +54,10 @@ mutation_partition::mutation_partition(const schema& s, const mutation_partition
|
||||
#ifdef SEASTAR_DEBUG
|
||||
SCYLLA_ASSERT(x._schema_version == _schema_version);
|
||||
#endif
|
||||
if (use_single_row_storage(s)) {
|
||||
// Copy single row if it exists
|
||||
if (x.uses_single_row_storage()) {
|
||||
const auto& x_row = x.get_single_row_storage();
|
||||
if (x_row) {
|
||||
get_single_row_storage() = deletable_row(s, *x_row);
|
||||
}
|
||||
} else if (!x.get_rows_storage().empty()) {
|
||||
// Converting from multi-row to single-row - take the first row
|
||||
// This shouldn't normally happen as schema doesn't change this way
|
||||
on_internal_error(mplog, "mutation_partition: cannot convert multi-row partition to single-row");
|
||||
}
|
||||
} else {
|
||||
// Multi-row storage
|
||||
if (x.uses_single_row_storage()) {
|
||||
// Converting from single-row to multi-row - this shouldn't normally happen
|
||||
on_internal_error(mplog, "mutation_partition: cannot convert single-row partition to multi-row");
|
||||
} else {
|
||||
auto cloner = [&s] (const rows_entry* x) -> rows_entry* {
|
||||
return current_allocator().construct<rows_entry>(s, *x);
|
||||
};
|
||||
get_rows_storage().clone_from(x.get_rows_storage(), cloner, current_deleter<rows_entry>());
|
||||
}
|
||||
}
|
||||
auto cloner = [&s] (const rows_entry* x) -> rows_entry* {
|
||||
return current_allocator().construct<rows_entry>(s, *x);
|
||||
};
|
||||
_rows.clone_from(x._rows, cloner, current_deleter<rows_entry>());
|
||||
}
|
||||
|
||||
mutation_partition::mutation_partition(const mutation_partition& x, const schema& schema,
|
||||
@@ -87,9 +65,7 @@ mutation_partition::mutation_partition(const mutation_partition& x, const schema
|
||||
: _tombstone(x._tombstone)
|
||||
, _static_row(schema, column_kind::static_column, x._static_row)
|
||||
, _static_row_continuous(x._static_row_continuous)
|
||||
, _rows(use_single_row_storage(schema) ?
|
||||
rows_storage_type(std::optional<deletable_row>{}) :
|
||||
rows_storage_type(rows_type{}))
|
||||
, _rows()
|
||||
, _row_tombstones(x._row_tombstones, range_tombstone_list::copy_comparator_only())
|
||||
#ifdef SEASTAR_DEBUG
|
||||
, _schema_version(schema.version())
|
||||
@@ -98,37 +74,19 @@ mutation_partition::mutation_partition(const mutation_partition& x, const schema
|
||||
#ifdef SEASTAR_DEBUG
|
||||
SCYLLA_ASSERT(x._schema_version == _schema_version);
|
||||
#endif
|
||||
if (use_single_row_storage(schema)) {
|
||||
// Single-row storage: just copy the row if it exists
|
||||
if (x.uses_single_row_storage()) {
|
||||
const auto& x_row = x.get_single_row_storage();
|
||||
if (x_row) {
|
||||
get_single_row_storage() = deletable_row(schema, *x_row);
|
||||
try {
|
||||
for(auto&& r : ck_ranges) {
|
||||
for (const rows_entry& e : x.range(schema, r)) {
|
||||
auto ce = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(schema, e));
|
||||
_rows.insert_before_hint(_rows.end(), std::move(ce), rows_entry::tri_compare(schema));
|
||||
}
|
||||
} else {
|
||||
// Filtering from multi-row - shouldn't happen with consistent schema
|
||||
on_internal_error(mplog, "mutation_partition: filtering from multi-row to single-row storage");
|
||||
}
|
||||
} else {
|
||||
// Multi-row storage with filtering
|
||||
if (x.uses_single_row_storage()) {
|
||||
on_internal_error(mplog, "mutation_partition: filtering from single-row to multi-row storage");
|
||||
} else {
|
||||
try {
|
||||
for(auto&& r : ck_ranges) {
|
||||
for (const rows_entry& e : x.range(schema, r)) {
|
||||
auto ce = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(schema, e));
|
||||
get_rows_storage().insert_before_hint(get_rows_storage().end(), std::move(ce), rows_entry::tri_compare(schema));
|
||||
}
|
||||
for (auto&& rt : x._row_tombstones.slice(schema, r)) {
|
||||
_row_tombstones.apply(schema, rt.tombstone());
|
||||
}
|
||||
}
|
||||
} catch (...) {
|
||||
get_rows_storage().clear_and_dispose(current_deleter<rows_entry>());
|
||||
throw;
|
||||
for (auto&& rt : x._row_tombstones.slice(schema, r)) {
|
||||
_row_tombstones.apply(schema, rt.tombstone());
|
||||
}
|
||||
}
|
||||
} catch (...) {
|
||||
_rows.clear_and_dispose(current_deleter<rows_entry>());
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -146,20 +104,14 @@ mutation_partition::mutation_partition(mutation_partition&& x, const schema& sch
|
||||
#ifdef SEASTAR_DEBUG
|
||||
SCYLLA_ASSERT(x._schema_version == _schema_version);
|
||||
#endif
|
||||
if (use_single_row_storage(schema)) {
|
||||
// Single-row storage: no filtering needed, row either exists or doesn't
|
||||
// The move constructor has already moved the row if it exists
|
||||
} else {
|
||||
// Multi-row storage: filter the rows
|
||||
if (!uses_single_row_storage()) {
|
||||
auto deleter = current_deleter<rows_entry>();
|
||||
auto it = get_rows_storage().begin();
|
||||
for (auto&& range : ck_ranges.ranges()) {
|
||||
get_rows_storage().erase_and_dispose(it, lower_bound(schema, range), deleter);
|
||||
it = upper_bound(schema, range);
|
||||
}
|
||||
get_rows_storage().erase_and_dispose(it, get_rows_storage().end(), deleter);
|
||||
{
|
||||
auto deleter = current_deleter<rows_entry>();
|
||||
auto it = _rows.begin();
|
||||
for (auto&& range : ck_ranges.ranges()) {
|
||||
_rows.erase_and_dispose(it, lower_bound(schema, range), deleter);
|
||||
it = upper_bound(schema, range);
|
||||
}
|
||||
_rows.erase_and_dispose(it, _rows.end(), deleter);
|
||||
}
|
||||
{
|
||||
for (auto&& range : ck_ranges.ranges()) {
|
||||
@@ -175,11 +127,7 @@ mutation_partition::mutation_partition(mutation_partition&& x, const schema& sch
|
||||
}
|
||||
|
||||
mutation_partition::~mutation_partition() {
|
||||
if (uses_single_row_storage()) {
|
||||
// Single-row storage: optional destructor handles cleanup
|
||||
} else {
|
||||
get_rows_storage().clear_and_dispose(current_deleter<rows_entry>());
|
||||
}
|
||||
_rows.clear_and_dispose(current_deleter<rows_entry>());
|
||||
}
|
||||
|
||||
mutation_partition&
|
||||
@@ -193,14 +141,10 @@ mutation_partition::operator=(mutation_partition&& x) noexcept {
|
||||
|
||||
void mutation_partition::ensure_last_dummy(const schema& s) {
|
||||
check_schema(s);
|
||||
if (uses_single_row_storage()) {
|
||||
// Single-row storage doesn't use dummy entries
|
||||
return;
|
||||
}
|
||||
if (get_rows_storage().empty() || !get_rows_storage().rbegin()->is_last_dummy()) {
|
||||
if (_rows.empty() || !_rows.rbegin()->is_last_dummy()) {
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::yes));
|
||||
get_rows_storage().insert_before(get_rows_storage().end(), std::move(e));
|
||||
_rows.insert_before(_rows.end(), std::move(e));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -475,18 +419,9 @@ mutation_partition::tombstone_for_row(const schema& schema, const clustering_key
|
||||
check_schema(schema);
|
||||
row_tombstone t = row_tombstone(range_tombstone_for_row(schema, key));
|
||||
|
||||
if (use_single_row_storage(schema)) {
|
||||
// Single-row storage: check if the single row exists and has tombstone
|
||||
const auto& row_opt = get_single_row_storage();
|
||||
if (row_opt) {
|
||||
t.apply(row_opt->deleted_at(), row_opt->marker());
|
||||
}
|
||||
} else {
|
||||
// Multi-row storage: search in B-tree
|
||||
auto j = get_rows_storage().find(key, rows_entry::tri_compare(schema));
|
||||
if (j != get_rows_storage().end()) {
|
||||
t.apply(j->row().deleted_at(), j->row().marker());
|
||||
}
|
||||
auto j = _rows.find(key, rows_entry::tri_compare(schema));
|
||||
if (j != _rows.end()) {
|
||||
t.apply(j->row().deleted_at(), j->row().marker());
|
||||
}
|
||||
|
||||
return t;
|
||||
@@ -569,178 +504,97 @@ void mutation_partition::apply_insert(const schema& s, clustering_key_view key,
|
||||
clustered_row(s, key).apply(row_marker(created_at, ttl, expiry));
|
||||
}
|
||||
void mutation_partition::insert_row(const schema& s, const clustering_key& key, deletable_row&& row) {
|
||||
if (use_single_row_storage(s)) {
|
||||
// Single-row storage: just set the row
|
||||
get_single_row_storage() = std::move(row);
|
||||
} else {
|
||||
// Multi-row storage: insert into B-tree
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(key, std::move(row)));
|
||||
get_rows_storage().insert_before_hint(get_rows_storage().end(), std::move(e), rows_entry::tri_compare(s));
|
||||
}
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(key, std::move(row)));
|
||||
_rows.insert_before_hint(_rows.end(), std::move(e), rows_entry::tri_compare(s));
|
||||
}
|
||||
|
||||
void mutation_partition::insert_row(const schema& s, const clustering_key& key, const deletable_row& row) {
|
||||
check_schema(s);
|
||||
if (use_single_row_storage(s)) {
|
||||
// Single-row storage: just copy the row
|
||||
get_single_row_storage() = row;
|
||||
} else {
|
||||
// Multi-row storage: insert into B-tree
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(s, key, row));
|
||||
get_rows_storage().insert_before_hint(get_rows_storage().end(), std::move(e), rows_entry::tri_compare(s));
|
||||
}
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(s, key, row));
|
||||
_rows.insert_before_hint(_rows.end(), std::move(e), rows_entry::tri_compare(s));
|
||||
}
|
||||
|
||||
const row*
|
||||
mutation_partition::find_row(const schema& s, const clustering_key& key) const {
|
||||
check_schema(s);
|
||||
if (use_single_row_storage(s)) {
|
||||
// Single-row storage: return the single row's cells if it exists
|
||||
const auto& row_opt = get_single_row_storage();
|
||||
if (row_opt) {
|
||||
return &row_opt->cells();
|
||||
}
|
||||
auto i = _rows.find(key, rows_entry::tri_compare(s));
|
||||
if (i == _rows.end()) {
|
||||
return nullptr;
|
||||
} else {
|
||||
// Multi-row storage: search in B-tree
|
||||
auto i = get_rows_storage().find(key, rows_entry::tri_compare(s));
|
||||
if (i == get_rows_storage().end()) {
|
||||
return nullptr;
|
||||
}
|
||||
return &i->row().cells();
|
||||
}
|
||||
return &i->row().cells();
|
||||
}
|
||||
|
||||
deletable_row&
|
||||
mutation_partition::clustered_row(const schema& s, clustering_key&& key) {
|
||||
check_schema(s);
|
||||
check_row_key(s, key, is_dummy::no);
|
||||
|
||||
if (use_single_row_storage(s)) {
|
||||
// Single-row storage: create row if it doesn't exist
|
||||
auto& row_opt = get_single_row_storage();
|
||||
if (!row_opt) {
|
||||
row_opt = deletable_row();
|
||||
}
|
||||
return *row_opt;
|
||||
} else {
|
||||
// Multi-row storage: find or insert in B-tree
|
||||
auto i = get_rows_storage().find(key, rows_entry::tri_compare(s));
|
||||
if (i == get_rows_storage().end()) {
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(std::move(key)));
|
||||
i = get_rows_storage().insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
|
||||
}
|
||||
return i->row();
|
||||
auto i = _rows.find(key, rows_entry::tri_compare(s));
|
||||
if (i == _rows.end()) {
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(std::move(key)));
|
||||
i = _rows.insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
|
||||
}
|
||||
return i->row();
|
||||
}
|
||||
|
||||
deletable_row&
|
||||
mutation_partition::clustered_row(const schema& s, const clustering_key& key) {
|
||||
check_schema(s);
|
||||
check_row_key(s, key, is_dummy::no);
|
||||
|
||||
if (use_single_row_storage(s)) {
|
||||
// Single-row storage: create row if it doesn't exist
|
||||
auto& row_opt = get_single_row_storage();
|
||||
if (!row_opt) {
|
||||
row_opt = deletable_row();
|
||||
}
|
||||
return *row_opt;
|
||||
} else {
|
||||
// Multi-row storage: find or insert in B-tree
|
||||
auto i = get_rows_storage().find(key, rows_entry::tri_compare(s));
|
||||
if (i == get_rows_storage().end()) {
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(key));
|
||||
i = get_rows_storage().insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
|
||||
}
|
||||
return i->row();
|
||||
auto i = _rows.find(key, rows_entry::tri_compare(s));
|
||||
if (i == _rows.end()) {
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(key));
|
||||
i = _rows.insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
|
||||
}
|
||||
return i->row();
|
||||
}
|
||||
|
||||
deletable_row&
|
||||
mutation_partition::clustered_row(const schema& s, clustering_key_view key) {
|
||||
check_schema(s);
|
||||
check_row_key(s, key, is_dummy::no);
|
||||
|
||||
if (use_single_row_storage(s)) {
|
||||
// Single-row storage: create row if it doesn't exist
|
||||
auto& row_opt = get_single_row_storage();
|
||||
if (!row_opt) {
|
||||
row_opt = deletable_row();
|
||||
}
|
||||
return *row_opt;
|
||||
} else {
|
||||
// Multi-row storage: find or insert in B-tree
|
||||
auto i = get_rows_storage().find(key, rows_entry::tri_compare(s));
|
||||
if (i == get_rows_storage().end()) {
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(key));
|
||||
i = get_rows_storage().insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
|
||||
}
|
||||
return i->row();
|
||||
auto i = _rows.find(key, rows_entry::tri_compare(s));
|
||||
if (i == _rows.end()) {
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(key));
|
||||
i = _rows.insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
|
||||
}
|
||||
return i->row();
|
||||
}
|
||||
|
||||
rows_entry&
|
||||
mutation_partition::clustered_rows_entry(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
|
||||
check_schema(s);
|
||||
check_row_key(s, pos, dummy);
|
||||
|
||||
if (use_single_row_storage(s)) {
|
||||
// Single-row storage doesn't use rows_entry - this shouldn't be called
|
||||
on_internal_error(mplog, "mutation_partition::clustered_rows_entry() called with single-row storage");
|
||||
}
|
||||
|
||||
auto i = get_rows_storage().find(pos, rows_entry::tri_compare(s));
|
||||
if (i == get_rows_storage().end()) {
|
||||
auto i = _rows.find(pos, rows_entry::tri_compare(s));
|
||||
if (i == _rows.end()) {
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(s, pos, dummy, continuous));
|
||||
i = get_rows_storage().insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
|
||||
i = _rows.insert_before_hint(i, std::move(e), rows_entry::tri_compare(s)).first;
|
||||
}
|
||||
return *i;
|
||||
}
|
||||
|
||||
deletable_row&
|
||||
mutation_partition::clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
|
||||
if (use_single_row_storage(s)) {
|
||||
// Single-row storage: ignore dummy/continuous flags, just get/create the row
|
||||
check_row_key(s, pos, dummy);
|
||||
auto& row_opt = get_single_row_storage();
|
||||
if (!row_opt) {
|
||||
row_opt = deletable_row();
|
||||
}
|
||||
return *row_opt;
|
||||
} else {
|
||||
return clustered_rows_entry(s, pos, dummy, continuous).row();
|
||||
}
|
||||
return clustered_rows_entry(s, pos, dummy, continuous).row();
|
||||
}
|
||||
|
||||
deletable_row&
|
||||
mutation_partition::append_clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
|
||||
check_schema(s);
|
||||
check_row_key(s, pos, dummy);
|
||||
|
||||
if (use_single_row_storage(s)) {
|
||||
// Single-row storage: just create/get the row
|
||||
auto& row_opt = get_single_row_storage();
|
||||
if (!row_opt) {
|
||||
row_opt = deletable_row();
|
||||
}
|
||||
return *row_opt;
|
||||
}
|
||||
|
||||
const auto cmp = rows_entry::tri_compare(s);
|
||||
auto i = get_rows_storage().end();
|
||||
if (!get_rows_storage().empty() && (cmp(*std::prev(i), pos) >= 0)) {
|
||||
auto i = _rows.end();
|
||||
if (!_rows.empty() && (cmp(*std::prev(i), pos) >= 0)) {
|
||||
on_internal_error(mplog, format("mutation_partition::append_clustered_row(): cannot append clustering row with key {} to the partition"
|
||||
", last clustering row is equal or greater: {}", pos, std::prev(i)->position()));
|
||||
}
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(s, pos, dummy, continuous));
|
||||
i = get_rows_storage().insert_before_hint(i, std::move(e), cmp).first;
|
||||
i = _rows.insert_before_hint(i, std::move(e), cmp).first;
|
||||
|
||||
return i->row();
|
||||
}
|
||||
@@ -748,33 +602,19 @@ mutation_partition::append_clustered_row(const schema& s, position_in_partition_
|
||||
mutation_partition::rows_type::const_iterator
|
||||
mutation_partition::lower_bound(const schema& schema, const query::clustering_range& r) const {
|
||||
check_schema(schema);
|
||||
|
||||
if (use_single_row_storage(schema)) {
|
||||
// Single-row storage: always return end iterator (empty range)
|
||||
static const rows_type empty_rows;
|
||||
return empty_rows.end();
|
||||
}
|
||||
|
||||
if (!r.start()) {
|
||||
return std::cbegin(get_rows_storage());
|
||||
return std::cbegin(_rows);
|
||||
}
|
||||
return get_rows_storage().lower_bound(position_in_partition_view::for_range_start(r), rows_entry::tri_compare(schema));
|
||||
return _rows.lower_bound(position_in_partition_view::for_range_start(r), rows_entry::tri_compare(schema));
|
||||
}
|
||||
|
||||
mutation_partition::rows_type::const_iterator
|
||||
mutation_partition::upper_bound(const schema& schema, const query::clustering_range& r) const {
|
||||
check_schema(schema);
|
||||
|
||||
if (use_single_row_storage(schema)) {
|
||||
// Single-row storage: always return end iterator (empty range)
|
||||
static const rows_type empty_rows;
|
||||
return empty_rows.end();
|
||||
}
|
||||
|
||||
if (!r.end()) {
|
||||
return std::cend(get_rows_storage());
|
||||
return std::cend(_rows);
|
||||
}
|
||||
return get_rows_storage().lower_bound(position_in_partition_view::for_range_end(r), rows_entry::tri_compare(schema));
|
||||
return _rows.lower_bound(position_in_partition_view::for_range_end(r), rows_entry::tri_compare(schema));
|
||||
}
|
||||
|
||||
std::ranges::subrange<mutation_partition::rows_type::const_iterator>
|
||||
@@ -785,32 +625,17 @@ mutation_partition::range(const schema& schema, const query::clustering_range& r
|
||||
|
||||
std::ranges::subrange<mutation_partition::rows_type::iterator>
|
||||
mutation_partition::range(const schema& schema, const query::clustering_range& r) {
|
||||
if (use_single_row_storage(schema)) {
|
||||
// Single-row storage: return empty range (rows_entry iteration not applicable)
|
||||
static rows_type empty_rows;
|
||||
return std::ranges::subrange(empty_rows.begin(), empty_rows.end());
|
||||
}
|
||||
return unconst(get_rows_storage(), static_cast<const mutation_partition*>(this)->range(schema, r));
|
||||
return unconst(_rows, static_cast<const mutation_partition*>(this)->range(schema, r));
|
||||
}
|
||||
|
||||
mutation_partition::rows_type::iterator
|
||||
mutation_partition::lower_bound(const schema& schema, const query::clustering_range& r) {
|
||||
if (use_single_row_storage(schema)) {
|
||||
// Single-row storage: return end iterator (empty range)
|
||||
static rows_type empty_rows;
|
||||
return empty_rows.end();
|
||||
}
|
||||
return unconst(get_rows_storage(), static_cast<const mutation_partition*>(this)->lower_bound(schema, r));
|
||||
return unconst(_rows, static_cast<const mutation_partition*>(this)->lower_bound(schema, r));
|
||||
}
|
||||
|
||||
mutation_partition::rows_type::iterator
|
||||
mutation_partition::upper_bound(const schema& schema, const query::clustering_range& r) {
|
||||
if (use_single_row_storage(schema)) {
|
||||
// Single-row storage: return end iterator (empty range)
|
||||
static rows_type empty_rows;
|
||||
return empty_rows.end();
|
||||
}
|
||||
return unconst(get_rows_storage(), static_cast<const mutation_partition*>(this)->upper_bound(schema, r));
|
||||
return unconst(_rows, static_cast<const mutation_partition*>(this)->upper_bound(schema, r));
|
||||
}
|
||||
|
||||
template<typename Func>
|
||||
@@ -1552,15 +1377,7 @@ bool mutation_partition::empty() const
|
||||
if (_tombstone.timestamp != api::missing_timestamp) {
|
||||
return false;
|
||||
}
|
||||
if (_static_row.size() || !_row_tombstones.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (uses_single_row_storage()) {
|
||||
return !get_single_row_storage().has_value();
|
||||
} else {
|
||||
return get_rows_storage().empty();
|
||||
}
|
||||
return !_static_row.size() && _rows.empty() && _row_tombstones.empty();
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -1605,11 +1422,7 @@ mutation_partition::live_row_count(const schema& s, gc_clock::time_point query_t
|
||||
|
||||
uint64_t
|
||||
mutation_partition::row_count() const {
|
||||
if (uses_single_row_storage()) {
|
||||
return get_single_row_storage().has_value() ? 1 : 0;
|
||||
} else {
|
||||
return get_rows_storage().calculate_size();
|
||||
}
|
||||
return _rows.calculate_size();
|
||||
}
|
||||
|
||||
rows_entry::rows_entry(rows_entry&& o) noexcept
|
||||
@@ -2406,22 +2219,15 @@ public:
|
||||
mutation_partition::mutation_partition(mutation_partition::incomplete_tag, const schema& s, tombstone t)
|
||||
: _tombstone(t)
|
||||
, _static_row_continuous(!s.has_static_columns())
|
||||
, _rows(use_single_row_storage(s) ?
|
||||
rows_storage_type(std::optional<deletable_row>{}) :
|
||||
rows_storage_type(rows_type{}))
|
||||
, _rows()
|
||||
, _row_tombstones(s)
|
||||
#ifdef SEASTAR_DEBUG
|
||||
, _schema_version(s.version())
|
||||
#endif
|
||||
{
|
||||
if (use_single_row_storage(s)) {
|
||||
// Single-row storage: no dummy entries needed, leave row as empty optional
|
||||
} else {
|
||||
// Multi-row storage: add last dummy entry for discontinuous partition
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::no));
|
||||
get_rows_storage().insert_before(get_rows_storage().end(), std::move(e));
|
||||
}
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(
|
||||
current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::no));
|
||||
_rows.insert_before(_rows.end(), std::move(e));
|
||||
}
|
||||
|
||||
bool mutation_partition::is_fully_continuous() const {
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <iosfwd>
|
||||
#include <variant>
|
||||
#include <boost/intrusive/parent_from_member.hpp>
|
||||
|
||||
#include <seastar/util/optimized_optional.hh>
|
||||
@@ -1189,12 +1188,6 @@ inline void check_row_key(const schema& s, position_in_partition_view pos, is_du
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true if the schema has no clustering keys, meaning partitions can have at most one row.
|
||||
// When true, mutation_partition uses std::optional<deletable_row> instead of full rows_type container.
|
||||
inline bool use_single_row_storage(const schema& s) {
|
||||
return s.clustering_key_size() == 0;
|
||||
}
|
||||
|
||||
// Represents a set of writes made to a single partition.
|
||||
//
|
||||
// The object is schema-dependent. Each instance is governed by some
|
||||
@@ -1235,45 +1228,20 @@ inline bool use_single_row_storage(const schema& s) {
|
||||
class mutation_partition final {
|
||||
public:
|
||||
using rows_type = rows_entry::container_type;
|
||||
using rows_storage_type = std::variant<rows_type, std::optional<deletable_row>>;
|
||||
friend class size_calculator;
|
||||
private:
|
||||
tombstone _tombstone;
|
||||
lazy_row _static_row;
|
||||
bool _static_row_continuous = true;
|
||||
rows_storage_type _rows;
|
||||
rows_type _rows;
|
||||
// Contains only strict prefixes so that we don't have to lookup full keys
|
||||
// in both _row_tombstones and _rows.
|
||||
// Note: empty when using single-row storage (std::optional<deletable_row> variant)
|
||||
range_tombstone_list _row_tombstones;
|
||||
#ifdef SEASTAR_DEBUG
|
||||
table_schema_version _schema_version;
|
||||
#endif
|
||||
|
||||
friend class converting_mutation_partition_applier;
|
||||
|
||||
// Returns true if this partition uses single-row storage
|
||||
bool uses_single_row_storage() const {
|
||||
return std::holds_alternative<std::optional<deletable_row>>(_rows);
|
||||
}
|
||||
|
||||
// Get reference to rows container (multi-row storage)
|
||||
rows_type& get_rows_storage() {
|
||||
return std::get<rows_type>(_rows);
|
||||
}
|
||||
|
||||
const rows_type& get_rows_storage() const {
|
||||
return std::get<rows_type>(_rows);
|
||||
}
|
||||
|
||||
// Get reference to single row storage
|
||||
std::optional<deletable_row>& get_single_row_storage() {
|
||||
return std::get<std::optional<deletable_row>>(_rows);
|
||||
}
|
||||
|
||||
const std::optional<deletable_row>& get_single_row_storage() const {
|
||||
return std::get<std::optional<deletable_row>>(_rows);
|
||||
}
|
||||
public:
|
||||
struct copy_comparators_only {};
|
||||
struct incomplete_tag {};
|
||||
@@ -1283,14 +1251,14 @@ public:
|
||||
return mutation_partition(incomplete_tag(), s, t);
|
||||
}
|
||||
mutation_partition(const schema& s)
|
||||
: _rows(use_single_row_storage(s) ? rows_storage_type(std::optional<deletable_row>{}) : rows_storage_type(rows_type{}))
|
||||
: _rows()
|
||||
, _row_tombstones(s)
|
||||
#ifdef SEASTAR_DEBUG
|
||||
, _schema_version(s.version())
|
||||
#endif
|
||||
{ }
|
||||
mutation_partition(mutation_partition& other, copy_comparators_only)
|
||||
: _rows(other._rows.index() == 0 ? rows_storage_type(rows_type{}) : rows_storage_type(std::optional<deletable_row>{}))
|
||||
: _rows()
|
||||
, _row_tombstones(other._row_tombstones, range_tombstone_list::copy_comparator_only())
|
||||
#ifdef SEASTAR_DEBUG
|
||||
, _schema_version(other._schema_version)
|
||||
@@ -1301,8 +1269,6 @@ public:
|
||||
mutation_partition(const mutation_partition&, const schema&, query::clustering_key_filter_ranges);
|
||||
mutation_partition(mutation_partition&&, const schema&, query::clustering_key_filter_ranges);
|
||||
~mutation_partition();
|
||||
// Returns the mutation_partition containing the given rows_type.
|
||||
// Can only be used when the mutation_partition uses multi-row storage.
|
||||
static mutation_partition& container_of(rows_type&);
|
||||
mutation_partition& operator=(mutation_partition&& x) noexcept;
|
||||
bool equal(const schema&, const mutation_partition&) const;
|
||||
@@ -1496,31 +1462,9 @@ public:
|
||||
const lazy_row& static_row() const { return _static_row; }
|
||||
|
||||
// return a set of rows_entry where each entry represents a CQL row sharing the same clustering key.
|
||||
// For single-row storage (clustering_key_size() == 0), returns an empty container.
|
||||
// Callers should check uses_single_row_storage() and use get_single_row() for single-row case.
|
||||
const rows_type& clustered_rows() const noexcept {
|
||||
if (uses_single_row_storage()) {
|
||||
static const rows_type empty_rows;
|
||||
return empty_rows;
|
||||
}
|
||||
return get_rows_storage();
|
||||
}
|
||||
utils::immutable_collection<rows_type> clustered_rows() noexcept {
|
||||
return const_cast<const mutation_partition*>(this)->clustered_rows();
|
||||
}
|
||||
rows_type& mutable_clustered_rows() noexcept {
|
||||
// Should only be called when NOT using single-row storage
|
||||
return get_rows_storage();
|
||||
}
|
||||
|
||||
// Access the single row when using single-row storage (clustering_key_size() == 0)
|
||||
const std::optional<deletable_row>& get_single_row() const {
|
||||
return get_single_row_storage();
|
||||
}
|
||||
|
||||
std::optional<deletable_row>& get_single_row() {
|
||||
return get_single_row_storage();
|
||||
}
|
||||
const rows_type& clustered_rows() const noexcept { return _rows; }
|
||||
utils::immutable_collection<rows_type> clustered_rows() noexcept { return _rows; }
|
||||
rows_type& mutable_clustered_rows() noexcept { return _rows; }
|
||||
|
||||
const range_tombstone_list& row_tombstones() const noexcept { return _row_tombstones; }
|
||||
utils::immutable_collection<range_tombstone_list> row_tombstones() noexcept { return _row_tombstones; }
|
||||
@@ -1538,14 +1482,8 @@ public:
|
||||
rows_type::iterator upper_bound(const schema& schema, const query::clustering_range& r);
|
||||
std::ranges::subrange<rows_type::iterator> range(const schema& schema, const query::clustering_range& r);
|
||||
// Returns an iterator range of rows_entry, with only non-dummy entries.
|
||||
// For single-row storage, returns an empty range.
|
||||
auto non_dummy_rows() const {
|
||||
if (uses_single_row_storage()) {
|
||||
static const rows_type empty_rows;
|
||||
return std::ranges::subrange(empty_rows.begin(), empty_rows.end())
|
||||
| std::views::filter([] (const rows_entry& e) { return bool(!e.dummy()); });
|
||||
}
|
||||
return std::ranges::subrange(get_rows_storage().begin(), get_rows_storage().end())
|
||||
return std::ranges::subrange(_rows.begin(), _rows.end())
|
||||
| std::views::filter([] (const rows_entry& e) { return bool(!e.dummy()); });
|
||||
}
|
||||
void accept(const schema&, mutation_partition_visitor&) const;
|
||||
@@ -1579,21 +1517,7 @@ private:
|
||||
|
||||
inline
|
||||
mutation_partition& mutation_partition::container_of(rows_type& rows) {
|
||||
// This method can only be called when using multi-row storage (rows_type variant alternative).
|
||||
// With std::variant, when rows_type is the active alternative (index 0), it's stored at the beginning of the variant.
|
||||
// We can use pointer arithmetic to get back to the mutation_partition.
|
||||
|
||||
// Calculate offset from rows_type to the containing variant
|
||||
// The rows reference should be the active rows_type inside the variant
|
||||
static_assert(std::is_same_v<std::variant_alternative_t<0, rows_storage_type>, rows_type>,
|
||||
"rows_type must be the first alternative in rows_storage_type");
|
||||
|
||||
// Get address of the variant containing this rows_type
|
||||
// When rows_type is active (index 0), it's at offset 0 in the variant's storage
|
||||
rows_storage_type* variant_ptr = reinterpret_cast<rows_storage_type*>(&rows);
|
||||
|
||||
// Now get the mutation_partition from the variant
|
||||
return *boost::intrusive::get_parent_from_member(variant_ptr, &mutation_partition::_rows);
|
||||
return *boost::intrusive::get_parent_from_member(&rows, &mutation_partition::_rows);
|
||||
}
|
||||
|
||||
bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tombstone tomb = tombstone(),
|
||||
|
||||
@@ -56,16 +56,33 @@ static tasks::task_manager::task_state get_state(const db::system_keyspace::topo
|
||||
}
|
||||
}
|
||||
|
||||
static future<db::system_keyspace::topology_requests_entries> get_entries(db::system_keyspace& sys_ks, std::chrono::seconds ttl) {
|
||||
return sys_ks.get_node_ops_request_entries(db_clock::now() - ttl);
|
||||
static std::set<tasks::task_id> get_pending_ids(service::topology& topology) {
|
||||
std::set<tasks::task_id> ids;
|
||||
for (auto& request : topology.requests) {
|
||||
ids.emplace(topology.find(request.first)->second.request_id);
|
||||
}
|
||||
return ids;
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry_opt = co_await _ss._sys_ks.local().get_topology_request_entry_opt(id.uuid());
|
||||
if (!entry_opt) {
|
||||
static future<db::system_keyspace::topology_requests_entries> get_entries(db::system_keyspace& sys_ks, service::topology& topology, std::chrono::seconds ttl) {
|
||||
// Started requests.
|
||||
auto entries = co_await sys_ks.get_node_ops_request_entries(db_clock::now() - ttl);
|
||||
|
||||
// Pending requests.
|
||||
for (auto& id : get_pending_ids(topology)) {
|
||||
entries.try_emplace(id.uuid(), db::system_keyspace::topology_requests_entry{});
|
||||
}
|
||||
|
||||
co_return entries;
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status_helper(tasks::task_id id, tasks::virtual_task_hint hint) const {
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry(id.uuid(), false);
|
||||
auto started = entry.id;
|
||||
service::topology& topology = _ss._topology_state_machine._topology;
|
||||
if (!started && !get_pending_ids(topology).contains(id)) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
auto& entry = *entry_opt;
|
||||
co_return tasks::task_status{
|
||||
.task_id = id,
|
||||
.type = request_type_to_task_type(entry.request_type),
|
||||
@@ -84,7 +101,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
|
||||
.entity = "",
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
|
||||
.children = started ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{}
|
||||
};
|
||||
}
|
||||
|
||||
@@ -106,22 +123,26 @@ future<std::optional<tasks::virtual_task_hint>> node_ops_virtual_task::contains(
|
||||
}
|
||||
}
|
||||
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(task_id.uuid());
|
||||
co_return entry && std::holds_alternative<service::topology_request>(entry->request_type) ? empty_hint : std::nullopt;
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry(task_id.uuid(), false);
|
||||
co_return bool(entry.id) && std::holds_alternative<service::topology_request>(entry.request_type) ? empty_hint : std::nullopt;
|
||||
}
|
||||
|
||||
future<tasks::is_abortable> node_ops_virtual_task::is_abortable(tasks::virtual_task_hint) const {
|
||||
return make_ready_future<tasks::is_abortable>(tasks::is_abortable::no);
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
return get_status_helper(id, std::move(hint));
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::wait(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry = co_await get_status(id, hint);
|
||||
auto entry = co_await get_status_helper(id, hint);
|
||||
if (!entry) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
co_await _ss.wait_for_topology_request_completion(id.uuid(), false);
|
||||
co_return co_await get_status(id, std::move(hint));
|
||||
co_return co_await get_status_helper(id, std::move(hint));
|
||||
}
|
||||
|
||||
future<> node_ops_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hint) noexcept {
|
||||
@@ -130,7 +151,8 @@ future<> node_ops_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hin
|
||||
|
||||
future<std::vector<tasks::task_stats>> node_ops_virtual_task::get_stats() {
|
||||
db::system_keyspace& sys_ks = _ss._sys_ks.local();
|
||||
co_return std::ranges::to<std::vector<tasks::task_stats>>(co_await get_entries(sys_ks, get_task_manager().get_user_task_ttl())
|
||||
service::topology& topology = _ss._topology_state_machine._topology;
|
||||
co_return std::ranges::to<std::vector<tasks::task_stats>>(co_await get_entries(sys_ks, topology, get_task_manager().get_user_task_ttl())
|
||||
| std::views::transform([] (const auto& e) {
|
||||
auto id = e.first;
|
||||
auto& entry = e.second;
|
||||
|
||||
@@ -39,6 +39,8 @@ public:
|
||||
virtual future<std::optional<tasks::task_status>> wait(tasks::task_id id, tasks::virtual_task_hint hint) override;
|
||||
virtual future<> abort(tasks::task_id id, tasks::virtual_task_hint hint) noexcept override;
|
||||
virtual future<std::vector<tasks::task_stats>> get_stats() override;
|
||||
private:
|
||||
future<std::optional<tasks::task_status>> get_status_helper(tasks::task_id id, tasks::virtual_task_hint hint) const;
|
||||
};
|
||||
|
||||
class streaming_task_impl : public tasks::task_manager::task::impl {
|
||||
|
||||
@@ -176,7 +176,7 @@ void fsm::become_leader() {
|
||||
|
||||
_last_election_time = _clock.now();
|
||||
_ping_leader = false;
|
||||
// a new leader needs to commit at least one entry to make sure that
|
||||
// a new leader needs to commit at lease one entry to make sure that
|
||||
// all existing entries in its log are committed as well. Also it should
|
||||
// send append entries RPC as soon as possible to establish its leadership
|
||||
// (3.4). Do both of those by committing a dummy entry.
|
||||
|
||||
@@ -1195,8 +1195,6 @@ private:
|
||||
rlogger.info("{}", msg);
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
|
||||
co_await utils::get_local_injector().inject("incremental_repair_prepare_wait", utils::wait_for_message(60s));
|
||||
auto reenablers_and_holders = co_await table.get_compaction_reenablers_and_lock_holders_for_repair(_db.local(), _frozen_topology_guard, _range);
|
||||
for (auto& lock_holder : reenablers_and_holders.lock_holders) {
|
||||
_rs._repair_compaction_locks[_frozen_topology_guard].push_back(std::move(lock_holder));
|
||||
|
||||
@@ -84,10 +84,6 @@ class compaction_group {
|
||||
seastar::named_gate _async_gate;
|
||||
// Gates flushes.
|
||||
seastar::named_gate _flush_gate;
|
||||
// Gates sstable being added to the group.
|
||||
// This prevents the group from being considered empty when sstables are being added.
|
||||
// Crucial for tablet split which ACKs split for a table when all pre-split groups are empty.
|
||||
seastar::named_gate _sstable_add_gate;
|
||||
bool _tombstone_gc_enabled = true;
|
||||
std::optional<compaction::compaction_backlog_tracker> _backlog_tracker;
|
||||
repair_classifier_func _repair_sstable_classifier;
|
||||
@@ -252,10 +248,6 @@ public:
|
||||
return _flush_gate;
|
||||
}
|
||||
|
||||
seastar::named_gate& sstable_add_gate() noexcept {
|
||||
return _sstable_add_gate;
|
||||
}
|
||||
|
||||
compaction::compaction_manager& get_compaction_manager() noexcept;
|
||||
const compaction::compaction_manager& get_compaction_manager() const noexcept;
|
||||
|
||||
@@ -442,7 +434,7 @@ public:
|
||||
virtual bool all_storage_groups_split() = 0;
|
||||
virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
|
||||
virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
|
||||
virtual future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) = 0;
|
||||
virtual future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) = 0;
|
||||
virtual dht::token_range get_token_range_after_split(const dht::token&) const noexcept = 0;
|
||||
|
||||
virtual lw_shared_ptr<sstables::sstable_set> make_sstable_set() const = 0;
|
||||
|
||||
@@ -2793,7 +2793,6 @@ future<> database::flush_all_tables() {
|
||||
});
|
||||
_all_tables_flushed_at = db_clock::now();
|
||||
co_await _commitlog->wait_for_pending_deletes();
|
||||
dblog.info("Forcing new commitlog segment and flushing all tables complete");
|
||||
}
|
||||
|
||||
future<db_clock::time_point> database::get_all_tables_flushed_at(sharded<database>& sharded_db) {
|
||||
|
||||
@@ -604,28 +604,9 @@ public:
|
||||
|
||||
data_dictionary::table as_data_dictionary() const;
|
||||
|
||||
// The usage of these functions are restricted to preexisting sstables that aren't being
|
||||
// moved anywhere, so should never be used in the context of file streaming and intra
|
||||
// node migration. The only user today is distributed loader, which populates the
|
||||
// sstables for each column family on boot.
|
||||
future<> add_sstable_and_update_cache(sstables::shared_sstable sst,
|
||||
sstables::offstrategy offstrategy = sstables::offstrategy::no);
|
||||
future<> add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>& ssts);
|
||||
|
||||
// Restricted to new sstables produced by external processes such as repair.
|
||||
// The sstable might undergo split if table is in split mode.
|
||||
// If no need for split, the input sstable will only be attached to the sstable set.
|
||||
// If split happens, the output sstables will be attached and the input sstable unlinked.
|
||||
// On failure, the input sstable is unlinked and exception propagated to the caller.
|
||||
// The on_add callback will be called on all sstables to be added into the set.
|
||||
[[nodiscard]] future<std::vector<sstables::shared_sstable>>
|
||||
add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
std::function<future<>(sstables::shared_sstable)> on_add,
|
||||
sstables::offstrategy offstrategy = sstables::offstrategy::no);
|
||||
[[nodiscard]] future<std::vector<sstables::shared_sstable>>
|
||||
add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> new_ssts,
|
||||
std::function<future<>(sstables::shared_sstable)> on_add);
|
||||
|
||||
future<> move_sstables_from_staging(std::vector<sstables::shared_sstable>);
|
||||
sstables::shared_sstable make_sstable();
|
||||
void set_truncation_time(db_clock::time_point truncated_at) noexcept {
|
||||
@@ -743,9 +724,7 @@ private:
|
||||
return _config.enable_cache && _schema->caching_options().enabled();
|
||||
}
|
||||
void update_stats_for_new_sstable(const sstables::shared_sstable& sst) noexcept;
|
||||
// This function can throw even if the sstable was added into the set. When the sstable was successfully
|
||||
// added, the sstable ptr @sst will be set to nullptr. Allowing caller to optionally discard the sstable.
|
||||
future<> do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable& sst, sstables::offstrategy, bool trigger_compaction);
|
||||
future<> do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable sst, sstables::offstrategy, bool trigger_compaction);
|
||||
future<> do_add_sstable_and_update_cache(sstables::shared_sstable sst, sstables::offstrategy offstrategy, bool trigger_compaction);
|
||||
// Helpers which add sstable on behalf of a compaction group and refreshes compound set.
|
||||
void add_sstable(compaction_group& cg, sstables::shared_sstable sstable);
|
||||
@@ -1379,8 +1358,7 @@ public:
|
||||
|
||||
// Clones storage of a given tablet. Memtable is flushed first to guarantee that the
|
||||
// snapshot (list of sstables) will include all the data written up to the time it was taken.
|
||||
// If leave_unsealead is set, all the destination sstables will be left unsealed.
|
||||
future<utils::chunked_vector<sstables::entry_descriptor>> clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed);
|
||||
future<utils::chunked_vector<sstables::entry_descriptor>> clone_tablet_storage(locator::tablet_id tid);
|
||||
|
||||
friend class compaction_group;
|
||||
friend class compaction::compaction_task_impl;
|
||||
|
||||
184
replica/table.cc
184
replica/table.cc
@@ -721,7 +721,7 @@ public:
|
||||
bool all_storage_groups_split() override { return true; }
|
||||
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override { return make_ready_future(); }
|
||||
future<> maybe_split_compaction_group_of(size_t idx) override { return make_ready_future(); }
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) override {
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) override {
|
||||
return make_ready_future<std::vector<sstables::shared_sstable>>(std::vector<sstables::shared_sstable>{sst});
|
||||
}
|
||||
dht::token_range get_token_range_after_split(const dht::token&) const noexcept override { return dht::token_range(); }
|
||||
@@ -879,7 +879,7 @@ public:
|
||||
bool all_storage_groups_split() override;
|
||||
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
|
||||
future<> maybe_split_compaction_group_of(size_t idx) override;
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) override;
|
||||
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) override;
|
||||
dht::token_range get_token_range_after_split(const dht::token& token) const noexcept override {
|
||||
return tablet_map().get_token_range_after_split(token);
|
||||
}
|
||||
@@ -1130,8 +1130,7 @@ future<> tablet_storage_group_manager::maybe_split_compaction_group_of(size_t id
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
tablet_storage_group_manager::maybe_split_new_sstable(const sstables::shared_sstable& sst) {
|
||||
co_await utils::get_local_injector().inject("maybe_split_new_sstable_wait", utils::wait_for_message(120s));
|
||||
tablet_storage_group_manager::maybe_split_sstable(const sstables::shared_sstable& sst) {
|
||||
if (!tablet_map().needs_split()) {
|
||||
co_return std::vector<sstables::shared_sstable>{sst};
|
||||
}
|
||||
@@ -1139,7 +1138,8 @@ tablet_storage_group_manager::maybe_split_new_sstable(const sstables::shared_sst
|
||||
auto& cg = compaction_group_for_sstable(sst);
|
||||
auto holder = cg.async_gate().hold();
|
||||
auto& view = cg.view_for_sstable(sst);
|
||||
co_return co_await _t.get_compaction_manager().maybe_split_new_sstable(sst, view, co_await split_compaction_options());
|
||||
auto lock_holder = co_await _t.get_compaction_manager().get_incremental_repair_read_lock(view, "maybe_split_sstable");
|
||||
co_return co_await _t.get_compaction_manager().maybe_split_sstable(sst, view, co_await split_compaction_options());
|
||||
}
|
||||
|
||||
future<> table::maybe_split_compaction_group_of(locator::tablet_id tablet_id) {
|
||||
@@ -1149,7 +1149,7 @@ future<> table::maybe_split_compaction_group_of(locator::tablet_id tablet_id) {
|
||||
|
||||
future<std::vector<sstables::shared_sstable>> table::maybe_split_new_sstable(const sstables::shared_sstable& sst) {
|
||||
auto holder = async_gate().hold();
|
||||
co_return co_await _sg_manager->maybe_split_new_sstable(sst);
|
||||
co_return co_await _sg_manager->maybe_split_sstable(sst);
|
||||
}
|
||||
|
||||
dht::token_range table::get_token_range_after_split(const dht::token& token) const noexcept {
|
||||
@@ -1330,7 +1330,7 @@ future<utils::chunked_vector<sstables::shared_sstable>> table::take_sstable_set_
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<sstables::entry_descriptor>>
|
||||
table::clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed) {
|
||||
table::clone_tablet_storage(locator::tablet_id tid) {
|
||||
utils::chunked_vector<sstables::entry_descriptor> ret;
|
||||
auto holder = async_gate().hold();
|
||||
|
||||
@@ -1342,7 +1342,7 @@ table::clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed) {
|
||||
// by compaction while we are waiting for the lock.
|
||||
auto deletion_guard = co_await get_sstable_list_permit();
|
||||
co_await sg.make_sstable_set()->for_each_sstable_gently([&] (const sstables::shared_sstable& sst) -> future<> {
|
||||
ret.push_back(co_await sst->clone(calculate_generation_for_new_table(), leave_unsealed));
|
||||
ret.push_back(co_await sst->clone(calculate_generation_for_new_table()));
|
||||
});
|
||||
co_return ret;
|
||||
}
|
||||
@@ -1354,10 +1354,10 @@ void table::update_stats_for_new_sstable(const sstables::shared_sstable& sst) no
|
||||
}
|
||||
|
||||
future<>
|
||||
table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable& sst, sstables::offstrategy offstrategy,
|
||||
table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable sst, sstables::offstrategy offstrategy,
|
||||
bool trigger_compaction) {
|
||||
auto permit = co_await seastar::get_units(_sstable_set_mutation_sem, 1);
|
||||
co_return co_await get_row_cache().invalidate(row_cache::external_updater([&] () mutable noexcept {
|
||||
co_return co_await get_row_cache().invalidate(row_cache::external_updater([&] () noexcept {
|
||||
// FIXME: this is not really noexcept, but we need to provide strong exception guarantees.
|
||||
// atomically load all opened sstables into column family.
|
||||
if (!offstrategy) {
|
||||
@@ -1369,8 +1369,6 @@ table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_ss
|
||||
if (trigger_compaction) {
|
||||
try_trigger_compaction(cg);
|
||||
}
|
||||
// Reseting sstable ptr to inform the caller the sstable has been loaded successfully.
|
||||
sst = nullptr;
|
||||
}), dht::partition_range::make({sst->get_first_decorated_key(), true}, {sst->get_last_decorated_key(), true}), [sst, schema = _schema] (const dht::decorated_key& key) {
|
||||
return sst->filter_has_key(sstables::key::from_partition_key(*schema, key.key()));
|
||||
});
|
||||
@@ -1378,10 +1376,12 @@ table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_ss
|
||||
|
||||
future<>
|
||||
table::do_add_sstable_and_update_cache(sstables::shared_sstable new_sst, sstables::offstrategy offstrategy, bool trigger_compaction) {
|
||||
auto& cg = compaction_group_for_sstable(new_sst);
|
||||
// Hold gate to make share compaction group is alive.
|
||||
auto holder = cg.async_gate().hold();
|
||||
co_await do_add_sstable_and_update_cache(cg, new_sst, offstrategy, trigger_compaction);
|
||||
for (auto sst : co_await maybe_split_new_sstable(new_sst)) {
|
||||
auto& cg = compaction_group_for_sstable(sst);
|
||||
// Hold gate to make share compaction group is alive.
|
||||
auto holder = cg.async_gate().hold();
|
||||
co_await do_add_sstable_and_update_cache(cg, std::move(sst), offstrategy, trigger_compaction);
|
||||
}
|
||||
}
|
||||
|
||||
future<>
|
||||
@@ -1399,85 +1399,6 @@ table::add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>
|
||||
trigger_compaction();
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
std::function<future<>(sstables::shared_sstable)> on_add,
|
||||
sstables::offstrategy offstrategy) {
|
||||
std::vector<sstables::shared_sstable> ret, ssts;
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
bool trigger_compaction = offstrategy == sstables::offstrategy::no;
|
||||
auto& cg = compaction_group_for_sstable(new_sst);
|
||||
// This prevents compaction group from being considered empty until the holder is released.
|
||||
// Helpful for tablet split, where split is acked for a table when all pre-split groups are empty.
|
||||
auto sstable_add_holder = cg.sstable_add_gate().hold();
|
||||
|
||||
ret = ssts = co_await maybe_split_new_sstable(new_sst);
|
||||
// on sucessful split, input sstable is unlinked.
|
||||
new_sst = nullptr;
|
||||
for (auto& sst : ssts) {
|
||||
auto& cg = compaction_group_for_sstable(sst);
|
||||
// Hold gate to make sure compaction group is alive.
|
||||
auto holder = cg.async_gate().hold();
|
||||
co_await on_add(sst);
|
||||
// If do_add_sstable_and_update_cache() throws after sstable has been loaded, the pointer
|
||||
// sst passed by reference will be set to nullptr, so it won't be unlinked in the exception
|
||||
// handler below.
|
||||
co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
|
||||
sst = nullptr;
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (ex) {
|
||||
// on failed split, input sstable is unlinked here.
|
||||
if (new_sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
co_await new_sst->unlink();
|
||||
}
|
||||
// on failure after sucessful split, sstables not attached yet will be unlinked
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
co_return std::move(ret);
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> new_ssts,
|
||||
std::function<future<>(sstables::shared_sstable)> on_add) {
|
||||
std::exception_ptr ex;
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
|
||||
// We rely on add_new_sstable_and_update_cache() to unlink the sstable feeded into it,
|
||||
// so the exception handling below will only have to unlink sstables not processed yet.
|
||||
try {
|
||||
for (auto& sst: new_ssts) {
|
||||
auto ssts = co_await add_new_sstable_and_update_cache(std::exchange(sst, nullptr), on_add);
|
||||
std::ranges::move(ssts, std::back_inserter(ret));
|
||||
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (ex) {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
co_return std::move(ret);
|
||||
}
|
||||
|
||||
future<>
|
||||
table::update_cache(compaction_group& cg, lw_shared_ptr<memtable> m, std::vector<sstables::shared_sstable> ssts) {
|
||||
auto permit = co_await seastar::get_units(_sstable_set_mutation_sem, 1);
|
||||
@@ -2691,8 +2612,8 @@ public:
|
||||
sstables::sstables_manager& get_sstables_manager() noexcept override {
|
||||
return _t.get_sstables_manager();
|
||||
}
|
||||
sstables::shared_sstable make_sstable(sstables::sstable_state state) const override {
|
||||
return _t.make_sstable(state);
|
||||
sstables::shared_sstable make_sstable() const override {
|
||||
return _t.make_sstable();
|
||||
}
|
||||
sstables::sstable_writer_config configure_writer(sstring origin) const override {
|
||||
auto cfg = _t.get_sstables_manager().configure_writer(std::move(origin));
|
||||
@@ -2810,7 +2731,6 @@ future<> compaction_group::stop(sstring reason) noexcept {
|
||||
auto flush_future = co_await seastar::coroutine::as_future(flush());
|
||||
|
||||
co_await _flush_gate.close();
|
||||
co_await _sstable_add_gate.close();
|
||||
// FIXME: indentation
|
||||
_compaction_disabler_for_views.clear();
|
||||
co_await utils::get_local_injector().inject("compaction_group_stop_wait", utils::wait_for_message(60s));
|
||||
@@ -2824,7 +2744,7 @@ future<> compaction_group::stop(sstring reason) noexcept {
|
||||
}
|
||||
|
||||
bool compaction_group::empty() const noexcept {
|
||||
return _memtables->empty() && live_sstable_count() == 0 && _sstable_add_gate.get_count() == 0;
|
||||
return _memtables->empty() && live_sstable_count() == 0;
|
||||
}
|
||||
|
||||
const schema_ptr& compaction_group::schema() const {
|
||||
@@ -3280,7 +3200,7 @@ db::replay_position table::highest_flushed_replay_position() const {
|
||||
}
|
||||
|
||||
struct manifest_json : public json::json_base {
|
||||
json::json_chunked_list<std::string_view> files;
|
||||
json::json_chunked_list<sstring> files;
|
||||
|
||||
manifest_json() {
|
||||
register_params();
|
||||
@@ -3304,7 +3224,7 @@ table::seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets)
|
||||
manifest_json manifest;
|
||||
for (const auto& fsp : file_sets) {
|
||||
for (auto& rf : *fsp) {
|
||||
manifest.files.push(std::string_view(rf));
|
||||
manifest.files.push(std::move(rf));
|
||||
}
|
||||
}
|
||||
auto streamer = json::stream_object(std::move(manifest));
|
||||
@@ -3465,15 +3385,16 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
continue;
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
|
||||
while (auto de = lister.get().get()) {
|
||||
auto snapshot_name = de->name;
|
||||
lister::scan_dir(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>(), [datadir, &all_snapshots] (fs::path snapshots_dir, directory_entry de) {
|
||||
auto snapshot_name = de.name;
|
||||
all_snapshots.emplace(snapshot_name, snapshot_details());
|
||||
auto details = get_snapshot_details(snapshots_dir / fs::path(snapshot_name), datadir).get();
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
}
|
||||
return get_snapshot_details(snapshots_dir / fs::path(snapshot_name), datadir).then([&all_snapshots, snapshot_name] (auto details) {
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}).get();
|
||||
}
|
||||
return all_snapshots;
|
||||
});
|
||||
@@ -3481,61 +3402,38 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
|
||||
future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_dir, fs::path datadir) {
|
||||
table::snapshot_details details{};
|
||||
std::optional<fs::path> staging_dir = snapshot_dir / sstables::staging_dir;
|
||||
if (!co_await file_exists(staging_dir->native())) {
|
||||
staging_dir.reset();
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
// FIXME: optimize stat calls by keeping the base directory open and use statat instead, here and below.
|
||||
// See https://github.com/scylladb/seastar/pull/3163
|
||||
auto sd = co_await io_check(file_stat, (snapshot_dir / name).native(), follow_symlink::no);
|
||||
co_await lister::scan_dir(snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>(), [datadir, &details] (fs::path snapshot_dir, directory_entry de) -> future<> {
|
||||
auto sd = co_await io_check(file_stat, (snapshot_dir / de.name).native(), follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
|
||||
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
if (de.name != "manifest.json" && de.name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
continue;
|
||||
}
|
||||
// If the number of linkes is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
size = 0;
|
||||
}
|
||||
|
||||
auto exists_in_dir = [&] (fs::path path) -> future<bool> {
|
||||
try {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, path.native(), follow_symlink::no);
|
||||
auto psd = co_await io_check(file_stat, (datadir / de.name).native(), follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(datadir / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
co_return false;
|
||||
(datadir / de.name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / de.name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
details.live += size;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(*staging_dir / name)) &&
|
||||
!co_await exists_in_dir(datadir / name)) {
|
||||
details.live += size;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
co_return details;
|
||||
}
|
||||
|
||||
@@ -593,7 +593,7 @@ private:
|
||||
v3_columns _v3_columns;
|
||||
mutable schema_registry_entry* _registry_entry = nullptr;
|
||||
std::unique_ptr<::view_info> _view_info;
|
||||
mutable schema_ptr _cdc_schema;
|
||||
schema_ptr _cdc_schema;
|
||||
|
||||
const std::array<column_count_type, 3> _offsets;
|
||||
|
||||
@@ -957,7 +957,6 @@ public:
|
||||
friend bool operator==(const schema&, const schema&);
|
||||
const column_mapping& get_column_mapping() const;
|
||||
friend class schema_registry_entry;
|
||||
friend class schema_registry;
|
||||
// May be called from different shard
|
||||
schema_registry_entry* registry_entry() const noexcept;
|
||||
// Returns true iff this schema version was synced with on current node.
|
||||
|
||||
@@ -78,8 +78,10 @@ void schema_registry::attach_table(schema_registry_entry& e) noexcept {
|
||||
}
|
||||
|
||||
schema_ptr schema_registry::learn(schema_ptr s) {
|
||||
auto learned_cdc_schema = s->cdc_schema() ? learn(s->cdc_schema()) : nullptr;
|
||||
s->_cdc_schema = learned_cdc_schema;
|
||||
auto learned_cdc_schema = s->cdc_schema() ? local_schema_registry().learn(s->cdc_schema()) : nullptr;
|
||||
if (learned_cdc_schema != s->cdc_schema()) {
|
||||
s = s->make_with_cdc(learned_cdc_schema);
|
||||
}
|
||||
if (s->registry_entry()) {
|
||||
return s;
|
||||
}
|
||||
@@ -90,9 +92,7 @@ schema_ptr schema_registry::learn(schema_ptr s) {
|
||||
e.load(s);
|
||||
attach_table(e);
|
||||
}
|
||||
auto loaded_s = e.get_schema();
|
||||
loaded_s->_cdc_schema = learned_cdc_schema;
|
||||
return loaded_s;
|
||||
return e.get_schema();
|
||||
}
|
||||
slogger.debug("Learning about version {} of {}.{}", s->version(), s->ks_name(), s->cf_name());
|
||||
auto e_ptr = make_lw_shared<schema_registry_entry>(s->version(), *this);
|
||||
|
||||
@@ -390,11 +390,9 @@ dark_green = (195, 215, 195)
|
||||
light_red = (255, 200, 200)
|
||||
light_green = (200, 255, 200)
|
||||
light_gray = (240, 240, 240)
|
||||
scylla_blue = (87, 209, 229)
|
||||
|
||||
tablet_colors = {
|
||||
(Tablet.STATE_NORMAL, None): GRAY,
|
||||
(Tablet.STATE_NORMAL, 'repair'): scylla_blue,
|
||||
(Tablet.STATE_JOINING, 'allow_write_both_read_old'): dark_green,
|
||||
(Tablet.STATE_LEAVING, 'allow_write_both_read_old'): dark_red,
|
||||
(Tablet.STATE_JOINING, 'write_both_read_old'): dark_green,
|
||||
@@ -534,8 +532,6 @@ def update_from_cql(initial=False):
|
||||
state = (Tablet.STATE_JOINING, tablet.stage)
|
||||
elif replica in leaving:
|
||||
state = (Tablet.STATE_LEAVING, tablet.stage)
|
||||
elif tablet.stage == 'repair':
|
||||
state = (Tablet.STATE_NORMAL, tablet.stage)
|
||||
else:
|
||||
state = (Tablet.STATE_NORMAL, None)
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ target_sources(service
|
||||
PRIVATE
|
||||
broadcast_tables/experimental/lang.cc
|
||||
client_state.cc
|
||||
client_routes.cc
|
||||
mapreduce_service.cc
|
||||
migration_manager.cc
|
||||
misc_services.cc
|
||||
|
||||
@@ -1,137 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "service/client_routes.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
#include "mutation/mutation.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
|
||||
static logging::logger crlogger("client_routes");
|
||||
|
||||
service::query_state& client_routes_query_state() {
|
||||
using namespace std::chrono_literals;
|
||||
const auto t = 10s;
|
||||
static timeout_config tc{ t, t, t, t, t, t, t };
|
||||
static thread_local service::client_state cs(service::client_state::internal_tag{}, tc);
|
||||
static thread_local service::query_state qs(cs, empty_service_permit());
|
||||
return qs;
|
||||
};
|
||||
|
||||
future<mutation> service::client_routes_service::make_remove_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_key& key) {
|
||||
static const sstring stmt = format("DELETE FROM {}.{} WHERE connection_id = ? and host_id = ?", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
|
||||
|
||||
auto muts = co_await _qp.get_mutations_internal(stmt, client_routes_query_state(), ts, {key.connection_id, key.host_id});
|
||||
if (muts.size() != 1) {
|
||||
on_internal_error(crlogger, fmt::format("expected 1 mutation got {}", muts.size()));
|
||||
}
|
||||
co_return std::move(muts[0]);
|
||||
}
|
||||
|
||||
future<mutation> service::client_routes_service::make_update_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_entry& route) {
|
||||
static const sstring stmt = format("INSERT INTO {}.{} (connection_id, host_id, address, port, tls_port, alternator_port, alternator_https_port) VALUES (?, ?, ?, ?, ?, ?, ?)", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
|
||||
|
||||
auto muts = co_await _qp.get_mutations_internal(stmt, client_routes_query_state(), ts, {
|
||||
route.connection_id,
|
||||
route.host_id,
|
||||
route.address,
|
||||
route.port,
|
||||
route.tls_port,
|
||||
route.alternator_port,
|
||||
route.alternator_https_port
|
||||
});
|
||||
if (muts.size() != 1) {
|
||||
on_internal_error(crlogger, fmt::format("expected 1 mutation got {}", muts.size()));
|
||||
}
|
||||
co_return std::move(muts[0]);
|
||||
}
|
||||
|
||||
future<std::vector<service::client_routes_service::client_route_entry>> service::client_routes_service::get_client_routes() const {
|
||||
std::vector<service::client_routes_service::client_route_entry> result;
|
||||
static const sstring query = format("SELECT * from {}.{}", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
|
||||
auto rs = co_await _qp.execute_internal(query, cql3::query_processor::cache_internal::yes);
|
||||
result.reserve(rs->size());
|
||||
for (const auto& row : *rs) {
|
||||
result.emplace_back(
|
||||
row.get_as<sstring>("connection_id"),
|
||||
row.get_as<utils::UUID>("host_id"),
|
||||
row.get_as<sstring>("address"),
|
||||
row.get_opt<int32_t>("port"),
|
||||
row.get_opt<int32_t>("tls_port"),
|
||||
row.get_opt<int32_t>("alternator_port"),
|
||||
row.get_opt<int32_t>("alternator_https_port")
|
||||
);
|
||||
}
|
||||
co_return result;
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::notify_client_routes_change(const client_route_keys& client_route_keys) {
|
||||
co_await container().invoke_on_all([&client_route_keys] (service::client_routes_service& client_routes) {
|
||||
return client_routes._lifecycle_notifier.notify_client_routes_change(client_route_keys);
|
||||
});
|
||||
}
|
||||
|
||||
// Performs one attempt at writing the given route entries through group0.
//
// Starts a group0 operation, builds one update mutation per entry using the
// guard's write timestamp, and submits all of them as a single
// write_mutations command described as "insert client routes".
//
// Exceptions from start_operation / add_entry propagate to the caller.
// with_retry() retries this function on group0_concurrent_modification.
seastar::future<> service::client_routes_service::set_client_routes_inner(const std::vector<service::client_routes_service::client_route_entry>& route_entries) {
    auto guard = co_await _group0_client.start_operation(_abort_source, service::raft_timeout{});

    utils::chunked_vector<canonical_mutation> cmuts;
    // The final size is known up front: one mutation per entry.
    cmuts.reserve(route_entries.size());

    for (const auto& entry : route_entries) {
        auto mut = co_await make_update_client_route_mutation(guard.write_timestamp(), entry);
        cmuts.emplace_back(std::move(mut));
    }

    auto cmd = _group0_client.prepare_command(service::write_mutations{std::move(cmuts)}, guard, "insert client routes");
    co_await _group0_client.add_entry(std::move(cmd), std::move(guard), _abort_source);
}
|
||||
|
||||
// Performs one attempt at removing the given route keys through group0.
//
// Starts a group0 operation, builds one removal mutation per key using the
// guard's write timestamp, and submits all of them as a single
// write_mutations command described as "delete client routes".
//
// Exceptions from start_operation / add_entry propagate to the caller.
// with_retry() retries this function on group0_concurrent_modification.
seastar::future<> service::client_routes_service::delete_client_routes_inner(const std::vector<service::client_routes_service::client_route_key>& route_keys) {
    auto guard = co_await _group0_client.start_operation(_abort_source, service::raft_timeout{});

    utils::chunked_vector<canonical_mutation> cmuts;
    // The final size is known up front: one mutation per key.
    cmuts.reserve(route_keys.size());

    for (const auto& route_key : route_keys) {
        auto mut = co_await make_remove_client_route_mutation(guard.write_timestamp(), route_key);
        cmuts.emplace_back(std::move(mut));
    }

    auto cmd = _group0_client.prepare_command(service::write_mutations{std::move(cmuts)}, guard, "delete client routes");
    co_await _group0_client.add_entry(std::move(cmd), std::move(guard), _abort_source);
}
|
||||
|
||||
// Public entry point for writing client routes.
// Forwards the entries to the instance on shard 0 and runs
// set_client_routes_inner there under with_retry().
seastar::future<> service::client_routes_service::set_client_routes(std::vector<service::client_routes_service::client_route_entry> route_entries) {
    auto run_on_shard0 = [entries = std::move(route_entries)] (service::client_routes_service& owner) mutable -> future<> {
        // The entries are moved into the retried callable so every attempt sees the same input.
        auto attempt = [&owner, entries = std::move(entries)] {
            return owner.set_client_routes_inner(entries);
        };
        return owner.with_retry(std::move(attempt));
    };
    return container().invoke_on(0, std::move(run_on_shard0));
}
|
||||
|
||||
// Public entry point for removing client routes.
// Forwards the keys to the instance on shard 0 and runs
// delete_client_routes_inner there under with_retry().
seastar::future<> service::client_routes_service::delete_client_routes(std::vector<service::client_routes_service::client_route_key> route_keys) {
    auto run_on_shard0 = [keys = std::move(route_keys)] (service::client_routes_service& owner) mutable -> future<> {
        // The keys are moved into the retried callable so every attempt sees the same input.
        auto attempt = [&owner, keys = std::move(keys)] {
            return owner.delete_client_routes_inner(keys);
        };
        return owner.with_retry(std::move(attempt));
    };
    return container().invoke_on(0, std::move(run_on_shard0));
}
|
||||
|
||||
template <typename Func>
|
||||
seastar::future<> service::client_routes_service::with_retry(Func func) const {
|
||||
int retries = 10;
|
||||
while (true) {
|
||||
try {
|
||||
co_await func();
|
||||
} catch (const ::service::group0_concurrent_modification&) {
|
||||
crlogger.warn("Failed to set client routes due to guard conflict, retries={}", retries);
|
||||
if (retries--) {
|
||||
continue;
|
||||
}
|
||||
throw;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1,88 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/sharded.hh>
|
||||
|
||||
#include "gms/feature_service.hh"
|
||||
#include "mutation/mutation.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
class endpoint_lifecycle_notifier;
|
||||
|
||||
// Sharded service that reads, writes and announces client route entries.
//
// It only keeps references to its collaborators (abort source, feature
// service, group0 client, query processor, lifecycle notifier); it does not
// own any of them.
class client_routes_service : public seastar::peering_sharded_service<client_routes_service> {
public:
    client_routes_service(
        abort_source& abort_source,
        gms::feature_service& feature_service,
        service::raft_group0_client& group0_client,
        cql3::query_processor& qp,
        endpoint_lifecycle_notifier& elc_notif
    )
        : _abort_source(abort_source)
        , _feature_service(feature_service)
        , _group0_client(group0_client)
        , _qp(qp)
        , _lifecycle_notifier(elc_notif) { }

    // Identifies a single route: a connection id paired with a host id.
    struct client_route_key {
        sstring connection_id;
        utils::UUID host_id;

        // Lexicographic order: connection_id first, host_id as the tie-breaker.
        bool operator<(const client_route_key& other) const {
            if (connection_id == other.connection_id) {
                return host_id < other.host_id;
            }
            return connection_id < other.connection_id;
        }
    };
    // Ordered set of route keys, used to describe which routes changed.
    using client_route_keys = std::set<client_route_key>;

    // Full route record: the key fields plus an address and optional ports.
    struct client_route_entry {
        sstring connection_id;
        utils::UUID host_id;
        sstring address;
        // At least one of the ports should be specified
        std::optional<int32_t> port;
        std::optional<int32_t> tls_port;
        std::optional<int32_t> alternator_port;
        std::optional<int32_t> alternator_https_port;
    };

    gms::feature_service& get_feature_service() noexcept {
        return _feature_service;
    }

    // Mutation builders and route accessors.
    future<mutation> make_remove_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_key& key);
    future<mutation> make_update_client_route_mutation(api::timestamp_type ts, const client_route_entry& entry);
    future<std::vector<client_route_entry>> get_client_routes() const;
    seastar::future<> set_client_routes(std::vector<service::client_routes_service::client_route_entry> route_entries);
    seastar::future<> delete_client_routes(std::vector<service::client_routes_service::client_route_key> route_keys);


    // Change notifications.
    seastar::future<> notify_client_routes_change(const client_route_keys& client_route_keys);
private:
    // Single-attempt implementations behind set_client_routes / delete_client_routes.
    seastar::future<> set_client_routes_inner(const std::vector<service::client_routes_service::client_route_entry>& route_entries);
    seastar::future<> delete_client_routes_inner(const std::vector<service::client_routes_service::client_route_key>& route_keys);
    // Re-runs func when it fails with a group0 concurrent-modification error.
    template <typename Func>
    seastar::future<> with_retry(Func func) const;

    abort_source& _abort_source;
    gms::feature_service& _feature_service;
    service::raft_group0_client& _group0_client;
    cql3::query_processor& _qp;
    endpoint_lifecycle_notifier& _lifecycle_notifier;
};
|
||||
|
||||
}
|
||||
@@ -224,13 +224,7 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
|
||||
ks + " can be granted only SELECT or DESCRIBE permissions to a non-superuser.");
|
||||
}
|
||||
|
||||
static const std::unordered_set<auth::resource> vector_search_system_resources = {
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
|
||||
};
|
||||
|
||||
if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||
|
||||
(cmd.permission == auth::permission::SELECT && vector_search_system_resources.contains(cmd.resource))) {
|
||||
if (cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) {
|
||||
|
||||
co_return co_await ensure_has_permission<auth::command_desc_with_permission_set>({auth::permission_set::of<auth::permission::SELECT, auth::permission::VECTOR_SEARCH_INDEXING>(), cmd.resource});
|
||||
|
||||
@@ -350,17 +344,3 @@ void service::client_state::update_per_service_level_params(qos::service_level_o
|
||||
|
||||
_workload_type = slo.workload;
|
||||
}
|
||||
|
||||
// Appends one (key, value) pair of cache entries to _client_options for every
// element of client_options.
//
// Both the option key and the option value are looked up in
// keys_and_values_cache with get_or_load; on a miss the loader supplies a
// default-constructed options_cache_value_type.
future<> service::client_state::set_client_options(
        client_options_cache_type& keys_and_values_cache,
        const std::unordered_map<sstring, sstring>& client_options) {
    // Shared loader: the same default value is used for keys and for values.
    auto load_default = [] (const client_options_cache_key_type&) {
        return make_ready_future<options_cache_value_type>(options_cache_value_type{});
    };

    for (const auto& option : client_options) {
        auto key_entry = co_await keys_and_values_cache.get_or_load(option.first, load_default);
        auto value_entry = co_await keys_and_values_cache.get_or_load(option.second, load_default);
        _client_options.emplace_back(std::move(key_entry), std::move(value_entry));
    }
}
|
||||
@@ -18,7 +18,6 @@
|
||||
#include "auth/authenticated_user.hh"
|
||||
#include "auth/authenticator.hh"
|
||||
#include "auth/permission.hh"
|
||||
#include "client_data.hh"
|
||||
|
||||
#include "transport/cql_protocol_extension.hh"
|
||||
#include "service/qos/service_level_controller.hh"
|
||||
@@ -103,8 +102,7 @@ private:
|
||||
private volatile String keyspace;
|
||||
#endif
|
||||
std::optional<auth::authenticated_user> _user;
|
||||
std::optional<client_options_cache_entry_type> _driver_name, _driver_version;
|
||||
std::list<client_option_key_value_cached_entry> _client_options;
|
||||
std::optional<sstring> _driver_name, _driver_version;
|
||||
|
||||
auth_state _auth_state = auth_state::UNINITIALIZED;
|
||||
bool _control_connection = false;
|
||||
@@ -153,33 +151,18 @@ public:
|
||||
return _control_connection = true;
|
||||
}
|
||||
|
||||
std::optional<client_options_cache_entry_type> get_driver_name() const {
|
||||
std::optional<sstring> get_driver_name() const {
|
||||
return _driver_name;
|
||||
}
|
||||
future<> set_driver_name(client_options_cache_type& keys_and_values_cache, const sstring& driver_name) {
|
||||
_driver_name = co_await keys_and_values_cache.get_or_load(driver_name, [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
void set_driver_name(sstring driver_name) {
|
||||
_driver_name = std::move(driver_name);
|
||||
}
|
||||
|
||||
const auto& get_client_options() const {
|
||||
return _client_options;
|
||||
}
|
||||
|
||||
future<> set_client_options(
|
||||
client_options_cache_type& keys_and_values_cache,
|
||||
const std::unordered_map<sstring, sstring>& client_options);
|
||||
|
||||
std::optional<client_options_cache_entry_type> get_driver_version() const {
|
||||
std::optional<sstring> get_driver_version() const {
|
||||
return _driver_version;
|
||||
}
|
||||
future<> set_driver_version(
|
||||
client_options_cache_type& keys_and_values_cache,
|
||||
const sstring& driver_version)
|
||||
{
|
||||
_driver_version = co_await keys_and_values_cache.get_or_load(driver_version, [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
void set_driver_version(sstring driver_version) {
|
||||
_driver_version = std::move(driver_version);
|
||||
}
|
||||
|
||||
client_state(external_tag,
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
#include "gms/inet_address.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "utils/atomic_vector.hh"
|
||||
#include "service/client_routes.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
@@ -66,7 +65,6 @@ public:
|
||||
* @param endpoint the endpoint marked DOWN.
|
||||
*/
|
||||
virtual void on_down(const gms::inet_address& endpoint, locator::host_id host_id) {}
|
||||
virtual void on_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {}
|
||||
};
|
||||
|
||||
class endpoint_lifecycle_notifier {
|
||||
@@ -81,8 +79,6 @@ public:
|
||||
future<> notify_released(locator::host_id host_id);
|
||||
future<> notify_up(gms::inet_address endpoint, locator::host_id host_id);
|
||||
future<> notify_joined(gms::inet_address endpoint, locator::host_id host_id);
|
||||
|
||||
future<> notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -163,11 +163,7 @@ public:
|
||||
void before_drop_column_family(const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
void before_drop_keyspace(const sstring& keyspace_name, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
|
||||
// Called when creating a tablet map for a new table.
|
||||
// When in the context of a notification callback, call `before_allocate_tablet_map_in_notification`,
|
||||
// and otherwise call 'before_allocate_tablet_map'.
|
||||
void before_allocate_tablet_map(const locator::tablet_map&, const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
void before_allocate_tablet_map_in_notification(const locator::tablet_map&, const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -648,13 +648,6 @@ void migration_notifier::before_allocate_tablet_map(const locator::tablet_map& m
|
||||
});
|
||||
}
|
||||
|
||||
// Calls on_before_allocate_tablet_map on the registered listeners, using
// _listeners.thread_for_each_nested to walk them.
void migration_notifier::before_allocate_tablet_map_in_notification(const locator::tablet_map& map,
        const schema& s, utils::chunked_vector<mutation>& mutations, api::timestamp_type ts) {
    auto forward_to_listener = [&map, &s, &mutations, ts] (migration_listener* l) {
        l->on_before_allocate_tablet_map(map, s, mutations, ts);
    };
    _listeners.thread_for_each_nested(std::move(forward_to_listener));
}
|
||||
|
||||
utils::chunked_vector<mutation> prepare_keyspace_update_announcement(replica::database& db, lw_shared_ptr<keyspace_metadata> ksm, api::timestamp_type ts) {
|
||||
db.validate_keyspace_update(*ksm);
|
||||
mlogger.info("Update Keyspace: {}", ksm);
|
||||
|
||||
@@ -640,16 +640,6 @@ future<scheduling_group> service_level_controller::auth_integration::get_user_sc
|
||||
}
|
||||
}
|
||||
|
||||
// Synchronous scheduling-group lookup that consults only the cached effective
// service level.
//
// - No user, or a user without a name: the default scheduling group.
// - Named user: the group of the cached service level's shares_name when
//   available, otherwise the group of default_service_level_name.
scheduling_group service_level_controller::auth_integration::get_user_cached_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
    if (!usr || !usr->name) {
        return _sl_controller.get_default_scheduling_group();
    }

    auto cached_slo = find_cached_effective_service_level(*usr->name);
    if (cached_slo && cached_slo->shares_name) {
        return _sl_controller.get_scheduling_group(*cached_slo->shares_name);
    }
    return _sl_controller.get_scheduling_group(default_service_level_name);
}
|
||||
|
||||
future<scheduling_group> service_level_controller::get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
|
||||
// Special case:
|
||||
// -------------
|
||||
@@ -666,11 +656,6 @@ future<scheduling_group> service_level_controller::get_user_scheduling_group(con
|
||||
return _auth_integration->get_user_scheduling_group(usr);
|
||||
}
|
||||
|
||||
// Cache-only scheduling-group lookup for a user; delegates to the auth
// integration, which must already be set (asserted).
scheduling_group service_level_controller::get_cached_user_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
    SCYLLA_ASSERT(_auth_integration != nullptr);
    auto& integration = *_auth_integration;
    return integration.get_user_cached_scheduling_group(usr);
}
|
||||
|
||||
std::optional<sstring> service_level_controller::get_active_service_level() {
|
||||
unsigned sched_idx = internal::scheduling_group_index(current_scheduling_group());
|
||||
if (_sl_lookup[sched_idx].first) {
|
||||
@@ -789,10 +774,6 @@ future<service_levels_info> service_level_controller::get_distributed_service_le
|
||||
return _sl_data_accessor ? _sl_data_accessor->get_service_level(service_level_name) : make_ready_future<service_levels_info>();
|
||||
}
|
||||
|
||||
bool service_level_controller::can_use_effective_service_level_cache() const{
|
||||
return _sl_data_accessor && _sl_data_accessor->can_use_effective_service_level_cache();
|
||||
}
|
||||
|
||||
future<bool> service_level_controller::validate_before_service_level_add() {
|
||||
assert(this_shard_id() == global_controller);
|
||||
if (_global_controller_db->deleted_scheduling_groups.size() > 0) {
|
||||
|
||||
@@ -154,10 +154,7 @@ public:
|
||||
/// Synchronous version of `find_effective_service_level` that only checks the cache.
|
||||
std::optional<service_level_options> find_cached_effective_service_level(const sstring& role_name);
|
||||
|
||||
/// Execute a function within the service level context of a user, get_user_scheduling_group - async version
|
||||
/// get_user_cached_scheduling_group - sync version (used for v2 servers).
|
||||
future<scheduling_group> get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
scheduling_group get_user_cached_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
|
||||
template <typename Func, typename Ret = std::invoke_result_t<Func>>
|
||||
requires std::invocable<Func>
|
||||
@@ -342,12 +339,6 @@ public:
|
||||
* @return if the user is authenticated the user's scheduling group. otherwise get_scheduling_group("default")
|
||||
*/
|
||||
future<scheduling_group> get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
/**
|
||||
* Get the scheduling group of a specific user for the service level cache
|
||||
* @param user - the user for determining the service level
|
||||
* @return if the user is authenticated the user's scheduling group. otherwise get_scheduling_group("default")
|
||||
*/
|
||||
scheduling_group get_cached_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
/**
|
||||
* @return the name of the currently active service level if such exists or an empty
|
||||
* optional if no active service level.
|
||||
@@ -409,13 +400,6 @@ public:
|
||||
future<service_levels_info> get_distributed_service_levels(qos::query_context ctx);
|
||||
future<service_levels_info> get_distributed_service_level(sstring service_level_name);
|
||||
|
||||
/*
|
||||
* Returns whether effective service level cache can be populated and used.
|
||||
* This is equivalent to checking whether auth + raft have been migrated to raft.
|
||||
*/
|
||||
bool can_use_effective_service_level_cache() const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the service level options **in effect** for a user having the given
|
||||
* collection of roles.
|
||||
|
||||
@@ -124,40 +124,8 @@ bool should_flush_system_topology_after_applying(const mutation& mut, const data
|
||||
return false;
|
||||
}
|
||||
|
||||
static void collect_client_routes_update(const mutation& mut, client_routes_service::client_route_keys& client_routes_update) {
|
||||
|
||||
auto s_client_routes = db::system_keyspace::client_routes();
|
||||
if (mut.column_family_id() != s_client_routes->id()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto pk_components = mut.decorated_key()._key.explode(*s_client_routes);
|
||||
if (pk_components.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto conn_uuid = value_cast<sstring>(utf8_type->deserialize_value(pk_components[0]));
|
||||
for (const rows_entry& re : mut.partition().clustered_rows()) {
|
||||
const auto ck_components = re.key().explode(*s_client_routes);
|
||||
if (ck_components.empty()) {
|
||||
continue;
|
||||
}
|
||||
auto host_uuid = value_cast<utils::UUID>(uuid_type->deserialize_value(ck_components[0]));
|
||||
client_routes_update.emplace(conn_uuid, host_uuid);
|
||||
}
|
||||
}
|
||||
|
||||
// Forwards a non-empty set of changed route keys to
// storage_service.notify_client_routes_change; does nothing for an empty set.
static future<> notify_client_route_change_if_needed(storage_service& storage_service, const client_routes_service::client_route_keys& client_routes_update) {
    if (client_routes_update.empty()) {
        co_return;
    }
    slogger.trace("write_mutations_to_database: notify_client_routes_change routes_update.size()={}", client_routes_update.size());
    co_await storage_service.notify_client_routes_change(client_routes_update);
}
|
||||
|
||||
future<> write_mutations_to_database(storage_service& storage_service, storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms) {
|
||||
future<> write_mutations_to_database(storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms) {
|
||||
utils::chunked_vector<frozen_mutation_and_schema> mutations;
|
||||
client_routes_service::client_route_keys client_routes_update;
|
||||
|
||||
mutations.reserve(cms.size());
|
||||
bool need_system_topology_flush = false;
|
||||
try {
|
||||
@@ -165,12 +133,7 @@ future<> write_mutations_to_database(storage_service& storage_service, storage_p
|
||||
auto& tbl = proxy.local_db().find_column_family(cm.column_family_id());
|
||||
auto& s = tbl.schema();
|
||||
auto mut = co_await to_mutation_gently(cm, s);
|
||||
|
||||
need_system_topology_flush = need_system_topology_flush || should_flush_system_topology_after_applying(mut, proxy.data_dictionary());
|
||||
if (proxy.data_dictionary().has_schema(db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES)) {
|
||||
collect_client_routes_update(mut, client_routes_update);
|
||||
}
|
||||
|
||||
mutations.emplace_back(co_await freeze_gently(mut), s);
|
||||
}
|
||||
} catch (replica::no_such_column_family& e) {
|
||||
@@ -184,8 +147,6 @@ future<> write_mutations_to_database(storage_service& storage_service, storage_p
|
||||
slogger.trace("write_mutations_to_database: flushing {}.{}", db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
|
||||
co_await proxy.get_db().local().flush(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
|
||||
}
|
||||
|
||||
co_await notify_client_route_change_if_needed(storage_service, client_routes_update);
|
||||
}
|
||||
|
||||
group0_state_machine::modules_to_reload group0_state_machine::get_modules_to_reload(const utils::chunked_vector<canonical_mutation>& mutations) {
|
||||
@@ -290,7 +251,7 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
|
||||
[&] (topology_change& chng) -> future<> {
|
||||
auto modules_to_reload = get_modules_to_reload(chng.mutations);
|
||||
auto tablet_keys = replica::get_tablet_metadata_change_hint(chng.mutations);
|
||||
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(chng.mutations));
|
||||
co_await write_mutations_to_database(_sp, cmd.creator_addr, std::move(chng.mutations));
|
||||
co_await _ss.topology_transition({.tablets_hint = std::move(tablet_keys)});
|
||||
co_await reload_modules(std::move(modules_to_reload));
|
||||
},
|
||||
@@ -302,7 +263,7 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
|
||||
},
|
||||
[&] (write_mutations& muts) -> future<> {
|
||||
auto modules_to_reload = get_modules_to_reload(muts.mutations);
|
||||
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(muts.mutations));
|
||||
co_await write_mutations_to_database(_sp, cmd.creator_addr, std::move(muts.mutations));
|
||||
co_await reload_modules(std::move(modules_to_reload));
|
||||
}
|
||||
), cmd.change);
|
||||
@@ -432,7 +393,6 @@ future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
|
||||
|
||||
future<> group0_state_machine::transfer_snapshot(raft::server_id from_id, raft::snapshot_descriptor snp) {
|
||||
try {
|
||||
co_await utils::get_local_injector().inject("block_group0_transfer_snapshot", utils::wait_for_message(300s));
|
||||
// Note that this may bring newer state than the group0 state machine raft's
|
||||
// log, so some raft entries may be double applied, but since the state
|
||||
// machine is idempotent it is not a problem.
|
||||
@@ -491,23 +451,11 @@ future<> group0_state_machine::transfer_snapshot(raft::server_id from_id, raft::
|
||||
co_await _sp.get_db().local().flush(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
|
||||
}
|
||||
|
||||
client_routes_service::client_route_keys client_routes_update;
|
||||
if (raft_snp) {
|
||||
if (_sp.data_dictionary().has_schema(db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES)) {
|
||||
auto s_client_routes = db::system_keyspace::client_routes();
|
||||
for (auto& canonical_mut : raft_snp->mutations) {
|
||||
if (canonical_mut.column_family_id() == s_client_routes->id()) {
|
||||
auto mut = co_await to_mutation_gently(canonical_mut, s_client_routes);
|
||||
slogger.trace("transfer snapshot: raft snapshot includes client_routes mutation");
|
||||
collect_client_routes_update(mut, client_routes_update);
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await mutate_locally(std::move(raft_snp->mutations), _sp);
|
||||
}
|
||||
|
||||
co_await _ss.auth_cache().load_all();
|
||||
co_await notify_client_route_change_if_needed(_ss, client_routes_update);
|
||||
|
||||
co_await _sp.mutate_locally({std::move(history_mut)}, nullptr);
|
||||
} catch (const abort_requested_exception&) {
|
||||
|
||||
@@ -130,6 +130,6 @@ public:
|
||||
bool should_flush_system_topology_after_applying(const mutation& mut, const data_dictionary::database db);
|
||||
|
||||
// Used to write data to topology and other tables except schema tables.
|
||||
future<> write_mutations_to_database(storage_service& storage_service, storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms);
|
||||
future<> write_mutations_to_database(storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms);
|
||||
|
||||
} // end of namespace service
|
||||
|
||||
@@ -254,10 +254,6 @@ public:
|
||||
group0_batch(const group0_batch&) = delete;
|
||||
group0_batch(group0_batch&&) = default;
|
||||
|
||||
const group0_guard& guard() const {
|
||||
return _guard.value();
|
||||
}
|
||||
|
||||
// Gets timestamp which should be used when building mutations.
|
||||
api::timestamp_type write_timestamp() const;
|
||||
utils::UUID new_group0_state_id() const;
|
||||
|
||||
@@ -1114,7 +1114,7 @@ private:
|
||||
// only for a truncate which is still waiting.
|
||||
if (_topology_state_machine._topology.global_request) {
|
||||
utils::UUID ongoing_global_request_id = _topology_state_machine._topology.global_request_id.value();
|
||||
const auto topology_requests_entry = co_await _sys_ks.local().get_topology_request_entry(ongoing_global_request_id);
|
||||
const auto topology_requests_entry = co_await _sys_ks.local().get_topology_request_entry(ongoing_global_request_id, true);
|
||||
auto global_request = std::get<service::global_topology_request>(topology_requests_entry.request_type);
|
||||
if (global_request == global_topology_request::truncate_table) {
|
||||
std::optional<topology::transition_state>& tstate = _topology_state_machine._topology.tstate;
|
||||
|
||||
@@ -205,7 +205,6 @@ storage_service::storage_service(abort_source& abort_source,
|
||||
cql3::query_processor& qp,
|
||||
sharded<qos::service_level_controller>& sl_controller,
|
||||
auth::cache& auth_cache,
|
||||
sharded<client_routes_service>& client_routes,
|
||||
topology_state_machine& topology_state_machine,
|
||||
db::view::view_building_state_machine& view_building_state_machine,
|
||||
tasks::task_manager& tm,
|
||||
@@ -225,13 +224,11 @@ storage_service::storage_service(abort_source& abort_source,
|
||||
, _snitch(snitch)
|
||||
, _sl_controller(sl_controller)
|
||||
, _auth_cache(auth_cache)
|
||||
, _client_routes(client_routes)
|
||||
, _group0(nullptr)
|
||||
, _async_gate("storage_service")
|
||||
, _node_ops_abort_thread(node_ops_abort_thread())
|
||||
, _node_ops_module(make_shared<node_ops::task_manager_module>(tm, *this))
|
||||
, _tablets_module(make_shared<service::task_manager_module>(tm, *this))
|
||||
, _global_topology_requests_module(make_shared<service::topo::task_manager_module>(tm))
|
||||
, _address_map(address_map)
|
||||
, _shared_token_metadata(stm)
|
||||
, _erm_factory(erm_factory)
|
||||
@@ -255,11 +252,9 @@ storage_service::storage_service(abort_source& abort_source,
|
||||
{
|
||||
tm.register_module(_node_ops_module->get_name(), _node_ops_module);
|
||||
tm.register_module(_tablets_module->get_name(), _tablets_module);
|
||||
tm.register_module(_global_topology_requests_module->get_name(), _global_topology_requests_module);
|
||||
if (this_shard_id() == 0) {
|
||||
_node_ops_module->make_virtual_task<node_ops::node_ops_virtual_task>(*this);
|
||||
_tablets_module->make_virtual_task<service::tablet_virtual_task>(*this);
|
||||
_global_topology_requests_module->make_virtual_task<service::topo::global_topology_request_virtual_task>(*this);
|
||||
}
|
||||
register_metrics();
|
||||
|
||||
@@ -588,16 +583,12 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
|
||||
}
|
||||
break;
|
||||
case node_state::decommissioning:
|
||||
[[fallthrough]];
|
||||
case node_state::removing:
|
||||
// A decommissioning or removing node loses its tokens when topology moves to left_token_ring.
|
||||
// A decommissioning node loses its tokens when topology moves to left_token_ring.
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::left_token_ring) {
|
||||
if (rs.state == node_state::removing && !_feature_service.removenode_with_left_token_ring) {
|
||||
on_internal_error(
|
||||
rtlogger, "removenode operation can only enter the left_token_ring state when REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled");
|
||||
}
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case node_state::removing:
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::rollback_to_normal) {
|
||||
// no need for double writes anymore since op failed
|
||||
co_await process_normal_node(id, host_id, ip, rs);
|
||||
@@ -1384,34 +1375,6 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// Reports whether a keyspace_rf_change global topology request for keyspace
// `ks` is present in the topology state.
//
// Checked in order: the current global request id (if any), the paused RF
// change requests, then the global requests queue. Each candidate's request
// entry is loaded from the system keyspace. The loops yield between
// candidates. `guard` is not read here.
future<bool> storage_service::ongoing_rf_change(const group0_guard& guard, sstring ks) const {
    auto is_rf_change_of_ks = [&] (utils::UUID request_id) -> future<bool> {
        auto entry = co_await _sys_ks.local().get_topology_request_entry(request_id);
        const auto* global_req = std::get_if<global_topology_request>(&entry.request_type);
        if (global_req == nullptr || *global_req != global_topology_request::keyspace_rf_change) {
            co_return false;
        }
        const auto& target_ks = entry.new_keyspace_rf_change_ks_name;
        co_return target_ks.has_value() && target_ks.value() == ks;
    };

    const auto& topo = _topology_state_machine._topology;

    if (topo.global_request_id.has_value()) {
        const auto current_id = topo.global_request_id.value();
        if (co_await is_rf_change_of_ks(current_id)) {
            co_return true;
        }
    }

    for (auto paused_id : topo.paused_rf_change_requests) {
        if (co_await is_rf_change_of_ks(paused_id)) {
            co_return true;
        }
        co_await coroutine::maybe_yield();
    }

    for (auto queued_id : topo.global_requests_queue) {
        if (co_await is_rf_change_of_ks(queued_id)) {
            co_return true;
        }
        co_await coroutine::maybe_yield();
    }

    co_return false;
}
|
||||
|
||||
future<> storage_service::raft_initialize_discovery_leader(const join_node_request_params& params) {
|
||||
if (params.replaced_id.has_value()) {
|
||||
throw std::runtime_error(::format("Cannot perform a replace operation because this is the first node in the cluster"));
|
||||
@@ -1457,7 +1420,7 @@ future<> storage_service::raft_initialize_discovery_leader(const join_node_reque
|
||||
_migration_manager.local().get_group0_client().get_history_gc_duration(), "bootstrap: adding myself as the first node to the topology");
|
||||
auto mutation_creator_addr = _sys_ks.local().local_db().get_token_metadata().get_topology().my_address();
|
||||
|
||||
co_await write_mutations_to_database(*this, _qp.proxy(), mutation_creator_addr, std::move(change.mutations));
|
||||
co_await write_mutations_to_database(_qp.proxy(), mutation_creator_addr, std::move(change.mutations));
|
||||
co_await _qp.proxy().mutate_locally({history_append}, nullptr);
|
||||
}
|
||||
|
||||
@@ -3480,7 +3443,6 @@ future<> storage_service::stop() {
|
||||
_listeners.clear();
|
||||
co_await _tablets_module->stop();
|
||||
co_await _node_ops_module->stop();
|
||||
co_await _global_topology_requests_module->stop();
|
||||
co_await _async_gate.close();
|
||||
co_await std::move(_node_ops_abort_thread);
|
||||
_tablet_split_monitor_event.signal();
|
||||
@@ -5063,50 +5025,6 @@ future<> storage_service::wait_for_topology_not_busy() {
|
||||
}
|
||||
}
|
||||
|
||||
// Aborts a paused RF change request identified by request_id.
//
// Calls made on another shard are forwarded to shard 0. Throws
// std::runtime_error when the rack_list_rf feature is not enabled. If the
// request is not in the paused list, a warning is logged and the call returns
// without changes. Otherwise two mutations are submitted as one
// topology_change: one built with resume_rf_change_request for this id, and
// one marking the request done with "Aborted by user request". The attempt
// is repeated for as long as add_entry fails with group0_concurrent_modification.
future<> storage_service::abort_paused_rf_change(utils::UUID request_id) {
    auto holder = _async_gate.hold();

    if (this_shard_id() != 0) {
        // group0 is only set on shard 0.
        co_return co_await container().invoke_on(0, [&] (auto& shard0_service) {
            return shard0_service.abort_paused_rf_change(request_id);
        });
    }

    if (!_feature_service.rack_list_rf) {
        throw std::runtime_error("The RACK_LIST_RF feature is not enabled on the cluster yet");
    }

    for (;;) {
        auto guard = co_await _group0->client().start_operation(_group0_as, raft_timeout{});

        // Re-checked on every attempt, after a fresh guard has been obtained.
        auto& paused_requests = _topology_state_machine._topology.paused_rf_change_requests;
        if (!std::ranges::contains(paused_requests, request_id)) {
            slogger.warn("RF change request with id '{}' is not paused, so it can't be aborted", request_id);
            co_return;
        }

        auto resume_mutation = canonical_mutation(topology_mutation_builder(guard.write_timestamp())
                .resume_rf_change_request(paused_requests, request_id).build());
        auto done_mutation = canonical_mutation(topology_request_tracking_mutation_builder(request_id)
                .done("Aborted by user request")
                .build());

        utils::chunked_vector<canonical_mutation> updates;
        updates.push_back(std::move(resume_mutation));
        updates.push_back(std::move(done_mutation));

        topology_change change{std::move(updates)};
        group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard,
                format("aborting rf change request {}", request_id));

        try {
            co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), _group0_as, raft_timeout{});
            co_return;
        } catch (group0_concurrent_modification&) {
            slogger.info("aborting request {}: concurrent modification, retrying.", request_id);
        }
    }
}
|
||||
|
||||
semaphore& storage_service::get_do_sample_sstables_concurrency_limiter() {
|
||||
return _do_sample_sstables_concurrency_limiter;
|
||||
}
|
||||
@@ -5310,7 +5228,7 @@ future<> storage_service::raft_check_and_repair_cdc_streams() {
|
||||
request_id = _topology_state_machine._topology.global_request_id.value();
|
||||
} else if (!_topology_state_machine._topology.global_requests_queue.empty()) {
|
||||
request_id = _topology_state_machine._topology.global_requests_queue[0];
|
||||
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id);
|
||||
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id, true);
|
||||
curr_req = std::get<global_topology_request>(req_entry.request_type);
|
||||
} else {
|
||||
request_id = utils::UUID{};
|
||||
@@ -6526,19 +6444,14 @@ future<> storage_service::clone_locally_tablet_storage(locator::global_tablet_id
|
||||
leaving.host, pending.host));
|
||||
}
|
||||
|
||||
// All sstables cloned locally will be left unsealed, until they're loaded into the table.
|
||||
// This is to guarantee no unsplit sstables will be left sealed on disk, which could
|
||||
// cause problems if unsplit sstables are found after split was ACKed to coordinator.
|
||||
bool leave_unsealed = true;
|
||||
|
||||
auto d = co_await smp::submit_to(leaving.shard, [this, tablet, leave_unsealed] () -> future<utils::chunked_vector<sstables::entry_descriptor>> {
|
||||
auto d = co_await smp::submit_to(leaving.shard, [this, tablet] () -> future<utils::chunked_vector<sstables::entry_descriptor>> {
|
||||
auto& table = _db.local().find_column_family(tablet.table);
|
||||
auto op = table.stream_in_progress();
|
||||
co_return co_await table.clone_tablet_storage(tablet.tablet, leave_unsealed);
|
||||
co_return co_await table.clone_tablet_storage(tablet.tablet);
|
||||
});
|
||||
rtlogger.debug("Cloned storage of tablet {} from leaving replica {}, {} sstables were found", tablet, leaving, d.size());
|
||||
|
||||
auto load_sstable = [leave_unsealed] (const dht::sharder& sharder, replica::table& t, sstables::entry_descriptor d) -> future<sstables::shared_sstable> {
|
||||
auto load_sstable = [] (const dht::sharder& sharder, replica::table& t, sstables::entry_descriptor d) -> future<sstables::shared_sstable> {
|
||||
auto& mng = t.get_sstables_manager();
|
||||
auto sst = mng.make_sstable(t.schema(), t.get_storage_options(), d.generation, d.state.value_or(sstables::sstable_state::normal),
|
||||
d.version, d.format, db_clock::now(), default_io_error_handler_gen());
|
||||
@@ -6546,8 +6459,7 @@ future<> storage_service::clone_locally_tablet_storage(locator::global_tablet_id
|
||||
// will still point to leaving replica at this stage in migration. If node goes down,
|
||||
// SSTables will be loaded at pending replica and migration is retried, so correctness
|
||||
// wise, we're good.
|
||||
auto cfg = sstables::sstable_open_config{ .current_shard_as_sstable_owner = true,
|
||||
.unsealed_sstable = leave_unsealed };
|
||||
auto cfg = sstables::sstable_open_config{ .current_shard_as_sstable_owner = true };
|
||||
co_await sst->load(sharder, cfg);
|
||||
co_return sst;
|
||||
};
|
||||
@@ -6555,23 +6467,16 @@ future<> storage_service::clone_locally_tablet_storage(locator::global_tablet_id
|
||||
co_await smp::submit_to(pending.shard, [this, tablet, load_sstable, d = std::move(d)] () mutable -> future<> {
|
||||
// Loads cloned sstables from leaving replica into pending one.
|
||||
auto& table = _db.local().find_column_family(tablet.table);
|
||||
auto& sstm = table.get_sstables_manager();
|
||||
auto op = table.stream_in_progress();
|
||||
dht::auto_refreshing_sharder sharder(table.shared_from_this());
|
||||
|
||||
std::unordered_set<sstables::shared_sstable> ssts;
|
||||
std::vector<sstables::shared_sstable> ssts;
|
||||
ssts.reserve(d.size());
|
||||
for (auto&& sst_desc : d) {
|
||||
ssts.insert(co_await load_sstable(sharder, table, std::move(sst_desc)));
|
||||
ssts.push_back(co_await load_sstable(sharder, table, std::move(sst_desc)));
|
||||
}
|
||||
auto on_add = [&ssts, &sstm] (sstables::shared_sstable loading_sst) -> future<> {
|
||||
if (ssts.contains(loading_sst)) {
|
||||
auto cfg = sstm.configure_writer(loading_sst->get_origin());
|
||||
co_await loading_sst->seal_sstable(cfg.backup);
|
||||
}
|
||||
co_return;
|
||||
};
|
||||
auto loaded_ssts = co_await table.add_new_sstables_and_update_cache(std::vector(ssts.begin(), ssts.end()), on_add);
|
||||
_view_building_worker.local().load_sstables(tablet.table, loaded_ssts);
|
||||
co_await table.add_sstables_and_update_cache(ssts);
|
||||
_view_building_worker.local().load_sstables(tablet.table, ssts);
|
||||
});
|
||||
rtlogger.debug("Successfully loaded storage of tablet {} into pending replica {}", tablet, pending);
|
||||
}
|
||||
@@ -7797,9 +7702,6 @@ void storage_service::init_messaging_service() {
|
||||
additional_tables.push_back(db::system_keyspace::cdc_streams_state()->id());
|
||||
additional_tables.push_back(db::system_keyspace::cdc_streams_history()->id());
|
||||
}
|
||||
if (ss._feature_service.client_routes) {
|
||||
additional_tables.push_back(db::system_keyspace::client_routes()->id());
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& table : boost::join(params.tables, additional_tables)) {
|
||||
@@ -8139,18 +8041,6 @@ future<> endpoint_lifecycle_notifier::notify_joined(gms::inet_address endpoint,
|
||||
});
|
||||
}
|
||||
|
||||
future<> endpoint_lifecycle_notifier::notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {
|
||||
co_await seastar::async([this, &client_route_keys] {
|
||||
_subscribers.thread_for_each([&client_route_keys] (endpoint_lifecycle_subscriber* subscriber) {
|
||||
try {
|
||||
subscriber->on_client_routes_change(client_route_keys);
|
||||
} catch (...) {
|
||||
slogger.warn("Client routes notification failed: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> storage_service::notify_joined(inet_address endpoint, locator::host_id hid) {
|
||||
co_await utils::get_local_injector().inject(
|
||||
"storage_service_notify_joined_sleep", std::chrono::milliseconds{500});
|
||||
@@ -8175,10 +8065,6 @@ future<> storage_service::notify_cql_change(inet_address endpoint, locator::host
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {
|
||||
co_await _client_routes.local().notify_client_routes_change(client_route_keys);
|
||||
}
|
||||
|
||||
bool storage_service::is_normal_state_handled_on_boot(locator::host_id node) {
|
||||
return _normal_state_handled_on_boot.contains(node);
|
||||
}
|
||||
|
||||
@@ -17,10 +17,8 @@
|
||||
#include "gms/endpoint_state.hh"
|
||||
#include "gms/i_endpoint_state_change_subscriber.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "service/client_routes.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
#include "service/qos/service_level_controller.hh"
|
||||
#include "service/task_manager_module.hh"
|
||||
#include "service/topology_guard.hh"
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/snitch_base.hh"
|
||||
@@ -50,7 +48,6 @@
|
||||
#include "service/tablet_allocator.hh"
|
||||
#include "service/tablet_operation.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "utils/user_provided_param.hh"
|
||||
#include "utils/sequenced_set.hh"
|
||||
#include "service/topology_coordinator.hh"
|
||||
@@ -205,7 +202,6 @@ private:
|
||||
sharded<locator::snitch_ptr>& _snitch;
|
||||
sharded<qos::service_level_controller>& _sl_controller;
|
||||
auth::cache& _auth_cache;
|
||||
sharded<client_routes_service>& _client_routes;
|
||||
|
||||
// Engaged on shard 0 before `join_cluster`.
|
||||
service::raft_group0* _group0;
|
||||
@@ -229,7 +225,6 @@ private:
|
||||
future<> _node_ops_abort_thread;
|
||||
shared_ptr<node_ops::task_manager_module> _node_ops_module;
|
||||
shared_ptr<service::task_manager_module> _tablets_module;
|
||||
shared_ptr<service::topo::task_manager_module> _global_topology_requests_module;
|
||||
gms::gossip_address_map& _address_map;
|
||||
void node_ops_insert(node_ops_id, gms::inet_address coordinator, std::list<inet_address> ignore_nodes,
|
||||
std::function<future<>()> abort_func);
|
||||
@@ -274,7 +269,6 @@ public:
|
||||
cql3::query_processor& qp,
|
||||
sharded<qos::service_level_controller>& sl_controller,
|
||||
auth::cache& auth_cache,
|
||||
sharded<client_routes_service>& _client_routes,
|
||||
topology_state_machine& topology_state_machine,
|
||||
db::view::view_building_state_machine& view_building_state_machine,
|
||||
tasks::task_manager& tm,
|
||||
@@ -937,7 +931,6 @@ public:
|
||||
bool topology_global_queue_empty() const {
|
||||
return !_topology_state_machine._topology.global_request.has_value();
|
||||
}
|
||||
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
|
||||
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
|
||||
future<> initialize_done_topology_upgrade_state();
|
||||
private:
|
||||
@@ -1075,8 +1068,6 @@ public:
|
||||
future<sstring> wait_for_topology_request_completion(utils::UUID id, bool require_entry = true);
|
||||
future<> wait_for_topology_not_busy();
|
||||
|
||||
future<> abort_paused_rf_change(utils::UUID request_id);
|
||||
|
||||
private:
|
||||
semaphore _do_sample_sstables_concurrency_limiter{1};
|
||||
// To avoid overly-large RPC messages, `do_sample_sstables` is broken up into several rounds.
|
||||
@@ -1147,14 +1138,11 @@ public:
|
||||
future<std::vector<std::byte>> train_dict(utils::chunked_vector<temporary_buffer<char>> sample);
|
||||
future<> publish_new_sstable_dict(table_id, std::span<const std::byte>, service::raft_group0_client&);
|
||||
void set_train_dict_callback(decltype(_train_dict));
|
||||
seastar::future<> notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys);
|
||||
|
||||
|
||||
friend class join_node_rpc_handshaker;
|
||||
friend class node_ops::node_ops_virtual_task;
|
||||
friend class tasks::task_manager;
|
||||
friend class tablet_virtual_task;
|
||||
friend class topo::global_topology_request_virtual_task;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -6,16 +6,12 @@
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "cql3/statements/ks_prop_defs.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "locator/topology.hh"
|
||||
#include "replica/tablets.hh"
|
||||
#include "locator/tablet_replication_strategy.hh"
|
||||
#include "replica/database.hh"
|
||||
#include "service/migration_listener.hh"
|
||||
#include "service/tablet_allocator.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/stall_free.hh"
|
||||
@@ -26,7 +22,6 @@
|
||||
#include "replica/database.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
#include <iterator>
|
||||
#include <ranges>
|
||||
#include <utility>
|
||||
#include <fmt/ranges.h>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
@@ -242,147 +237,6 @@ struct migration_candidate {
|
||||
migration_badness badness;
|
||||
};
|
||||
|
||||
struct colocation_source {
|
||||
locator::global_tablet_id gid;
|
||||
locator::tablet_replica replica;
|
||||
};
|
||||
|
||||
using colocation_source_set = utils::chunked_vector<colocation_source>;
|
||||
using colocation_sources_by_destination_rack = std::unordered_map<endpoint_dc_rack, colocation_source_set>;
|
||||
|
||||
struct rack_list_colocation_state {
|
||||
colocation_sources_by_destination_rack dst_dc_rack_to_tablets;
|
||||
std::unordered_map<endpoint_dc_rack, std::unordered_set<utils::UUID>> dst_to_requests;
|
||||
utils::UUID request_to_resume;
|
||||
|
||||
void maybe_set_request_to_resume(const utils::UUID& id) {
|
||||
if (!request_to_resume) {
|
||||
request_to_resume = id;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
future<rack_list_colocation_state> find_required_rack_list_colocations(
|
||||
replica::database& db,
|
||||
token_metadata_ptr tmptr,
|
||||
db::system_keyspace* sys_ks,
|
||||
const std::unordered_set<utils::UUID>& paused_rf_change_requests,
|
||||
const std::unordered_set<locator::global_tablet_id>& already_planned_migrations) {
|
||||
rack_list_colocation_state state;
|
||||
|
||||
auto get_node = [&] (locator::host_id host) -> const locator::node& {
|
||||
auto* node = tmptr->get_topology().find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
return *node;
|
||||
};
|
||||
for (const auto& request_id : paused_rf_change_requests) {
|
||||
auto req_entry = co_await sys_ks->get_topology_request_entry(request_id);
|
||||
sstring ks_name = *req_entry.new_keyspace_rf_change_ks_name;
|
||||
|
||||
if (!db.has_keyspace(ks_name)) {
|
||||
state.maybe_set_request_to_resume(request_id);
|
||||
continue;
|
||||
}
|
||||
auto& ks = db.find_keyspace(ks_name);
|
||||
std::unordered_map<sstring, sstring> saved_ks_props = *req_entry.new_keyspace_rf_change_data;
|
||||
cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{saved_ks_props.begin(), saved_ks_props.end()}};
|
||||
new_ks_props.validate();
|
||||
auto ks_md = new_ks_props.as_ks_metadata_update(ks.metadata(), *tmptr, db.features(), db.get_config());
|
||||
|
||||
auto tables_with_mvs = ks.metadata()->tables();
|
||||
auto views = ks.metadata()->views();
|
||||
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
|
||||
if (tables_with_mvs.empty()) {
|
||||
state.maybe_set_request_to_resume(request_id);
|
||||
continue;
|
||||
}
|
||||
bool no_changes_needed = true;
|
||||
for (const auto& table_or_mv : tables_with_mvs) {
|
||||
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
|
||||
continue;
|
||||
}
|
||||
const auto& tmap = tmptr->tablets().get_tablet_map(table_or_mv->id());
|
||||
const auto& new_replication_strategy_config = ks_md->strategy_options();
|
||||
for (auto& [dc, rf_value] : new_replication_strategy_config) {
|
||||
if (!std::holds_alternative<rack_list>(rf_value)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto racks = std::get<rack_list>(rf_value) | std::ranges::to<std::unordered_set<sstring>>();
|
||||
co_await tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& ti) -> future<> {
|
||||
auto gid = locator::global_tablet_id{table_or_mv->id(), tid};
|
||||
|
||||
// Current replicas in this DC. There might be multiple replicas in the same rack.
|
||||
auto dc_replicas = ti.replicas | std::views::filter([&] (const tablet_replica& r) {
|
||||
return get_node(r.host).dc_rack().dc == dc;
|
||||
}) | std::ranges::to<std::vector<tablet_replica>>();
|
||||
|
||||
if (dc_replicas.empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Find replicas that are not in the desired racks (src_replicas)
|
||||
// and racks that do not have replicas yet (dst_racks).
|
||||
auto dst_racks = racks;
|
||||
std::vector<tablet_replica> src_replicas;
|
||||
for (const auto& r : dc_replicas) {
|
||||
auto rack = get_node(r.host).dc_rack().rack;
|
||||
if (dst_racks.find(rack) != dst_racks.end()) {
|
||||
// There is already a replica in this rack.
|
||||
dst_racks.erase(rack);
|
||||
} else {
|
||||
// There is a replica in this rack, but it needs to be moved.
|
||||
src_replicas.push_back(r);
|
||||
}
|
||||
}
|
||||
|
||||
auto zipped = std::views::zip(src_replicas, dst_racks);
|
||||
if (!std::ranges::empty(zipped)) {
|
||||
no_changes_needed = false;
|
||||
}
|
||||
|
||||
// Skip tablet that is in transitions.
|
||||
auto* tti = tmap.get_tablet_transition_info(tid);
|
||||
if (tti) {
|
||||
lblogger.debug("Skipped colocation for tablet={} which is already in transition={}", gid, tti->transition);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Skip tablet that is about to be in transition.
|
||||
if (already_planned_migrations.contains(gid)) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
for (auto src_dst : zipped) {
|
||||
auto src = std::get<0>(src_dst);
|
||||
auto dst = std::get<1>(src_dst);
|
||||
auto endpoint = locator::endpoint_dc_rack{dc, dst};
|
||||
|
||||
state.dst_dc_rack_to_tablets[endpoint].emplace_back(colocation_source{{table_or_mv->id(), tid}, src});
|
||||
state.dst_to_requests[endpoint].insert(request_id);
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
}
|
||||
if (no_changes_needed) {
|
||||
state.maybe_set_request_to_resume(request_id);
|
||||
}
|
||||
}
|
||||
co_return state;
|
||||
}
|
||||
|
||||
future<bool> requires_rack_list_colocation(
|
||||
replica::database& db,
|
||||
locator::token_metadata_ptr tmptr,
|
||||
db::system_keyspace* sys_ks,
|
||||
utils::UUID request_id) {
|
||||
auto res = co_await find_required_rack_list_colocations(db, tmptr, sys_ks, {request_id}, {});
|
||||
co_return res.request_to_resume != request_id;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<>
|
||||
@@ -804,8 +658,6 @@ class load_balancer {
|
||||
|
||||
replica::database& _db;
|
||||
token_metadata_ptr _tm;
|
||||
service::topology* _topology;
|
||||
db::system_keyspace* _sys_ks;
|
||||
std::optional<locator::load_sketch> _load_sketch;
|
||||
// Holds the set of tablets already scheduled for transition during plan-making.
|
||||
std::unordered_set<global_tablet_id> _scheduled_tablets;
|
||||
@@ -890,10 +742,7 @@ private:
|
||||
return streaming_infos;
|
||||
}
|
||||
public:
|
||||
load_balancer(replica::database& db, token_metadata_ptr tm,
|
||||
service::topology* topology,
|
||||
db::system_keyspace* sys_ks,
|
||||
locator::load_stats_ptr table_load_stats,
|
||||
load_balancer(replica::database& db, token_metadata_ptr tm, locator::load_stats_ptr table_load_stats,
|
||||
load_balancer_stats_manager& stats,
|
||||
uint64_t target_tablet_size,
|
||||
unsigned tablets_per_shard_goal,
|
||||
@@ -902,26 +751,19 @@ public:
|
||||
, _tablets_per_shard_goal(tablets_per_shard_goal)
|
||||
, _db(db)
|
||||
, _tm(std::move(tm))
|
||||
, _topology(topology)
|
||||
, _sys_ks(sys_ks)
|
||||
, _table_load_stats(std::move(table_load_stats))
|
||||
, _stats(stats)
|
||||
, _skiplist(std::move(skiplist))
|
||||
{ }
|
||||
|
||||
bool ongoing_rack_list_colocation() const {
|
||||
return _topology != nullptr && _sys_ks != nullptr && !_topology->paused_rf_change_requests.empty();
|
||||
}
|
||||
|
||||
future<migration_plan> make_plan() {
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
migration_plan plan;
|
||||
|
||||
auto rack_list_colocation = ongoing_rack_list_colocation();
|
||||
if (!utils::get_local_injector().enter("tablet_migration_bypass")) {
|
||||
// Prepare plans for each DC separately and combine them to be executed in parallel.
|
||||
for (auto&& dc : topo.get_datacenters()) {
|
||||
if (_db.get_config().rf_rack_valid_keyspaces() || rack_list_colocation) {
|
||||
if (_db.get_config().rf_rack_valid_keyspaces()) {
|
||||
for (auto rack : topo.get_datacenter_racks().at(dc) | std::views::keys) {
|
||||
auto rack_plan = co_await make_plan(dc, rack);
|
||||
auto level = rack_plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
@@ -937,10 +779,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
if (rack_list_colocation) {
|
||||
plan.merge(co_await make_rack_list_colocation_plan(plan));
|
||||
}
|
||||
|
||||
// Merge table-wide resize decisions, may emit new decisions, revoke or finalize ongoing ones.
|
||||
// Note : Resize plans should be generated before repair plans to avoid scheduling repairs when there is pending resize finalization
|
||||
plan.merge_resize_plan(co_await make_resize_plan(plan));
|
||||
@@ -951,8 +789,8 @@ public:
|
||||
}
|
||||
|
||||
auto level = plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
lblogger.log(level, "Prepared {} migration plans, out of which there were {} tablet migration(s) and {} resize decision(s) and {} tablet repair(s) and {} rack-list colocation(s)",
|
||||
plan.size(), plan.tablet_migration_count(), plan.resize_decision_count(), plan.tablet_repair_count(), plan.tablet_rack_list_colocation_count());
|
||||
lblogger.log(level, "Prepared {} migration plans, out of which there were {} tablet migration(s) and {} resize decision(s) and {} tablet repair(s)",
|
||||
plan.size(), plan.tablet_migration_count(), plan.resize_decision_count(), plan.tablet_repair_count());
|
||||
co_return std::move(plan);
|
||||
}
|
||||
|
||||
@@ -977,58 +815,6 @@ public:
|
||||
co_return false;
|
||||
}
|
||||
|
||||
void ensure_node(node_load_map& nodes, host_id host) {
|
||||
if (nodes.contains(host)) {
|
||||
return;
|
||||
}
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
auto* node = topo.find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
node_load& load = nodes[host];
|
||||
load.id = host;
|
||||
load.node = node;
|
||||
load.shard_count = node->get_shard_count();
|
||||
load.shards.resize(load.shard_count);
|
||||
if (!load.shard_count) {
|
||||
throw std::runtime_error(format("Shard count of {} not found in topology", host));
|
||||
}
|
||||
if (!_db.features().tablet_load_stats_v2) {
|
||||
// This way load calculation will hold tablet count.
|
||||
load.capacity = _target_tablet_size * load.shard_count;
|
||||
} else if (_table_load_stats && _table_load_stats->capacity.contains(host)) {
|
||||
load.capacity = _table_load_stats->capacity.at(host);
|
||||
}
|
||||
}
|
||||
|
||||
future<> consider_scheduled_load(node_load_map& nodes) {
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
for (auto&& [table, tables] : _tm->tablets().all_table_groups()) {
|
||||
const auto& tmap = _tm->tablets().get_tablet_map(table);
|
||||
for (auto&& [tid, trinfo]: tmap.transitions()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
if (is_streaming(&trinfo)) {
|
||||
auto& tinfo = tmap.get_tablet_info(tid);
|
||||
apply_load(nodes, get_migration_streaming_info(topo, tinfo, trinfo));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
future<> consider_planned_load(node_load_map& nodes, const migration_plan& mplan) {
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
auto& tablet_meta = _tm->tablets();
|
||||
|
||||
for (const tablet_migration_info& tmi : mplan.migrations()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
auto& tmap = tablet_meta.get_tablet_map(tmi.tablet.table);
|
||||
auto& tinfo = tmap.get_tablet_info(tmi.tablet.tablet);
|
||||
auto streaming_info = get_migration_streaming_info(topo, tinfo, tmi);
|
||||
apply_load(nodes, streaming_info);
|
||||
}
|
||||
}
|
||||
|
||||
future<tablet_repair_plan> make_repair_plan(const migration_plan& mplan) {
|
||||
lblogger.debug("In make_repair_plan");
|
||||
|
||||
@@ -1044,19 +830,53 @@ public:
|
||||
// Populate the load of the migration that is already in the plan
|
||||
node_load_map nodes;
|
||||
// TODO: share code with make_plan()
|
||||
auto ensure_node = [&] (host_id host) {
|
||||
if (nodes.contains(host)) {
|
||||
return;
|
||||
}
|
||||
auto* node = topo.find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
node_load& load = nodes[host];
|
||||
load.id = host;
|
||||
load.node = node;
|
||||
load.shard_count = node->get_shard_count();
|
||||
load.shards.resize(load.shard_count);
|
||||
if (!load.shard_count) {
|
||||
throw std::runtime_error(format("Shard count of {} not found in topology", host));
|
||||
}
|
||||
};
|
||||
// TODO: share code with make_plan()
|
||||
topo.for_each_node([&] (const locator::node& node) {
|
||||
bool is_drained = node.get_state() == locator::node::state::being_decommissioned
|
||||
|| node.get_state() == locator::node::state::being_removed;
|
||||
if (node.get_state() == locator::node::state::normal || is_drained) {
|
||||
ensure_node(nodes, node.host_id());
|
||||
ensure_node(node.host_id());
|
||||
}
|
||||
});
|
||||
|
||||
// Consider load that is already scheduled
|
||||
co_await consider_scheduled_load(nodes);
|
||||
for (auto&& [table, tables] : _tm->tablets().all_table_groups()) {
|
||||
const auto& tmap = _tm->tablets().get_tablet_map(table);
|
||||
for (auto&& [tid, trinfo]: tmap.transitions()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
if (is_streaming(&trinfo)) {
|
||||
auto& tinfo = tmap.get_tablet_info(tid);
|
||||
apply_load(nodes, get_migration_streaming_info(topo, tinfo, trinfo));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Consider load that is about to be scheduled
|
||||
co_await consider_planned_load(nodes, mplan);
|
||||
auto& tablet_meta = _tm->tablets();
|
||||
for (const tablet_migration_info& tmi : mplan.migrations()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
auto& tmap = tablet_meta.get_tablet_map(tmi.tablet.table);
|
||||
auto& tinfo = tmap.get_tablet_info(tmi.tablet.tablet);
|
||||
auto streaming_info = get_migration_streaming_info(topo, tinfo, tmi);
|
||||
apply_load(nodes, streaming_info);
|
||||
}
|
||||
|
||||
struct repair_plan {
|
||||
locator::global_tablet_id gid;
|
||||
@@ -1139,109 +959,6 @@ public:
|
||||
co_return ret;
|
||||
}
|
||||
|
||||
future<migration_plan> make_rack_list_colocation_plan(const migration_plan& mplan) {
|
||||
lblogger.debug("In make_rack_list_colocation_plan");
|
||||
|
||||
migration_plan plan;
|
||||
tablet_rack_list_colocation_plan rack_list_plan;
|
||||
if (!ongoing_rack_list_colocation()) {
|
||||
co_return plan;
|
||||
}
|
||||
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
|
||||
auto migration_tablet_ids = co_await mplan.get_migration_tablet_ids();
|
||||
auto colocation_state = co_await find_required_rack_list_colocations(_db, _tm, _sys_ks,
|
||||
_topology->paused_rf_change_requests, std::move(migration_tablet_ids));
|
||||
|
||||
node_load_map nodes;
|
||||
topo.for_each_node([&] (const locator::node& node) {
|
||||
if (node.get_state() == locator::node::state::normal && !node.is_excluded()) {
|
||||
ensure_node(nodes, node.host_id());
|
||||
}
|
||||
});
|
||||
|
||||
// Consider load that is already scheduled.
|
||||
co_await consider_scheduled_load(nodes);
|
||||
|
||||
// Consider load that is about to be scheduled.
|
||||
co_await consider_planned_load(nodes, mplan);
|
||||
|
||||
std::unordered_set<global_tablet_id> colocation_tablet_ids;
|
||||
for (auto& [dc_rack, colocation_sources] : colocation_state.dst_dc_rack_to_tablets) {
|
||||
auto nodes_by_load_dst = nodes | std::views::filter([&] (const auto& host_load) {
|
||||
auto& [host, load] = host_load;
|
||||
auto& node = *load.node;
|
||||
return node.dc_rack() == dc_rack;
|
||||
}) | std::views::keys | std::ranges::to<std::vector<host_id>>();
|
||||
|
||||
if (nodes_by_load_dst.empty()) {
|
||||
lblogger.warn("No target nodes available for RF change colocation plan in dc {}, rack {}", dc_rack.dc, dc_rack.rack);
|
||||
if (auto it = colocation_state.dst_to_requests.find(dc_rack); it != colocation_state.dst_to_requests.end()) {
|
||||
rack_list_plan.maybe_add_request_to_resume(*it->second.begin());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
auto nodes_cmp = nodes_by_load_cmp(nodes);
|
||||
auto nodes_dst_cmp = [&] (const host_id& a, const host_id& b) {
|
||||
return nodes_cmp(b, a);
|
||||
};
|
||||
|
||||
// Ascending load heap of candidate target nodes.
|
||||
std::make_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
|
||||
const tablet_metadata& tmeta = _tm->tablets();
|
||||
for (colocation_source& source : colocation_sources) {
|
||||
if (colocation_tablet_ids.contains(source.gid)) {
|
||||
lblogger.debug("Skipped colocation of replica {} of tablet={}, another replica of which is about to be colocated", source.replica, source.gid);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Pick the least loaded node as target.
|
||||
std::pop_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
auto target = nodes_by_load_dst.back();
|
||||
auto& target_info = nodes[target];
|
||||
auto push_back_target_node = seastar::defer([&] {
|
||||
std::push_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
});
|
||||
|
||||
lblogger.debug("target node: {}, avg_load={}", target, target_info.avg_load);
|
||||
|
||||
auto dst = global_shard_id {target, _load_sketch->get_least_loaded_shard(target)};
|
||||
|
||||
lblogger.trace("target shard: {}, tablets={}, load={}", dst.shard,
|
||||
target_info.shards[dst.shard].tablet_count,
|
||||
target_info.shard_load(dst.shard, _target_tablet_size));
|
||||
|
||||
tablet_transition_kind kind = tablet_transition_kind::migration;
|
||||
migration_tablet_set source_tablets {
|
||||
.tablet_s = source.gid, // Ignore the merge co-location.
|
||||
};
|
||||
auto src = source.replica;
|
||||
auto mig = get_migration_info(source_tablets, kind, src, dst);
|
||||
auto& tmap = tmeta.get_tablet_map(source_tablets.table());
|
||||
auto mig_streaming_info = get_migration_streaming_infos(topo, tmap, mig);
|
||||
pick(*_load_sketch, dst.host, dst.shard, source_tablets);
|
||||
if (can_accept_load(nodes, mig_streaming_info)) {
|
||||
apply_load(nodes, mig_streaming_info);
|
||||
lblogger.debug("Adding migration: {}", mig);
|
||||
mark_as_scheduled(mig);
|
||||
for (auto& m : mig) {
|
||||
plan.add(std::move(m));
|
||||
colocation_tablet_ids.insert(m.tablet);
|
||||
}
|
||||
}
|
||||
update_node_load_on_migration(nodes, src, dst, source_tablets);
|
||||
}
|
||||
}
|
||||
if (colocation_state.request_to_resume) {
|
||||
rack_list_plan.maybe_add_request_to_resume(colocation_state.request_to_resume);
|
||||
}
|
||||
plan.set_rack_list_colocation_plan(std::move(rack_list_plan));
|
||||
co_return std::move(plan);
|
||||
}
|
||||
|
||||
// Returns true if a table has replicas of all its sibling tablets co-located.
|
||||
// This is used for determining whether merge can be finalized, since co-location
|
||||
// is a strict requirement for sibling tablets to be merged.
|
||||
@@ -1931,10 +1648,6 @@ public:
|
||||
const auto& table_groups = _tm->tablets().all_table_groups();
|
||||
|
||||
auto finalize_decision = [&] {
|
||||
if (utils::get_local_injector().enter("tablet_resize_finalization_postpone")) {
|
||||
return;
|
||||
}
|
||||
|
||||
_stats.for_cluster().resizes_finalized++;
|
||||
resize_plan.finalize_resize.insert(table);
|
||||
};
|
||||
@@ -3254,6 +2967,30 @@ public:
|
||||
node_load_map nodes;
|
||||
std::unordered_set<host_id> nodes_to_drain;
|
||||
|
||||
auto ensure_node = [&] (host_id host) {
|
||||
if (nodes.contains(host)) {
|
||||
return;
|
||||
}
|
||||
auto* node = topo.find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
node_load& load = nodes[host];
|
||||
load.id = host;
|
||||
load.node = node;
|
||||
load.shard_count = node->get_shard_count();
|
||||
load.shards.resize(load.shard_count);
|
||||
if (!load.shard_count) {
|
||||
throw std::runtime_error(format("Shard count of {} not found in topology", host));
|
||||
}
|
||||
if (!_db.features().tablet_load_stats_v2) {
|
||||
// This way load calculation will hold tablet count.
|
||||
load.capacity = _target_tablet_size * load.shard_count;
|
||||
} else if (_table_load_stats && _table_load_stats->capacity.contains(host)) {
|
||||
load.capacity = _table_load_stats->capacity.at(host);
|
||||
}
|
||||
};
|
||||
|
||||
_tm->for_each_token_owner([&] (const locator::node& node) {
|
||||
if (!node_filter(node)) {
|
||||
return;
|
||||
@@ -3262,7 +2999,7 @@ public:
|
||||
|| node.get_state() == locator::node::state::being_removed;
|
||||
if (node.get_state() == locator::node::state::normal || is_drained) {
|
||||
if (is_drained) {
|
||||
ensure_node(nodes, node.host_id());
|
||||
ensure_node(node.host_id());
|
||||
lblogger.info("Will drain node {} ({}) from DC {}", node.host_id(), node.get_state(), dc);
|
||||
nodes_to_drain.emplace(node.host_id());
|
||||
nodes[node.host_id()].drained = true;
|
||||
@@ -3270,7 +3007,7 @@ public:
|
||||
// Excluded nodes should not be chosen as targets for migration.
|
||||
lblogger.debug("Ignoring excluded node {}: state={}", node.host_id(), node.get_state());
|
||||
} else {
|
||||
ensure_node(nodes, node.host_id());
|
||||
ensure_node(node.host_id());
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -3303,7 +3040,7 @@ public:
|
||||
r, global_tablet_id{table, tid}));
|
||||
}
|
||||
if (node->left() && node_filter(*node)) {
|
||||
ensure_node(nodes, r.host);
|
||||
ensure_node(r.host);
|
||||
nodes_to_drain.insert(r.host);
|
||||
nodes[r.host].drained = true;
|
||||
}
|
||||
@@ -3505,7 +3242,7 @@ public:
|
||||
plan.merge(co_await make_intranode_plan(nodes, nodes_to_drain));
|
||||
}
|
||||
|
||||
if (_tm->tablets().balancing_enabled() && plan.empty() && !ongoing_rack_list_colocation()) {
|
||||
if (_tm->tablets().balancing_enabled() && plan.empty()) {
|
||||
auto dc_merge_plan = co_await make_merge_colocation_plan(dc, nodes);
|
||||
auto level = dc_merge_plan.tablet_migration_count() > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
lblogger.log(level, "Prepared {} migrations for co-locating sibling tablets in DC {}", dc_merge_plan.tablet_migration_count(), dc);
|
||||
@@ -3527,11 +3264,9 @@ class tablet_allocator_impl : public tablet_allocator::impl
|
||||
locator::load_stats_ptr _load_stats;
|
||||
private:
|
||||
load_balancer make_load_balancer(token_metadata_ptr tm,
|
||||
service::topology* topology,
|
||||
db::system_keyspace* sys_ks,
|
||||
locator::load_stats_ptr table_load_stats,
|
||||
std::unordered_set<host_id> skiplist) {
|
||||
load_balancer lb(_db, tm, topology, sys_ks, std::move(table_load_stats), _load_balancer_stats,
|
||||
load_balancer lb(_db, tm, std::move(table_load_stats), _load_balancer_stats,
|
||||
_db.get_config().target_tablet_size_in_bytes(),
|
||||
_db.get_config().tablets_per_shard_goal(),
|
||||
std::move(skiplist));
|
||||
@@ -3558,8 +3293,8 @@ public:
|
||||
_stopped = true;
|
||||
}
|
||||
|
||||
future<migration_plan> balance_tablets(token_metadata_ptr tm, service::topology* topology, db::system_keyspace* sys_ks, locator::load_stats_ptr table_load_stats, std::unordered_set<host_id> skiplist) {
|
||||
auto lb = make_load_balancer(tm, topology, sys_ks, table_load_stats ? table_load_stats : _load_stats, std::move(skiplist));
|
||||
future<migration_plan> balance_tablets(token_metadata_ptr tm, locator::load_stats_ptr table_load_stats, std::unordered_set<host_id> skiplist) {
|
||||
auto lb = make_load_balancer(tm, table_load_stats ? table_load_stats : _load_stats, std::move(skiplist));
|
||||
co_await coroutine::switch_to(_db.get_streaming_scheduling_group());
|
||||
co_return co_await lb.make_plan();
|
||||
}
|
||||
@@ -3579,7 +3314,7 @@ public:
|
||||
// Allocates new tablets for a table which is not co-located with another table.
|
||||
tablet_map allocate_tablets_for_new_base_table(const tablet_aware_replication_strategy* tablet_rs, const schema& s) {
|
||||
auto tm = _db.get_shared_token_metadata().get();
|
||||
auto lb = make_load_balancer(tm, nullptr, nullptr, nullptr, {});
|
||||
auto lb = make_load_balancer(tm, nullptr, {});
|
||||
auto plan = lb.make_sizing_plan(s.shared_from_this(), tablet_rs).get();
|
||||
auto& table_plan = plan.tables[s.id()];
|
||||
if (table_plan.target_tablet_count_aligned != table_plan.target_tablet_count) {
|
||||
@@ -3593,7 +3328,6 @@ public:
|
||||
|
||||
// Allocate tablets for multiple new tables, which may be co-located with each other, or co-located with an existing base table.
|
||||
void allocate_tablets_for_new_tables(const keyspace_metadata& ksm, const std::vector<schema_ptr>& cfms, utils::chunked_vector<mutation>& muts, api::timestamp_type ts) {
|
||||
utils::get_local_injector().inject("pause_in_allocate_tablets_for_new_table", utils::wait_for_message(std::chrono::minutes(5))).get();
|
||||
locator::replication_strategy_params params(ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option());
|
||||
auto tm = _db.get_shared_token_metadata().get();
|
||||
auto rs = abstract_replication_strategy::create_replication_strategy(ksm.strategy_name(), params, tm->get_topology());
|
||||
@@ -3635,7 +3369,7 @@ public:
|
||||
if (s.id() != base_id) {
|
||||
lblogger.debug("Creating tablets for {}.{} id={} with base={}", s.ks_name(), s.cf_name(), s.id(), base_id);
|
||||
muts.emplace_back(colocated_tablet_map_to_mutation(s.id(), s.ks_name(), s.cf_name(), base_id, ts));
|
||||
_db.get_notifier().before_allocate_tablet_map_in_notification(base_map, s, muts, ts);
|
||||
_db.get_notifier().before_allocate_tablet_map(base_map, s, muts, ts);
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -3651,7 +3385,7 @@ public:
|
||||
muts.emplace_back(std::move(m));
|
||||
return make_ready_future<>();
|
||||
}).get();
|
||||
_db.get_notifier().before_allocate_tablet_map_in_notification(base_map, s, muts, ts);
|
||||
_db.get_notifier().before_allocate_tablet_map(base_map, s, muts, ts);
|
||||
|
||||
create_colocated_tablet_maps(base_map);
|
||||
}
|
||||
@@ -3800,8 +3534,8 @@ future<> tablet_allocator::stop() {
|
||||
return impl().stop();
|
||||
}
|
||||
|
||||
future<migration_plan> tablet_allocator::balance_tablets(locator::token_metadata_ptr tm, service::topology* topology, db::system_keyspace* sys_ks, locator::load_stats_ptr load_stats, std::unordered_set<host_id> skiplist) {
|
||||
return impl().balance_tablets(std::move(tm), topology, sys_ks, std::move(load_stats), std::move(skiplist));
|
||||
future<migration_plan> tablet_allocator::balance_tablets(locator::token_metadata_ptr tm, locator::load_stats_ptr load_stats, std::unordered_set<host_id> skiplist) {
|
||||
return impl().balance_tablets(std::move(tm), std::move(load_stats), std::move(skiplist));
|
||||
}
|
||||
|
||||
void tablet_allocator::set_load_stats(locator::load_stats_ptr load_stats) {
|
||||
|
||||
@@ -14,14 +14,8 @@
|
||||
#include "locator/token_metadata_fwd.hh"
|
||||
#include <seastar/core/metrics.hh>
|
||||
|
||||
namespace db {
|
||||
class system_keyspace;
|
||||
}
|
||||
|
||||
namespace service {
|
||||
|
||||
class topology;
|
||||
|
||||
struct load_balancer_dc_stats {
|
||||
uint64_t calls = 0;
|
||||
uint64_t migrations_produced = 0;
|
||||
@@ -139,26 +133,6 @@ struct tablet_repair_plan {
|
||||
}
|
||||
};
|
||||
|
||||
struct tablet_rack_list_colocation_plan {
|
||||
utils::UUID _request_to_resume;
|
||||
|
||||
const utils::UUID& request_to_resume() const noexcept {
|
||||
return _request_to_resume;
|
||||
}
|
||||
|
||||
size_t size() const { return _request_to_resume ? 1 : 0; };
|
||||
|
||||
void merge(tablet_rack_list_colocation_plan&& other) {
|
||||
_request_to_resume = _request_to_resume ? _request_to_resume : other._request_to_resume;
|
||||
}
|
||||
|
||||
void maybe_add_request_to_resume(const utils::UUID& id) {
|
||||
if (!_request_to_resume) {
|
||||
_request_to_resume = id;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class migration_plan {
|
||||
public:
|
||||
using migrations_vector = utils::chunked_vector<tablet_migration_info>;
|
||||
@@ -166,19 +140,17 @@ private:
|
||||
migrations_vector _migrations;
|
||||
table_resize_plan _resize_plan;
|
||||
tablet_repair_plan _repair_plan;
|
||||
tablet_rack_list_colocation_plan _rack_list_colocation_plan;
|
||||
bool _has_nodes_to_drain = false;
|
||||
public:
|
||||
/// Returns true iff there are decommissioning nodes which own some tablet replicas.
|
||||
bool has_nodes_to_drain() const { return _has_nodes_to_drain; }
|
||||
|
||||
const migrations_vector& migrations() const { return _migrations; }
|
||||
bool empty() const { return _migrations.empty() && !_resize_plan.size() && !_repair_plan.size() && !_rack_list_colocation_plan.size(); }
|
||||
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size() + _rack_list_colocation_plan.size(); }
|
||||
bool empty() const { return _migrations.empty() && !_resize_plan.size() && !_repair_plan.size();}
|
||||
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size(); }
|
||||
size_t tablet_migration_count() const { return _migrations.size(); }
|
||||
size_t resize_decision_count() const { return _resize_plan.size(); }
|
||||
size_t tablet_repair_count() const { return _repair_plan.size(); }
|
||||
size_t tablet_rack_list_colocation_count() const { return _rack_list_colocation_plan.size(); }
|
||||
|
||||
void add(tablet_migration_info info) {
|
||||
_migrations.emplace_back(std::move(info));
|
||||
@@ -195,7 +167,6 @@ public:
|
||||
_has_nodes_to_drain |= other._has_nodes_to_drain;
|
||||
_resize_plan.merge(std::move(other._resize_plan));
|
||||
_repair_plan.merge(std::move(other._repair_plan));
|
||||
_rack_list_colocation_plan.merge(std::move(other._rack_list_colocation_plan));
|
||||
}
|
||||
|
||||
void set_has_nodes_to_drain(bool b) {
|
||||
@@ -214,12 +185,6 @@ public:
|
||||
_repair_plan = std::move(repair);
|
||||
}
|
||||
|
||||
const tablet_rack_list_colocation_plan& rack_list_colocation_plan() const { return _rack_list_colocation_plan; }
|
||||
|
||||
void set_rack_list_colocation_plan(tablet_rack_list_colocation_plan rack_list_colocation_plan) {
|
||||
_rack_list_colocation_plan = std::move(rack_list_colocation_plan);
|
||||
}
|
||||
|
||||
future<std::unordered_set<locator::global_tablet_id>> get_migration_tablet_ids() const;
|
||||
};
|
||||
|
||||
@@ -265,7 +230,7 @@ public:
|
||||
///
|
||||
/// The algorithm takes care of limiting the streaming load on the system, also by taking active migrations into account.
|
||||
///
|
||||
future<migration_plan> balance_tablets(locator::token_metadata_ptr, service::topology*, db::system_keyspace*, locator::load_stats_ptr = {}, std::unordered_set<locator::host_id> = {});
|
||||
future<migration_plan> balance_tablets(locator::token_metadata_ptr, locator::load_stats_ptr = {}, std::unordered_set<locator::host_id> = {});
|
||||
|
||||
void set_load_stats(locator::load_stats_ptr);
|
||||
|
||||
@@ -281,12 +246,6 @@ public:
|
||||
void on_leadership_lost();
|
||||
};
|
||||
|
||||
future<bool> requires_rack_list_colocation(
|
||||
replica::database& db,
|
||||
locator::token_metadata_ptr tmptr,
|
||||
db::system_keyspace* sys_ks,
|
||||
utils::UUID request_id);
|
||||
|
||||
}
|
||||
|
||||
template <>
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/task_manager_module.hh"
|
||||
#include "service/topology_state_machine.hh"
|
||||
#include "tasks/task_handler.hh"
|
||||
#include "tasks/virtual_task_hint.hh"
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
@@ -289,116 +288,4 @@ std::set<locator::host_id> task_manager_module::get_nodes() const {
|
||||
return get_task_manager().get_nodes(_ss);
|
||||
}
|
||||
|
||||
namespace topo {
|
||||
|
||||
static tasks::task_manager::task_state get_state(const db::system_keyspace::topology_requests_entry& entry) {
|
||||
if (!entry.id) {
|
||||
return tasks::task_manager::task_state::created;
|
||||
} else if (!entry.done) {
|
||||
return tasks::task_manager::task_state::running;
|
||||
} else if (entry.error == "") {
|
||||
return tasks::task_manager::task_state::done;
|
||||
} else {
|
||||
return tasks::task_manager::task_state::failed;
|
||||
}
|
||||
}
|
||||
|
||||
tasks::task_manager::task_group global_topology_request_virtual_task::get_group() const noexcept {
|
||||
return tasks::task_manager::task_group::global_topology_change_group;
|
||||
}
|
||||
|
||||
future<std::optional<tasks::virtual_task_hint>> global_topology_request_virtual_task::contains(tasks::task_id task_id) const {
|
||||
if (!task_id.uuid().is_timestamp()) {
|
||||
// Task id of node ops operation is always a timestamp.
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
auto hint = std::make_optional<tasks::virtual_task_hint>({});
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(task_id.uuid());
|
||||
if (entry.has_value() && std::holds_alternative<service::global_topology_request>(entry->request_type) &&
|
||||
std::get<service::global_topology_request>(entry->request_type) == global_topology_request::keyspace_rf_change) {
|
||||
co_return hint;
|
||||
}
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
future<tasks::is_abortable> global_topology_request_virtual_task::is_abortable(tasks::virtual_task_hint) const {
|
||||
return make_ready_future<tasks::is_abortable>(tasks::is_abortable::yes);
|
||||
}
|
||||
|
||||
static tasks::task_stats get_task_stats(const db::system_keyspace::topology_requests_entry& entry) {
|
||||
return tasks::task_stats{
|
||||
.task_id = tasks::task_id{entry.id},
|
||||
.type = fmt::to_string(entry.request_type),
|
||||
.kind = tasks::task_kind::cluster,
|
||||
.scope = "keyspace",
|
||||
.state = get_state(entry),
|
||||
.sequence_number = 0,
|
||||
.keyspace = entry.new_keyspace_rf_change_ks_name.value_or(""),
|
||||
.table = "",
|
||||
.entity = "",
|
||||
.shard = 0,
|
||||
.start_time = entry.start_time,
|
||||
.end_time = entry.end_time,
|
||||
};
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> global_topology_request_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(id.uuid());
|
||||
if (!entry.has_value()) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
auto task_stats = get_task_stats(*entry);
|
||||
co_return tasks::task_status{
|
||||
.task_id = task_stats.task_id,
|
||||
.type = task_stats.type,
|
||||
.kind = task_stats.kind,
|
||||
.scope = task_stats.scope,
|
||||
.state = task_stats.state,
|
||||
.is_abortable = co_await is_abortable(std::move(hint)),
|
||||
.start_time = task_stats.start_time,
|
||||
.end_time = task_stats.end_time,
|
||||
.error = entry->error,
|
||||
.parent_id = tasks::task_id::create_null_id(),
|
||||
.sequence_number = task_stats.sequence_number,
|
||||
.shard = task_stats.shard,
|
||||
.keyspace = task_stats.keyspace,
|
||||
.table = task_stats.table,
|
||||
.entity = task_stats.entity,
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = utils::chunked_vector<tasks::task_identity>{},
|
||||
};
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> global_topology_request_virtual_task::wait(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry = co_await get_status(id, hint);
|
||||
if (!entry) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
co_await _ss.wait_for_topology_request_completion(id.uuid(), false);
|
||||
co_return co_await get_status(id, std::move(hint));
|
||||
}
|
||||
|
||||
future<> global_topology_request_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hint) noexcept {
|
||||
return _ss.abort_paused_rf_change(id.uuid());
|
||||
}
|
||||
|
||||
future<std::vector<tasks::task_stats>> global_topology_request_virtual_task::get_stats() {
|
||||
db::system_keyspace& sys_ks = _ss._sys_ks.local();
|
||||
co_return std::ranges::to<std::vector<tasks::task_stats>>(co_await sys_ks.get_topology_request_entries({global_topology_request::keyspace_rf_change}, db_clock::now() - get_task_manager().get_user_task_ttl())
|
||||
| std::views::transform([] (const auto& e) {
|
||||
auto& entry = e.second;
|
||||
return get_task_stats(entry);
|
||||
}));
|
||||
}
|
||||
|
||||
task_manager_module::task_manager_module(tasks::task_manager& tm) noexcept
|
||||
: tasks::task_manager::module(tm, "global_topology_requests")
|
||||
{}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -54,33 +54,4 @@ public:
|
||||
|
||||
std::set<locator::host_id> get_nodes() const override;
|
||||
};
|
||||
|
||||
namespace topo {
|
||||
|
||||
class global_topology_request_virtual_task : public tasks::task_manager::virtual_task::impl {
|
||||
private:
|
||||
service::storage_service& _ss;
|
||||
public:
|
||||
global_topology_request_virtual_task(tasks::task_manager::module_ptr module,
|
||||
service::storage_service& ss)
|
||||
: tasks::task_manager::virtual_task::impl(std::move(module))
|
||||
, _ss(ss)
|
||||
{}
|
||||
virtual tasks::task_manager::task_group get_group() const noexcept override;
|
||||
virtual future<std::optional<tasks::virtual_task_hint>> contains(tasks::task_id task_id) const override;
|
||||
virtual future<tasks::is_abortable> is_abortable(tasks::virtual_task_hint hint) const override;
|
||||
|
||||
virtual future<std::optional<tasks::task_status>> get_status(tasks::task_id id, tasks::virtual_task_hint hint) override;
|
||||
virtual future<std::optional<tasks::task_status>> wait(tasks::task_id id, tasks::virtual_task_hint hint) override;
|
||||
virtual future<> abort(tasks::task_id id, tasks::virtual_task_hint hint) noexcept override;
|
||||
virtual future<std::vector<tasks::task_stats>> get_stats() override;
|
||||
};
|
||||
|
||||
class task_manager_module : public tasks::task_manager::module {
|
||||
public:
|
||||
task_manager_module(tasks::task_manager& tm) noexcept;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <fmt/ranges.h>
|
||||
|
||||
@@ -55,7 +54,6 @@
|
||||
#include "service/topology_state_machine.hh"
|
||||
#include "db/view/view_building_coordinator.hh"
|
||||
#include "topology_mutation.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/stall_free.hh"
|
||||
@@ -955,7 +953,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
} else {
|
||||
assert(_feature_service.topology_global_request_queue);
|
||||
req_id = _topo_sm._topology.global_requests_queue[0];
|
||||
req_entry = co_await _sys_ks.get_topology_request_entry(req_id);
|
||||
req_entry = co_await _sys_ks.get_topology_request_entry(req_id, true);
|
||||
req = std::get<global_topology_request>(req_entry.request_type);
|
||||
}
|
||||
switch (req) {
|
||||
@@ -999,7 +997,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
sstring error;
|
||||
bool needs_colocation = false;
|
||||
if (_db.has_keyspace(ks_name)) {
|
||||
try {
|
||||
auto& ks = _db.find_keyspace(ks_name);
|
||||
@@ -1007,40 +1004,12 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{saved_ks_props.begin(), saved_ks_props.end()}};
|
||||
new_ks_props.validate();
|
||||
auto ks_md = new_ks_props.as_ks_metadata_update(ks.metadata(), *tmptr, _db.features(), _db.get_config());
|
||||
_db.validate_keyspace_update(*ks_md);
|
||||
size_t unimportant_init_tablet_count = 2; // must be a power of 2
|
||||
locator::tablet_map new_tablet_map{unimportant_init_tablet_count};
|
||||
|
||||
auto schedule_migrations = [&] () -> future<> {
|
||||
auto tables_with_mvs = ks.metadata()->tables();
|
||||
auto views = ks.metadata()->views();
|
||||
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
|
||||
if (tables_with_mvs.empty()) {
|
||||
co_return;
|
||||
}
|
||||
auto table = tables_with_mvs.front();
|
||||
auto tablet_count = tmptr->tablets().get_tablet_map(table->id()).tablet_count();
|
||||
locator::replication_strategy_params params{ks_md->strategy_options(), tablet_count, ks.metadata()->consistency_option()};
|
||||
auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params, tmptr->get_topology());
|
||||
|
||||
auto check_needs_colocation = [&] () -> future<bool> {
|
||||
const auto& new_replication_strategy_config = new_strategy->get_config_options();
|
||||
const auto& old_replication_strategy_config = ks.metadata()->strategy_options();
|
||||
bool rack_list_conversion = false;
|
||||
for (const auto& [dc, rf_value] : new_replication_strategy_config) {
|
||||
if (std::holds_alternative<locator::rack_list>(rf_value)) {
|
||||
auto it = old_replication_strategy_config.find(dc);
|
||||
if (it != old_replication_strategy_config.end() && std::holds_alternative<sstring>(it->second)) {
|
||||
rack_list_conversion = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
co_return rack_list_conversion ? co_await requires_rack_list_colocation(_db, tmptr, &_sys_ks, req_id) : false;
|
||||
};
|
||||
if (needs_colocation = co_await check_needs_colocation(); needs_colocation) {
|
||||
co_return;
|
||||
}
|
||||
for (const auto& table_or_mv : tables_with_mvs) {
|
||||
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
|
||||
// Apply the transition only on base tables.
|
||||
@@ -1049,6 +1018,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
continue;
|
||||
}
|
||||
auto old_tablets = co_await tmptr->tablets().get_tablet_map(table_or_mv->id()).clone_gently();
|
||||
locator::replication_strategy_params params{ks_md->strategy_options(), old_tablets.tablet_count(), ks.metadata()->consistency_option()};
|
||||
auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params, tmptr->get_topology());
|
||||
new_tablet_map = co_await new_strategy->maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, co_await old_tablets.clone_gently());
|
||||
|
||||
replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
|
||||
@@ -1075,8 +1046,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
co_await coroutine::maybe_yield();
|
||||
});
|
||||
}
|
||||
};
|
||||
co_await schedule_migrations();
|
||||
|
||||
auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
|
||||
for (auto& m: schema_muts) {
|
||||
@@ -1092,22 +1061,16 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
error = "Can't ALTER keyspace " + ks_name + ", keyspace doesn't exist";
|
||||
}
|
||||
|
||||
bool pause_request = needs_colocation && error.empty();
|
||||
topology_mutation_builder tbuilder(guard.write_timestamp());
|
||||
tbuilder.set_transition_state(topology::transition_state::tablet_migration)
|
||||
updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
|
||||
.set_transition_state(topology::transition_state::tablet_migration)
|
||||
.set_version(_topo_sm._topology.version + 1)
|
||||
.del_global_topology_request()
|
||||
.del_global_topology_request_id()
|
||||
.drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id);
|
||||
if (pause_request) {
|
||||
rtlogger.info("keyspace_rf_change for keyspace {} postponed for colocation", ks_name);
|
||||
tbuilder.pause_rf_change_request(req_id);
|
||||
} else {
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
|
||||
.drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id)
|
||||
.build()));
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
|
||||
.done(error)
|
||||
.build()));
|
||||
}
|
||||
updates.push_back(canonical_mutation(tbuilder.build()));
|
||||
|
||||
sstring reason = seastar::format("ALTER tablets KEYSPACE called with options: {}", saved_ks_props);
|
||||
rtlogger.trace("do update {} reason {}", updates, reason);
|
||||
@@ -1371,14 +1334,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
.build());
|
||||
}
|
||||
|
||||
void generate_rf_change_resume_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, utils::UUID request_to_resume) {
|
||||
rtlogger.debug("Generating RF change resume for request id {}", request_to_resume);
|
||||
out.emplace_back(topology_mutation_builder(guard.write_timestamp())
|
||||
.queue_global_topology_request_id(request_to_resume)
|
||||
.resume_rf_change_request(_topo_sm._topology.paused_rf_change_requests, request_to_resume)
|
||||
.build());
|
||||
}
|
||||
|
||||
future<> generate_migration_updates(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const migration_plan& plan) {
|
||||
if (plan.resize_plan().finalize_resize.empty() || plan.has_nodes_to_drain()) {
|
||||
// schedule tablet migration only if there are no pending resize finalisations or if the node is draining.
|
||||
@@ -1386,10 +1341,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
co_await coroutine::maybe_yield();
|
||||
generate_migration_update(out, guard, mig);
|
||||
}
|
||||
|
||||
if (auto request_to_resume = plan.rack_list_colocation_plan().request_to_resume(); request_to_resume) {
|
||||
generate_rf_change_resume_update(out, guard, request_to_resume);
|
||||
}
|
||||
}
|
||||
|
||||
auto sched_time = db_clock::now();
|
||||
@@ -1880,7 +1831,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
bool has_nodes_to_drain = false;
|
||||
if (!preempt) {
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), {}, get_dead_nodes());
|
||||
has_nodes_to_drain = plan.has_nodes_to_drain();
|
||||
if (!drain || plan.has_nodes_to_drain()) {
|
||||
co_await generate_migration_updates(updates, guard, plan);
|
||||
@@ -2003,7 +1954,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
co_await utils::get_local_injector().inject("tablet_resize_finalization_post_barrier", utils::wait_for_message(std::chrono::minutes(2)));
|
||||
|
||||
auto tm = get_token_metadata_ptr();
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, {}, get_dead_nodes());
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
updates.reserve(plan.resize_plan().finalize_resize.size() * 2 + 1);
|
||||
@@ -2083,7 +2034,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
// We should perform TRUNCATE only if the session is still valid. It could be cleared if a previous truncate
|
||||
// handler performed the truncate and cleared the session, but crashed before finalizing the request
|
||||
if (_topo_sm._topology.session) {
|
||||
const auto topology_requests_entry = co_await _sys_ks.get_topology_request_entry(global_request_id);
|
||||
const auto topology_requests_entry = co_await _sys_ks.get_topology_request_entry(global_request_id, true);
|
||||
const table_id& table_id = topology_requests_entry.truncate_table_id;
|
||||
lw_shared_ptr<replica::table> table = _db.get_tables_metadata().get_table_if_exists(table_id);
|
||||
|
||||
@@ -2623,10 +2574,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
co_await _voter_handler.on_node_removed(replaced_node_id, _as);
|
||||
}
|
||||
}
|
||||
utils::get_local_injector().inject("crash_coordinator_before_stream", [] {
|
||||
rtlogger.info("crash_coordinator_before_stream: aborting");
|
||||
abort();
|
||||
});
|
||||
utils::get_local_injector().inject("crash_coordinator_before_stream", [] { abort(); });
|
||||
raft_topology_cmd cmd{raft_topology_cmd::command::stream_ranges};
|
||||
auto state = node.rs->state;
|
||||
try {
|
||||
@@ -2675,7 +2623,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
while (utils::get_local_injector().enter("topology_coordinator_pause_after_streaming")) {
|
||||
co_await sleep_abortable(std::chrono::milliseconds(10), _as);
|
||||
}
|
||||
const bool removenode_with_left_token_ring = _feature_service.removenode_with_left_token_ring;
|
||||
auto node = get_node_to_work_on(std::move(guard));
|
||||
bool barrier_failed = false;
|
||||
// In this state writes goes to old and new replicas but reads start to be done from new replicas
|
||||
@@ -2730,9 +2677,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
break;
|
||||
case node_state::removing: {
|
||||
co_await utils::get_local_injector().inject("delay_node_removal", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (!removenode_with_left_token_ring) {
|
||||
node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);
|
||||
}
|
||||
node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);
|
||||
}
|
||||
[[fallthrough]];
|
||||
case node_state::decommissioning: {
|
||||
@@ -2740,10 +2685,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
node_state next_state;
|
||||
utils::chunked_vector<canonical_mutation> muts;
|
||||
muts.reserve(2);
|
||||
if (removenode_with_left_token_ring || node.rs->state == node_state::decommissioning) {
|
||||
// Both decommission and removenode go through left_token_ring state
|
||||
// to ensure a global barrier is executed before the request is marked as done.
|
||||
// This ensures all nodes have observed the topology change.
|
||||
if (node.rs->state == node_state::decommissioning) {
|
||||
next_state = node.rs->state;
|
||||
builder.set_transition_state(topology::transition_state::left_token_ring);
|
||||
} else {
|
||||
@@ -2818,16 +2760,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
case topology::transition_state::left_token_ring: {
|
||||
auto node = get_node_to_work_on(std::move(guard));
|
||||
|
||||
// Need to be captured as the node variable might become invalid (e.g. moved out) at particular points.
|
||||
const auto node_rs_state = node.rs->state;
|
||||
|
||||
const bool is_removenode = node_rs_state == node_state::removing;
|
||||
|
||||
if (is_removenode && !_feature_service.removenode_with_left_token_ring) {
|
||||
on_internal_error(
|
||||
rtlogger, "removenode operation can only enter the left_token_ring state when REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled");
|
||||
}
|
||||
|
||||
auto finish_left_token_ring_transition = [&](node_to_work_on& node) -> future<> {
|
||||
// Remove the node from group0 here - in general, it won't be able to leave on its own
|
||||
// because we'll ban it as soon as we tell it to shut down.
|
||||
@@ -2847,16 +2779,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
muts.push_back(builder.build());
|
||||
co_await remove_view_build_statuses_on_left_node(muts, node.guard, node.id);
|
||||
co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(node.id.uuid()), muts);
|
||||
auto str = std::invoke([&]() {
|
||||
switch (node_rs_state) {
|
||||
case node_state::decommissioning:
|
||||
return ::format("finished decommissioning node {}", node.id);
|
||||
case node_state::removing:
|
||||
return ::format("finished removing node {}", node.id);
|
||||
default:
|
||||
return ::format("finished rollback of {} after {} failure", node.id, node.rs->state);
|
||||
}
|
||||
});
|
||||
auto str = node.rs->state == node_state::decommissioning
|
||||
? ::format("finished decommissioning node {}", node.id)
|
||||
: ::format("finished rollback of {} after {} failure", node.id, node.rs->state);
|
||||
co_await update_topology_state(take_guard(std::move(node)), std::move(muts), std::move(str));
|
||||
};
|
||||
|
||||
@@ -2869,11 +2794,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
}
|
||||
|
||||
if (node.id == _raft.id()) {
|
||||
// Removed node must be dead, so it shouldn't enter here (it can't coordinate its own removal).
|
||||
if (is_removenode) {
|
||||
on_internal_error(rtlogger, "removenode operation cannot be coordinated by the removed node itself");
|
||||
}
|
||||
|
||||
// Someone else needs to coordinate the rest of the decommission process,
|
||||
// because the decommissioning node is going to shut down in the middle of this state.
|
||||
rtlogger.info("coordinator is decommissioning; giving up leadership");
|
||||
@@ -2887,13 +2807,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
bool barrier_failed = false;
|
||||
// Wait until other nodes observe the new token ring and stop sending writes to this node.
|
||||
auto excluded_nodes = get_excluded_nodes_for_topology_request(node);
|
||||
try {
|
||||
// Removed node is added to ignored nodes, so it should be automatically excluded.
|
||||
if (is_removenode && !excluded_nodes.contains(node.id)) {
|
||||
on_internal_error(rtlogger, "removenode operation must have the removed node in excluded_nodes");
|
||||
}
|
||||
node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), std::move(excluded_nodes)), node.id);
|
||||
node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), get_excluded_nodes_for_topology_request(node)), node.id);
|
||||
} catch (term_changed_error&) {
|
||||
throw;
|
||||
} catch (group0_concurrent_modification&) {
|
||||
@@ -2910,17 +2825,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
}
|
||||
|
||||
if (barrier_failed) {
|
||||
// If barrier above failed it means there may be unfinished writes to a decommissioned node,
|
||||
// or some nodes might not have observed the new topology yet (one purpose of the barrier
|
||||
// is to make sure all nodes observed the new topology before completing the request).
|
||||
// If barrier above failed it means there may be unfinished writes to a decommissioned node.
|
||||
// Let's wait for the ring delay for those writes to complete and new topology to propagate
|
||||
// before continuing.
|
||||
co_await sleep_abortable(_ring_delay, _as);
|
||||
node = retake_node(co_await start_operation(), node.id);
|
||||
}
|
||||
|
||||
// Make decommissioning/removed node a non voter before reporting operation completion below.
|
||||
// Otherwise the node may see the completion and exit before it is removed from
|
||||
// Make decommissioning node a non voter before reporting operation completion below.
|
||||
// Otherwise the decommissioned node may see the completion and exit before it is removed from
|
||||
// the config at which point the removal from the config will hang if the cluster had only two
|
||||
// nodes before the decommission.
|
||||
co_await _voter_handler.on_node_removed(node.id, _as);
|
||||
@@ -2931,7 +2844,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
co_await update_topology_state(take_guard(std::move(node)), {rtbuilder.build()}, "report request completion in left_token_ring state");
|
||||
|
||||
// For decommission/rollback: Tell the node to shut down.
|
||||
// Tell the node to shut down.
|
||||
// This is done to improve user experience when there are no failures.
|
||||
// In the next state (`node_state::left`), the node will be banned by the rest of the cluster,
|
||||
// so there's no guarantee that it would learn about entering that state even if it was still
|
||||
@@ -2940,19 +2853,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
// There is the possibility that the node will never get the message
|
||||
// and decommission will hang on that node.
|
||||
// This is fine for the rest of the cluster - we will still remove, ban the node and continue.
|
||||
//
|
||||
// For removenode: The node is already dead, no need to send shutdown command.
|
||||
auto node_id = node.id;
|
||||
bool shutdown_failed = false;
|
||||
if (!is_removenode) {
|
||||
try {
|
||||
node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::barrier);
|
||||
} catch (...) {
|
||||
rtlogger.warn("failed to tell node {} to shut down - it may hang."
|
||||
" It's safe to shut it down manually now. (Exception: {})",
|
||||
node.id, std::current_exception());
|
||||
shutdown_failed = true;
|
||||
}
|
||||
try {
|
||||
node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::barrier);
|
||||
} catch (...) {
|
||||
rtlogger.warn("failed to tell node {} to shut down - it may hang."
|
||||
" It's safe to shut it down manually now. (Exception: {})",
|
||||
node.id, std::current_exception());
|
||||
shutdown_failed = true;
|
||||
}
|
||||
if (shutdown_failed) {
|
||||
node = retake_node(co_await start_operation(), node_id);
|
||||
@@ -3549,7 +3458,7 @@ future<bool> topology_coordinator::maybe_start_tablet_migration(group0_guard gua
|
||||
}
|
||||
|
||||
auto tm = get_token_metadata_ptr();
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, {}, get_dead_nodes());
|
||||
if (plan.empty()) {
|
||||
rtlogger.debug("Tablet load balancer did not make any plan");
|
||||
co_return false;
|
||||
|
||||
@@ -256,20 +256,6 @@ topology_mutation_builder& topology_mutation_builder::drop_first_global_topology
|
||||
}
|
||||
}
|
||||
|
||||
// Records `id` in the "paused_rf_change_requests" set column.
// The single-element set is applied with collection_apply_mode::update
// (presumably merged into the stored set rather than replacing it - confirm
// against apply_set). Returns the result of apply_set so calls can be chained.
topology_mutation_builder& topology_mutation_builder::pause_rf_change_request(const utils::UUID& id) {
|
||||
return apply_set("paused_rf_change_requests", collection_apply_mode::update, std::vector<data_value>{id});
|
||||
}
|
||||
|
||||
// Removes `id` from the "paused_rf_change_requests" set column.
//
// `values` is the caller's view of the currently paused request ids. If it
// contains `id`, the column is overwritten with a copy of `values` that has
// `id` erased. If `id` is not in `values`, no mutation is added and the
// builder is returned unchanged.
topology_mutation_builder& topology_mutation_builder::resume_rf_change_request(const std::unordered_set<utils::UUID>& values, const utils::UUID& id) {
|
||||
if (values.contains(id)) {
|
||||
// Work on a copy: `values` is const and belongs to the caller.
auto new_values = values;
|
||||
new_values.erase(id);
|
||||
// Overwrite mode: the whole set is written, minus the resumed id.
return apply_set("paused_rf_change_requests", collection_apply_mode::overwrite, new_values | std::views::transform([] (const auto& id) { return data_value{id}; }));
|
||||
} else {
|
||||
return *this;
|
||||
}
|
||||
}
|
||||
|
||||
// Sets the "upgrade_state" column to the formatted ("{}") text form of `value`
// via apply_atomic, and returns the result of that call for chaining.
topology_mutation_builder& topology_mutation_builder::set_upgrade_state(topology::upgrade_state_type value) {
|
||||
return apply_atomic("upgrade_state", ::format("{}", value));
|
||||
}
|
||||
|
||||
@@ -129,8 +129,6 @@ public:
|
||||
topology_mutation_builder& del_global_topology_request_id();
|
||||
topology_mutation_builder& queue_global_topology_request_id(const utils::UUID& value);
|
||||
topology_mutation_builder& drop_first_global_topology_request_id(const std::vector<utils::UUID>&, const utils::UUID&);
|
||||
topology_mutation_builder& pause_rf_change_request(const utils::UUID&);
|
||||
topology_mutation_builder& resume_rf_change_request(const std::unordered_set<utils::UUID>&, const utils::UUID&);
|
||||
topology_node_mutation_builder& with_node(raft::server_id);
|
||||
canonical_mutation build() { return canonical_mutation{std::move(_m)}; }
|
||||
};
|
||||
|
||||
@@ -180,10 +180,6 @@ struct topology {
|
||||
// The KS options to be used when executing the scheduled ALTER KS statement
|
||||
std::optional<std::unordered_map<sstring, sstring>> new_keyspace_rf_change_data;
|
||||
|
||||
// The ids of RF change requests that are paused because they require tablet co-location.
|
||||
// It may happen during altering from numerical RF to rack list.
|
||||
std::unordered_set<utils::UUID> paused_rf_change_requests;
|
||||
|
||||
// The IDs of the committed yet unpublished CDC generations sorted by timestamps.
|
||||
std::vector<cdc::generation_id_v2> unpublished_cdc_generations;
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ enum class component_type {
|
||||
TemporaryTOC,
|
||||
TemporaryStatistics,
|
||||
Scylla,
|
||||
TemporaryScylla,
|
||||
Rows,
|
||||
Partitions,
|
||||
TemporaryHashes,
|
||||
@@ -76,6 +77,8 @@ struct fmt::formatter<sstables::component_type> : fmt::formatter<string_view> {
|
||||
return formatter<string_view>::format("TemporaryStatistics", ctx);
|
||||
case Scylla:
|
||||
return formatter<string_view>::format("Scylla", ctx);
|
||||
case TemporaryScylla:
|
||||
return formatter<string_view>::format("TemporaryScylla", ctx);
|
||||
case Partitions:
|
||||
return formatter<string_view>::format("Partitions", ctx);
|
||||
case Rows:
|
||||
|
||||
@@ -632,6 +632,10 @@ private:
|
||||
std::unique_ptr<file_writer> close_writer(std::unique_ptr<file_writer>& w);
|
||||
|
||||
void close_data_writer();
|
||||
void close_index_writer();
|
||||
void close_rows_writer();
|
||||
void close_partitions_writer();
|
||||
|
||||
void ensure_tombstone_is_written() {
|
||||
if (!_tombstone_written) {
|
||||
consume(tombstone());
|
||||
@@ -944,17 +948,16 @@ void writer::init_file_writers() {
|
||||
_sst._schema->get_compressor_params(),
|
||||
std::move(compressor)), _sst.get_filename());
|
||||
}
|
||||
|
||||
if (_sst.has_component(component_type::Index)) {
|
||||
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Index).get();
|
||||
_index_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), _sst.index_filename());
|
||||
_index_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, _sst.index_filename());
|
||||
}
|
||||
if (_sst.has_component(component_type::Partitions) && _sst.has_component(component_type::Rows)) {
|
||||
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Rows).get();
|
||||
_rows_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), component_name(_sst, component_type::Rows));
|
||||
_rows_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, component_name(_sst, component_type::Rows));
|
||||
_bti_row_index_writer = trie::bti_row_index_writer(*_rows_writer);
|
||||
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Partitions).get();
|
||||
_partitions_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), component_name(_sst, component_type::Partitions));
|
||||
_partitions_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, component_name(_sst, component_type::Partitions));
|
||||
_bti_partition_index_writer = trie::bti_partition_index_writer(*_partitions_writer);
|
||||
}
|
||||
if (_delayed_filter) {
|
||||
@@ -982,6 +985,41 @@ void writer::close_data_writer() {
|
||||
}
|
||||
}
|
||||
|
||||
// If an index writer exists, passes it to close_writer() and stores the
// checksum reported by the returned writer as the sstable's index digest.
// Does nothing when there is no index writer.
void writer::close_index_writer() {
|
||||
if (_index_writer) {
|
||||
auto writer = close_writer(_index_writer);
|
||||
// close_writer() returns a file_writer pointer; the downcast relies on
// _index_writer having been created as a crc32_digest_file_writer.
auto chksum_wr = static_cast<crc32_digest_file_writer*>(writer.get());
|
||||
_sst.get_components_digests().index_digest = chksum_wr->full_checksum();
|
||||
}
|
||||
}
|
||||
|
||||
// If a partitions writer exists: finishes the BTI partition index writer
// (storing its result as the sstable's partitions-db footer), passes the
// partitions writer to close_writer(), and stores the checksum reported by the
// returned writer as the partitions digest.
// Does nothing when there is no partitions writer.
void writer::close_partitions_writer() {
|
||||
if (_partitions_writer) {
|
||||
// finish() is called before the underlying file writer is closed.
// Note: _first_key/_last_key are accessed with value(), which throws
// if either optional is empty.
_sst._partitions_db_footer = std::move(*_bti_partition_index_writer).finish(
|
||||
_sst.get_version(),
|
||||
_first_key.value(),
|
||||
_last_key.value());
|
||||
auto writer = close_writer(_partitions_writer);
|
||||
// The downcast relies on _partitions_writer having been created as a
// crc32_digest_file_writer.
auto chksum_wr = static_cast<crc32_digest_file_writer*>(writer.get());
|
||||
_sst.get_components_digests().partitions_digest = chksum_wr->full_checksum();
|
||||
}
|
||||
}
|
||||
|
||||
// If a rows writer exists: appends a 4-byte padding word so the file is never
// empty, passes the writer to close_writer(), and stores the checksum reported
// by the returned writer as the rows digest. The padding is written before the
// checksum is read.
// Does nothing when there is no rows writer.
void writer::close_rows_writer() {
|
||||
if (_rows_writer) {
|
||||
// Append some garbage padding to the file just to ensure that it's never empty.
|
||||
// (Otherwise it would be empty if the sstable contains only small partitions).
|
||||
// This is a hack to work around some bad interactions between zero-sized files
|
||||
// and object storage. (It seems that e.g. minio considers a zero-sized file
|
||||
// upload to be a no-op, which breaks some assumptions).
|
||||
// The constant is converted to big-endian so the bytes on disk do not
// depend on host endianness.
uint32_t garbage = seastar::cpu_to_be(0x13371337);
|
||||
_rows_writer->write(reinterpret_cast<const char*>(&garbage), sizeof(garbage));
|
||||
auto writer = close_writer(_rows_writer);
|
||||
// The downcast relies on _rows_writer having been created as a
// crc32_digest_file_writer.
auto chksum_wr = static_cast<crc32_digest_file_writer*>(writer.get());
|
||||
_sst.get_components_digests().rows_digest = chksum_wr->full_checksum();
|
||||
}
|
||||
}
|
||||
|
||||
void writer::consume_new_partition(const dht::decorated_key& dk) {
|
||||
_c_stats.start_offset = _data_writer->offset();
|
||||
_prev_row_start = _data_writer->offset();
|
||||
@@ -1630,27 +1668,10 @@ void writer::consume_end_of_stream() {
|
||||
_collector.add_compression_ratio(_sst._components->compression.compressed_file_length(), _sst._components->compression.uncompressed_file_length());
|
||||
}
|
||||
|
||||
if (_index_writer) {
|
||||
close_writer(_index_writer);
|
||||
}
|
||||
close_index_writer();
|
||||
|
||||
if (_partitions_writer) {
|
||||
_sst._partitions_db_footer = std::move(*_bti_partition_index_writer).finish(
|
||||
_sst.get_version(),
|
||||
_first_key.value(),
|
||||
_last_key.value());
|
||||
close_writer(_partitions_writer);
|
||||
}
|
||||
if (_rows_writer) {
|
||||
// Append some garbage padding to the file just to ensure that it's never empty.
|
||||
// (Otherwise it would be empty if the sstable contains only small partitions).
|
||||
// This is a hack to work around some bad interactions between zero-sized files
|
||||
// and object storage. (It seems that e.g. minio considers a zero-sized file
|
||||
// upload to be a no-op, which breaks some assumptions).
|
||||
uint32_t garbage = seastar::cpu_to_be(0x13371337);
|
||||
_rows_writer->write(reinterpret_cast<const char*>(&garbage), sizeof(garbage));
|
||||
close_writer(_rows_writer);
|
||||
}
|
||||
close_partitions_writer();
|
||||
close_rows_writer();
|
||||
|
||||
if (_hashes_writer) {
|
||||
close_writer(_hashes_writer);
|
||||
@@ -1696,9 +1717,7 @@ void writer::consume_end_of_stream() {
|
||||
.map = _collector.get_ext_timestamp_stats()
|
||||
});
|
||||
_sst.write_scylla_metadata(_shard, std::move(identifier), std::move(ld_stats), std::move(ts_stats));
|
||||
if (!_cfg.leave_unsealed) {
|
||||
_sst.seal_sstable(_cfg.backup).get();
|
||||
}
|
||||
_sst.seal_sstable(_cfg.backup).get();
|
||||
}
|
||||
|
||||
uint64_t writer::data_file_position_for_tests() const {
|
||||
|
||||
@@ -83,8 +83,6 @@ struct sstable_open_config {
|
||||
bool current_shard_as_sstable_owner = false;
|
||||
// Do not move the sharding metadata to the sharder, keeping it in the scylla metadata.
|
||||
bool keep_sharding_metadata = false;
|
||||
// Allows unsealed sstable to be loaded, since it must read components from temporary TOC instead.
|
||||
bool unsealed_sstable = false;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -44,6 +44,7 @@ sstable_version_constants::component_map_t sstable_version_constants::create_com
|
||||
{ component_type::Filter, "Filter.db" },
|
||||
{ component_type::Statistics, "Statistics.db" },
|
||||
{ component_type::Scylla, "Scylla.db" },
|
||||
{ component_type::TemporaryScylla, "Scylla.db.tmp" },
|
||||
{ component_type::TemporaryTOC, TEMPORARY_TOC_SUFFIX },
|
||||
{ component_type::TemporaryStatistics, "Statistics.db.tmp" }
|
||||
};
|
||||
|
||||
@@ -836,14 +836,13 @@ future<std::vector<sstring>> sstable::read_and_parse_toc(file f) {
|
||||
|
||||
// This is small enough, and well-defined. Easier to just read it all
|
||||
// at once
|
||||
future<> sstable::read_toc(sstable_open_config cfg) noexcept {
|
||||
future<> sstable::read_toc() noexcept {
|
||||
if (_recognized_components.size()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
try {
|
||||
auto toc_type = cfg.unsealed_sstable ? component_type::TemporaryTOC : component_type::TOC;
|
||||
co_await do_read_simple(toc_type, [&] (version_types v, file f) -> future<> {
|
||||
co_await do_read_simple(component_type::TOC, [&] (version_types v, file f) -> future<> {
|
||||
auto comps = co_await read_and_parse_toc(f);
|
||||
for (auto& c: comps) {
|
||||
// accept trailing newlines
|
||||
@@ -901,8 +900,8 @@ future<std::unordered_map<component_type, file>> sstable::readable_file_for_all_
|
||||
co_return std::move(files);
|
||||
}
|
||||
|
||||
future<entry_descriptor> sstable::clone(generation_type new_generation, bool leave_unsealed) const {
|
||||
co_await _storage->snapshot(*this, _storage->prefix(), storage::absolute_path::yes, new_generation, storage::leave_unsealed(leave_unsealed));
|
||||
future<entry_descriptor> sstable::clone(generation_type new_generation) const {
|
||||
co_await _storage->snapshot(*this, _storage->prefix(), storage::absolute_path::yes, new_generation);
|
||||
co_return entry_descriptor(new_generation, _version, _format, component_type::TOC, _state);
|
||||
}
|
||||
|
||||
@@ -957,16 +956,22 @@ future<file_writer> sstable::make_component_file_writer(component_type c, file_o
|
||||
});
|
||||
}
|
||||
|
||||
// Creates a checksumming writer for component `c`.
//
// Asks the storage backend for a sink for the component (forwarding `oflags`
// and `options`) and, once the sink is available, wraps it in a
// heap-allocated crc32_digest_file_writer.
//
// Note: the writer itself is constructed with sstable_buffer_size; `options`
// is only passed to make_component_sink().
future<std::unique_ptr<crc32_digest_file_writer>> sstable::make_digests_component_file_writer(component_type c, file_output_stream_options options, open_flags oflags) noexcept {
|
||||
// The component name is computed up front and captured by value for the continuation.
return _storage->make_component_sink(*this, c, oflags, std::move(options)).then([this, comp = component_name(*this, c)] (data_sink sink) mutable {
|
||||
return std::make_unique<crc32_digest_file_writer>(std::move(sink), sstable_buffer_size, comp);
|
||||
});
|
||||
}
|
||||
|
||||
// Stores `origin` in _origin, calls generate_toc(), and then asks the storage
// backend to open this sstable.
void sstable::open_sstable(const sstring& origin) {
|
||||
_origin = origin;
|
||||
generate_toc();
|
||||
_storage->open(*this);
|
||||
}
|
||||
|
||||
void sstable::write_toc(file_writer w) {
|
||||
void sstable::write_toc(std::unique_ptr<crc32_digest_file_writer> w) {
|
||||
sstlog.debug("Writing TOC file {} ", toc_filename());
|
||||
|
||||
do_write_simple(std::move(w), [&] (version_types v, file_writer& w) {
|
||||
do_write_simple(*w, [&] (version_types v, file_writer& w) {
|
||||
for (auto&& key : _recognized_components) {
|
||||
// new line character is appended to the end of each component name.
|
||||
auto value = sstable_version_constants::get_component_map(v).at(key) + "\n";
|
||||
@@ -974,6 +979,8 @@ void sstable::write_toc(file_writer w) {
|
||||
write(v, w, b);
|
||||
}
|
||||
});
|
||||
|
||||
_components_digests.toc_digest = w->full_checksum();
|
||||
}
|
||||
|
||||
void sstable::write_crc(const checksum& c) {
|
||||
@@ -990,6 +997,7 @@ void sstable::write_digest(uint32_t full_checksum) {
|
||||
auto digest = to_sstring<bytes>(full_checksum);
|
||||
write(v, w, digest);
|
||||
}, buffer_size);
|
||||
_components_digests.data_digest = full_checksum;
|
||||
}
|
||||
|
||||
thread_local std::array<std::vector<int>, downsampling::BASE_SAMPLING_LEVEL> downsampling::_sample_pattern_cache;
|
||||
@@ -1046,7 +1054,7 @@ future<> sstable::read_simple(T& component) {
|
||||
});
|
||||
}
|
||||
|
||||
void sstable::do_write_simple(file_writer&& writer,
|
||||
void sstable::do_write_simple(file_writer& writer,
|
||||
noncopyable_function<void (version_types, file_writer&)> write_component) {
|
||||
write_component(_version, writer);
|
||||
_metadata_size_on_disk += writer.offset();
|
||||
@@ -1061,7 +1069,7 @@ void sstable::do_write_simple(component_type type,
|
||||
file_output_stream_options options;
|
||||
options.buffer_size = buffer_size;
|
||||
auto w = make_component_file_writer(type, std::move(options)).get();
|
||||
do_write_simple(std::move(w), std::move(write_component));
|
||||
do_write_simple(w, std::move(write_component));
|
||||
}
|
||||
|
||||
template <component_type Type, typename T>
|
||||
@@ -1071,10 +1079,30 @@ void sstable::write_simple(const T& component) {
|
||||
}, sstable_buffer_size);
|
||||
}
|
||||
|
||||
// Writes one component through a checksumming writer and returns its checksum.
//
// type            - component to write.
// write_component - callback that serializes the component into the writer.
// buffer_size     - placed into the file_output_stream_options used to create
//                   the writer.
//
// Returns the value of full_checksum() on the writer after the component has
// been written via do_write_simple().
uint32_t sstable::do_write_simple_with_digest(component_type type,
|
||||
noncopyable_function<void (version_types version, file_writer& writer)> write_component, unsigned buffer_size) {
|
||||
// file_path is only used for the debug log line below.
auto file_path = filename(type);
|
||||
sstlog.debug("Writing {} file {}", sstable_version_constants::get_component_map(_version).at(type), file_path);
|
||||
|
||||
file_output_stream_options options;
|
||||
options.buffer_size = buffer_size;
|
||||
auto w = make_digests_component_file_writer(type, std::move(options)).get();
|
||||
do_write_simple(*w, std::move(write_component));
|
||||
return w->full_checksum();
|
||||
}
|
||||
|
||||
// Serializes `component` as the component file of type `Type` using
// do_write_simple_with_digest() with sstable_buffer_size, and returns the
// checksum that call produces.
// The callback captures `component` by reference and is invoked within this call.
template <component_type Type, typename T>
|
||||
uint32_t sstable::write_simple_with_digest(const T& component) {
|
||||
return do_write_simple_with_digest(Type, [&component] (version_types v, file_writer& w) {
|
||||
write(v, w, component);
|
||||
}, sstable_buffer_size);
|
||||
}
|
||||
|
||||
template future<> sstable::read_simple<component_type::Filter>(sstables::filter& f);
|
||||
template void sstable::write_simple<component_type::Filter>(const sstables::filter& f);
|
||||
|
||||
template void sstable::write_simple<component_type::Summary>(const sstables::summary_ka&);
|
||||
template uint32_t sstable::write_simple_with_digest<component_type::Summary>(const sstables::summary_ka&);
|
||||
|
||||
future<> sstable::read_compression() {
|
||||
// FIXME: If there is no compression, we should expect a CRC file to be present.
|
||||
@@ -1093,7 +1121,8 @@ void sstable::write_compression() {
|
||||
return;
|
||||
}
|
||||
|
||||
write_simple<component_type::CompressionInfo>(_components->compression);
|
||||
uint32_t digest = write_simple_with_digest<component_type::CompressionInfo>(_components->compression);
|
||||
_components_digests.compression_digest = digest;
|
||||
}
|
||||
|
||||
void sstable::validate_partitioner() {
|
||||
@@ -1318,7 +1347,8 @@ future<> sstable::read_partitions_db_footer() {
|
||||
}
|
||||
|
||||
void sstable::write_statistics() {
|
||||
write_simple<component_type::Statistics>(_components->statistics);
|
||||
auto digest = write_simple_with_digest<component_type::Statistics>(_components->statistics);
|
||||
_components_digests.statistics_digest = digest;
|
||||
}
|
||||
|
||||
void sstable::mark_as_being_repaired(const service::session_id& id) {
|
||||
@@ -1341,13 +1371,25 @@ int64_t sstable::update_repaired_at(int64_t repaired_at) {
|
||||
void sstable::rewrite_statistics() {
|
||||
sstlog.debug("Rewriting statistics component of sstable {}", get_filename());
|
||||
|
||||
auto lock = get_units(_mutate_sem, 1).get();
|
||||
file_output_stream_options options;
|
||||
options.buffer_size = sstable_buffer_size;
|
||||
auto w = make_component_file_writer(component_type::TemporaryStatistics, std::move(options),
|
||||
auto w = make_digests_component_file_writer(component_type::TemporaryStatistics, std::move(options),
|
||||
open_flags::wo | open_flags::create | open_flags::truncate).get();
|
||||
write(_version, w, _components->statistics);
|
||||
w.close();
|
||||
write(_version, *w, _components->statistics);
|
||||
w->close();
|
||||
|
||||
// When rewriting statistics, we also need to update the scylla component
|
||||
// because it contains the digest of the statistics component.
|
||||
if (has_scylla_component()) {
|
||||
_components_digests.statistics_digest = w->full_checksum();
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::ComponentsDigests>(components_digests{_components_digests});
|
||||
sstlog.debug("Rewriting scylla component of sstable {}", get_filename());
|
||||
write_simple<component_type::TemporaryScylla>(*_components->scylla_metadata);
|
||||
|
||||
// rename() guarantees atomicity when renaming a file into place.
|
||||
sstable_write_io_check(rename_file, fmt::to_string(filename(component_type::TemporaryScylla)), fmt::to_string(filename(component_type::Scylla))).get();
|
||||
}
|
||||
|
||||
// rename() guarantees atomicity when renaming a file into place.
|
||||
sstable_write_io_check(rename_file, fmt::to_string(filename(component_type::TemporaryStatistics)), fmt::to_string(filename(component_type::Statistics))).get();
|
||||
}
|
||||
@@ -1541,7 +1583,8 @@ void sstable::write_filter() {
|
||||
|
||||
auto&& bs = f->bits();
|
||||
auto filter_ref = sstables::filter_ref(f->num_hashes(), bs.get_storage());
|
||||
write_simple<component_type::Filter>(filter_ref);
|
||||
uint32_t digest = write_simple_with_digest<component_type::Filter>(filter_ref);
|
||||
_components_digests.filter_digest = digest;
|
||||
}
|
||||
|
||||
void sstable::maybe_rebuild_filter_from_index(uint64_t num_partitions) {
|
||||
@@ -1726,7 +1769,7 @@ void sstable::disable_component_memory_reload() {
|
||||
}
|
||||
|
||||
future<> sstable::load_metadata(sstable_open_config cfg) noexcept {
|
||||
co_await read_toc(cfg);
|
||||
co_await read_toc();
|
||||
// read scylla-meta after toc. Might need it to parse
|
||||
// rest (hint extensions)
|
||||
co_await read_scylla_metadata();
|
||||
@@ -2000,6 +2043,8 @@ sstable::read_scylla_metadata() noexcept {
|
||||
}
|
||||
return read_simple<component_type::Scylla>(*_components->scylla_metadata).then([this] {
|
||||
_features = _components->scylla_metadata->get_features();
|
||||
_components_digests = _components->scylla_metadata->get_components_digests();
|
||||
_components->digest = _components_digests.data_digest;
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -2089,6 +2134,7 @@ sstable::write_scylla_metadata(shard_id shard, struct run_identifier identifier,
|
||||
sstable_schema.columns.elements.push_back(sstable_column_description{to_sstable_column_kind(col.kind), {col.name()}, {to_bytes(col.type->name())}});
|
||||
}
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::Schema>(std::move(sstable_schema));
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::ComponentsDigests>(components_digests(_components_digests));
|
||||
|
||||
write_simple<component_type::Scylla>(*_components->scylla_metadata);
|
||||
}
|
||||
@@ -2490,19 +2536,15 @@ std::vector<std::pair<component_type, sstring>> sstable::all_components() const
|
||||
}
|
||||
|
||||
future<> sstable::snapshot(const sstring& dir) const {
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
co_await _storage->snapshot(*this, dir, storage::absolute_path::yes);
|
||||
return _storage->snapshot(*this, dir, storage::absolute_path::yes);
|
||||
}
|
||||
|
||||
future<> sstable::change_state(sstable_state to, delayed_commit_changes* delay_commit) {
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
co_await _storage->change_state(*this, to, _generation, delay_commit);
|
||||
_state = to;
|
||||
}
|
||||
|
||||
future<> sstable::pick_up_from_upload(sstable_state to, generation_type new_generation) {
|
||||
// just in case, not really needed as the sstable is not yet in use while in the upload dir
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
co_await _storage->change_state(*this, to, new_generation, nullptr);
|
||||
_generation = std::move(new_generation);
|
||||
_state = to;
|
||||
@@ -3076,6 +3118,31 @@ void sstable::set_sstable_level(uint32_t new_level) {
|
||||
s.sstable_level = new_level;
|
||||
}
|
||||
|
||||
// Returns the digest held in _components_digests for component type `c`.
//
// Index, Summary, TOC, CompressionInfo, Filter, Partitions, Rows, Data and
// Statistics each map to their corresponding *_digest field. Every other
// component type yields std::nullopt.
std::optional<uint32_t> sstable::get_component_digest(component_type c) const {
|
||||
switch (c) {
|
||||
case component_type::Index:
|
||||
return _components_digests.index_digest;
|
||||
case component_type::Summary:
|
||||
return _components_digests.summary_digest;
|
||||
case component_type::TOC:
|
||||
return _components_digests.toc_digest;
|
||||
case component_type::CompressionInfo:
|
||||
return _components_digests.compression_digest;
|
||||
case component_type::Filter:
|
||||
return _components_digests.filter_digest;
|
||||
case component_type::Partitions:
|
||||
return _components_digests.partitions_digest;
|
||||
case component_type::Rows:
|
||||
return _components_digests.rows_digest;
|
||||
case component_type::Data:
|
||||
return _components_digests.data_digest;
|
||||
case component_type::Statistics:
|
||||
return _components_digests.statistics_digest;
|
||||
default:
|
||||
// No digest is tracked here for the remaining component types.
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
future<> sstable::mutate_sstable_level(uint32_t new_level) {
|
||||
if (!has_component(component_type::Statistics)) {
|
||||
return make_ready_future<>();
|
||||
@@ -3412,9 +3479,6 @@ utils::hashed_key sstable::make_hashed_key(const schema& s, const partition_key&
|
||||
|
||||
future<>
|
||||
sstable::unlink(storage::sync_dir sync) noexcept {
|
||||
// Serialize with other calls to unlink or potentially ongoing mutations.
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
|
||||
_unlinked = true;
|
||||
_on_delete(*this);
|
||||
|
||||
@@ -3961,13 +4025,11 @@ class sstable_stream_sink_impl : public sstable_stream_sink {
|
||||
shared_sstable _sst;
|
||||
component_type _type;
|
||||
bool _last_component;
|
||||
bool _leave_unsealed;
|
||||
public:
|
||||
sstable_stream_sink_impl(shared_sstable sst, component_type type, sstable_stream_sink_cfg cfg)
|
||||
sstable_stream_sink_impl(shared_sstable sst, component_type type, bool last_component)
|
||||
: _sst(std::move(sst))
|
||||
, _type(type)
|
||||
, _last_component(cfg.last_component)
|
||||
, _leave_unsealed(cfg.leave_unsealed)
|
||||
, _last_component(last_component)
|
||||
{}
|
||||
private:
|
||||
future<> load_metadata() const {
|
||||
@@ -4014,12 +4076,10 @@ public:
|
||||
|
||||
co_return co_await make_file_output_stream(std::move(f), stream_options);
|
||||
}
|
||||
future<shared_sstable> close() override {
|
||||
future<shared_sstable> close_and_seal() override {
|
||||
if (_last_component) {
|
||||
// If we are the last component in a sequence, we can seal the table.
|
||||
if (!_leave_unsealed) {
|
||||
co_await _sst->_storage->seal(*_sst);
|
||||
}
|
||||
co_await _sst->_storage->seal(*_sst);
|
||||
co_return std::move(_sst);
|
||||
}
|
||||
_sst = {};
|
||||
@@ -4036,7 +4096,7 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr schema, sstables_manager& sstm, const data_dictionary::storage_options& s_opts, sstable_state state, std::string_view component_filename, sstable_stream_sink_cfg cfg) {
|
||||
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr schema, sstables_manager& sstm, const data_dictionary::storage_options& s_opts, sstable_state state, std::string_view component_filename, bool last_component) {
|
||||
auto desc = parse_path(component_filename, schema->ks_name(), schema->cf_name());
|
||||
auto sst = sstm.make_sstable(schema, s_opts, desc.generation, state, desc.version, desc.format);
|
||||
|
||||
@@ -4047,7 +4107,7 @@ std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr schema, sstab
|
||||
type = component_type::TemporaryTOC;
|
||||
}
|
||||
|
||||
return std::make_unique<sstable_stream_sink_impl>(std::move(sst), type, cfg);
|
||||
return std::make_unique<sstable_stream_sink_impl>(std::move(sst), type, last_component);
|
||||
}
|
||||
|
||||
generation_type
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "sstables/writer.hh"
|
||||
#include "version.hh"
|
||||
#include "shared_sstable.hh"
|
||||
#include "open_info.hh"
|
||||
@@ -109,7 +110,6 @@ struct sstable_writer_config {
|
||||
size_t promoted_index_auto_scale_threshold;
|
||||
uint64_t max_sstable_size = std::numeric_limits<uint64_t>::max();
|
||||
bool backup = false;
|
||||
bool leave_unsealed = false;
|
||||
mutation_fragment_stream_validation_level validation_level;
|
||||
std::optional<db::replay_position> replay_position;
|
||||
std::optional<int> sstable_level;
|
||||
@@ -418,8 +418,8 @@ public:
|
||||
return component_basename(_schema->ks_name(), _schema->cf_name(), _version, _generation, _format, f);
|
||||
}
|
||||
|
||||
component_name get_filename(component_type f = component_type::Data) const {
|
||||
return component_name(*this, f);
|
||||
component_name get_filename() const {
|
||||
return component_name(*this, component_type::Data);
|
||||
}
|
||||
|
||||
component_name toc_filename() const {
|
||||
@@ -629,9 +629,7 @@ private:
|
||||
size_t _total_memory_reclaimed{0};
|
||||
bool _unlinked{false};
|
||||
|
||||
// The mutate semaphore is used to serialize operations like rewrite_statistics
|
||||
// with linking or moving the sstable between directories.
|
||||
mutable named_semaphore _mutate_sem{1, named_semaphore_exception_factory{"sstable mutate"}};
|
||||
components_digests _components_digests;
|
||||
public:
|
||||
bool has_component(component_type f) const;
|
||||
sstables_manager& manager() { return _manager; }
|
||||
@@ -652,12 +650,18 @@ private:
|
||||
|
||||
template <component_type Type, typename T>
|
||||
void write_simple(const T& comp);
|
||||
void do_write_simple(file_writer&& writer,
|
||||
void do_write_simple(file_writer& writer,
|
||||
noncopyable_function<void (version_types, file_writer&)> write_component);
|
||||
void do_write_simple(component_type type,
|
||||
noncopyable_function<void (version_types version, file_writer& writer)> write_component,
|
||||
unsigned buffer_size);
|
||||
|
||||
template <component_type Type, typename T>
|
||||
uint32_t write_simple_with_digest(const T& comp);
|
||||
uint32_t do_write_simple_with_digest(component_type type,
|
||||
noncopyable_function<void (version_types version, file_writer& writer)> write_component,
|
||||
unsigned buffer_size);
|
||||
|
||||
void write_crc(const checksum& c);
|
||||
void write_digest(uint32_t full_checksum);
|
||||
|
||||
@@ -668,6 +672,9 @@ private:
|
||||
future<file_writer> make_component_file_writer(component_type c, file_output_stream_options options,
|
||||
open_flags oflags = open_flags::wo | open_flags::create | open_flags::exclusive) noexcept;
|
||||
|
||||
future<std::unique_ptr<crc32_digest_file_writer>> make_digests_component_file_writer(component_type c, file_output_stream_options options,
|
||||
open_flags oflags = open_flags::wo | open_flags::create | open_flags::exclusive) noexcept;
|
||||
|
||||
void generate_toc();
|
||||
void open_sstable(const sstring& origin);
|
||||
|
||||
@@ -694,11 +701,12 @@ private:
|
||||
|
||||
future<> update_info_for_opened_data(sstable_open_config cfg = {});
|
||||
|
||||
future<> read_toc(sstable_open_config cfg = {}) noexcept;
|
||||
future<> read_toc() noexcept;
|
||||
future<> read_summary() noexcept;
|
||||
|
||||
void write_summary() {
|
||||
write_simple<component_type::Summary>(_components->summary);
|
||||
uint32_t digest = write_simple_with_digest<component_type::Summary>(_components->summary);
|
||||
_components_digests.summary_digest = digest;
|
||||
}
|
||||
|
||||
// To be called when we try to load an SSTable that lacks a Summary. Could
|
||||
@@ -828,7 +836,7 @@ private:
|
||||
|
||||
future<> open_or_create_data(open_flags oflags, file_open_options options = {}) noexcept;
|
||||
// runs in async context (called from storage::open)
|
||||
void write_toc(file_writer w);
|
||||
void write_toc(std::unique_ptr<crc32_digest_file_writer> w);
|
||||
static future<uint32_t> read_digest_from_file(file f);
|
||||
static future<lw_shared_ptr<checksum>> read_checksum_from_file(file f);
|
||||
public:
|
||||
@@ -1018,6 +1026,12 @@ public:
|
||||
return _components->digest;
|
||||
}
|
||||
|
||||
components_digests& get_components_digests() {
|
||||
return _components_digests;
|
||||
}
|
||||
|
||||
std::optional<uint32_t> get_component_digest(component_type c) const;
|
||||
|
||||
// Gets ratio of droppable tombstone. A tombstone is considered droppable here
|
||||
// for cells and tombstones expired before the time point "GC before", which
|
||||
// is the point before which expiring data can be purged.
|
||||
@@ -1070,9 +1084,8 @@ public:
|
||||
future<std::unordered_map<component_type, file>> readable_file_for_all_components() const;
|
||||
|
||||
// Clones this sstable with a new generation, under the same location as the original one.
|
||||
// If leave_unsealed is true, the destination sstable is left unsealed.
|
||||
// Implementation is underlying storage specific.
|
||||
future<entry_descriptor> clone(generation_type new_generation, bool leave_unsealed = false) const;
|
||||
future<entry_descriptor> clone(generation_type new_generation) const;
|
||||
|
||||
struct lesser_reclaimed_memory {
|
||||
// comparator class to be used by the _reclaimed set in sstables manager
|
||||
@@ -1246,18 +1259,13 @@ public:
|
||||
// closes this component. If this is the last component in a set (see "last_component" in creating method below)
|
||||
// the table on disk will be sealed.
|
||||
// Returns sealed sstable if last, or nullptr otherwise.
|
||||
virtual future<shared_sstable> close() = 0;
|
||||
virtual future<shared_sstable> close_and_seal() = 0;
|
||||
virtual future<> abort() = 0;
|
||||
};
|
||||
|
||||
struct sstable_stream_sink_cfg {
|
||||
bool last_component = false;
|
||||
bool leave_unsealed = false;
|
||||
};
|
||||
|
||||
// Creates a sink object which can receive a component file sourced from above source object data.
|
||||
|
||||
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr, sstables_manager&, const data_dictionary::storage_options&, sstable_state, std::string_view component_filename, sstable_stream_sink_cfg cfg);
|
||||
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr, sstables_manager&, const data_dictionary::storage_options&, sstable_state, std::string_view component_filename, bool last_component);
|
||||
|
||||
} // namespace sstables
|
||||
|
||||
|
||||
@@ -50,14 +50,7 @@ class filesystem_storage final : public sstables::storage {
|
||||
std::optional<std::filesystem::path> _temp_dir; // Valid while the sstable is being created, until sealed
|
||||
|
||||
private:
|
||||
struct mark_for_removal_tag {};
|
||||
struct leave_unsealed_tag {};
|
||||
|
||||
enum class link_mode {
|
||||
default_mode,
|
||||
mark_for_removal,
|
||||
leave_unsealed,
|
||||
};
|
||||
using mark_for_removal = bool_class<class mark_for_removal_tag>;
|
||||
|
||||
template <typename Comp>
|
||||
requires std::is_same_v<Comp, component_type> || std::is_same_v<Comp, sstring>
|
||||
@@ -68,9 +61,7 @@ private:
|
||||
future<> check_create_links_replay(const sstable& sst, const sstring& dst_dir, generation_type dst_gen, const std::vector<std::pair<sstables::component_type, sstring>>& comps) const;
|
||||
future<> remove_temp_dir();
|
||||
virtual future<> create_links(const sstable& sst, const std::filesystem::path& dir) const override;
|
||||
future<> create_links_common(const sstable& sst, sstring dst_dir, generation_type dst_gen, link_mode mode) const;
|
||||
future<> create_links_common(const sstable& sst, sstring dst_dir, generation_type dst_gen, mark_for_removal_tag) const;
|
||||
future<> create_links_common(const sstable& sst, const std::filesystem::path& dir, std::optional<generation_type> gen, leave_unsealed_tag) const;
|
||||
future<> create_links_common(const sstable& sst, sstring dst_dir, generation_type dst_gen, mark_for_removal mark_for_removal) const;
|
||||
future<> create_links_common(const sstable& sst, const std::filesystem::path& dir, std::optional<generation_type> dst_gen) const;
|
||||
future<> touch_temp_dir(const sstable& sst);
|
||||
future<> move(const sstable& sst, sstring new_dir, generation_type generation, delayed_commit_changes* delay) override;
|
||||
@@ -92,7 +83,7 @@ public:
|
||||
{}
|
||||
|
||||
virtual future<> seal(const sstable& sst) override;
|
||||
virtual future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen, storage::leave_unsealed) const override;
|
||||
virtual future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen) const override;
|
||||
virtual future<> change_state(const sstable& sst, sstable_state state, generation_type generation, delayed_commit_changes* delay) override;
|
||||
// runs in async context
|
||||
virtual void open(sstable& sst) override;
|
||||
@@ -213,13 +204,13 @@ void filesystem_storage::open(sstable& sst) {
|
||||
open_flags::create |
|
||||
open_flags::exclusive,
|
||||
options).get();
|
||||
auto w = file_writer(output_stream<char>(std::move(sink)), component_name(sst, component_type::TemporaryTOC));
|
||||
auto w = std::make_unique<crc32_digest_file_writer>(std::move(sink), sst.sstable_buffer_size, component_name(sst, component_type::TemporaryTOC));
|
||||
|
||||
bool toc_exists = file_exists(fmt::to_string(sst.filename(component_type::TOC))).get();
|
||||
if (toc_exists) {
|
||||
// TOC will exist at this point if write_components() was called with
|
||||
// the generation of a sstable that exists.
|
||||
w.close();
|
||||
w->close();
|
||||
remove_file(fmt::to_string(sst.filename(component_type::TemporaryTOC))).get();
|
||||
throw std::runtime_error(format("SSTable write failed due to existence of TOC file for generation {} of {}.{}", sst._generation, sst._schema->ks_name(), sst._schema->cf_name()));
|
||||
}
|
||||
@@ -365,13 +356,8 @@ future<> filesystem_storage::check_create_links_replay(const sstable& sst, const
|
||||
/// \param sst - the sstable to work on
|
||||
/// \param dst_dir - the destination directory.
|
||||
/// \param generation - the generation of the destination sstable
|
||||
/// \param mode - what will be done after all components were linked
|
||||
/// mark_for_removal - mark the sstable for removal after linking it to the destination dst_dir
|
||||
/// leave_unsealed - leaves the destination sstable unsealed
|
||||
future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst_dir, generation_type generation, link_mode mode) const {
|
||||
// They're mutually exclusive, so we can assume only one is set.
|
||||
bool mark_for_removal = mode == link_mode::mark_for_removal;
|
||||
bool leave_unsealed = mode == link_mode::leave_unsealed;
|
||||
/// \param mark_for_removal - mark the sstable for removal after linking it to the destination dst_dir
|
||||
future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst_dir, generation_type generation, mark_for_removal mark_for_removal) const {
|
||||
sstlog.trace("create_links: {} -> {} generation={} mark_for_removal={}", sst.get_filename(), dst_dir, generation, mark_for_removal);
|
||||
auto comps = sst.all_components();
|
||||
co_await check_create_links_replay(sst, dst_dir, generation, comps);
|
||||
@@ -380,11 +366,7 @@ future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst
|
||||
co_await sst.sstable_write_io_check(idempotent_link_file, fmt::to_string(sst.filename(component_type::TOC)), std::move(dst));
|
||||
auto dir = opened_directory(dst_dir);
|
||||
co_await dir.sync(sst._write_error_handler);
|
||||
co_await parallel_for_each(comps, [this, &sst, &dst_dir, generation, leave_unsealed] (auto p) {
|
||||
// Skips the linking of TOC file if the destination will be left unsealed.
|
||||
if (leave_unsealed && p.first == component_type::TOC) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
co_await parallel_for_each(comps, [this, &sst, &dst_dir, generation] (auto p) {
|
||||
auto src = filename(sst, _dir.native(), sst._generation, p.second);
|
||||
auto dst = filename(sst, dst_dir, generation, p.second);
|
||||
return sst.sstable_write_io_check(idempotent_link_file, std::move(src), std::move(dst));
|
||||
@@ -397,10 +379,9 @@ future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst
|
||||
auto src_temp_toc = filename(sst, _dir.native(), sst._generation, component_type::TemporaryTOC);
|
||||
co_await sst.sstable_write_io_check(rename_file, std::move(dst_temp_toc), std::move(src_temp_toc));
|
||||
co_await _dir.sync(sst._write_error_handler);
|
||||
} else if (!leave_unsealed) {
|
||||
} else {
|
||||
// Now that the source sstable is linked to dir, remove
|
||||
// the TemporaryTOC file at the destination.
|
||||
// This is bypassed if destination will be left unsealed.
|
||||
co_await sst.sstable_write_io_check(remove_file, std::move(dst_temp_toc));
|
||||
}
|
||||
co_await dir.sync(sst._write_error_handler);
|
||||
@@ -408,23 +389,15 @@ future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst
|
||||
sstlog.trace("create_links: {} -> {} generation={}: done", sst.get_filename(), dst_dir, generation);
|
||||
}
|
||||
|
||||
future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst_dir, generation_type dst_gen, mark_for_removal_tag) const {
|
||||
return create_links_common(sst, dst_dir, dst_gen, link_mode::mark_for_removal);
|
||||
}
|
||||
|
||||
future<> filesystem_storage::create_links_common(const sstable& sst, const std::filesystem::path& dir, std::optional<generation_type> gen, leave_unsealed_tag) const {
|
||||
return create_links_common(sst, dir.native(), gen.value_or(sst._generation), link_mode::leave_unsealed);
|
||||
}
|
||||
|
||||
future<> filesystem_storage::create_links_common(const sstable& sst, const std::filesystem::path& dir, std::optional<generation_type> gen) const {
|
||||
return create_links_common(sst, dir.native(), gen.value_or(sst._generation), link_mode::default_mode);
|
||||
return create_links_common(sst, dir.native(), gen.value_or(sst._generation), mark_for_removal::no);
|
||||
}
|
||||
|
||||
future<> filesystem_storage::create_links(const sstable& sst, const std::filesystem::path& dir) const {
|
||||
return create_links_common(sst, dir.native(), sst._generation, link_mode::default_mode);
|
||||
return create_links_common(sst, dir.native(), sst._generation, mark_for_removal::no);
|
||||
}
|
||||
|
||||
future<> filesystem_storage::snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen, storage::leave_unsealed leave_unsealed) const {
|
||||
future<> filesystem_storage::snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen) const {
|
||||
std::filesystem::path snapshot_dir;
|
||||
if (abs) {
|
||||
snapshot_dir = dir;
|
||||
@@ -432,11 +405,7 @@ future<> filesystem_storage::snapshot(const sstable& sst, sstring dir, absolute_
|
||||
snapshot_dir = _dir.path() / dir;
|
||||
}
|
||||
co_await sst.sstable_touch_directory_io_check(snapshot_dir);
|
||||
if (leave_unsealed) {
|
||||
co_await create_links_common(sst, snapshot_dir, std::move(gen), leave_unsealed_tag{});
|
||||
} else {
|
||||
co_await create_links_common(sst, snapshot_dir, std::move(gen));
|
||||
}
|
||||
co_await create_links_common(sst, snapshot_dir, std::move(gen));
|
||||
}
|
||||
|
||||
future<> filesystem_storage::move(const sstable& sst, sstring new_dir, generation_type new_generation, delayed_commit_changes* delay_commit) {
|
||||
@@ -444,7 +413,7 @@ future<> filesystem_storage::move(const sstable& sst, sstring new_dir, generatio
|
||||
sstring old_dir = _dir.native();
|
||||
sstlog.debug("Moving {} old_generation={} to {} new_generation={} do_sync_dirs={}",
|
||||
sst.get_filename(), sst._generation, new_dir, new_generation, delay_commit == nullptr);
|
||||
co_await create_links_common(sst, new_dir, new_generation, mark_for_removal_tag{});
|
||||
co_await create_links_common(sst, new_dir, new_generation, mark_for_removal::yes);
|
||||
co_await change_dir(new_dir);
|
||||
generation_type old_generation = sst._generation;
|
||||
co_await coroutine::parallel_for_each(sst.all_components(), [&sst, old_generation, old_dir] (auto p) {
|
||||
@@ -629,7 +598,7 @@ public:
|
||||
{}
|
||||
|
||||
future<> seal(const sstable& sst) override;
|
||||
future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type>, storage::leave_unsealed) const override;
|
||||
future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type>) const override;
|
||||
future<> change_state(const sstable& sst, sstable_state state, generation_type generation, delayed_commit_changes* delay) override;
|
||||
// runs in async context
|
||||
void open(sstable& sst) override;
|
||||
@@ -701,15 +670,10 @@ void object_storage_base::open(sstable& sst) {
|
||||
sst.manager().sstables_registry().create_entry(owner(), status_creating, sst._state, std::move(desc)).get();
|
||||
|
||||
memory_data_sink_buffers bufs;
|
||||
sst.write_toc(
|
||||
file_writer(
|
||||
output_stream<char>(
|
||||
data_sink(
|
||||
std::make_unique<memory_data_sink>(bufs)
|
||||
)
|
||||
)
|
||||
)
|
||||
);
|
||||
auto out = data_sink(std::make_unique<memory_data_sink>(bufs));
|
||||
auto w = std::make_unique<crc32_digest_file_writer>(std::move(out), sst.sstable_buffer_size, component_name(sst, component_type::TOC));
|
||||
|
||||
sst.write_toc(std::move(w));
|
||||
put_object(make_object_name(sst, component_type::TOC), std::move(bufs)).get();
|
||||
}
|
||||
|
||||
@@ -846,7 +810,7 @@ future<> object_storage_base::unlink_component(const sstable& sst, component_typ
|
||||
}
|
||||
}
|
||||
|
||||
future<> object_storage_base::snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen, storage::leave_unsealed) const {
|
||||
future<> object_storage_base::snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen) const {
|
||||
on_internal_error(sstlog, "Snapshotting S3 objects not implemented");
|
||||
co_return;
|
||||
}
|
||||
|
||||
@@ -97,10 +97,9 @@ public:
|
||||
|
||||
using absolute_path = bool_class<class absolute_path_tag>; // FIXME -- should go away eventually
|
||||
using sync_dir = bool_class<struct sync_dir_tag>; // meaningful only to filesystem storage
|
||||
using leave_unsealed = bool_class<struct leave_unsealed_tag>;
|
||||
|
||||
virtual future<> seal(const sstable& sst) = 0;
|
||||
virtual future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen = {}, leave_unsealed lu = leave_unsealed::no) const = 0;
|
||||
virtual future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen = {}) const = 0;
|
||||
virtual future<> change_state(const sstable& sst, sstable_state to, generation_type generation, delayed_commit_changes* delay) = 0;
|
||||
// runs in async context
|
||||
virtual void open(sstable& sst) = 0;
|
||||
|
||||
@@ -547,6 +547,7 @@ enum class scylla_metadata_type : uint32_t {
|
||||
ExtTimestampStats = 9,
|
||||
SSTableIdentifier = 10,
|
||||
Schema = 11,
|
||||
ComponentsDigests = 12,
|
||||
};
|
||||
|
||||
// UUID is used for uniqueness across nodes, such that an imported sstable
|
||||
@@ -573,6 +574,24 @@ struct sstable_identifier_type {
|
||||
auto describe_type(sstable_version_types v, Describer f) { return f(value); }
|
||||
};
|
||||
|
||||
// Component digests stored in scylla metadata to track integrity of individual components
|
||||
struct components_digests {
|
||||
std::optional<uint32_t> data_digest;
|
||||
std::optional<uint32_t> compression_digest;
|
||||
std::optional<uint32_t> filter_digest;
|
||||
std::optional<uint32_t> statistics_digest;
|
||||
std::optional<uint32_t> summary_digest;
|
||||
std::optional<uint32_t> index_digest;
|
||||
std::optional<uint32_t> toc_digest;
|
||||
std::optional<uint32_t> partitions_digest;
|
||||
std::optional<uint32_t> rows_digest;
|
||||
|
||||
template <typename Describer>
|
||||
auto describe_type(sstable_version_types v, Describer f) {
|
||||
return f(data_digest,compression_digest, filter_digest, statistics_digest, summary_digest, index_digest, toc_digest, partitions_digest, rows_digest);
|
||||
}
|
||||
};
|
||||
|
||||
// Types of large data statistics.
|
||||
//
|
||||
// Note: For extensibility, never reuse an identifier,
|
||||
@@ -656,7 +675,8 @@ struct scylla_metadata {
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ScyllaVersion, scylla_version>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ExtTimestampStats, ext_timestamp_stats>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::SSTableIdentifier, sstable_identifier>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Schema, sstable_schema>
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Schema, sstable_schema>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ComponentsDigests, components_digests>
|
||||
> data;
|
||||
|
||||
sstable_enabled_features get_features() const {
|
||||
@@ -691,6 +711,13 @@ struct scylla_metadata {
|
||||
auto* sid = data.get<scylla_metadata_type::SSTableIdentifier, scylla_metadata::sstable_identifier>();
|
||||
return sid ? sid->value : sstable_id::create_null_id();
|
||||
}
|
||||
const components_digests get_components_digests() const {
|
||||
auto cd = data.get<scylla_metadata_type::ComponentsDigests, components_digests>();
|
||||
if (!cd) {
|
||||
return {};
|
||||
}
|
||||
return *cd;
|
||||
}
|
||||
|
||||
template <typename Describer>
|
||||
auto describe_type(sstable_version_types v, Describer f) { return f(data); }
|
||||
|
||||
@@ -65,7 +65,7 @@ serialized_size(sstable_version_types v, const T& object) {
|
||||
return size;
|
||||
}
|
||||
|
||||
template <typename ChecksumType>
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
class checksummed_file_data_sink_impl : public data_sink_impl {
|
||||
data_sink _out;
|
||||
@@ -92,7 +92,9 @@ public:
|
||||
|
||||
per_chunk_checksum = ChecksumType::checksum(per_chunk_checksum, buf.begin() + offset, size);
|
||||
_full_checksum = checksum_combine_or_feed<ChecksumType>(_full_checksum, per_chunk_checksum, buf.begin() + offset, size);
|
||||
_c.checksums.push_back(per_chunk_checksum);
|
||||
if constexpr (calculate_chunk_checksums) {
|
||||
_c.checksums.push_back(per_chunk_checksum);
|
||||
}
|
||||
}
|
||||
}
|
||||
return _out.put(std::move(bufs));
|
||||
@@ -112,29 +114,29 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
template <typename ChecksumType>
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
class checksummed_file_data_sink : public data_sink {
|
||||
public:
|
||||
checksummed_file_data_sink(data_sink out, struct checksum& cinfo, uint32_t& full_file_checksum)
|
||||
: data_sink(std::make_unique<checksummed_file_data_sink_impl<ChecksumType>>(std::move(out), cinfo, full_file_checksum)) {}
|
||||
: data_sink(std::make_unique<checksummed_file_data_sink_impl<ChecksumType, calculate_chunk_checksums>>(std::move(out), cinfo, full_file_checksum)) {}
|
||||
};
|
||||
|
||||
template <typename ChecksumType>
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
inline
|
||||
output_stream<char> make_checksummed_file_output_stream(data_sink out, struct checksum& cinfo, uint32_t& full_file_checksum) {
|
||||
return output_stream<char>(checksummed_file_data_sink<ChecksumType>(std::move(out), cinfo, full_file_checksum));
|
||||
return output_stream<char>(checksummed_file_data_sink<ChecksumType, calculate_chunk_checksums>(std::move(out), cinfo, full_file_checksum));
|
||||
}
|
||||
|
||||
template <typename ChecksumType>
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
class checksummed_file_writer : public file_writer {
|
||||
checksum _c;
|
||||
uint32_t _full_checksum;
|
||||
public:
|
||||
checksummed_file_writer(data_sink out, size_t buffer_size, component_name c)
|
||||
: file_writer(make_checksummed_file_output_stream<ChecksumType>(std::move(out), _c, _full_checksum), std::move(c))
|
||||
: file_writer(make_checksummed_file_output_stream<ChecksumType, calculate_chunk_checksums>(std::move(out), _c, _full_checksum), std::move(c))
|
||||
, _c(uint32_t(std::min(size_t(DEFAULT_CHUNK_SIZE), buffer_size)), {})
|
||||
, _full_checksum(ChecksumType::init_checksum()) {}
|
||||
|
||||
@@ -152,8 +154,10 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
using adler32_checksummed_file_writer = checksummed_file_writer<adler32_utils>;
|
||||
using crc32_checksummed_file_writer = checksummed_file_writer<crc32_utils>;
|
||||
using adler32_checksummed_file_writer = checksummed_file_writer<adler32_utils, true>;
|
||||
using crc32_checksummed_file_writer = checksummed_file_writer<crc32_utils, true>;
|
||||
|
||||
using crc32_digest_file_writer = checksummed_file_writer<crc32_utils, false>;
|
||||
|
||||
template <typename T, typename W>
|
||||
requires Writer<W>
|
||||
|
||||
@@ -63,45 +63,30 @@ mutation_reader_consumer make_streaming_consumer(sstring origin,
|
||||
}
|
||||
schema_ptr s = reader.schema();
|
||||
|
||||
// SSTable will be only sealed when added to the sstable set, so we make sure unsplit sstables aren't
|
||||
// left sealed on the table directory.
|
||||
auto cfg = cf->get_sstables_manager().configure_writer(origin);
|
||||
cfg.leave_unsealed = true;
|
||||
return sst->write_components(std::move(reader), adjusted_estimated_partitions, s,
|
||||
cfg, encoding_stats{}).then([sst] {
|
||||
return sst->open_data();
|
||||
}).then([cf, sst, offstrategy, origin, repaired_at, sstable_list_to_mark_as_repaired, frozen_guard, cfg] -> future<std::vector<sstables::shared_sstable>> {
|
||||
auto on_add = [sst, origin, repaired_at, sstable_list_to_mark_as_repaired, frozen_guard, cfg] (sstables::shared_sstable loading_sst) -> future<> {
|
||||
if (repaired_at && sstables::repair_origin == origin) {
|
||||
loading_sst->being_repaired = frozen_guard;
|
||||
if (sstable_list_to_mark_as_repaired) {
|
||||
sstable_list_to_mark_as_repaired->insert(loading_sst);
|
||||
}
|
||||
}).then([cf, sst, offstrategy, origin, repaired_at, sstable_list_to_mark_as_repaired, frozen_guard] -> future<> {
|
||||
if (repaired_at && sstables::repair_origin == origin) {
|
||||
sst->being_repaired = frozen_guard;
|
||||
if (sstable_list_to_mark_as_repaired) {
|
||||
sstable_list_to_mark_as_repaired->insert(sst);
|
||||
}
|
||||
if (loading_sst == sst) {
|
||||
co_await loading_sst->seal_sstable(cfg.backup);
|
||||
}
|
||||
co_return;
|
||||
};
|
||||
}
|
||||
if (offstrategy && sstables::repair_origin == origin) {
|
||||
sstables::sstlog.debug("Enabled automatic off-strategy trigger for table {}.{}",
|
||||
cf->schema()->ks_name(), cf->schema()->cf_name());
|
||||
cf->enable_off_strategy_trigger();
|
||||
}
|
||||
co_return co_await cf->add_new_sstable_and_update_cache(sst, on_add, offstrategy);
|
||||
}).then([cf, s, sst, use_view_update_path, &vb, &vbw] (std::vector<sstables::shared_sstable> new_sstables) mutable -> future<> {
|
||||
auto& vb_ = vb;
|
||||
auto new_sstables_ = std::move(new_sstables);
|
||||
auto table = cf;
|
||||
|
||||
co_await cf->add_sstable_and_update_cache(sst, offstrategy);
|
||||
}).then([cf, s, sst, use_view_update_path, &vb, &vbw]() mutable -> future<> {
|
||||
if (use_view_update_path == db::view::sstable_destination_decision::staging_managed_by_vbc) {
|
||||
co_return co_await vbw.local().register_staging_sstable_tasks(new_sstables_, cf->schema()->id());
|
||||
return vbw.local().register_staging_sstable_tasks({sst}, cf->schema()->id());
|
||||
} else if (use_view_update_path == db::view::sstable_destination_decision::staging_directly_to_generator) {
|
||||
co_await coroutine::parallel_for_each(new_sstables_, [&vb_, &table] (sstables::shared_sstable sst) -> future<> {
|
||||
return vb_.local().register_staging_sstable(sst, table);
|
||||
});
|
||||
return vb.local().register_staging_sstable(sst, std::move(cf));
|
||||
}
|
||||
co_return;
|
||||
return make_ready_future<>();
|
||||
});
|
||||
};
|
||||
if (!offstrategy) {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user