Compare commits
2 Commits
copilot/fi
...
copilot/fi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9c401e260a | ||
|
|
1824b04e2a |
@@ -1,14 +0,0 @@
|
||||
name: Call Jira release creation for new milestone
|
||||
|
||||
on:
|
||||
milestone:
|
||||
types: [created]
|
||||
|
||||
jobs:
|
||||
sync-milestone-to-jira:
|
||||
uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
|
||||
with:
|
||||
# Comma-separated list of Jira project keys
|
||||
jira_project_keys: "SCYLLADB,CUSTOMER"
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
@@ -169,7 +169,7 @@ future<> controller::request_stop_server() {
|
||||
});
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> controller::get_client_data() {
|
||||
future<utils::chunked_vector<client_data>> controller::get_client_data() {
|
||||
return _server.local().get_client_data();
|
||||
}
|
||||
|
||||
|
||||
@@ -93,7 +93,7 @@ public:
|
||||
// This virtual function is called (on each shard separately) when the
|
||||
// virtual table "system.clients" is read. It is expected to generate a
|
||||
// list of clients connected to this server (on this shard).
|
||||
virtual future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data() override;
|
||||
virtual future<utils::chunked_vector<client_data>> get_client_data() override;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -708,12 +708,8 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
|
||||
// As long as the system_clients_entry object is alive, this request will
|
||||
// be visible in the "system.clients" virtual table. When requested, this
|
||||
// entry will be formatted by server::ongoing_request::make_client_data().
|
||||
auto user_agent_header = co_await _connection_options_keys_and_values.get_or_load(req->get_header("User-Agent"), [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
|
||||
auto system_clients_entry = _ongoing_requests.emplace(
|
||||
req->get_client_address(), std::move(user_agent_header),
|
||||
req->get_client_address(), req->get_header("User-Agent"),
|
||||
username, current_scheduling_group(),
|
||||
req->get_protocol_name() == "https");
|
||||
|
||||
@@ -989,10 +985,10 @@ client_data server::ongoing_request::make_client_data() const {
|
||||
return cd;
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> server::get_client_data() {
|
||||
utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>> ret;
|
||||
future<utils::chunked_vector<client_data>> server::get_client_data() {
|
||||
utils::chunked_vector<client_data> ret;
|
||||
co_await _ongoing_requests.for_each_gently([&ret] (const ongoing_request& r) {
|
||||
ret.emplace_back(make_foreign(std::make_unique<client_data>(r.make_client_data())));
|
||||
ret.emplace_back(r.make_client_data());
|
||||
});
|
||||
co_return ret;
|
||||
}
|
||||
|
||||
@@ -55,7 +55,6 @@ class server : public peering_sharded_service<server> {
|
||||
// though it isn't really relevant for Alternator which defines its own
|
||||
// timeouts separately. We can create this object only once.
|
||||
updateable_timeout_config _timeout_config;
|
||||
client_options_cache_type _connection_options_keys_and_values;
|
||||
|
||||
alternator_callbacks_map _callbacks;
|
||||
|
||||
@@ -89,7 +88,7 @@ class server : public peering_sharded_service<server> {
|
||||
// is called when reading the "system.clients" virtual table.
|
||||
struct ongoing_request {
|
||||
socket_address _client_address;
|
||||
client_options_cache_entry_type _user_agent;
|
||||
sstring _user_agent;
|
||||
sstring _username;
|
||||
scheduling_group _scheduling_group;
|
||||
bool _is_https;
|
||||
@@ -108,7 +107,7 @@ public:
|
||||
// table "system.clients" is read. It is expected to generate a list of
|
||||
// clients connected to this server (on this shard). This function is
|
||||
// called by alternator::controller::get_client_data().
|
||||
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data();
|
||||
future<utils::chunked_vector<client_data>> get_client_data();
|
||||
private:
|
||||
void set_routes(seastar::httpd::routes& r);
|
||||
// If verification succeeds, returns the authenticated user's username
|
||||
|
||||
@@ -31,7 +31,6 @@ set(swagger_files
|
||||
api-doc/column_family.json
|
||||
api-doc/commitlog.json
|
||||
api-doc/compaction_manager.json
|
||||
api-doc/client_routes.json
|
||||
api-doc/config.json
|
||||
api-doc/cql_server_test.json
|
||||
api-doc/endpoint_snitch_info.json
|
||||
@@ -69,7 +68,6 @@ target_sources(api
|
||||
PRIVATE
|
||||
api.cc
|
||||
cache_service.cc
|
||||
client_routes.cc
|
||||
collectd.cc
|
||||
column_family.cc
|
||||
commitlog.cc
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
, "client_routes_entry": {
|
||||
"id": "client_routes_entry",
|
||||
"summary": "An entry storing client routes",
|
||||
"properties": {
|
||||
"connection_id": {"type": "string"},
|
||||
"host_id": {"type": "string", "format": "uuid"},
|
||||
"address": {"type": "string"},
|
||||
"port": {"type": "integer"},
|
||||
"tls_port": {"type": "integer"},
|
||||
"alternator_port": {"type": "integer"},
|
||||
"alternator_https_port": {"type": "integer"}
|
||||
},
|
||||
"required": ["connection_id", "host_id", "address"]
|
||||
}
|
||||
, "client_routes_key": {
|
||||
"id": "client_routes_key",
|
||||
"summary": "A key of client_routes_entry",
|
||||
"properties": {
|
||||
"connection_id": {"type": "string"},
|
||||
"host_id": {"type": "string", "format": "uuid"}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,74 +0,0 @@
|
||||
, "/v2/client-routes":{
|
||||
"get": {
|
||||
"description":"List all client route entries",
|
||||
"operationId":"get_client_routes",
|
||||
"tags":["client_routes"],
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[],
|
||||
"responses":{
|
||||
"200":{
|
||||
"schema":{
|
||||
"type":"array",
|
||||
"items":{ "$ref":"#/definitions/client_routes_entry" }
|
||||
}
|
||||
},
|
||||
"default":{
|
||||
"description":"unexpected error",
|
||||
"schema":{"$ref":"#/definitions/ErrorModel"}
|
||||
}
|
||||
}
|
||||
},
|
||||
"post": {
|
||||
"description":"Upsert one or more client route entries",
|
||||
"operationId":"set_client_routes",
|
||||
"tags":["client_routes"],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"body",
|
||||
"in":"body",
|
||||
"required":true,
|
||||
"schema":{
|
||||
"type":"array",
|
||||
"items":{ "$ref":"#/definitions/client_routes_entry" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses":{
|
||||
"200":{ "description": "OK" },
|
||||
"default":{
|
||||
"description":"unexpected error",
|
||||
"schema":{ "$ref":"#/definitions/ErrorModel" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"delete": {
|
||||
"description":"Delete one or more client route entries",
|
||||
"operationId":"delete_client_routes",
|
||||
"tags":["client_routes"],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"body",
|
||||
"in":"body",
|
||||
"required":true,
|
||||
"schema":{
|
||||
"type":"array",
|
||||
"items":{ "$ref":"#/definitions/client_routes_key" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses":{
|
||||
"200":{
|
||||
"description": "OK"
|
||||
},
|
||||
"default":{
|
||||
"description":"unexpected error",
|
||||
"schema":{
|
||||
"$ref":"#/definitions/ErrorModel"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
13
api/api.cc
13
api/api.cc
@@ -37,7 +37,6 @@
|
||||
#include "raft.hh"
|
||||
#include "gms/gossip_address_map.hh"
|
||||
#include "service_levels.hh"
|
||||
#include "client_routes.hh"
|
||||
|
||||
logging::logger apilog("api");
|
||||
|
||||
@@ -68,11 +67,9 @@ future<> set_server_init(http_context& ctx) {
|
||||
rb02->set_api_doc(r);
|
||||
rb02->register_api_file(r, "swagger20_header");
|
||||
rb02->register_api_file(r, "metrics");
|
||||
rb02->register_api_file(r, "client_routes");
|
||||
rb->register_function(r, "system",
|
||||
"The system related API");
|
||||
rb02->add_definitions_file(r, "metrics");
|
||||
rb02->add_definitions_file(r, "client_routes");
|
||||
set_system(ctx, r);
|
||||
rb->register_function(r, "error_injection",
|
||||
"The error injection API");
|
||||
@@ -132,16 +129,6 @@ future<> unset_server_storage_service(http_context& ctx) {
|
||||
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_storage_service(ctx, r); });
|
||||
}
|
||||
|
||||
future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr) {
|
||||
return ctx.http_server.set_routes([&ctx, &cr] (routes& r) {
|
||||
set_client_routes(ctx, r, cr);
|
||||
});
|
||||
}
|
||||
|
||||
future<> unset_server_client_routes(http_context& ctx) {
|
||||
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_client_routes(ctx, r); });
|
||||
}
|
||||
|
||||
future<> set_load_meter(http_context& ctx, service::load_meter& lm) {
|
||||
return ctx.http_server.set_routes([&ctx, &lm] (routes& r) { set_load_meter(ctx, r, lm); });
|
||||
}
|
||||
|
||||
@@ -29,7 +29,6 @@ class storage_proxy;
|
||||
class storage_service;
|
||||
class raft_group0_client;
|
||||
class raft_group_registry;
|
||||
class client_routes_service;
|
||||
|
||||
} // namespace service
|
||||
|
||||
@@ -100,8 +99,6 @@ future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snit
|
||||
future<> unset_server_snitch(http_context& ctx);
|
||||
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
|
||||
future<> unset_server_storage_service(http_context& ctx);
|
||||
future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr);
|
||||
future<> unset_server_client_routes(http_context& ctx);
|
||||
future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader);
|
||||
future<> unset_server_sstables_loader(http_context& ctx);
|
||||
future<> set_server_view_builder(http_context& ctx, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g);
|
||||
|
||||
@@ -1,176 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include <seastar/http/short_streams.hh>
|
||||
|
||||
#include "client_routes.hh"
|
||||
#include "api/api.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/client_routes.hh"
|
||||
#include "utils/rjson.hh"
|
||||
|
||||
|
||||
#include "api/api-doc/client_routes.json.hh"
|
||||
|
||||
using namespace seastar::httpd;
|
||||
using namespace std::chrono_literals;
|
||||
using namespace json;
|
||||
|
||||
extern logging::logger apilog;
|
||||
|
||||
namespace api {
|
||||
|
||||
static void validate_client_routes_endpoint(sharded<service::client_routes_service>& cr, sstring endpoint_name) {
|
||||
if (!cr.local().get_feature_service().client_routes) {
|
||||
apilog.warn("{}: called before the cluster feature was enabled", endpoint_name);
|
||||
throw std::runtime_error(fmt::format("{} requires all nodes to support the CLIENT_ROUTES cluster feature", endpoint_name));
|
||||
}
|
||||
}
|
||||
|
||||
static sstring parse_string(const char* name, rapidjson::Value const& v) {
|
||||
const auto it = v.FindMember(name);
|
||||
if (it == v.MemberEnd()) {
|
||||
throw bad_param_exception(fmt::format("Missing '{}'", name));
|
||||
}
|
||||
if (!it->value.IsString()) {
|
||||
throw bad_param_exception(fmt::format("'{}' must be a string", name));
|
||||
}
|
||||
return {it->value.GetString(), it->value.GetStringLength()};
|
||||
}
|
||||
|
||||
static std::optional<uint32_t> parse_port(const char* name, rapidjson::Value const& v) {
|
||||
const auto it = v.FindMember(name);
|
||||
if (it == v.MemberEnd()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
if (!it->value.IsInt()) {
|
||||
throw bad_param_exception(fmt::format("'{}' must be an integer", name));
|
||||
}
|
||||
auto port = it->value.GetInt();
|
||||
if (port < 1 || port > 65535) {
|
||||
throw bad_param_exception(fmt::format("'{}' value={} is outside the allowed port range", name, port));
|
||||
}
|
||||
return port;
|
||||
}
|
||||
|
||||
static std::vector<service::client_routes_service::client_route_entry> parse_set_client_array(const rapidjson::Document& root) {
|
||||
if (!root.IsArray()) {
|
||||
throw bad_param_exception("Body must be a JSON array");
|
||||
}
|
||||
|
||||
std::vector<service::client_routes_service::client_route_entry> v;
|
||||
v.reserve(root.GetArray().Size());
|
||||
for (const auto& element : root.GetArray()) {
|
||||
if (!element.IsObject()) { throw bad_param_exception("Each element must be object"); }
|
||||
|
||||
const auto port = parse_port("port", element);
|
||||
const auto tls_port = parse_port("tls_port", element);
|
||||
const auto alternator_port = parse_port("alternator_port", element);
|
||||
const auto alternator_https_port = parse_port("alternator_https_port", element);
|
||||
|
||||
if (!port.has_value() && !tls_port.has_value() && !alternator_port.has_value() && !alternator_https_port.has_value()) {
|
||||
throw bad_param_exception("At least one port field ('port', 'tls_port', 'alternator_port', 'alternator_https_port') must be specified");
|
||||
}
|
||||
|
||||
v.emplace_back(
|
||||
parse_string("connection_id", element),
|
||||
utils::UUID{parse_string("host_id", element)},
|
||||
parse_string("address", element),
|
||||
port,
|
||||
tls_port,
|
||||
alternator_port,
|
||||
alternator_https_port
|
||||
);
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_set_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
|
||||
validate_client_routes_endpoint(cr, "rest_set_client_routes");
|
||||
|
||||
rapidjson::Document root;
|
||||
auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
|
||||
root.Parse(content.c_str());
|
||||
|
||||
co_await cr.local().set_client_routes(parse_set_client_array(root));
|
||||
co_return seastar::json::json_void();
|
||||
}
|
||||
|
||||
static std::vector<service::client_routes_service::client_route_key> parse_delete_client_array(const rapidjson::Document& root) {
|
||||
if (!root.IsArray()) {
|
||||
throw bad_param_exception("Body must be a JSON array");
|
||||
}
|
||||
|
||||
std::vector<service::client_routes_service::client_route_key> v;
|
||||
v.reserve(root.GetArray().Size());
|
||||
for (const auto& element : root.GetArray()) {
|
||||
v.emplace_back(
|
||||
parse_string("connection_id", element),
|
||||
utils::UUID{parse_string("host_id", element)}
|
||||
);
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_delete_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
|
||||
validate_client_routes_endpoint(cr, "delete_client_routes");
|
||||
|
||||
rapidjson::Document root;
|
||||
auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
|
||||
root.Parse(content.c_str());
|
||||
|
||||
co_await cr.local().delete_client_routes(parse_delete_client_array(root));
|
||||
co_return seastar::json::json_void();
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_get_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
|
||||
validate_client_routes_endpoint(cr, "get_client_routes");
|
||||
|
||||
co_return co_await cr.invoke_on(0, [] (service::client_routes_service& cr) -> future<json::json_return_type> {
|
||||
co_return json::json_return_type(stream_range_as_array(co_await cr.get_client_routes(), [](const service::client_routes_service::client_route_entry & entry) {
|
||||
seastar::httpd::client_routes_json::client_routes_entry obj;
|
||||
obj.connection_id = entry.connection_id;
|
||||
obj.host_id = fmt::to_string(entry.host_id);
|
||||
obj.address = entry.address;
|
||||
if (entry.port.has_value()) { obj.port = entry.port.value(); }
|
||||
if (entry.tls_port.has_value()) { obj.tls_port = entry.tls_port.value(); }
|
||||
if (entry.alternator_port.has_value()) { obj.alternator_port = entry.alternator_port.value(); }
|
||||
if (entry.alternator_https_port.has_value()) { obj.alternator_https_port = entry.alternator_https_port.value(); }
|
||||
return obj;
|
||||
}));
|
||||
});
|
||||
}
|
||||
|
||||
void set_client_routes(http_context& ctx, routes& r, sharded<service::client_routes_service>& cr) {
|
||||
seastar::httpd::client_routes_json::set_client_routes.set(r, [&ctx, &cr] (std::unique_ptr<seastar::http::request> req) {
|
||||
return rest_set_client_routes(ctx, cr, std::move(req));
|
||||
});
|
||||
seastar::httpd::client_routes_json::delete_client_routes.set(r, [&ctx, &cr] (std::unique_ptr<seastar::http::request> req) {
|
||||
return rest_delete_client_routes(ctx, cr, std::move(req));
|
||||
});
|
||||
seastar::httpd::client_routes_json::get_client_routes.set(r, [&ctx, &cr] (std::unique_ptr<seastar::http::request> req) {
|
||||
return rest_get_client_routes(ctx, cr, std::move(req));
|
||||
});
|
||||
}
|
||||
|
||||
void unset_client_routes(http_context& ctx, routes& r) {
|
||||
seastar::httpd::client_routes_json::set_client_routes.unset(r);
|
||||
seastar::httpd::client_routes_json::delete_client_routes.unset(r);
|
||||
seastar::httpd::client_routes_json::get_client_routes.unset(r);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,20 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include <seastar/json/json_elements.hh>
|
||||
#include "api/api_init.hh"
|
||||
|
||||
namespace api {
|
||||
|
||||
void set_client_routes(http_context& ctx, httpd::routes& r, sharded<service::client_routes_service>& cr);
|
||||
void unset_client_routes(http_context& ctx, httpd::routes& r);
|
||||
|
||||
}
|
||||
@@ -10,9 +10,7 @@
|
||||
#include <seastar/net/inet_address.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include "seastarx.hh"
|
||||
#include "utils/loading_shared_values.hh"
|
||||
|
||||
#include <list>
|
||||
#include <optional>
|
||||
|
||||
enum class client_type {
|
||||
@@ -29,20 +27,6 @@ enum class client_connection_stage {
|
||||
ready,
|
||||
};
|
||||
|
||||
// We implement a keys cache using a map-like utils::loading_shared_values container by storing empty values.
|
||||
struct options_cache_value_type {};
|
||||
using client_options_cache_type = utils::loading_shared_values<sstring, options_cache_value_type>;
|
||||
using client_options_cache_entry_type = client_options_cache_type::entry_ptr;
|
||||
using client_options_cache_key_type = client_options_cache_type::key_type;
|
||||
|
||||
// This struct represents a single OPTION key-value pair from the client's connection options.
|
||||
// Both key and value are represented by corresponding "references" to their cached values.
|
||||
// Each "reference" is effectively a lw_shared_ptr value.
|
||||
struct client_option_key_value_cached_entry {
|
||||
client_options_cache_entry_type key;
|
||||
client_options_cache_entry_type value;
|
||||
};
|
||||
|
||||
sstring to_string(client_connection_stage ct);
|
||||
|
||||
// Representation of a row in `system.clients'. std::optionals are for nullable cells.
|
||||
@@ -53,8 +37,8 @@ struct client_data {
|
||||
client_connection_stage connection_stage = client_connection_stage::established;
|
||||
int32_t shard_id; /// ID of server-side shard which is processing the connection.
|
||||
|
||||
std::optional<client_options_cache_entry_type> driver_name;
|
||||
std::optional<client_options_cache_entry_type> driver_version;
|
||||
std::optional<sstring> driver_name;
|
||||
std::optional<sstring> driver_version;
|
||||
std::optional<sstring> hostname;
|
||||
std::optional<int32_t> protocol_version;
|
||||
std::optional<sstring> ssl_cipher_suite;
|
||||
@@ -62,7 +46,6 @@ struct client_data {
|
||||
std::optional<sstring> ssl_protocol;
|
||||
std::optional<sstring> username;
|
||||
std::optional<sstring> scheduling_group_name;
|
||||
std::list<client_option_key_value_cached_entry> client_options;
|
||||
|
||||
sstring stage_str() const { return to_string(connection_stage); }
|
||||
sstring client_type_str() const { return to_string(ct); }
|
||||
|
||||
@@ -125,6 +125,10 @@ if(target_arch)
|
||||
add_compile_options("-march=${target_arch}")
|
||||
endif()
|
||||
|
||||
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
add_compile_options("SHELL:-Xclang -fexperimental-assignment-tracking=disabled")
|
||||
endif()
|
||||
|
||||
function(maybe_limit_stack_usage_in_KB stack_usage_threshold_in_KB config)
|
||||
math(EXPR _stack_usage_threshold_in_bytes "${stack_usage_threshold_in_KB} * 1024")
|
||||
set(_stack_usage_threshold_flag "-Wstack-usage=${_stack_usage_threshold_in_bytes}")
|
||||
|
||||
14
configure.py
14
configure.py
@@ -1158,7 +1158,6 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'locator/topology.cc',
|
||||
'locator/util.cc',
|
||||
'service/client_state.cc',
|
||||
'service/client_routes.cc',
|
||||
'service/storage_service.cc',
|
||||
'service/session.cc',
|
||||
'service/task_manager_module.cc',
|
||||
@@ -1319,8 +1318,6 @@ api = ['api/api.cc',
|
||||
'api/storage_proxy.cc',
|
||||
Json2Code('api/api-doc/cache_service.json'),
|
||||
'api/cache_service.cc',
|
||||
Json2Code('api/api-doc/client_routes.json'),
|
||||
'api/client_routes.cc',
|
||||
Json2Code('api/api-doc/collectd.json'),
|
||||
'api/collectd.cc',
|
||||
Json2Code('api/api-doc/endpoint_snitch_info.json'),
|
||||
@@ -2195,6 +2192,8 @@ def kmiplib():
|
||||
for id in os_ids:
|
||||
if id in { 'centos', 'fedora', 'rhel' }:
|
||||
return 'rhel84'
|
||||
elif id in { 'ubuntu', 'debian' }:
|
||||
return 'ubuntu' # Temporarily use a placeholder for Ubuntu/Debian
|
||||
print('Could not resolve libkmip.a for platform {}'.format(os_ids))
|
||||
sys.exit(1)
|
||||
|
||||
@@ -2251,6 +2250,15 @@ def get_extra_cxxflags(mode, mode_config, cxx, debuginfo):
|
||||
if debuginfo and mode_config['can_have_debug_info']:
|
||||
cxxflags += ['-g', '-gz']
|
||||
|
||||
if 'clang' in cxx:
|
||||
# Since AssignmentTracking was enabled by default in clang
|
||||
# (llvm/llvm-project@de6da6ad55d3ca945195d1cb109cb8efdf40a52a)
|
||||
# coroutine frame debugging info (`coro_frame_ty`) is broken.
|
||||
#
|
||||
# It seems that we aren't losing much by disabling AssigmentTracking,
|
||||
# so for now we choose to disable it to get `coro_frame_ty` back.
|
||||
cxxflags.append('-Xclang -fexperimental-assignment-tracking=disabled')
|
||||
|
||||
return cxxflags
|
||||
|
||||
|
||||
|
||||
@@ -64,10 +64,6 @@ bool query_processor::topology_global_queue_empty() {
|
||||
return remote().first.get().ss.topology_global_queue_empty();
|
||||
}
|
||||
|
||||
future<bool> query_processor::ongoing_rf_change(const service::group0_guard& guard, sstring ks) {
|
||||
return remote().first.get().ss.ongoing_rf_change(guard, std::move(ks));
|
||||
}
|
||||
|
||||
static service::query_state query_state_for_internal_call() {
|
||||
return {service::client_state::for_internal_calls(), empty_service_permit()};
|
||||
}
|
||||
|
||||
@@ -474,7 +474,6 @@ public:
|
||||
void reset_cache();
|
||||
|
||||
bool topology_global_queue_empty();
|
||||
future<bool> ongoing_rf_change(const service::group0_guard& guard, sstring ks);
|
||||
|
||||
query_options make_internal_options(
|
||||
const statements::prepared_statement::checked_weak_ptr& p,
|
||||
|
||||
@@ -19,7 +19,6 @@
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
#include "prepared_statement.hh"
|
||||
#include "seastar/coroutine/exception.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "service/topology_mutation.hh"
|
||||
@@ -139,7 +138,6 @@ bool cql3::statements::alter_keyspace_statement::changes_tablets(query_processor
|
||||
future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>
|
||||
cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_processor& qp, service::query_state& state, const query_options& options, service::group0_batch& mc) const {
|
||||
using namespace cql_transport;
|
||||
bool unknown_keyspace = false;
|
||||
try {
|
||||
event::schema_change::target_type target_type = event::schema_change::target_type::KEYSPACE;
|
||||
auto ks = qp.db().find_keyspace(_name);
|
||||
@@ -160,12 +158,8 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
|
||||
// when in reality nothing or only schema is being changed
|
||||
if (changes_tablets(qp)) {
|
||||
if (!qp.proxy().features().topology_global_request_queue && !qp.topology_global_queue_empty()) {
|
||||
co_await coroutine::return_exception(
|
||||
exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
|
||||
}
|
||||
if (qp.proxy().features().rack_list_rf && co_await qp.ongoing_rf_change(mc.guard(),_name)) {
|
||||
co_await coroutine::return_exception(
|
||||
exceptions::invalid_request_exception(format("Another RF change for this keyspace {} ongoing, please retry.", _name)));
|
||||
return make_exception_future<std::tuple<::shared_ptr<::cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(
|
||||
exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
|
||||
}
|
||||
qp.db().real_database().validate_keyspace_update(*ks_md_update);
|
||||
|
||||
@@ -248,15 +242,10 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
|
||||
target_type,
|
||||
keyspace());
|
||||
mc.add_mutations(std::move(muts), "CQL alter keyspace");
|
||||
co_return std::make_tuple(std::move(ret), warnings);
|
||||
return make_ready_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(std::make_tuple(std::move(ret), warnings));
|
||||
} catch (data_dictionary::no_such_keyspace& e) {
|
||||
unknown_keyspace = true;
|
||||
return make_exception_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(exceptions::invalid_request_exception("Unknown keyspace " + _name));
|
||||
}
|
||||
if (unknown_keyspace) {
|
||||
co_await coroutine::return_exception(
|
||||
exceptions::invalid_request_exception("Unknown keyspace " + _name));
|
||||
}
|
||||
std::unreachable();
|
||||
}
|
||||
|
||||
std::unique_ptr<cql3::statements::prepared_statement>
|
||||
|
||||
@@ -61,7 +61,7 @@ expand_to_racks(const locator::token_metadata& tm,
|
||||
|
||||
// Handle ALTER:
|
||||
// ([]|0) -> numeric is allowed, there are no existing replicas
|
||||
// numeric -> numeric' is not supported unless numeric == numeric'. User should convert RF to rack list of equal count first.
|
||||
// numeric -> numeric' is not supported. User should convert RF to rack list of equal count first.
|
||||
// rack_list -> len(rack_list) is allowed (no-op)
|
||||
// rack_list -> numeric is not allowed
|
||||
if (old_options.contains(dc)) {
|
||||
@@ -75,8 +75,6 @@ expand_to_racks(const locator::token_metadata& tm,
|
||||
"Cannot change replication factor for '{}' from {} to numeric {}, use rack list instead",
|
||||
dc, old_rf_val, data.count()));
|
||||
}
|
||||
} else if (old_rf.count() == data.count()) {
|
||||
return rf;
|
||||
} else if (old_rf.count() > 0) {
|
||||
throw exceptions::configuration_exception(fmt::format(
|
||||
"Cannot change replication factor for '{}' from {} to {}, only rack list is allowed",
|
||||
@@ -155,8 +153,6 @@ static locator::replication_strategy_config_options prepare_options(
|
||||
}
|
||||
|
||||
// Validate options.
|
||||
bool numeric_to_rack_list_transition = false;
|
||||
bool rf_change = false;
|
||||
for (auto&& [dc, opt] : options) {
|
||||
locator::replication_factor_data rf(opt);
|
||||
|
||||
@@ -166,7 +162,6 @@ static locator::replication_strategy_config_options prepare_options(
|
||||
old_rf = locator::replication_factor_data(i->second);
|
||||
}
|
||||
|
||||
rf_change = rf_change || (old_rf && old_rf->count() != rf.count()) || (!old_rf && rf.count() != 0);
|
||||
if (!rf.is_rack_based()) {
|
||||
if (old_rf && old_rf->is_rack_based() && rf.count() != 0) {
|
||||
if (old_rf->count() != rf.count()) {
|
||||
@@ -192,11 +187,12 @@ static locator::replication_strategy_config_options prepare_options(
|
||||
throw exceptions::configuration_exception(fmt::format(
|
||||
"Rack list for '{}' contains duplicate entries", dc));
|
||||
}
|
||||
numeric_to_rack_list_transition = numeric_to_rack_list_transition || (old_rf && !old_rf->is_rack_based() && old_rf->count() != 0);
|
||||
}
|
||||
|
||||
if (numeric_to_rack_list_transition && rf_change) {
|
||||
throw exceptions::configuration_exception("Cannot change replication factor from numeric to rack list and rf value at the same time");
|
||||
if (old_rf && !old_rf->is_rack_based() && old_rf->count() != 0) {
|
||||
// FIXME: Allow this if replicas already conform to the given rack list.
|
||||
// FIXME: Implement automatic colocation to allow transition to rack list.
|
||||
throw exceptions::configuration_exception(fmt::format(
|
||||
"Cannot change replication factor from numeric to rack list for '{}'", dc));
|
||||
}
|
||||
}
|
||||
|
||||
if (!rf && options.empty() && old_options.empty()) {
|
||||
@@ -416,7 +412,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(s
|
||||
? std::optional<unsigned>(0) : std::nullopt;
|
||||
auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
|
||||
bool uses_tablets = initial_tablets.has_value();
|
||||
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
|
||||
bool rack_list_enabled = feat.rack_list_rf;
|
||||
auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
|
||||
return data_dictionary::keyspace_metadata::new_keyspace(ks_name, sc,
|
||||
std::move(options), initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
|
||||
@@ -432,7 +428,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
|
||||
throw exceptions::invalid_request_exception("Cannot alter replication strategy vnode/tablets flavor");
|
||||
}
|
||||
auto sc = get_replication_strategy_class();
|
||||
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
|
||||
bool rack_list_enabled = feat.rack_list_rf;
|
||||
if (sc) {
|
||||
options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
|
||||
} else {
|
||||
|
||||
@@ -248,7 +248,7 @@ future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
|
||||
// which is larger than the segment ID of the RP of the last written hint.
|
||||
cfg.base_segment_id = _last_written_rp.base_id();
|
||||
|
||||
return commitlog::create_commitlog(std::move(cfg)).then([this] (this auto, commitlog l) -> future<commitlog> {
|
||||
return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) -> future<commitlog> {
|
||||
// add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
|
||||
// When this happens we want to refill _sender's segments only if it has finished with the segments he had before.
|
||||
if (_sender.have_segments()) {
|
||||
|
||||
@@ -135,5 +135,5 @@ const std::string db::object_storage_endpoint_param::gs_type = "gs";
|
||||
|
||||
auto fmt::formatter<db::object_storage_endpoint_param>::format(const db::object_storage_endpoint_param& e, fmt::format_context& ctx) const
|
||||
-> decltype(ctx.out()) {
|
||||
return fmt::format_to(ctx.out(), "object_storage_endpoint_param{}", e.to_json_string());
|
||||
return fmt::format_to(ctx.out(), "object_storage_endpoint_param{{}}", e.to_json_string());
|
||||
}
|
||||
|
||||
@@ -110,7 +110,6 @@ namespace {
|
||||
system_keyspace::v3::CDC_LOCAL,
|
||||
system_keyspace::DICTS,
|
||||
system_keyspace::VIEW_BUILDING_TASKS,
|
||||
system_keyspace::CLIENT_ROUTES,
|
||||
};
|
||||
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
|
||||
props.enable_schema_commitlog();
|
||||
@@ -138,7 +137,6 @@ namespace {
|
||||
system_keyspace::ROLE_PERMISSIONS,
|
||||
system_keyspace::DICTS,
|
||||
system_keyspace::VIEW_BUILDING_TASKS,
|
||||
system_keyspace::CLIENT_ROUTES,
|
||||
};
|
||||
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
|
||||
props.is_group0_table = true;
|
||||
@@ -311,7 +309,6 @@ schema_ptr system_keyspace::topology() {
|
||||
.with_column("tablet_balancing_enabled", boolean_type, column_kind::static_column)
|
||||
.with_column("upgrade_state", utf8_type, column_kind::static_column)
|
||||
.with_column("global_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.with_column("paused_rf_change_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.set_comment("Current state of topology change machine")
|
||||
.with_hash_version()
|
||||
.build();
|
||||
@@ -1418,23 +1415,6 @@ schema_ptr system_keyspace::view_building_tasks() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::client_routes() {
|
||||
static thread_local auto schema = [] {
|
||||
auto id = generate_legacy_id(NAME, CLIENT_ROUTES);
|
||||
return schema_builder(NAME, CLIENT_ROUTES, std::make_optional(id))
|
||||
.with_column("connection_id", utf8_type, column_kind::partition_key)
|
||||
.with_column("host_id", uuid_type, column_kind::clustering_key)
|
||||
.with_column("address", utf8_type)
|
||||
.with_column("port", int32_type)
|
||||
.with_column("tls_port", int32_type)
|
||||
.with_column("alternator_port", int32_type)
|
||||
.with_column("alternator_https_port", int32_type)
|
||||
.with_hash_version()
|
||||
.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
future<system_keyspace::local_info> system_keyspace::load_local_info() {
|
||||
auto msg = co_await execute_cql(format("SELECT host_id, cluster_name, data_center, rack FROM system.{} WHERE key=?", LOCAL), sstring(LOCAL));
|
||||
|
||||
@@ -2362,7 +2342,7 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
|
||||
v3::cdc_local(),
|
||||
raft(), raft_snapshots(), raft_snapshot_config(), group0_history(), discovery(),
|
||||
topology(), cdc_generations_v3(), topology_requests(), service_levels_v2(), view_build_status_v2(),
|
||||
dicts(), view_building_tasks(), client_routes(), cdc_streams_state(), cdc_streams_history()
|
||||
dicts(), view_building_tasks(), cdc_streams_state(), cdc_streams_history()
|
||||
});
|
||||
|
||||
if (cfg.check_experimental(db::experimental_features_t::feature::BROADCAST_TABLES)) {
|
||||
@@ -3157,10 +3137,7 @@ static bool must_have_tokens(service::node_state nst) {
|
||||
// A decommissioning node doesn't have tokens at the end, they are
|
||||
// removed during transition to the left_token_ring state.
|
||||
case service::node_state::decommissioning: return false;
|
||||
// A removing node might or might not have tokens depending on whether
|
||||
// REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled. To support both
|
||||
// cases, we allow removing nodes to not have tokens.
|
||||
case service::node_state::removing: return false;
|
||||
case service::node_state::removing: return true;
|
||||
case service::node_state::rebuilding: return true;
|
||||
case service::node_state::normal: return true;
|
||||
case service::node_state::left: return false;
|
||||
@@ -3400,12 +3377,6 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("paused_rf_change_requests")) {
|
||||
for (auto&& v : deserialize_set_column(*topology(), some_row, "paused_rf_change_requests")) {
|
||||
ret.paused_rf_change_requests.insert(value_cast<utils::UUID>(v));
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("enabled_features")) {
|
||||
ret.enabled_features = decode_features(deserialize_set_column(*topology(), some_row, "enabled_features"));
|
||||
}
|
||||
@@ -3617,43 +3588,35 @@ system_keyspace::topology_requests_entry system_keyspace::topology_request_row_t
|
||||
return entry;
|
||||
}
|
||||
|
||||
future<system_keyspace::topology_requests_entry> system_keyspace::get_topology_request_entry(utils::UUID id) {
|
||||
auto r = co_await get_topology_request_entry_opt(id);
|
||||
if (!r) {
|
||||
on_internal_error(slogger, format("no entry for request id {}", id));
|
||||
}
|
||||
co_return std::move(*r);
|
||||
}
|
||||
|
||||
future<std::optional<system_keyspace::topology_requests_entry>> system_keyspace::get_topology_request_entry_opt(utils::UUID id) {
|
||||
future<system_keyspace::topology_requests_entry> system_keyspace::get_topology_request_entry(utils::UUID id, bool require_entry) {
|
||||
auto rs = co_await execute_cql(
|
||||
format("SELECT * FROM system.{} WHERE id = {}", TOPOLOGY_REQUESTS, id));
|
||||
|
||||
if (!rs || rs->empty()) {
|
||||
co_return std::nullopt;
|
||||
if (require_entry) {
|
||||
on_internal_error(slogger, format("no entry for request id {}", id));
|
||||
} else {
|
||||
co_return topology_requests_entry{
|
||||
.id = utils::null_uuid()
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
const auto& row = rs->one();
|
||||
co_return topology_request_row_to_entry(id, row);
|
||||
}
|
||||
|
||||
future<system_keyspace::topology_requests_entries> system_keyspace::get_topology_request_entries(std::vector<std::variant<service::topology_request, service::global_topology_request>> request_types, db_clock::time_point end_time_limit) {
|
||||
sstring request_types_str = "";
|
||||
bool first = true;
|
||||
for (const auto& rt : request_types) {
|
||||
if (!std::exchange(first, false)) {
|
||||
request_types_str += ", ";
|
||||
}
|
||||
request_types_str += std::visit([] (auto&& arg) { return fmt::format("'{}'", arg); }, rt);
|
||||
}
|
||||
|
||||
future<system_keyspace::topology_requests_entries> system_keyspace::get_node_ops_request_entries(db_clock::time_point end_time_limit) {
|
||||
// Running requests.
|
||||
auto rs_running = co_await execute_cql(
|
||||
format("SELECT * FROM system.{} WHERE done = false AND request_type IN ({}) ALLOW FILTERING", TOPOLOGY_REQUESTS, request_types_str));
|
||||
format("SELECT * FROM system.{} WHERE done = false AND request_type IN ('{}', '{}', '{}', '{}', '{}') ALLOW FILTERING", TOPOLOGY_REQUESTS,
|
||||
service::topology_request::join, service::topology_request::replace, service::topology_request::rebuild, service::topology_request::leave, service::topology_request::remove));
|
||||
|
||||
|
||||
// Requests which finished after end_time_limit.
|
||||
auto rs_done = co_await execute_cql(
|
||||
format("SELECT * FROM system.{} WHERE end_time > {} AND request_type IN ({}) ALLOW FILTERING", TOPOLOGY_REQUESTS, end_time_limit.time_since_epoch().count(), request_types_str));
|
||||
format("SELECT * FROM system.{} WHERE end_time > {} AND request_type IN ('{}', '{}', '{}', '{}', '{}') ALLOW FILTERING", TOPOLOGY_REQUESTS, end_time_limit.time_since_epoch().count(),
|
||||
service::topology_request::join, service::topology_request::replace, service::topology_request::rebuild, service::topology_request::leave, service::topology_request::remove));
|
||||
|
||||
topology_requests_entries m;
|
||||
for (const auto& row: *rs_done) {
|
||||
@@ -3671,16 +3634,6 @@ future<system_keyspace::topology_requests_entries> system_keyspace::get_topology
|
||||
co_return m;
|
||||
}
|
||||
|
||||
future<system_keyspace::topology_requests_entries> system_keyspace::get_node_ops_request_entries(db_clock::time_point end_time_limit) {
|
||||
return get_topology_request_entries({
|
||||
service::topology_request::join,
|
||||
service::topology_request::replace,
|
||||
service::topology_request::rebuild,
|
||||
service::topology_request::leave,
|
||||
service::topology_request::remove
|
||||
}, end_time_limit);
|
||||
}
|
||||
|
||||
future<mutation> system_keyspace::get_insert_dict_mutation(
|
||||
std::string_view name,
|
||||
bytes data,
|
||||
|
||||
@@ -199,7 +199,6 @@ public:
|
||||
static constexpr auto VIEW_BUILD_STATUS_V2 = "view_build_status_v2";
|
||||
static constexpr auto DICTS = "dicts";
|
||||
static constexpr auto VIEW_BUILDING_TASKS = "view_building_tasks";
|
||||
static constexpr auto CLIENT_ROUTES = "client_routes";
|
||||
|
||||
// auth
|
||||
static constexpr auto ROLES = "roles";
|
||||
@@ -277,7 +276,6 @@ public:
|
||||
static schema_ptr view_build_status_v2();
|
||||
static schema_ptr dicts();
|
||||
static schema_ptr view_building_tasks();
|
||||
static schema_ptr client_routes();
|
||||
|
||||
// auth
|
||||
static schema_ptr roles();
|
||||
@@ -669,9 +667,7 @@ public:
|
||||
|
||||
future<service::topology_request_state> get_topology_request_state(utils::UUID id, bool require_entry);
|
||||
topology_requests_entry topology_request_row_to_entry(utils::UUID id, const cql3::untyped_result_set_row& row);
|
||||
future<topology_requests_entry> get_topology_request_entry(utils::UUID id);
|
||||
future<std::optional<topology_requests_entry>> get_topology_request_entry_opt(utils::UUID id);
|
||||
future<system_keyspace::topology_requests_entries> get_topology_request_entries(std::vector<std::variant<service::topology_request, service::global_topology_request>> request_types, db_clock::time_point end_time_limit);
|
||||
future<topology_requests_entry> get_topology_request_entry(utils::UUID id, bool require_entry);
|
||||
future<topology_requests_entries> get_node_ops_request_entries(db_clock::time_point end_time_limit);
|
||||
|
||||
public:
|
||||
|
||||
@@ -198,7 +198,6 @@ future<> view_building_worker::register_staging_sstable_tasks(std::vector<sstabl
|
||||
|
||||
future<> view_building_worker::run_staging_sstables_registrator() {
|
||||
while (!_as.abort_requested()) {
|
||||
bool sleep = false;
|
||||
try {
|
||||
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
|
||||
co_await create_staging_sstable_tasks();
|
||||
@@ -215,14 +214,6 @@ future<> view_building_worker::run_staging_sstables_registrator() {
|
||||
vbw_logger.warn("Got group0_concurrent_modification while creating staging sstable tasks");
|
||||
} catch (raft::request_aborted&) {
|
||||
vbw_logger.warn("Got raft::request_aborted while creating staging sstable tasks");
|
||||
} catch (...) {
|
||||
vbw_logger.error("Exception while creating staging sstable tasks: {}", std::current_exception());
|
||||
sleep = true;
|
||||
}
|
||||
|
||||
if (sleep) {
|
||||
vbw_logger.debug("Sleeping after exception.");
|
||||
co_await seastar::sleep_abortable(1s, _as).handle_exception([] (auto x) { return make_ready_future<>(); });
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -426,12 +417,9 @@ future<> view_building_worker::check_for_aborted_tasks() {
|
||||
|
||||
auto my_host_id = vbw._db.get_token_metadata().get_topology().my_host_id();
|
||||
auto my_replica = locator::tablet_replica{my_host_id, this_shard_id()};
|
||||
auto it = vbw._state._batch->tasks.begin();
|
||||
while (it != vbw._state._batch->tasks.end()) {
|
||||
auto id = it->first;
|
||||
auto task_opt = building_state.get_task(it->second.base_id, my_replica, id);
|
||||
|
||||
++it; // Advance the iterator before potentially removing the entry from the map.
|
||||
auto tasks_map = vbw._state._batch->tasks; // Potentially, we'll remove elements from the map, so we need a copy to iterate over it
|
||||
for (auto& [id, t]: tasks_map) {
|
||||
auto task_opt = building_state.get_task(t.base_id, my_replica, id);
|
||||
if (!task_opt || task_opt->get().aborted) {
|
||||
co_await vbw._state._batch->abort_task(id);
|
||||
}
|
||||
@@ -461,7 +449,7 @@ static std::unordered_set<table_id> get_ids_of_all_views(replica::database& db,
|
||||
}) | std::ranges::to<std::unordered_set>();;
|
||||
}
|
||||
|
||||
// If `state::processing_base_table` is different that the `view_building_state::currently_processed_base_table`,
|
||||
// If `state::processing_base_table` is diffrent that the `view_building_state::currently_processed_base_table`,
|
||||
// clear the state, save and flush new base table
|
||||
future<> view_building_worker::state::update_processing_base_table(replica::database& db, const view_building_state& building_state, abort_source& as) {
|
||||
if (processing_base_table != building_state.currently_processed_base_table) {
|
||||
@@ -583,6 +571,8 @@ future<> view_building_worker::batch::do_work() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
_vbw.local()._vb_state_machine.event.broadcast();
|
||||
}
|
||||
|
||||
future<> view_building_worker::do_build_range(table_id base_id, std::vector<table_id> views_ids, dht::token last_token, abort_source& as) {
|
||||
@@ -784,15 +774,13 @@ future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_
|
||||
tasks.insert({id, *task_opt});
|
||||
}
|
||||
#ifdef SEASTAR_DEBUG
|
||||
{
|
||||
auto& some_task = tasks.begin()->second;
|
||||
for (auto& [_, t]: tasks) {
|
||||
SCYLLA_ASSERT(t.base_id == some_task.base_id);
|
||||
SCYLLA_ASSERT(t.last_token == some_task.last_token);
|
||||
SCYLLA_ASSERT(t.replica == some_task.replica);
|
||||
SCYLLA_ASSERT(t.type == some_task.type);
|
||||
SCYLLA_ASSERT(t.replica.shard == this_shard_id());
|
||||
}
|
||||
auto& some_task = tasks.begin()->second;
|
||||
for (auto& [_, t]: tasks) {
|
||||
SCYLLA_ASSERT(t.base_id == some_task.base_id);
|
||||
SCYLLA_ASSERT(t.last_token == some_task.last_token);
|
||||
SCYLLA_ASSERT(t.replica == some_task.replica);
|
||||
SCYLLA_ASSERT(t.type == some_task.type);
|
||||
SCYLLA_ASSERT(t.replica.shard == this_shard_id());
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -823,6 +811,25 @@ future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_
|
||||
co_return collect_completed_tasks();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -749,7 +749,6 @@ class clients_table : public streaming_virtual_table {
|
||||
.with_column("ssl_protocol", utf8_type)
|
||||
.with_column("username", utf8_type)
|
||||
.with_column("scheduling_group", utf8_type)
|
||||
.with_column("client_options", map_type_impl::get_instance(utf8_type, utf8_type, false))
|
||||
.with_hash_version()
|
||||
.build();
|
||||
}
|
||||
@@ -767,7 +766,7 @@ class clients_table : public streaming_virtual_table {
|
||||
|
||||
future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
|
||||
// Collect
|
||||
using client_data_vec = utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>;
|
||||
using client_data_vec = utils::chunked_vector<client_data>;
|
||||
using shard_client_data = std::vector<client_data_vec>;
|
||||
std::vector<foreign_ptr<std::unique_ptr<shard_client_data>>> cd_vec;
|
||||
cd_vec.resize(smp::count);
|
||||
@@ -807,13 +806,13 @@ class clients_table : public streaming_virtual_table {
|
||||
for (unsigned i = 0; i < smp::count; i++) {
|
||||
for (auto&& ps_cdc : *cd_vec[i]) {
|
||||
for (auto&& cd : ps_cdc) {
|
||||
if (cd_map.contains(cd->ip)) {
|
||||
cd_map[cd->ip].emplace_back(std::move(cd));
|
||||
if (cd_map.contains(cd.ip)) {
|
||||
cd_map[cd.ip].emplace_back(std::move(cd));
|
||||
} else {
|
||||
dht::decorated_key key = make_partition_key(cd->ip);
|
||||
dht::decorated_key key = make_partition_key(cd.ip);
|
||||
if (this_shard_owns(key) && contains_key(qr.partition_range(), key)) {
|
||||
ips.insert(decorated_ip{std::move(key), cd->ip});
|
||||
cd_map[cd->ip].emplace_back(std::move(cd));
|
||||
ips.insert(decorated_ip{std::move(key), cd.ip});
|
||||
cd_map[cd.ip].emplace_back(std::move(cd));
|
||||
}
|
||||
}
|
||||
co_await coroutine::maybe_yield();
|
||||
@@ -826,58 +825,39 @@ class clients_table : public streaming_virtual_table {
|
||||
co_await result.emit_partition_start(dip.key);
|
||||
auto& clients = cd_map[dip.ip];
|
||||
|
||||
std::ranges::sort(clients, [] (const foreign_ptr<std::unique_ptr<client_data>>& a, const foreign_ptr<std::unique_ptr<client_data>>& b) {
|
||||
return a->port < b->port || a->client_type_str() < b->client_type_str();
|
||||
std::ranges::sort(clients, [] (const client_data& a, const client_data& b) {
|
||||
return a.port < b.port || a.client_type_str() < b.client_type_str();
|
||||
});
|
||||
|
||||
for (const auto& cd : clients) {
|
||||
clustering_row cr(make_clustering_key(cd->port, cd->client_type_str()));
|
||||
set_cell(cr.cells(), "shard_id", cd->shard_id);
|
||||
set_cell(cr.cells(), "connection_stage", cd->stage_str());
|
||||
if (cd->driver_name) {
|
||||
set_cell(cr.cells(), "driver_name", cd->driver_name->key());
|
||||
clustering_row cr(make_clustering_key(cd.port, cd.client_type_str()));
|
||||
set_cell(cr.cells(), "shard_id", cd.shard_id);
|
||||
set_cell(cr.cells(), "connection_stage", cd.stage_str());
|
||||
if (cd.driver_name) {
|
||||
set_cell(cr.cells(), "driver_name", *cd.driver_name);
|
||||
}
|
||||
if (cd->driver_version) {
|
||||
set_cell(cr.cells(), "driver_version", cd->driver_version->key());
|
||||
if (cd.driver_version) {
|
||||
set_cell(cr.cells(), "driver_version", *cd.driver_version);
|
||||
}
|
||||
if (cd->hostname) {
|
||||
set_cell(cr.cells(), "hostname", *cd->hostname);
|
||||
if (cd.hostname) {
|
||||
set_cell(cr.cells(), "hostname", *cd.hostname);
|
||||
}
|
||||
if (cd->protocol_version) {
|
||||
set_cell(cr.cells(), "protocol_version", *cd->protocol_version);
|
||||
if (cd.protocol_version) {
|
||||
set_cell(cr.cells(), "protocol_version", *cd.protocol_version);
|
||||
}
|
||||
if (cd->ssl_cipher_suite) {
|
||||
set_cell(cr.cells(), "ssl_cipher_suite", *cd->ssl_cipher_suite);
|
||||
if (cd.ssl_cipher_suite) {
|
||||
set_cell(cr.cells(), "ssl_cipher_suite", *cd.ssl_cipher_suite);
|
||||
}
|
||||
if (cd->ssl_enabled) {
|
||||
set_cell(cr.cells(), "ssl_enabled", *cd->ssl_enabled);
|
||||
if (cd.ssl_enabled) {
|
||||
set_cell(cr.cells(), "ssl_enabled", *cd.ssl_enabled);
|
||||
}
|
||||
if (cd->ssl_protocol) {
|
||||
set_cell(cr.cells(), "ssl_protocol", *cd->ssl_protocol);
|
||||
if (cd.ssl_protocol) {
|
||||
set_cell(cr.cells(), "ssl_protocol", *cd.ssl_protocol);
|
||||
}
|
||||
set_cell(cr.cells(), "username", cd->username ? *cd->username : sstring("anonymous"));
|
||||
if (cd->scheduling_group_name) {
|
||||
set_cell(cr.cells(), "scheduling_group", *cd->scheduling_group_name);
|
||||
set_cell(cr.cells(), "username", cd.username ? *cd.username : sstring("anonymous"));
|
||||
if (cd.scheduling_group_name) {
|
||||
set_cell(cr.cells(), "scheduling_group", *cd.scheduling_group_name);
|
||||
}
|
||||
|
||||
auto map_type = map_type_impl::get_instance(
|
||||
utf8_type,
|
||||
utf8_type,
|
||||
false
|
||||
);
|
||||
|
||||
auto prepare_client_options = [] (const auto& client_options) {
|
||||
map_type_impl::native_type tmp;
|
||||
for (auto& co: client_options) {
|
||||
auto map_element = std::make_pair(data_value(co.key.key()), data_value(co.value.key()));
|
||||
tmp.push_back(std::move(map_element));
|
||||
}
|
||||
return tmp;
|
||||
};
|
||||
|
||||
set_cell(cr.cells(), "client_options",
|
||||
make_map_value(map_type, prepare_client_options(cd->client_options)));
|
||||
|
||||
co_await result.emit_row(std::move(cr));
|
||||
}
|
||||
co_await result.emit_partition_end();
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
# Alternator: DynamoDB API in ScyllaDB
|
||||
# Alternator: DynamoDB API in Scylla
|
||||
|
||||
## Introduction
|
||||
Alternator is a ScyllaDB feature adding compatibility with Amazon DynamoDB(TM).
|
||||
Alternator is a Scylla feature adding compatibility with Amazon DynamoDB(TM).
|
||||
DynamoDB's API uses JSON-encoded requests and responses which are sent over
|
||||
an HTTP or HTTPS transport. It is described in detail in Amazon's [DynamoDB
|
||||
API Reference](https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/).
|
||||
|
||||
Our goal is that any application written to use Amazon DynamoDB could
|
||||
be run, unmodified, against ScyllaDB with Alternator enabled. Alternator's
|
||||
be run, unmodified, against Scylla with Alternator enabled. Alternator's
|
||||
compatibility with DynamoDB is fairly complete, but users should be aware
|
||||
of some differences and some unimplemented features. The extent of
|
||||
Alternator's compatibility with DynamoDB is described in the
|
||||
[ScyllaDB Alternator for DynamoDB users](compatibility.md) document,
|
||||
[Scylla Alternator for DynamoDB users](compatibility.md) document,
|
||||
which is updated as the work on Alternator progresses and compatibility
|
||||
continues to improve.
|
||||
|
||||
@@ -19,8 +19,8 @@ Alternator also adds several features and APIs that are not available in
|
||||
DynamoDB. These are described in [Alternator-specific APIs](new-apis.md).
|
||||
|
||||
## Running Alternator
|
||||
By default, ScyllaDB does not listen for DynamoDB API requests. To enable
|
||||
this API in ScyllaDB you must set at least two configuration options,
|
||||
By default, Scylla does not listen for DynamoDB API requests. To enable
|
||||
this API in Scylla you must set at least two configuration options,
|
||||
**alternator_port** and **alternator_write_isolation**. For example in the
|
||||
YAML configuration file:
|
||||
```yaml
|
||||
@@ -30,7 +30,7 @@ alternator_write_isolation: only_rmw_uses_lwt # or always, forbid or unsafe
|
||||
or, equivalently, via command-line arguments: `--alternator-port=8000
|
||||
--alternator-write-isolation=only_rmw_uses_lwt.
|
||||
|
||||
the **alternator_port** option determines on which port ScyllaDB listens for
|
||||
the **alternator_port** option determines on which port Scylla listens for
|
||||
DynamoDB API requests. By default, it listens on this port on all network
|
||||
interfaces. To listen only on a specific interface, configure also the
|
||||
**alternator_address** option.
|
||||
@@ -41,12 +41,12 @@ Alternator has four different choices
|
||||
for the implementation of writes, each with different advantages. You should
|
||||
carefully consider which of the options makes more sense for your intended
|
||||
use case and configure alternator_write_isolation accordingly. There is
|
||||
currently no default for this option: Trying to run ScyllaDB with an Alternator
|
||||
currently no default for this option: Trying to run Scylla with an Alternator
|
||||
port selected but without configuring write isolation will result in an error message,
|
||||
asking you to set it.
|
||||
|
||||
In addition to (or instead of) serving HTTP requests on alternator_port,
|
||||
ScyllaDB can accept DynamoDB API requests over HTTPS (encrypted), on the port
|
||||
Scylla can accept DynamoDB API requests over HTTPS (encrypted), on the port
|
||||
specified by **alternator_https_port**. As usual for HTTPS servers, the
|
||||
operator must specify certificate and key files. By default these should
|
||||
be placed in `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key`, but
|
||||
@@ -54,7 +54,7 @@ these default locations can overridden by specifying
|
||||
`--alternator-encryption-options keyfile="..."` and
|
||||
`--alternator-encryption-options certificate="..."`.
|
||||
|
||||
By default, ScyllaDB saves a snapshot of deleted tables. But Alternator does
|
||||
By default, Scylla saves a snapshot of deleted tables. But Alternator does
|
||||
not offer an API to restore these snapshots, so these snapshots are not useful
|
||||
and waste disk space - deleting a table does not recover any disk space.
|
||||
It is therefore recommended to disable this automatic-snapshotting feature
|
||||
@@ -73,11 +73,11 @@ itself. Instructions, code and examples for doing this can be found in the
|
||||
|
||||
This section provides only a very brief introduction to Alternator's
|
||||
design. A much more detailed document about the features of the DynamoDB
|
||||
API and how they are, or could be, implemented in ScyllaDB can be found in:
|
||||
API and how they are, or could be, implemented in Scylla can be found in:
|
||||
<https://docs.google.com/document/d/1i4yjF5OSAazAY_-T8CBce9-2ykW4twx_E_Nt2zDoOVs>
|
||||
|
||||
Almost all of Alternator's source code (except some initialization code)
|
||||
can be found in the alternator/ subdirectory of ScyllaDB's source code.
|
||||
can be found in the alternator/ subdirectory of Scylla's source code.
|
||||
Extensive functional tests can be found in the test/alternator
|
||||
subdirectory. These tests are written in Python, and can be run against
|
||||
both Alternator and Amazon's DynamoDB; This allows verifying that
|
||||
@@ -85,15 +85,15 @@ Alternator's behavior matches the one observed on DynamoDB.
|
||||
See test/alternator/README.md for more information about the tests and
|
||||
how to run them.
|
||||
|
||||
With Alternator enabled on port 8000 (for example), every ScyllaDB node
|
||||
With Alternator enabled on port 8000 (for example), every Scylla node
|
||||
listens for DynamoDB API requests on this port. These requests, in
|
||||
JSON format over HTTP, are parsed and result in calls to internal Scylla
|
||||
C++ functions - there is no CQL generation or parsing involved.
|
||||
In ScyllaDB terminology, the node receiving the request acts as the
|
||||
In Scylla terminology, the node receiving the request acts as the
|
||||
*coordinator*, and often passes the request on to one or more other nodes -
|
||||
*replicas* which hold copies of the requested data.
|
||||
|
||||
Alternator tables are stored as ScyllaDB tables, each in a separate keyspace.
|
||||
Alternator tables are stored as Scylla tables, each in a separate keyspace.
|
||||
Each keyspace is initialized when the corresponding Alternator table is
|
||||
created (with a CreateTable request). The replication factor (RF) for this
|
||||
keyspace is chosen at that point, depending on the size of the cluster:
|
||||
@@ -101,19 +101,19 @@ RF=3 is used on clusters with three or more nodes, and RF=1 is used for
|
||||
smaller clusters. Such smaller clusters are, of course, only recommended
|
||||
for tests because of the risk of data loss.
|
||||
|
||||
Each table in Alternator is stored as a ScyllaDB table in a separate
|
||||
Each table in Alternator is stored as a Scylla table in a separate
|
||||
keyspace. The DynamoDB key columns (hash and sort key) have known types,
|
||||
and become partition and clustering key columns of the ScyllaDB table.
|
||||
and become partition and clustering key columns of the Scylla table.
|
||||
All other attributes may be different for each row, so are stored in one
|
||||
map column in ScyllaDB, and not as separate columns.
|
||||
map column in Scylla, and not as separate columns.
|
||||
|
||||
DynamoDB supports two consistency levels for reads, "eventual consistency"
|
||||
and "strong consistency". These two modes are implemented using ScyllaDB's CL
|
||||
and "strong consistency". These two modes are implemented using Scylla's CL
|
||||
(consistency level) feature: All writes are done using the `LOCAL_QUORUM`
|
||||
consistency level, then strongly-consistent reads are done with
|
||||
`LOCAL_QUORUM`, while eventually-consistent reads are with just `LOCAL_ONE`.
|
||||
|
||||
In ScyllaDB (and its inspiration, Cassandra), high write performance is
|
||||
In Scylla (and its inspiration, Cassandra), high write performance is
|
||||
achieved by ensuring that writes do not require reads from disk.
|
||||
The DynamoDB API, however, provides many types of requests that need a read
|
||||
before the write (a.k.a. RMW requests - read-modify-write). For example,
|
||||
@@ -121,7 +121,7 @@ a request may copy an existing attribute, increment an attribute,
|
||||
be conditional on some expression involving existing values of attribute,
|
||||
or request that the previous values of attributes be returned. These
|
||||
read-modify-write transactions should be _isolated_ from each other, so
|
||||
by default Alternator implements every write operation using ScyllaDB's
|
||||
by default Alternator implements every write operation using Scylla's
|
||||
LWT (lightweight transactions). This default can be overridden on a per-table
|
||||
basis, by tagging the table as explained above in the "write isolation
|
||||
policies" section.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# ScyllaDB Alternator for DynamoDB users
|
||||
|
||||
ScyllaDB supports the DynamoDB API (this feature is codenamed "Alternator").
|
||||
Scylla supports the DynamoDB API (this feature is codenamed "Alternator").
|
||||
Our goal is to support any application written for Amazon DynamoDB.
|
||||
Nevertheless, there are a few differences between DynamoDB and Scylla, and
|
||||
and a few DynamoDB features that have not yet been implemented in Scylla.
|
||||
@@ -8,16 +8,16 @@ The purpose of this document is to inform users of these differences.
|
||||
|
||||
## Provisioning
|
||||
|
||||
The most obvious difference between DynamoDB and ScyllaDB is that while
|
||||
DynamoDB is a shared cloud service, ScyllaDB is a dedicated service running
|
||||
The most obvious difference between DynamoDB and Scylla is that while
|
||||
DynamoDB is a shared cloud service, Scylla is a dedicated service running
|
||||
on your private cluster. Whereas DynamoDB allows you to "provision" the
|
||||
number of requests per second you'll need - or at an extra cost not even
|
||||
provision that - ScyllaDB requires you to provision your cluster. You need
|
||||
provision that - Scylla requires you to provision your cluster. You need
|
||||
to reason about the number and size of your nodes - not the throughput.
|
||||
|
||||
Moreover, DynamoDB's per-table provisioning (`BillingMode=PROVISIONED`) is
|
||||
not yet supported by Scylla. The BillingMode and ProvisionedThroughput options
|
||||
on a table need to be valid but are ignored, and ScyllaDB behaves like DynamoDB's
|
||||
on a table need to be valid but are ignored, and Scylla behaves like DynamoDB's
|
||||
`BillingMode=PAY_PER_REQUEST`: All requests are accepted without a per-table
|
||||
throughput cap.
|
||||
|
||||
@@ -33,7 +33,7 @@ Instructions for doing this can be found in:
|
||||
|
||||
## Write isolation policies
|
||||
|
||||
ScyllaDB was designed to optimize the performance of pure write operations -
|
||||
Scylla was designed to optimize the performance of pure write operations -
|
||||
writes which do not need to read the previous value of the item.
|
||||
In CQL, writes which do need the previous value of the item must explicitly
|
||||
use the slower LWT ("LightWeight Transaction") feature to be correctly
|
||||
@@ -79,11 +79,11 @@ a _higher_ timestamp - and this will be the "last write" that wins.
|
||||
To avoid or mitigate this write reordering issue, users may consider
|
||||
one or more of the following:
|
||||
|
||||
1. Use NTP to keep the clocks on the different ScyllaDB nodes synchronized.
|
||||
1. Use NTP to keep the clocks on the different Scylla nodes synchronized.
|
||||
If the delay between the two writes is longer than NTP's accuracy,
|
||||
they will not be reordered.
|
||||
2. If an application wants to ensure that two specific writes are not
|
||||
reordered, it should send both requests to the same ScyllaDB node.
|
||||
reordered, it should send both requests to the same Scylla node.
|
||||
Care should be taken when using a load balancer - which might redirect
|
||||
two requests to two different nodes.
|
||||
3. Consider using the `always_use_lwt` write isolation policy.
|
||||
@@ -210,7 +210,7 @@ CREATE SERVICE_LEVEL IF NOT EXISTS oltp WITH SHARES = 1000;
|
||||
ATTACH SERVICE_LEVEL olap TO alice;
|
||||
ATTACH SERVICE_LEVEL oltp TO bob;
|
||||
```
|
||||
Note that `alternator_enforce_authorization` has to be enabled in ScyllaDB configuration.
|
||||
Note that `alternator_enforce_authorization` has to be enabled in Scylla configuration.
|
||||
|
||||
See [Authorization](##Authorization) section to learn more about roles and authorization.
|
||||
See [Workload Prioritization](../features/workload-prioritization)
|
||||
@@ -218,11 +218,11 @@ to read about Workload Prioritization in detail.
|
||||
|
||||
## Metrics
|
||||
|
||||
ScyllaDB has an advanced and extensive monitoring framework for inspecting
|
||||
and graphing hundreds of different metrics of ScyllaDB's usage and performance.
|
||||
ScyllaDB's monitoring stack, based on Grafana and Prometheus, is described in
|
||||
Scylla has an advanced and extensive monitoring framework for inspecting
|
||||
and graphing hundreds of different metrics of Scylla's usage and performance.
|
||||
Scylla's monitoring stack, based on Grafana and Prometheus, is described in
|
||||
<https://docs.scylladb.com/operating-scylla/monitoring/>.
|
||||
This monitoring stack is different from DynamoDB's offering - but ScyllaDB's
|
||||
This monitoring stack is different from DynamoDB's offering - but Scylla's
|
||||
is significantly more powerful and gives the user better insights on
|
||||
the internals of the database and its performance.
|
||||
|
||||
@@ -248,7 +248,7 @@ data in different partition order. Applications mustn't rely on that
|
||||
undocumented order.
|
||||
|
||||
Note that inside each partition, the individual items will be sorted the same
|
||||
in DynamoDB and ScyllaDB - determined by the _sort key_ defined for that table.
|
||||
in DynamoDB and Scylla - determined by the _sort key_ defined for that table.
|
||||
|
||||
---
|
||||
|
||||
@@ -274,7 +274,7 @@ is different, or can be configured in Alternator:
|
||||
## Experimental API features
|
||||
|
||||
Some DynamoDB API features are supported by Alternator, but considered
|
||||
**experimental** in this release. An experimental feature in ScyllaDB is a
|
||||
**experimental** in this release. An experimental feature in Scylla is a
|
||||
feature whose functionality is complete, or mostly complete, but it is not
|
||||
as thoroughly tested or optimized as regular features. Also, an experimental
|
||||
feature's implementation is still subject to change and upgrades may not be
|
||||
@@ -351,8 +351,8 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
|
||||
* The on-demand backup APIs are not supported: CreateBackup, DescribeBackup,
|
||||
DeleteBackup, ListBackups, RestoreTableFromBackup.
|
||||
For now, users can use ScyllaDB's existing backup solutions such as snapshots
|
||||
or ScyllaDB Manager.
|
||||
For now, users can use Scylla's existing backup solutions such as snapshots
|
||||
or Scylla Manager.
|
||||
<https://github.com/scylladb/scylla/issues/5063>
|
||||
|
||||
* Continuous backup (the ability to restore any point in time) is also not
|
||||
@@ -370,7 +370,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
<https://github.com/scylladb/scylla/issues/5068>
|
||||
|
||||
* DAX (DynamoDB Accelerator), an in-memory cache for DynamoDB, is not
|
||||
available for Alternator. Anyway, it should not be necessary - ScyllaDB's
|
||||
available for Alternator. Anyway, it should not be necessary - Scylla's
|
||||
internal cache is already rather advanced and there is no need to place
|
||||
another cache in front of it. We wrote more about this here:
|
||||
<https://www.scylladb.com/2017/07/31/database-caches-not-good/>
|
||||
@@ -384,7 +384,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
* The PartiQL syntax (SQL-like SELECT/UPDATE/INSERT/DELETE expressions)
|
||||
and the operations ExecuteStatement, BatchExecuteStatement and
|
||||
ExecuteTransaction are not yet supported.
|
||||
A user that is interested in an SQL-like syntax can consider using ScyllaDB's
|
||||
A user that is interested in an SQL-like syntax can consider using Scylla's
|
||||
CQL protocol instead.
|
||||
This feature was added to DynamoDB in November 2020.
|
||||
<https://github.com/scylladb/scylla/issues/8787>
|
||||
@@ -393,7 +393,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
which is different from AWS's. In particular, the operations
|
||||
DescribeContributorInsights, ListContributorInsights and
|
||||
UpdateContributorInsights that configure Amazon's "CloudWatch Contributor
|
||||
Insights" are not yet supported. ScyllaDB has different ways to retrieve the
|
||||
Insights" are not yet supported. Scylla has different ways to retrieve the
|
||||
same information, such as which items were accessed most often.
|
||||
<https://github.com/scylladb/scylla/issues/8788>
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ This section will guide you through the steps for setting up the cluster:
|
||||
<https://hub.docker.com/r/scylladb/scylla/>, but add to every `docker run`
|
||||
command a `-p 8000:8000` before the image name and
|
||||
`--alternator-port=8000 --alternator-write-isolation=always` at the end.
|
||||
The "alternator-port" option specifies on which port ScyllaDB will listen for
|
||||
The "alternator-port" option specifies on which port Scylla will listen for
|
||||
the (unencrypted) DynamoDB API, and the "alternator-write-isolation" chooses
|
||||
whether or not Alternator will use LWT for every write.
|
||||
For example,
|
||||
@@ -24,10 +24,10 @@ This section will guide you through the steps for setting up the cluster:
|
||||
By default, ScyllaDB run in this way will not have authentication or
|
||||
authorization enabled, and any DynamoDB API request will be honored without
|
||||
requiring them to be signed appropriately. See the
|
||||
[ScyllaDB Alternator for DynamoDB users](compatibility.md#authentication-and-authorization)
|
||||
[Scylla Alternator for DynamoDB users](compatibility.md#authentication-and-authorization)
|
||||
document on how to configure authentication and authorization.
|
||||
|
||||
## Testing ScyllaDB's DynamoDB API support:
|
||||
## Testing Scylla's DynamoDB API support:
|
||||
### Running AWS Tic Tac Toe demo app to test the cluster:
|
||||
1. Follow the instructions on the [AWS github page](https://github.com/awsdocs/amazon-dynamodb-developer-guide/blob/master/doc_source/TicTacToe.Phase1.md)
|
||||
2. Enjoy your tic-tac-toe game :-)
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
Alternator's primary goal is to be compatible with Amazon DynamoDB(TM)
|
||||
and its APIs, so that any application written to use Amazon DynamoDB could
|
||||
be run, unmodified, against ScyllaDB with Alternator enabled. The extent of
|
||||
be run, unmodified, against Scylla with Alternator enabled. The extent of
|
||||
Alternator's compatibility with DynamoDB is described in the
|
||||
[ScyllaDB Alternator for DynamoDB users](compatibility.md) document.
|
||||
[Scylla Alternator for DynamoDB users](compatibility.md) document.
|
||||
|
||||
But Alternator also adds several features and APIs that are not available in
|
||||
DynamoDB. These Alternator-specific APIs are documented here.
|
||||
@@ -15,7 +15,7 @@ _conditional_ update or an update based on the old value of an attribute.
|
||||
The read and the write should be treated as a single transaction - protected
|
||||
(_isolated_) from other parallel writes to the same item.
|
||||
|
||||
Alternator could do this isolation by using ScyllaDB's LWT (lightweight
|
||||
Alternator could do this isolation by using Scylla's LWT (lightweight
|
||||
transactions) for every write operation, but this significantly slows
|
||||
down writes, and not necessary for workloads which don't use read-modify-write
|
||||
(RMW) updates.
|
||||
@@ -41,7 +41,7 @@ isolation policy for a specific table can be overridden by tagging the table
|
||||
which need a read before the write. An attempt to use such statements
|
||||
(e.g., UpdateItem with a ConditionExpression) will result in an error.
|
||||
In this mode, the remaining write requests which are allowed - pure writes
|
||||
without a read - are performed using standard ScyllaDB writes, not LWT,
|
||||
without a read - are performed using standard Scylla writes, not LWT,
|
||||
so they are significantly faster than they would have been in the
|
||||
`always_use_lwt`, but their isolation is still correct.
|
||||
|
||||
@@ -65,19 +65,19 @@ isolation policy for a specific table can be overridden by tagging the table
|
||||
read-modify-write updates. This mode is not recommended for any use case,
|
||||
and will likely be removed in the future.
|
||||
|
||||
## Accessing system tables from ScyllaDB
|
||||
ScyllaDB exposes lots of useful information via its internal system tables,
|
||||
## Accessing system tables from Scylla
|
||||
Scylla exposes lots of useful information via its internal system tables,
|
||||
which can be found in system keyspaces: 'system', 'system\_auth', etc.
|
||||
In order to access to these tables via alternator interface,
|
||||
Scan and Query requests can use a special table name:
|
||||
`.scylla.alternator.KEYSPACE_NAME.TABLE_NAME`
|
||||
which will return results fetched from corresponding ScyllaDB table.
|
||||
which will return results fetched from corresponding Scylla table.
|
||||
|
||||
This interface can be used only to fetch data from system tables.
|
||||
Attempts to read regular tables via the virtual interface will result
|
||||
in an error.
|
||||
|
||||
Example: in order to query the contents of ScyllaDB's `system.large_rows`,
|
||||
Example: in order to query the contents of Scylla's `system.large_rows`,
|
||||
pass `TableName='.scylla.alternator.system.large_rows'` to a Query/Scan
|
||||
request.
|
||||
|
||||
@@ -113,14 +113,14 @@ connection (either active or idle), not necessarily an active request as
|
||||
in Alternator.
|
||||
|
||||
## Service discovery
|
||||
As explained in [ScyllaDB Alternator for DynamoDB users](compatibility.md),
|
||||
As explained in [Scylla Alternator for DynamoDB users](compatibility.md),
|
||||
Alternator requires a load-balancer or a client-side load-balancing library
|
||||
to distribute requests between all ScyllaDB nodes. This load-balancer needs
|
||||
to be able to _discover_ the ScyllaDB nodes. Alternator provides two special
|
||||
to distribute requests between all Scylla nodes. This load-balancer needs
|
||||
to be able to _discover_ the Scylla nodes. Alternator provides two special
|
||||
requests, `/` and `/localnodes`, to help with this service discovery, which
|
||||
we will now explain.
|
||||
|
||||
Some setups know exactly which ScyllaDB nodes were brought up, so all that
|
||||
Some setups know exactly which Scylla nodes were brought up, so all that
|
||||
remains is to periodically verify that each node is still functional. The
|
||||
easiest way to do this is to make an HTTP (or HTTPS) GET request to the node,
|
||||
with URL `/`. This is a trivial GET request and does **not** need to be
|
||||
@@ -133,10 +133,10 @@ $ curl http://localhost:8000/
|
||||
healthy: localhost:8000
|
||||
```
|
||||
|
||||
In other setups, the load balancer might not know which ScyllaDB nodes exist.
|
||||
For example, it may be possible to add or remove ScyllaDB nodes without a
|
||||
In other setups, the load balancer might not know which Scylla nodes exist.
|
||||
For example, it may be possible to add or remove Scylla nodes without a
|
||||
client-side load balancer knowing. For these setups we have the `/localnodes`
|
||||
request that can be used to discover which ScyllaDB nodes exist: A load balancer
|
||||
request that can be used to discover which Scylla nodes exist: A load balancer
|
||||
that already knows at least one live node can discover the rest by sending
|
||||
a `/localnodes` request to the known node. It's again an unauthenticated
|
||||
HTTP (or HTTPS) GET request:
|
||||
@@ -160,7 +160,7 @@ list the nodes in a specific _data center_ or _rack_. These options are
|
||||
useful for certain use cases:
|
||||
|
||||
* A `dc` option (e.g., `/localnodes?dc=dc1`) can be passed to list the
|
||||
nodes in a specific ScyllaDB data center, not the data center of the node
|
||||
nodes in a specific Scylla data center, not the data center of the node
|
||||
being contacted. This is useful when a client knows of _some_ Scylla
|
||||
node belonging to an unknown DC, but wants to list the nodes in _its_
|
||||
DC, which it knows by name.
|
||||
@@ -191,7 +191,7 @@ tells them to.
|
||||
|
||||
If you want to influence whether a specific Alternator table is created with tablets or vnodes,
|
||||
you can do this by specifying the `system:initial_tablets` tag
|
||||
(in earlier versions of ScyllaDB the tag was `experimental:initial_tablets`)
|
||||
(in earlier versions of Scylla the tag was `experimental:initial_tablets`)
|
||||
in the CreateTable operation. The value of this tag can be:
|
||||
|
||||
* Any valid integer as the value of this tag enables tablets.
|
||||
|
||||
@@ -365,7 +365,7 @@ Modifying a keyspace with tablets enabled is possible and doesn't require any sp
|
||||
|
||||
- The replication factor (RF) can be increased or decreased by at most 1 at a time. To reach the desired RF value, modify the RF repeatedly.
|
||||
- The ``ALTER`` statement rejects the ``replication_factor`` tag. List the DCs explicitly when altering a keyspace. See :ref:`NetworkTopologyStrategy <replication-strategy>`.
|
||||
- An RF change cannot be requested while another RF change is pending for the same keyspace. Attempting to execute an ``ALTER`` statement in this scenario will fail with an explicit error. Wait for the ongoing RF change to complete before issuing another ``ALTER`` statement.
|
||||
- If there's any other ongoing global topology operation, executing the ``ALTER`` statement will fail (with an explicit and specific error) and needs to be repeated.
|
||||
- The ``ALTER`` statement may take longer than the regular query timeout, and even if it times out, it will continue to execute in the background.
|
||||
- The replication strategy cannot be modified, as keyspaces with tablets only support ``NetworkTopologyStrategy``.
|
||||
- The ``ALTER`` statement will fail if it would make the keyspace :term:`RF-rack-invalid <RF-rack-valid keyspace>`.
|
||||
@@ -1043,8 +1043,6 @@ The following modes are available:
|
||||
* - ``immediate``
|
||||
- Tombstone GC is immediately performed. There is no wait time or repair requirement. This mode is useful for a table that uses the TWCS compaction strategy with no user deletes. After data is expired after TTL, ScyllaDB can perform compaction to drop the expired data immediately.
|
||||
|
||||
.. warning:: The ``repair`` mode is not supported for :term:`Colocated Tables <Colocated Table>` in this version.
|
||||
|
||||
.. _cql-per-table-tablet-options:
|
||||
|
||||
Per-table tablet options
|
||||
|
||||
@@ -102,7 +102,6 @@ Additional Information
|
||||
|
||||
To learn more about TTL, and see a hands-on example, check out `this lesson <https://university.scylladb.com/courses/data-modeling/lessons/advanced-data-modeling/topic/expiring-data-with-ttl-time-to-live/>`_ on ScyllaDB University.
|
||||
|
||||
* `Video: Managing data expiration with Time-To-Live <https://www.youtube.com/watch?v=SXkbu7mFHeA>`_
|
||||
* :doc:`Apache Cassandra Query Language (CQL) Reference </cql/index>`
|
||||
* :doc:`KB Article:How to Change gc_grace_seconds for a Table </kb/gc-grace-seconds/>`
|
||||
* :doc:`KB Article:Time to Live (TTL) and Compaction </kb/ttl-facts/>`
|
||||
|
||||
@@ -74,8 +74,6 @@ The keys and values are:
|
||||
as an indicator to which shard client wants to connect. The desired shard number
|
||||
is calculated as: `desired_shard_no = client_port % SCYLLA_NR_SHARDS`.
|
||||
Its value is a decimal representation of type `uint16_t`, by default `19142`.
|
||||
- `CLIENT_OPTIONS` is a string containing a JSON object representation that
|
||||
contains CQL Driver configuration, e.g. load balancing policy, retry policy, timeouts, etc.
|
||||
|
||||
Currently, one `SCYLLA_SHARDING_ALGORITHM` is defined,
|
||||
`biased-token-round-robin`. To apply the algorithm,
|
||||
@@ -238,26 +236,3 @@ the same mechanism for other protocol versions, such as CQLv4.
|
||||
|
||||
The feature is identified by the `SCYLLA_USE_METADATA_ID` key, which is meant to be sent
|
||||
in the SUPPORTED message.
|
||||
|
||||
## Sending the CLIENT_ROUTES_CHANGE event
|
||||
|
||||
This extension allows a driver to update its connections when the
|
||||
`system.client_routes` table is modified.
|
||||
|
||||
In some network topologies a specific mapping of addresses and ports is required (e.g.
|
||||
to support Private Link). This mapping can change dynamically even when no nodes are
|
||||
added or removed. The driver must adapt to those changes; otherwise connectivity can be
|
||||
lost.
|
||||
|
||||
The extension is implemented as a new `EVENT` type: `CLIENT_ROUTES_CHANGE`. The event
|
||||
body consists of:
|
||||
- [string] change
|
||||
- [string list] connection_ids
|
||||
- [string list] host_ids
|
||||
|
||||
There is only one change value: `UPDATE_NODES`, which means at least one client route
|
||||
was inserted, updated, or deleted.
|
||||
|
||||
Events already have a subscription mechanism similar to protocol extensions (that is,
|
||||
the driver only receives the events it explicitly subscribed to), so no additional
|
||||
`cql_protocol_extension` key is introduced for this feature.
|
||||
|
||||
@@ -86,7 +86,6 @@ stateDiagram-v2
|
||||
de_left_token_ring --> [*]
|
||||
}
|
||||
state removing {
|
||||
re_left_token_ring : left_token_ring
|
||||
re_tablet_draining : tablet_draining
|
||||
re_tablet_migration : tablet_migration
|
||||
re_write_both_read_old : write_both_read_old
|
||||
@@ -99,8 +98,7 @@ stateDiagram-v2
|
||||
re_tablet_draining --> re_write_both_read_old
|
||||
re_write_both_read_old --> re_write_both_read_new: streaming completed
|
||||
re_write_both_read_old --> re_rollback_to_normal: rollback
|
||||
re_write_both_read_new --> re_left_token_ring
|
||||
re_left_token_ring --> [*]
|
||||
re_write_both_read_new --> [*]
|
||||
}
|
||||
rebuilding --> normal: streaming completed
|
||||
decommissioning --> left: operation succeeded
|
||||
@@ -124,10 +122,9 @@ Note that these are not all states, as there are other states specific to tablet
|
||||
Writes to vnodes-based tables are going to both new and old replicas (new replicas means calculated according
|
||||
to modified token ring), reads are using old replicas.
|
||||
- `write_both_read_new` - as above, but reads are using new replicas.
|
||||
- `left_token_ring` - the decommissioning or removing node left the token ring, but we still need to wait until other
|
||||
nodes observe it and stop sending writes to this node. For decommission, we tell the node to shut down,
|
||||
then remove it from group 0. For removenode, the node is already down, so we skip the shutdown step.
|
||||
We also use this state to rollback a failed bootstrap or decommission.
|
||||
- `left_token_ring` - the decommissioning node left the token ring, but we still need to wait until other
|
||||
nodes observe it and stop sending writes to this node. Then, we tell the node to shut down and remove
|
||||
it from group 0. We also use this state to rollback a failed bootstrap or decommission.
|
||||
- `rollback_to_normal` - the decommission or removenode operation failed. Rollback the operation by
|
||||
moving the node we tried to decommission/remove back to the normal state.
|
||||
- `lock` - the topology stays in this state until externally changed (to null state), preventing topology
|
||||
@@ -144,9 +141,7 @@ reads that started before this point exist in the system. Finally we remove the
|
||||
transitioning state.
|
||||
|
||||
Decommission, removenode and replace work similarly, except they don't go through
|
||||
`commit_cdc_generation`. Both decommission and removenode go through the
|
||||
`left_token_ring` state to run a global barrier ensuring all nodes are aware
|
||||
of the topology change before the operation completes.
|
||||
`commit_cdc_generation`.
|
||||
|
||||
The state machine may also go only through the `commit_cdc_generation` state
|
||||
after getting a request from the user to create a new CDC generation if the
|
||||
|
||||
@@ -41,12 +41,12 @@ Unless the task was aborted, the worker will eventually reply that the task was
|
||||
it temporarily saves a list of ids of finished tasks and removes those tasks from group0 state (permanently marking them as finished) in 200ms intervals. (*)
|
||||
This batching of removing finished tasks is done in order to reduce the number of generated group0 operations.
|
||||
|
||||
On the other hand, view building tasks can also be aborted due to 2 main reasons:
|
||||
On the other hand, view building tasks can also be aborted due to 2 main reasons:
|
||||
- a keyspace/view was dropped
|
||||
- tablet operations (see [tablet operations section](#tablet-operations))
|
||||
In the first case we simply delete relevant view building tasks as they are no longer needed.
|
||||
But if a task needs to be aborted due to tablet operation, we're firstly setting the `aborted` flag to true. We need to do this because we need the task information
|
||||
to create new adjusted tasks (if the operation succeeded) or rollback them (if the operation failed).
|
||||
But if a task needs to be aborted due to a tablet operation, we're firstly setting the `aborted` flag to true. We need to do this because we need the task information
|
||||
to create new adjusted tasks (if the operation succeeded) or roll them back (if the operation failed).
|
||||
Once a task is aborted by setting the flag, this cannot be revoked, so rolling back a task means creating its duplicate and removing the original task.
|
||||
|
||||
(*) - Because there is a time gap between when the coordinator learns that a task is finished (from the RPC response) and when the task is marked as completed,
|
||||
|
||||
@@ -17,7 +17,6 @@ This document highlights ScyllaDB's key data modeling features.
|
||||
Workload Prioritization </features/workload-prioritization>
|
||||
Backup and Restore </features/backup-and-restore>
|
||||
Incremental Repair </features/incremental-repair/>
|
||||
Vector Search </features/vector-search/>
|
||||
|
||||
.. panel-box::
|
||||
:title: ScyllaDB Features
|
||||
@@ -44,5 +43,3 @@ This document highlights ScyllaDB's key data modeling features.
|
||||
* :doc:`Incremental Repair </features/incremental-repair/>` provides a much more
|
||||
efficient and lightweight approach to maintaining data consistency by
|
||||
repairing only the data that has changed since the last repair.
|
||||
* :doc:`Vector Search in ScyllaDB </features/vector-search/>` enables
|
||||
similarity-based queries on vector embeddings.
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
=================================
|
||||
Vector Search in ScyllaDB
|
||||
=================================
|
||||
|
||||
.. note::
|
||||
|
||||
This feature is currently available only in `ScyllaDB Cloud <https://cloud.docs.scylladb.com/>`_.
|
||||
|
||||
What Is Vector Search
|
||||
-------------------------
|
||||
|
||||
Vector Search enables similarity-based queries over high-dimensional data,
|
||||
such as text, images, audio, or user behavior. Instead of searching for exact
|
||||
matches, it allows applications to find items that are semantically similar to
|
||||
a given input.
|
||||
|
||||
To do this, Vector Search works on vector embeddings, which are numerical
|
||||
representations of data that capture semantic meaning. This enables queries
|
||||
such as:
|
||||
|
||||
* “Find documents similar to this paragraph”
|
||||
* “Find products similar to what the user just viewed”
|
||||
* “Find previous tickets related to this support request”
|
||||
|
||||
Rather than relying on exact values or keywords, Vector Search returns results
|
||||
based on distance or similarity between vectors. This capability is
|
||||
increasingly used in modern workloads such as AI-powered search, recommendation
|
||||
systems, and retrieval-augmented generation (RAG).
|
||||
|
||||
Why Vector Search Matters
|
||||
------------------------------------
|
||||
|
||||
Many applications already rely on ScyllaDB for high throughput, low and
|
||||
predictable latency, and large-scale data storage.
|
||||
|
||||
Vector Search complements these strengths by enabling new classes of workloads,
|
||||
including:
|
||||
|
||||
* Semantic search over text or documents
|
||||
* Recommendations based on user or item similarity
|
||||
* AI and ML applications, including RAG pipelines
|
||||
* Anomaly and pattern detection
|
||||
|
||||
With Vector Search, ScyllaDB can serve as the similarity search backend for
|
||||
AI-driven applications.
|
||||
|
||||
Availability
|
||||
--------------
|
||||
|
||||
Vector Search is currently available only in ScyllaDB Cloud, the fully managed
|
||||
ScyllaDB service.
|
||||
|
||||
|
||||
👉 For details on using Vector Search, refer to the
|
||||
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/index.html>`_.
|
||||
@@ -20,10 +20,7 @@ You can run your ScyllaDB workloads on AWS, GCE, and Azure using a ScyllaDB imag
|
||||
Amazon Web Services (AWS)
|
||||
-----------------------------
|
||||
|
||||
The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`,
|
||||
:ref:`i4i <system-requirements-i4i-instances>`, :ref:`i7i <system-requirements-i7i-instances>`,
|
||||
:ref:`i7ie <system-requirements-i7ie-instances>`, :ref:`i8g<system-requirements-i8g-instances>`,
|
||||
and :ref:`i8ge <system-requirements-i8ge-instances>`.
|
||||
The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`, :ref:`i4i <system-requirements-i4i-instances>`, :ref:`i7i <system-requirements-i7i-instances>`, and :ref:`i7ie <system-requirements-i7ie-instances>`.
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -198,118 +195,6 @@ All i7i instances have the following specs:
|
||||
|
||||
See `Amazon EC2 I7i Instances <https://aws.amazon.com/ec2/instance-types/i7i/>`_ for details.
|
||||
|
||||
|
||||
.. _system-requirements-i8g-instances:
|
||||
|
||||
i8g instances
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
The following i8g instances are supported.
|
||||
|
||||
.. list-table::
|
||||
:widths: 30 20 20 30
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- vCPU
|
||||
- Mem (GiB)
|
||||
- Storage (GB)
|
||||
* - i8g.large
|
||||
- 2
|
||||
- 16
|
||||
- 1 x 468 GB
|
||||
* - i8g.xlarge
|
||||
- 4
|
||||
- 32
|
||||
- 1 x 937 GB
|
||||
* - i8g.2xlarge
|
||||
- 8
|
||||
- 64
|
||||
- 1 x 1,875 GB
|
||||
* - i8g.4xlarge
|
||||
- 16
|
||||
- 128
|
||||
- 1 x 3,750 GB
|
||||
* - i8g.8xlarge
|
||||
- 32
|
||||
- 256
|
||||
- 2 x 3,750 GB
|
||||
* - i8g.12xlarge
|
||||
- 48
|
||||
- 384
|
||||
- 3 x 3,750 GB
|
||||
* - i8g.16xlarge
|
||||
- 64
|
||||
- 512
|
||||
- 4 x 3,750 GB
|
||||
|
||||
All i8g instances have the following specs:
|
||||
|
||||
* Powered by AWS Graviton4 processors
|
||||
* 3rd generation AWS Nitro SSD storage
|
||||
* DDR5-5600 memory for improved throughput
|
||||
* Up to 100 Gbps of networking bandwidth and up to 60 Gbps of bandwidth to
|
||||
Amazon Elastic Block Store (EBS)
|
||||
* Instance sizes offer up to 45 TB of total local NVMe instance storage
|
||||
|
||||
See `Amazon EC2 I8g Instances <https://aws.amazon.com/ec2/instance-types/i8g/>`_ for details.
|
||||
|
||||
.. _system-requirements-i8ge-instances:
|
||||
|
||||
i8ge instances
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
The following i8ge instances are supported.
|
||||
|
||||
.. list-table::
|
||||
:widths: 30 20 20 30
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- vCPU
|
||||
- Mem (GiB)
|
||||
- Storage (GB)
|
||||
* - i8ge.large
|
||||
- 2
|
||||
- 16
|
||||
- 1 x 1,250 GB
|
||||
* - i8ge.xlarge
|
||||
- 4
|
||||
- 32
|
||||
- 1 x 2,500 GB
|
||||
* - i8ge.2xlarge
|
||||
- 8
|
||||
- 64
|
||||
- 2 x 2,500 GB
|
||||
* - i8ge.3xlarge
|
||||
- 12
|
||||
- 96
|
||||
- 1 x 7,500 GB
|
||||
* - i8ge.6xlarge
|
||||
- 24
|
||||
- 192
|
||||
- 2 x 7,500 GB
|
||||
* - i8ge.12xlarge
|
||||
- 48
|
||||
- 384
|
||||
- 4 x 7,500 GB
|
||||
* - i8ge.18xlarge
|
||||
- 72
|
||||
- 576
|
||||
- 6 x 7,500 GB
|
||||
|
||||
All i8ge instances have the following specs:
|
||||
|
||||
* Powered by AWS Graviton4 processors
|
||||
* 3rd generation AWS Nitro SSD storage
|
||||
* DDR5-5600 memory for improved throughput
|
||||
* Up to 300 Gbps of networking bandwidth and up to 60 Gbps of bandwidth to
|
||||
Amazon Elastic Block Store (EBS)
|
||||
* Instance sizes offer up to 120 TB of total local NVMe instance storage
|
||||
|
||||
See `Amazon EC2 I8g Instances <https://aws.amazon.com/ec2/instance-types/i8g/>`_ for details.
|
||||
|
||||
|
||||
Im4gn and Is4gen instances
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
ScyllaDB supports Arm-based Im4gn and Is4gen instances. See `Amazon EC2 Im4gn and Is4gen instances <https://aws.amazon.com/ec2/instance-types/i4g/>`_ for specification details.
|
||||
|
||||
@@ -25,7 +25,8 @@ Getting Started
|
||||
:id: "getting-started"
|
||||
:class: my-panel
|
||||
|
||||
* :doc:`Install ScyllaDB </getting-started/install-scylla/index/>`
|
||||
* `Install ScyllaDB (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/#core>`_ - Links to the ScyllaDB Download Center
|
||||
|
||||
* :doc:`Configure ScyllaDB </getting-started/system-configuration/>`
|
||||
* :doc:`Run ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`
|
||||
* :doc:`Create a ScyllaDB Cluster - Single Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster/>`
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
ScyllaDB Housekeeping and how to disable it
|
||||
============================================
|
||||
|
||||
It is always recommended to run the latest stable version of ScyllaDB.
|
||||
It is always recommended to run the latest version of ScyllaDB.
|
||||
The latest stable release version is always available from the `Download Center <https://www.scylladb.com/download/>`_.
|
||||
|
||||
When you install ScyllaDB, it installs by default two services: **scylla-housekeeping-restart** and **scylla-housekeeping-daily**. These services check for the latest ScyllaDB version and prompt the user if they are using a version that is older than what is publicly available.
|
||||
Information about your ScyllaDB deployment, including the ScyllaDB version currently used, as well as unique user and server identifiers, are collected by a centralized service.
|
||||
|
||||
@@ -9,8 +9,6 @@ Running ``cluster repair`` on a **single node** synchronizes all data on all nod
|
||||
To synchronize all data in clusters that have both tablets-based and vnodes-based keyspaces, run :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair/>` on **all**
|
||||
of the nodes in the cluster, and :doc:`nodetool cluster repair </operating-scylla/nodetool-commands/cluster/repair/>` on **any** of the nodes in the cluster.
|
||||
|
||||
.. warning:: :term:`Colocated Tables <Colocated Table>` cannot be synchronized using cluster repair in this version.
|
||||
|
||||
To check if a keyspace enables tablets, use:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
95
docs/poetry.lock
generated
95
docs/poetry.lock
generated
@@ -2,35 +2,36 @@
|
||||
|
||||
[[package]]
|
||||
name = "alabaster"
|
||||
version = "1.0.0"
|
||||
version = "0.7.16"
|
||||
description = "A light, configurable Sphinx theme"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b"},
|
||||
{file = "alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e"},
|
||||
{file = "alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92"},
|
||||
{file = "alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyio"
|
||||
version = "4.12.0"
|
||||
version = "4.11.0"
|
||||
description = "High-level concurrency and networking framework on top of asyncio or Trio"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb"},
|
||||
{file = "anyio-4.12.0.tar.gz", hash = "sha256:73c693b567b0c55130c104d0b43a9baf3aa6a31fc6110116509f27bf75e21ec0"},
|
||||
{file = "anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc"},
|
||||
{file = "anyio-4.11.0.tar.gz", hash = "sha256:82a8d0b81e318cc5ce71a5f1f8b5c4e63619620b63141ef8c995fa0db95a57c4"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
|
||||
idna = ">=2.8"
|
||||
sniffio = ">=1.1"
|
||||
typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""}
|
||||
|
||||
[package.extras]
|
||||
trio = ["trio (>=0.31.0) ; python_version < \"3.10\"", "trio (>=0.32.0) ; python_version >= \"3.10\""]
|
||||
trio = ["trio (>=0.31.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "babel"
|
||||
@@ -49,14 +50,14 @@ dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)"
|
||||
|
||||
[[package]]
|
||||
name = "beartype"
|
||||
version = "0.22.8"
|
||||
version = "0.22.6"
|
||||
description = "Unbearably fast near-real-time pure-Python runtime-static type-checker."
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "beartype-0.22.8-py3-none-any.whl", hash = "sha256:b832882d04e41a4097bab9f63e6992bc6de58c414ee84cba9b45b67314f5ab2e"},
|
||||
{file = "beartype-0.22.8.tar.gz", hash = "sha256:b19b21c9359722ee3f7cc433f063b3e13997b27ae8226551ea5062e621f61165"},
|
||||
{file = "beartype-0.22.6-py3-none-any.whl", hash = "sha256:0584bc46a2ea2a871509679278cda992eadde676c01356ab0ac77421f3c9a093"},
|
||||
{file = "beartype-0.22.6.tar.gz", hash = "sha256:97fbda69c20b48c5780ac2ca60ce3c1bb9af29b3a1a0216898ffabdd523e48f4"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
@@ -69,18 +70,18 @@ test-tox-coverage = ["coverage (>=5.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.14.3"
|
||||
version = "4.14.2"
|
||||
description = "Screen-scraping library"
|
||||
optional = false
|
||||
python-versions = ">=3.7.0"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb"},
|
||||
{file = "beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86"},
|
||||
{file = "beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515"},
|
||||
{file = "beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
soupsieve = ">=1.6.1"
|
||||
soupsieve = ">1.2"
|
||||
typing-extensions = ">=4.0.0"
|
||||
|
||||
[package.extras]
|
||||
@@ -801,6 +802,18 @@ files = [
|
||||
{file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sniffio"
|
||||
version = "1.3.1"
|
||||
description = "Sniff out which async library your code is running under"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
|
||||
{file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "snowballstemmer"
|
||||
version = "3.0.1"
|
||||
@@ -827,18 +840,18 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "sphinx"
|
||||
version = "8.1.3"
|
||||
version = "7.4.7"
|
||||
description = "Python documentation generator"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2"},
|
||||
{file = "sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927"},
|
||||
{file = "sphinx-7.4.7-py3-none-any.whl", hash = "sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239"},
|
||||
{file = "sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
alabaster = ">=0.7.14"
|
||||
alabaster = ">=0.7.14,<0.8.0"
|
||||
babel = ">=2.13"
|
||||
colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\""}
|
||||
docutils = ">=0.20,<0.22"
|
||||
@@ -848,17 +861,17 @@ packaging = ">=23.0"
|
||||
Pygments = ">=2.17"
|
||||
requests = ">=2.30.0"
|
||||
snowballstemmer = ">=2.2"
|
||||
sphinxcontrib-applehelp = ">=1.0.7"
|
||||
sphinxcontrib-devhelp = ">=1.0.6"
|
||||
sphinxcontrib-htmlhelp = ">=2.0.6"
|
||||
sphinxcontrib-jsmath = ">=1.0.1"
|
||||
sphinxcontrib-qthelp = ">=1.0.6"
|
||||
sphinxcontrib-applehelp = "*"
|
||||
sphinxcontrib-devhelp = "*"
|
||||
sphinxcontrib-htmlhelp = ">=2.0.0"
|
||||
sphinxcontrib-jsmath = "*"
|
||||
sphinxcontrib-qthelp = "*"
|
||||
sphinxcontrib-serializinghtml = ">=1.1.9"
|
||||
tomli = {version = ">=2", markers = "python_version < \"3.11\""}
|
||||
|
||||
[package.extras]
|
||||
docs = ["sphinxcontrib-websupport"]
|
||||
lint = ["flake8 (>=6.0)", "mypy (==1.11.1)", "pyright (==1.1.384)", "pytest (>=6.0)", "ruff (==0.6.9)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-Pillow (==10.2.0.20240822)", "types-Pygments (==2.18.0.20240506)", "types-colorama (==0.4.15.20240311)", "types-defusedxml (==0.7.0.20240218)", "types-docutils (==0.21.0.20241005)", "types-requests (==2.32.0.20240914)", "types-urllib3 (==1.26.25.14)"]
|
||||
lint = ["flake8 (>=6.0)", "importlib-metadata (>=6.0)", "mypy (==1.10.1)", "pytest (>=6.0)", "ruff (==0.5.2)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-docutils (==0.21.0.20240711)", "types-requests (>=2.30.0)"]
|
||||
test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=8.0)", "setuptools (>=70.0)", "typing_extensions (>=4.9)"]
|
||||
|
||||
[[package]]
|
||||
@@ -988,14 +1001,13 @@ test = ["tox"]
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-scylladb-markdown"
|
||||
version = "0.1.4"
|
||||
version = "0.1.3"
|
||||
description = "Sphinx extension for ScyllaDB documentation with enhanced Markdown support through MystParser and recommonmark."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sphinx_scylladb_markdown-0.1.4-py3-none-any.whl", hash = "sha256:598753e01cf159d4698eb1a707958828446e21749038d3d42c5b9c7e86eda6e4"},
|
||||
{file = "sphinx_scylladb_markdown-0.1.4.tar.gz", hash = "sha256:9db3ae0dcf7c3519262da65e48c7f9e4db0ad1ce9c5f874864ea218f4cbc4c68"},
|
||||
{file = "sphinx_scylladb_markdown-0.1.3-py3-none-any.whl", hash = "sha256:f20160b4aadf4c8cf95637f0a544121954b792914ab6ec05b67cae75e20a5566"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -1047,25 +1059,24 @@ dev = ["build", "flake8", "pre-commit", "pytest", "sphinx", "sphinx-last-updated
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-substitution-extensions"
|
||||
version = "2025.11.17"
|
||||
version = "2025.1.2"
|
||||
description = "Extensions for Sphinx which allow for substitutions."
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sphinx_substitution_extensions-2025.11.17-py2.py3-none-any.whl", hash = "sha256:ac18455bdc8324b337b0fe7498c1c0d0b1cb65c74d131459be4dea9edb6abbef"},
|
||||
{file = "sphinx_substitution_extensions-2025.11.17.tar.gz", hash = "sha256:aae17f8db9efc3d454a304373ae3df763f8739e05e0b98d5381db46f6d250b27"},
|
||||
{file = "sphinx_substitution_extensions-2025.1.2-py2.py3-none-any.whl", hash = "sha256:ff14f40e4393bd7434a196badb8d47983355d9755af884b902e3023fb456b958"},
|
||||
{file = "sphinx_substitution_extensions-2025.1.2.tar.gz", hash = "sha256:53b8d394d5098a09aef36bc687fa310aeb28466319d2c750e996e46400fb2474"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
beartype = ">=0.18.5"
|
||||
docutils = ">=0.19"
|
||||
myst-parser = ">=4.0.0"
|
||||
sphinx = ">=8.1.0"
|
||||
sphinx = ">=7.3.5"
|
||||
|
||||
[package.extras]
|
||||
dev = ["actionlint-py (==1.7.8.24)", "check-manifest (==0.51)", "deptry (==0.24.0)", "doc8 (==2.0.0)", "doccmd (==2025.11.8.1)", "docformatter (==1.7.7)", "interrogate (==1.7.0)", "mypy-strict-kwargs (==2025.4.3)", "mypy[faster-cache] (==1.18.2)", "pre-commit (==4.4.0)", "pylint[spelling] (==4.0.3)", "pyproject-fmt (==2.11.1)", "pyright (==1.1.407)", "pyroma (==5.0)", "pytest (==9.0.1)", "pytest-cov (==7.0.0)", "ruff (==0.14.5)", "shellcheck-py (==0.11.0.1)", "shfmt-py (==3.12.0.2)", "sphinx-lint (==1.0.1)", "sphinx-toolbox (==4.0.0)", "types-docutils (==0.22.2.20251006)", "vulture (==2.14)", "yamlfix (==1.19.0)"]
|
||||
release = ["check-wheel-contents (==0.6.3)"]
|
||||
dev = ["actionlint-py (==1.7.5.21)", "check-manifest (==0.50)", "deptry (==0.21.2)", "doc8 (==1.1.2)", "doccmd (==2024.12.26)", "docformatter (==1.7.5)", "interrogate (==1.7.0)", "mypy-strict-kwargs (==2024.12.25)", "mypy[faster-cache] (==1.14.1)", "myst-parser (==4.0.0)", "pre-commit (==4.0.1)", "pyenchant (==3.3.0rc1)", "pylint (==3.3.3)", "pyproject-fmt (==2.5.0)", "pyright (==1.1.391)", "pyroma (==4.2)", "pytest (==8.3.4)", "pytest-cov (==6.0.0)", "ruff (==0.8.4)", "shellcheck-py (==0.10.0.1)", "shfmt-py (==3.7.0.1)", "sphinx-toolbox (==3.8.1)", "sphinx[test] (==8.1.3)", "types-docutils (==0.21.0.20241128)", "vulture (==2.14)", "yamlfix (==1.17.0)"]
|
||||
release = ["check-wheel-contents (==0.6.1)"]
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-tabs"
|
||||
@@ -1352,21 +1363,21 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "2.6.2"
|
||||
version = "2.5.0"
|
||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"},
|
||||
{file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"},
|
||||
{file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"},
|
||||
{file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
brotli = ["brotli (>=1.2.0) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=1.2.0.0) ; platform_python_implementation != \"CPython\""]
|
||||
brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""]
|
||||
h2 = ["h2 (>=4,<5)"]
|
||||
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
|
||||
zstd = ["backports-zstd (>=1.0.0) ; python_version < \"3.14\""]
|
||||
zstd = ["zstandard (>=0.18.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "uvicorn"
|
||||
@@ -1592,4 +1603,4 @@ files = [
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "9a17caa38b3c88f3fe3d1a60fdb73a96aa12ff1e30ecb00e2f9249e7ba9f859c"
|
||||
content-hash = "0ae673106f45d3465cbdabbf511e165ca44feadd34d7753f2e68093afaa95c79"
|
||||
|
||||
@@ -12,10 +12,10 @@ redirects_cli ="^0.1.3"
|
||||
sphinx-scylladb-theme = "^1.8.10"
|
||||
sphinx-sitemap = "^2.6.0"
|
||||
sphinx-autobuild = "^2024.4.19"
|
||||
Sphinx = "^8.0.0"
|
||||
Sphinx = "^7.3.7"
|
||||
sphinx-multiversion-scylla = "^0.3.4"
|
||||
sphinxcontrib-datatemplates = "^0.9.2"
|
||||
sphinx-scylladb-markdown = "^0.1.4"
|
||||
sphinx-scylladb-markdown = "^0.1.2"
|
||||
sphinx_collapse ="^0.1.3"
|
||||
|
||||
[build-system]
|
||||
|
||||
@@ -202,7 +202,3 @@ Glossary
|
||||
The name comes from two basic operations, multiply (MU) and rotate (R), used in its inner loop.
|
||||
The MurmurHash3 version used in ScyllaDB originated from `Apache Cassandra <https://commons.apache.org/proper/commons-codec/apidocs/org/apache/commons/codec/digest/MurmurHash3.html>`_, and is **not** identical to the `official MurmurHash3 calculation <https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/MurmurHash.java#L31-L33>`_. More `here <https://github.com/russss/murmur3-cassandra>`_.
|
||||
|
||||
Colocated Table
|
||||
An internal table of a special type in a :doc:`tablets </architecture/tablets>` enabled keyspace that is colocated with another base table, meaning it always has the same tablet replicas as the base table.
|
||||
Current types of colocated tables include CDC log tables, local indexes, and materialized views that have the same partition key as their base table.
|
||||
|
||||
|
||||
@@ -816,6 +816,7 @@ public:
|
||||
future<data_sink> wrap_sink(const sstables::sstable& sst, sstables::component_type type, data_sink sink) override {
|
||||
switch (type) {
|
||||
case sstables::component_type::Scylla:
|
||||
case sstables::component_type::TemporaryScylla:
|
||||
case sstables::component_type::TemporaryTOC:
|
||||
case sstables::component_type::TOC:
|
||||
co_return sink;
|
||||
@@ -844,6 +845,7 @@ public:
|
||||
sstables::component_type type,
|
||||
data_source src) override {
|
||||
switch (type) {
|
||||
case sstables::component_type::TemporaryScylla:
|
||||
case sstables::component_type::Scylla:
|
||||
case sstables::component_type::TemporaryTOC:
|
||||
case sstables::component_type::TOC:
|
||||
|
||||
@@ -176,8 +176,6 @@ public:
|
||||
gms::feature rack_list_rf { *this, "RACK_LIST_RF"sv };
|
||||
gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
|
||||
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
|
||||
gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
|
||||
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
|
||||
public:
|
||||
|
||||
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
|
||||
|
||||
15
main.cc
15
main.cc
@@ -23,7 +23,6 @@
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/signal.hh>
|
||||
#include <seastar/core/timer.hh>
|
||||
#include "service/client_routes.hh"
|
||||
#include "service/qos/raft_service_level_distributed_data_accessor.hh"
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "tasks/task_manager.hh"
|
||||
@@ -1796,13 +1795,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
auth_cache.stop().get();
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "initializing client routes service");
|
||||
static sharded<service::client_routes_service> client_routes;
|
||||
client_routes.start(std::ref(stop_signal.as_sharded_abort_source()), std::ref(feature_service), std::ref(group0_client), std::ref(qp), std::ref(lifecycle_notifier)).get();
|
||||
auto stop_client_routes = defer_verbose_shutdown("client_routes", [&] {
|
||||
client_routes.stop().get();
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "initializing storage service");
|
||||
debug::the_storage_service = &ss;
|
||||
ss.start(std::ref(stop_signal.as_sharded_abort_source()),
|
||||
@@ -1811,7 +1803,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
std::ref(messaging), std::ref(repair),
|
||||
std::ref(stream_manager), std::ref(lifecycle_notifier), std::ref(bm), std::ref(snitch),
|
||||
std::ref(tablet_allocator), std::ref(cdc_generation_service), std::ref(view_builder), std::ref(view_building_worker), std::ref(qp), std::ref(sl_controller),
|
||||
std::ref(auth_cache), std::ref(client_routes),
|
||||
std::ref(auth_cache),
|
||||
std::ref(tsm), std::ref(vbsm), std::ref(task_manager), std::ref(gossip_address_map),
|
||||
compression_dict_updated_callback,
|
||||
only_on_shard0(&*disk_space_monitor_shard0)
|
||||
@@ -2199,11 +2191,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
});
|
||||
}).get();
|
||||
|
||||
api::set_server_client_routes(ctx, client_routes).get();
|
||||
auto stop_cr_api = defer_verbose_shutdown("client routes API", [&ctx] {
|
||||
api::unset_server_client_routes(ctx).get();
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "join cluster");
|
||||
// Allow abort during join_cluster since bootstrap or replace
|
||||
// can take a long time.
|
||||
|
||||
@@ -56,16 +56,33 @@ static tasks::task_manager::task_state get_state(const db::system_keyspace::topo
|
||||
}
|
||||
}
|
||||
|
||||
static future<db::system_keyspace::topology_requests_entries> get_entries(db::system_keyspace& sys_ks, std::chrono::seconds ttl) {
|
||||
return sys_ks.get_node_ops_request_entries(db_clock::now() - ttl);
|
||||
static std::set<tasks::task_id> get_pending_ids(service::topology& topology) {
|
||||
std::set<tasks::task_id> ids;
|
||||
for (auto& request : topology.requests) {
|
||||
ids.emplace(topology.find(request.first)->second.request_id);
|
||||
}
|
||||
return ids;
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry_opt = co_await _ss._sys_ks.local().get_topology_request_entry_opt(id.uuid());
|
||||
if (!entry_opt) {
|
||||
static future<db::system_keyspace::topology_requests_entries> get_entries(db::system_keyspace& sys_ks, service::topology& topology, std::chrono::seconds ttl) {
|
||||
// Started requests.
|
||||
auto entries = co_await sys_ks.get_node_ops_request_entries(db_clock::now() - ttl);
|
||||
|
||||
// Pending requests.
|
||||
for (auto& id : get_pending_ids(topology)) {
|
||||
entries.try_emplace(id.uuid(), db::system_keyspace::topology_requests_entry{});
|
||||
}
|
||||
|
||||
co_return entries;
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status_helper(tasks::task_id id, tasks::virtual_task_hint hint) const {
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry(id.uuid(), false);
|
||||
auto started = entry.id;
|
||||
service::topology& topology = _ss._topology_state_machine._topology;
|
||||
if (!started && !get_pending_ids(topology).contains(id)) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
auto& entry = *entry_opt;
|
||||
co_return tasks::task_status{
|
||||
.task_id = id,
|
||||
.type = request_type_to_task_type(entry.request_type),
|
||||
@@ -84,7 +101,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
|
||||
.entity = "",
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
|
||||
.children = started ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{}
|
||||
};
|
||||
}
|
||||
|
||||
@@ -106,22 +123,26 @@ future<std::optional<tasks::virtual_task_hint>> node_ops_virtual_task::contains(
|
||||
}
|
||||
}
|
||||
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(task_id.uuid());
|
||||
co_return entry && std::holds_alternative<service::topology_request>(entry->request_type) ? empty_hint : std::nullopt;
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry(task_id.uuid(), false);
|
||||
co_return bool(entry.id) && std::holds_alternative<service::topology_request>(entry.request_type) ? empty_hint : std::nullopt;
|
||||
}
|
||||
|
||||
future<tasks::is_abortable> node_ops_virtual_task::is_abortable(tasks::virtual_task_hint) const {
|
||||
return make_ready_future<tasks::is_abortable>(tasks::is_abortable::no);
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
return get_status_helper(id, std::move(hint));
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::wait(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry = co_await get_status(id, hint);
|
||||
auto entry = co_await get_status_helper(id, hint);
|
||||
if (!entry) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
co_await _ss.wait_for_topology_request_completion(id.uuid(), false);
|
||||
co_return co_await get_status(id, std::move(hint));
|
||||
co_return co_await get_status_helper(id, std::move(hint));
|
||||
}
|
||||
|
||||
future<> node_ops_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hint) noexcept {
|
||||
@@ -130,7 +151,8 @@ future<> node_ops_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hin
|
||||
|
||||
future<std::vector<tasks::task_stats>> node_ops_virtual_task::get_stats() {
|
||||
db::system_keyspace& sys_ks = _ss._sys_ks.local();
|
||||
co_return std::ranges::to<std::vector<tasks::task_stats>>(co_await get_entries(sys_ks, get_task_manager().get_user_task_ttl())
|
||||
service::topology& topology = _ss._topology_state_machine._topology;
|
||||
co_return std::ranges::to<std::vector<tasks::task_stats>>(co_await get_entries(sys_ks, topology, get_task_manager().get_user_task_ttl())
|
||||
| std::views::transform([] (const auto& e) {
|
||||
auto id = e.first;
|
||||
auto& entry = e.second;
|
||||
|
||||
@@ -39,6 +39,8 @@ public:
|
||||
virtual future<std::optional<tasks::task_status>> wait(tasks::task_id id, tasks::virtual_task_hint hint) override;
|
||||
virtual future<> abort(tasks::task_id id, tasks::virtual_task_hint hint) noexcept override;
|
||||
virtual future<std::vector<tasks::task_stats>> get_stats() override;
|
||||
private:
|
||||
future<std::optional<tasks::task_status>> get_status_helper(tasks::task_id id, tasks::virtual_task_hint hint) const;
|
||||
};
|
||||
|
||||
class streaming_task_impl : public tasks::task_manager::task::impl {
|
||||
|
||||
@@ -176,7 +176,7 @@ void fsm::become_leader() {
|
||||
|
||||
_last_election_time = _clock.now();
|
||||
_ping_leader = false;
|
||||
// a new leader needs to commit at least one entry to make sure that
|
||||
// a new leader needs to commit at lease one entry to make sure that
|
||||
// all existing entries in its log are committed as well. Also it should
|
||||
// send append entries RPC as soon as possible to establish its leadership
|
||||
// (3.4). Do both of those by committing a dummy entry.
|
||||
|
||||
@@ -2793,7 +2793,6 @@ future<> database::flush_all_tables() {
|
||||
});
|
||||
_all_tables_flushed_at = db_clock::now();
|
||||
co_await _commitlog->wait_for_pending_deletes();
|
||||
dblog.info("Forcing new commitlog segment and flushing all tables complete");
|
||||
}
|
||||
|
||||
future<db_clock::time_point> database::get_all_tables_flushed_at(sharded<database>& sharded_db) {
|
||||
|
||||
@@ -3385,15 +3385,16 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
continue;
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
|
||||
while (auto de = lister.get().get()) {
|
||||
auto snapshot_name = de->name;
|
||||
lister::scan_dir(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>(), [datadir, &all_snapshots] (fs::path snapshots_dir, directory_entry de) {
|
||||
auto snapshot_name = de.name;
|
||||
all_snapshots.emplace(snapshot_name, snapshot_details());
|
||||
auto details = get_snapshot_details(snapshots_dir / fs::path(snapshot_name), datadir).get();
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
}
|
||||
return get_snapshot_details(snapshots_dir / fs::path(snapshot_name), datadir).then([&all_snapshots, snapshot_name] (auto details) {
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}).get();
|
||||
}
|
||||
return all_snapshots;
|
||||
});
|
||||
@@ -3401,61 +3402,38 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
|
||||
future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_dir, fs::path datadir) {
|
||||
table::snapshot_details details{};
|
||||
std::optional<fs::path> staging_dir = snapshot_dir / sstables::staging_dir;
|
||||
if (!co_await file_exists(staging_dir->native())) {
|
||||
staging_dir.reset();
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
// FIXME: optimize stat calls by keeping the base directory open and use statat instead, here and below.
|
||||
// See https://github.com/scylladb/seastar/pull/3163
|
||||
auto sd = co_await io_check(file_stat, (snapshot_dir / name).native(), follow_symlink::no);
|
||||
co_await lister::scan_dir(snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>(), [datadir, &details] (fs::path snapshot_dir, directory_entry de) -> future<> {
|
||||
auto sd = co_await io_check(file_stat, (snapshot_dir / de.name).native(), follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
|
||||
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
if (de.name != "manifest.json" && de.name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
continue;
|
||||
}
|
||||
// If the number of linkes is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
size = 0;
|
||||
}
|
||||
|
||||
auto exists_in_dir = [&] (fs::path path) -> future<bool> {
|
||||
try {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, path.native(), follow_symlink::no);
|
||||
auto psd = co_await io_check(file_stat, (datadir / de.name).native(), follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(datadir / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
co_return false;
|
||||
(datadir / de.name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / de.name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
details.live += size;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(*staging_dir / name)) &&
|
||||
!co_await exists_in_dir(datadir / name)) {
|
||||
details.live += size;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
co_return details;
|
||||
}
|
||||
|
||||
@@ -593,7 +593,7 @@ private:
|
||||
v3_columns _v3_columns;
|
||||
mutable schema_registry_entry* _registry_entry = nullptr;
|
||||
std::unique_ptr<::view_info> _view_info;
|
||||
mutable schema_ptr _cdc_schema;
|
||||
schema_ptr _cdc_schema;
|
||||
|
||||
const std::array<column_count_type, 3> _offsets;
|
||||
|
||||
@@ -957,7 +957,6 @@ public:
|
||||
friend bool operator==(const schema&, const schema&);
|
||||
const column_mapping& get_column_mapping() const;
|
||||
friend class schema_registry_entry;
|
||||
friend class schema_registry;
|
||||
// May be called from different shard
|
||||
schema_registry_entry* registry_entry() const noexcept;
|
||||
// Returns true iff this schema version was synced with on current node.
|
||||
|
||||
@@ -78,8 +78,10 @@ void schema_registry::attach_table(schema_registry_entry& e) noexcept {
|
||||
}
|
||||
|
||||
schema_ptr schema_registry::learn(schema_ptr s) {
|
||||
auto learned_cdc_schema = s->cdc_schema() ? learn(s->cdc_schema()) : nullptr;
|
||||
s->_cdc_schema = learned_cdc_schema;
|
||||
auto learned_cdc_schema = s->cdc_schema() ? local_schema_registry().learn(s->cdc_schema()) : nullptr;
|
||||
if (learned_cdc_schema != s->cdc_schema()) {
|
||||
s = s->make_with_cdc(learned_cdc_schema);
|
||||
}
|
||||
if (s->registry_entry()) {
|
||||
return s;
|
||||
}
|
||||
@@ -90,9 +92,7 @@ schema_ptr schema_registry::learn(schema_ptr s) {
|
||||
e.load(s);
|
||||
attach_table(e);
|
||||
}
|
||||
auto loaded_s = e.get_schema();
|
||||
loaded_s->_cdc_schema = learned_cdc_schema;
|
||||
return loaded_s;
|
||||
return e.get_schema();
|
||||
}
|
||||
slogger.debug("Learning about version {} of {}.{}", s->version(), s->ks_name(), s->cf_name());
|
||||
auto e_ptr = make_lw_shared<schema_registry_entry>(s->version(), *this);
|
||||
|
||||
@@ -3,7 +3,6 @@ target_sources(service
|
||||
PRIVATE
|
||||
broadcast_tables/experimental/lang.cc
|
||||
client_state.cc
|
||||
client_routes.cc
|
||||
mapreduce_service.cc
|
||||
migration_manager.cc
|
||||
misc_services.cc
|
||||
|
||||
@@ -1,137 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "service/client_routes.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
#include "mutation/mutation.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
|
||||
static logging::logger crlogger("client_routes");
|
||||
|
||||
service::query_state& client_routes_query_state() {
|
||||
using namespace std::chrono_literals;
|
||||
const auto t = 10s;
|
||||
static timeout_config tc{ t, t, t, t, t, t, t };
|
||||
static thread_local service::client_state cs(service::client_state::internal_tag{}, tc);
|
||||
static thread_local service::query_state qs(cs, empty_service_permit());
|
||||
return qs;
|
||||
};
|
||||
|
||||
future<mutation> service::client_routes_service::make_remove_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_key& key) {
|
||||
static const sstring stmt = format("DELETE FROM {}.{} WHERE connection_id = ? and host_id = ?", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
|
||||
|
||||
auto muts = co_await _qp.get_mutations_internal(stmt, client_routes_query_state(), ts, {key.connection_id, key.host_id});
|
||||
if (muts.size() != 1) {
|
||||
on_internal_error(crlogger, fmt::format("expected 1 mutation got {}", muts.size()));
|
||||
}
|
||||
co_return std::move(muts[0]);
|
||||
}
|
||||
|
||||
future<mutation> service::client_routes_service::make_update_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_entry& route) {
|
||||
static const sstring stmt = format("INSERT INTO {}.{} (connection_id, host_id, address, port, tls_port, alternator_port, alternator_https_port) VALUES (?, ?, ?, ?, ?, ?, ?)", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
|
||||
|
||||
auto muts = co_await _qp.get_mutations_internal(stmt, client_routes_query_state(), ts, {
|
||||
route.connection_id,
|
||||
route.host_id,
|
||||
route.address,
|
||||
route.port,
|
||||
route.tls_port,
|
||||
route.alternator_port,
|
||||
route.alternator_https_port
|
||||
});
|
||||
if (muts.size() != 1) {
|
||||
on_internal_error(crlogger, fmt::format("expected 1 mutation got {}", muts.size()));
|
||||
}
|
||||
co_return std::move(muts[0]);
|
||||
}
|
||||
|
||||
future<std::vector<service::client_routes_service::client_route_entry>> service::client_routes_service::get_client_routes() const {
|
||||
std::vector<service::client_routes_service::client_route_entry> result;
|
||||
static const sstring query = format("SELECT * from {}.{}", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
|
||||
auto rs = co_await _qp.execute_internal(query, cql3::query_processor::cache_internal::yes);
|
||||
result.reserve(rs->size());
|
||||
for (const auto& row : *rs) {
|
||||
result.emplace_back(
|
||||
row.get_as<sstring>("connection_id"),
|
||||
row.get_as<utils::UUID>("host_id"),
|
||||
row.get_as<sstring>("address"),
|
||||
row.get_opt<int32_t>("port"),
|
||||
row.get_opt<int32_t>("tls_port"),
|
||||
row.get_opt<int32_t>("alternator_port"),
|
||||
row.get_opt<int32_t>("alternator_https_port")
|
||||
);
|
||||
}
|
||||
co_return result;
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::notify_client_routes_change(const client_route_keys& client_route_keys) {
|
||||
co_await container().invoke_on_all([&client_route_keys] (service::client_routes_service& client_routes) {
|
||||
return client_routes._lifecycle_notifier.notify_client_routes_change(client_route_keys);
|
||||
});
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::set_client_routes_inner(const std::vector<service::client_routes_service::client_route_entry>& route_entries) {
|
||||
auto guard = co_await _group0_client.start_operation(_abort_source, service::raft_timeout{});
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
|
||||
for (const auto& entry : route_entries) {
|
||||
auto mut = co_await make_update_client_route_mutation(guard.write_timestamp(), entry);
|
||||
cmuts.emplace_back(std::move(mut));
|
||||
}
|
||||
auto cmd = _group0_client.prepare_command(service::write_mutations{std::move(cmuts)}, guard, "insert client routes");
|
||||
co_await _group0_client.add_entry(std::move(cmd), std::move(guard), _abort_source);
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::delete_client_routes_inner(const std::vector<service::client_routes_service::client_route_key>& route_keys) {
|
||||
auto guard = co_await _group0_client.start_operation(_abort_source, service::raft_timeout{});
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
|
||||
for (const auto& route_key : route_keys) {
|
||||
auto mut = co_await make_remove_client_route_mutation(guard.write_timestamp(), route_key);
|
||||
cmuts.emplace_back(std::move(mut));
|
||||
}
|
||||
|
||||
auto cmd = _group0_client.prepare_command(service::write_mutations{std::move(cmuts)}, guard, "delete client routes");
|
||||
co_await _group0_client.add_entry(std::move(cmd), std::move(guard), _abort_source);
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::set_client_routes(std::vector<service::client_routes_service::client_route_entry> route_entries) {
|
||||
return container().invoke_on(0, [route_entries = std::move(route_entries)] (service::client_routes_service& cr) mutable -> future<> {
|
||||
return cr.with_retry([&cr, route_entries = std::move(route_entries)] {
|
||||
return cr.set_client_routes_inner(route_entries);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::delete_client_routes(std::vector<service::client_routes_service::client_route_key> route_keys) {
|
||||
return container().invoke_on(0, [route_keys = std::move(route_keys)] (service::client_routes_service& cr) mutable -> future<> {
|
||||
return cr.with_retry([&cr, route_keys = std::move(route_keys)] {
|
||||
return cr.delete_client_routes_inner(route_keys);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
seastar::future<> service::client_routes_service::with_retry(Func func) const {
|
||||
int retries = 10;
|
||||
while (true) {
|
||||
try {
|
||||
co_await func();
|
||||
} catch (const ::service::group0_concurrent_modification&) {
|
||||
crlogger.warn("Failed to set client routes due to guard conflict, retries={}", retries);
|
||||
if (retries--) {
|
||||
continue;
|
||||
}
|
||||
throw;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1,88 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/sharded.hh>
|
||||
|
||||
#include "gms/feature_service.hh"
|
||||
#include "mutation/mutation.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
class endpoint_lifecycle_notifier;
|
||||
|
||||
class client_routes_service : public seastar::peering_sharded_service<client_routes_service> {
|
||||
public:
|
||||
client_routes_service(
|
||||
abort_source& abort_source,
|
||||
gms::feature_service& feature_service,
|
||||
service::raft_group0_client& group0_client,
|
||||
cql3::query_processor& qp,
|
||||
endpoint_lifecycle_notifier& elc_notif
|
||||
)
|
||||
: _abort_source(abort_source)
|
||||
, _feature_service(feature_service)
|
||||
, _group0_client(group0_client)
|
||||
, _qp(qp)
|
||||
, _lifecycle_notifier(elc_notif) { }
|
||||
|
||||
struct client_route_key {
|
||||
sstring connection_id;
|
||||
utils::UUID host_id;
|
||||
|
||||
bool operator<(const client_route_key& other) const {
|
||||
if (connection_id != other.connection_id) {
|
||||
return connection_id < other.connection_id;
|
||||
}
|
||||
return host_id < other.host_id;
|
||||
}
|
||||
};
|
||||
using client_route_keys = std::set<client_route_key>;
|
||||
|
||||
struct client_route_entry {
|
||||
sstring connection_id;
|
||||
utils::UUID host_id;
|
||||
sstring address;
|
||||
// At least one of the ports should be specified
|
||||
std::optional<int32_t> port;
|
||||
std::optional<int32_t> tls_port;
|
||||
std::optional<int32_t> alternator_port;
|
||||
std::optional<int32_t> alternator_https_port;
|
||||
};
|
||||
|
||||
gms::feature_service& get_feature_service() noexcept {
|
||||
return _feature_service;
|
||||
}
|
||||
|
||||
// mutations
|
||||
future<mutation> make_remove_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_key& key);
|
||||
future<mutation> make_update_client_route_mutation(api::timestamp_type ts, const client_route_entry& entry);
|
||||
future<std::vector<client_route_entry>> get_client_routes() const;
|
||||
seastar::future<> set_client_routes(std::vector<service::client_routes_service::client_route_entry> route_entries);
|
||||
seastar::future<> delete_client_routes(std::vector<service::client_routes_service::client_route_key> route_keys);
|
||||
|
||||
|
||||
// notifications
|
||||
seastar::future<> notify_client_routes_change(const client_route_keys& client_route_keys);
|
||||
private:
|
||||
seastar::future<> set_client_routes_inner(const std::vector<service::client_routes_service::client_route_entry>& route_entries);
|
||||
seastar::future<> delete_client_routes_inner(const std::vector<service::client_routes_service::client_route_key>& route_keys);
|
||||
template <typename Func>
|
||||
seastar::future<> with_retry(Func func) const;
|
||||
|
||||
abort_source& _abort_source;
|
||||
gms::feature_service& _feature_service;
|
||||
service::raft_group0_client& _group0_client;
|
||||
cql3::query_processor& _qp;
|
||||
endpoint_lifecycle_notifier& _lifecycle_notifier;
|
||||
};
|
||||
|
||||
}
|
||||
@@ -344,17 +344,3 @@ void service::client_state::update_per_service_level_params(qos::service_level_o
|
||||
|
||||
_workload_type = slo.workload;
|
||||
}
|
||||
|
||||
future<> service::client_state::set_client_options(
|
||||
client_options_cache_type& keys_and_values_cache,
|
||||
const std::unordered_map<sstring, sstring>& client_options) {
|
||||
for (const auto& [key, value] : client_options) {
|
||||
auto cached_key = co_await keys_and_values_cache.get_or_load(key, [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
auto cached_value = co_await keys_and_values_cache.get_or_load(value, [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
_client_options.emplace_back(std::move(cached_key), std::move(cached_value));
|
||||
}
|
||||
}
|
||||
@@ -18,7 +18,6 @@
|
||||
#include "auth/authenticated_user.hh"
|
||||
#include "auth/authenticator.hh"
|
||||
#include "auth/permission.hh"
|
||||
#include "client_data.hh"
|
||||
|
||||
#include "transport/cql_protocol_extension.hh"
|
||||
#include "service/qos/service_level_controller.hh"
|
||||
@@ -103,8 +102,7 @@ private:
|
||||
private volatile String keyspace;
|
||||
#endif
|
||||
std::optional<auth::authenticated_user> _user;
|
||||
std::optional<client_options_cache_entry_type> _driver_name, _driver_version;
|
||||
std::list<client_option_key_value_cached_entry> _client_options;
|
||||
std::optional<sstring> _driver_name, _driver_version;
|
||||
|
||||
auth_state _auth_state = auth_state::UNINITIALIZED;
|
||||
bool _control_connection = false;
|
||||
@@ -153,33 +151,18 @@ public:
|
||||
return _control_connection = true;
|
||||
}
|
||||
|
||||
std::optional<client_options_cache_entry_type> get_driver_name() const {
|
||||
std::optional<sstring> get_driver_name() const {
|
||||
return _driver_name;
|
||||
}
|
||||
future<> set_driver_name(client_options_cache_type& keys_and_values_cache, const sstring& driver_name) {
|
||||
_driver_name = co_await keys_and_values_cache.get_or_load(driver_name, [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
void set_driver_name(sstring driver_name) {
|
||||
_driver_name = std::move(driver_name);
|
||||
}
|
||||
|
||||
const auto& get_client_options() const {
|
||||
return _client_options;
|
||||
}
|
||||
|
||||
future<> set_client_options(
|
||||
client_options_cache_type& keys_and_values_cache,
|
||||
const std::unordered_map<sstring, sstring>& client_options);
|
||||
|
||||
std::optional<client_options_cache_entry_type> get_driver_version() const {
|
||||
std::optional<sstring> get_driver_version() const {
|
||||
return _driver_version;
|
||||
}
|
||||
future<> set_driver_version(
|
||||
client_options_cache_type& keys_and_values_cache,
|
||||
const sstring& driver_version)
|
||||
{
|
||||
_driver_version = co_await keys_and_values_cache.get_or_load(driver_version, [] (const client_options_cache_key_type&) {
|
||||
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
|
||||
});
|
||||
void set_driver_version(sstring driver_version) {
|
||||
_driver_version = std::move(driver_version);
|
||||
}
|
||||
|
||||
client_state(external_tag,
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
#include "gms/inet_address.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "utils/atomic_vector.hh"
|
||||
#include "service/client_routes.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
@@ -66,7 +65,6 @@ public:
|
||||
* @param endpoint the endpoint marked DOWN.
|
||||
*/
|
||||
virtual void on_down(const gms::inet_address& endpoint, locator::host_id host_id) {}
|
||||
virtual void on_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {}
|
||||
};
|
||||
|
||||
class endpoint_lifecycle_notifier {
|
||||
@@ -81,8 +79,6 @@ public:
|
||||
future<> notify_released(locator::host_id host_id);
|
||||
future<> notify_up(gms::inet_address endpoint, locator::host_id host_id);
|
||||
future<> notify_joined(gms::inet_address endpoint, locator::host_id host_id);
|
||||
|
||||
future<> notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -163,11 +163,7 @@ public:
|
||||
void before_drop_column_family(const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
void before_drop_keyspace(const sstring& keyspace_name, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
|
||||
// Called when creating a tablet map for a new table.
|
||||
// When in the context of a notification callback, call `before_allocate_tablet_map_in_notification`,
|
||||
// and otherwise call 'before_allocate_tablet_map'.
|
||||
void before_allocate_tablet_map(const locator::tablet_map&, const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
void before_allocate_tablet_map_in_notification(const locator::tablet_map&, const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -648,13 +648,6 @@ void migration_notifier::before_allocate_tablet_map(const locator::tablet_map& m
|
||||
});
|
||||
}
|
||||
|
||||
void migration_notifier::before_allocate_tablet_map_in_notification(const locator::tablet_map& map,
|
||||
const schema& s, utils::chunked_vector<mutation>& mutations, api::timestamp_type ts) {
|
||||
_listeners.thread_for_each_nested([&map, &s, &mutations, ts] (migration_listener* listener) {
|
||||
listener->on_before_allocate_tablet_map(map, s, mutations, ts);
|
||||
});
|
||||
}
|
||||
|
||||
utils::chunked_vector<mutation> prepare_keyspace_update_announcement(replica::database& db, lw_shared_ptr<keyspace_metadata> ksm, api::timestamp_type ts) {
|
||||
db.validate_keyspace_update(*ksm);
|
||||
mlogger.info("Update Keyspace: {}", ksm);
|
||||
|
||||
@@ -640,16 +640,6 @@ future<scheduling_group> service_level_controller::auth_integration::get_user_sc
|
||||
}
|
||||
}
|
||||
|
||||
scheduling_group service_level_controller::auth_integration::get_user_cached_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
|
||||
if (usr && usr->name) {
|
||||
auto sl_opt = find_cached_effective_service_level(*usr->name);
|
||||
auto& sl_name = (sl_opt && sl_opt->shares_name) ? *sl_opt->shares_name : default_service_level_name;
|
||||
return _sl_controller.get_scheduling_group(sl_name);
|
||||
} else {
|
||||
return _sl_controller.get_default_scheduling_group();
|
||||
}
|
||||
}
|
||||
|
||||
future<scheduling_group> service_level_controller::get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
|
||||
// Special case:
|
||||
// -------------
|
||||
@@ -666,11 +656,6 @@ future<scheduling_group> service_level_controller::get_user_scheduling_group(con
|
||||
return _auth_integration->get_user_scheduling_group(usr);
|
||||
}
|
||||
|
||||
scheduling_group service_level_controller::get_cached_user_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
|
||||
SCYLLA_ASSERT(_auth_integration != nullptr);
|
||||
return _auth_integration->get_user_cached_scheduling_group(usr);
|
||||
}
|
||||
|
||||
std::optional<sstring> service_level_controller::get_active_service_level() {
|
||||
unsigned sched_idx = internal::scheduling_group_index(current_scheduling_group());
|
||||
if (_sl_lookup[sched_idx].first) {
|
||||
@@ -789,10 +774,6 @@ future<service_levels_info> service_level_controller::get_distributed_service_le
|
||||
return _sl_data_accessor ? _sl_data_accessor->get_service_level(service_level_name) : make_ready_future<service_levels_info>();
|
||||
}
|
||||
|
||||
bool service_level_controller::can_use_effective_service_level_cache() const{
|
||||
return _sl_data_accessor && _sl_data_accessor->can_use_effective_service_level_cache();
|
||||
}
|
||||
|
||||
future<bool> service_level_controller::validate_before_service_level_add() {
|
||||
assert(this_shard_id() == global_controller);
|
||||
if (_global_controller_db->deleted_scheduling_groups.size() > 0) {
|
||||
|
||||
@@ -154,10 +154,7 @@ public:
|
||||
/// Synchronous version of `find_effective_service_level` that only checks the cache.
|
||||
std::optional<service_level_options> find_cached_effective_service_level(const sstring& role_name);
|
||||
|
||||
/// Execute a function within the service level context of a user, get_user_scheduling_group - async version
|
||||
/// get_user_cached_scheduling_group - sync version (used for v2 servers).
|
||||
future<scheduling_group> get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
scheduling_group get_user_cached_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
|
||||
template <typename Func, typename Ret = std::invoke_result_t<Func>>
|
||||
requires std::invocable<Func>
|
||||
@@ -342,12 +339,6 @@ public:
|
||||
* @return if the user is authenticated the user's scheduling group. otherwise get_scheduling_group("default")
|
||||
*/
|
||||
future<scheduling_group> get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
/**
|
||||
* Get the scheduling group of a specific user for the service level cache
|
||||
* @param user - the user for determining the service level
|
||||
* @return if the user is authenticated the user's scheduling group. otherwise get_scheduling_group("default")
|
||||
*/
|
||||
scheduling_group get_cached_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
/**
|
||||
* @return the name of the currently active service level if such exists or an empty
|
||||
* optional if no active service level.
|
||||
@@ -409,13 +400,6 @@ public:
|
||||
future<service_levels_info> get_distributed_service_levels(qos::query_context ctx);
|
||||
future<service_levels_info> get_distributed_service_level(sstring service_level_name);
|
||||
|
||||
/*
|
||||
* Returns whether effective service level cache can be populated and used.
|
||||
* This is equivalent to checking whether auth + raft have been migrated to raft.
|
||||
*/
|
||||
bool can_use_effective_service_level_cache() const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the service level options **in effect** for a user having the given
|
||||
* collection of roles.
|
||||
|
||||
@@ -124,40 +124,8 @@ bool should_flush_system_topology_after_applying(const mutation& mut, const data
|
||||
return false;
|
||||
}
|
||||
|
||||
static void collect_client_routes_update(const mutation& mut, client_routes_service::client_route_keys& client_routes_update) {
|
||||
|
||||
auto s_client_routes = db::system_keyspace::client_routes();
|
||||
if (mut.column_family_id() != s_client_routes->id()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto pk_components = mut.decorated_key()._key.explode(*s_client_routes);
|
||||
if (pk_components.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto conn_uuid = value_cast<sstring>(utf8_type->deserialize_value(pk_components[0]));
|
||||
for (const rows_entry& re : mut.partition().clustered_rows()) {
|
||||
const auto ck_components = re.key().explode(*s_client_routes);
|
||||
if (ck_components.empty()) {
|
||||
continue;
|
||||
}
|
||||
auto host_uuid = value_cast<utils::UUID>(uuid_type->deserialize_value(ck_components[0]));
|
||||
client_routes_update.emplace(conn_uuid, host_uuid);
|
||||
}
|
||||
}
|
||||
|
||||
static future<> notify_client_route_change_if_needed(storage_service& storage_service, const client_routes_service::client_route_keys& client_routes_update) {
|
||||
if (client_routes_update.size() > 0) {
|
||||
slogger.trace("write_mutations_to_database: notify_client_routes_change routes_update.size()={}", client_routes_update.size());
|
||||
co_await storage_service.notify_client_routes_change(client_routes_update);
|
||||
}
|
||||
}
|
||||
|
||||
future<> write_mutations_to_database(storage_service& storage_service, storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms) {
|
||||
future<> write_mutations_to_database(storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms) {
|
||||
utils::chunked_vector<frozen_mutation_and_schema> mutations;
|
||||
client_routes_service::client_route_keys client_routes_update;
|
||||
|
||||
mutations.reserve(cms.size());
|
||||
bool need_system_topology_flush = false;
|
||||
try {
|
||||
@@ -165,12 +133,7 @@ future<> write_mutations_to_database(storage_service& storage_service, storage_p
|
||||
auto& tbl = proxy.local_db().find_column_family(cm.column_family_id());
|
||||
auto& s = tbl.schema();
|
||||
auto mut = co_await to_mutation_gently(cm, s);
|
||||
|
||||
need_system_topology_flush = need_system_topology_flush || should_flush_system_topology_after_applying(mut, proxy.data_dictionary());
|
||||
if (proxy.data_dictionary().has_schema(db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES)) {
|
||||
collect_client_routes_update(mut, client_routes_update);
|
||||
}
|
||||
|
||||
mutations.emplace_back(co_await freeze_gently(mut), s);
|
||||
}
|
||||
} catch (replica::no_such_column_family& e) {
|
||||
@@ -184,8 +147,6 @@ future<> write_mutations_to_database(storage_service& storage_service, storage_p
|
||||
slogger.trace("write_mutations_to_database: flushing {}.{}", db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
|
||||
co_await proxy.get_db().local().flush(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
|
||||
}
|
||||
|
||||
co_await notify_client_route_change_if_needed(storage_service, client_routes_update);
|
||||
}
|
||||
|
||||
group0_state_machine::modules_to_reload group0_state_machine::get_modules_to_reload(const utils::chunked_vector<canonical_mutation>& mutations) {
|
||||
@@ -290,7 +251,7 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
|
||||
[&] (topology_change& chng) -> future<> {
|
||||
auto modules_to_reload = get_modules_to_reload(chng.mutations);
|
||||
auto tablet_keys = replica::get_tablet_metadata_change_hint(chng.mutations);
|
||||
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(chng.mutations));
|
||||
co_await write_mutations_to_database(_sp, cmd.creator_addr, std::move(chng.mutations));
|
||||
co_await _ss.topology_transition({.tablets_hint = std::move(tablet_keys)});
|
||||
co_await reload_modules(std::move(modules_to_reload));
|
||||
},
|
||||
@@ -302,7 +263,7 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
|
||||
},
|
||||
[&] (write_mutations& muts) -> future<> {
|
||||
auto modules_to_reload = get_modules_to_reload(muts.mutations);
|
||||
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(muts.mutations));
|
||||
co_await write_mutations_to_database(_sp, cmd.creator_addr, std::move(muts.mutations));
|
||||
co_await reload_modules(std::move(modules_to_reload));
|
||||
}
|
||||
), cmd.change);
|
||||
@@ -432,7 +393,6 @@ future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
|
||||
|
||||
future<> group0_state_machine::transfer_snapshot(raft::server_id from_id, raft::snapshot_descriptor snp) {
|
||||
try {
|
||||
co_await utils::get_local_injector().inject("block_group0_transfer_snapshot", utils::wait_for_message(300s));
|
||||
// Note that this may bring newer state than the group0 state machine raft's
|
||||
// log, so some raft entries may be double applied, but since the state
|
||||
// machine is idempotent it is not a problem.
|
||||
@@ -491,23 +451,11 @@ future<> group0_state_machine::transfer_snapshot(raft::server_id from_id, raft::
|
||||
co_await _sp.get_db().local().flush(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
|
||||
}
|
||||
|
||||
client_routes_service::client_route_keys client_routes_update;
|
||||
if (raft_snp) {
|
||||
if (_sp.data_dictionary().has_schema(db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES)) {
|
||||
auto s_client_routes = db::system_keyspace::client_routes();
|
||||
for (auto& canonical_mut : raft_snp->mutations) {
|
||||
if (canonical_mut.column_family_id() == s_client_routes->id()) {
|
||||
auto mut = co_await to_mutation_gently(canonical_mut, s_client_routes);
|
||||
slogger.trace("transfer snapshot: raft snapshot includes client_routes mutation");
|
||||
collect_client_routes_update(mut, client_routes_update);
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await mutate_locally(std::move(raft_snp->mutations), _sp);
|
||||
}
|
||||
|
||||
co_await _ss.auth_cache().load_all();
|
||||
co_await notify_client_route_change_if_needed(_ss, client_routes_update);
|
||||
|
||||
co_await _sp.mutate_locally({std::move(history_mut)}, nullptr);
|
||||
} catch (const abort_requested_exception&) {
|
||||
|
||||
@@ -130,6 +130,6 @@ public:
|
||||
bool should_flush_system_topology_after_applying(const mutation& mut, const data_dictionary::database db);
|
||||
|
||||
// Used to write data to topology and other tables except schema tables.
|
||||
future<> write_mutations_to_database(storage_service& storage_service, storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms);
|
||||
future<> write_mutations_to_database(storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms);
|
||||
|
||||
} // end of namespace service
|
||||
|
||||
@@ -254,10 +254,6 @@ public:
|
||||
group0_batch(const group0_batch&) = delete;
|
||||
group0_batch(group0_batch&&) = default;
|
||||
|
||||
const group0_guard& guard() const {
|
||||
return _guard.value();
|
||||
}
|
||||
|
||||
// Gets timestamp which should be used when building mutations.
|
||||
api::timestamp_type write_timestamp() const;
|
||||
utils::UUID new_group0_state_id() const;
|
||||
|
||||
@@ -1114,7 +1114,7 @@ private:
|
||||
// only for a truncate which is still waiting.
|
||||
if (_topology_state_machine._topology.global_request) {
|
||||
utils::UUID ongoing_global_request_id = _topology_state_machine._topology.global_request_id.value();
|
||||
const auto topology_requests_entry = co_await _sys_ks.local().get_topology_request_entry(ongoing_global_request_id);
|
||||
const auto topology_requests_entry = co_await _sys_ks.local().get_topology_request_entry(ongoing_global_request_id, true);
|
||||
auto global_request = std::get<service::global_topology_request>(topology_requests_entry.request_type);
|
||||
if (global_request == global_topology_request::truncate_table) {
|
||||
std::optional<topology::transition_state>& tstate = _topology_state_machine._topology.tstate;
|
||||
|
||||
@@ -205,7 +205,6 @@ storage_service::storage_service(abort_source& abort_source,
|
||||
cql3::query_processor& qp,
|
||||
sharded<qos::service_level_controller>& sl_controller,
|
||||
auth::cache& auth_cache,
|
||||
sharded<client_routes_service>& client_routes,
|
||||
topology_state_machine& topology_state_machine,
|
||||
db::view::view_building_state_machine& view_building_state_machine,
|
||||
tasks::task_manager& tm,
|
||||
@@ -225,13 +224,11 @@ storage_service::storage_service(abort_source& abort_source,
|
||||
, _snitch(snitch)
|
||||
, _sl_controller(sl_controller)
|
||||
, _auth_cache(auth_cache)
|
||||
, _client_routes(client_routes)
|
||||
, _group0(nullptr)
|
||||
, _async_gate("storage_service")
|
||||
, _node_ops_abort_thread(node_ops_abort_thread())
|
||||
, _node_ops_module(make_shared<node_ops::task_manager_module>(tm, *this))
|
||||
, _tablets_module(make_shared<service::task_manager_module>(tm, *this))
|
||||
, _global_topology_requests_module(make_shared<service::topo::task_manager_module>(tm))
|
||||
, _address_map(address_map)
|
||||
, _shared_token_metadata(stm)
|
||||
, _erm_factory(erm_factory)
|
||||
@@ -255,11 +252,9 @@ storage_service::storage_service(abort_source& abort_source,
|
||||
{
|
||||
tm.register_module(_node_ops_module->get_name(), _node_ops_module);
|
||||
tm.register_module(_tablets_module->get_name(), _tablets_module);
|
||||
tm.register_module(_global_topology_requests_module->get_name(), _global_topology_requests_module);
|
||||
if (this_shard_id() == 0) {
|
||||
_node_ops_module->make_virtual_task<node_ops::node_ops_virtual_task>(*this);
|
||||
_tablets_module->make_virtual_task<service::tablet_virtual_task>(*this);
|
||||
_global_topology_requests_module->make_virtual_task<service::topo::global_topology_request_virtual_task>(*this);
|
||||
}
|
||||
register_metrics();
|
||||
|
||||
@@ -588,16 +583,12 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
|
||||
}
|
||||
break;
|
||||
case node_state::decommissioning:
|
||||
[[fallthrough]];
|
||||
case node_state::removing:
|
||||
// A decommissioning or removing node loses its tokens when topology moves to left_token_ring.
|
||||
// A decommissioning node loses its tokens when topology moves to left_token_ring.
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::left_token_ring) {
|
||||
if (rs.state == node_state::removing && !_feature_service.removenode_with_left_token_ring) {
|
||||
on_internal_error(
|
||||
rtlogger, "removenode operation can only enter the left_token_ring state when REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled");
|
||||
}
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case node_state::removing:
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::rollback_to_normal) {
|
||||
// no need for double writes anymore since op failed
|
||||
co_await process_normal_node(id, host_id, ip, rs);
|
||||
@@ -1384,34 +1375,6 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
future<bool> storage_service::ongoing_rf_change(const group0_guard& guard, sstring ks) const {
|
||||
auto ongoing_ks_rf_change = [&] (utils::UUID request_id) -> future<bool> {
|
||||
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id);
|
||||
co_return std::holds_alternative<global_topology_request>(req_entry.request_type) &&
|
||||
std::get<global_topology_request>(req_entry.request_type) == global_topology_request::keyspace_rf_change &&
|
||||
req_entry.new_keyspace_rf_change_ks_name.has_value() && req_entry.new_keyspace_rf_change_ks_name.value() == ks;
|
||||
};
|
||||
if (_topology_state_machine._topology.global_request_id.has_value()) {
|
||||
auto req_id = _topology_state_machine._topology.global_request_id.value();
|
||||
if (co_await ongoing_ks_rf_change(req_id)) {
|
||||
co_return true;
|
||||
}
|
||||
}
|
||||
for (auto request_id : _topology_state_machine._topology.paused_rf_change_requests) {
|
||||
if (co_await ongoing_ks_rf_change(request_id)) {
|
||||
co_return true;
|
||||
}
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
for (auto request_id : _topology_state_machine._topology.global_requests_queue) {
|
||||
if (co_await ongoing_ks_rf_change(request_id)) {
|
||||
co_return true;
|
||||
}
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
|
||||
future<> storage_service::raft_initialize_discovery_leader(const join_node_request_params& params) {
|
||||
if (params.replaced_id.has_value()) {
|
||||
throw std::runtime_error(::format("Cannot perform a replace operation because this is the first node in the cluster"));
|
||||
@@ -1457,7 +1420,7 @@ future<> storage_service::raft_initialize_discovery_leader(const join_node_reque
|
||||
_migration_manager.local().get_group0_client().get_history_gc_duration(), "bootstrap: adding myself as the first node to the topology");
|
||||
auto mutation_creator_addr = _sys_ks.local().local_db().get_token_metadata().get_topology().my_address();
|
||||
|
||||
co_await write_mutations_to_database(*this, _qp.proxy(), mutation_creator_addr, std::move(change.mutations));
|
||||
co_await write_mutations_to_database(_qp.proxy(), mutation_creator_addr, std::move(change.mutations));
|
||||
co_await _qp.proxy().mutate_locally({history_append}, nullptr);
|
||||
}
|
||||
|
||||
@@ -3480,7 +3443,6 @@ future<> storage_service::stop() {
|
||||
_listeners.clear();
|
||||
co_await _tablets_module->stop();
|
||||
co_await _node_ops_module->stop();
|
||||
co_await _global_topology_requests_module->stop();
|
||||
co_await _async_gate.close();
|
||||
co_await std::move(_node_ops_abort_thread);
|
||||
_tablet_split_monitor_event.signal();
|
||||
@@ -5063,50 +5025,6 @@ future<> storage_service::wait_for_topology_not_busy() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::abort_paused_rf_change(utils::UUID request_id) {
|
||||
auto holder = _async_gate.hold();
|
||||
|
||||
if (this_shard_id() != 0) {
|
||||
// group0 is only set on shard 0.
|
||||
co_return co_await container().invoke_on(0, [&] (auto& ss) {
|
||||
return ss.abort_paused_rf_change(request_id);
|
||||
});
|
||||
}
|
||||
|
||||
if (!_feature_service.rack_list_rf) {
|
||||
throw std::runtime_error("The RACK_LIST_RF feature is not enabled on the cluster yet");
|
||||
}
|
||||
|
||||
while (true) {
|
||||
auto guard = co_await _group0->client().start_operation(_group0_as, raft_timeout{});
|
||||
|
||||
bool found = std::ranges::contains(_topology_state_machine._topology.paused_rf_change_requests, request_id);
|
||||
if (!found) {
|
||||
slogger.warn("RF change request with id '{}' is not paused, so it can't be aborted", request_id);
|
||||
co_return;
|
||||
}
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
|
||||
.resume_rf_change_request(_topology_state_machine._topology.paused_rf_change_requests, request_id).build()));
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(request_id)
|
||||
.done("Aborted by user request")
|
||||
.build()));
|
||||
|
||||
topology_change change{std::move(updates)};
|
||||
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard,
|
||||
format("aborting rf change request {}", request_id));
|
||||
|
||||
try {
|
||||
co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), _group0_as, raft_timeout{});
|
||||
} catch (group0_concurrent_modification&) {
|
||||
slogger.info("aborting request {}: concurrent modification, retrying.", request_id);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
semaphore& storage_service::get_do_sample_sstables_concurrency_limiter() {
|
||||
return _do_sample_sstables_concurrency_limiter;
|
||||
}
|
||||
@@ -5310,7 +5228,7 @@ future<> storage_service::raft_check_and_repair_cdc_streams() {
|
||||
request_id = _topology_state_machine._topology.global_request_id.value();
|
||||
} else if (!_topology_state_machine._topology.global_requests_queue.empty()) {
|
||||
request_id = _topology_state_machine._topology.global_requests_queue[0];
|
||||
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id);
|
||||
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id, true);
|
||||
curr_req = std::get<global_topology_request>(req_entry.request_type);
|
||||
} else {
|
||||
request_id = utils::UUID{};
|
||||
@@ -7784,9 +7702,6 @@ void storage_service::init_messaging_service() {
|
||||
additional_tables.push_back(db::system_keyspace::cdc_streams_state()->id());
|
||||
additional_tables.push_back(db::system_keyspace::cdc_streams_history()->id());
|
||||
}
|
||||
if (ss._feature_service.client_routes) {
|
||||
additional_tables.push_back(db::system_keyspace::client_routes()->id());
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& table : boost::join(params.tables, additional_tables)) {
|
||||
@@ -8126,18 +8041,6 @@ future<> endpoint_lifecycle_notifier::notify_joined(gms::inet_address endpoint,
|
||||
});
|
||||
}
|
||||
|
||||
future<> endpoint_lifecycle_notifier::notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {
|
||||
co_await seastar::async([this, &client_route_keys] {
|
||||
_subscribers.thread_for_each([&client_route_keys] (endpoint_lifecycle_subscriber* subscriber) {
|
||||
try {
|
||||
subscriber->on_client_routes_change(client_route_keys);
|
||||
} catch (...) {
|
||||
slogger.warn("Client routes notification failed: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> storage_service::notify_joined(inet_address endpoint, locator::host_id hid) {
|
||||
co_await utils::get_local_injector().inject(
|
||||
"storage_service_notify_joined_sleep", std::chrono::milliseconds{500});
|
||||
@@ -8162,10 +8065,6 @@ future<> storage_service::notify_cql_change(inet_address endpoint, locator::host
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {
|
||||
co_await _client_routes.local().notify_client_routes_change(client_route_keys);
|
||||
}
|
||||
|
||||
bool storage_service::is_normal_state_handled_on_boot(locator::host_id node) {
|
||||
return _normal_state_handled_on_boot.contains(node);
|
||||
}
|
||||
|
||||
@@ -17,10 +17,8 @@
|
||||
#include "gms/endpoint_state.hh"
|
||||
#include "gms/i_endpoint_state_change_subscriber.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "service/client_routes.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
#include "service/qos/service_level_controller.hh"
|
||||
#include "service/task_manager_module.hh"
|
||||
#include "service/topology_guard.hh"
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/snitch_base.hh"
|
||||
@@ -50,7 +48,6 @@
|
||||
#include "service/tablet_allocator.hh"
|
||||
#include "service/tablet_operation.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "utils/user_provided_param.hh"
|
||||
#include "utils/sequenced_set.hh"
|
||||
#include "service/topology_coordinator.hh"
|
||||
@@ -205,7 +202,6 @@ private:
|
||||
sharded<locator::snitch_ptr>& _snitch;
|
||||
sharded<qos::service_level_controller>& _sl_controller;
|
||||
auth::cache& _auth_cache;
|
||||
sharded<client_routes_service>& _client_routes;
|
||||
|
||||
// Engaged on shard 0 before `join_cluster`.
|
||||
service::raft_group0* _group0;
|
||||
@@ -229,7 +225,6 @@ private:
|
||||
future<> _node_ops_abort_thread;
|
||||
shared_ptr<node_ops::task_manager_module> _node_ops_module;
|
||||
shared_ptr<service::task_manager_module> _tablets_module;
|
||||
shared_ptr<service::topo::task_manager_module> _global_topology_requests_module;
|
||||
gms::gossip_address_map& _address_map;
|
||||
void node_ops_insert(node_ops_id, gms::inet_address coordinator, std::list<inet_address> ignore_nodes,
|
||||
std::function<future<>()> abort_func);
|
||||
@@ -274,7 +269,6 @@ public:
|
||||
cql3::query_processor& qp,
|
||||
sharded<qos::service_level_controller>& sl_controller,
|
||||
auth::cache& auth_cache,
|
||||
sharded<client_routes_service>& _client_routes,
|
||||
topology_state_machine& topology_state_machine,
|
||||
db::view::view_building_state_machine& view_building_state_machine,
|
||||
tasks::task_manager& tm,
|
||||
@@ -937,7 +931,6 @@ public:
|
||||
bool topology_global_queue_empty() const {
|
||||
return !_topology_state_machine._topology.global_request.has_value();
|
||||
}
|
||||
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
|
||||
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
|
||||
future<> initialize_done_topology_upgrade_state();
|
||||
private:
|
||||
@@ -1075,8 +1068,6 @@ public:
|
||||
future<sstring> wait_for_topology_request_completion(utils::UUID id, bool require_entry = true);
|
||||
future<> wait_for_topology_not_busy();
|
||||
|
||||
future<> abort_paused_rf_change(utils::UUID request_id);
|
||||
|
||||
private:
|
||||
semaphore _do_sample_sstables_concurrency_limiter{1};
|
||||
// To avoid overly-large RPC messages, `do_sample_sstables` is broken up into several rounds.
|
||||
@@ -1147,14 +1138,11 @@ public:
|
||||
future<std::vector<std::byte>> train_dict(utils::chunked_vector<temporary_buffer<char>> sample);
|
||||
future<> publish_new_sstable_dict(table_id, std::span<const std::byte>, service::raft_group0_client&);
|
||||
void set_train_dict_callback(decltype(_train_dict));
|
||||
seastar::future<> notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys);
|
||||
|
||||
|
||||
friend class join_node_rpc_handshaker;
|
||||
friend class node_ops::node_ops_virtual_task;
|
||||
friend class tasks::task_manager;
|
||||
friend class tablet_virtual_task;
|
||||
friend class topo::global_topology_request_virtual_task;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -6,16 +6,12 @@
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "cql3/statements/ks_prop_defs.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "locator/topology.hh"
|
||||
#include "replica/tablets.hh"
|
||||
#include "locator/tablet_replication_strategy.hh"
|
||||
#include "replica/database.hh"
|
||||
#include "service/migration_listener.hh"
|
||||
#include "service/tablet_allocator.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/stall_free.hh"
|
||||
@@ -26,7 +22,6 @@
|
||||
#include "replica/database.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
#include <iterator>
|
||||
#include <ranges>
|
||||
#include <utility>
|
||||
#include <fmt/ranges.h>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
@@ -242,147 +237,6 @@ struct migration_candidate {
|
||||
migration_badness badness;
|
||||
};
|
||||
|
||||
struct colocation_source {
|
||||
locator::global_tablet_id gid;
|
||||
locator::tablet_replica replica;
|
||||
};
|
||||
|
||||
using colocation_source_set = utils::chunked_vector<colocation_source>;
|
||||
using colocation_sources_by_destination_rack = std::unordered_map<endpoint_dc_rack, colocation_source_set>;
|
||||
|
||||
struct rack_list_colocation_state {
|
||||
colocation_sources_by_destination_rack dst_dc_rack_to_tablets;
|
||||
std::unordered_map<endpoint_dc_rack, std::unordered_set<utils::UUID>> dst_to_requests;
|
||||
utils::UUID request_to_resume;
|
||||
|
||||
void maybe_set_request_to_resume(const utils::UUID& id) {
|
||||
if (!request_to_resume) {
|
||||
request_to_resume = id;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
future<rack_list_colocation_state> find_required_rack_list_colocations(
|
||||
replica::database& db,
|
||||
token_metadata_ptr tmptr,
|
||||
db::system_keyspace* sys_ks,
|
||||
const std::unordered_set<utils::UUID>& paused_rf_change_requests,
|
||||
const std::unordered_set<locator::global_tablet_id>& already_planned_migrations) {
|
||||
rack_list_colocation_state state;
|
||||
|
||||
auto get_node = [&] (locator::host_id host) -> const locator::node& {
|
||||
auto* node = tmptr->get_topology().find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
return *node;
|
||||
};
|
||||
for (const auto& request_id : paused_rf_change_requests) {
|
||||
auto req_entry = co_await sys_ks->get_topology_request_entry(request_id);
|
||||
sstring ks_name = *req_entry.new_keyspace_rf_change_ks_name;
|
||||
|
||||
if (!db.has_keyspace(ks_name)) {
|
||||
state.maybe_set_request_to_resume(request_id);
|
||||
continue;
|
||||
}
|
||||
auto& ks = db.find_keyspace(ks_name);
|
||||
std::unordered_map<sstring, sstring> saved_ks_props = *req_entry.new_keyspace_rf_change_data;
|
||||
cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{saved_ks_props.begin(), saved_ks_props.end()}};
|
||||
new_ks_props.validate();
|
||||
auto ks_md = new_ks_props.as_ks_metadata_update(ks.metadata(), *tmptr, db.features(), db.get_config());
|
||||
|
||||
auto tables_with_mvs = ks.metadata()->tables();
|
||||
auto views = ks.metadata()->views();
|
||||
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
|
||||
if (tables_with_mvs.empty()) {
|
||||
state.maybe_set_request_to_resume(request_id);
|
||||
continue;
|
||||
}
|
||||
bool no_changes_needed = true;
|
||||
for (const auto& table_or_mv : tables_with_mvs) {
|
||||
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
|
||||
continue;
|
||||
}
|
||||
const auto& tmap = tmptr->tablets().get_tablet_map(table_or_mv->id());
|
||||
const auto& new_replication_strategy_config = ks_md->strategy_options();
|
||||
for (auto& [dc, rf_value] : new_replication_strategy_config) {
|
||||
if (!std::holds_alternative<rack_list>(rf_value)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto racks = std::get<rack_list>(rf_value) | std::ranges::to<std::unordered_set<sstring>>();
|
||||
co_await tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& ti) -> future<> {
|
||||
auto gid = locator::global_tablet_id{table_or_mv->id(), tid};
|
||||
|
||||
// Current replicas in this DC. There might be multiple replicas in the same rack.
|
||||
auto dc_replicas = ti.replicas | std::views::filter([&] (const tablet_replica& r) {
|
||||
return get_node(r.host).dc_rack().dc == dc;
|
||||
}) | std::ranges::to<std::vector<tablet_replica>>();
|
||||
|
||||
if (dc_replicas.empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Find replicas that are not in the desired racks (src_replicas)
|
||||
// and racks that do not have replicas yet (dst_racks).
|
||||
auto dst_racks = racks;
|
||||
std::vector<tablet_replica> src_replicas;
|
||||
for (const auto& r : dc_replicas) {
|
||||
auto rack = get_node(r.host).dc_rack().rack;
|
||||
if (dst_racks.find(rack) != dst_racks.end()) {
|
||||
// There is already a replica in this rack.
|
||||
dst_racks.erase(rack);
|
||||
} else {
|
||||
// There is a replica in this rack, but it needs to be moved.
|
||||
src_replicas.push_back(r);
|
||||
}
|
||||
}
|
||||
|
||||
auto zipped = std::views::zip(src_replicas, dst_racks);
|
||||
if (!std::ranges::empty(zipped)) {
|
||||
no_changes_needed = false;
|
||||
}
|
||||
|
||||
// Skip tablet that is in transitions.
|
||||
auto* tti = tmap.get_tablet_transition_info(tid);
|
||||
if (tti) {
|
||||
lblogger.debug("Skipped colocation for tablet={} which is already in transition={}", gid, tti->transition);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Skip tablet that is about to be in transition.
|
||||
if (already_planned_migrations.contains(gid)) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
for (auto src_dst : zipped) {
|
||||
auto src = std::get<0>(src_dst);
|
||||
auto dst = std::get<1>(src_dst);
|
||||
auto endpoint = locator::endpoint_dc_rack{dc, dst};
|
||||
|
||||
state.dst_dc_rack_to_tablets[endpoint].emplace_back(colocation_source{{table_or_mv->id(), tid}, src});
|
||||
state.dst_to_requests[endpoint].insert(request_id);
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
}
|
||||
if (no_changes_needed) {
|
||||
state.maybe_set_request_to_resume(request_id);
|
||||
}
|
||||
}
|
||||
co_return state;
|
||||
}
|
||||
|
||||
future<bool> requires_rack_list_colocation(
|
||||
replica::database& db,
|
||||
locator::token_metadata_ptr tmptr,
|
||||
db::system_keyspace* sys_ks,
|
||||
utils::UUID request_id) {
|
||||
auto res = co_await find_required_rack_list_colocations(db, tmptr, sys_ks, {request_id}, {});
|
||||
co_return res.request_to_resume != request_id;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<>
|
||||
@@ -804,8 +658,6 @@ class load_balancer {
|
||||
|
||||
replica::database& _db;
|
||||
token_metadata_ptr _tm;
|
||||
service::topology* _topology;
|
||||
db::system_keyspace* _sys_ks;
|
||||
std::optional<locator::load_sketch> _load_sketch;
|
||||
// Holds the set of tablets already scheduled for transition during plan-making.
|
||||
std::unordered_set<global_tablet_id> _scheduled_tablets;
|
||||
@@ -890,10 +742,7 @@ private:
|
||||
return streaming_infos;
|
||||
}
|
||||
public:
|
||||
load_balancer(replica::database& db, token_metadata_ptr tm,
|
||||
service::topology* topology,
|
||||
db::system_keyspace* sys_ks,
|
||||
locator::load_stats_ptr table_load_stats,
|
||||
load_balancer(replica::database& db, token_metadata_ptr tm, locator::load_stats_ptr table_load_stats,
|
||||
load_balancer_stats_manager& stats,
|
||||
uint64_t target_tablet_size,
|
||||
unsigned tablets_per_shard_goal,
|
||||
@@ -902,26 +751,19 @@ public:
|
||||
, _tablets_per_shard_goal(tablets_per_shard_goal)
|
||||
, _db(db)
|
||||
, _tm(std::move(tm))
|
||||
, _topology(topology)
|
||||
, _sys_ks(sys_ks)
|
||||
, _table_load_stats(std::move(table_load_stats))
|
||||
, _stats(stats)
|
||||
, _skiplist(std::move(skiplist))
|
||||
{ }
|
||||
|
||||
bool ongoing_rack_list_colocation() const {
|
||||
return _topology != nullptr && _sys_ks != nullptr && !_topology->paused_rf_change_requests.empty();
|
||||
}
|
||||
|
||||
future<migration_plan> make_plan() {
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
migration_plan plan;
|
||||
|
||||
auto rack_list_colocation = ongoing_rack_list_colocation();
|
||||
if (!utils::get_local_injector().enter("tablet_migration_bypass")) {
|
||||
// Prepare plans for each DC separately and combine them to be executed in parallel.
|
||||
for (auto&& dc : topo.get_datacenters()) {
|
||||
if (_db.get_config().rf_rack_valid_keyspaces() || rack_list_colocation) {
|
||||
if (_db.get_config().rf_rack_valid_keyspaces()) {
|
||||
for (auto rack : topo.get_datacenter_racks().at(dc) | std::views::keys) {
|
||||
auto rack_plan = co_await make_plan(dc, rack);
|
||||
auto level = rack_plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
@@ -937,10 +779,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
if (rack_list_colocation) {
|
||||
plan.merge(co_await make_rack_list_colocation_plan(plan));
|
||||
}
|
||||
|
||||
// Merge table-wide resize decisions, may emit new decisions, revoke or finalize ongoing ones.
|
||||
// Note : Resize plans should be generated before repair plans to avoid scheduling repairs when there is pending resize finalization
|
||||
plan.merge_resize_plan(co_await make_resize_plan(plan));
|
||||
@@ -951,8 +789,8 @@ public:
|
||||
}
|
||||
|
||||
auto level = plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
lblogger.log(level, "Prepared {} migration plans, out of which there were {} tablet migration(s) and {} resize decision(s) and {} tablet repair(s) and {} rack-list colocation(s)",
|
||||
plan.size(), plan.tablet_migration_count(), plan.resize_decision_count(), plan.tablet_repair_count(), plan.tablet_rack_list_colocation_count());
|
||||
lblogger.log(level, "Prepared {} migration plans, out of which there were {} tablet migration(s) and {} resize decision(s) and {} tablet repair(s)",
|
||||
plan.size(), plan.tablet_migration_count(), plan.resize_decision_count(), plan.tablet_repair_count());
|
||||
co_return std::move(plan);
|
||||
}
|
||||
|
||||
@@ -977,58 +815,6 @@ public:
|
||||
co_return false;
|
||||
}
|
||||
|
||||
void ensure_node(node_load_map& nodes, host_id host) {
|
||||
if (nodes.contains(host)) {
|
||||
return;
|
||||
}
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
auto* node = topo.find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
node_load& load = nodes[host];
|
||||
load.id = host;
|
||||
load.node = node;
|
||||
load.shard_count = node->get_shard_count();
|
||||
load.shards.resize(load.shard_count);
|
||||
if (!load.shard_count) {
|
||||
throw std::runtime_error(format("Shard count of {} not found in topology", host));
|
||||
}
|
||||
if (!_db.features().tablet_load_stats_v2) {
|
||||
// This way load calculation will hold tablet count.
|
||||
load.capacity = _target_tablet_size * load.shard_count;
|
||||
} else if (_table_load_stats && _table_load_stats->capacity.contains(host)) {
|
||||
load.capacity = _table_load_stats->capacity.at(host);
|
||||
}
|
||||
}
|
||||
|
||||
future<> consider_scheduled_load(node_load_map& nodes) {
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
for (auto&& [table, tables] : _tm->tablets().all_table_groups()) {
|
||||
const auto& tmap = _tm->tablets().get_tablet_map(table);
|
||||
for (auto&& [tid, trinfo]: tmap.transitions()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
if (is_streaming(&trinfo)) {
|
||||
auto& tinfo = tmap.get_tablet_info(tid);
|
||||
apply_load(nodes, get_migration_streaming_info(topo, tinfo, trinfo));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
future<> consider_planned_load(node_load_map& nodes, const migration_plan& mplan) {
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
auto& tablet_meta = _tm->tablets();
|
||||
|
||||
for (const tablet_migration_info& tmi : mplan.migrations()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
auto& tmap = tablet_meta.get_tablet_map(tmi.tablet.table);
|
||||
auto& tinfo = tmap.get_tablet_info(tmi.tablet.tablet);
|
||||
auto streaming_info = get_migration_streaming_info(topo, tinfo, tmi);
|
||||
apply_load(nodes, streaming_info);
|
||||
}
|
||||
}
|
||||
|
||||
future<tablet_repair_plan> make_repair_plan(const migration_plan& mplan) {
|
||||
lblogger.debug("In make_repair_plan");
|
||||
|
||||
@@ -1044,19 +830,53 @@ public:
|
||||
// Populate the load of the migration that is already in the plan
|
||||
node_load_map nodes;
|
||||
// TODO: share code with make_plan()
|
||||
auto ensure_node = [&] (host_id host) {
|
||||
if (nodes.contains(host)) {
|
||||
return;
|
||||
}
|
||||
auto* node = topo.find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
node_load& load = nodes[host];
|
||||
load.id = host;
|
||||
load.node = node;
|
||||
load.shard_count = node->get_shard_count();
|
||||
load.shards.resize(load.shard_count);
|
||||
if (!load.shard_count) {
|
||||
throw std::runtime_error(format("Shard count of {} not found in topology", host));
|
||||
}
|
||||
};
|
||||
// TODO: share code with make_plan()
|
||||
topo.for_each_node([&] (const locator::node& node) {
|
||||
bool is_drained = node.get_state() == locator::node::state::being_decommissioned
|
||||
|| node.get_state() == locator::node::state::being_removed;
|
||||
if (node.get_state() == locator::node::state::normal || is_drained) {
|
||||
ensure_node(nodes, node.host_id());
|
||||
ensure_node(node.host_id());
|
||||
}
|
||||
});
|
||||
|
||||
// Consider load that is already scheduled
|
||||
co_await consider_scheduled_load(nodes);
|
||||
for (auto&& [table, tables] : _tm->tablets().all_table_groups()) {
|
||||
const auto& tmap = _tm->tablets().get_tablet_map(table);
|
||||
for (auto&& [tid, trinfo]: tmap.transitions()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
if (is_streaming(&trinfo)) {
|
||||
auto& tinfo = tmap.get_tablet_info(tid);
|
||||
apply_load(nodes, get_migration_streaming_info(topo, tinfo, trinfo));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Consider load that is about to be scheduled
|
||||
co_await consider_planned_load(nodes, mplan);
|
||||
auto& tablet_meta = _tm->tablets();
|
||||
for (const tablet_migration_info& tmi : mplan.migrations()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
auto& tmap = tablet_meta.get_tablet_map(tmi.tablet.table);
|
||||
auto& tinfo = tmap.get_tablet_info(tmi.tablet.tablet);
|
||||
auto streaming_info = get_migration_streaming_info(topo, tinfo, tmi);
|
||||
apply_load(nodes, streaming_info);
|
||||
}
|
||||
|
||||
struct repair_plan {
|
||||
locator::global_tablet_id gid;
|
||||
@@ -1139,109 +959,6 @@ public:
|
||||
co_return ret;
|
||||
}
|
||||
|
||||
future<migration_plan> make_rack_list_colocation_plan(const migration_plan& mplan) {
|
||||
lblogger.debug("In make_rack_list_colocation_plan");
|
||||
|
||||
migration_plan plan;
|
||||
tablet_rack_list_colocation_plan rack_list_plan;
|
||||
if (!ongoing_rack_list_colocation()) {
|
||||
co_return plan;
|
||||
}
|
||||
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
|
||||
auto migration_tablet_ids = co_await mplan.get_migration_tablet_ids();
|
||||
auto colocation_state = co_await find_required_rack_list_colocations(_db, _tm, _sys_ks,
|
||||
_topology->paused_rf_change_requests, std::move(migration_tablet_ids));
|
||||
|
||||
node_load_map nodes;
|
||||
topo.for_each_node([&] (const locator::node& node) {
|
||||
if (node.get_state() == locator::node::state::normal && !node.is_excluded()) {
|
||||
ensure_node(nodes, node.host_id());
|
||||
}
|
||||
});
|
||||
|
||||
// Consider load that is already scheduled.
|
||||
co_await consider_scheduled_load(nodes);
|
||||
|
||||
// Consider load that is about to be scheduled.
|
||||
co_await consider_planned_load(nodes, mplan);
|
||||
|
||||
std::unordered_set<global_tablet_id> colocation_tablet_ids;
|
||||
for (auto& [dc_rack, colocation_sources] : colocation_state.dst_dc_rack_to_tablets) {
|
||||
auto nodes_by_load_dst = nodes | std::views::filter([&] (const auto& host_load) {
|
||||
auto& [host, load] = host_load;
|
||||
auto& node = *load.node;
|
||||
return node.dc_rack() == dc_rack;
|
||||
}) | std::views::keys | std::ranges::to<std::vector<host_id>>();
|
||||
|
||||
if (nodes_by_load_dst.empty()) {
|
||||
lblogger.warn("No target nodes available for RF change colocation plan in dc {}, rack {}", dc_rack.dc, dc_rack.rack);
|
||||
if (auto it = colocation_state.dst_to_requests.find(dc_rack); it != colocation_state.dst_to_requests.end()) {
|
||||
rack_list_plan.maybe_add_request_to_resume(*it->second.begin());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
auto nodes_cmp = nodes_by_load_cmp(nodes);
|
||||
auto nodes_dst_cmp = [&] (const host_id& a, const host_id& b) {
|
||||
return nodes_cmp(b, a);
|
||||
};
|
||||
|
||||
// Ascending load heap of candidate target nodes.
|
||||
std::make_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
|
||||
const tablet_metadata& tmeta = _tm->tablets();
|
||||
for (colocation_source& source : colocation_sources) {
|
||||
if (colocation_tablet_ids.contains(source.gid)) {
|
||||
lblogger.debug("Skipped colocation of replica {} of tablet={}, another replica of which is about to be colocated", source.replica, source.gid);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Pick the least loaded node as target.
|
||||
std::pop_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
auto target = nodes_by_load_dst.back();
|
||||
auto& target_info = nodes[target];
|
||||
auto push_back_target_node = seastar::defer([&] {
|
||||
std::push_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
});
|
||||
|
||||
lblogger.debug("target node: {}, avg_load={}", target, target_info.avg_load);
|
||||
|
||||
auto dst = global_shard_id {target, _load_sketch->get_least_loaded_shard(target)};
|
||||
|
||||
lblogger.trace("target shard: {}, tablets={}, load={}", dst.shard,
|
||||
target_info.shards[dst.shard].tablet_count,
|
||||
target_info.shard_load(dst.shard, _target_tablet_size));
|
||||
|
||||
tablet_transition_kind kind = tablet_transition_kind::migration;
|
||||
migration_tablet_set source_tablets {
|
||||
.tablet_s = source.gid, // Ignore the merge co-location.
|
||||
};
|
||||
auto src = source.replica;
|
||||
auto mig = get_migration_info(source_tablets, kind, src, dst);
|
||||
auto& tmap = tmeta.get_tablet_map(source_tablets.table());
|
||||
auto mig_streaming_info = get_migration_streaming_infos(topo, tmap, mig);
|
||||
pick(*_load_sketch, dst.host, dst.shard, source_tablets);
|
||||
if (can_accept_load(nodes, mig_streaming_info)) {
|
||||
apply_load(nodes, mig_streaming_info);
|
||||
lblogger.debug("Adding migration: {}", mig);
|
||||
mark_as_scheduled(mig);
|
||||
for (auto& m : mig) {
|
||||
plan.add(std::move(m));
|
||||
colocation_tablet_ids.insert(m.tablet);
|
||||
}
|
||||
}
|
||||
update_node_load_on_migration(nodes, src, dst, source_tablets);
|
||||
}
|
||||
}
|
||||
if (colocation_state.request_to_resume) {
|
||||
rack_list_plan.maybe_add_request_to_resume(colocation_state.request_to_resume);
|
||||
}
|
||||
plan.set_rack_list_colocation_plan(std::move(rack_list_plan));
|
||||
co_return std::move(plan);
|
||||
}
|
||||
|
||||
// Returns true if a table has replicas of all its sibling tablets co-located.
|
||||
// This is used for determining whether merge can be finalized, since co-location
|
||||
// is a strict requirement for sibling tablets to be merged.
|
||||
@@ -3250,6 +2967,30 @@ public:
|
||||
node_load_map nodes;
|
||||
std::unordered_set<host_id> nodes_to_drain;
|
||||
|
||||
auto ensure_node = [&] (host_id host) {
|
||||
if (nodes.contains(host)) {
|
||||
return;
|
||||
}
|
||||
auto* node = topo.find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
node_load& load = nodes[host];
|
||||
load.id = host;
|
||||
load.node = node;
|
||||
load.shard_count = node->get_shard_count();
|
||||
load.shards.resize(load.shard_count);
|
||||
if (!load.shard_count) {
|
||||
throw std::runtime_error(format("Shard count of {} not found in topology", host));
|
||||
}
|
||||
if (!_db.features().tablet_load_stats_v2) {
|
||||
// This way load calculation will hold tablet count.
|
||||
load.capacity = _target_tablet_size * load.shard_count;
|
||||
} else if (_table_load_stats && _table_load_stats->capacity.contains(host)) {
|
||||
load.capacity = _table_load_stats->capacity.at(host);
|
||||
}
|
||||
};
|
||||
|
||||
_tm->for_each_token_owner([&] (const locator::node& node) {
|
||||
if (!node_filter(node)) {
|
||||
return;
|
||||
@@ -3258,7 +2999,7 @@ public:
|
||||
|| node.get_state() == locator::node::state::being_removed;
|
||||
if (node.get_state() == locator::node::state::normal || is_drained) {
|
||||
if (is_drained) {
|
||||
ensure_node(nodes, node.host_id());
|
||||
ensure_node(node.host_id());
|
||||
lblogger.info("Will drain node {} ({}) from DC {}", node.host_id(), node.get_state(), dc);
|
||||
nodes_to_drain.emplace(node.host_id());
|
||||
nodes[node.host_id()].drained = true;
|
||||
@@ -3266,7 +3007,7 @@ public:
|
||||
// Excluded nodes should not be chosen as targets for migration.
|
||||
lblogger.debug("Ignoring excluded node {}: state={}", node.host_id(), node.get_state());
|
||||
} else {
|
||||
ensure_node(nodes, node.host_id());
|
||||
ensure_node(node.host_id());
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -3299,7 +3040,7 @@ public:
|
||||
r, global_tablet_id{table, tid}));
|
||||
}
|
||||
if (node->left() && node_filter(*node)) {
|
||||
ensure_node(nodes, r.host);
|
||||
ensure_node(r.host);
|
||||
nodes_to_drain.insert(r.host);
|
||||
nodes[r.host].drained = true;
|
||||
}
|
||||
@@ -3501,7 +3242,7 @@ public:
|
||||
plan.merge(co_await make_intranode_plan(nodes, nodes_to_drain));
|
||||
}
|
||||
|
||||
if (_tm->tablets().balancing_enabled() && plan.empty() && !ongoing_rack_list_colocation()) {
|
||||
if (_tm->tablets().balancing_enabled() && plan.empty()) {
|
||||
auto dc_merge_plan = co_await make_merge_colocation_plan(dc, nodes);
|
||||
auto level = dc_merge_plan.tablet_migration_count() > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
lblogger.log(level, "Prepared {} migrations for co-locating sibling tablets in DC {}", dc_merge_plan.tablet_migration_count(), dc);
|
||||
@@ -3523,11 +3264,9 @@ class tablet_allocator_impl : public tablet_allocator::impl
|
||||
locator::load_stats_ptr _load_stats;
|
||||
private:
|
||||
load_balancer make_load_balancer(token_metadata_ptr tm,
|
||||
service::topology* topology,
|
||||
db::system_keyspace* sys_ks,
|
||||
locator::load_stats_ptr table_load_stats,
|
||||
std::unordered_set<host_id> skiplist) {
|
||||
load_balancer lb(_db, tm, topology, sys_ks, std::move(table_load_stats), _load_balancer_stats,
|
||||
load_balancer lb(_db, tm, std::move(table_load_stats), _load_balancer_stats,
|
||||
_db.get_config().target_tablet_size_in_bytes(),
|
||||
_db.get_config().tablets_per_shard_goal(),
|
||||
std::move(skiplist));
|
||||
@@ -3554,8 +3293,8 @@ public:
|
||||
_stopped = true;
|
||||
}
|
||||
|
||||
future<migration_plan> balance_tablets(token_metadata_ptr tm, service::topology* topology, db::system_keyspace* sys_ks, locator::load_stats_ptr table_load_stats, std::unordered_set<host_id> skiplist) {
|
||||
auto lb = make_load_balancer(tm, topology, sys_ks, table_load_stats ? table_load_stats : _load_stats, std::move(skiplist));
|
||||
future<migration_plan> balance_tablets(token_metadata_ptr tm, locator::load_stats_ptr table_load_stats, std::unordered_set<host_id> skiplist) {
|
||||
auto lb = make_load_balancer(tm, table_load_stats ? table_load_stats : _load_stats, std::move(skiplist));
|
||||
co_await coroutine::switch_to(_db.get_streaming_scheduling_group());
|
||||
co_return co_await lb.make_plan();
|
||||
}
|
||||
@@ -3575,7 +3314,7 @@ public:
|
||||
// Allocates new tablets for a table which is not co-located with another table.
|
||||
tablet_map allocate_tablets_for_new_base_table(const tablet_aware_replication_strategy* tablet_rs, const schema& s) {
|
||||
auto tm = _db.get_shared_token_metadata().get();
|
||||
auto lb = make_load_balancer(tm, nullptr, nullptr, nullptr, {});
|
||||
auto lb = make_load_balancer(tm, nullptr, {});
|
||||
auto plan = lb.make_sizing_plan(s.shared_from_this(), tablet_rs).get();
|
||||
auto& table_plan = plan.tables[s.id()];
|
||||
if (table_plan.target_tablet_count_aligned != table_plan.target_tablet_count) {
|
||||
@@ -3589,7 +3328,6 @@ public:
|
||||
|
||||
// Allocate tablets for multiple new tables, which may be co-located with each other, or co-located with an existing base table.
|
||||
void allocate_tablets_for_new_tables(const keyspace_metadata& ksm, const std::vector<schema_ptr>& cfms, utils::chunked_vector<mutation>& muts, api::timestamp_type ts) {
|
||||
utils::get_local_injector().inject("pause_in_allocate_tablets_for_new_table", utils::wait_for_message(std::chrono::minutes(5))).get();
|
||||
locator::replication_strategy_params params(ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option());
|
||||
auto tm = _db.get_shared_token_metadata().get();
|
||||
auto rs = abstract_replication_strategy::create_replication_strategy(ksm.strategy_name(), params, tm->get_topology());
|
||||
@@ -3631,7 +3369,7 @@ public:
|
||||
if (s.id() != base_id) {
|
||||
lblogger.debug("Creating tablets for {}.{} id={} with base={}", s.ks_name(), s.cf_name(), s.id(), base_id);
|
||||
muts.emplace_back(colocated_tablet_map_to_mutation(s.id(), s.ks_name(), s.cf_name(), base_id, ts));
|
||||
_db.get_notifier().before_allocate_tablet_map_in_notification(base_map, s, muts, ts);
|
||||
_db.get_notifier().before_allocate_tablet_map(base_map, s, muts, ts);
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -3647,7 +3385,7 @@ public:
|
||||
muts.emplace_back(std::move(m));
|
||||
return make_ready_future<>();
|
||||
}).get();
|
||||
_db.get_notifier().before_allocate_tablet_map_in_notification(base_map, s, muts, ts);
|
||||
_db.get_notifier().before_allocate_tablet_map(base_map, s, muts, ts);
|
||||
|
||||
create_colocated_tablet_maps(base_map);
|
||||
}
|
||||
@@ -3796,8 +3534,8 @@ future<> tablet_allocator::stop() {
|
||||
return impl().stop();
|
||||
}
|
||||
|
||||
future<migration_plan> tablet_allocator::balance_tablets(locator::token_metadata_ptr tm, service::topology* topology, db::system_keyspace* sys_ks, locator::load_stats_ptr load_stats, std::unordered_set<host_id> skiplist) {
|
||||
return impl().balance_tablets(std::move(tm), topology, sys_ks, std::move(load_stats), std::move(skiplist));
|
||||
future<migration_plan> tablet_allocator::balance_tablets(locator::token_metadata_ptr tm, locator::load_stats_ptr load_stats, std::unordered_set<host_id> skiplist) {
|
||||
return impl().balance_tablets(std::move(tm), std::move(load_stats), std::move(skiplist));
|
||||
}
|
||||
|
||||
void tablet_allocator::set_load_stats(locator::load_stats_ptr load_stats) {
|
||||
|
||||
@@ -14,14 +14,8 @@
|
||||
#include "locator/token_metadata_fwd.hh"
|
||||
#include <seastar/core/metrics.hh>
|
||||
|
||||
namespace db {
|
||||
class system_keyspace;
|
||||
}
|
||||
|
||||
namespace service {
|
||||
|
||||
class topology;
|
||||
|
||||
struct load_balancer_dc_stats {
|
||||
uint64_t calls = 0;
|
||||
uint64_t migrations_produced = 0;
|
||||
@@ -139,26 +133,6 @@ struct tablet_repair_plan {
|
||||
}
|
||||
};
|
||||
|
||||
struct tablet_rack_list_colocation_plan {
|
||||
utils::UUID _request_to_resume;
|
||||
|
||||
const utils::UUID& request_to_resume() const noexcept {
|
||||
return _request_to_resume;
|
||||
}
|
||||
|
||||
size_t size() const { return _request_to_resume ? 1 : 0; };
|
||||
|
||||
void merge(tablet_rack_list_colocation_plan&& other) {
|
||||
_request_to_resume = _request_to_resume ? _request_to_resume : other._request_to_resume;
|
||||
}
|
||||
|
||||
void maybe_add_request_to_resume(const utils::UUID& id) {
|
||||
if (!_request_to_resume) {
|
||||
_request_to_resume = id;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class migration_plan {
|
||||
public:
|
||||
using migrations_vector = utils::chunked_vector<tablet_migration_info>;
|
||||
@@ -166,19 +140,17 @@ private:
|
||||
migrations_vector _migrations;
|
||||
table_resize_plan _resize_plan;
|
||||
tablet_repair_plan _repair_plan;
|
||||
tablet_rack_list_colocation_plan _rack_list_colocation_plan;
|
||||
bool _has_nodes_to_drain = false;
|
||||
public:
|
||||
/// Returns true iff there are decommissioning nodes which own some tablet replicas.
|
||||
bool has_nodes_to_drain() const { return _has_nodes_to_drain; }
|
||||
|
||||
const migrations_vector& migrations() const { return _migrations; }
|
||||
bool empty() const { return _migrations.empty() && !_resize_plan.size() && !_repair_plan.size() && !_rack_list_colocation_plan.size(); }
|
||||
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size() + _rack_list_colocation_plan.size(); }
|
||||
bool empty() const { return _migrations.empty() && !_resize_plan.size() && !_repair_plan.size();}
|
||||
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size(); }
|
||||
size_t tablet_migration_count() const { return _migrations.size(); }
|
||||
size_t resize_decision_count() const { return _resize_plan.size(); }
|
||||
size_t tablet_repair_count() const { return _repair_plan.size(); }
|
||||
size_t tablet_rack_list_colocation_count() const { return _rack_list_colocation_plan.size(); }
|
||||
|
||||
void add(tablet_migration_info info) {
|
||||
_migrations.emplace_back(std::move(info));
|
||||
@@ -195,7 +167,6 @@ public:
|
||||
_has_nodes_to_drain |= other._has_nodes_to_drain;
|
||||
_resize_plan.merge(std::move(other._resize_plan));
|
||||
_repair_plan.merge(std::move(other._repair_plan));
|
||||
_rack_list_colocation_plan.merge(std::move(other._rack_list_colocation_plan));
|
||||
}
|
||||
|
||||
void set_has_nodes_to_drain(bool b) {
|
||||
@@ -214,12 +185,6 @@ public:
|
||||
_repair_plan = std::move(repair);
|
||||
}
|
||||
|
||||
const tablet_rack_list_colocation_plan& rack_list_colocation_plan() const { return _rack_list_colocation_plan; }
|
||||
|
||||
void set_rack_list_colocation_plan(tablet_rack_list_colocation_plan rack_list_colocation_plan) {
|
||||
_rack_list_colocation_plan = std::move(rack_list_colocation_plan);
|
||||
}
|
||||
|
||||
future<std::unordered_set<locator::global_tablet_id>> get_migration_tablet_ids() const;
|
||||
};
|
||||
|
||||
@@ -265,7 +230,7 @@ public:
|
||||
///
|
||||
/// The algorithm takes care of limiting the streaming load on the system, also by taking active migrations into account.
|
||||
///
|
||||
future<migration_plan> balance_tablets(locator::token_metadata_ptr, service::topology*, db::system_keyspace*, locator::load_stats_ptr = {}, std::unordered_set<locator::host_id> = {});
|
||||
future<migration_plan> balance_tablets(locator::token_metadata_ptr, locator::load_stats_ptr = {}, std::unordered_set<locator::host_id> = {});
|
||||
|
||||
void set_load_stats(locator::load_stats_ptr);
|
||||
|
||||
@@ -281,12 +246,6 @@ public:
|
||||
void on_leadership_lost();
|
||||
};
|
||||
|
||||
future<bool> requires_rack_list_colocation(
|
||||
replica::database& db,
|
||||
locator::token_metadata_ptr tmptr,
|
||||
db::system_keyspace* sys_ks,
|
||||
utils::UUID request_id);
|
||||
|
||||
}
|
||||
|
||||
template <>
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/task_manager_module.hh"
|
||||
#include "service/topology_state_machine.hh"
|
||||
#include "tasks/task_handler.hh"
|
||||
#include "tasks/virtual_task_hint.hh"
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
@@ -289,116 +288,4 @@ std::set<locator::host_id> task_manager_module::get_nodes() const {
|
||||
return get_task_manager().get_nodes(_ss);
|
||||
}
|
||||
|
||||
namespace topo {
|
||||
|
||||
static tasks::task_manager::task_state get_state(const db::system_keyspace::topology_requests_entry& entry) {
|
||||
if (!entry.id) {
|
||||
return tasks::task_manager::task_state::created;
|
||||
} else if (!entry.done) {
|
||||
return tasks::task_manager::task_state::running;
|
||||
} else if (entry.error == "") {
|
||||
return tasks::task_manager::task_state::done;
|
||||
} else {
|
||||
return tasks::task_manager::task_state::failed;
|
||||
}
|
||||
}
|
||||
|
||||
tasks::task_manager::task_group global_topology_request_virtual_task::get_group() const noexcept {
|
||||
return tasks::task_manager::task_group::global_topology_change_group;
|
||||
}
|
||||
|
||||
future<std::optional<tasks::virtual_task_hint>> global_topology_request_virtual_task::contains(tasks::task_id task_id) const {
|
||||
if (!task_id.uuid().is_timestamp()) {
|
||||
// Task id of node ops operation is always a timestamp.
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
auto hint = std::make_optional<tasks::virtual_task_hint>({});
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(task_id.uuid());
|
||||
if (entry.has_value() && std::holds_alternative<service::global_topology_request>(entry->request_type) &&
|
||||
std::get<service::global_topology_request>(entry->request_type) == global_topology_request::keyspace_rf_change) {
|
||||
co_return hint;
|
||||
}
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
future<tasks::is_abortable> global_topology_request_virtual_task::is_abortable(tasks::virtual_task_hint) const {
|
||||
return make_ready_future<tasks::is_abortable>(tasks::is_abortable::yes);
|
||||
}
|
||||
|
||||
static tasks::task_stats get_task_stats(const db::system_keyspace::topology_requests_entry& entry) {
|
||||
return tasks::task_stats{
|
||||
.task_id = tasks::task_id{entry.id},
|
||||
.type = fmt::to_string(entry.request_type),
|
||||
.kind = tasks::task_kind::cluster,
|
||||
.scope = "keyspace",
|
||||
.state = get_state(entry),
|
||||
.sequence_number = 0,
|
||||
.keyspace = entry.new_keyspace_rf_change_ks_name.value_or(""),
|
||||
.table = "",
|
||||
.entity = "",
|
||||
.shard = 0,
|
||||
.start_time = entry.start_time,
|
||||
.end_time = entry.end_time,
|
||||
};
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> global_topology_request_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(id.uuid());
|
||||
if (!entry.has_value()) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
auto task_stats = get_task_stats(*entry);
|
||||
co_return tasks::task_status{
|
||||
.task_id = task_stats.task_id,
|
||||
.type = task_stats.type,
|
||||
.kind = task_stats.kind,
|
||||
.scope = task_stats.scope,
|
||||
.state = task_stats.state,
|
||||
.is_abortable = co_await is_abortable(std::move(hint)),
|
||||
.start_time = task_stats.start_time,
|
||||
.end_time = task_stats.end_time,
|
||||
.error = entry->error,
|
||||
.parent_id = tasks::task_id::create_null_id(),
|
||||
.sequence_number = task_stats.sequence_number,
|
||||
.shard = task_stats.shard,
|
||||
.keyspace = task_stats.keyspace,
|
||||
.table = task_stats.table,
|
||||
.entity = task_stats.entity,
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = utils::chunked_vector<tasks::task_identity>{},
|
||||
};
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> global_topology_request_virtual_task::wait(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry = co_await get_status(id, hint);
|
||||
if (!entry) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
co_await _ss.wait_for_topology_request_completion(id.uuid(), false);
|
||||
co_return co_await get_status(id, std::move(hint));
|
||||
}
|
||||
|
||||
future<> global_topology_request_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hint) noexcept {
|
||||
return _ss.abort_paused_rf_change(id.uuid());
|
||||
}
|
||||
|
||||
future<std::vector<tasks::task_stats>> global_topology_request_virtual_task::get_stats() {
|
||||
db::system_keyspace& sys_ks = _ss._sys_ks.local();
|
||||
co_return std::ranges::to<std::vector<tasks::task_stats>>(co_await sys_ks.get_topology_request_entries({global_topology_request::keyspace_rf_change}, db_clock::now() - get_task_manager().get_user_task_ttl())
|
||||
| std::views::transform([] (const auto& e) {
|
||||
auto& entry = e.second;
|
||||
return get_task_stats(entry);
|
||||
}));
|
||||
}
|
||||
|
||||
task_manager_module::task_manager_module(tasks::task_manager& tm) noexcept
|
||||
: tasks::task_manager::module(tm, "global_topology_requests")
|
||||
{}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -54,33 +54,4 @@ public:
|
||||
|
||||
std::set<locator::host_id> get_nodes() const override;
|
||||
};
|
||||
|
||||
namespace topo {
|
||||
|
||||
class global_topology_request_virtual_task : public tasks::task_manager::virtual_task::impl {
|
||||
private:
|
||||
service::storage_service& _ss;
|
||||
public:
|
||||
global_topology_request_virtual_task(tasks::task_manager::module_ptr module,
|
||||
service::storage_service& ss)
|
||||
: tasks::task_manager::virtual_task::impl(std::move(module))
|
||||
, _ss(ss)
|
||||
{}
|
||||
virtual tasks::task_manager::task_group get_group() const noexcept override;
|
||||
virtual future<std::optional<tasks::virtual_task_hint>> contains(tasks::task_id task_id) const override;
|
||||
virtual future<tasks::is_abortable> is_abortable(tasks::virtual_task_hint hint) const override;
|
||||
|
||||
virtual future<std::optional<tasks::task_status>> get_status(tasks::task_id id, tasks::virtual_task_hint hint) override;
|
||||
virtual future<std::optional<tasks::task_status>> wait(tasks::task_id id, tasks::virtual_task_hint hint) override;
|
||||
virtual future<> abort(tasks::task_id id, tasks::virtual_task_hint hint) noexcept override;
|
||||
virtual future<std::vector<tasks::task_stats>> get_stats() override;
|
||||
};
|
||||
|
||||
class task_manager_module : public tasks::task_manager::module {
|
||||
public:
|
||||
task_manager_module(tasks::task_manager& tm) noexcept;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <fmt/ranges.h>
|
||||
|
||||
@@ -55,7 +54,6 @@
|
||||
#include "service/topology_state_machine.hh"
|
||||
#include "db/view/view_building_coordinator.hh"
|
||||
#include "topology_mutation.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/stall_free.hh"
|
||||
@@ -955,7 +953,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
} else {
|
||||
assert(_feature_service.topology_global_request_queue);
|
||||
req_id = _topo_sm._topology.global_requests_queue[0];
|
||||
req_entry = co_await _sys_ks.get_topology_request_entry(req_id);
|
||||
req_entry = co_await _sys_ks.get_topology_request_entry(req_id, true);
|
||||
req = std::get<global_topology_request>(req_entry.request_type);
|
||||
}
|
||||
switch (req) {
|
||||
@@ -999,7 +997,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
sstring error;
|
||||
bool needs_colocation = false;
|
||||
if (_db.has_keyspace(ks_name)) {
|
||||
try {
|
||||
auto& ks = _db.find_keyspace(ks_name);
|
||||
@@ -1007,40 +1004,12 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{saved_ks_props.begin(), saved_ks_props.end()}};
|
||||
new_ks_props.validate();
|
||||
auto ks_md = new_ks_props.as_ks_metadata_update(ks.metadata(), *tmptr, _db.features(), _db.get_config());
|
||||
_db.validate_keyspace_update(*ks_md);
|
||||
size_t unimportant_init_tablet_count = 2; // must be a power of 2
|
||||
locator::tablet_map new_tablet_map{unimportant_init_tablet_count};
|
||||
|
||||
auto schedule_migrations = [&] () -> future<> {
|
||||
auto tables_with_mvs = ks.metadata()->tables();
|
||||
auto views = ks.metadata()->views();
|
||||
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
|
||||
if (tables_with_mvs.empty()) {
|
||||
co_return;
|
||||
}
|
||||
auto table = tables_with_mvs.front();
|
||||
auto tablet_count = tmptr->tablets().get_tablet_map(table->id()).tablet_count();
|
||||
locator::replication_strategy_params params{ks_md->strategy_options(), tablet_count, ks.metadata()->consistency_option()};
|
||||
auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params, tmptr->get_topology());
|
||||
|
||||
auto check_needs_colocation = [&] () -> future<bool> {
|
||||
const auto& new_replication_strategy_config = new_strategy->get_config_options();
|
||||
const auto& old_replication_strategy_config = ks.metadata()->strategy_options();
|
||||
bool rack_list_conversion = false;
|
||||
for (const auto& [dc, rf_value] : new_replication_strategy_config) {
|
||||
if (std::holds_alternative<locator::rack_list>(rf_value)) {
|
||||
auto it = old_replication_strategy_config.find(dc);
|
||||
if (it != old_replication_strategy_config.end() && std::holds_alternative<sstring>(it->second)) {
|
||||
rack_list_conversion = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
co_return rack_list_conversion ? co_await requires_rack_list_colocation(_db, tmptr, &_sys_ks, req_id) : false;
|
||||
};
|
||||
if (needs_colocation = co_await check_needs_colocation(); needs_colocation) {
|
||||
co_return;
|
||||
}
|
||||
for (const auto& table_or_mv : tables_with_mvs) {
|
||||
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
|
||||
// Apply the transition only on base tables.
|
||||
@@ -1049,6 +1018,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
continue;
|
||||
}
|
||||
auto old_tablets = co_await tmptr->tablets().get_tablet_map(table_or_mv->id()).clone_gently();
|
||||
locator::replication_strategy_params params{ks_md->strategy_options(), old_tablets.tablet_count(), ks.metadata()->consistency_option()};
|
||||
auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params, tmptr->get_topology());
|
||||
new_tablet_map = co_await new_strategy->maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, co_await old_tablets.clone_gently());
|
||||
|
||||
replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
|
||||
@@ -1075,8 +1046,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
co_await coroutine::maybe_yield();
|
||||
});
|
||||
}
|
||||
};
|
||||
co_await schedule_migrations();
|
||||
|
||||
auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
|
||||
for (auto& m: schema_muts) {
|
||||
@@ -1092,22 +1061,16 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
error = "Can't ALTER keyspace " + ks_name + ", keyspace doesn't exist";
|
||||
}
|
||||
|
||||
bool pause_request = needs_colocation && error.empty();
|
||||
topology_mutation_builder tbuilder(guard.write_timestamp());
|
||||
tbuilder.set_transition_state(topology::transition_state::tablet_migration)
|
||||
updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
|
||||
.set_transition_state(topology::transition_state::tablet_migration)
|
||||
.set_version(_topo_sm._topology.version + 1)
|
||||
.del_global_topology_request()
|
||||
.del_global_topology_request_id()
|
||||
.drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id);
|
||||
if (pause_request) {
|
||||
rtlogger.info("keyspace_rf_change for keyspace {} postponed for colocation", ks_name);
|
||||
tbuilder.pause_rf_change_request(req_id);
|
||||
} else {
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
|
||||
.drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id)
|
||||
.build()));
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
|
||||
.done(error)
|
||||
.build()));
|
||||
}
|
||||
updates.push_back(canonical_mutation(tbuilder.build()));
|
||||
|
||||
sstring reason = seastar::format("ALTER tablets KEYSPACE called with options: {}", saved_ks_props);
|
||||
rtlogger.trace("do update {} reason {}", updates, reason);
|
||||
@@ -1371,14 +1334,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
.build());
|
||||
}
|
||||
|
||||
void generate_rf_change_resume_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, utils::UUID request_to_resume) {
|
||||
rtlogger.debug("Generating RF change resume for request id {}", request_to_resume);
|
||||
out.emplace_back(topology_mutation_builder(guard.write_timestamp())
|
||||
.queue_global_topology_request_id(request_to_resume)
|
||||
.resume_rf_change_request(_topo_sm._topology.paused_rf_change_requests, request_to_resume)
|
||||
.build());
|
||||
}
|
||||
|
||||
future<> generate_migration_updates(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const migration_plan& plan) {
|
||||
if (plan.resize_plan().finalize_resize.empty() || plan.has_nodes_to_drain()) {
|
||||
// schedule tablet migration only if there are no pending resize finalisations or if the node is draining.
|
||||
@@ -1386,10 +1341,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
co_await coroutine::maybe_yield();
|
||||
generate_migration_update(out, guard, mig);
|
||||
}
|
||||
|
||||
if (auto request_to_resume = plan.rack_list_colocation_plan().request_to_resume(); request_to_resume) {
|
||||
generate_rf_change_resume_update(out, guard, request_to_resume);
|
||||
}
|
||||
}
|
||||
|
||||
auto sched_time = db_clock::now();
|
||||
@@ -1880,7 +1831,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
bool has_nodes_to_drain = false;
|
||||
if (!preempt) {
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), {}, get_dead_nodes());
|
||||
has_nodes_to_drain = plan.has_nodes_to_drain();
|
||||
if (!drain || plan.has_nodes_to_drain()) {
|
||||
co_await generate_migration_updates(updates, guard, plan);
|
||||
@@ -2003,7 +1954,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
co_await utils::get_local_injector().inject("tablet_resize_finalization_post_barrier", utils::wait_for_message(std::chrono::minutes(2)));
|
||||
|
||||
auto tm = get_token_metadata_ptr();
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, {}, get_dead_nodes());
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
updates.reserve(plan.resize_plan().finalize_resize.size() * 2 + 1);
|
||||
@@ -2083,7 +2034,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
// We should perform TRUNCATE only if the session is still valid. It could be cleared if a previous truncate
|
||||
// handler performed the truncate and cleared the session, but crashed before finalizing the request
|
||||
if (_topo_sm._topology.session) {
|
||||
const auto topology_requests_entry = co_await _sys_ks.get_topology_request_entry(global_request_id);
|
||||
const auto topology_requests_entry = co_await _sys_ks.get_topology_request_entry(global_request_id, true);
|
||||
const table_id& table_id = topology_requests_entry.truncate_table_id;
|
||||
lw_shared_ptr<replica::table> table = _db.get_tables_metadata().get_table_if_exists(table_id);
|
||||
|
||||
@@ -2672,7 +2623,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
while (utils::get_local_injector().enter("topology_coordinator_pause_after_streaming")) {
|
||||
co_await sleep_abortable(std::chrono::milliseconds(10), _as);
|
||||
}
|
||||
const bool removenode_with_left_token_ring = _feature_service.removenode_with_left_token_ring;
|
||||
auto node = get_node_to_work_on(std::move(guard));
|
||||
bool barrier_failed = false;
|
||||
// In this state writes goes to old and new replicas but reads start to be done from new replicas
|
||||
@@ -2727,9 +2677,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
break;
|
||||
case node_state::removing: {
|
||||
co_await utils::get_local_injector().inject("delay_node_removal", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (!removenode_with_left_token_ring) {
|
||||
node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);
|
||||
}
|
||||
node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);
|
||||
}
|
||||
[[fallthrough]];
|
||||
case node_state::decommissioning: {
|
||||
@@ -2737,10 +2685,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
node_state next_state;
|
||||
utils::chunked_vector<canonical_mutation> muts;
|
||||
muts.reserve(2);
|
||||
if (removenode_with_left_token_ring || node.rs->state == node_state::decommissioning) {
|
||||
// Both decommission and removenode go through left_token_ring state
|
||||
// to ensure a global barrier is executed before the request is marked as done.
|
||||
// This ensures all nodes have observed the topology change.
|
||||
if (node.rs->state == node_state::decommissioning) {
|
||||
next_state = node.rs->state;
|
||||
builder.set_transition_state(topology::transition_state::left_token_ring);
|
||||
} else {
|
||||
@@ -2815,16 +2760,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
case topology::transition_state::left_token_ring: {
|
||||
auto node = get_node_to_work_on(std::move(guard));
|
||||
|
||||
// Need to be captured as the node variable might become invalid (e.g. moved out) at particular points.
|
||||
const auto node_rs_state = node.rs->state;
|
||||
|
||||
const bool is_removenode = node_rs_state == node_state::removing;
|
||||
|
||||
if (is_removenode && !_feature_service.removenode_with_left_token_ring) {
|
||||
on_internal_error(
|
||||
rtlogger, "removenode operation can only enter the left_token_ring state when REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled");
|
||||
}
|
||||
|
||||
auto finish_left_token_ring_transition = [&](node_to_work_on& node) -> future<> {
|
||||
// Remove the node from group0 here - in general, it won't be able to leave on its own
|
||||
// because we'll ban it as soon as we tell it to shut down.
|
||||
@@ -2844,16 +2779,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
muts.push_back(builder.build());
|
||||
co_await remove_view_build_statuses_on_left_node(muts, node.guard, node.id);
|
||||
co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(node.id.uuid()), muts);
|
||||
auto str = std::invoke([&]() {
|
||||
switch (node_rs_state) {
|
||||
case node_state::decommissioning:
|
||||
return ::format("finished decommissioning node {}", node.id);
|
||||
case node_state::removing:
|
||||
return ::format("finished removing node {}", node.id);
|
||||
default:
|
||||
return ::format("finished rollback of {} after {} failure", node.id, node.rs->state);
|
||||
}
|
||||
});
|
||||
auto str = node.rs->state == node_state::decommissioning
|
||||
? ::format("finished decommissioning node {}", node.id)
|
||||
: ::format("finished rollback of {} after {} failure", node.id, node.rs->state);
|
||||
co_await update_topology_state(take_guard(std::move(node)), std::move(muts), std::move(str));
|
||||
};
|
||||
|
||||
@@ -2866,11 +2794,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
}
|
||||
|
||||
if (node.id == _raft.id()) {
|
||||
// Removed node must be dead, so it shouldn't enter here (it can't coordinate its own removal).
|
||||
if (is_removenode) {
|
||||
on_internal_error(rtlogger, "removenode operation cannot be coordinated by the removed node itself");
|
||||
}
|
||||
|
||||
// Someone else needs to coordinate the rest of the decommission process,
|
||||
// because the decommissioning node is going to shut down in the middle of this state.
|
||||
rtlogger.info("coordinator is decommissioning; giving up leadership");
|
||||
@@ -2884,13 +2807,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
bool barrier_failed = false;
|
||||
// Wait until other nodes observe the new token ring and stop sending writes to this node.
|
||||
auto excluded_nodes = get_excluded_nodes_for_topology_request(node);
|
||||
try {
|
||||
// Removed node is added to ignored nodes, so it should be automatically excluded.
|
||||
if (is_removenode && !excluded_nodes.contains(node.id)) {
|
||||
on_internal_error(rtlogger, "removenode operation must have the removed node in excluded_nodes");
|
||||
}
|
||||
node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), std::move(excluded_nodes)), node.id);
|
||||
node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), get_excluded_nodes_for_topology_request(node)), node.id);
|
||||
} catch (term_changed_error&) {
|
||||
throw;
|
||||
} catch (group0_concurrent_modification&) {
|
||||
@@ -2907,17 +2825,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
}
|
||||
|
||||
if (barrier_failed) {
|
||||
// If barrier above failed it means there may be unfinished writes to a decommissioned node,
|
||||
// or some nodes might not have observed the new topology yet (one purpose of the barrier
|
||||
// is to make sure all nodes observed the new topology before completing the request).
|
||||
// If barrier above failed it means there may be unfinished writes to a decommissioned node.
|
||||
// Lets wait for the ring delay for those writes to complete and new topology to propagate
|
||||
// before continuing.
|
||||
co_await sleep_abortable(_ring_delay, _as);
|
||||
node = retake_node(co_await start_operation(), node.id);
|
||||
}
|
||||
|
||||
// Make decommissioning/removed node a non voter before reporting operation completion below.
|
||||
// Otherwise the node may see the completion and exit before it is removed from
|
||||
// Make decommissioning node a non voter before reporting operation completion below.
|
||||
// Otherwise the decommissioned node may see the completion and exit before it is removed from
|
||||
// the config at which point the removal from the config will hang if the cluster had only two
|
||||
// nodes before the decommission.
|
||||
co_await _voter_handler.on_node_removed(node.id, _as);
|
||||
@@ -2928,7 +2844,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
co_await update_topology_state(take_guard(std::move(node)), {rtbuilder.build()}, "report request completion in left_token_ring state");
|
||||
|
||||
// For decommission/rollback: Tell the node to shut down.
|
||||
// Tell the node to shut down.
|
||||
// This is done to improve user experience when there are no failures.
|
||||
// In the next state (`node_state::left`), the node will be banned by the rest of the cluster,
|
||||
// so there's no guarantee that it would learn about entering that state even if it was still
|
||||
@@ -2937,19 +2853,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
// There is the possibility that the node will never get the message
|
||||
// and decommission will hang on that node.
|
||||
// This is fine for the rest of the cluster - we will still remove, ban the node and continue.
|
||||
//
|
||||
// For removenode: The node is already dead, no need to send shutdown command.
|
||||
auto node_id = node.id;
|
||||
bool shutdown_failed = false;
|
||||
if (!is_removenode) {
|
||||
try {
|
||||
node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::barrier);
|
||||
} catch (...) {
|
||||
rtlogger.warn("failed to tell node {} to shut down - it may hang."
|
||||
" It's safe to shut it down manually now. (Exception: {})",
|
||||
node.id, std::current_exception());
|
||||
shutdown_failed = true;
|
||||
}
|
||||
try {
|
||||
node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::barrier);
|
||||
} catch (...) {
|
||||
rtlogger.warn("failed to tell node {} to shut down - it may hang."
|
||||
" It's safe to shut it down manually now. (Exception: {})",
|
||||
node.id, std::current_exception());
|
||||
shutdown_failed = true;
|
||||
}
|
||||
if (shutdown_failed) {
|
||||
node = retake_node(co_await start_operation(), node_id);
|
||||
@@ -3546,7 +3458,7 @@ future<bool> topology_coordinator::maybe_start_tablet_migration(group0_guard gua
|
||||
}
|
||||
|
||||
auto tm = get_token_metadata_ptr();
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, {}, get_dead_nodes());
|
||||
if (plan.empty()) {
|
||||
rtlogger.debug("Tablet load balancer did not make any plan");
|
||||
co_return false;
|
||||
|
||||
@@ -256,20 +256,6 @@ topology_mutation_builder& topology_mutation_builder::drop_first_global_topology
|
||||
}
|
||||
}
|
||||
|
||||
topology_mutation_builder& topology_mutation_builder::pause_rf_change_request(const utils::UUID& id) {
|
||||
return apply_set("paused_rf_change_requests", collection_apply_mode::update, std::vector<data_value>{id});
|
||||
}
|
||||
|
||||
topology_mutation_builder& topology_mutation_builder::resume_rf_change_request(const std::unordered_set<utils::UUID>& values, const utils::UUID& id) {
|
||||
if (values.contains(id)) {
|
||||
auto new_values = values;
|
||||
new_values.erase(id);
|
||||
return apply_set("paused_rf_change_requests", collection_apply_mode::overwrite, new_values | std::views::transform([] (const auto& id) { return data_value{id}; }));
|
||||
} else {
|
||||
return *this;
|
||||
}
|
||||
}
|
||||
|
||||
topology_mutation_builder& topology_mutation_builder::set_upgrade_state(topology::upgrade_state_type value) {
|
||||
return apply_atomic("upgrade_state", ::format("{}", value));
|
||||
}
|
||||
|
||||
@@ -129,8 +129,6 @@ public:
|
||||
topology_mutation_builder& del_global_topology_request_id();
|
||||
topology_mutation_builder& queue_global_topology_request_id(const utils::UUID& value);
|
||||
topology_mutation_builder& drop_first_global_topology_request_id(const std::vector<utils::UUID>&, const utils::UUID&);
|
||||
topology_mutation_builder& pause_rf_change_request(const utils::UUID&);
|
||||
topology_mutation_builder& resume_rf_change_request(const std::unordered_set<utils::UUID>&, const utils::UUID&);
|
||||
topology_node_mutation_builder& with_node(raft::server_id);
|
||||
canonical_mutation build() { return canonical_mutation{std::move(_m)}; }
|
||||
};
|
||||
|
||||
@@ -180,10 +180,6 @@ struct topology {
|
||||
// The KS options to be used when executing the scheduled ALTER KS statement
|
||||
std::optional<std::unordered_map<sstring, sstring>> new_keyspace_rf_change_data;
|
||||
|
||||
// The ids of RF change requests that are paused because they require tablet co-location.
|
||||
// It may happen during altering from numerical RF to rack list.
|
||||
std::unordered_set<utils::UUID> paused_rf_change_requests;
|
||||
|
||||
// The IDs of the committed yet unpublished CDC generations sorted by timestamps.
|
||||
std::vector<cdc::generation_id_v2> unpublished_cdc_generations;
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ enum class component_type {
|
||||
TemporaryTOC,
|
||||
TemporaryStatistics,
|
||||
Scylla,
|
||||
TemporaryScylla,
|
||||
Rows,
|
||||
Partitions,
|
||||
TemporaryHashes,
|
||||
@@ -76,6 +77,8 @@ struct fmt::formatter<sstables::component_type> : fmt::formatter<string_view> {
|
||||
return formatter<string_view>::format("TemporaryStatistics", ctx);
|
||||
case Scylla:
|
||||
return formatter<string_view>::format("Scylla", ctx);
|
||||
case TemporaryScylla:
|
||||
return formatter<string_view>::format("TemporaryScylla", ctx);
|
||||
case Partitions:
|
||||
return formatter<string_view>::format("Partitions", ctx);
|
||||
case Rows:
|
||||
|
||||
@@ -632,6 +632,10 @@ private:
|
||||
std::unique_ptr<file_writer> close_writer(std::unique_ptr<file_writer>& w);
|
||||
|
||||
void close_data_writer();
|
||||
void close_index_writer();
|
||||
void close_rows_writer();
|
||||
void close_partitions_writer();
|
||||
|
||||
void ensure_tombstone_is_written() {
|
||||
if (!_tombstone_written) {
|
||||
consume(tombstone());
|
||||
@@ -944,17 +948,16 @@ void writer::init_file_writers() {
|
||||
_sst._schema->get_compressor_params(),
|
||||
std::move(compressor)), _sst.get_filename());
|
||||
}
|
||||
|
||||
if (_sst.has_component(component_type::Index)) {
|
||||
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Index).get();
|
||||
_index_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), _sst.index_filename());
|
||||
_index_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, _sst.index_filename());
|
||||
}
|
||||
if (_sst.has_component(component_type::Partitions) && _sst.has_component(component_type::Rows)) {
|
||||
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Rows).get();
|
||||
_rows_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), component_name(_sst, component_type::Rows));
|
||||
_rows_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, component_name(_sst, component_type::Rows));
|
||||
_bti_row_index_writer = trie::bti_row_index_writer(*_rows_writer);
|
||||
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Partitions).get();
|
||||
_partitions_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), component_name(_sst, component_type::Partitions));
|
||||
_partitions_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, component_name(_sst, component_type::Partitions));
|
||||
_bti_partition_index_writer = trie::bti_partition_index_writer(*_partitions_writer);
|
||||
}
|
||||
if (_delayed_filter) {
|
||||
@@ -982,6 +985,41 @@ void writer::close_data_writer() {
|
||||
}
|
||||
}
|
||||
|
||||
void writer::close_index_writer() {
|
||||
if (_index_writer) {
|
||||
auto writer = close_writer(_index_writer);
|
||||
auto chksum_wr = static_cast<crc32_digest_file_writer*>(writer.get());
|
||||
_sst.get_components_digests().index_digest = chksum_wr->full_checksum();
|
||||
}
|
||||
}
|
||||
|
||||
void writer::close_partitions_writer() {
|
||||
if (_partitions_writer) {
|
||||
_sst._partitions_db_footer = std::move(*_bti_partition_index_writer).finish(
|
||||
_sst.get_version(),
|
||||
_first_key.value(),
|
||||
_last_key.value());
|
||||
auto writer = close_writer(_partitions_writer);
|
||||
auto chksum_wr = static_cast<crc32_digest_file_writer*>(writer.get());
|
||||
_sst.get_components_digests().partitions_digest = chksum_wr->full_checksum();
|
||||
}
|
||||
}
|
||||
|
||||
void writer::close_rows_writer() {
|
||||
if (_rows_writer) {
|
||||
// Append some garbage padding to the file just to ensure that it's never empty.
|
||||
// (Otherwise it would be empty if the sstable contains only small partitions).
|
||||
// This is a hack to work around some bad interactions between zero-sized files
|
||||
// and object storage. (It seems that e.g. minio considers a zero-sized file
|
||||
// upload to be a no-op, which breaks some assumptions).
|
||||
uint32_t garbage = seastar::cpu_to_be(0x13371337);
|
||||
_rows_writer->write(reinterpret_cast<const char*>(&garbage), sizeof(garbage));
|
||||
auto writer = close_writer(_rows_writer);
|
||||
auto chksum_wr = static_cast<crc32_digest_file_writer*>(writer.get());
|
||||
_sst.get_components_digests().rows_digest = chksum_wr->full_checksum();
|
||||
}
|
||||
}
|
||||
|
||||
void writer::consume_new_partition(const dht::decorated_key& dk) {
|
||||
_c_stats.start_offset = _data_writer->offset();
|
||||
_prev_row_start = _data_writer->offset();
|
||||
@@ -1630,27 +1668,10 @@ void writer::consume_end_of_stream() {
|
||||
_collector.add_compression_ratio(_sst._components->compression.compressed_file_length(), _sst._components->compression.uncompressed_file_length());
|
||||
}
|
||||
|
||||
if (_index_writer) {
|
||||
close_writer(_index_writer);
|
||||
}
|
||||
close_index_writer();
|
||||
|
||||
if (_partitions_writer) {
|
||||
_sst._partitions_db_footer = std::move(*_bti_partition_index_writer).finish(
|
||||
_sst.get_version(),
|
||||
_first_key.value(),
|
||||
_last_key.value());
|
||||
close_writer(_partitions_writer);
|
||||
}
|
||||
if (_rows_writer) {
|
||||
// Append some garbage padding to the file just to ensure that it's never empty.
|
||||
// (Otherwise it would be empty if the sstable contains only small partitions).
|
||||
// This is a hack to work around some bad interactions between zero-sized files
|
||||
// and object storage. (It seems that e.g. minio considers a zero-sized file
|
||||
// upload to be a no-op, which breaks some assumptions).
|
||||
uint32_t garbage = seastar::cpu_to_be(0x13371337);
|
||||
_rows_writer->write(reinterpret_cast<const char*>(&garbage), sizeof(garbage));
|
||||
close_writer(_rows_writer);
|
||||
}
|
||||
close_partitions_writer();
|
||||
close_rows_writer();
|
||||
|
||||
if (_hashes_writer) {
|
||||
close_writer(_hashes_writer);
|
||||
|
||||
@@ -44,6 +44,7 @@ sstable_version_constants::component_map_t sstable_version_constants::create_com
|
||||
{ component_type::Filter, "Filter.db" },
|
||||
{ component_type::Statistics, "Statistics.db" },
|
||||
{ component_type::Scylla, "Scylla.db" },
|
||||
{ component_type::TemporaryScylla, "Scylla.db.tmp" },
|
||||
{ component_type::TemporaryTOC, TEMPORARY_TOC_SUFFIX },
|
||||
{ component_type::TemporaryStatistics, "Statistics.db.tmp" }
|
||||
};
|
||||
|
||||
@@ -956,16 +956,22 @@ future<file_writer> sstable::make_component_file_writer(component_type c, file_o
|
||||
});
|
||||
}
|
||||
|
||||
future<std::unique_ptr<crc32_digest_file_writer>> sstable::make_digests_component_file_writer(component_type c, file_output_stream_options options, open_flags oflags) noexcept {
|
||||
return _storage->make_component_sink(*this, c, oflags, std::move(options)).then([this, comp = component_name(*this, c)] (data_sink sink) mutable {
|
||||
return std::make_unique<crc32_digest_file_writer>(std::move(sink), sstable_buffer_size, comp);
|
||||
});
|
||||
}
|
||||
|
||||
void sstable::open_sstable(const sstring& origin) {
|
||||
_origin = origin;
|
||||
generate_toc();
|
||||
_storage->open(*this);
|
||||
}
|
||||
|
||||
void sstable::write_toc(file_writer w) {
|
||||
void sstable::write_toc(std::unique_ptr<crc32_digest_file_writer> w) {
|
||||
sstlog.debug("Writing TOC file {} ", toc_filename());
|
||||
|
||||
do_write_simple(std::move(w), [&] (version_types v, file_writer& w) {
|
||||
do_write_simple(*w, [&] (version_types v, file_writer& w) {
|
||||
for (auto&& key : _recognized_components) {
|
||||
// new line character is appended to the end of each component name.
|
||||
auto value = sstable_version_constants::get_component_map(v).at(key) + "\n";
|
||||
@@ -973,6 +979,8 @@ void sstable::write_toc(file_writer w) {
|
||||
write(v, w, b);
|
||||
}
|
||||
});
|
||||
|
||||
_components_digests.toc_digest = w->full_checksum();
|
||||
}
|
||||
|
||||
void sstable::write_crc(const checksum& c) {
|
||||
@@ -989,6 +997,7 @@ void sstable::write_digest(uint32_t full_checksum) {
|
||||
auto digest = to_sstring<bytes>(full_checksum);
|
||||
write(v, w, digest);
|
||||
}, buffer_size);
|
||||
_components_digests.data_digest = full_checksum;
|
||||
}
|
||||
|
||||
thread_local std::array<std::vector<int>, downsampling::BASE_SAMPLING_LEVEL> downsampling::_sample_pattern_cache;
|
||||
@@ -1045,7 +1054,7 @@ future<> sstable::read_simple(T& component) {
|
||||
});
|
||||
}
|
||||
|
||||
void sstable::do_write_simple(file_writer&& writer,
|
||||
void sstable::do_write_simple(file_writer& writer,
|
||||
noncopyable_function<void (version_types, file_writer&)> write_component) {
|
||||
write_component(_version, writer);
|
||||
_metadata_size_on_disk += writer.offset();
|
||||
@@ -1060,7 +1069,7 @@ void sstable::do_write_simple(component_type type,
|
||||
file_output_stream_options options;
|
||||
options.buffer_size = buffer_size;
|
||||
auto w = make_component_file_writer(type, std::move(options)).get();
|
||||
do_write_simple(std::move(w), std::move(write_component));
|
||||
do_write_simple(w, std::move(write_component));
|
||||
}
|
||||
|
||||
template <component_type Type, typename T>
|
||||
@@ -1070,10 +1079,30 @@ void sstable::write_simple(const T& component) {
|
||||
}, sstable_buffer_size);
|
||||
}
|
||||
|
||||
uint32_t sstable::do_write_simple_with_digest(component_type type,
|
||||
noncopyable_function<void (version_types version, file_writer& writer)> write_component, unsigned buffer_size) {
|
||||
auto file_path = filename(type);
|
||||
sstlog.debug("Writing {} file {}", sstable_version_constants::get_component_map(_version).at(type), file_path);
|
||||
|
||||
file_output_stream_options options;
|
||||
options.buffer_size = buffer_size;
|
||||
auto w = make_digests_component_file_writer(type, std::move(options)).get();
|
||||
do_write_simple(*w, std::move(write_component));
|
||||
return w->full_checksum();
|
||||
}
|
||||
|
||||
template <component_type Type, typename T>
|
||||
uint32_t sstable::write_simple_with_digest(const T& component) {
|
||||
return do_write_simple_with_digest(Type, [&component] (version_types v, file_writer& w) {
|
||||
write(v, w, component);
|
||||
}, sstable_buffer_size);
|
||||
}
|
||||
|
||||
template future<> sstable::read_simple<component_type::Filter>(sstables::filter& f);
|
||||
template void sstable::write_simple<component_type::Filter>(const sstables::filter& f);
|
||||
|
||||
template void sstable::write_simple<component_type::Summary>(const sstables::summary_ka&);
|
||||
template uint32_t sstable::write_simple_with_digest<component_type::Summary>(const sstables::summary_ka&);
|
||||
|
||||
future<> sstable::read_compression() {
|
||||
// FIXME: If there is no compression, we should expect a CRC file to be present.
|
||||
@@ -1092,7 +1121,8 @@ void sstable::write_compression() {
|
||||
return;
|
||||
}
|
||||
|
||||
write_simple<component_type::CompressionInfo>(_components->compression);
|
||||
uint32_t digest = write_simple_with_digest<component_type::CompressionInfo>(_components->compression);
|
||||
_components_digests.compression_digest = digest;
|
||||
}
|
||||
|
||||
void sstable::validate_partitioner() {
|
||||
@@ -1317,7 +1347,8 @@ future<> sstable::read_partitions_db_footer() {
|
||||
}
|
||||
|
||||
void sstable::write_statistics() {
|
||||
write_simple<component_type::Statistics>(_components->statistics);
|
||||
auto digest = write_simple_with_digest<component_type::Statistics>(_components->statistics);
|
||||
_components_digests.statistics_digest = digest;
|
||||
}
|
||||
|
||||
void sstable::mark_as_being_repaired(const service::session_id& id) {
|
||||
@@ -1340,13 +1371,25 @@ int64_t sstable::update_repaired_at(int64_t repaired_at) {
|
||||
void sstable::rewrite_statistics() {
|
||||
sstlog.debug("Rewriting statistics component of sstable {}", get_filename());
|
||||
|
||||
auto lock = get_units(_mutate_sem, 1).get();
|
||||
file_output_stream_options options;
|
||||
options.buffer_size = sstable_buffer_size;
|
||||
auto w = make_component_file_writer(component_type::TemporaryStatistics, std::move(options),
|
||||
auto w = make_digests_component_file_writer(component_type::TemporaryStatistics, std::move(options),
|
||||
open_flags::wo | open_flags::create | open_flags::truncate).get();
|
||||
write(_version, w, _components->statistics);
|
||||
w.close();
|
||||
write(_version, *w, _components->statistics);
|
||||
w->close();
|
||||
|
||||
// When rewriting statistics, we also need to update the scylla component
|
||||
// because it contains the digest of the statistics component.
|
||||
if (has_scylla_component()) {
|
||||
_components_digests.statistics_digest = w->full_checksum();
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::ComponentsDigests>(components_digests{_components_digests});
|
||||
sstlog.debug("Rewriting scylla component of sstable {}", get_filename());
|
||||
write_simple<component_type::TemporaryScylla>(*_components->scylla_metadata);
|
||||
|
||||
// rename() guarantees atomicity when renaming a file into place.
|
||||
sstable_write_io_check(rename_file, fmt::to_string(filename(component_type::TemporaryScylla)), fmt::to_string(filename(component_type::Scylla))).get();
|
||||
}
|
||||
|
||||
// rename() guarantees atomicity when renaming a file into place.
|
||||
sstable_write_io_check(rename_file, fmt::to_string(filename(component_type::TemporaryStatistics)), fmt::to_string(filename(component_type::Statistics))).get();
|
||||
}
|
||||
@@ -1540,7 +1583,8 @@ void sstable::write_filter() {
|
||||
|
||||
auto&& bs = f->bits();
|
||||
auto filter_ref = sstables::filter_ref(f->num_hashes(), bs.get_storage());
|
||||
write_simple<component_type::Filter>(filter_ref);
|
||||
uint32_t digest = write_simple_with_digest<component_type::Filter>(filter_ref);
|
||||
_components_digests.filter_digest = digest;
|
||||
}
|
||||
|
||||
void sstable::maybe_rebuild_filter_from_index(uint64_t num_partitions) {
|
||||
@@ -1999,6 +2043,8 @@ sstable::read_scylla_metadata() noexcept {
|
||||
}
|
||||
return read_simple<component_type::Scylla>(*_components->scylla_metadata).then([this] {
|
||||
_features = _components->scylla_metadata->get_features();
|
||||
_components_digests = _components->scylla_metadata->get_components_digests();
|
||||
_components->digest = _components_digests.data_digest;
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -2088,6 +2134,7 @@ sstable::write_scylla_metadata(shard_id shard, struct run_identifier identifier,
|
||||
sstable_schema.columns.elements.push_back(sstable_column_description{to_sstable_column_kind(col.kind), {col.name()}, {to_bytes(col.type->name())}});
|
||||
}
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::Schema>(std::move(sstable_schema));
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::ComponentsDigests>(components_digests(_components_digests));
|
||||
|
||||
write_simple<component_type::Scylla>(*_components->scylla_metadata);
|
||||
}
|
||||
@@ -2489,19 +2536,15 @@ std::vector<std::pair<component_type, sstring>> sstable::all_components() const
|
||||
}
|
||||
|
||||
future<> sstable::snapshot(const sstring& dir) const {
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
co_await _storage->snapshot(*this, dir, storage::absolute_path::yes);
|
||||
return _storage->snapshot(*this, dir, storage::absolute_path::yes);
|
||||
}
|
||||
|
||||
future<> sstable::change_state(sstable_state to, delayed_commit_changes* delay_commit) {
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
co_await _storage->change_state(*this, to, _generation, delay_commit);
|
||||
_state = to;
|
||||
}
|
||||
|
||||
future<> sstable::pick_up_from_upload(sstable_state to, generation_type new_generation) {
|
||||
// just in case, not really needed as the sstable is not yet in use while in the upload dir
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
co_await _storage->change_state(*this, to, new_generation, nullptr);
|
||||
_generation = std::move(new_generation);
|
||||
_state = to;
|
||||
@@ -3075,6 +3118,31 @@ void sstable::set_sstable_level(uint32_t new_level) {
|
||||
s.sstable_level = new_level;
|
||||
}
|
||||
|
||||
std::optional<uint32_t> sstable::get_component_digest(component_type c) const {
|
||||
switch (c) {
|
||||
case component_type::Index:
|
||||
return _components_digests.index_digest;
|
||||
case component_type::Summary:
|
||||
return _components_digests.summary_digest;
|
||||
case component_type::TOC:
|
||||
return _components_digests.toc_digest;
|
||||
case component_type::CompressionInfo:
|
||||
return _components_digests.compression_digest;
|
||||
case component_type::Filter:
|
||||
return _components_digests.filter_digest;
|
||||
case component_type::Partitions:
|
||||
return _components_digests.partitions_digest;
|
||||
case component_type::Rows:
|
||||
return _components_digests.rows_digest;
|
||||
case component_type::Data:
|
||||
return _components_digests.data_digest;
|
||||
case component_type::Statistics:
|
||||
return _components_digests.statistics_digest;
|
||||
default:
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
future<> sstable::mutate_sstable_level(uint32_t new_level) {
|
||||
if (!has_component(component_type::Statistics)) {
|
||||
return make_ready_future<>();
|
||||
@@ -3411,9 +3479,6 @@ utils::hashed_key sstable::make_hashed_key(const schema& s, const partition_key&
|
||||
|
||||
future<>
|
||||
sstable::unlink(storage::sync_dir sync) noexcept {
|
||||
// Serialize with other calls to unlink or potentially ongoing mutations.
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
|
||||
_unlinked = true;
|
||||
_on_delete(*this);
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "sstables/writer.hh"
|
||||
#include "version.hh"
|
||||
#include "shared_sstable.hh"
|
||||
#include "open_info.hh"
|
||||
@@ -628,9 +629,7 @@ private:
|
||||
size_t _total_memory_reclaimed{0};
|
||||
bool _unlinked{false};
|
||||
|
||||
// The mutate semaphore is used to serialize operations like rewrite_statistics
|
||||
// with linking or moving the sstable between directories.
|
||||
mutable named_semaphore _mutate_sem{1, named_semaphore_exception_factory{"sstable mutate"}};
|
||||
components_digests _components_digests;
|
||||
public:
|
||||
bool has_component(component_type f) const;
|
||||
sstables_manager& manager() { return _manager; }
|
||||
@@ -651,12 +650,18 @@ private:
|
||||
|
||||
template <component_type Type, typename T>
|
||||
void write_simple(const T& comp);
|
||||
void do_write_simple(file_writer&& writer,
|
||||
void do_write_simple(file_writer& writer,
|
||||
noncopyable_function<void (version_types, file_writer&)> write_component);
|
||||
void do_write_simple(component_type type,
|
||||
noncopyable_function<void (version_types version, file_writer& writer)> write_component,
|
||||
unsigned buffer_size);
|
||||
|
||||
template <component_type Type, typename T>
|
||||
uint32_t write_simple_with_digest(const T& comp);
|
||||
uint32_t do_write_simple_with_digest(component_type type,
|
||||
noncopyable_function<void (version_types version, file_writer& writer)> write_component,
|
||||
unsigned buffer_size);
|
||||
|
||||
void write_crc(const checksum& c);
|
||||
void write_digest(uint32_t full_checksum);
|
||||
|
||||
@@ -667,6 +672,9 @@ private:
|
||||
future<file_writer> make_component_file_writer(component_type c, file_output_stream_options options,
|
||||
open_flags oflags = open_flags::wo | open_flags::create | open_flags::exclusive) noexcept;
|
||||
|
||||
future<std::unique_ptr<crc32_digest_file_writer>> make_digests_component_file_writer(component_type c, file_output_stream_options options,
|
||||
open_flags oflags = open_flags::wo | open_flags::create | open_flags::exclusive) noexcept;
|
||||
|
||||
void generate_toc();
|
||||
void open_sstable(const sstring& origin);
|
||||
|
||||
@@ -697,7 +705,8 @@ private:
|
||||
future<> read_summary() noexcept;
|
||||
|
||||
void write_summary() {
|
||||
write_simple<component_type::Summary>(_components->summary);
|
||||
uint32_t digest = write_simple_with_digest<component_type::Summary>(_components->summary);
|
||||
_components_digests.summary_digest = digest;
|
||||
}
|
||||
|
||||
// To be called when we try to load an SSTable that lacks a Summary. Could
|
||||
@@ -827,7 +836,7 @@ private:
|
||||
|
||||
future<> open_or_create_data(open_flags oflags, file_open_options options = {}) noexcept;
|
||||
// runs in async context (called from storage::open)
|
||||
void write_toc(file_writer w);
|
||||
void write_toc(std::unique_ptr<crc32_digest_file_writer> w);
|
||||
static future<uint32_t> read_digest_from_file(file f);
|
||||
static future<lw_shared_ptr<checksum>> read_checksum_from_file(file f);
|
||||
public:
|
||||
@@ -1017,6 +1026,12 @@ public:
|
||||
return _components->digest;
|
||||
}
|
||||
|
||||
components_digests& get_components_digests() {
|
||||
return _components_digests;
|
||||
}
|
||||
|
||||
std::optional<uint32_t> get_component_digest(component_type c) const;
|
||||
|
||||
// Gets ratio of droppable tombstone. A tombstone is considered droppable here
|
||||
// for cells and tombstones expired before the time point "GC before", which
|
||||
// is the point before which expiring data can be purged.
|
||||
|
||||
@@ -204,13 +204,13 @@ void filesystem_storage::open(sstable& sst) {
|
||||
open_flags::create |
|
||||
open_flags::exclusive,
|
||||
options).get();
|
||||
auto w = file_writer(output_stream<char>(std::move(sink)), component_name(sst, component_type::TemporaryTOC));
|
||||
auto w = std::make_unique<crc32_digest_file_writer>(std::move(sink), sst.sstable_buffer_size, component_name(sst, component_type::TemporaryTOC));
|
||||
|
||||
bool toc_exists = file_exists(fmt::to_string(sst.filename(component_type::TOC))).get();
|
||||
if (toc_exists) {
|
||||
// TOC will exist at this point if write_components() was called with
|
||||
// the generation of a sstable that exists.
|
||||
w.close();
|
||||
w->close();
|
||||
remove_file(fmt::to_string(sst.filename(component_type::TemporaryTOC))).get();
|
||||
throw std::runtime_error(format("SSTable write failed due to existence of TOC file for generation {} of {}.{}", sst._generation, sst._schema->ks_name(), sst._schema->cf_name()));
|
||||
}
|
||||
@@ -670,15 +670,10 @@ void object_storage_base::open(sstable& sst) {
|
||||
sst.manager().sstables_registry().create_entry(owner(), status_creating, sst._state, std::move(desc)).get();
|
||||
|
||||
memory_data_sink_buffers bufs;
|
||||
sst.write_toc(
|
||||
file_writer(
|
||||
output_stream<char>(
|
||||
data_sink(
|
||||
std::make_unique<memory_data_sink>(bufs)
|
||||
)
|
||||
)
|
||||
)
|
||||
);
|
||||
auto out = data_sink(std::make_unique<memory_data_sink>(bufs));
|
||||
auto w = std::make_unique<crc32_digest_file_writer>(std::move(out), sst.sstable_buffer_size, component_name(sst, component_type::TOC));
|
||||
|
||||
sst.write_toc(std::move(w));
|
||||
put_object(make_object_name(sst, component_type::TOC), std::move(bufs)).get();
|
||||
}
|
||||
|
||||
|
||||
@@ -547,6 +547,7 @@ enum class scylla_metadata_type : uint32_t {
|
||||
ExtTimestampStats = 9,
|
||||
SSTableIdentifier = 10,
|
||||
Schema = 11,
|
||||
ComponentsDigests = 12,
|
||||
};
|
||||
|
||||
// UUID is used for uniqueness across nodes, such that an imported sstable
|
||||
@@ -573,6 +574,24 @@ struct sstable_identifier_type {
|
||||
auto describe_type(sstable_version_types v, Describer f) { return f(value); }
|
||||
};
|
||||
|
||||
// Component digests stored in scylla metadata to track integrity of individual components
|
||||
struct components_digests {
|
||||
std::optional<uint32_t> data_digest;
|
||||
std::optional<uint32_t> compression_digest;
|
||||
std::optional<uint32_t> filter_digest;
|
||||
std::optional<uint32_t> statistics_digest;
|
||||
std::optional<uint32_t> summary_digest;
|
||||
std::optional<uint32_t> index_digest;
|
||||
std::optional<uint32_t> toc_digest;
|
||||
std::optional<uint32_t> partitions_digest;
|
||||
std::optional<uint32_t> rows_digest;
|
||||
|
||||
template <typename Describer>
|
||||
auto describe_type(sstable_version_types v, Describer f) {
|
||||
return f(data_digest,compression_digest, filter_digest, statistics_digest, summary_digest, index_digest, toc_digest, partitions_digest, rows_digest);
|
||||
}
|
||||
};
|
||||
|
||||
// Types of large data statistics.
|
||||
//
|
||||
// Note: For extensibility, never reuse an identifier,
|
||||
@@ -656,7 +675,8 @@ struct scylla_metadata {
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ScyllaVersion, scylla_version>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ExtTimestampStats, ext_timestamp_stats>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::SSTableIdentifier, sstable_identifier>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Schema, sstable_schema>
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Schema, sstable_schema>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ComponentsDigests, components_digests>
|
||||
> data;
|
||||
|
||||
sstable_enabled_features get_features() const {
|
||||
@@ -691,6 +711,13 @@ struct scylla_metadata {
|
||||
auto* sid = data.get<scylla_metadata_type::SSTableIdentifier, scylla_metadata::sstable_identifier>();
|
||||
return sid ? sid->value : sstable_id::create_null_id();
|
||||
}
|
||||
const components_digests get_components_digests() const {
|
||||
auto cd = data.get<scylla_metadata_type::ComponentsDigests, components_digests>();
|
||||
if (!cd) {
|
||||
return {};
|
||||
}
|
||||
return *cd;
|
||||
}
|
||||
|
||||
template <typename Describer>
|
||||
auto describe_type(sstable_version_types v, Describer f) { return f(data); }
|
||||
|
||||
@@ -65,7 +65,7 @@ serialized_size(sstable_version_types v, const T& object) {
|
||||
return size;
|
||||
}
|
||||
|
||||
template <typename ChecksumType>
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
class checksummed_file_data_sink_impl : public data_sink_impl {
|
||||
data_sink _out;
|
||||
@@ -92,7 +92,9 @@ public:
|
||||
|
||||
per_chunk_checksum = ChecksumType::checksum(per_chunk_checksum, buf.begin() + offset, size);
|
||||
_full_checksum = checksum_combine_or_feed<ChecksumType>(_full_checksum, per_chunk_checksum, buf.begin() + offset, size);
|
||||
_c.checksums.push_back(per_chunk_checksum);
|
||||
if constexpr (calculate_chunk_checksums) {
|
||||
_c.checksums.push_back(per_chunk_checksum);
|
||||
}
|
||||
}
|
||||
}
|
||||
return _out.put(std::move(bufs));
|
||||
@@ -112,29 +114,29 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
template <typename ChecksumType>
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
class checksummed_file_data_sink : public data_sink {
|
||||
public:
|
||||
checksummed_file_data_sink(data_sink out, struct checksum& cinfo, uint32_t& full_file_checksum)
|
||||
: data_sink(std::make_unique<checksummed_file_data_sink_impl<ChecksumType>>(std::move(out), cinfo, full_file_checksum)) {}
|
||||
: data_sink(std::make_unique<checksummed_file_data_sink_impl<ChecksumType, calculate_chunk_checksums>>(std::move(out), cinfo, full_file_checksum)) {}
|
||||
};
|
||||
|
||||
template <typename ChecksumType>
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
inline
|
||||
output_stream<char> make_checksummed_file_output_stream(data_sink out, struct checksum& cinfo, uint32_t& full_file_checksum) {
|
||||
return output_stream<char>(checksummed_file_data_sink<ChecksumType>(std::move(out), cinfo, full_file_checksum));
|
||||
return output_stream<char>(checksummed_file_data_sink<ChecksumType, calculate_chunk_checksums>(std::move(out), cinfo, full_file_checksum));
|
||||
}
|
||||
|
||||
template <typename ChecksumType>
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
class checksummed_file_writer : public file_writer {
|
||||
checksum _c;
|
||||
uint32_t _full_checksum;
|
||||
public:
|
||||
checksummed_file_writer(data_sink out, size_t buffer_size, component_name c)
|
||||
: file_writer(make_checksummed_file_output_stream<ChecksumType>(std::move(out), _c, _full_checksum), std::move(c))
|
||||
: file_writer(make_checksummed_file_output_stream<ChecksumType, calculate_chunk_checksums>(std::move(out), _c, _full_checksum), std::move(c))
|
||||
, _c(uint32_t(std::min(size_t(DEFAULT_CHUNK_SIZE), buffer_size)), {})
|
||||
, _full_checksum(ChecksumType::init_checksum()) {}
|
||||
|
||||
@@ -152,8 +154,10 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
using adler32_checksummed_file_writer = checksummed_file_writer<adler32_utils>;
|
||||
using crc32_checksummed_file_writer = checksummed_file_writer<crc32_utils>;
|
||||
using adler32_checksummed_file_writer = checksummed_file_writer<adler32_utils, true>;
|
||||
using crc32_checksummed_file_writer = checksummed_file_writer<crc32_utils, true>;
|
||||
|
||||
using crc32_digest_file_writer = checksummed_file_writer<crc32_utils, false>;
|
||||
|
||||
template <typename T, typename W>
|
||||
requires Writer<W>
|
||||
|
||||
@@ -112,7 +112,6 @@ public:
|
||||
// Each virtual task needs to have its group.
|
||||
topology_change_group,
|
||||
tablets_group,
|
||||
global_topology_change_group,
|
||||
};
|
||||
|
||||
class task : public enable_lw_shared_from_this<task> {
|
||||
|
||||
5
test.py
5
test.py
@@ -228,7 +228,7 @@ def parse_cmd_line() -> argparse.Namespace:
|
||||
scylla_additional_options = parser.add_argument_group('Additional options for Scylla tests')
|
||||
scylla_additional_options.add_argument('--x-log2-compaction-groups', action="store", default="0", type=int,
|
||||
help="Controls number of compaction groups to be used by Scylla tests. Value of 3 implies 8 groups.")
|
||||
scylla_additional_options.add_argument('--extra-scylla-cmdline-options', action="store", default="", type=str,
|
||||
scylla_additional_options.add_argument('--extra-scylla-cmdline-options', action="store", default=[], type=str,
|
||||
help="Passing extra scylla cmdline options for all tests. Options should be space separated:"
|
||||
"'--logger-log-level raft=trace --default-log-level error'")
|
||||
|
||||
@@ -279,6 +279,9 @@ def parse_cmd_line() -> argparse.Namespace:
|
||||
args.tmpdir = os.path.abspath(args.tmpdir)
|
||||
prepare_dirs(tempdir_base=pathlib.Path(args.tmpdir), modes=args.modes, gather_metrics=args.gather_metrics, save_log_on_success=args.save_log_on_success)
|
||||
|
||||
if args.extra_scylla_cmdline_options:
|
||||
args.extra_scylla_cmdline_options = args.extra_scylla_cmdline_options.split()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
|
||||
@@ -152,7 +152,7 @@ def test_batch_write_nonduplicate_multiple_tables(test_table_s, test_table_s_2):
|
||||
p = random_string()
|
||||
# The batch_writer() function used in previous tests can't write to more
|
||||
# than one table. So we use the lower level interface boto3 gives us.
|
||||
test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
reply = test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
test_table_s.name: [{'PutRequest': {'Item': {'p': p, 'a': 'hi'}}}],
|
||||
test_table_s_2.name: [{'PutRequest': {'Item': {'p': p, 'b': 'hello'}}}]
|
||||
})
|
||||
@@ -222,7 +222,7 @@ def test_batch_write_multiple_tables(test_table_s, test_table):
|
||||
# We use the low-level batch_write_item API for lack of a more convenient
|
||||
# API (the batch_writer() API can only write to one table). At least it
|
||||
# spares us the need to encode the key's types...
|
||||
test_table.meta.client.batch_write_item(RequestItems = {
|
||||
reply = test_table.meta.client.batch_write_item(RequestItems = {
|
||||
test_table.name: [{'PutRequest': {'Item': {'p': p1, 'c': c1, 'a': 'hi'}}}],
|
||||
test_table_s.name: [{'PutRequest': {'Item': {'p': p2, 'b': 'hello'}}}]
|
||||
})
|
||||
@@ -537,8 +537,9 @@ def test_batch_get_item_full_failure(scylla_only, dynamodb, rest_api, test_table
|
||||
for i in range(count):
|
||||
batch.put_item(Item={
|
||||
'p': p, 'c': i, 'content': content})
|
||||
responses = []
|
||||
to_read = { test_table_sn.name: {'Keys': [{'p': p, 'c': c} for c in range(count)], 'ConsistentRead': True } }
|
||||
# The error injection is permanent, so it will fire for each batch read.
|
||||
with scylla_inject_error(rest_api, "alternator_batch_get_item", one_shot=False):
|
||||
with pytest.raises(ClientError, match="InternalServerError"):
|
||||
test_table_sn.meta.client.batch_get_item(RequestItems = to_read)
|
||||
reply = test_table_sn.meta.client.batch_get_item(RequestItems = to_read)
|
||||
|
||||
@@ -376,7 +376,7 @@ def test_rbac_updateitem_read(dynamodb, cql, test_table_s):
|
||||
assert ret['Attributes'] == {'p': p, 'v': v1}
|
||||
# Just MODIFY permission, not SELECT permission, also allows
|
||||
# us to do a read-modify-write expression:
|
||||
authorized(lambda: tab.update_item(Key={'p': p},
|
||||
ret = authorized(lambda: tab.update_item(Key={'p': p},
|
||||
UpdateExpression='SET v = v + :val',
|
||||
ExpressionAttributeValues={':val': 1}))
|
||||
assert {'p': p, 'v': v2 + 1} == test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
@@ -903,6 +903,7 @@ def test_rbac_tagresource(dynamodb, cql):
|
||||
arn = table.meta.client.describe_table(TableName=table.name)['Table']['TableArn']
|
||||
with new_role(cql) as (role, key):
|
||||
with new_dynamodb(dynamodb, role, key) as d:
|
||||
tab = d.Table(table.name)
|
||||
# Without ALTER permission, TagResource and UntagResource
|
||||
# are refused
|
||||
tags = [{'Key': 'hello', 'Value': 'dog'},
|
||||
|
||||
@@ -80,18 +80,18 @@ def test_table_sn_with_data(test_table_sn):
|
||||
def test_filter_expression_partition_key_1(test_table_sn_with_data):
|
||||
table, p, items = test_table_sn_with_data
|
||||
with pytest.raises(ClientError, match='ValidationException.*Condition'):
|
||||
full_query(table, FilterExpression='p=:p', ExpressionAttributeValues={':p': p})
|
||||
got_items = full_query(table, FilterExpression='p=:p', ExpressionAttributeValues={':p': p})
|
||||
|
||||
def test_filter_expression_partition_key_2(test_table_sn_with_data):
|
||||
table, p, items = test_table_sn_with_data
|
||||
with pytest.raises(ClientError, match='ValidationException.* p'):
|
||||
full_query(table, KeyConditionExpression='p=:p', FilterExpression='p=:p', ExpressionAttributeValues={':p': p})
|
||||
got_items = full_query(table, KeyConditionExpression='p=:p', FilterExpression='p=:p', ExpressionAttributeValues={':p': p})
|
||||
|
||||
# FilterExpression is also not allowed on the sort key.
|
||||
def test_filter_expression_sort_key(test_table_sn_with_data):
|
||||
table, p, items = test_table_sn_with_data
|
||||
with pytest.raises(ClientError, match='ValidationException.* key '):
|
||||
full_query(table, KeyConditionExpression='p=:p', FilterExpression='c=:c',
|
||||
got_items = full_query(table, KeyConditionExpression='p=:p', FilterExpression='c=:c',
|
||||
ExpressionAttributeValues={':p': p, ':c': 3})
|
||||
|
||||
# Test the "=" operator on different types of attributes (numeric, string,
|
||||
@@ -387,6 +387,7 @@ def test_filter_expression_map_contains(test_table_sn_with_data):
|
||||
assert(got_items == expected_items)
|
||||
# One value from a map:
|
||||
i = next(iter(items[2]['m']))
|
||||
v = items[2]['m'][i]
|
||||
got_items = full_query(table, KeyConditionExpression='p=:p', FilterExpression='contains(m, :i)',
|
||||
ExpressionAttributeValues={':p': p, ':i': i})
|
||||
#The following could have made sense, but it's what DynamoDB does:
|
||||
|
||||
@@ -125,6 +125,7 @@ def test_basic_string_more_update(test_table):
|
||||
val1 = random_string()
|
||||
val2 = random_string()
|
||||
val3 = random_string()
|
||||
val4 = random_string()
|
||||
test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a3': {'Value': val1, 'Action': 'PUT'}})
|
||||
test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a1': {'Value': val1, 'Action': 'PUT'}})
|
||||
test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a2': {'Value': val2, 'Action': 'PUT'}})
|
||||
|
||||
@@ -304,7 +304,7 @@ def test_wcu_batch_write_item(test_table_s, metrics):
|
||||
with check_increases_operation(metrics, ['PutItem'], 'scylla_alternator_wcu_total', 3):
|
||||
p1 = random_string()
|
||||
p2 = random_string()
|
||||
test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
response = test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
test_table_s.name: [{'PutRequest': {'Item': {'p': p1, 'a': 'hi'}}}, {'PutRequest': {'Item': {'p': p2, 'a': 'a' * KB}}}]
|
||||
})
|
||||
|
||||
|
||||
@@ -369,6 +369,7 @@ def test_query_exclusivestartkey(test_table_sn):
|
||||
# The ExclusiveStartKey option must indicate both partition key and
|
||||
# sort key. Note that the Python driver further converts this map
|
||||
# into the correct format for the request (including the key types).
|
||||
exclusivestartkey = { 'p': p, 'c': start }
|
||||
got_items = test_table_sn.query(
|
||||
KeyConditions={'p': { 'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}},
|
||||
ExclusiveStartKey= { 'p': p, 'c': start },
|
||||
|
||||
@@ -35,12 +35,14 @@ def test_invalid_consumed_capacity_type(test_table_sb):
|
||||
c = random_bytes()
|
||||
test_table_sb.put_item(Item={'p': p, 'c': c, 'att': val})
|
||||
with pytest.raises(ClientError):
|
||||
test_table_sb.get_item(Key={'p': p, 'c': c}, ConsistentRead=True, ReturnConsumedCapacity='DUMMY')
|
||||
response = test_table_sb.get_item(Key={'p': p, 'c': c}, ConsistentRead=True, ReturnConsumedCapacity='DUMMY')
|
||||
|
||||
# A missing Item, count as zero length item which require 1 or 0.5 RCU depends on the consistency
|
||||
def test_missing_get_item(test_table):
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
val = random_string()
|
||||
val2 = random_string()
|
||||
response = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True, ReturnConsumedCapacity='TOTAL')
|
||||
assert 'ConsumedCapacity' in response
|
||||
consumed_capacity = response['ConsumedCapacity']
|
||||
@@ -223,6 +225,7 @@ def test_simple_delete_item(test_table_sb):
|
||||
# we will get 1 WCU
|
||||
def test_delete_missing_item(test_table_sb):
|
||||
p = random_string()
|
||||
val = random_string()
|
||||
c = random_bytes()
|
||||
response = test_table_sb.delete_item(Key={'p': p, 'c': c}, ReturnConsumedCapacity='TOTAL')
|
||||
assert 'ConsumedCapacity' in response
|
||||
|
||||
@@ -99,7 +99,7 @@ def test_put_item_returnvalues_on_condition_check_failure(test_table_s):
|
||||
p = random_string()
|
||||
# Failed conditional on non existing item doesn't return values.
|
||||
with pytest.raises(test_table_s.meta.client.exceptions.ConditionalCheckFailedException) as err:
|
||||
test_table_s.put_item(Item={'p': p, 's': 'cat'},
|
||||
ret=test_table_s.put_item(Item={'p': p, 's': 'cat'},
|
||||
ReturnValuesOnConditionCheckFailure='ALL_OLD',
|
||||
ConditionExpression='s = :v1',
|
||||
ExpressionAttributeValues={':v1' : 'dog'})
|
||||
@@ -175,7 +175,7 @@ def test_delete_item_returnvalues_on_condition_check_failure(test_table_s):
|
||||
p = random_string()
|
||||
# Delete of non existing item doesn't return values.
|
||||
with pytest.raises(test_table_s.meta.client.exceptions.ConditionalCheckFailedException) as err:
|
||||
test_table_s.delete_item(Key={'p': p},
|
||||
ret=test_table_s.delete_item(Key={'p': p},
|
||||
ReturnValuesOnConditionCheckFailure='ALL_OLD',
|
||||
ConditionExpression='s = :v1',
|
||||
ExpressionAttributeValues={':v1' : 'dog'})
|
||||
@@ -566,7 +566,7 @@ def test_update_item_returnvalues_on_condition_check_failure(test_table_s):
|
||||
p = random_string()
|
||||
# Modification of non existing item doesn't return values.
|
||||
with pytest.raises(test_table_s.meta.client.exceptions.ConditionalCheckFailedException) as err:
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
ret=test_table_s.update_item(Key={'p': p},
|
||||
ReturnValuesOnConditionCheckFailure='ALL_OLD',
|
||||
ConditionExpression='s = :v1',
|
||||
UpdateExpression='SET s = :v2',
|
||||
|
||||
@@ -220,6 +220,7 @@ def test_scan_with_key_equality_filtering(dynamodb, filled_test_table):
|
||||
# without returning items at all.
|
||||
def test_scan_select(filled_test_table):
|
||||
test_table, items = filled_test_table
|
||||
got_items = full_scan(test_table)
|
||||
# By default, a scan returns all the items, with all their attributes:
|
||||
# query returns all attributes:
|
||||
got_items = full_scan(test_table)
|
||||
|
||||
@@ -135,7 +135,7 @@ def test_list_streams_create(dynamodb, dynamodbstreams):
|
||||
def test_list_streams_alter(dynamodb, dynamodbstreams):
|
||||
for type in stream_types:
|
||||
with create_stream_test_table(dynamodb, StreamViewType=None) as table:
|
||||
table.update(StreamSpecification={'StreamEnabled': True, 'StreamViewType': type});
|
||||
res = table.update(StreamSpecification={'StreamEnabled': True, 'StreamViewType': type});
|
||||
wait_for_active_stream(dynamodbstreams, table)
|
||||
|
||||
def test_list_streams_paged(dynamodb, dynamodbstreams):
|
||||
@@ -273,7 +273,7 @@ def test_describe_stream_create_time(dynamodb, dynamodbstreams):
|
||||
|
||||
def test_describe_nonexistent_stream(dynamodb, dynamodbstreams):
|
||||
with pytest.raises(ClientError, match='ResourceNotFoundException' if is_local_java(dynamodbstreams) else 'ValidationException'):
|
||||
dynamodbstreams.describe_stream(StreamArn='sdfadfsdfnlfkajakfgjalksfgklasjklasdjfklasdfasdfgasf')
|
||||
streams = dynamodbstreams.describe_stream(StreamArn='sdfadfsdfnlfkajakfgjalksfgklasjklasdjfklasdfasdfgasf')
|
||||
|
||||
def test_describe_stream_with_nonexistent_last_shard(dynamodb, dynamodbstreams):
|
||||
with create_stream_test_table(dynamodb, StreamViewType='KEYS_ONLY') as table:
|
||||
@@ -313,7 +313,7 @@ def test_get_shard_iterator(dynamodb, dynamodbstreams):
|
||||
for type in ['AT_SEQUENCE_NUMBER', 'AFTER_SEQUENCE_NUMBER']:
|
||||
# must have seq in these modes
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
dynamodbstreams.get_shard_iterator(
|
||||
iter = dynamodbstreams.get_shard_iterator(
|
||||
StreamArn=arn, ShardId=shard_id, ShardIteratorType=type
|
||||
)
|
||||
|
||||
@@ -326,7 +326,7 @@ def test_get_shard_iterator(dynamodb, dynamodbstreams):
|
||||
|
||||
# bad arn
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
dynamodbstreams.get_shard_iterator(
|
||||
iter = dynamodbstreams.get_shard_iterator(
|
||||
StreamArn='sdfadsfsdfsdgdfsgsfdabadfbabdadsfsdfsdfsdfsdfsdfsdfdfdssdffbdfdf', ShardId=shard_id, ShardIteratorType=type, SequenceNumber=seq
|
||||
)
|
||||
# bad shard id
|
||||
@@ -735,6 +735,7 @@ def compare_events(expected_events, output, mode, expected_region):
|
||||
assert not 'NewImage' in record
|
||||
if expected_old_image == None:
|
||||
assert not 'OldImage' in record
|
||||
pass
|
||||
else:
|
||||
old_image = {x:deserializer.deserialize(y) for (x,y) in record['OldImage'].items()}
|
||||
assert expected_old_image == old_image
|
||||
@@ -1641,6 +1642,7 @@ def test_table_stream_with_result(dynamodb, dynamodbstreams):
|
||||
# doing an UpdateTable to a table - because before this wait finishes we are
|
||||
# not allowed to update the same table again or delete it.
|
||||
def wait_for_status_active(table):
|
||||
start_time = time.time()
|
||||
for i in range(60):
|
||||
desc = table.meta.client.describe_table(TableName=table.name)
|
||||
if desc['Table']['TableStatus'] == 'ACTIVE':
|
||||
@@ -1917,15 +1919,15 @@ def test_get_records_too_high_limit(test_table_ss_keys_only, dynamodbstreams):
|
||||
shard_id = shard['ShardId']
|
||||
iter = dynamodbstreams.get_shard_iterator(StreamArn=arn, ShardId=shard_id, ShardIteratorType='LATEST')['ShardIterator']
|
||||
# Limit=1000 should be allowed:
|
||||
dynamodbstreams.get_records(ShardIterator=iter, Limit=1000)
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter, Limit=1000)
|
||||
# Limit=1001 should NOT be allowed
|
||||
with pytest.raises(ClientError, match='ValidationException.*[Ll]imit'):
|
||||
dynamodbstreams.get_records(ShardIterator=iter, Limit=1001)
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter, Limit=1001)
|
||||
# Limit must be >= 0:
|
||||
with pytest.raises(ClientError, match='ValidationException.*[Ll]imit'):
|
||||
dynamodbstreams.get_records(ShardIterator=iter, Limit=0)
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter, Limit=0)
|
||||
with pytest.raises(ClientError, match='ValidationException.*[Ll]imit'):
|
||||
dynamodbstreams.get_records(ShardIterator=iter, Limit=-1)
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter, Limit=-1)
|
||||
|
||||
# padded_name() creates a unique name of given length by taking the
|
||||
# output of unique_table_name() and padding it with extra 'x' characters:
|
||||
|
||||
@@ -56,6 +56,7 @@ def test_page_break_over_range_tombstone_asan(scylla_only, dynamodb, rest_api, c
|
||||
while True:
|
||||
response = client.scan(TableName=qualified_name, Limit=10, **args)
|
||||
pos = response.get('LastEvaluatedKey', None)
|
||||
cnt = 0
|
||||
for i in response['Items']:
|
||||
if i['cf_id'] == 'eee7eb26-a372-4eb4-aeaa-72f224cf0000':
|
||||
items_found.append(i['schema_version'])
|
||||
@@ -100,9 +101,10 @@ def test_fetch_from_system_tables(scylla_only, dynamodb, rest_api):
|
||||
def test_block_access_to_non_system_tables_with_virtual_interface(scylla_only, test_table_s, dynamodb):
|
||||
client = dynamodb.meta.client
|
||||
with pytest.raises(ClientError, match='ResourceNotFoundException.*{}'.format(internal_prefix)):
|
||||
client.scan(TableName="{}alternator_{}.{}".format(internal_prefix, test_table_s.name, test_table_s.name))
|
||||
tables_response = client.scan(TableName="{}alternator_{}.{}".format(internal_prefix, test_table_s.name, test_table_s.name))
|
||||
|
||||
def test_block_creating_tables_with_reserved_prefix(scylla_only, dynamodb):
|
||||
client = dynamodb.meta.client
|
||||
for wrong_name_postfix in ['', 'a', 'xxx', 'system_auth.roles', 'table_name']:
|
||||
with pytest.raises(ClientError, match=internal_prefix):
|
||||
dynamodb.create_table(TableName=internal_prefix+wrong_name_postfix,
|
||||
@@ -198,6 +200,7 @@ def test_write_to_config(scylla_only, dynamodb):
|
||||
# Same test as above, just using the scylla_config_temporary() utility
|
||||
# function (also validating its correctness)
|
||||
def test_scylla_config_temporary(scylla_only, dynamodb):
|
||||
tbl = '.scylla.alternator.system.config'
|
||||
parameter = 'query_tombstone_page_limit'
|
||||
old_val = scylla_config_read(dynamodb, parameter)
|
||||
new_val = old_val + "1"
|
||||
|
||||
@@ -1021,7 +1021,7 @@ def test_transact_get_items_projection_expression(test_table_s):
|
||||
def test_transact_get_items_unused_expressionattributenames(test_table_s):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ValidationException.*unused.*#qq'):
|
||||
test_table_s.meta.client.transact_get_items(TransactItems=[
|
||||
ret = test_table_s.meta.client.transact_get_items(TransactItems=[
|
||||
{ 'Get': {
|
||||
'TableName': test_table_s.name,
|
||||
'Key': {'p': p},
|
||||
@@ -1034,7 +1034,7 @@ def test_transact_get_items_unused_expressionattributenames(test_table_s):
|
||||
def test_transact_get_items_missing_expressionattributenames(test_table_s):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ValidationException.*#zz'):
|
||||
test_table_s.meta.client.transact_get_items(TransactItems=[
|
||||
ret = test_table_s.meta.client.transact_get_items(TransactItems=[
|
||||
{ 'Get': {
|
||||
'TableName': test_table_s.name,
|
||||
'Key': {'p': p},
|
||||
@@ -1071,6 +1071,7 @@ def test_transact_get_items_100(test_table_s):
|
||||
# A transaction with 100 read actions is the limit, and 101 are not allowed:
|
||||
@pytest.mark.xfail(reason="#5064 - transactions not yet supported")
|
||||
def test_transact_get_items_101(test_table_s):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ValidationException.*[tT]ransactItems.*100'):
|
||||
test_table_s.meta.client.transact_get_items(TransactItems=[
|
||||
{ 'Get': {
|
||||
|
||||
@@ -638,10 +638,12 @@ def test_ttl_expiration_lsi_key(dynamodb, waits_for_expiration):
|
||||
assert response['TimeToLiveSpecification'] == ttl_spec
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
l = random_string()
|
||||
# expiration one minute in the past, so item should expire ASAP.
|
||||
expiration = int(time.time()) - 60
|
||||
table.put_item(Item={'p': p, 'c': c, 'l': expiration})
|
||||
start_time = time.time()
|
||||
gsi_was_alive = False
|
||||
while time.time() < start_time + max_duration:
|
||||
if 'Item' not in table.get_item(Key={'p': p, 'c': c}):
|
||||
# test is done - and successful:
|
||||
@@ -785,7 +787,7 @@ def test_ttl_expiration_long(dynamodb, waits_for_expiration):
|
||||
AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'N' },
|
||||
{ 'AttributeName': 'c', 'AttributeType': 'N' }]) as table:
|
||||
ttl_spec = {'AttributeName': 'expiration', 'Enabled': True}
|
||||
table.meta.client.update_time_to_live(TableName=table.name,
|
||||
response = table.meta.client.update_time_to_live(TableName=table.name,
|
||||
TimeToLiveSpecification=ttl_spec)
|
||||
with table.batch_writer() as batch:
|
||||
for p in range(N):
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user