Compare commits
82 Commits
copilot/fi
...
copilot/fi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cb0d8a38f1 | ||
|
|
12787302bf | ||
|
|
f65db4e8eb | ||
|
|
df2ac0f257 | ||
|
|
093e97a539 | ||
|
|
fa6e5d0754 | ||
|
|
08518b2c12 | ||
|
|
2a75b1374e | ||
|
|
2cb9bb8f3a | ||
|
|
f1d63d014c | ||
|
|
33f7bc28da | ||
|
|
f831ca5ab5 | ||
|
|
1fe0509a9b | ||
|
|
e7d76fd8f3 | ||
|
|
700853740d | ||
|
|
3c5dd5e5ae | ||
|
|
5971b2ad97 | ||
|
|
f89315d02f | ||
|
|
d5c205194b | ||
|
|
6ad10b141a | ||
|
|
8cf8e6c87d | ||
|
|
3a06c32749 | ||
|
|
74ab5addd3 | ||
|
|
55f4a2b754 | ||
|
|
1642c686c2 | ||
|
|
9431826c52 | ||
|
|
ba6fabfc88 | ||
|
|
a6618f225c | ||
|
|
0bfd07a268 | ||
|
|
c077283352 | ||
|
|
7061384a27 | ||
|
|
7bc59e93b2 | ||
|
|
a61c221902 | ||
|
|
386ec0af4e | ||
|
|
c4496dd63c | ||
|
|
84df5cfaf8 | ||
|
|
f06db096bd | ||
|
|
31f90c089c | ||
|
|
c58739de6a | ||
|
|
9e18cfbe17 | ||
|
|
7900aa5319 | ||
|
|
9d20f0a3d2 | ||
|
|
0476e8d272 | ||
|
|
e48789cf6c | ||
|
|
9039dfa4a5 | ||
|
|
1884e655d6 | ||
|
|
640c491388 | ||
|
|
cd83d1d4dc | ||
|
|
bbe0b01b14 | ||
|
|
2e7ba1f8ce | ||
|
|
b3a0e4c2dc | ||
|
|
08e5f35527 | ||
|
|
d66a36058b | ||
|
|
6681c0f33f | ||
|
|
13e9ee3f6f | ||
|
|
71e6ef90f4 | ||
|
|
902803babd | ||
|
|
4ed17c9e88 | ||
|
|
73db5c94de | ||
|
|
85f05fbe1b | ||
|
|
83f46fa7f5 | ||
|
|
f1fc5cc808 | ||
|
|
61bbea51ad | ||
|
|
c2b1b10ca0 | ||
|
|
ec87b92ba1 | ||
|
|
9c9371511f | ||
|
|
2e80997630 | ||
|
|
1143acaf5b | ||
|
|
e153cc434f | ||
|
|
64d9c370ee | ||
|
|
a3959fe3db | ||
|
|
f287484f4d | ||
|
|
70a0418102 | ||
|
|
6fcc1ecf94 | ||
|
|
8dde70d04c | ||
|
|
2e7070d3b7 | ||
|
|
a9442e6d56 | ||
|
|
d21faab9dc | ||
|
|
30f6a40ae6 | ||
|
|
5579489c4c | ||
|
|
17c9d640fe | ||
|
|
f98af582a7 |
14
.github/workflows/call_sync_milestone_to_jira.yml
vendored
Normal file
14
.github/workflows/call_sync_milestone_to_jira.yml
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
name: Call Jira release creation for new milestone
|
||||
|
||||
on:
|
||||
milestone:
|
||||
types: [created]
|
||||
|
||||
jobs:
|
||||
sync-milestone-to-jira:
|
||||
uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
|
||||
with:
|
||||
# Comma-separated list of Jira project keys
|
||||
jira_project_keys: "SCYLLADB,CUSTOMER"
|
||||
secrets:
|
||||
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
|
||||
@@ -31,6 +31,7 @@ set(swagger_files
|
||||
api-doc/column_family.json
|
||||
api-doc/commitlog.json
|
||||
api-doc/compaction_manager.json
|
||||
api-doc/client_routes.json
|
||||
api-doc/config.json
|
||||
api-doc/cql_server_test.json
|
||||
api-doc/endpoint_snitch_info.json
|
||||
@@ -68,6 +69,7 @@ target_sources(api
|
||||
PRIVATE
|
||||
api.cc
|
||||
cache_service.cc
|
||||
client_routes.cc
|
||||
collectd.cc
|
||||
column_family.cc
|
||||
commitlog.cc
|
||||
|
||||
23
api/api-doc/client_routes.def.json
Normal file
23
api/api-doc/client_routes.def.json
Normal file
@@ -0,0 +1,23 @@
|
||||
, "client_routes_entry": {
|
||||
"id": "client_routes_entry",
|
||||
"summary": "An entry storing client routes",
|
||||
"properties": {
|
||||
"connection_id": {"type": "string"},
|
||||
"host_id": {"type": "string", "format": "uuid"},
|
||||
"address": {"type": "string"},
|
||||
"port": {"type": "integer"},
|
||||
"tls_port": {"type": "integer"},
|
||||
"alternator_port": {"type": "integer"},
|
||||
"alternator_https_port": {"type": "integer"}
|
||||
},
|
||||
"required": ["connection_id", "host_id", "address"]
|
||||
}
|
||||
, "client_routes_key": {
|
||||
"id": "client_routes_key",
|
||||
"summary": "A key of client_routes_entry",
|
||||
"properties": {
|
||||
"connection_id": {"type": "string"},
|
||||
"host_id": {"type": "string", "format": "uuid"}
|
||||
}
|
||||
}
|
||||
|
||||
74
api/api-doc/client_routes.json
Normal file
74
api/api-doc/client_routes.json
Normal file
@@ -0,0 +1,74 @@
|
||||
, "/v2/client-routes":{
|
||||
"get": {
|
||||
"description":"List all client route entries",
|
||||
"operationId":"get_client_routes",
|
||||
"tags":["client_routes"],
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[],
|
||||
"responses":{
|
||||
"200":{
|
||||
"schema":{
|
||||
"type":"array",
|
||||
"items":{ "$ref":"#/definitions/client_routes_entry" }
|
||||
}
|
||||
},
|
||||
"default":{
|
||||
"description":"unexpected error",
|
||||
"schema":{"$ref":"#/definitions/ErrorModel"}
|
||||
}
|
||||
}
|
||||
},
|
||||
"post": {
|
||||
"description":"Upsert one or more client route entries",
|
||||
"operationId":"set_client_routes",
|
||||
"tags":["client_routes"],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"body",
|
||||
"in":"body",
|
||||
"required":true,
|
||||
"schema":{
|
||||
"type":"array",
|
||||
"items":{ "$ref":"#/definitions/client_routes_entry" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses":{
|
||||
"200":{ "description": "OK" },
|
||||
"default":{
|
||||
"description":"unexpected error",
|
||||
"schema":{ "$ref":"#/definitions/ErrorModel" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"delete": {
|
||||
"description":"Delete one or more client route entries",
|
||||
"operationId":"delete_client_routes",
|
||||
"tags":["client_routes"],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"body",
|
||||
"in":"body",
|
||||
"required":true,
|
||||
"schema":{
|
||||
"type":"array",
|
||||
"items":{ "$ref":"#/definitions/client_routes_key" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses":{
|
||||
"200":{
|
||||
"description": "OK"
|
||||
},
|
||||
"default":{
|
||||
"description":"unexpected error",
|
||||
"schema":{
|
||||
"$ref":"#/definitions/ErrorModel"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
13
api/api.cc
13
api/api.cc
@@ -37,6 +37,7 @@
|
||||
#include "raft.hh"
|
||||
#include "gms/gossip_address_map.hh"
|
||||
#include "service_levels.hh"
|
||||
#include "client_routes.hh"
|
||||
|
||||
logging::logger apilog("api");
|
||||
|
||||
@@ -67,9 +68,11 @@ future<> set_server_init(http_context& ctx) {
|
||||
rb02->set_api_doc(r);
|
||||
rb02->register_api_file(r, "swagger20_header");
|
||||
rb02->register_api_file(r, "metrics");
|
||||
rb02->register_api_file(r, "client_routes");
|
||||
rb->register_function(r, "system",
|
||||
"The system related API");
|
||||
rb02->add_definitions_file(r, "metrics");
|
||||
rb02->add_definitions_file(r, "client_routes");
|
||||
set_system(ctx, r);
|
||||
rb->register_function(r, "error_injection",
|
||||
"The error injection API");
|
||||
@@ -129,6 +132,16 @@ future<> unset_server_storage_service(http_context& ctx) {
|
||||
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_storage_service(ctx, r); });
|
||||
}
|
||||
|
||||
future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr) {
|
||||
return ctx.http_server.set_routes([&ctx, &cr] (routes& r) {
|
||||
set_client_routes(ctx, r, cr);
|
||||
});
|
||||
}
|
||||
|
||||
future<> unset_server_client_routes(http_context& ctx) {
|
||||
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_client_routes(ctx, r); });
|
||||
}
|
||||
|
||||
future<> set_load_meter(http_context& ctx, service::load_meter& lm) {
|
||||
return ctx.http_server.set_routes([&ctx, &lm] (routes& r) { set_load_meter(ctx, r, lm); });
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@ class storage_proxy;
|
||||
class storage_service;
|
||||
class raft_group0_client;
|
||||
class raft_group_registry;
|
||||
class client_routes_service;
|
||||
|
||||
} // namespace service
|
||||
|
||||
@@ -99,6 +100,8 @@ future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snit
|
||||
future<> unset_server_snitch(http_context& ctx);
|
||||
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
|
||||
future<> unset_server_storage_service(http_context& ctx);
|
||||
future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr);
|
||||
future<> unset_server_client_routes(http_context& ctx);
|
||||
future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader);
|
||||
future<> unset_server_sstables_loader(http_context& ctx);
|
||||
future<> set_server_view_builder(http_context& ctx, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g);
|
||||
|
||||
178
api/client_routes.cc
Normal file
178
api/client_routes.cc
Normal file
@@ -0,0 +1,178 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include <seastar/http/short_streams.hh>
|
||||
|
||||
#include "client_routes.hh"
|
||||
#include "api/api.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/client_routes.hh"
|
||||
#include "utils/rjson.hh"
|
||||
|
||||
|
||||
#include "api/api-doc/client_routes.json.hh"
|
||||
|
||||
using namespace seastar::httpd;
|
||||
using namespace std::chrono_literals;
|
||||
using namespace json;
|
||||
|
||||
extern logging::logger apilog;
|
||||
|
||||
namespace api {
|
||||
|
||||
static void validate_client_routes_endpoint(sharded<service::client_routes_service>& cr, sstring endpoint_name) {
|
||||
if (!cr.local().get_feature_service().client_routes) {
|
||||
apilog.warn("{}: called before the cluster feature was enabled", endpoint_name);
|
||||
throw std::runtime_error(fmt::format("{} requires all nodes to support the CLIENT_ROUTES cluster feature", endpoint_name));
|
||||
}
|
||||
}
|
||||
|
||||
static sstring parse_string(const char* name, rapidjson::Value const& v) {
|
||||
const auto it = v.FindMember(name);
|
||||
if (it == v.MemberEnd()) {
|
||||
throw bad_param_exception(fmt::format("Missing '{}'", name));
|
||||
}
|
||||
if (!it->value.IsString()) {
|
||||
throw bad_param_exception(fmt::format("'{}' must be a string", name));
|
||||
}
|
||||
return {it->value.GetString(), it->value.GetStringLength()};
|
||||
}
|
||||
|
||||
static std::optional<uint32_t> parse_port(const char* name, rapidjson::Value const& v) {
|
||||
const auto it = v.FindMember(name);
|
||||
if (it == v.MemberEnd()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
if (!it->value.IsInt()) {
|
||||
throw bad_param_exception(fmt::format("'{}' must be an integer", name));
|
||||
}
|
||||
auto port = it->value.GetInt();
|
||||
if (port < 1 || port > 65535) {
|
||||
throw bad_param_exception(fmt::format("'{}' value={} is outside the allowed port range", name, port));
|
||||
}
|
||||
return port;
|
||||
}
|
||||
|
||||
static std::vector<service::client_routes_service::client_route_entry> parse_set_client_array(const rapidjson::Document& root) {
|
||||
if (!root.IsArray()) {
|
||||
throw bad_param_exception("Body must be a JSON array");
|
||||
}
|
||||
|
||||
std::vector<service::client_routes_service::client_route_entry> v;
|
||||
v.reserve(root.GetArray().Size());
|
||||
for (const auto& element : root.GetArray()) {
|
||||
if (!element.IsObject()) { throw bad_param_exception("Each element must be object"); }
|
||||
|
||||
const auto port = parse_port("port", element);
|
||||
const auto tls_port = parse_port("tls_port", element);
|
||||
const auto alternator_port = parse_port("alternator_port", element);
|
||||
const auto alternator_https_port = parse_port("alternator_https_port", element);
|
||||
|
||||
if (!port.has_value() && !tls_port.has_value() && !alternator_port.has_value() && !alternator_https_port.has_value()) {
|
||||
throw bad_param_exception("At least one port field ('port', 'tls_port', 'alternator_port', 'alternator_https_port') must be specified");
|
||||
}
|
||||
|
||||
v.emplace_back(
|
||||
parse_string("connection_id", element),
|
||||
utils::UUID{parse_string("host_id", element)},
|
||||
parse_string("address", element),
|
||||
port,
|
||||
tls_port,
|
||||
alternator_port,
|
||||
alternator_https_port
|
||||
);
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_set_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
|
||||
validate_client_routes_endpoint(cr, "rest_set_client_routes");
|
||||
|
||||
rapidjson::Document root;
|
||||
auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
|
||||
root.Parse(content.c_str());
|
||||
const auto route_entries = parse_set_client_array(root);
|
||||
|
||||
co_await cr.local().set_client_routes(route_entries);
|
||||
co_return seastar::json::json_void();
|
||||
}
|
||||
|
||||
static std::vector<service::client_routes_service::client_route_key> parse_delete_client_array(const rapidjson::Document& root) {
|
||||
if (!root.IsArray()) {
|
||||
throw bad_param_exception("Body must be a JSON array");
|
||||
}
|
||||
|
||||
std::vector<service::client_routes_service::client_route_key> v;
|
||||
v.reserve(root.GetArray().Size());
|
||||
for (const auto& element : root.GetArray()) {
|
||||
v.emplace_back(
|
||||
parse_string("connection_id", element),
|
||||
utils::UUID{parse_string("host_id", element)}
|
||||
);
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_delete_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
|
||||
validate_client_routes_endpoint(cr, "delete_client_routes");
|
||||
|
||||
rapidjson::Document root;
|
||||
auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
|
||||
root.Parse(content.c_str());
|
||||
|
||||
const auto route_keys = parse_delete_client_array(root);
|
||||
co_await cr.local().delete_client_routes(route_keys);
|
||||
co_return seastar::json::json_void();
|
||||
}
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_get_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
|
||||
validate_client_routes_endpoint(cr, "get_client_routes");
|
||||
|
||||
co_return co_await cr.invoke_on(0, [] (service::client_routes_service& cr) -> future<json::json_return_type> {
|
||||
co_return json::json_return_type(stream_range_as_array(co_await cr.get_client_routes(), [](const service::client_routes_service::client_route_entry & entry) {
|
||||
seastar::httpd::client_routes_json::client_routes_entry obj;
|
||||
obj.connection_id = entry.connection_id;
|
||||
obj.host_id = fmt::to_string(entry.host_id);
|
||||
obj.address = entry.address;
|
||||
if (entry.port.has_value()) { obj.port = entry.port.value(); }
|
||||
if (entry.tls_port.has_value()) { obj.tls_port = entry.tls_port.value(); }
|
||||
if (entry.alternator_port.has_value()) { obj.alternator_port = entry.alternator_port.value(); }
|
||||
if (entry.alternator_https_port.has_value()) { obj.alternator_https_port = entry.alternator_https_port.value(); }
|
||||
return obj;
|
||||
}));
|
||||
});
|
||||
}
|
||||
|
||||
void set_client_routes(http_context& ctx, routes& r, sharded<service::client_routes_service>& cr) {
|
||||
seastar::httpd::client_routes_json::set_client_routes.set(r, [&ctx, &cr] (std::unique_ptr<seastar::http::request> req) {
|
||||
return rest_set_client_routes(ctx, cr, std::move(req));
|
||||
});
|
||||
seastar::httpd::client_routes_json::delete_client_routes.set(r, [&ctx, &cr] (std::unique_ptr<seastar::http::request> req) {
|
||||
return rest_delete_client_routes(ctx, cr, std::move(req));
|
||||
});
|
||||
seastar::httpd::client_routes_json::get_client_routes.set(r, [&ctx, &cr] (std::unique_ptr<seastar::http::request> req) {
|
||||
return rest_get_client_routes(ctx, cr, std::move(req));
|
||||
});
|
||||
}
|
||||
|
||||
void unset_client_routes(http_context& ctx, routes& r) {
|
||||
seastar::httpd::client_routes_json::set_client_routes.unset(r);
|
||||
seastar::httpd::client_routes_json::delete_client_routes.unset(r);
|
||||
seastar::httpd::client_routes_json::get_client_routes.unset(r);
|
||||
}
|
||||
|
||||
}
|
||||
20
api/client_routes.hh
Normal file
20
api/client_routes.hh
Normal file
@@ -0,0 +1,20 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include <seastar/json/json_elements.hh>
|
||||
#include "api/api_init.hh"
|
||||
|
||||
namespace api {
|
||||
|
||||
void set_client_routes(http_context& ctx, httpd::routes& r, sharded<service::client_routes_service>& cr);
|
||||
void unset_client_routes(http_context& ctx, httpd::routes& r);
|
||||
|
||||
}
|
||||
@@ -1158,6 +1158,7 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'locator/topology.cc',
|
||||
'locator/util.cc',
|
||||
'service/client_state.cc',
|
||||
'service/client_routes.cc',
|
||||
'service/storage_service.cc',
|
||||
'service/session.cc',
|
||||
'service/task_manager_module.cc',
|
||||
@@ -1318,6 +1319,8 @@ api = ['api/api.cc',
|
||||
'api/storage_proxy.cc',
|
||||
Json2Code('api/api-doc/cache_service.json'),
|
||||
'api/cache_service.cc',
|
||||
Json2Code('api/api-doc/client_routes.json'),
|
||||
'api/client_routes.cc',
|
||||
Json2Code('api/api-doc/collectd.json'),
|
||||
'api/collectd.cc',
|
||||
Json2Code('api/api-doc/endpoint_snitch_info.json'),
|
||||
|
||||
@@ -64,6 +64,10 @@ bool query_processor::topology_global_queue_empty() {
|
||||
return remote().first.get().ss.topology_global_queue_empty();
|
||||
}
|
||||
|
||||
future<bool> query_processor::ongoing_rf_change(const service::group0_guard& guard, sstring ks) {
|
||||
return remote().first.get().ss.ongoing_rf_change(guard, std::move(ks));
|
||||
}
|
||||
|
||||
static service::query_state query_state_for_internal_call() {
|
||||
return {service::client_state::for_internal_calls(), empty_service_permit()};
|
||||
}
|
||||
|
||||
@@ -474,6 +474,7 @@ public:
|
||||
void reset_cache();
|
||||
|
||||
bool topology_global_queue_empty();
|
||||
future<bool> ongoing_rf_change(const service::group0_guard& guard, sstring ks);
|
||||
|
||||
query_options make_internal_options(
|
||||
const statements::prepared_statement::checked_weak_ptr& p,
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "mutation/canonical_mutation.hh"
|
||||
#include "prepared_statement.hh"
|
||||
#include "seastar/coroutine/exception.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "service/topology_mutation.hh"
|
||||
@@ -138,6 +139,7 @@ bool cql3::statements::alter_keyspace_statement::changes_tablets(query_processor
|
||||
future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>
|
||||
cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_processor& qp, service::query_state& state, const query_options& options, service::group0_batch& mc) const {
|
||||
using namespace cql_transport;
|
||||
bool unknown_keyspace = false;
|
||||
try {
|
||||
event::schema_change::target_type target_type = event::schema_change::target_type::KEYSPACE;
|
||||
auto ks = qp.db().find_keyspace(_name);
|
||||
@@ -158,8 +160,12 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
|
||||
// when in reality nothing or only schema is being changed
|
||||
if (changes_tablets(qp)) {
|
||||
if (!qp.proxy().features().topology_global_request_queue && !qp.topology_global_queue_empty()) {
|
||||
return make_exception_future<std::tuple<::shared_ptr<::cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(
|
||||
exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
|
||||
co_await coroutine::return_exception(
|
||||
exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
|
||||
}
|
||||
if (qp.proxy().features().rack_list_rf && co_await qp.ongoing_rf_change(mc.guard(),_name)) {
|
||||
co_await coroutine::return_exception(
|
||||
exceptions::invalid_request_exception(format("Another RF change for this keyspace {} ongoing, please retry.", _name)));
|
||||
}
|
||||
qp.db().real_database().validate_keyspace_update(*ks_md_update);
|
||||
|
||||
@@ -242,10 +248,15 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
|
||||
target_type,
|
||||
keyspace());
|
||||
mc.add_mutations(std::move(muts), "CQL alter keyspace");
|
||||
return make_ready_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(std::make_tuple(std::move(ret), warnings));
|
||||
co_return std::make_tuple(std::move(ret), warnings);
|
||||
} catch (data_dictionary::no_such_keyspace& e) {
|
||||
return make_exception_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(exceptions::invalid_request_exception("Unknown keyspace " + _name));
|
||||
unknown_keyspace = true;
|
||||
}
|
||||
if (unknown_keyspace) {
|
||||
co_await coroutine::return_exception(
|
||||
exceptions::invalid_request_exception("Unknown keyspace " + _name));
|
||||
}
|
||||
std::unreachable();
|
||||
}
|
||||
|
||||
std::unique_ptr<cql3::statements::prepared_statement>
|
||||
|
||||
@@ -61,7 +61,7 @@ expand_to_racks(const locator::token_metadata& tm,
|
||||
|
||||
// Handle ALTER:
|
||||
// ([]|0) -> numeric is allowed, there are no existing replicas
|
||||
// numeric -> numeric' is not supported. User should convert RF to rack list of equal count first.
|
||||
// numeric -> numeric' is not supported unless numeric == numeric'. User should convert RF to rack list of equal count first.
|
||||
// rack_list -> len(rack_list) is allowed (no-op)
|
||||
// rack_list -> numeric is not allowed
|
||||
if (old_options.contains(dc)) {
|
||||
@@ -75,6 +75,8 @@ expand_to_racks(const locator::token_metadata& tm,
|
||||
"Cannot change replication factor for '{}' from {} to numeric {}, use rack list instead",
|
||||
dc, old_rf_val, data.count()));
|
||||
}
|
||||
} else if (old_rf.count() == data.count()) {
|
||||
return rf;
|
||||
} else if (old_rf.count() > 0) {
|
||||
throw exceptions::configuration_exception(fmt::format(
|
||||
"Cannot change replication factor for '{}' from {} to {}, only rack list is allowed",
|
||||
@@ -153,6 +155,8 @@ static locator::replication_strategy_config_options prepare_options(
|
||||
}
|
||||
|
||||
// Validate options.
|
||||
bool numeric_to_rack_list_transition = false;
|
||||
bool rf_change = false;
|
||||
for (auto&& [dc, opt] : options) {
|
||||
locator::replication_factor_data rf(opt);
|
||||
|
||||
@@ -162,6 +166,7 @@ static locator::replication_strategy_config_options prepare_options(
|
||||
old_rf = locator::replication_factor_data(i->second);
|
||||
}
|
||||
|
||||
rf_change = rf_change || (old_rf && old_rf->count() != rf.count()) || (!old_rf && rf.count() != 0);
|
||||
if (!rf.is_rack_based()) {
|
||||
if (old_rf && old_rf->is_rack_based() && rf.count() != 0) {
|
||||
if (old_rf->count() != rf.count()) {
|
||||
@@ -187,12 +192,11 @@ static locator::replication_strategy_config_options prepare_options(
|
||||
throw exceptions::configuration_exception(fmt::format(
|
||||
"Rack list for '{}' contains duplicate entries", dc));
|
||||
}
|
||||
if (old_rf && !old_rf->is_rack_based() && old_rf->count() != 0) {
|
||||
// FIXME: Allow this if replicas already conform to the given rack list.
|
||||
// FIXME: Implement automatic colocation to allow transition to rack list.
|
||||
throw exceptions::configuration_exception(fmt::format(
|
||||
"Cannot change replication factor from numeric to rack list for '{}'", dc));
|
||||
}
|
||||
numeric_to_rack_list_transition = numeric_to_rack_list_transition || (old_rf && !old_rf->is_rack_based() && old_rf->count() != 0);
|
||||
}
|
||||
|
||||
if (numeric_to_rack_list_transition && rf_change) {
|
||||
throw exceptions::configuration_exception("Cannot change replication factor from numeric to rack list and rf value at the same time");
|
||||
}
|
||||
|
||||
if (!rf && options.empty() && old_options.empty()) {
|
||||
@@ -412,7 +416,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(s
|
||||
? std::optional<unsigned>(0) : std::nullopt;
|
||||
auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
|
||||
bool uses_tablets = initial_tablets.has_value();
|
||||
bool rack_list_enabled = feat.rack_list_rf;
|
||||
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
|
||||
auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
|
||||
return data_dictionary::keyspace_metadata::new_keyspace(ks_name, sc,
|
||||
std::move(options), initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
|
||||
@@ -428,7 +432,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
|
||||
throw exceptions::invalid_request_exception("Cannot alter replication strategy vnode/tablets flavor");
|
||||
}
|
||||
auto sc = get_replication_strategy_class();
|
||||
bool rack_list_enabled = feat.rack_list_rf;
|
||||
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
|
||||
if (sc) {
|
||||
options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
|
||||
} else {
|
||||
|
||||
@@ -248,7 +248,7 @@ future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
|
||||
// which is larger than the segment ID of the RP of the last written hint.
|
||||
cfg.base_segment_id = _last_written_rp.base_id();
|
||||
|
||||
return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) -> future<commitlog> {
|
||||
return commitlog::create_commitlog(std::move(cfg)).then([this] (this auto, commitlog l) -> future<commitlog> {
|
||||
// add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
|
||||
// When this happens we want to refill _sender's segments only if it has finished with the segments he had before.
|
||||
if (_sender.have_segments()) {
|
||||
|
||||
@@ -135,5 +135,5 @@ const std::string db::object_storage_endpoint_param::gs_type = "gs";
|
||||
|
||||
auto fmt::formatter<db::object_storage_endpoint_param>::format(const db::object_storage_endpoint_param& e, fmt::format_context& ctx) const
|
||||
-> decltype(ctx.out()) {
|
||||
return fmt::format_to(ctx.out(), "object_storage_endpoint_param{{}}", e.to_json_string());
|
||||
return fmt::format_to(ctx.out(), "object_storage_endpoint_param{}", e.to_json_string());
|
||||
}
|
||||
|
||||
@@ -110,6 +110,7 @@ namespace {
|
||||
system_keyspace::v3::CDC_LOCAL,
|
||||
system_keyspace::DICTS,
|
||||
system_keyspace::VIEW_BUILDING_TASKS,
|
||||
system_keyspace::CLIENT_ROUTES,
|
||||
};
|
||||
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
|
||||
props.enable_schema_commitlog();
|
||||
@@ -137,6 +138,7 @@ namespace {
|
||||
system_keyspace::ROLE_PERMISSIONS,
|
||||
system_keyspace::DICTS,
|
||||
system_keyspace::VIEW_BUILDING_TASKS,
|
||||
system_keyspace::CLIENT_ROUTES,
|
||||
};
|
||||
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
|
||||
props.is_group0_table = true;
|
||||
@@ -309,6 +311,7 @@ schema_ptr system_keyspace::topology() {
|
||||
.with_column("tablet_balancing_enabled", boolean_type, column_kind::static_column)
|
||||
.with_column("upgrade_state", utf8_type, column_kind::static_column)
|
||||
.with_column("global_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.with_column("paused_rf_change_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
|
||||
.set_comment("Current state of topology change machine")
|
||||
.with_hash_version()
|
||||
.build();
|
||||
@@ -1415,6 +1418,23 @@ schema_ptr system_keyspace::view_building_tasks() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
schema_ptr system_keyspace::client_routes() {
|
||||
static thread_local auto schema = [] {
|
||||
auto id = generate_legacy_id(NAME, CLIENT_ROUTES);
|
||||
return schema_builder(NAME, CLIENT_ROUTES, std::make_optional(id))
|
||||
.with_column("connection_id", utf8_type, column_kind::partition_key)
|
||||
.with_column("host_id", uuid_type, column_kind::clustering_key)
|
||||
.with_column("address", utf8_type)
|
||||
.with_column("port", int32_type)
|
||||
.with_column("tls_port", int32_type)
|
||||
.with_column("alternator_port", int32_type)
|
||||
.with_column("alternator_https_port", int32_type)
|
||||
.with_hash_version()
|
||||
.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
future<system_keyspace::local_info> system_keyspace::load_local_info() {
|
||||
auto msg = co_await execute_cql(format("SELECT host_id, cluster_name, data_center, rack FROM system.{} WHERE key=?", LOCAL), sstring(LOCAL));
|
||||
|
||||
@@ -2342,7 +2362,7 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
|
||||
v3::cdc_local(),
|
||||
raft(), raft_snapshots(), raft_snapshot_config(), group0_history(), discovery(),
|
||||
topology(), cdc_generations_v3(), topology_requests(), service_levels_v2(), view_build_status_v2(),
|
||||
dicts(), view_building_tasks(), cdc_streams_state(), cdc_streams_history()
|
||||
dicts(), view_building_tasks(), client_routes(), cdc_streams_state(), cdc_streams_history()
|
||||
});
|
||||
|
||||
if (cfg.check_experimental(db::experimental_features_t::feature::BROADCAST_TABLES)) {
|
||||
@@ -3137,7 +3157,10 @@ static bool must_have_tokens(service::node_state nst) {
|
||||
// A decommissioning node doesn't have tokens at the end, they are
|
||||
// removed during transition to the left_token_ring state.
|
||||
case service::node_state::decommissioning: return false;
|
||||
case service::node_state::removing: return true;
|
||||
// A removing node might or might not have tokens depending on whether
|
||||
// REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled. To support both
|
||||
// cases, we allow removing nodes to not have tokens.
|
||||
case service::node_state::removing: return false;
|
||||
case service::node_state::rebuilding: return true;
|
||||
case service::node_state::normal: return true;
|
||||
case service::node_state::left: return false;
|
||||
@@ -3377,6 +3400,12 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("paused_rf_change_requests")) {
|
||||
for (auto&& v : deserialize_set_column(*topology(), some_row, "paused_rf_change_requests")) {
|
||||
ret.paused_rf_change_requests.insert(value_cast<utils::UUID>(v));
|
||||
}
|
||||
}
|
||||
|
||||
if (some_row.has("enabled_features")) {
|
||||
ret.enabled_features = decode_features(deserialize_set_column(*topology(), some_row, "enabled_features"));
|
||||
}
|
||||
@@ -3588,35 +3617,43 @@ system_keyspace::topology_requests_entry system_keyspace::topology_request_row_t
|
||||
return entry;
|
||||
}
|
||||
|
||||
future<system_keyspace::topology_requests_entry> system_keyspace::get_topology_request_entry(utils::UUID id, bool require_entry) {
|
||||
future<system_keyspace::topology_requests_entry> system_keyspace::get_topology_request_entry(utils::UUID id) {
|
||||
auto r = co_await get_topology_request_entry_opt(id);
|
||||
if (!r) {
|
||||
on_internal_error(slogger, format("no entry for request id {}", id));
|
||||
}
|
||||
co_return std::move(*r);
|
||||
}
|
||||
|
||||
future<std::optional<system_keyspace::topology_requests_entry>> system_keyspace::get_topology_request_entry_opt(utils::UUID id) {
|
||||
auto rs = co_await execute_cql(
|
||||
format("SELECT * FROM system.{} WHERE id = {}", TOPOLOGY_REQUESTS, id));
|
||||
|
||||
if (!rs || rs->empty()) {
|
||||
if (require_entry) {
|
||||
on_internal_error(slogger, format("no entry for request id {}", id));
|
||||
} else {
|
||||
co_return topology_requests_entry{
|
||||
.id = utils::null_uuid()
|
||||
};
|
||||
}
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
const auto& row = rs->one();
|
||||
co_return topology_request_row_to_entry(id, row);
|
||||
}
|
||||
|
||||
future<system_keyspace::topology_requests_entries> system_keyspace::get_node_ops_request_entries(db_clock::time_point end_time_limit) {
|
||||
future<system_keyspace::topology_requests_entries> system_keyspace::get_topology_request_entries(std::vector<std::variant<service::topology_request, service::global_topology_request>> request_types, db_clock::time_point end_time_limit) {
|
||||
sstring request_types_str = "";
|
||||
bool first = true;
|
||||
for (const auto& rt : request_types) {
|
||||
if (!std::exchange(first, false)) {
|
||||
request_types_str += ", ";
|
||||
}
|
||||
request_types_str += std::visit([] (auto&& arg) { return fmt::format("'{}'", arg); }, rt);
|
||||
}
|
||||
|
||||
// Running requests.
|
||||
auto rs_running = co_await execute_cql(
|
||||
format("SELECT * FROM system.{} WHERE done = false AND request_type IN ('{}', '{}', '{}', '{}', '{}') ALLOW FILTERING", TOPOLOGY_REQUESTS,
|
||||
service::topology_request::join, service::topology_request::replace, service::topology_request::rebuild, service::topology_request::leave, service::topology_request::remove));
|
||||
|
||||
format("SELECT * FROM system.{} WHERE done = false AND request_type IN ({}) ALLOW FILTERING", TOPOLOGY_REQUESTS, request_types_str));
|
||||
|
||||
// Requests which finished after end_time_limit.
|
||||
auto rs_done = co_await execute_cql(
|
||||
format("SELECT * FROM system.{} WHERE end_time > {} AND request_type IN ('{}', '{}', '{}', '{}', '{}') ALLOW FILTERING", TOPOLOGY_REQUESTS, end_time_limit.time_since_epoch().count(),
|
||||
service::topology_request::join, service::topology_request::replace, service::topology_request::rebuild, service::topology_request::leave, service::topology_request::remove));
|
||||
format("SELECT * FROM system.{} WHERE end_time > {} AND request_type IN ({}) ALLOW FILTERING", TOPOLOGY_REQUESTS, end_time_limit.time_since_epoch().count(), request_types_str));
|
||||
|
||||
topology_requests_entries m;
|
||||
for (const auto& row: *rs_done) {
|
||||
@@ -3634,6 +3671,16 @@ future<system_keyspace::topology_requests_entries> system_keyspace::get_node_ops
|
||||
co_return m;
|
||||
}
|
||||
|
||||
future<system_keyspace::topology_requests_entries> system_keyspace::get_node_ops_request_entries(db_clock::time_point end_time_limit) {
|
||||
return get_topology_request_entries({
|
||||
service::topology_request::join,
|
||||
service::topology_request::replace,
|
||||
service::topology_request::rebuild,
|
||||
service::topology_request::leave,
|
||||
service::topology_request::remove
|
||||
}, end_time_limit);
|
||||
}
|
||||
|
||||
future<mutation> system_keyspace::get_insert_dict_mutation(
|
||||
std::string_view name,
|
||||
bytes data,
|
||||
|
||||
@@ -199,6 +199,7 @@ public:
|
||||
static constexpr auto VIEW_BUILD_STATUS_V2 = "view_build_status_v2";
|
||||
static constexpr auto DICTS = "dicts";
|
||||
static constexpr auto VIEW_BUILDING_TASKS = "view_building_tasks";
|
||||
static constexpr auto CLIENT_ROUTES = "client_routes";
|
||||
|
||||
// auth
|
||||
static constexpr auto ROLES = "roles";
|
||||
@@ -276,6 +277,7 @@ public:
|
||||
static schema_ptr view_build_status_v2();
|
||||
static schema_ptr dicts();
|
||||
static schema_ptr view_building_tasks();
|
||||
static schema_ptr client_routes();
|
||||
|
||||
// auth
|
||||
static schema_ptr roles();
|
||||
@@ -667,7 +669,9 @@ public:
|
||||
|
||||
future<service::topology_request_state> get_topology_request_state(utils::UUID id, bool require_entry);
|
||||
topology_requests_entry topology_request_row_to_entry(utils::UUID id, const cql3::untyped_result_set_row& row);
|
||||
future<topology_requests_entry> get_topology_request_entry(utils::UUID id, bool require_entry);
|
||||
future<topology_requests_entry> get_topology_request_entry(utils::UUID id);
|
||||
future<std::optional<topology_requests_entry>> get_topology_request_entry_opt(utils::UUID id);
|
||||
future<system_keyspace::topology_requests_entries> get_topology_request_entries(std::vector<std::variant<service::topology_request, service::global_topology_request>> request_types, db_clock::time_point end_time_limit);
|
||||
future<topology_requests_entries> get_node_ops_request_entries(db_clock::time_point end_time_limit);
|
||||
|
||||
public:
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
# Alternator: DynamoDB API in Scylla
|
||||
# Alternator: DynamoDB API in ScyllaDB
|
||||
|
||||
## Introduction
|
||||
Alternator is a Scylla feature adding compatibility with Amazon DynamoDB(TM).
|
||||
Alternator is a ScyllaDB feature adding compatibility with Amazon DynamoDB(TM).
|
||||
DynamoDB's API uses JSON-encoded requests and responses which are sent over
|
||||
an HTTP or HTTPS transport. It is described in detail in Amazon's [DynamoDB
|
||||
API Reference](https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/).
|
||||
|
||||
Our goal is that any application written to use Amazon DynamoDB could
|
||||
be run, unmodified, against Scylla with Alternator enabled. Alternator's
|
||||
be run, unmodified, against ScyllaDB with Alternator enabled. Alternator's
|
||||
compatibility with DynamoDB is fairly complete, but users should be aware
|
||||
of some differences and some unimplemented features. The extent of
|
||||
Alternator's compatibility with DynamoDB is described in the
|
||||
[Scylla Alternator for DynamoDB users](compatibility.md) document,
|
||||
[ScyllaDB Alternator for DynamoDB users](compatibility.md) document,
|
||||
which is updated as the work on Alternator progresses and compatibility
|
||||
continues to improve.
|
||||
|
||||
@@ -19,8 +19,8 @@ Alternator also adds several features and APIs that are not available in
|
||||
DynamoDB. These are described in [Alternator-specific APIs](new-apis.md).
|
||||
|
||||
## Running Alternator
|
||||
By default, Scylla does not listen for DynamoDB API requests. To enable
|
||||
this API in Scylla you must set at least two configuration options,
|
||||
By default, ScyllaDB does not listen for DynamoDB API requests. To enable
|
||||
this API in ScyllaDB you must set at least two configuration options,
|
||||
**alternator_port** and **alternator_write_isolation**. For example in the
|
||||
YAML configuration file:
|
||||
```yaml
|
||||
@@ -30,7 +30,7 @@ alternator_write_isolation: only_rmw_uses_lwt # or always, forbid or unsafe
|
||||
or, equivalently, via command-line arguments: `--alternator-port=8000
|
||||
--alternator-write-isolation=only_rmw_uses_lwt.
|
||||
|
||||
the **alternator_port** option determines on which port Scylla listens for
|
||||
the **alternator_port** option determines on which port ScyllaDB listens for
|
||||
DynamoDB API requests. By default, it listens on this port on all network
|
||||
interfaces. To listen only on a specific interface, configure also the
|
||||
**alternator_address** option.
|
||||
@@ -41,12 +41,12 @@ Alternator has four different choices
|
||||
for the implementation of writes, each with different advantages. You should
|
||||
carefully consider which of the options makes more sense for your intended
|
||||
use case and configure alternator_write_isolation accordingly. There is
|
||||
currently no default for this option: Trying to run Scylla with an Alternator
|
||||
currently no default for this option: Trying to run ScyllaDB with an Alternator
|
||||
port selected but without configuring write isolation will result in an error message,
|
||||
asking you to set it.
|
||||
|
||||
In addition to (or instead of) serving HTTP requests on alternator_port,
|
||||
Scylla can accept DynamoDB API requests over HTTPS (encrypted), on the port
|
||||
ScyllaDB can accept DynamoDB API requests over HTTPS (encrypted), on the port
|
||||
specified by **alternator_https_port**. As usual for HTTPS servers, the
|
||||
operator must specify certificate and key files. By default these should
|
||||
be placed in `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key`, but
|
||||
@@ -54,7 +54,7 @@ these default locations can overridden by specifying
|
||||
`--alternator-encryption-options keyfile="..."` and
|
||||
`--alternator-encryption-options certificate="..."`.
|
||||
|
||||
By default, Scylla saves a snapshot of deleted tables. But Alternator does
|
||||
By default, ScyllaDB saves a snapshot of deleted tables. But Alternator does
|
||||
not offer an API to restore these snapshots, so these snapshots are not useful
|
||||
and waste disk space - deleting a table does not recover any disk space.
|
||||
It is therefore recommended to disable this automatic-snapshotting feature
|
||||
@@ -73,11 +73,11 @@ itself. Instructions, code and examples for doing this can be found in the
|
||||
|
||||
This section provides only a very brief introduction to Alternator's
|
||||
design. A much more detailed document about the features of the DynamoDB
|
||||
API and how they are, or could be, implemented in Scylla can be found in:
|
||||
API and how they are, or could be, implemented in ScyllaDB can be found in:
|
||||
<https://docs.google.com/document/d/1i4yjF5OSAazAY_-T8CBce9-2ykW4twx_E_Nt2zDoOVs>
|
||||
|
||||
Almost all of Alternator's source code (except some initialization code)
|
||||
can be found in the alternator/ subdirectory of Scylla's source code.
|
||||
can be found in the alternator/ subdirectory of ScyllaDB's source code.
|
||||
Extensive functional tests can be found in the test/alternator
|
||||
subdirectory. These tests are written in Python, and can be run against
|
||||
both Alternator and Amazon's DynamoDB; This allows verifying that
|
||||
@@ -85,15 +85,15 @@ Alternator's behavior matches the one observed on DynamoDB.
|
||||
See test/alternator/README.md for more information about the tests and
|
||||
how to run them.
|
||||
|
||||
With Alternator enabled on port 8000 (for example), every Scylla node
|
||||
With Alternator enabled on port 8000 (for example), every ScyllaDB node
|
||||
listens for DynamoDB API requests on this port. These requests, in
|
||||
JSON format over HTTP, are parsed and result in calls to internal Scylla
|
||||
C++ functions - there is no CQL generation or parsing involved.
|
||||
In Scylla terminology, the node receiving the request acts as the
|
||||
In ScyllaDB terminology, the node receiving the request acts as the
|
||||
*coordinator*, and often passes the request on to one or more other nodes -
|
||||
*replicas* which hold copies of the requested data.
|
||||
|
||||
Alternator tables are stored as Scylla tables, each in a separate keyspace.
|
||||
Alternator tables are stored as ScyllaDB tables, each in a separate keyspace.
|
||||
Each keyspace is initialized when the corresponding Alternator table is
|
||||
created (with a CreateTable request). The replication factor (RF) for this
|
||||
keyspace is chosen at that point, depending on the size of the cluster:
|
||||
@@ -101,19 +101,19 @@ RF=3 is used on clusters with three or more nodes, and RF=1 is used for
|
||||
smaller clusters. Such smaller clusters are, of course, only recommended
|
||||
for tests because of the risk of data loss.
|
||||
|
||||
Each table in Alternator is stored as a Scylla table in a separate
|
||||
Each table in Alternator is stored as a ScyllaDB table in a separate
|
||||
keyspace. The DynamoDB key columns (hash and sort key) have known types,
|
||||
and become partition and clustering key columns of the Scylla table.
|
||||
and become partition and clustering key columns of the ScyllaDB table.
|
||||
All other attributes may be different for each row, so are stored in one
|
||||
map column in Scylla, and not as separate columns.
|
||||
map column in ScyllaDB, and not as separate columns.
|
||||
|
||||
DynamoDB supports two consistency levels for reads, "eventual consistency"
|
||||
and "strong consistency". These two modes are implemented using Scylla's CL
|
||||
and "strong consistency". These two modes are implemented using ScyllaDB's CL
|
||||
(consistency level) feature: All writes are done using the `LOCAL_QUORUM`
|
||||
consistency level, then strongly-consistent reads are done with
|
||||
`LOCAL_QUORUM`, while eventually-consistent reads are with just `LOCAL_ONE`.
|
||||
|
||||
In Scylla (and its inspiration, Cassandra), high write performance is
|
||||
In ScyllaDB (and its inspiration, Cassandra), high write performance is
|
||||
achieved by ensuring that writes do not require reads from disk.
|
||||
The DynamoDB API, however, provides many types of requests that need a read
|
||||
before the write (a.k.a. RMW requests - read-modify-write). For example,
|
||||
@@ -121,7 +121,7 @@ a request may copy an existing attribute, increment an attribute,
|
||||
be conditional on some expression involving existing values of attribute,
|
||||
or request that the previous values of attributes be returned. These
|
||||
read-modify-write transactions should be _isolated_ from each other, so
|
||||
by default Alternator implements every write operation using Scylla's
|
||||
by default Alternator implements every write operation using ScyllaDB's
|
||||
LWT (lightweight transactions). This default can be overridden on a per-table
|
||||
basis, by tagging the table as explained above in the "write isolation
|
||||
policies" section.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# ScyllaDB Alternator for DynamoDB users
|
||||
|
||||
Scylla supports the DynamoDB API (this feature is codenamed "Alternator").
|
||||
ScyllaDB supports the DynamoDB API (this feature is codenamed "Alternator").
|
||||
Our goal is to support any application written for Amazon DynamoDB.
|
||||
Nevertheless, there are a few differences between DynamoDB and Scylla, and
|
||||
and a few DynamoDB features that have not yet been implemented in Scylla.
|
||||
@@ -8,16 +8,16 @@ The purpose of this document is to inform users of these differences.
|
||||
|
||||
## Provisioning
|
||||
|
||||
The most obvious difference between DynamoDB and Scylla is that while
|
||||
DynamoDB is a shared cloud service, Scylla is a dedicated service running
|
||||
The most obvious difference between DynamoDB and ScyllaDB is that while
|
||||
DynamoDB is a shared cloud service, ScyllaDB is a dedicated service running
|
||||
on your private cluster. Whereas DynamoDB allows you to "provision" the
|
||||
number of requests per second you'll need - or at an extra cost not even
|
||||
provision that - Scylla requires you to provision your cluster. You need
|
||||
provision that - ScyllaDB requires you to provision your cluster. You need
|
||||
to reason about the number and size of your nodes - not the throughput.
|
||||
|
||||
Moreover, DynamoDB's per-table provisioning (`BillingMode=PROVISIONED`) is
|
||||
not yet supported by Scylla. The BillingMode and ProvisionedThroughput options
|
||||
on a table need to be valid but are ignored, and Scylla behaves like DynamoDB's
|
||||
on a table need to be valid but are ignored, and ScyllaDB behaves like DynamoDB's
|
||||
`BillingMode=PAY_PER_REQUEST`: All requests are accepted without a per-table
|
||||
throughput cap.
|
||||
|
||||
@@ -33,7 +33,7 @@ Instructions for doing this can be found in:
|
||||
|
||||
## Write isolation policies
|
||||
|
||||
Scylla was designed to optimize the performance of pure write operations -
|
||||
ScyllaDB was designed to optimize the performance of pure write operations -
|
||||
writes which do not need to read the previous value of the item.
|
||||
In CQL, writes which do need the previous value of the item must explicitly
|
||||
use the slower LWT ("LightWeight Transaction") feature to be correctly
|
||||
@@ -79,11 +79,11 @@ a _higher_ timestamp - and this will be the "last write" that wins.
|
||||
To avoid or mitigate this write reordering issue, users may consider
|
||||
one or more of the following:
|
||||
|
||||
1. Use NTP to keep the clocks on the different Scylla nodes synchronized.
|
||||
1. Use NTP to keep the clocks on the different ScyllaDB nodes synchronized.
|
||||
If the delay between the two writes is longer than NTP's accuracy,
|
||||
they will not be reordered.
|
||||
2. If an application wants to ensure that two specific writes are not
|
||||
reordered, it should send both requests to the same Scylla node.
|
||||
reordered, it should send both requests to the same ScyllaDB node.
|
||||
Care should be taken when using a load balancer - which might redirect
|
||||
two requests to two different nodes.
|
||||
3. Consider using the `always_use_lwt` write isolation policy.
|
||||
@@ -210,7 +210,7 @@ CREATE SERVICE_LEVEL IF NOT EXISTS oltp WITH SHARES = 1000;
|
||||
ATTACH SERVICE_LEVEL olap TO alice;
|
||||
ATTACH SERVICE_LEVEL oltp TO bob;
|
||||
```
|
||||
Note that `alternator_enforce_authorization` has to be enabled in Scylla configuration.
|
||||
Note that `alternator_enforce_authorization` has to be enabled in ScyllaDB configuration.
|
||||
|
||||
See [Authorization](##Authorization) section to learn more about roles and authorization.
|
||||
See [Workload Prioritization](../features/workload-prioritization)
|
||||
@@ -218,11 +218,11 @@ to read about Workload Prioritization in detail.
|
||||
|
||||
## Metrics
|
||||
|
||||
Scylla has an advanced and extensive monitoring framework for inspecting
|
||||
and graphing hundreds of different metrics of Scylla's usage and performance.
|
||||
Scylla's monitoring stack, based on Grafana and Prometheus, is described in
|
||||
ScyllaDB has an advanced and extensive monitoring framework for inspecting
|
||||
and graphing hundreds of different metrics of ScyllaDB's usage and performance.
|
||||
ScyllaDB's monitoring stack, based on Grafana and Prometheus, is described in
|
||||
<https://docs.scylladb.com/operating-scylla/monitoring/>.
|
||||
This monitoring stack is different from DynamoDB's offering - but Scylla's
|
||||
This monitoring stack is different from DynamoDB's offering - but ScyllaDB's
|
||||
is significantly more powerful and gives the user better insights on
|
||||
the internals of the database and its performance.
|
||||
|
||||
@@ -248,7 +248,7 @@ data in different partition order. Applications mustn't rely on that
|
||||
undocumented order.
|
||||
|
||||
Note that inside each partition, the individual items will be sorted the same
|
||||
in DynamoDB and Scylla - determined by the _sort key_ defined for that table.
|
||||
in DynamoDB and ScyllaDB - determined by the _sort key_ defined for that table.
|
||||
|
||||
---
|
||||
|
||||
@@ -274,7 +274,7 @@ is different, or can be configured in Alternator:
|
||||
## Experimental API features
|
||||
|
||||
Some DynamoDB API features are supported by Alternator, but considered
|
||||
**experimental** in this release. An experimental feature in Scylla is a
|
||||
**experimental** in this release. An experimental feature in ScyllaDB is a
|
||||
feature whose functionality is complete, or mostly complete, but it is not
|
||||
as thoroughly tested or optimized as regular features. Also, an experimental
|
||||
feature's implementation is still subject to change and upgrades may not be
|
||||
@@ -351,8 +351,8 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
|
||||
* The on-demand backup APIs are not supported: CreateBackup, DescribeBackup,
|
||||
DeleteBackup, ListBackups, RestoreTableFromBackup.
|
||||
For now, users can use Scylla's existing backup solutions such as snapshots
|
||||
or Scylla Manager.
|
||||
For now, users can use ScyllaDB's existing backup solutions such as snapshots
|
||||
or ScyllaDB Manager.
|
||||
<https://github.com/scylladb/scylla/issues/5063>
|
||||
|
||||
* Continuous backup (the ability to restore any point in time) is also not
|
||||
@@ -370,7 +370,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
<https://github.com/scylladb/scylla/issues/5068>
|
||||
|
||||
* DAX (DynamoDB Accelerator), an in-memory cache for DynamoDB, is not
|
||||
available in for Alternator. Anyway, it should not be necessary - Scylla's
|
||||
available in for Alternator. Anyway, it should not be necessary - ScyllaDB's
|
||||
internal cache is already rather advanced and there is no need to place
|
||||
another cache in front of the it. We wrote more about this here:
|
||||
<https://www.scylladb.com/2017/07/31/database-caches-not-good/>
|
||||
@@ -384,7 +384,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
* The PartiQL syntax (SQL-like SELECT/UPDATE/INSERT/DELETE expressions)
|
||||
and the operations ExecuteStatement, BatchExecuteStatement and
|
||||
ExecuteTransaction are not yet supported.
|
||||
A user that is interested in an SQL-like syntax can consider using Scylla's
|
||||
A user that is interested in an SQL-like syntax can consider using ScyllaDB's
|
||||
CQL protocol instead.
|
||||
This feature was added to DynamoDB in November 2020.
|
||||
<https://github.com/scylladb/scylla/issues/8787>
|
||||
@@ -393,7 +393,7 @@ they should be easy to detect. Here is a list of these unimplemented features:
|
||||
which is different from AWS's. In particular, the operations
|
||||
DescribeContributorInsights, ListContributorInsights and
|
||||
UpdateContributorInsights that configure Amazon's "CloudWatch Contributor
|
||||
Insights" are not yet supported. Scylla has different ways to retrieve the
|
||||
Insights" are not yet supported. ScyllaDB has different ways to retrieve the
|
||||
same information, such as which items were accessed most often.
|
||||
<https://github.com/scylladb/scylla/issues/8788>
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ This section will guide you through the steps for setting up the cluster:
|
||||
<https://hub.docker.com/r/scylladb/scylla/>, but add to every `docker run`
|
||||
command a `-p 8000:8000` before the image name and
|
||||
`--alternator-port=8000 --alternator-write-isolation=always` at the end.
|
||||
The "alternator-port" option specifies on which port Scylla will listen for
|
||||
The "alternator-port" option specifies on which port ScyllaDB will listen for
|
||||
the (unencrypted) DynamoDB API, and the "alternator-write-isolation" chooses
|
||||
whether or not Alternator will use LWT for every write.
|
||||
For example,
|
||||
@@ -24,10 +24,10 @@ This section will guide you through the steps for setting up the cluster:
|
||||
By default, ScyllaDB run in this way will not have authentication or
|
||||
authorization enabled, and any DynamoDB API request will be honored without
|
||||
requiring them to be signed appropriately. See the
|
||||
[Scylla Alternator for DynamoDB users](compatibility.md#authentication-and-authorization)
|
||||
[ScyllaDB Alternator for DynamoDB users](compatibility.md#authentication-and-authorization)
|
||||
document on how to configure authentication and authorization.
|
||||
|
||||
## Testing Scylla's DynamoDB API support:
|
||||
## Testing ScyllaDB's DynamoDB API support:
|
||||
### Running AWS Tic Tac Toe demo app to test the cluster:
|
||||
1. Follow the instructions on the [AWS github page](https://github.com/awsdocs/amazon-dynamodb-developer-guide/blob/master/doc_source/TicTacToe.Phase1.md)
|
||||
2. Enjoy your tic-tac-toe game :-)
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
Alternator's primary goal is to be compatible with Amazon DynamoDB(TM)
|
||||
and its APIs, so that any application written to use Amazon DynamoDB could
|
||||
be run, unmodified, against Scylla with Alternator enabled. The extent of
|
||||
be run, unmodified, against ScyllaDB with Alternator enabled. The extent of
|
||||
Alternator's compatibility with DynamoDB is described in the
|
||||
[Scylla Alternator for DynamoDB users](compatibility.md) document.
|
||||
[ScyllaDB Alternator for DynamoDB users](compatibility.md) document.
|
||||
|
||||
But Alternator also adds several features and APIs that are not available in
|
||||
DynamoDB. These Alternator-specific APIs are documented here.
|
||||
@@ -15,7 +15,7 @@ _conditional_ update or an update based on the old value of an attribute.
|
||||
The read and the write should be treated as a single transaction - protected
|
||||
(_isolated_) from other parallel writes to the same item.
|
||||
|
||||
Alternator could do this isolation by using Scylla's LWT (lightweight
|
||||
Alternator could do this isolation by using ScyllaDB's LWT (lightweight
|
||||
transactions) for every write operation, but this significantly slows
|
||||
down writes, and not necessary for workloads which don't use read-modify-write
|
||||
(RMW) updates.
|
||||
@@ -41,7 +41,7 @@ isolation policy for a specific table can be overridden by tagging the table
|
||||
which need a read before the write. An attempt to use such statements
|
||||
(e.g., UpdateItem with a ConditionExpression) will result in an error.
|
||||
In this mode, the remaining write requests which are allowed - pure writes
|
||||
without a read - are performed using standard Scylla writes, not LWT,
|
||||
without a read - are performed using standard ScyllaDB writes, not LWT,
|
||||
so they are significantly faster than they would have been in the
|
||||
`always_use_lwt`, but their isolation is still correct.
|
||||
|
||||
@@ -65,19 +65,19 @@ isolation policy for a specific table can be overridden by tagging the table
|
||||
read-modify-write updates. This mode is not recommended for any use case,
|
||||
and will likely be removed in the future.
|
||||
|
||||
## Accessing system tables from Scylla
|
||||
Scylla exposes lots of useful information via its internal system tables,
|
||||
## Accessing system tables from ScyllaDB
|
||||
ScyllaDB exposes lots of useful information via its internal system tables,
|
||||
which can be found in system keyspaces: 'system', 'system\_auth', etc.
|
||||
In order to access to these tables via alternator interface,
|
||||
Scan and Query requests can use a special table name:
|
||||
`.scylla.alternator.KEYSPACE_NAME.TABLE_NAME`
|
||||
which will return results fetched from corresponding Scylla table.
|
||||
which will return results fetched from corresponding ScyllaDB table.
|
||||
|
||||
This interface can be used only to fetch data from system tables.
|
||||
Attempts to read regular tables via the virtual interface will result
|
||||
in an error.
|
||||
|
||||
Example: in order to query the contents of Scylla's `system.large_rows`,
|
||||
Example: in order to query the contents of ScyllaDB's `system.large_rows`,
|
||||
pass `TableName='.scylla.alternator.system.large_rows'` to a Query/Scan
|
||||
request.
|
||||
|
||||
@@ -113,14 +113,14 @@ connection (either active or idle), not necessarily an active request as
|
||||
in Alternator.
|
||||
|
||||
## Service discovery
|
||||
As explained in [Scylla Alternator for DynamoDB users](compatibility.md),
|
||||
As explained in [ScyllaDB Alternator for DynamoDB users](compatibility.md),
|
||||
Alternator requires a load-balancer or a client-side load-balancing library
|
||||
to distribute requests between all Scylla nodes. This load-balancer needs
|
||||
to be able to _discover_ the Scylla nodes. Alternator provides two special
|
||||
to distribute requests between all ScyllaDB nodes. This load-balancer needs
|
||||
to be able to _discover_ the ScyllaDB nodes. Alternator provides two special
|
||||
requests, `/` and `/localnodes`, to help with this service discovery, which
|
||||
we will now explain.
|
||||
|
||||
Some setups know exactly which Scylla nodes were brought up, so all that
|
||||
Some setups know exactly which ScyllaDB nodes were brought up, so all that
|
||||
remains is to periodically verify that each node is still functional. The
|
||||
easiest way to do this is to make an HTTP (or HTTPS) GET request to the node,
|
||||
with URL `/`. This is a trivial GET request and does **not** need to be
|
||||
@@ -133,10 +133,10 @@ $ curl http://localhost:8000/
|
||||
healthy: localhost:8000
|
||||
```
|
||||
|
||||
In other setups, the load balancer might not know which Scylla nodes exist.
|
||||
For example, it may be possible to add or remove Scylla nodes without a
|
||||
In other setups, the load balancer might not know which ScyllaDB nodes exist.
|
||||
For example, it may be possible to add or remove ScyllaDB nodes without a
|
||||
client-side load balancer knowing. For these setups we have the `/localnodes`
|
||||
request that can be used to discover which Scylla nodes exist: A load balancer
|
||||
request that can be used to discover which ScyllaDB nodes exist: A load balancer
|
||||
that already knows at least one live node can discover the rest by sending
|
||||
a `/localnodes` request to the known node. It's again an unauthenticated
|
||||
HTTP (or HTTPS) GET request:
|
||||
@@ -160,7 +160,7 @@ list the nodes in a specific _data center_ or _rack_. These options are
|
||||
useful for certain use cases:
|
||||
|
||||
* A `dc` option (e.g., `/localnodes?dc=dc1`) can be passed to list the
|
||||
nodes in a specific Scylla data center, not the data center of the node
|
||||
nodes in a specific ScyllaDB data center, not the data center of the node
|
||||
being contacted. This is useful when a client knowns of _some_ Scylla
|
||||
node belonging to an unknown DC, but wants to list the nodes in _its_
|
||||
DC, which it knows by name.
|
||||
@@ -191,7 +191,7 @@ tells them to.
|
||||
|
||||
If you want to influence whether a specific Alternator table is created with tablets or vnodes,
|
||||
you can do this by specifying the `system:initial_tablets` tag
|
||||
(in earlier versions of Scylla the tag was `experimental:initial_tablets`)
|
||||
(in earlier versions of ScyllaDB the tag was `experimental:initial_tablets`)
|
||||
in the CreateTable operation. The value of this tag can be:
|
||||
|
||||
* Any valid integer as the value of this tag enables tablets.
|
||||
|
||||
@@ -1043,6 +1043,8 @@ The following modes are available:
|
||||
* - ``immediate``
|
||||
- Tombstone GC is immediately performed. There is no wait time or repair requirement. This mode is useful for a table that uses the TWCS compaction strategy with no user deletes. After data is expired after TTL, ScyllaDB can perform compaction to drop the expired data immediately.
|
||||
|
||||
.. warning:: The ``repair`` mode is not supported for :term:`Colocated Tables <Colocated Table>` in this version.
|
||||
|
||||
.. _cql-per-table-tablet-options:
|
||||
|
||||
Per-table tablet options
|
||||
|
||||
@@ -102,6 +102,7 @@ Additional Information
|
||||
|
||||
To learn more about TTL, and see a hands-on example, check out `this lesson <https://university.scylladb.com/courses/data-modeling/lessons/advanced-data-modeling/topic/expiring-data-with-ttl-time-to-live/>`_ on ScyllaDB University.
|
||||
|
||||
* `Video: Managing data expiration with Time-To-Live <https://www.youtube.com/watch?v=SXkbu7mFHeA>`_
|
||||
* :doc:`Apache Cassandra Query Language (CQL) Reference </cql/index>`
|
||||
* :doc:`KB Article:How to Change gc_grace_seconds for a Table </kb/gc-grace-seconds/>`
|
||||
* :doc:`KB Article:Time to Live (TTL) and Compaction </kb/ttl-facts/>`
|
||||
|
||||
@@ -236,3 +236,26 @@ the same mechanism for other protocol versions, such as CQLv4.
|
||||
|
||||
The feature is identified by the `SCYLLA_USE_METADATA_ID` key, which is meant to be sent
|
||||
in the SUPPORTED message.
|
||||
|
||||
## Sending the CLIENT_ROUTES_CHANGE event
|
||||
|
||||
This extension allows a driver to update its connections when the
|
||||
`system.client_routes` table is modified.
|
||||
|
||||
In some network topologies a specific mapping of addresses and ports is required (e.g.
|
||||
to support Private Link). This mapping can change dynamically even when no nodes are
|
||||
added or removed. The driver must adapt to those changes; otherwise connectivity can be
|
||||
lost.
|
||||
|
||||
The extension is implemented as a new `EVENT` type: `CLIENT_ROUTES_CHANGE`. The event
|
||||
body consists of:
|
||||
- [string] change
|
||||
- [string list] connection_ids
|
||||
- [string list] host_ids
|
||||
|
||||
There is only one change value: `UPDATE_NODES`, which means at least one client route
|
||||
was inserted, updated, or deleted.
|
||||
|
||||
Events already have a subscription mechanism similar to protocol extensions (that is,
|
||||
the driver only receives the events it explicitly subscribed to), so no additional
|
||||
`cql_protocol_extension` key is introduced for this feature.
|
||||
|
||||
@@ -86,6 +86,7 @@ stateDiagram-v2
|
||||
de_left_token_ring --> [*]
|
||||
}
|
||||
state removing {
|
||||
re_left_token_ring : left_token_ring
|
||||
re_tablet_draining : tablet_draining
|
||||
re_tablet_migration : tablet_migration
|
||||
re_write_both_read_old : write_both_read_old
|
||||
@@ -98,7 +99,8 @@ stateDiagram-v2
|
||||
re_tablet_draining --> re_write_both_read_old
|
||||
re_write_both_read_old --> re_write_both_read_new: streaming completed
|
||||
re_write_both_read_old --> re_rollback_to_normal: rollback
|
||||
re_write_both_read_new --> [*]
|
||||
re_write_both_read_new --> re_left_token_ring
|
||||
re_left_token_ring --> [*]
|
||||
}
|
||||
rebuilding --> normal: streaming completed
|
||||
decommissioning --> left: operation succeeded
|
||||
@@ -122,9 +124,10 @@ Note that these are not all states, as there are other states specific to tablet
|
||||
Writes to vnodes-based tables are going to both new and old replicas (new replicas means calculated according
|
||||
to modified token ring), reads are using old replicas.
|
||||
- `write_both_read_new` - as above, but reads are using new replicas.
|
||||
- `left_token_ring` - the decommissioning node left the token ring, but we still need to wait until other
|
||||
nodes observe it and stop sending writes to this node. Then, we tell the node to shut down and remove
|
||||
it from group 0. We also use this state to rollback a failed bootstrap or decommission.
|
||||
- `left_token_ring` - the decommissioning or removing node left the token ring, but we still need to wait until other
|
||||
nodes observe it and stop sending writes to this node. For decommission, we tell the node to shut down,
|
||||
then remove it from group 0. For removenode, the node is already down, so we skip the shutdown step.
|
||||
We also use this state to rollback a failed bootstrap or decommission.
|
||||
- `rollback_to_normal` - the decommission or removenode operation failed. Rollback the operation by
|
||||
moving the node we tried to decommission/remove back to the normal state.
|
||||
- `lock` - the topology stays in this state until externally changed (to null state), preventing topology
|
||||
@@ -141,7 +144,9 @@ reads that started before this point exist in the system. Finally we remove the
|
||||
transitioning state.
|
||||
|
||||
Decommission, removenode and replace work similarly, except they don't go through
|
||||
`commit_cdc_generation`.
|
||||
`commit_cdc_generation`. Both decommission and removenode go through the
|
||||
`left_token_ring` state to run a global barrier ensuring all nodes are aware
|
||||
of the topology change before the operation completes.
|
||||
|
||||
The state machine may also go only through the `commit_cdc_generation` state
|
||||
after getting a request from the user to create a new CDC generation if the
|
||||
|
||||
@@ -25,8 +25,7 @@ Getting Started
|
||||
:id: "getting-started"
|
||||
:class: my-panel
|
||||
|
||||
* `Install ScyllaDB (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/#core>`_ - Links to the ScyllaDB Download Center
|
||||
|
||||
* :doc:`Install ScyllaDB </getting-started/install-scylla/index/>`
|
||||
* :doc:`Configure ScyllaDB </getting-started/system-configuration/>`
|
||||
* :doc:`Run ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`
|
||||
* :doc:`Create a ScyllaDB Cluster - Single Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster/>`
|
||||
|
||||
@@ -3,8 +3,7 @@
|
||||
ScyllaDB Housekeeping and how to disable it
|
||||
============================================
|
||||
|
||||
It is always recommended to run the latest version of ScyllaDB.
|
||||
The latest stable release version is always available from the `Download Center <https://www.scylladb.com/download/>`_.
|
||||
It is always recommended to run the latest stable version of ScyllaDB.
|
||||
|
||||
When you install ScyllaDB, it installs by default two services: **scylla-housekeeping-restart** and **scylla-housekeeping-daily**. These services check for the latest ScyllaDB version and prompt the user if they are using a version that is older than what is publicly available.
|
||||
Information about your ScyllaDB deployment, including the ScyllaDB version currently used, as well as unique user and server identifiers, are collected by a centralized service.
|
||||
|
||||
@@ -9,6 +9,8 @@ Running ``cluster repair`` on a **single node** synchronizes all data on all nod
|
||||
To synchronize all data in clusters that have both tablets-based and vnodes-based keyspaces, run :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair/>` on **all**
|
||||
of the nodes in the cluster, and :doc:`nodetool cluster repair </operating-scylla/nodetool-commands/cluster/repair/>` on **any** of the nodes in the cluster.
|
||||
|
||||
.. warning:: :term:`Colocated Tables <Colocated Table>` cannot be synchronized using cluster repair in this version.
|
||||
|
||||
To check if a keyspace enables tablets, use:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
95
docs/poetry.lock
generated
95
docs/poetry.lock
generated
@@ -2,36 +2,35 @@
|
||||
|
||||
[[package]]
|
||||
name = "alabaster"
|
||||
version = "0.7.16"
|
||||
version = "1.0.0"
|
||||
description = "A light, configurable Sphinx theme"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92"},
|
||||
{file = "alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65"},
|
||||
{file = "alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b"},
|
||||
{file = "alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyio"
|
||||
version = "4.11.0"
|
||||
version = "4.12.0"
|
||||
description = "High-level concurrency and networking framework on top of asyncio or Trio"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc"},
|
||||
{file = "anyio-4.11.0.tar.gz", hash = "sha256:82a8d0b81e318cc5ce71a5f1f8b5c4e63619620b63141ef8c995fa0db95a57c4"},
|
||||
{file = "anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb"},
|
||||
{file = "anyio-4.12.0.tar.gz", hash = "sha256:73c693b567b0c55130c104d0b43a9baf3aa6a31fc6110116509f27bf75e21ec0"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
|
||||
idna = ">=2.8"
|
||||
sniffio = ">=1.1"
|
||||
typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""}
|
||||
|
||||
[package.extras]
|
||||
trio = ["trio (>=0.31.0)"]
|
||||
trio = ["trio (>=0.31.0) ; python_version < \"3.10\"", "trio (>=0.32.0) ; python_version >= \"3.10\""]
|
||||
|
||||
[[package]]
|
||||
name = "babel"
|
||||
@@ -50,14 +49,14 @@ dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)"
|
||||
|
||||
[[package]]
|
||||
name = "beartype"
|
||||
version = "0.22.6"
|
||||
version = "0.22.8"
|
||||
description = "Unbearably fast near-real-time pure-Python runtime-static type-checker."
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "beartype-0.22.6-py3-none-any.whl", hash = "sha256:0584bc46a2ea2a871509679278cda992eadde676c01356ab0ac77421f3c9a093"},
|
||||
{file = "beartype-0.22.6.tar.gz", hash = "sha256:97fbda69c20b48c5780ac2ca60ce3c1bb9af29b3a1a0216898ffabdd523e48f4"},
|
||||
{file = "beartype-0.22.8-py3-none-any.whl", hash = "sha256:b832882d04e41a4097bab9f63e6992bc6de58c414ee84cba9b45b67314f5ab2e"},
|
||||
{file = "beartype-0.22.8.tar.gz", hash = "sha256:b19b21c9359722ee3f7cc433f063b3e13997b27ae8226551ea5062e621f61165"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
@@ -70,18 +69,18 @@ test-tox-coverage = ["coverage (>=5.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.14.2"
|
||||
version = "4.14.3"
|
||||
description = "Screen-scraping library"
|
||||
optional = false
|
||||
python-versions = ">=3.7.0"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515"},
|
||||
{file = "beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e"},
|
||||
{file = "beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb"},
|
||||
{file = "beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
soupsieve = ">1.2"
|
||||
soupsieve = ">=1.6.1"
|
||||
typing-extensions = ">=4.0.0"
|
||||
|
||||
[package.extras]
|
||||
@@ -802,18 +801,6 @@ files = [
|
||||
{file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sniffio"
|
||||
version = "1.3.1"
|
||||
description = "Sniff out which async library your code is running under"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
|
||||
{file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "snowballstemmer"
|
||||
version = "3.0.1"
|
||||
@@ -840,18 +827,18 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "sphinx"
|
||||
version = "7.4.7"
|
||||
version = "8.1.3"
|
||||
description = "Python documentation generator"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sphinx-7.4.7-py3-none-any.whl", hash = "sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239"},
|
||||
{file = "sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe"},
|
||||
{file = "sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2"},
|
||||
{file = "sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
alabaster = ">=0.7.14,<0.8.0"
|
||||
alabaster = ">=0.7.14"
|
||||
babel = ">=2.13"
|
||||
colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\""}
|
||||
docutils = ">=0.20,<0.22"
|
||||
@@ -861,17 +848,17 @@ packaging = ">=23.0"
|
||||
Pygments = ">=2.17"
|
||||
requests = ">=2.30.0"
|
||||
snowballstemmer = ">=2.2"
|
||||
sphinxcontrib-applehelp = "*"
|
||||
sphinxcontrib-devhelp = "*"
|
||||
sphinxcontrib-htmlhelp = ">=2.0.0"
|
||||
sphinxcontrib-jsmath = "*"
|
||||
sphinxcontrib-qthelp = "*"
|
||||
sphinxcontrib-applehelp = ">=1.0.7"
|
||||
sphinxcontrib-devhelp = ">=1.0.6"
|
||||
sphinxcontrib-htmlhelp = ">=2.0.6"
|
||||
sphinxcontrib-jsmath = ">=1.0.1"
|
||||
sphinxcontrib-qthelp = ">=1.0.6"
|
||||
sphinxcontrib-serializinghtml = ">=1.1.9"
|
||||
tomli = {version = ">=2", markers = "python_version < \"3.11\""}
|
||||
|
||||
[package.extras]
|
||||
docs = ["sphinxcontrib-websupport"]
|
||||
lint = ["flake8 (>=6.0)", "importlib-metadata (>=6.0)", "mypy (==1.10.1)", "pytest (>=6.0)", "ruff (==0.5.2)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-docutils (==0.21.0.20240711)", "types-requests (>=2.30.0)"]
|
||||
lint = ["flake8 (>=6.0)", "mypy (==1.11.1)", "pyright (==1.1.384)", "pytest (>=6.0)", "ruff (==0.6.9)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-Pillow (==10.2.0.20240822)", "types-Pygments (==2.18.0.20240506)", "types-colorama (==0.4.15.20240311)", "types-defusedxml (==0.7.0.20240218)", "types-docutils (==0.21.0.20241005)", "types-requests (==2.32.0.20240914)", "types-urllib3 (==1.26.25.14)"]
|
||||
test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=8.0)", "setuptools (>=70.0)", "typing_extensions (>=4.9)"]
|
||||
|
||||
[[package]]
|
||||
@@ -1001,13 +988,14 @@ test = ["tox"]
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-scylladb-markdown"
|
||||
version = "0.1.3"
|
||||
version = "0.1.4"
|
||||
description = "Sphinx extension for ScyllaDB documentation with enhanced Markdown support through MystParser and recommonmark."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sphinx_scylladb_markdown-0.1.3-py3-none-any.whl", hash = "sha256:f20160b4aadf4c8cf95637f0a544121954b792914ab6ec05b67cae75e20a5566"},
|
||||
{file = "sphinx_scylladb_markdown-0.1.4-py3-none-any.whl", hash = "sha256:598753e01cf159d4698eb1a707958828446e21749038d3d42c5b9c7e86eda6e4"},
|
||||
{file = "sphinx_scylladb_markdown-0.1.4.tar.gz", hash = "sha256:9db3ae0dcf7c3519262da65e48c7f9e4db0ad1ce9c5f874864ea218f4cbc4c68"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -1059,24 +1047,25 @@ dev = ["build", "flake8", "pre-commit", "pytest", "sphinx", "sphinx-last-updated
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-substitution-extensions"
|
||||
version = "2025.1.2"
|
||||
version = "2025.11.17"
|
||||
description = "Extensions for Sphinx which allow for substitutions."
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sphinx_substitution_extensions-2025.1.2-py2.py3-none-any.whl", hash = "sha256:ff14f40e4393bd7434a196badb8d47983355d9755af884b902e3023fb456b958"},
|
||||
{file = "sphinx_substitution_extensions-2025.1.2.tar.gz", hash = "sha256:53b8d394d5098a09aef36bc687fa310aeb28466319d2c750e996e46400fb2474"},
|
||||
{file = "sphinx_substitution_extensions-2025.11.17-py2.py3-none-any.whl", hash = "sha256:ac18455bdc8324b337b0fe7498c1c0d0b1cb65c74d131459be4dea9edb6abbef"},
|
||||
{file = "sphinx_substitution_extensions-2025.11.17.tar.gz", hash = "sha256:aae17f8db9efc3d454a304373ae3df763f8739e05e0b98d5381db46f6d250b27"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
beartype = ">=0.18.5"
|
||||
docutils = ">=0.19"
|
||||
sphinx = ">=7.3.5"
|
||||
myst-parser = ">=4.0.0"
|
||||
sphinx = ">=8.1.0"
|
||||
|
||||
[package.extras]
|
||||
dev = ["actionlint-py (==1.7.5.21)", "check-manifest (==0.50)", "deptry (==0.21.2)", "doc8 (==1.1.2)", "doccmd (==2024.12.26)", "docformatter (==1.7.5)", "interrogate (==1.7.0)", "mypy-strict-kwargs (==2024.12.25)", "mypy[faster-cache] (==1.14.1)", "myst-parser (==4.0.0)", "pre-commit (==4.0.1)", "pyenchant (==3.3.0rc1)", "pylint (==3.3.3)", "pyproject-fmt (==2.5.0)", "pyright (==1.1.391)", "pyroma (==4.2)", "pytest (==8.3.4)", "pytest-cov (==6.0.0)", "ruff (==0.8.4)", "shellcheck-py (==0.10.0.1)", "shfmt-py (==3.7.0.1)", "sphinx-toolbox (==3.8.1)", "sphinx[test] (==8.1.3)", "types-docutils (==0.21.0.20241128)", "vulture (==2.14)", "yamlfix (==1.17.0)"]
|
||||
release = ["check-wheel-contents (==0.6.1)"]
|
||||
dev = ["actionlint-py (==1.7.8.24)", "check-manifest (==0.51)", "deptry (==0.24.0)", "doc8 (==2.0.0)", "doccmd (==2025.11.8.1)", "docformatter (==1.7.7)", "interrogate (==1.7.0)", "mypy-strict-kwargs (==2025.4.3)", "mypy[faster-cache] (==1.18.2)", "pre-commit (==4.4.0)", "pylint[spelling] (==4.0.3)", "pyproject-fmt (==2.11.1)", "pyright (==1.1.407)", "pyroma (==5.0)", "pytest (==9.0.1)", "pytest-cov (==7.0.0)", "ruff (==0.14.5)", "shellcheck-py (==0.11.0.1)", "shfmt-py (==3.12.0.2)", "sphinx-lint (==1.0.1)", "sphinx-toolbox (==4.0.0)", "types-docutils (==0.22.2.20251006)", "vulture (==2.14)", "yamlfix (==1.19.0)"]
|
||||
release = ["check-wheel-contents (==0.6.3)"]
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-tabs"
|
||||
@@ -1363,21 +1352,21 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "2.5.0"
|
||||
version = "2.6.2"
|
||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"},
|
||||
{file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"},
|
||||
{file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"},
|
||||
{file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""]
|
||||
brotli = ["brotli (>=1.2.0) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=1.2.0.0) ; platform_python_implementation != \"CPython\""]
|
||||
h2 = ["h2 (>=4,<5)"]
|
||||
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
|
||||
zstd = ["zstandard (>=0.18.0)"]
|
||||
zstd = ["backports-zstd (>=1.0.0) ; python_version < \"3.14\""]
|
||||
|
||||
[[package]]
|
||||
name = "uvicorn"
|
||||
@@ -1603,4 +1592,4 @@ files = [
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "0ae673106f45d3465cbdabbf511e165ca44feadd34d7753f2e68093afaa95c79"
|
||||
content-hash = "9a17caa38b3c88f3fe3d1a60fdb73a96aa12ff1e30ecb00e2f9249e7ba9f859c"
|
||||
|
||||
@@ -12,10 +12,10 @@ redirects_cli ="^0.1.3"
|
||||
sphinx-scylladb-theme = "^1.8.10"
|
||||
sphinx-sitemap = "^2.6.0"
|
||||
sphinx-autobuild = "^2024.4.19"
|
||||
Sphinx = "^7.3.7"
|
||||
Sphinx = "^8.0.0"
|
||||
sphinx-multiversion-scylla = "^0.3.4"
|
||||
sphinxcontrib-datatemplates = "^0.9.2"
|
||||
sphinx-scylladb-markdown = "^0.1.2"
|
||||
sphinx-scylladb-markdown = "^0.1.4"
|
||||
sphinx_collapse ="^0.1.3"
|
||||
|
||||
[build-system]
|
||||
|
||||
@@ -202,3 +202,7 @@ Glossary
|
||||
The name comes from two basic operations, multiply (MU) and rotate (R), used in its inner loop.
|
||||
The MurmurHash3 version used in ScyllaDB originated from `Apache Cassandra <https://commons.apache.org/proper/commons-codec/apidocs/org/apache/commons/codec/digest/MurmurHash3.html>`_, and is **not** identical to the `official MurmurHash3 calculation <https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/MurmurHash.java#L31-L33>`_. More `here <https://github.com/russss/murmur3-cassandra>`_.
|
||||
|
||||
Colocated Table
|
||||
An internal table of a special type in a :doc:`tablets </architecture/tablets>` enabled keyspace that is colocated with another base table, meaning it always has the same tablet replicas as the base table.
|
||||
Current types of colocated tables include CDC log tables, local indexes, and materialized views that have the same partition key as their base table.
|
||||
|
||||
|
||||
@@ -816,7 +816,6 @@ public:
|
||||
future<data_sink> wrap_sink(const sstables::sstable& sst, sstables::component_type type, data_sink sink) override {
|
||||
switch (type) {
|
||||
case sstables::component_type::Scylla:
|
||||
case sstables::component_type::TemporaryScylla:
|
||||
case sstables::component_type::TemporaryTOC:
|
||||
case sstables::component_type::TOC:
|
||||
co_return sink;
|
||||
@@ -845,7 +844,6 @@ public:
|
||||
sstables::component_type type,
|
||||
data_source src) override {
|
||||
switch (type) {
|
||||
case sstables::component_type::TemporaryScylla:
|
||||
case sstables::component_type::Scylla:
|
||||
case sstables::component_type::TemporaryTOC:
|
||||
case sstables::component_type::TOC:
|
||||
|
||||
@@ -176,6 +176,8 @@ public:
|
||||
gms::feature rack_list_rf { *this, "RACK_LIST_RF"sv };
|
||||
gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
|
||||
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
|
||||
gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
|
||||
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
|
||||
public:
|
||||
|
||||
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
|
||||
|
||||
15
main.cc
15
main.cc
@@ -23,6 +23,7 @@
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/signal.hh>
|
||||
#include <seastar/core/timer.hh>
|
||||
#include "service/client_routes.hh"
|
||||
#include "service/qos/raft_service_level_distributed_data_accessor.hh"
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "tasks/task_manager.hh"
|
||||
@@ -1795,6 +1796,13 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
auth_cache.stop().get();
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "initializing client routes service");
|
||||
static sharded<service::client_routes_service> client_routes;
|
||||
client_routes.start(std::ref(stop_signal.as_sharded_abort_source()), std::ref(feature_service), std::ref(group0_client), std::ref(qp), std::ref(lifecycle_notifier)).get();
|
||||
auto stop_client_routes = defer_verbose_shutdown("client_routes", [&] {
|
||||
client_routes.stop().get();
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "initializing storage service");
|
||||
debug::the_storage_service = &ss;
|
||||
ss.start(std::ref(stop_signal.as_sharded_abort_source()),
|
||||
@@ -1803,7 +1811,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
std::ref(messaging), std::ref(repair),
|
||||
std::ref(stream_manager), std::ref(lifecycle_notifier), std::ref(bm), std::ref(snitch),
|
||||
std::ref(tablet_allocator), std::ref(cdc_generation_service), std::ref(view_builder), std::ref(view_building_worker), std::ref(qp), std::ref(sl_controller),
|
||||
std::ref(auth_cache),
|
||||
std::ref(auth_cache), std::ref(client_routes),
|
||||
std::ref(tsm), std::ref(vbsm), std::ref(task_manager), std::ref(gossip_address_map),
|
||||
compression_dict_updated_callback,
|
||||
only_on_shard0(&*disk_space_monitor_shard0)
|
||||
@@ -2191,6 +2199,11 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
});
|
||||
}).get();
|
||||
|
||||
api::set_server_client_routes(ctx, client_routes).get();
|
||||
auto stop_cr_api = defer_verbose_shutdown("client routes API", [&ctx] {
|
||||
api::unset_server_client_routes(ctx).get();
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "join cluster");
|
||||
// Allow abort during join_cluster since bootstrap or replace
|
||||
// can take a long time.
|
||||
|
||||
@@ -56,33 +56,16 @@ static tasks::task_manager::task_state get_state(const db::system_keyspace::topo
|
||||
}
|
||||
}
|
||||
|
||||
static std::set<tasks::task_id> get_pending_ids(service::topology& topology) {
|
||||
std::set<tasks::task_id> ids;
|
||||
for (auto& request : topology.requests) {
|
||||
ids.emplace(topology.find(request.first)->second.request_id);
|
||||
}
|
||||
return ids;
|
||||
static future<db::system_keyspace::topology_requests_entries> get_entries(db::system_keyspace& sys_ks, std::chrono::seconds ttl) {
|
||||
return sys_ks.get_node_ops_request_entries(db_clock::now() - ttl);
|
||||
}
|
||||
|
||||
static future<db::system_keyspace::topology_requests_entries> get_entries(db::system_keyspace& sys_ks, service::topology& topology, std::chrono::seconds ttl) {
|
||||
// Started requests.
|
||||
auto entries = co_await sys_ks.get_node_ops_request_entries(db_clock::now() - ttl);
|
||||
|
||||
// Pending requests.
|
||||
for (auto& id : get_pending_ids(topology)) {
|
||||
entries.try_emplace(id.uuid(), db::system_keyspace::topology_requests_entry{});
|
||||
}
|
||||
|
||||
co_return entries;
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status_helper(tasks::task_id id, tasks::virtual_task_hint hint) const {
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry(id.uuid(), false);
|
||||
auto started = entry.id;
|
||||
service::topology& topology = _ss._topology_state_machine._topology;
|
||||
if (!started && !get_pending_ids(topology).contains(id)) {
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry_opt = co_await _ss._sys_ks.local().get_topology_request_entry_opt(id.uuid());
|
||||
if (!entry_opt) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
auto& entry = *entry_opt;
|
||||
co_return tasks::task_status{
|
||||
.task_id = id,
|
||||
.type = request_type_to_task_type(entry.request_type),
|
||||
@@ -101,7 +84,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status_help
|
||||
.entity = "",
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = started ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{}
|
||||
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
|
||||
};
|
||||
}
|
||||
|
||||
@@ -123,26 +106,22 @@ future<std::optional<tasks::virtual_task_hint>> node_ops_virtual_task::contains(
|
||||
}
|
||||
}
|
||||
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry(task_id.uuid(), false);
|
||||
co_return bool(entry.id) && std::holds_alternative<service::topology_request>(entry.request_type) ? empty_hint : std::nullopt;
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(task_id.uuid());
|
||||
co_return entry && std::holds_alternative<service::topology_request>(entry->request_type) ? empty_hint : std::nullopt;
|
||||
}
|
||||
|
||||
future<tasks::is_abortable> node_ops_virtual_task::is_abortable(tasks::virtual_task_hint) const {
|
||||
return make_ready_future<tasks::is_abortable>(tasks::is_abortable::no);
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
return get_status_helper(id, std::move(hint));
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> node_ops_virtual_task::wait(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry = co_await get_status_helper(id, hint);
|
||||
auto entry = co_await get_status(id, hint);
|
||||
if (!entry) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
co_await _ss.wait_for_topology_request_completion(id.uuid(), false);
|
||||
co_return co_await get_status_helper(id, std::move(hint));
|
||||
co_return co_await get_status(id, std::move(hint));
|
||||
}
|
||||
|
||||
future<> node_ops_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hint) noexcept {
|
||||
@@ -151,8 +130,7 @@ future<> node_ops_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hin
|
||||
|
||||
future<std::vector<tasks::task_stats>> node_ops_virtual_task::get_stats() {
|
||||
db::system_keyspace& sys_ks = _ss._sys_ks.local();
|
||||
service::topology& topology = _ss._topology_state_machine._topology;
|
||||
co_return std::ranges::to<std::vector<tasks::task_stats>>(co_await get_entries(sys_ks, topology, get_task_manager().get_user_task_ttl())
|
||||
co_return std::ranges::to<std::vector<tasks::task_stats>>(co_await get_entries(sys_ks, get_task_manager().get_user_task_ttl())
|
||||
| std::views::transform([] (const auto& e) {
|
||||
auto id = e.first;
|
||||
auto& entry = e.second;
|
||||
|
||||
@@ -39,8 +39,6 @@ public:
|
||||
virtual future<std::optional<tasks::task_status>> wait(tasks::task_id id, tasks::virtual_task_hint hint) override;
|
||||
virtual future<> abort(tasks::task_id id, tasks::virtual_task_hint hint) noexcept override;
|
||||
virtual future<std::vector<tasks::task_stats>> get_stats() override;
|
||||
private:
|
||||
future<std::optional<tasks::task_status>> get_status_helper(tasks::task_id id, tasks::virtual_task_hint hint) const;
|
||||
};
|
||||
|
||||
class streaming_task_impl : public tasks::task_manager::task::impl {
|
||||
|
||||
@@ -2793,6 +2793,7 @@ future<> database::flush_all_tables() {
|
||||
});
|
||||
_all_tables_flushed_at = db_clock::now();
|
||||
co_await _commitlog->wait_for_pending_deletes();
|
||||
dblog.info("Forcing new commitlog segment and flushing all tables complete");
|
||||
}
|
||||
|
||||
future<db_clock::time_point> database::get_all_tables_flushed_at(sharded<database>& sharded_db) {
|
||||
|
||||
@@ -593,7 +593,7 @@ private:
|
||||
v3_columns _v3_columns;
|
||||
mutable schema_registry_entry* _registry_entry = nullptr;
|
||||
std::unique_ptr<::view_info> _view_info;
|
||||
schema_ptr _cdc_schema;
|
||||
mutable schema_ptr _cdc_schema;
|
||||
|
||||
const std::array<column_count_type, 3> _offsets;
|
||||
|
||||
@@ -957,6 +957,7 @@ public:
|
||||
friend bool operator==(const schema&, const schema&);
|
||||
const column_mapping& get_column_mapping() const;
|
||||
friend class schema_registry_entry;
|
||||
friend class schema_registry;
|
||||
// May be called from different shard
|
||||
schema_registry_entry* registry_entry() const noexcept;
|
||||
// Returns true iff this schema version was synced with on current node.
|
||||
|
||||
@@ -78,10 +78,8 @@ void schema_registry::attach_table(schema_registry_entry& e) noexcept {
|
||||
}
|
||||
|
||||
schema_ptr schema_registry::learn(schema_ptr s) {
|
||||
auto learned_cdc_schema = s->cdc_schema() ? local_schema_registry().learn(s->cdc_schema()) : nullptr;
|
||||
if (learned_cdc_schema != s->cdc_schema()) {
|
||||
s = s->make_with_cdc(learned_cdc_schema);
|
||||
}
|
||||
auto learned_cdc_schema = s->cdc_schema() ? learn(s->cdc_schema()) : nullptr;
|
||||
s->_cdc_schema = learned_cdc_schema;
|
||||
if (s->registry_entry()) {
|
||||
return s;
|
||||
}
|
||||
@@ -92,7 +90,9 @@ schema_ptr schema_registry::learn(schema_ptr s) {
|
||||
e.load(s);
|
||||
attach_table(e);
|
||||
}
|
||||
return e.get_schema();
|
||||
auto loaded_s = e.get_schema();
|
||||
loaded_s->_cdc_schema = learned_cdc_schema;
|
||||
return loaded_s;
|
||||
}
|
||||
slogger.debug("Learning about version {} of {}.{}", s->version(), s->ks_name(), s->cf_name());
|
||||
auto e_ptr = make_lw_shared<schema_registry_entry>(s->version(), *this);
|
||||
|
||||
@@ -3,6 +3,7 @@ target_sources(service
|
||||
PRIVATE
|
||||
broadcast_tables/experimental/lang.cc
|
||||
client_state.cc
|
||||
client_routes.cc
|
||||
mapreduce_service.cc
|
||||
migration_manager.cc
|
||||
misc_services.cc
|
||||
|
||||
137
service/client_routes.cc
Normal file
137
service/client_routes.cc
Normal file
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "service/client_routes.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
#include "mutation/mutation.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
|
||||
static logging::logger crlogger("client_routes");
|
||||
|
||||
service::query_state& client_routes_query_state() {
|
||||
using namespace std::chrono_literals;
|
||||
const auto t = 10s;
|
||||
static timeout_config tc{ t, t, t, t, t, t, t };
|
||||
static thread_local service::client_state cs(service::client_state::internal_tag{}, tc);
|
||||
static thread_local service::query_state qs(cs, empty_service_permit());
|
||||
return qs;
|
||||
};
|
||||
|
||||
future<mutation> service::client_routes_service::make_remove_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_key& key) {
|
||||
static const sstring stmt = format("DELETE FROM {}.{} WHERE connection_id = ? and host_id = ?", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
|
||||
|
||||
auto muts = co_await _qp.get_mutations_internal(stmt, client_routes_query_state(), ts, {key.connection_id, key.host_id});
|
||||
if (muts.size() != 1) {
|
||||
on_internal_error(crlogger, fmt::format("expected 1 mutation got {}", muts.size()));
|
||||
}
|
||||
co_return std::move(muts[0]);
|
||||
}
|
||||
|
||||
future<mutation> service::client_routes_service::make_update_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_entry& route) {
|
||||
static const sstring stmt = format("INSERT INTO {}.{} (connection_id, host_id, address, port, tls_port, alternator_port, alternator_https_port) VALUES (?, ?, ?, ?, ?, ?, ?)", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
|
||||
|
||||
auto muts = co_await _qp.get_mutations_internal(stmt, client_routes_query_state(), ts, {
|
||||
route.connection_id,
|
||||
route.host_id,
|
||||
route.address,
|
||||
route.port,
|
||||
route.tls_port,
|
||||
route.alternator_port,
|
||||
route.alternator_https_port
|
||||
});
|
||||
if (muts.size() != 1) {
|
||||
on_internal_error(crlogger, fmt::format("expected 1 mutation got {}", muts.size()));
|
||||
}
|
||||
co_return std::move(muts[0]);
|
||||
}
|
||||
|
||||
future<std::vector<service::client_routes_service::client_route_entry>> service::client_routes_service::get_client_routes() const {
|
||||
std::vector<service::client_routes_service::client_route_entry> result;
|
||||
static const sstring query = format("SELECT * from {}.{}", db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES);
|
||||
auto rs = co_await _qp.execute_internal(query, cql3::query_processor::cache_internal::yes);
|
||||
result.reserve(rs->size());
|
||||
for (const auto& row : *rs) {
|
||||
result.emplace_back(
|
||||
row.get_as<sstring>("connection_id"),
|
||||
row.get_as<utils::UUID>("host_id"),
|
||||
row.get_as<sstring>("address"),
|
||||
row.get_opt<int32_t>("port"),
|
||||
row.get_opt<int32_t>("tls_port"),
|
||||
row.get_opt<int32_t>("alternator_port"),
|
||||
row.get_opt<int32_t>("alternator_https_port")
|
||||
);
|
||||
}
|
||||
co_return result;
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::notify_client_routes_change(const client_route_keys& client_route_keys) {
|
||||
co_await container().invoke_on_all([&client_route_keys] (service::client_routes_service& client_routes) {
|
||||
return client_routes._lifecycle_notifier.notify_client_routes_change(client_route_keys);
|
||||
});
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::set_client_routes_inner(const std::vector<service::client_routes_service::client_route_entry>& route_entries) {
|
||||
auto guard = co_await _group0_client.start_operation(_abort_source, service::raft_timeout{});
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
|
||||
for (auto& entry : route_entries) {
|
||||
auto mut = co_await make_update_client_route_mutation(guard.write_timestamp(), entry);
|
||||
cmuts.emplace_back(std::move(mut));
|
||||
}
|
||||
auto cmd = _group0_client.prepare_command(service::write_mutations{std::move(cmuts)}, guard, "insert client routes");
|
||||
co_await _group0_client.add_entry(std::move(cmd), std::move(guard), _abort_source);
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::delete_client_routes_inner(const std::vector<service::client_routes_service::client_route_key>& route_keys) {
|
||||
auto guard = co_await _group0_client.start_operation(_abort_source, service::raft_timeout{});
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
|
||||
for (const auto& route_key : route_keys) {
|
||||
auto mut = co_await make_remove_client_route_mutation(guard.write_timestamp(), route_key);
|
||||
cmuts.emplace_back(std::move(mut));
|
||||
}
|
||||
|
||||
auto cmd = _group0_client.prepare_command(service::write_mutations{std::move(cmuts)}, guard, "delete client routes");
|
||||
co_await _group0_client.add_entry(std::move(cmd), std::move(guard), _abort_source);
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::set_client_routes(const std::vector<service::client_routes_service::client_route_entry>& route_entries) {
|
||||
return container().invoke_on(0, [route_entries = std::move(route_entries)] (service::client_routes_service& cr) -> future<> {
|
||||
return cr.with_retry([&cr, route_entries = std::move(route_entries)] () mutable {
|
||||
return cr.set_client_routes_inner(route_entries);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
seastar::future<> service::client_routes_service::delete_client_routes(const std::vector<service::client_routes_service::client_route_key>& route_keys) {
|
||||
return container().invoke_on(0, [route_keys = std::move(route_keys)] (service::client_routes_service& cr) -> future<> {
|
||||
return cr.with_retry([&cr, route_keys = std::move(route_keys)] () mutable {
|
||||
return cr.delete_client_routes_inner(route_keys);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
seastar::future<> service::client_routes_service::with_retry(Func&& func) const {
|
||||
int retries = 10;
|
||||
while (true) {
|
||||
try {
|
||||
co_await func();
|
||||
} catch (const ::service::group0_concurrent_modification&) {
|
||||
crlogger.warn("Failed to set client routes due to guard conflict, retries={}", retries);
|
||||
if (retries--) {
|
||||
continue;
|
||||
}
|
||||
throw;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
88
service/client_routes.hh
Normal file
88
service/client_routes.hh
Normal file
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Copyright (C) 2025-present ScyllaDB
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/sharded.hh>
|
||||
|
||||
#include "gms/feature_service.hh"
|
||||
#include "mutation/mutation.hh"
|
||||
#include "service/raft/raft_group0_client.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
class endpoint_lifecycle_notifier;
|
||||
|
||||
class client_routes_service : public seastar::peering_sharded_service<client_routes_service> {
|
||||
public:
|
||||
client_routes_service(
|
||||
abort_source& abort_source,
|
||||
gms::feature_service& feature_service,
|
||||
service::raft_group0_client& group0_client,
|
||||
cql3::query_processor& qp,
|
||||
endpoint_lifecycle_notifier& elc_notif
|
||||
)
|
||||
: _abort_source(abort_source)
|
||||
, _feature_service(feature_service)
|
||||
, _group0_client(group0_client)
|
||||
, _qp(qp)
|
||||
, _lifecycle_notifier(elc_notif) { }
|
||||
|
||||
struct client_route_key {
|
||||
sstring connection_id;
|
||||
utils::UUID host_id;
|
||||
|
||||
bool operator<(const client_route_key& other) const {
|
||||
if (connection_id != other.connection_id) {
|
||||
return connection_id < other.connection_id;
|
||||
}
|
||||
return host_id < other.host_id;
|
||||
}
|
||||
};
|
||||
using client_route_keys = std::set<client_route_key>;
|
||||
|
||||
struct client_route_entry {
|
||||
sstring connection_id;
|
||||
utils::UUID host_id;
|
||||
sstring address;
|
||||
// At least one of the ports should be specified
|
||||
std::optional<int32_t> port;
|
||||
std::optional<int32_t> tls_port;
|
||||
std::optional<int32_t> alternator_port;
|
||||
std::optional<int32_t> alternator_https_port;
|
||||
};
|
||||
|
||||
gms::feature_service& get_feature_service() noexcept {
|
||||
return _feature_service;
|
||||
}
|
||||
|
||||
// mutations
|
||||
future<mutation> make_remove_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_key& key);
|
||||
future<mutation> make_update_client_route_mutation(api::timestamp_type ts, const client_route_entry& entry);
|
||||
future<std::vector<client_route_entry>> get_client_routes() const;
|
||||
seastar::future<> set_client_routes(const std::vector<service::client_routes_service::client_route_entry>& route_entries);
|
||||
seastar::future<> delete_client_routes(const std::vector<service::client_routes_service::client_route_key>& route_keys);
|
||||
|
||||
|
||||
// notifications
|
||||
seastar::future<> notify_client_routes_change(const client_route_keys& client_route_keys);
|
||||
private:
|
||||
seastar::future<> set_client_routes_inner(const std::vector<service::client_routes_service::client_route_entry>& route_entries);
|
||||
seastar::future<> delete_client_routes_inner(const std::vector<service::client_routes_service::client_route_key>& route_keys);
|
||||
template <typename Func>
|
||||
seastar::future<> with_retry(Func&& func) const;
|
||||
|
||||
abort_source& _abort_source;
|
||||
gms::feature_service& _feature_service;
|
||||
service::raft_group0_client& _group0_client;
|
||||
cql3::query_processor& _qp;
|
||||
endpoint_lifecycle_notifier& _lifecycle_notifier;
|
||||
};
|
||||
|
||||
}
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "gms/inet_address.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "utils/atomic_vector.hh"
|
||||
#include "service/client_routes.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
@@ -65,6 +66,7 @@ public:
|
||||
* @param endpoint the endpoint marked DOWN.
|
||||
*/
|
||||
virtual void on_down(const gms::inet_address& endpoint, locator::host_id host_id) {}
|
||||
virtual void on_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {}
|
||||
};
|
||||
|
||||
class endpoint_lifecycle_notifier {
|
||||
@@ -79,6 +81,8 @@ public:
|
||||
future<> notify_released(locator::host_id host_id);
|
||||
future<> notify_up(gms::inet_address endpoint, locator::host_id host_id);
|
||||
future<> notify_joined(gms::inet_address endpoint, locator::host_id host_id);
|
||||
|
||||
future<> notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -163,7 +163,11 @@ public:
|
||||
void before_drop_column_family(const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
void before_drop_keyspace(const sstring& keyspace_name, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
|
||||
// Called when creating a tablet map for a new table.
|
||||
// When in the context of a notification callback, call `before_allocate_tablet_map_in_notification`,
|
||||
// and otherwise call 'before_allocate_tablet_map'.
|
||||
void before_allocate_tablet_map(const locator::tablet_map&, const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
void before_allocate_tablet_map_in_notification(const locator::tablet_map&, const schema&, utils::chunked_vector<mutation>&, api::timestamp_type);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -648,6 +648,13 @@ void migration_notifier::before_allocate_tablet_map(const locator::tablet_map& m
|
||||
});
|
||||
}
|
||||
|
||||
void migration_notifier::before_allocate_tablet_map_in_notification(const locator::tablet_map& map,
|
||||
const schema& s, utils::chunked_vector<mutation>& mutations, api::timestamp_type ts) {
|
||||
_listeners.thread_for_each_nested([&map, &s, &mutations, ts] (migration_listener* listener) {
|
||||
listener->on_before_allocate_tablet_map(map, s, mutations, ts);
|
||||
});
|
||||
}
|
||||
|
||||
utils::chunked_vector<mutation> prepare_keyspace_update_announcement(replica::database& db, lw_shared_ptr<keyspace_metadata> ksm, api::timestamp_type ts) {
|
||||
db.validate_keyspace_update(*ksm);
|
||||
mlogger.info("Update Keyspace: {}", ksm);
|
||||
|
||||
@@ -640,6 +640,16 @@ future<scheduling_group> service_level_controller::auth_integration::get_user_sc
|
||||
}
|
||||
}
|
||||
|
||||
scheduling_group service_level_controller::auth_integration::get_user_cached_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
|
||||
if (usr && usr->name) {
|
||||
auto sl_opt = find_cached_effective_service_level(*usr->name);
|
||||
auto& sl_name = (sl_opt && sl_opt->shares_name) ? *sl_opt->shares_name : default_service_level_name;
|
||||
return _sl_controller.get_scheduling_group(sl_name);
|
||||
} else {
|
||||
return _sl_controller.get_default_scheduling_group();
|
||||
}
|
||||
}
|
||||
|
||||
future<scheduling_group> service_level_controller::get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
|
||||
// Special case:
|
||||
// -------------
|
||||
@@ -656,6 +666,11 @@ future<scheduling_group> service_level_controller::get_user_scheduling_group(con
|
||||
return _auth_integration->get_user_scheduling_group(usr);
|
||||
}
|
||||
|
||||
scheduling_group service_level_controller::get_cached_user_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
|
||||
SCYLLA_ASSERT(_auth_integration != nullptr);
|
||||
return _auth_integration->get_user_cached_scheduling_group(usr);
|
||||
}
|
||||
|
||||
std::optional<sstring> service_level_controller::get_active_service_level() {
|
||||
unsigned sched_idx = internal::scheduling_group_index(current_scheduling_group());
|
||||
if (_sl_lookup[sched_idx].first) {
|
||||
@@ -774,6 +789,10 @@ future<service_levels_info> service_level_controller::get_distributed_service_le
|
||||
return _sl_data_accessor ? _sl_data_accessor->get_service_level(service_level_name) : make_ready_future<service_levels_info>();
|
||||
}
|
||||
|
||||
bool service_level_controller::can_use_effective_service_level_cache() const{
|
||||
return _sl_data_accessor && _sl_data_accessor->can_use_effective_service_level_cache();
|
||||
}
|
||||
|
||||
future<bool> service_level_controller::validate_before_service_level_add() {
|
||||
assert(this_shard_id() == global_controller);
|
||||
if (_global_controller_db->deleted_scheduling_groups.size() > 0) {
|
||||
|
||||
@@ -154,7 +154,10 @@ public:
|
||||
/// Synchronous version of `find_effective_service_level` that only checks the cache.
|
||||
std::optional<service_level_options> find_cached_effective_service_level(const sstring& role_name);
|
||||
|
||||
/// Execute a function within the service level context of a user, get_user_scheduling_group - async version
|
||||
/// get_user_cached_scheduling_group - sync version (used for v2 servers).
|
||||
future<scheduling_group> get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
scheduling_group get_user_cached_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
|
||||
template <typename Func, typename Ret = std::invoke_result_t<Func>>
|
||||
requires std::invocable<Func>
|
||||
@@ -339,6 +342,12 @@ public:
|
||||
* @return if the user is authenticated the user's scheduling group. otherwise get_scheduling_group("default")
|
||||
*/
|
||||
future<scheduling_group> get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
/**
|
||||
* Get the scheduling group of a specific user for the service level cache
|
||||
* @param user - the user for determining the service level
|
||||
* @return if the user is authenticated the user's scheduling group. otherwise get_scheduling_group("default")
|
||||
*/
|
||||
scheduling_group get_cached_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
|
||||
/**
|
||||
* @return the name of the currently active service level if such exists or an empty
|
||||
* optional if no active service level.
|
||||
@@ -400,6 +409,13 @@ public:
|
||||
future<service_levels_info> get_distributed_service_levels(qos::query_context ctx);
|
||||
future<service_levels_info> get_distributed_service_level(sstring service_level_name);
|
||||
|
||||
/*
|
||||
* Returns whether effective service level cache can be populated and used.
|
||||
* This is equivalent to checking whether auth + raft have been migrated to raft.
|
||||
*/
|
||||
bool can_use_effective_service_level_cache() const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the service level options **in effect** for a user having the given
|
||||
* collection of roles.
|
||||
|
||||
@@ -124,8 +124,40 @@ bool should_flush_system_topology_after_applying(const mutation& mut, const data
|
||||
return false;
|
||||
}
|
||||
|
||||
future<> write_mutations_to_database(storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms) {
|
||||
static void collect_client_routes_update(const mutation& mut, client_routes_service::client_route_keys& client_routes_update) {
|
||||
|
||||
auto s_client_routes = db::system_keyspace::client_routes();
|
||||
if (mut.column_family_id() != s_client_routes->id()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto pk_components = mut.decorated_key()._key.explode(*s_client_routes);
|
||||
if (pk_components.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto conn_uuid = value_cast<sstring>(utf8_type->deserialize_value(pk_components[0]));
|
||||
for (const rows_entry& re : mut.partition().clustered_rows()) {
|
||||
const auto ck_components = re.key().explode(*s_client_routes);
|
||||
if (ck_components.empty()) {
|
||||
continue;
|
||||
}
|
||||
auto host_uuid = value_cast<utils::UUID>(uuid_type->deserialize_value(ck_components[0]));
|
||||
client_routes_update.emplace(conn_uuid, host_uuid);
|
||||
}
|
||||
}
|
||||
|
||||
static future<> notify_client_route_change_if_needed(storage_service& storage_service, const client_routes_service::client_route_keys& client_routes_update) {
|
||||
if (client_routes_update.size() > 0) {
|
||||
slogger.trace("write_mutations_to_database: notify_client_routes_change routes_update.size()={}", client_routes_update.size());
|
||||
co_await storage_service.notify_client_routes_change(client_routes_update);
|
||||
}
|
||||
}
|
||||
|
||||
future<> write_mutations_to_database(storage_service& storage_service, storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms) {
|
||||
utils::chunked_vector<frozen_mutation_and_schema> mutations;
|
||||
client_routes_service::client_route_keys client_routes_update;
|
||||
|
||||
mutations.reserve(cms.size());
|
||||
bool need_system_topology_flush = false;
|
||||
try {
|
||||
@@ -133,7 +165,12 @@ future<> write_mutations_to_database(storage_proxy& proxy, gms::inet_address fro
|
||||
auto& tbl = proxy.local_db().find_column_family(cm.column_family_id());
|
||||
auto& s = tbl.schema();
|
||||
auto mut = co_await to_mutation_gently(cm, s);
|
||||
|
||||
need_system_topology_flush = need_system_topology_flush || should_flush_system_topology_after_applying(mut, proxy.data_dictionary());
|
||||
if (proxy.data_dictionary().has_schema(db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES)) {
|
||||
collect_client_routes_update(mut, client_routes_update);
|
||||
}
|
||||
|
||||
mutations.emplace_back(co_await freeze_gently(mut), s);
|
||||
}
|
||||
} catch (replica::no_such_column_family& e) {
|
||||
@@ -147,6 +184,8 @@ future<> write_mutations_to_database(storage_proxy& proxy, gms::inet_address fro
|
||||
slogger.trace("write_mutations_to_database: flushing {}.{}", db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
|
||||
co_await proxy.get_db().local().flush(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
|
||||
}
|
||||
|
||||
co_await notify_client_route_change_if_needed(storage_service, client_routes_update);
|
||||
}
|
||||
|
||||
group0_state_machine::modules_to_reload group0_state_machine::get_modules_to_reload(const utils::chunked_vector<canonical_mutation>& mutations) {
|
||||
@@ -251,7 +290,7 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
|
||||
[&] (topology_change& chng) -> future<> {
|
||||
auto modules_to_reload = get_modules_to_reload(chng.mutations);
|
||||
auto tablet_keys = replica::get_tablet_metadata_change_hint(chng.mutations);
|
||||
co_await write_mutations_to_database(_sp, cmd.creator_addr, std::move(chng.mutations));
|
||||
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(chng.mutations));
|
||||
co_await _ss.topology_transition({.tablets_hint = std::move(tablet_keys)});
|
||||
co_await reload_modules(std::move(modules_to_reload));
|
||||
},
|
||||
@@ -263,7 +302,7 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
|
||||
},
|
||||
[&] (write_mutations& muts) -> future<> {
|
||||
auto modules_to_reload = get_modules_to_reload(muts.mutations);
|
||||
co_await write_mutations_to_database(_sp, cmd.creator_addr, std::move(muts.mutations));
|
||||
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(muts.mutations));
|
||||
co_await reload_modules(std::move(modules_to_reload));
|
||||
}
|
||||
), cmd.change);
|
||||
@@ -393,6 +432,7 @@ future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
|
||||
|
||||
future<> group0_state_machine::transfer_snapshot(raft::server_id from_id, raft::snapshot_descriptor snp) {
|
||||
try {
|
||||
co_await utils::get_local_injector().inject("block_group0_transfer_snapshot", utils::wait_for_message(300s));
|
||||
// Note that this may bring newer state than the group0 state machine raft's
|
||||
// log, so some raft entries may be double applied, but since the state
|
||||
// machine is idempotent it is not a problem.
|
||||
@@ -451,11 +491,23 @@ future<> group0_state_machine::transfer_snapshot(raft::server_id from_id, raft::
|
||||
co_await _sp.get_db().local().flush(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
|
||||
}
|
||||
|
||||
client_routes_service::client_route_keys client_routes_update;
|
||||
if (raft_snp) {
|
||||
if (_sp.data_dictionary().has_schema(db::system_keyspace::NAME, db::system_keyspace::CLIENT_ROUTES)) {
|
||||
auto s_client_routes = db::system_keyspace::client_routes();
|
||||
for (auto& canonical_mut : raft_snp->mutations) {
|
||||
if (canonical_mut.column_family_id() == s_client_routes->id()) {
|
||||
auto mut = co_await to_mutation_gently(canonical_mut, s_client_routes);
|
||||
slogger.trace("transfer snapshot: raft snapshot includes client_routes mutation");
|
||||
collect_client_routes_update(mut, client_routes_update);
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await mutate_locally(std::move(raft_snp->mutations), _sp);
|
||||
}
|
||||
|
||||
co_await _ss.auth_cache().load_all();
|
||||
co_await notify_client_route_change_if_needed(_ss, client_routes_update);
|
||||
|
||||
co_await _sp.mutate_locally({std::move(history_mut)}, nullptr);
|
||||
} catch (const abort_requested_exception&) {
|
||||
|
||||
@@ -130,6 +130,6 @@ public:
|
||||
bool should_flush_system_topology_after_applying(const mutation& mut, const data_dictionary::database db);
|
||||
|
||||
// Used to write data to topology and other tables except schema tables.
|
||||
future<> write_mutations_to_database(storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms);
|
||||
future<> write_mutations_to_database(storage_service& storage_service, storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms);
|
||||
|
||||
} // end of namespace service
|
||||
|
||||
@@ -254,6 +254,10 @@ public:
|
||||
group0_batch(const group0_batch&) = delete;
|
||||
group0_batch(group0_batch&&) = default;
|
||||
|
||||
const group0_guard& guard() const {
|
||||
return _guard.value();
|
||||
}
|
||||
|
||||
// Gets timestamp which should be used when building mutations.
|
||||
api::timestamp_type write_timestamp() const;
|
||||
utils::UUID new_group0_state_id() const;
|
||||
|
||||
@@ -1114,7 +1114,7 @@ private:
|
||||
// only for a truncate which is still waiting.
|
||||
if (_topology_state_machine._topology.global_request) {
|
||||
utils::UUID ongoing_global_request_id = _topology_state_machine._topology.global_request_id.value();
|
||||
const auto topology_requests_entry = co_await _sys_ks.local().get_topology_request_entry(ongoing_global_request_id, true);
|
||||
const auto topology_requests_entry = co_await _sys_ks.local().get_topology_request_entry(ongoing_global_request_id);
|
||||
auto global_request = std::get<service::global_topology_request>(topology_requests_entry.request_type);
|
||||
if (global_request == global_topology_request::truncate_table) {
|
||||
std::optional<topology::transition_state>& tstate = _topology_state_machine._topology.tstate;
|
||||
|
||||
@@ -205,6 +205,7 @@ storage_service::storage_service(abort_source& abort_source,
|
||||
cql3::query_processor& qp,
|
||||
sharded<qos::service_level_controller>& sl_controller,
|
||||
auth::cache& auth_cache,
|
||||
sharded<client_routes_service>& client_routes,
|
||||
topology_state_machine& topology_state_machine,
|
||||
db::view::view_building_state_machine& view_building_state_machine,
|
||||
tasks::task_manager& tm,
|
||||
@@ -224,11 +225,13 @@ storage_service::storage_service(abort_source& abort_source,
|
||||
, _snitch(snitch)
|
||||
, _sl_controller(sl_controller)
|
||||
, _auth_cache(auth_cache)
|
||||
, _client_routes(client_routes)
|
||||
, _group0(nullptr)
|
||||
, _async_gate("storage_service")
|
||||
, _node_ops_abort_thread(node_ops_abort_thread())
|
||||
, _node_ops_module(make_shared<node_ops::task_manager_module>(tm, *this))
|
||||
, _tablets_module(make_shared<service::task_manager_module>(tm, *this))
|
||||
, _global_topology_requests_module(make_shared<service::topo::task_manager_module>(tm))
|
||||
, _address_map(address_map)
|
||||
, _shared_token_metadata(stm)
|
||||
, _erm_factory(erm_factory)
|
||||
@@ -252,9 +255,11 @@ storage_service::storage_service(abort_source& abort_source,
|
||||
{
|
||||
tm.register_module(_node_ops_module->get_name(), _node_ops_module);
|
||||
tm.register_module(_tablets_module->get_name(), _tablets_module);
|
||||
tm.register_module(_global_topology_requests_module->get_name(), _global_topology_requests_module);
|
||||
if (this_shard_id() == 0) {
|
||||
_node_ops_module->make_virtual_task<node_ops::node_ops_virtual_task>(*this);
|
||||
_tablets_module->make_virtual_task<service::tablet_virtual_task>(*this);
|
||||
_global_topology_requests_module->make_virtual_task<service::topo::global_topology_request_virtual_task>(*this);
|
||||
}
|
||||
register_metrics();
|
||||
|
||||
@@ -583,12 +588,16 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
|
||||
}
|
||||
break;
|
||||
case node_state::decommissioning:
|
||||
// A decommissioning node loses its tokens when topology moves to left_token_ring.
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::left_token_ring) {
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case node_state::removing:
|
||||
// A decommissioning or removing node loses its tokens when topology moves to left_token_ring.
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::left_token_ring) {
|
||||
if (rs.state == node_state::removing && !_feature_service.removenode_with_left_token_ring) {
|
||||
on_internal_error(
|
||||
rtlogger, "removenode operation can only enter the left_token_ring state when REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled");
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::rollback_to_normal) {
|
||||
// no need for double writes anymore since op failed
|
||||
co_await process_normal_node(id, host_id, ip, rs);
|
||||
@@ -1375,6 +1384,34 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
future<bool> storage_service::ongoing_rf_change(const group0_guard& guard, sstring ks) const {
|
||||
auto ongoing_ks_rf_change = [&] (utils::UUID request_id) -> future<bool> {
|
||||
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id);
|
||||
co_return std::holds_alternative<global_topology_request>(req_entry.request_type) &&
|
||||
std::get<global_topology_request>(req_entry.request_type) == global_topology_request::keyspace_rf_change &&
|
||||
req_entry.new_keyspace_rf_change_ks_name.has_value() && req_entry.new_keyspace_rf_change_ks_name.value() == ks;
|
||||
};
|
||||
if (_topology_state_machine._topology.global_request_id.has_value()) {
|
||||
auto req_id = _topology_state_machine._topology.global_request_id.value();
|
||||
if (co_await ongoing_ks_rf_change(req_id)) {
|
||||
co_return true;
|
||||
}
|
||||
}
|
||||
for (auto request_id : _topology_state_machine._topology.paused_rf_change_requests) {
|
||||
if (co_await ongoing_ks_rf_change(request_id)) {
|
||||
co_return true;
|
||||
}
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
for (auto request_id : _topology_state_machine._topology.global_requests_queue) {
|
||||
if (co_await ongoing_ks_rf_change(request_id)) {
|
||||
co_return true;
|
||||
}
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
|
||||
future<> storage_service::raft_initialize_discovery_leader(const join_node_request_params& params) {
|
||||
if (params.replaced_id.has_value()) {
|
||||
throw std::runtime_error(::format("Cannot perform a replace operation because this is the first node in the cluster"));
|
||||
@@ -1420,7 +1457,7 @@ future<> storage_service::raft_initialize_discovery_leader(const join_node_reque
|
||||
_migration_manager.local().get_group0_client().get_history_gc_duration(), "bootstrap: adding myself as the first node to the topology");
|
||||
auto mutation_creator_addr = _sys_ks.local().local_db().get_token_metadata().get_topology().my_address();
|
||||
|
||||
co_await write_mutations_to_database(_qp.proxy(), mutation_creator_addr, std::move(change.mutations));
|
||||
co_await write_mutations_to_database(*this, _qp.proxy(), mutation_creator_addr, std::move(change.mutations));
|
||||
co_await _qp.proxy().mutate_locally({history_append}, nullptr);
|
||||
}
|
||||
|
||||
@@ -3443,6 +3480,7 @@ future<> storage_service::stop() {
|
||||
_listeners.clear();
|
||||
co_await _tablets_module->stop();
|
||||
co_await _node_ops_module->stop();
|
||||
co_await _global_topology_requests_module->stop();
|
||||
co_await _async_gate.close();
|
||||
co_await std::move(_node_ops_abort_thread);
|
||||
_tablet_split_monitor_event.signal();
|
||||
@@ -5025,6 +5063,50 @@ future<> storage_service::wait_for_topology_not_busy() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::abort_paused_rf_change(utils::UUID request_id) {
|
||||
auto holder = _async_gate.hold();
|
||||
|
||||
if (this_shard_id() != 0) {
|
||||
// group0 is only set on shard 0.
|
||||
co_return co_await container().invoke_on(0, [&] (auto& ss) {
|
||||
return ss.abort_paused_rf_change(request_id);
|
||||
});
|
||||
}
|
||||
|
||||
if (!_feature_service.rack_list_rf) {
|
||||
throw std::runtime_error("The RACK_LIST_RF feature is not enabled on the cluster yet");
|
||||
}
|
||||
|
||||
while (true) {
|
||||
auto guard = co_await _group0->client().start_operation(_group0_as, raft_timeout{});
|
||||
|
||||
bool found = std::ranges::contains(_topology_state_machine._topology.paused_rf_change_requests, request_id);
|
||||
if (!found) {
|
||||
slogger.warn("RF change request with id '{}' is not paused, so it can't be aborted", request_id);
|
||||
co_return;
|
||||
}
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
|
||||
.resume_rf_change_request(_topology_state_machine._topology.paused_rf_change_requests, request_id).build()));
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(request_id)
|
||||
.done("Aborted by user request")
|
||||
.build()));
|
||||
|
||||
topology_change change{std::move(updates)};
|
||||
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard,
|
||||
format("aborting rf change request {}", request_id));
|
||||
|
||||
try {
|
||||
co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), _group0_as, raft_timeout{});
|
||||
} catch (group0_concurrent_modification&) {
|
||||
slogger.info("aborting request {}: concurrent modification, retrying.", request_id);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
semaphore& storage_service::get_do_sample_sstables_concurrency_limiter() {
|
||||
return _do_sample_sstables_concurrency_limiter;
|
||||
}
|
||||
@@ -5228,7 +5310,7 @@ future<> storage_service::raft_check_and_repair_cdc_streams() {
|
||||
request_id = _topology_state_machine._topology.global_request_id.value();
|
||||
} else if (!_topology_state_machine._topology.global_requests_queue.empty()) {
|
||||
request_id = _topology_state_machine._topology.global_requests_queue[0];
|
||||
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id, true);
|
||||
auto req_entry = co_await _sys_ks.local().get_topology_request_entry(request_id);
|
||||
curr_req = std::get<global_topology_request>(req_entry.request_type);
|
||||
} else {
|
||||
request_id = utils::UUID{};
|
||||
@@ -7702,6 +7784,9 @@ void storage_service::init_messaging_service() {
|
||||
additional_tables.push_back(db::system_keyspace::cdc_streams_state()->id());
|
||||
additional_tables.push_back(db::system_keyspace::cdc_streams_history()->id());
|
||||
}
|
||||
if (ss._feature_service.client_routes) {
|
||||
additional_tables.push_back(db::system_keyspace::client_routes()->id());
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& table : boost::join(params.tables, additional_tables)) {
|
||||
@@ -8041,6 +8126,18 @@ future<> endpoint_lifecycle_notifier::notify_joined(gms::inet_address endpoint,
|
||||
});
|
||||
}
|
||||
|
||||
future<> endpoint_lifecycle_notifier::notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {
|
||||
co_await seastar::async([this, &client_route_keys] {
|
||||
_subscribers.thread_for_each([&client_route_keys] (endpoint_lifecycle_subscriber* subscriber) {
|
||||
try {
|
||||
subscriber->on_client_routes_change(client_route_keys);
|
||||
} catch (...) {
|
||||
slogger.warn("Client routes notification failed: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> storage_service::notify_joined(inet_address endpoint, locator::host_id hid) {
|
||||
co_await utils::get_local_injector().inject(
|
||||
"storage_service_notify_joined_sleep", std::chrono::milliseconds{500});
|
||||
@@ -8065,6 +8162,10 @@ future<> storage_service::notify_cql_change(inet_address endpoint, locator::host
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys) {
|
||||
co_await _client_routes.local().notify_client_routes_change(client_route_keys);
|
||||
}
|
||||
|
||||
bool storage_service::is_normal_state_handled_on_boot(locator::host_id node) {
|
||||
return _normal_state_handled_on_boot.contains(node);
|
||||
}
|
||||
|
||||
@@ -17,8 +17,10 @@
|
||||
#include "gms/endpoint_state.hh"
|
||||
#include "gms/i_endpoint_state_change_subscriber.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "service/client_routes.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
#include "service/qos/service_level_controller.hh"
|
||||
#include "service/task_manager_module.hh"
|
||||
#include "service/topology_guard.hh"
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/snitch_base.hh"
|
||||
@@ -48,6 +50,7 @@
|
||||
#include "service/tablet_allocator.hh"
|
||||
#include "service/tablet_operation.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "utils/user_provided_param.hh"
|
||||
#include "utils/sequenced_set.hh"
|
||||
#include "service/topology_coordinator.hh"
|
||||
@@ -202,6 +205,7 @@ private:
|
||||
sharded<locator::snitch_ptr>& _snitch;
|
||||
sharded<qos::service_level_controller>& _sl_controller;
|
||||
auth::cache& _auth_cache;
|
||||
sharded<client_routes_service>& _client_routes;
|
||||
|
||||
// Engaged on shard 0 before `join_cluster`.
|
||||
service::raft_group0* _group0;
|
||||
@@ -225,6 +229,7 @@ private:
|
||||
future<> _node_ops_abort_thread;
|
||||
shared_ptr<node_ops::task_manager_module> _node_ops_module;
|
||||
shared_ptr<service::task_manager_module> _tablets_module;
|
||||
shared_ptr<service::topo::task_manager_module> _global_topology_requests_module;
|
||||
gms::gossip_address_map& _address_map;
|
||||
void node_ops_insert(node_ops_id, gms::inet_address coordinator, std::list<inet_address> ignore_nodes,
|
||||
std::function<future<>()> abort_func);
|
||||
@@ -269,6 +274,7 @@ public:
|
||||
cql3::query_processor& qp,
|
||||
sharded<qos::service_level_controller>& sl_controller,
|
||||
auth::cache& auth_cache,
|
||||
sharded<client_routes_service>& _client_routes,
|
||||
topology_state_machine& topology_state_machine,
|
||||
db::view::view_building_state_machine& view_building_state_machine,
|
||||
tasks::task_manager& tm,
|
||||
@@ -931,6 +937,7 @@ public:
|
||||
bool topology_global_queue_empty() const {
|
||||
return !_topology_state_machine._topology.global_request.has_value();
|
||||
}
|
||||
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
|
||||
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
|
||||
future<> initialize_done_topology_upgrade_state();
|
||||
private:
|
||||
@@ -1068,6 +1075,8 @@ public:
|
||||
future<sstring> wait_for_topology_request_completion(utils::UUID id, bool require_entry = true);
|
||||
future<> wait_for_topology_not_busy();
|
||||
|
||||
future<> abort_paused_rf_change(utils::UUID request_id);
|
||||
|
||||
private:
|
||||
semaphore _do_sample_sstables_concurrency_limiter{1};
|
||||
// To avoid overly-large RPC messages, `do_sample_sstables` is broken up into several rounds.
|
||||
@@ -1138,11 +1147,14 @@ public:
|
||||
future<std::vector<std::byte>> train_dict(utils::chunked_vector<temporary_buffer<char>> sample);
|
||||
future<> publish_new_sstable_dict(table_id, std::span<const std::byte>, service::raft_group0_client&);
|
||||
void set_train_dict_callback(decltype(_train_dict));
|
||||
seastar::future<> notify_client_routes_change(const client_routes_service::client_route_keys& client_route_keys);
|
||||
|
||||
|
||||
friend class join_node_rpc_handshaker;
|
||||
friend class node_ops::node_ops_virtual_task;
|
||||
friend class tasks::task_manager;
|
||||
friend class tablet_virtual_task;
|
||||
friend class topo::global_topology_request_virtual_task;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -6,12 +6,16 @@
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "cql3/statements/ks_prop_defs.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "locator/topology.hh"
|
||||
#include "replica/tablets.hh"
|
||||
#include "locator/tablet_replication_strategy.hh"
|
||||
#include "replica/database.hh"
|
||||
#include "service/migration_listener.hh"
|
||||
#include "service/tablet_allocator.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/stall_free.hh"
|
||||
@@ -22,6 +26,7 @@
|
||||
#include "replica/database.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
#include <iterator>
|
||||
#include <ranges>
|
||||
#include <utility>
|
||||
#include <fmt/ranges.h>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
@@ -237,6 +242,147 @@ struct migration_candidate {
|
||||
migration_badness badness;
|
||||
};
|
||||
|
||||
struct colocation_source {
|
||||
locator::global_tablet_id gid;
|
||||
locator::tablet_replica replica;
|
||||
};
|
||||
|
||||
using colocation_source_set = utils::chunked_vector<colocation_source>;
|
||||
using colocation_sources_by_destination_rack = std::unordered_map<endpoint_dc_rack, colocation_source_set>;
|
||||
|
||||
struct rack_list_colocation_state {
|
||||
colocation_sources_by_destination_rack dst_dc_rack_to_tablets;
|
||||
std::unordered_map<endpoint_dc_rack, std::unordered_set<utils::UUID>> dst_to_requests;
|
||||
utils::UUID request_to_resume;
|
||||
|
||||
void maybe_set_request_to_resume(const utils::UUID& id) {
|
||||
if (!request_to_resume) {
|
||||
request_to_resume = id;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
future<rack_list_colocation_state> find_required_rack_list_colocations(
|
||||
replica::database& db,
|
||||
token_metadata_ptr tmptr,
|
||||
db::system_keyspace* sys_ks,
|
||||
const std::unordered_set<utils::UUID>& paused_rf_change_requests,
|
||||
const std::unordered_set<locator::global_tablet_id>& already_planned_migrations) {
|
||||
rack_list_colocation_state state;
|
||||
|
||||
auto get_node = [&] (locator::host_id host) -> const locator::node& {
|
||||
auto* node = tmptr->get_topology().find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
return *node;
|
||||
};
|
||||
for (const auto& request_id : paused_rf_change_requests) {
|
||||
auto req_entry = co_await sys_ks->get_topology_request_entry(request_id);
|
||||
sstring ks_name = *req_entry.new_keyspace_rf_change_ks_name;
|
||||
|
||||
if (!db.has_keyspace(ks_name)) {
|
||||
state.maybe_set_request_to_resume(request_id);
|
||||
continue;
|
||||
}
|
||||
auto& ks = db.find_keyspace(ks_name);
|
||||
std::unordered_map<sstring, sstring> saved_ks_props = *req_entry.new_keyspace_rf_change_data;
|
||||
cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{saved_ks_props.begin(), saved_ks_props.end()}};
|
||||
new_ks_props.validate();
|
||||
auto ks_md = new_ks_props.as_ks_metadata_update(ks.metadata(), *tmptr, db.features(), db.get_config());
|
||||
|
||||
auto tables_with_mvs = ks.metadata()->tables();
|
||||
auto views = ks.metadata()->views();
|
||||
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
|
||||
if (tables_with_mvs.empty()) {
|
||||
state.maybe_set_request_to_resume(request_id);
|
||||
continue;
|
||||
}
|
||||
bool no_changes_needed = true;
|
||||
for (const auto& table_or_mv : tables_with_mvs) {
|
||||
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
|
||||
continue;
|
||||
}
|
||||
const auto& tmap = tmptr->tablets().get_tablet_map(table_or_mv->id());
|
||||
const auto& new_replication_strategy_config = ks_md->strategy_options();
|
||||
for (auto& [dc, rf_value] : new_replication_strategy_config) {
|
||||
if (!std::holds_alternative<rack_list>(rf_value)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto racks = std::get<rack_list>(rf_value) | std::ranges::to<std::unordered_set<sstring>>();
|
||||
co_await tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& ti) -> future<> {
|
||||
auto gid = locator::global_tablet_id{table_or_mv->id(), tid};
|
||||
|
||||
// Current replicas in this DC. There might be multiple replicas in the same rack.
|
||||
auto dc_replicas = ti.replicas | std::views::filter([&] (const tablet_replica& r) {
|
||||
return get_node(r.host).dc_rack().dc == dc;
|
||||
}) | std::ranges::to<std::vector<tablet_replica>>();
|
||||
|
||||
if (dc_replicas.empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Find replicas that are not in the desired racks (src_replicas)
|
||||
// and racks that do not have replicas yet (dst_racks).
|
||||
auto dst_racks = racks;
|
||||
std::vector<tablet_replica> src_replicas;
|
||||
for (const auto& r : dc_replicas) {
|
||||
auto rack = get_node(r.host).dc_rack().rack;
|
||||
if (dst_racks.find(rack) != dst_racks.end()) {
|
||||
// There is already a replica in this rack.
|
||||
dst_racks.erase(rack);
|
||||
} else {
|
||||
// There is a replica in this rack, but it needs to be moved.
|
||||
src_replicas.push_back(r);
|
||||
}
|
||||
}
|
||||
|
||||
auto zipped = std::views::zip(src_replicas, dst_racks);
|
||||
if (!std::ranges::empty(zipped)) {
|
||||
no_changes_needed = false;
|
||||
}
|
||||
|
||||
// Skip tablet that is in transitions.
|
||||
auto* tti = tmap.get_tablet_transition_info(tid);
|
||||
if (tti) {
|
||||
lblogger.debug("Skipped colocation for tablet={} which is already in transition={}", gid, tti->transition);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Skip tablet that is about to be in transition.
|
||||
if (already_planned_migrations.contains(gid)) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
for (auto src_dst : zipped) {
|
||||
auto src = std::get<0>(src_dst);
|
||||
auto dst = std::get<1>(src_dst);
|
||||
auto endpoint = locator::endpoint_dc_rack{dc, dst};
|
||||
|
||||
state.dst_dc_rack_to_tablets[endpoint].emplace_back(colocation_source{{table_or_mv->id(), tid}, src});
|
||||
state.dst_to_requests[endpoint].insert(request_id);
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
}
|
||||
if (no_changes_needed) {
|
||||
state.maybe_set_request_to_resume(request_id);
|
||||
}
|
||||
}
|
||||
co_return state;
|
||||
}
|
||||
|
||||
future<bool> requires_rack_list_colocation(
|
||||
replica::database& db,
|
||||
locator::token_metadata_ptr tmptr,
|
||||
db::system_keyspace* sys_ks,
|
||||
utils::UUID request_id) {
|
||||
auto res = co_await find_required_rack_list_colocations(db, tmptr, sys_ks, {request_id}, {});
|
||||
co_return res.request_to_resume != request_id;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<>
|
||||
@@ -658,6 +804,8 @@ class load_balancer {
|
||||
|
||||
replica::database& _db;
|
||||
token_metadata_ptr _tm;
|
||||
service::topology* _topology;
|
||||
db::system_keyspace* _sys_ks;
|
||||
std::optional<locator::load_sketch> _load_sketch;
|
||||
// Holds the set of tablets already scheduled for transition during plan-making.
|
||||
std::unordered_set<global_tablet_id> _scheduled_tablets;
|
||||
@@ -742,7 +890,10 @@ private:
|
||||
return streaming_infos;
|
||||
}
|
||||
public:
|
||||
load_balancer(replica::database& db, token_metadata_ptr tm, locator::load_stats_ptr table_load_stats,
|
||||
load_balancer(replica::database& db, token_metadata_ptr tm,
|
||||
service::topology* topology,
|
||||
db::system_keyspace* sys_ks,
|
||||
locator::load_stats_ptr table_load_stats,
|
||||
load_balancer_stats_manager& stats,
|
||||
uint64_t target_tablet_size,
|
||||
unsigned tablets_per_shard_goal,
|
||||
@@ -751,19 +902,26 @@ public:
|
||||
, _tablets_per_shard_goal(tablets_per_shard_goal)
|
||||
, _db(db)
|
||||
, _tm(std::move(tm))
|
||||
, _topology(topology)
|
||||
, _sys_ks(sys_ks)
|
||||
, _table_load_stats(std::move(table_load_stats))
|
||||
, _stats(stats)
|
||||
, _skiplist(std::move(skiplist))
|
||||
{ }
|
||||
|
||||
bool ongoing_rack_list_colocation() const {
|
||||
return _topology != nullptr && _sys_ks != nullptr && !_topology->paused_rf_change_requests.empty();
|
||||
}
|
||||
|
||||
future<migration_plan> make_plan() {
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
migration_plan plan;
|
||||
|
||||
auto rack_list_colocation = ongoing_rack_list_colocation();
|
||||
if (!utils::get_local_injector().enter("tablet_migration_bypass")) {
|
||||
// Prepare plans for each DC separately and combine them to be executed in parallel.
|
||||
for (auto&& dc : topo.get_datacenters()) {
|
||||
if (_db.get_config().rf_rack_valid_keyspaces()) {
|
||||
if (_db.get_config().rf_rack_valid_keyspaces() || rack_list_colocation) {
|
||||
for (auto rack : topo.get_datacenter_racks().at(dc) | std::views::keys) {
|
||||
auto rack_plan = co_await make_plan(dc, rack);
|
||||
auto level = rack_plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
@@ -779,6 +937,10 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
if (rack_list_colocation) {
|
||||
plan.merge(co_await make_rack_list_colocation_plan(plan));
|
||||
}
|
||||
|
||||
// Merge table-wide resize decisions, may emit new decisions, revoke or finalize ongoing ones.
|
||||
// Note : Resize plans should be generated before repair plans to avoid scheduling repairs when there is pending resize finalization
|
||||
plan.merge_resize_plan(co_await make_resize_plan(plan));
|
||||
@@ -789,8 +951,8 @@ public:
|
||||
}
|
||||
|
||||
auto level = plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
lblogger.log(level, "Prepared {} migration plans, out of which there were {} tablet migration(s) and {} resize decision(s) and {} tablet repair(s)",
|
||||
plan.size(), plan.tablet_migration_count(), plan.resize_decision_count(), plan.tablet_repair_count());
|
||||
lblogger.log(level, "Prepared {} migration plans, out of which there were {} tablet migration(s) and {} resize decision(s) and {} tablet repair(s) and {} rack-list colocation(s)",
|
||||
plan.size(), plan.tablet_migration_count(), plan.resize_decision_count(), plan.tablet_repair_count(), plan.tablet_rack_list_colocation_count());
|
||||
co_return std::move(plan);
|
||||
}
|
||||
|
||||
@@ -815,6 +977,58 @@ public:
|
||||
co_return false;
|
||||
}
|
||||
|
||||
void ensure_node(node_load_map& nodes, host_id host) {
|
||||
if (nodes.contains(host)) {
|
||||
return;
|
||||
}
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
auto* node = topo.find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
node_load& load = nodes[host];
|
||||
load.id = host;
|
||||
load.node = node;
|
||||
load.shard_count = node->get_shard_count();
|
||||
load.shards.resize(load.shard_count);
|
||||
if (!load.shard_count) {
|
||||
throw std::runtime_error(format("Shard count of {} not found in topology", host));
|
||||
}
|
||||
if (!_db.features().tablet_load_stats_v2) {
|
||||
// This way load calculation will hold tablet count.
|
||||
load.capacity = _target_tablet_size * load.shard_count;
|
||||
} else if (_table_load_stats && _table_load_stats->capacity.contains(host)) {
|
||||
load.capacity = _table_load_stats->capacity.at(host);
|
||||
}
|
||||
}
|
||||
|
||||
future<> consider_scheduled_load(node_load_map& nodes) {
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
for (auto&& [table, tables] : _tm->tablets().all_table_groups()) {
|
||||
const auto& tmap = _tm->tablets().get_tablet_map(table);
|
||||
for (auto&& [tid, trinfo]: tmap.transitions()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
if (is_streaming(&trinfo)) {
|
||||
auto& tinfo = tmap.get_tablet_info(tid);
|
||||
apply_load(nodes, get_migration_streaming_info(topo, tinfo, trinfo));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
future<> consider_planned_load(node_load_map& nodes, const migration_plan& mplan) {
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
auto& tablet_meta = _tm->tablets();
|
||||
|
||||
for (const tablet_migration_info& tmi : mplan.migrations()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
auto& tmap = tablet_meta.get_tablet_map(tmi.tablet.table);
|
||||
auto& tinfo = tmap.get_tablet_info(tmi.tablet.tablet);
|
||||
auto streaming_info = get_migration_streaming_info(topo, tinfo, tmi);
|
||||
apply_load(nodes, streaming_info);
|
||||
}
|
||||
}
|
||||
|
||||
future<tablet_repair_plan> make_repair_plan(const migration_plan& mplan) {
|
||||
lblogger.debug("In make_repair_plan");
|
||||
|
||||
@@ -830,53 +1044,19 @@ public:
|
||||
// Populate the load of the migration that is already in the plan
|
||||
node_load_map nodes;
|
||||
// TODO: share code with make_plan()
|
||||
auto ensure_node = [&] (host_id host) {
|
||||
if (nodes.contains(host)) {
|
||||
return;
|
||||
}
|
||||
auto* node = topo.find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
node_load& load = nodes[host];
|
||||
load.id = host;
|
||||
load.node = node;
|
||||
load.shard_count = node->get_shard_count();
|
||||
load.shards.resize(load.shard_count);
|
||||
if (!load.shard_count) {
|
||||
throw std::runtime_error(format("Shard count of {} not found in topology", host));
|
||||
}
|
||||
};
|
||||
// TODO: share code with make_plan()
|
||||
topo.for_each_node([&] (const locator::node& node) {
|
||||
bool is_drained = node.get_state() == locator::node::state::being_decommissioned
|
||||
|| node.get_state() == locator::node::state::being_removed;
|
||||
if (node.get_state() == locator::node::state::normal || is_drained) {
|
||||
ensure_node(node.host_id());
|
||||
ensure_node(nodes, node.host_id());
|
||||
}
|
||||
});
|
||||
|
||||
// Consider load that is already scheduled
|
||||
for (auto&& [table, tables] : _tm->tablets().all_table_groups()) {
|
||||
const auto& tmap = _tm->tablets().get_tablet_map(table);
|
||||
for (auto&& [tid, trinfo]: tmap.transitions()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
if (is_streaming(&trinfo)) {
|
||||
auto& tinfo = tmap.get_tablet_info(tid);
|
||||
apply_load(nodes, get_migration_streaming_info(topo, tinfo, trinfo));
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await consider_scheduled_load(nodes);
|
||||
|
||||
// Consider load that is about to be scheduled
|
||||
auto& tablet_meta = _tm->tablets();
|
||||
for (const tablet_migration_info& tmi : mplan.migrations()) {
|
||||
co_await coroutine::maybe_yield();
|
||||
auto& tmap = tablet_meta.get_tablet_map(tmi.tablet.table);
|
||||
auto& tinfo = tmap.get_tablet_info(tmi.tablet.tablet);
|
||||
auto streaming_info = get_migration_streaming_info(topo, tinfo, tmi);
|
||||
apply_load(nodes, streaming_info);
|
||||
}
|
||||
co_await consider_planned_load(nodes, mplan);
|
||||
|
||||
struct repair_plan {
|
||||
locator::global_tablet_id gid;
|
||||
@@ -959,6 +1139,109 @@ public:
|
||||
co_return ret;
|
||||
}
|
||||
|
||||
future<migration_plan> make_rack_list_colocation_plan(const migration_plan& mplan) {
|
||||
lblogger.debug("In make_rack_list_colocation_plan");
|
||||
|
||||
migration_plan plan;
|
||||
tablet_rack_list_colocation_plan rack_list_plan;
|
||||
if (!ongoing_rack_list_colocation()) {
|
||||
co_return plan;
|
||||
}
|
||||
|
||||
const locator::topology& topo = _tm->get_topology();
|
||||
|
||||
auto migration_tablet_ids = co_await mplan.get_migration_tablet_ids();
|
||||
auto colocation_state = co_await find_required_rack_list_colocations(_db, _tm, _sys_ks,
|
||||
_topology->paused_rf_change_requests, std::move(migration_tablet_ids));
|
||||
|
||||
node_load_map nodes;
|
||||
topo.for_each_node([&] (const locator::node& node) {
|
||||
if (node.get_state() == locator::node::state::normal && !node.is_excluded()) {
|
||||
ensure_node(nodes, node.host_id());
|
||||
}
|
||||
});
|
||||
|
||||
// Consider load that is already scheduled.
|
||||
co_await consider_scheduled_load(nodes);
|
||||
|
||||
// Consider load that is about to be scheduled.
|
||||
co_await consider_planned_load(nodes, mplan);
|
||||
|
||||
std::unordered_set<global_tablet_id> colocation_tablet_ids;
|
||||
for (auto& [dc_rack, colocation_sources] : colocation_state.dst_dc_rack_to_tablets) {
|
||||
auto nodes_by_load_dst = nodes | std::views::filter([&] (const auto& host_load) {
|
||||
auto& [host, load] = host_load;
|
||||
auto& node = *load.node;
|
||||
return node.dc_rack() == dc_rack;
|
||||
}) | std::views::keys | std::ranges::to<std::vector<host_id>>();
|
||||
|
||||
if (nodes_by_load_dst.empty()) {
|
||||
lblogger.warn("No target nodes available for RF change colocation plan in dc {}, rack {}", dc_rack.dc, dc_rack.rack);
|
||||
if (auto it = colocation_state.dst_to_requests.find(dc_rack); it != colocation_state.dst_to_requests.end()) {
|
||||
rack_list_plan.maybe_add_request_to_resume(*it->second.begin());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
auto nodes_cmp = nodes_by_load_cmp(nodes);
|
||||
auto nodes_dst_cmp = [&] (const host_id& a, const host_id& b) {
|
||||
return nodes_cmp(b, a);
|
||||
};
|
||||
|
||||
// Ascending load heap of candidate target nodes.
|
||||
std::make_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
|
||||
const tablet_metadata& tmeta = _tm->tablets();
|
||||
for (colocation_source& source : colocation_sources) {
|
||||
if (colocation_tablet_ids.contains(source.gid)) {
|
||||
lblogger.debug("Skipped colocation of replica {} of tablet={}, another replica of which is about to be colocated", source.replica, source.gid);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Pick the least loaded node as target.
|
||||
std::pop_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
auto target = nodes_by_load_dst.back();
|
||||
auto& target_info = nodes[target];
|
||||
auto push_back_target_node = seastar::defer([&] {
|
||||
std::push_heap(nodes_by_load_dst.begin(), nodes_by_load_dst.end(), nodes_dst_cmp);
|
||||
});
|
||||
|
||||
lblogger.debug("target node: {}, avg_load={}", target, target_info.avg_load);
|
||||
|
||||
auto dst = global_shard_id {target, _load_sketch->get_least_loaded_shard(target)};
|
||||
|
||||
lblogger.trace("target shard: {}, tablets={}, load={}", dst.shard,
|
||||
target_info.shards[dst.shard].tablet_count,
|
||||
target_info.shard_load(dst.shard, _target_tablet_size));
|
||||
|
||||
tablet_transition_kind kind = tablet_transition_kind::migration;
|
||||
migration_tablet_set source_tablets {
|
||||
.tablet_s = source.gid, // Ignore the merge co-location.
|
||||
};
|
||||
auto src = source.replica;
|
||||
auto mig = get_migration_info(source_tablets, kind, src, dst);
|
||||
auto& tmap = tmeta.get_tablet_map(source_tablets.table());
|
||||
auto mig_streaming_info = get_migration_streaming_infos(topo, tmap, mig);
|
||||
pick(*_load_sketch, dst.host, dst.shard, source_tablets);
|
||||
if (can_accept_load(nodes, mig_streaming_info)) {
|
||||
apply_load(nodes, mig_streaming_info);
|
||||
lblogger.debug("Adding migration: {}", mig);
|
||||
mark_as_scheduled(mig);
|
||||
for (auto& m : mig) {
|
||||
plan.add(std::move(m));
|
||||
colocation_tablet_ids.insert(m.tablet);
|
||||
}
|
||||
}
|
||||
update_node_load_on_migration(nodes, src, dst, source_tablets);
|
||||
}
|
||||
}
|
||||
if (colocation_state.request_to_resume) {
|
||||
rack_list_plan.maybe_add_request_to_resume(colocation_state.request_to_resume);
|
||||
}
|
||||
plan.set_rack_list_colocation_plan(std::move(rack_list_plan));
|
||||
co_return std::move(plan);
|
||||
}
|
||||
|
||||
// Returns true if a table has replicas of all its sibling tablets co-located.
|
||||
// This is used for determining whether merge can be finalized, since co-location
|
||||
// is a strict requirement for sibling tablets to be merged.
|
||||
@@ -2967,30 +3250,6 @@ public:
|
||||
node_load_map nodes;
|
||||
std::unordered_set<host_id> nodes_to_drain;
|
||||
|
||||
auto ensure_node = [&] (host_id host) {
|
||||
if (nodes.contains(host)) {
|
||||
return;
|
||||
}
|
||||
auto* node = topo.find_node(host);
|
||||
if (!node) {
|
||||
on_internal_error(lblogger, format("Node {} not found in topology", host));
|
||||
}
|
||||
node_load& load = nodes[host];
|
||||
load.id = host;
|
||||
load.node = node;
|
||||
load.shard_count = node->get_shard_count();
|
||||
load.shards.resize(load.shard_count);
|
||||
if (!load.shard_count) {
|
||||
throw std::runtime_error(format("Shard count of {} not found in topology", host));
|
||||
}
|
||||
if (!_db.features().tablet_load_stats_v2) {
|
||||
// This way load calculation will hold tablet count.
|
||||
load.capacity = _target_tablet_size * load.shard_count;
|
||||
} else if (_table_load_stats && _table_load_stats->capacity.contains(host)) {
|
||||
load.capacity = _table_load_stats->capacity.at(host);
|
||||
}
|
||||
};
|
||||
|
||||
_tm->for_each_token_owner([&] (const locator::node& node) {
|
||||
if (!node_filter(node)) {
|
||||
return;
|
||||
@@ -2999,7 +3258,7 @@ public:
|
||||
|| node.get_state() == locator::node::state::being_removed;
|
||||
if (node.get_state() == locator::node::state::normal || is_drained) {
|
||||
if (is_drained) {
|
||||
ensure_node(node.host_id());
|
||||
ensure_node(nodes, node.host_id());
|
||||
lblogger.info("Will drain node {} ({}) from DC {}", node.host_id(), node.get_state(), dc);
|
||||
nodes_to_drain.emplace(node.host_id());
|
||||
nodes[node.host_id()].drained = true;
|
||||
@@ -3007,7 +3266,7 @@ public:
|
||||
// Excluded nodes should not be chosen as targets for migration.
|
||||
lblogger.debug("Ignoring excluded node {}: state={}", node.host_id(), node.get_state());
|
||||
} else {
|
||||
ensure_node(node.host_id());
|
||||
ensure_node(nodes, node.host_id());
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -3040,7 +3299,7 @@ public:
|
||||
r, global_tablet_id{table, tid}));
|
||||
}
|
||||
if (node->left() && node_filter(*node)) {
|
||||
ensure_node(r.host);
|
||||
ensure_node(nodes, r.host);
|
||||
nodes_to_drain.insert(r.host);
|
||||
nodes[r.host].drained = true;
|
||||
}
|
||||
@@ -3242,7 +3501,7 @@ public:
|
||||
plan.merge(co_await make_intranode_plan(nodes, nodes_to_drain));
|
||||
}
|
||||
|
||||
if (_tm->tablets().balancing_enabled() && plan.empty()) {
|
||||
if (_tm->tablets().balancing_enabled() && plan.empty() && !ongoing_rack_list_colocation()) {
|
||||
auto dc_merge_plan = co_await make_merge_colocation_plan(dc, nodes);
|
||||
auto level = dc_merge_plan.tablet_migration_count() > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
lblogger.log(level, "Prepared {} migrations for co-locating sibling tablets in DC {}", dc_merge_plan.tablet_migration_count(), dc);
|
||||
@@ -3264,9 +3523,11 @@ class tablet_allocator_impl : public tablet_allocator::impl
|
||||
locator::load_stats_ptr _load_stats;
|
||||
private:
|
||||
load_balancer make_load_balancer(token_metadata_ptr tm,
|
||||
service::topology* topology,
|
||||
db::system_keyspace* sys_ks,
|
||||
locator::load_stats_ptr table_load_stats,
|
||||
std::unordered_set<host_id> skiplist) {
|
||||
load_balancer lb(_db, tm, std::move(table_load_stats), _load_balancer_stats,
|
||||
load_balancer lb(_db, tm, topology, sys_ks, std::move(table_load_stats), _load_balancer_stats,
|
||||
_db.get_config().target_tablet_size_in_bytes(),
|
||||
_db.get_config().tablets_per_shard_goal(),
|
||||
std::move(skiplist));
|
||||
@@ -3293,8 +3554,8 @@ public:
|
||||
_stopped = true;
|
||||
}
|
||||
|
||||
future<migration_plan> balance_tablets(token_metadata_ptr tm, locator::load_stats_ptr table_load_stats, std::unordered_set<host_id> skiplist) {
|
||||
auto lb = make_load_balancer(tm, table_load_stats ? table_load_stats : _load_stats, std::move(skiplist));
|
||||
future<migration_plan> balance_tablets(token_metadata_ptr tm, service::topology* topology, db::system_keyspace* sys_ks, locator::load_stats_ptr table_load_stats, std::unordered_set<host_id> skiplist) {
|
||||
auto lb = make_load_balancer(tm, topology, sys_ks, table_load_stats ? table_load_stats : _load_stats, std::move(skiplist));
|
||||
co_await coroutine::switch_to(_db.get_streaming_scheduling_group());
|
||||
co_return co_await lb.make_plan();
|
||||
}
|
||||
@@ -3314,7 +3575,7 @@ public:
|
||||
// Allocates new tablets for a table which is not co-located with another table.
|
||||
tablet_map allocate_tablets_for_new_base_table(const tablet_aware_replication_strategy* tablet_rs, const schema& s) {
|
||||
auto tm = _db.get_shared_token_metadata().get();
|
||||
auto lb = make_load_balancer(tm, nullptr, {});
|
||||
auto lb = make_load_balancer(tm, nullptr, nullptr, nullptr, {});
|
||||
auto plan = lb.make_sizing_plan(s.shared_from_this(), tablet_rs).get();
|
||||
auto& table_plan = plan.tables[s.id()];
|
||||
if (table_plan.target_tablet_count_aligned != table_plan.target_tablet_count) {
|
||||
@@ -3328,6 +3589,7 @@ public:
|
||||
|
||||
// Allocate tablets for multiple new tables, which may be co-located with each other, or co-located with an existing base table.
|
||||
void allocate_tablets_for_new_tables(const keyspace_metadata& ksm, const std::vector<schema_ptr>& cfms, utils::chunked_vector<mutation>& muts, api::timestamp_type ts) {
|
||||
utils::get_local_injector().inject("pause_in_allocate_tablets_for_new_table", utils::wait_for_message(std::chrono::minutes(5))).get();
|
||||
locator::replication_strategy_params params(ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option());
|
||||
auto tm = _db.get_shared_token_metadata().get();
|
||||
auto rs = abstract_replication_strategy::create_replication_strategy(ksm.strategy_name(), params, tm->get_topology());
|
||||
@@ -3369,7 +3631,7 @@ public:
|
||||
if (s.id() != base_id) {
|
||||
lblogger.debug("Creating tablets for {}.{} id={} with base={}", s.ks_name(), s.cf_name(), s.id(), base_id);
|
||||
muts.emplace_back(colocated_tablet_map_to_mutation(s.id(), s.ks_name(), s.cf_name(), base_id, ts));
|
||||
_db.get_notifier().before_allocate_tablet_map(base_map, s, muts, ts);
|
||||
_db.get_notifier().before_allocate_tablet_map_in_notification(base_map, s, muts, ts);
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -3385,7 +3647,7 @@ public:
|
||||
muts.emplace_back(std::move(m));
|
||||
return make_ready_future<>();
|
||||
}).get();
|
||||
_db.get_notifier().before_allocate_tablet_map(base_map, s, muts, ts);
|
||||
_db.get_notifier().before_allocate_tablet_map_in_notification(base_map, s, muts, ts);
|
||||
|
||||
create_colocated_tablet_maps(base_map);
|
||||
}
|
||||
@@ -3534,8 +3796,8 @@ future<> tablet_allocator::stop() {
|
||||
return impl().stop();
|
||||
}
|
||||
|
||||
future<migration_plan> tablet_allocator::balance_tablets(locator::token_metadata_ptr tm, locator::load_stats_ptr load_stats, std::unordered_set<host_id> skiplist) {
|
||||
return impl().balance_tablets(std::move(tm), std::move(load_stats), std::move(skiplist));
|
||||
future<migration_plan> tablet_allocator::balance_tablets(locator::token_metadata_ptr tm, service::topology* topology, db::system_keyspace* sys_ks, locator::load_stats_ptr load_stats, std::unordered_set<host_id> skiplist) {
|
||||
return impl().balance_tablets(std::move(tm), topology, sys_ks, std::move(load_stats), std::move(skiplist));
|
||||
}
|
||||
|
||||
void tablet_allocator::set_load_stats(locator::load_stats_ptr load_stats) {
|
||||
|
||||
@@ -14,8 +14,14 @@
|
||||
#include "locator/token_metadata_fwd.hh"
|
||||
#include <seastar/core/metrics.hh>
|
||||
|
||||
namespace db {
|
||||
class system_keyspace;
|
||||
}
|
||||
|
||||
namespace service {
|
||||
|
||||
class topology;
|
||||
|
||||
struct load_balancer_dc_stats {
|
||||
uint64_t calls = 0;
|
||||
uint64_t migrations_produced = 0;
|
||||
@@ -133,6 +139,26 @@ struct tablet_repair_plan {
|
||||
}
|
||||
};
|
||||
|
||||
struct tablet_rack_list_colocation_plan {
|
||||
utils::UUID _request_to_resume;
|
||||
|
||||
const utils::UUID& request_to_resume() const noexcept {
|
||||
return _request_to_resume;
|
||||
}
|
||||
|
||||
size_t size() const { return _request_to_resume ? 1 : 0; };
|
||||
|
||||
void merge(tablet_rack_list_colocation_plan&& other) {
|
||||
_request_to_resume = _request_to_resume ? _request_to_resume : other._request_to_resume;
|
||||
}
|
||||
|
||||
void maybe_add_request_to_resume(const utils::UUID& id) {
|
||||
if (!_request_to_resume) {
|
||||
_request_to_resume = id;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class migration_plan {
|
||||
public:
|
||||
using migrations_vector = utils::chunked_vector<tablet_migration_info>;
|
||||
@@ -140,17 +166,19 @@ private:
|
||||
migrations_vector _migrations;
|
||||
table_resize_plan _resize_plan;
|
||||
tablet_repair_plan _repair_plan;
|
||||
tablet_rack_list_colocation_plan _rack_list_colocation_plan;
|
||||
bool _has_nodes_to_drain = false;
|
||||
public:
|
||||
/// Returns true iff there are decommissioning nodes which own some tablet replicas.
|
||||
bool has_nodes_to_drain() const { return _has_nodes_to_drain; }
|
||||
|
||||
const migrations_vector& migrations() const { return _migrations; }
|
||||
bool empty() const { return _migrations.empty() && !_resize_plan.size() && !_repair_plan.size();}
|
||||
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size(); }
|
||||
bool empty() const { return _migrations.empty() && !_resize_plan.size() && !_repair_plan.size() && !_rack_list_colocation_plan.size(); }
|
||||
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size() + _rack_list_colocation_plan.size(); }
|
||||
size_t tablet_migration_count() const { return _migrations.size(); }
|
||||
size_t resize_decision_count() const { return _resize_plan.size(); }
|
||||
size_t tablet_repair_count() const { return _repair_plan.size(); }
|
||||
size_t tablet_rack_list_colocation_count() const { return _rack_list_colocation_plan.size(); }
|
||||
|
||||
void add(tablet_migration_info info) {
|
||||
_migrations.emplace_back(std::move(info));
|
||||
@@ -167,6 +195,7 @@ public:
|
||||
_has_nodes_to_drain |= other._has_nodes_to_drain;
|
||||
_resize_plan.merge(std::move(other._resize_plan));
|
||||
_repair_plan.merge(std::move(other._repair_plan));
|
||||
_rack_list_colocation_plan.merge(std::move(other._rack_list_colocation_plan));
|
||||
}
|
||||
|
||||
void set_has_nodes_to_drain(bool b) {
|
||||
@@ -185,6 +214,12 @@ public:
|
||||
_repair_plan = std::move(repair);
|
||||
}
|
||||
|
||||
const tablet_rack_list_colocation_plan& rack_list_colocation_plan() const { return _rack_list_colocation_plan; }
|
||||
|
||||
void set_rack_list_colocation_plan(tablet_rack_list_colocation_plan rack_list_colocation_plan) {
|
||||
_rack_list_colocation_plan = std::move(rack_list_colocation_plan);
|
||||
}
|
||||
|
||||
future<std::unordered_set<locator::global_tablet_id>> get_migration_tablet_ids() const;
|
||||
};
|
||||
|
||||
@@ -230,7 +265,7 @@ public:
|
||||
///
|
||||
/// The algorithm takes care of limiting the streaming load on the system, also by taking active migrations into account.
|
||||
///
|
||||
future<migration_plan> balance_tablets(locator::token_metadata_ptr, locator::load_stats_ptr = {}, std::unordered_set<locator::host_id> = {});
|
||||
future<migration_plan> balance_tablets(locator::token_metadata_ptr, service::topology*, db::system_keyspace*, locator::load_stats_ptr = {}, std::unordered_set<locator::host_id> = {});
|
||||
|
||||
void set_load_stats(locator::load_stats_ptr);
|
||||
|
||||
@@ -246,6 +281,12 @@ public:
|
||||
void on_leadership_lost();
|
||||
};
|
||||
|
||||
future<bool> requires_rack_list_colocation(
|
||||
replica::database& db,
|
||||
locator::token_metadata_ptr tmptr,
|
||||
db::system_keyspace* sys_ks,
|
||||
utils::UUID request_id);
|
||||
|
||||
}
|
||||
|
||||
template <>
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "service/migration_manager.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/task_manager_module.hh"
|
||||
#include "service/topology_state_machine.hh"
|
||||
#include "tasks/task_handler.hh"
|
||||
#include "tasks/virtual_task_hint.hh"
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
@@ -288,4 +289,116 @@ std::set<locator::host_id> task_manager_module::get_nodes() const {
|
||||
return get_task_manager().get_nodes(_ss);
|
||||
}
|
||||
|
||||
namespace topo {
|
||||
|
||||
static tasks::task_manager::task_state get_state(const db::system_keyspace::topology_requests_entry& entry) {
|
||||
if (!entry.id) {
|
||||
return tasks::task_manager::task_state::created;
|
||||
} else if (!entry.done) {
|
||||
return tasks::task_manager::task_state::running;
|
||||
} else if (entry.error == "") {
|
||||
return tasks::task_manager::task_state::done;
|
||||
} else {
|
||||
return tasks::task_manager::task_state::failed;
|
||||
}
|
||||
}
|
||||
|
||||
tasks::task_manager::task_group global_topology_request_virtual_task::get_group() const noexcept {
|
||||
return tasks::task_manager::task_group::global_topology_change_group;
|
||||
}
|
||||
|
||||
future<std::optional<tasks::virtual_task_hint>> global_topology_request_virtual_task::contains(tasks::task_id task_id) const {
|
||||
if (!task_id.uuid().is_timestamp()) {
|
||||
// Task id of node ops operation is always a timestamp.
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
auto hint = std::make_optional<tasks::virtual_task_hint>({});
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(task_id.uuid());
|
||||
if (entry.has_value() && std::holds_alternative<service::global_topology_request>(entry->request_type) &&
|
||||
std::get<service::global_topology_request>(entry->request_type) == global_topology_request::keyspace_rf_change) {
|
||||
co_return hint;
|
||||
}
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
future<tasks::is_abortable> global_topology_request_virtual_task::is_abortable(tasks::virtual_task_hint) const {
|
||||
return make_ready_future<tasks::is_abortable>(tasks::is_abortable::yes);
|
||||
}
|
||||
|
||||
static tasks::task_stats get_task_stats(const db::system_keyspace::topology_requests_entry& entry) {
|
||||
return tasks::task_stats{
|
||||
.task_id = tasks::task_id{entry.id},
|
||||
.type = fmt::to_string(entry.request_type),
|
||||
.kind = tasks::task_kind::cluster,
|
||||
.scope = "keyspace",
|
||||
.state = get_state(entry),
|
||||
.sequence_number = 0,
|
||||
.keyspace = entry.new_keyspace_rf_change_ks_name.value_or(""),
|
||||
.table = "",
|
||||
.entity = "",
|
||||
.shard = 0,
|
||||
.start_time = entry.start_time,
|
||||
.end_time = entry.end_time,
|
||||
};
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> global_topology_request_virtual_task::get_status(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry = co_await _ss._sys_ks.local().get_topology_request_entry_opt(id.uuid());
|
||||
if (!entry.has_value()) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
auto task_stats = get_task_stats(*entry);
|
||||
co_return tasks::task_status{
|
||||
.task_id = task_stats.task_id,
|
||||
.type = task_stats.type,
|
||||
.kind = task_stats.kind,
|
||||
.scope = task_stats.scope,
|
||||
.state = task_stats.state,
|
||||
.is_abortable = co_await is_abortable(std::move(hint)),
|
||||
.start_time = task_stats.start_time,
|
||||
.end_time = task_stats.end_time,
|
||||
.error = entry->error,
|
||||
.parent_id = tasks::task_id::create_null_id(),
|
||||
.sequence_number = task_stats.sequence_number,
|
||||
.shard = task_stats.shard,
|
||||
.keyspace = task_stats.keyspace,
|
||||
.table = task_stats.table,
|
||||
.entity = task_stats.entity,
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = utils::chunked_vector<tasks::task_identity>{},
|
||||
};
|
||||
}
|
||||
|
||||
future<std::optional<tasks::task_status>> global_topology_request_virtual_task::wait(tasks::task_id id, tasks::virtual_task_hint hint) {
|
||||
auto entry = co_await get_status(id, hint);
|
||||
if (!entry) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
co_await _ss.wait_for_topology_request_completion(id.uuid(), false);
|
||||
co_return co_await get_status(id, std::move(hint));
|
||||
}
|
||||
|
||||
future<> global_topology_request_virtual_task::abort(tasks::task_id id, tasks::virtual_task_hint) noexcept {
|
||||
return _ss.abort_paused_rf_change(id.uuid());
|
||||
}
|
||||
|
||||
future<std::vector<tasks::task_stats>> global_topology_request_virtual_task::get_stats() {
|
||||
db::system_keyspace& sys_ks = _ss._sys_ks.local();
|
||||
co_return std::ranges::to<std::vector<tasks::task_stats>>(co_await sys_ks.get_topology_request_entries({global_topology_request::keyspace_rf_change}, db_clock::now() - get_task_manager().get_user_task_ttl())
|
||||
| std::views::transform([] (const auto& e) {
|
||||
auto& entry = e.second;
|
||||
return get_task_stats(entry);
|
||||
}));
|
||||
}
|
||||
|
||||
task_manager_module::task_manager_module(tasks::task_manager& tm) noexcept
|
||||
: tasks::task_manager::module(tm, "global_topology_requests")
|
||||
{}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -54,4 +54,33 @@ public:
|
||||
|
||||
std::set<locator::host_id> get_nodes() const override;
|
||||
};
|
||||
|
||||
namespace topo {
|
||||
|
||||
class global_topology_request_virtual_task : public tasks::task_manager::virtual_task::impl {
|
||||
private:
|
||||
service::storage_service& _ss;
|
||||
public:
|
||||
global_topology_request_virtual_task(tasks::task_manager::module_ptr module,
|
||||
service::storage_service& ss)
|
||||
: tasks::task_manager::virtual_task::impl(std::move(module))
|
||||
, _ss(ss)
|
||||
{}
|
||||
virtual tasks::task_manager::task_group get_group() const noexcept override;
|
||||
virtual future<std::optional<tasks::virtual_task_hint>> contains(tasks::task_id task_id) const override;
|
||||
virtual future<tasks::is_abortable> is_abortable(tasks::virtual_task_hint hint) const override;
|
||||
|
||||
virtual future<std::optional<tasks::task_status>> get_status(tasks::task_id id, tasks::virtual_task_hint hint) override;
|
||||
virtual future<std::optional<tasks::task_status>> wait(tasks::task_id id, tasks::virtual_task_hint hint) override;
|
||||
virtual future<> abort(tasks::task_id id, tasks::virtual_task_hint hint) noexcept override;
|
||||
virtual future<std::vector<tasks::task_stats>> get_stats() override;
|
||||
};
|
||||
|
||||
class task_manager_module : public tasks::task_manager::module {
|
||||
public:
|
||||
task_manager_module(tasks::task_manager& tm) noexcept;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <fmt/ranges.h>
|
||||
|
||||
@@ -54,6 +55,7 @@
|
||||
#include "service/topology_state_machine.hh"
|
||||
#include "db/view/view_building_coordinator.hh"
|
||||
#include "topology_mutation.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/stall_free.hh"
|
||||
@@ -953,7 +955,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
} else {
|
||||
assert(_feature_service.topology_global_request_queue);
|
||||
req_id = _topo_sm._topology.global_requests_queue[0];
|
||||
req_entry = co_await _sys_ks.get_topology_request_entry(req_id, true);
|
||||
req_entry = co_await _sys_ks.get_topology_request_entry(req_id);
|
||||
req = std::get<global_topology_request>(req_entry.request_type);
|
||||
}
|
||||
switch (req) {
|
||||
@@ -997,6 +999,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
sstring error;
|
||||
bool needs_colocation = false;
|
||||
if (_db.has_keyspace(ks_name)) {
|
||||
try {
|
||||
auto& ks = _db.find_keyspace(ks_name);
|
||||
@@ -1004,12 +1007,40 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{saved_ks_props.begin(), saved_ks_props.end()}};
|
||||
new_ks_props.validate();
|
||||
auto ks_md = new_ks_props.as_ks_metadata_update(ks.metadata(), *tmptr, _db.features(), _db.get_config());
|
||||
_db.validate_keyspace_update(*ks_md);
|
||||
size_t unimportant_init_tablet_count = 2; // must be a power of 2
|
||||
locator::tablet_map new_tablet_map{unimportant_init_tablet_count};
|
||||
|
||||
auto schedule_migrations = [&] () -> future<> {
|
||||
auto tables_with_mvs = ks.metadata()->tables();
|
||||
auto views = ks.metadata()->views();
|
||||
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
|
||||
if (tables_with_mvs.empty()) {
|
||||
co_return;
|
||||
}
|
||||
auto table = tables_with_mvs.front();
|
||||
auto tablet_count = tmptr->tablets().get_tablet_map(table->id()).tablet_count();
|
||||
locator::replication_strategy_params params{ks_md->strategy_options(), tablet_count, ks.metadata()->consistency_option()};
|
||||
auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params, tmptr->get_topology());
|
||||
|
||||
auto check_needs_colocation = [&] () -> future<bool> {
|
||||
const auto& new_replication_strategy_config = new_strategy->get_config_options();
|
||||
const auto& old_replication_strategy_config = ks.metadata()->strategy_options();
|
||||
bool rack_list_conversion = false;
|
||||
for (const auto& [dc, rf_value] : new_replication_strategy_config) {
|
||||
if (std::holds_alternative<locator::rack_list>(rf_value)) {
|
||||
auto it = old_replication_strategy_config.find(dc);
|
||||
if (it != old_replication_strategy_config.end() && std::holds_alternative<sstring>(it->second)) {
|
||||
rack_list_conversion = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
co_return rack_list_conversion ? co_await requires_rack_list_colocation(_db, tmptr, &_sys_ks, req_id) : false;
|
||||
};
|
||||
if (needs_colocation = co_await check_needs_colocation(); needs_colocation) {
|
||||
co_return;
|
||||
}
|
||||
for (const auto& table_or_mv : tables_with_mvs) {
|
||||
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
|
||||
// Apply the transition only on base tables.
|
||||
@@ -1018,8 +1049,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
continue;
|
||||
}
|
||||
auto old_tablets = co_await tmptr->tablets().get_tablet_map(table_or_mv->id()).clone_gently();
|
||||
locator::replication_strategy_params params{ks_md->strategy_options(), old_tablets.tablet_count(), ks.metadata()->consistency_option()};
|
||||
auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params, tmptr->get_topology());
|
||||
new_tablet_map = co_await new_strategy->maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, co_await old_tablets.clone_gently());
|
||||
|
||||
replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
|
||||
@@ -1046,6 +1075,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
co_await coroutine::maybe_yield();
|
||||
});
|
||||
}
|
||||
};
|
||||
co_await schedule_migrations();
|
||||
|
||||
auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
|
||||
for (auto& m: schema_muts) {
|
||||
@@ -1061,16 +1092,22 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
error = "Can't ALTER keyspace " + ks_name + ", keyspace doesn't exist";
|
||||
}
|
||||
|
||||
updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
|
||||
.set_transition_state(topology::transition_state::tablet_migration)
|
||||
bool pause_request = needs_colocation && error.empty();
|
||||
topology_mutation_builder tbuilder(guard.write_timestamp());
|
||||
tbuilder.set_transition_state(topology::transition_state::tablet_migration)
|
||||
.set_version(_topo_sm._topology.version + 1)
|
||||
.del_global_topology_request()
|
||||
.del_global_topology_request_id()
|
||||
.drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id)
|
||||
.build()));
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
|
||||
.drop_first_global_topology_request_id(_topo_sm._topology.global_requests_queue, req_id);
|
||||
if (pause_request) {
|
||||
rtlogger.info("keyspace_rf_change for keyspace {} postponed for colocation", ks_name);
|
||||
tbuilder.pause_rf_change_request(req_id);
|
||||
} else {
|
||||
updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_id)
|
||||
.done(error)
|
||||
.build()));
|
||||
}
|
||||
updates.push_back(canonical_mutation(tbuilder.build()));
|
||||
|
||||
sstring reason = seastar::format("ALTER tablets KEYSPACE called with options: {}", saved_ks_props);
|
||||
rtlogger.trace("do update {} reason {}", updates, reason);
|
||||
@@ -1334,6 +1371,14 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
.build());
|
||||
}
|
||||
|
||||
void generate_rf_change_resume_update(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, utils::UUID request_to_resume) {
|
||||
rtlogger.debug("Generating RF change resume for request id {}", request_to_resume);
|
||||
out.emplace_back(topology_mutation_builder(guard.write_timestamp())
|
||||
.queue_global_topology_request_id(request_to_resume)
|
||||
.resume_rf_change_request(_topo_sm._topology.paused_rf_change_requests, request_to_resume)
|
||||
.build());
|
||||
}
|
||||
|
||||
future<> generate_migration_updates(utils::chunked_vector<canonical_mutation>& out, const group0_guard& guard, const migration_plan& plan) {
|
||||
if (plan.resize_plan().finalize_resize.empty() || plan.has_nodes_to_drain()) {
|
||||
// schedule tablet migration only if there are no pending resize finalisations or if the node is draining.
|
||||
@@ -1341,6 +1386,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
co_await coroutine::maybe_yield();
|
||||
generate_migration_update(out, guard, mig);
|
||||
}
|
||||
|
||||
if (auto request_to_resume = plan.rack_list_colocation_plan().request_to_resume(); request_to_resume) {
|
||||
generate_rf_change_resume_update(out, guard, request_to_resume);
|
||||
}
|
||||
}
|
||||
|
||||
auto sched_time = db_clock::now();
|
||||
@@ -1831,7 +1880,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
bool has_nodes_to_drain = false;
|
||||
if (!preempt) {
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), {}, get_dead_nodes());
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
has_nodes_to_drain = plan.has_nodes_to_drain();
|
||||
if (!drain || plan.has_nodes_to_drain()) {
|
||||
co_await generate_migration_updates(updates, guard, plan);
|
||||
@@ -1954,7 +2003,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
co_await utils::get_local_injector().inject("tablet_resize_finalization_post_barrier", utils::wait_for_message(std::chrono::minutes(2)));
|
||||
|
||||
auto tm = get_token_metadata_ptr();
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, {}, get_dead_nodes());
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
updates.reserve(plan.resize_plan().finalize_resize.size() * 2 + 1);
|
||||
@@ -2034,7 +2083,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
// We should perform TRUNCATE only if the session is still valid. It could be cleared if a previous truncate
|
||||
// handler performed the truncate and cleared the session, but crashed before finalizing the request
|
||||
if (_topo_sm._topology.session) {
|
||||
const auto topology_requests_entry = co_await _sys_ks.get_topology_request_entry(global_request_id, true);
|
||||
const auto topology_requests_entry = co_await _sys_ks.get_topology_request_entry(global_request_id);
|
||||
const table_id& table_id = topology_requests_entry.truncate_table_id;
|
||||
lw_shared_ptr<replica::table> table = _db.get_tables_metadata().get_table_if_exists(table_id);
|
||||
|
||||
@@ -2623,6 +2672,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
while (utils::get_local_injector().enter("topology_coordinator_pause_after_streaming")) {
|
||||
co_await sleep_abortable(std::chrono::milliseconds(10), _as);
|
||||
}
|
||||
const bool removenode_with_left_token_ring = _feature_service.removenode_with_left_token_ring;
|
||||
auto node = get_node_to_work_on(std::move(guard));
|
||||
bool barrier_failed = false;
|
||||
// In this state writes goes to old and new replicas but reads start to be done from new replicas
|
||||
@@ -2677,7 +2727,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
break;
|
||||
case node_state::removing: {
|
||||
co_await utils::get_local_injector().inject("delay_node_removal", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);
|
||||
if (!removenode_with_left_token_ring) {
|
||||
node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);
|
||||
}
|
||||
}
|
||||
[[fallthrough]];
|
||||
case node_state::decommissioning: {
|
||||
@@ -2685,7 +2737,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
node_state next_state;
|
||||
utils::chunked_vector<canonical_mutation> muts;
|
||||
muts.reserve(2);
|
||||
if (node.rs->state == node_state::decommissioning) {
|
||||
if (removenode_with_left_token_ring || node.rs->state == node_state::decommissioning) {
|
||||
// Both decommission and removenode go through left_token_ring state
|
||||
// to ensure a global barrier is executed before the request is marked as done.
|
||||
// This ensures all nodes have observed the topology change.
|
||||
next_state = node.rs->state;
|
||||
builder.set_transition_state(topology::transition_state::left_token_ring);
|
||||
} else {
|
||||
@@ -2760,6 +2815,16 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
case topology::transition_state::left_token_ring: {
|
||||
auto node = get_node_to_work_on(std::move(guard));
|
||||
|
||||
// Need to be captured as the node variable might become invalid (e.g. moved out) at particular points.
|
||||
const auto node_rs_state = node.rs->state;
|
||||
|
||||
const bool is_removenode = node_rs_state == node_state::removing;
|
||||
|
||||
if (is_removenode && !_feature_service.removenode_with_left_token_ring) {
|
||||
on_internal_error(
|
||||
rtlogger, "removenode operation can only enter the left_token_ring state when REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled");
|
||||
}
|
||||
|
||||
auto finish_left_token_ring_transition = [&](node_to_work_on& node) -> future<> {
|
||||
// Remove the node from group0 here - in general, it won't be able to leave on its own
|
||||
// because we'll ban it as soon as we tell it to shut down.
|
||||
@@ -2779,9 +2844,16 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
muts.push_back(builder.build());
|
||||
co_await remove_view_build_statuses_on_left_node(muts, node.guard, node.id);
|
||||
co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(node.id.uuid()), muts);
|
||||
auto str = node.rs->state == node_state::decommissioning
|
||||
? ::format("finished decommissioning node {}", node.id)
|
||||
: ::format("finished rollback of {} after {} failure", node.id, node.rs->state);
|
||||
auto str = std::invoke([&]() {
|
||||
switch (node_rs_state) {
|
||||
case node_state::decommissioning:
|
||||
return ::format("finished decommissioning node {}", node.id);
|
||||
case node_state::removing:
|
||||
return ::format("finished removing node {}", node.id);
|
||||
default:
|
||||
return ::format("finished rollback of {} after {} failure", node.id, node.rs->state);
|
||||
}
|
||||
});
|
||||
co_await update_topology_state(take_guard(std::move(node)), std::move(muts), std::move(str));
|
||||
};
|
||||
|
||||
@@ -2794,6 +2866,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
}
|
||||
|
||||
if (node.id == _raft.id()) {
|
||||
// Removed node must be dead, so it shouldn't enter here (it can't coordinate its own removal).
|
||||
if (is_removenode) {
|
||||
on_internal_error(rtlogger, "removenode operation cannot be coordinated by the removed node itself");
|
||||
}
|
||||
|
||||
// Someone else needs to coordinate the rest of the decommission process,
|
||||
// because the decommissioning node is going to shut down in the middle of this state.
|
||||
rtlogger.info("coordinator is decommissioning; giving up leadership");
|
||||
@@ -2807,8 +2884,13 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
bool barrier_failed = false;
|
||||
// Wait until other nodes observe the new token ring and stop sending writes to this node.
|
||||
auto excluded_nodes = get_excluded_nodes_for_topology_request(node);
|
||||
try {
|
||||
node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), get_excluded_nodes_for_topology_request(node)), node.id);
|
||||
// Removed node is added to ignored nodes, so it should be automatically excluded.
|
||||
if (is_removenode && !excluded_nodes.contains(node.id)) {
|
||||
on_internal_error(rtlogger, "removenode operation must have the removed node in excluded_nodes");
|
||||
}
|
||||
node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), std::move(excluded_nodes)), node.id);
|
||||
} catch (term_changed_error&) {
|
||||
throw;
|
||||
} catch (group0_concurrent_modification&) {
|
||||
@@ -2825,15 +2907,17 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
}
|
||||
|
||||
if (barrier_failed) {
|
||||
// If barrier above failed it means there may be unfinished writes to a decommissioned node.
|
||||
// If barrier above failed it means there may be unfinished writes to a decommissioned node,
|
||||
// or some nodes might not have observed the new topology yet (one purpose of the barrier
|
||||
// is to make sure all nodes observed the new topology before completing the request).
|
||||
// Lets wait for the ring delay for those writes to complete and new topology to propagate
|
||||
// before continuing.
|
||||
co_await sleep_abortable(_ring_delay, _as);
|
||||
node = retake_node(co_await start_operation(), node.id);
|
||||
}
|
||||
|
||||
// Make decommissioning node a non voter before reporting operation completion below.
|
||||
// Otherwise the decommissioned node may see the completion and exit before it is removed from
|
||||
// Make decommissioning/removed node a non voter before reporting operation completion below.
|
||||
// Otherwise the node may see the completion and exit before it is removed from
|
||||
// the config at which point the removal from the config will hang if the cluster had only two
|
||||
// nodes before the decommission.
|
||||
co_await _voter_handler.on_node_removed(node.id, _as);
|
||||
@@ -2844,7 +2928,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
|
||||
co_await update_topology_state(take_guard(std::move(node)), {rtbuilder.build()}, "report request completion in left_token_ring state");
|
||||
|
||||
// Tell the node to shut down.
|
||||
// For decommission/rollback: Tell the node to shut down.
|
||||
// This is done to improve user experience when there are no failures.
|
||||
// In the next state (`node_state::left`), the node will be banned by the rest of the cluster,
|
||||
// so there's no guarantee that it would learn about entering that state even if it was still
|
||||
@@ -2853,15 +2937,19 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
|
||||
// There is the possibility that the node will never get the message
|
||||
// and decommission will hang on that node.
|
||||
// This is fine for the rest of the cluster - we will still remove, ban the node and continue.
|
||||
//
|
||||
// For removenode: The node is already dead, no need to send shutdown command.
|
||||
auto node_id = node.id;
|
||||
bool shutdown_failed = false;
|
||||
try {
|
||||
node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::barrier);
|
||||
} catch (...) {
|
||||
rtlogger.warn("failed to tell node {} to shut down - it may hang."
|
||||
" It's safe to shut it down manually now. (Exception: {})",
|
||||
node.id, std::current_exception());
|
||||
shutdown_failed = true;
|
||||
if (!is_removenode) {
|
||||
try {
|
||||
node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::barrier);
|
||||
} catch (...) {
|
||||
rtlogger.warn("failed to tell node {} to shut down - it may hang."
|
||||
" It's safe to shut it down manually now. (Exception: {})",
|
||||
node.id, std::current_exception());
|
||||
shutdown_failed = true;
|
||||
}
|
||||
}
|
||||
if (shutdown_failed) {
|
||||
node = retake_node(co_await start_operation(), node_id);
|
||||
@@ -3458,7 +3546,7 @@ future<bool> topology_coordinator::maybe_start_tablet_migration(group0_guard gua
|
||||
}
|
||||
|
||||
auto tm = get_token_metadata_ptr();
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, {}, get_dead_nodes());
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
if (plan.empty()) {
|
||||
rtlogger.debug("Tablet load balancer did not make any plan");
|
||||
co_return false;
|
||||
|
||||
@@ -256,6 +256,20 @@ topology_mutation_builder& topology_mutation_builder::drop_first_global_topology
|
||||
}
|
||||
}
|
||||
|
||||
topology_mutation_builder& topology_mutation_builder::pause_rf_change_request(const utils::UUID& id) {
|
||||
return apply_set("paused_rf_change_requests", collection_apply_mode::update, std::vector<data_value>{id});
|
||||
}
|
||||
|
||||
topology_mutation_builder& topology_mutation_builder::resume_rf_change_request(const std::unordered_set<utils::UUID>& values, const utils::UUID& id) {
|
||||
if (values.contains(id)) {
|
||||
auto new_values = values;
|
||||
new_values.erase(id);
|
||||
return apply_set("paused_rf_change_requests", collection_apply_mode::overwrite, new_values | std::views::transform([] (const auto& id) { return data_value{id}; }));
|
||||
} else {
|
||||
return *this;
|
||||
}
|
||||
}
|
||||
|
||||
topology_mutation_builder& topology_mutation_builder::set_upgrade_state(topology::upgrade_state_type value) {
|
||||
return apply_atomic("upgrade_state", ::format("{}", value));
|
||||
}
|
||||
|
||||
@@ -129,6 +129,8 @@ public:
|
||||
topology_mutation_builder& del_global_topology_request_id();
|
||||
topology_mutation_builder& queue_global_topology_request_id(const utils::UUID& value);
|
||||
topology_mutation_builder& drop_first_global_topology_request_id(const std::vector<utils::UUID>&, const utils::UUID&);
|
||||
topology_mutation_builder& pause_rf_change_request(const utils::UUID&);
|
||||
topology_mutation_builder& resume_rf_change_request(const std::unordered_set<utils::UUID>&, const utils::UUID&);
|
||||
topology_node_mutation_builder& with_node(raft::server_id);
|
||||
canonical_mutation build() { return canonical_mutation{std::move(_m)}; }
|
||||
};
|
||||
|
||||
@@ -180,6 +180,10 @@ struct topology {
|
||||
// The KS options to be used when executing the scheduled ALTER KS statement
|
||||
std::optional<std::unordered_map<sstring, sstring>> new_keyspace_rf_change_data;
|
||||
|
||||
// The ids of RF change requests that are paused because they require tablet co-location.
|
||||
// It may happen during altering from numerical RF to rack list.
|
||||
std::unordered_set<utils::UUID> paused_rf_change_requests;
|
||||
|
||||
// The IDs of the committed yet unpublished CDC generations sorted by timestamps.
|
||||
std::vector<cdc::generation_id_v2> unpublished_cdc_generations;
|
||||
|
||||
|
||||
@@ -27,7 +27,6 @@ enum class component_type {
|
||||
TemporaryTOC,
|
||||
TemporaryStatistics,
|
||||
Scylla,
|
||||
TemporaryScylla,
|
||||
Rows,
|
||||
Partitions,
|
||||
TemporaryHashes,
|
||||
@@ -77,8 +76,6 @@ struct fmt::formatter<sstables::component_type> : fmt::formatter<string_view> {
|
||||
return formatter<string_view>::format("TemporaryStatistics", ctx);
|
||||
case Scylla:
|
||||
return formatter<string_view>::format("Scylla", ctx);
|
||||
case TemporaryScylla:
|
||||
return formatter<string_view>::format("TemporaryScylla", ctx);
|
||||
case Partitions:
|
||||
return formatter<string_view>::format("Partitions", ctx);
|
||||
case Rows:
|
||||
|
||||
@@ -632,10 +632,6 @@ private:
|
||||
std::unique_ptr<file_writer> close_writer(std::unique_ptr<file_writer>& w);
|
||||
|
||||
void close_data_writer();
|
||||
void close_index_writer();
|
||||
void close_rows_writer();
|
||||
void close_partitions_writer();
|
||||
|
||||
void ensure_tombstone_is_written() {
|
||||
if (!_tombstone_written) {
|
||||
consume(tombstone());
|
||||
@@ -948,16 +944,17 @@ void writer::init_file_writers() {
|
||||
_sst._schema->get_compressor_params(),
|
||||
std::move(compressor)), _sst.get_filename());
|
||||
}
|
||||
|
||||
if (_sst.has_component(component_type::Index)) {
|
||||
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Index).get();
|
||||
_index_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, _sst.index_filename());
|
||||
_index_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), _sst.index_filename());
|
||||
}
|
||||
if (_sst.has_component(component_type::Partitions) && _sst.has_component(component_type::Rows)) {
|
||||
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Rows).get();
|
||||
_rows_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, component_name(_sst, component_type::Rows));
|
||||
_rows_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), component_name(_sst, component_type::Rows));
|
||||
_bti_row_index_writer = trie::bti_row_index_writer(*_rows_writer);
|
||||
out = _sst._storage->make_data_or_index_sink(_sst, component_type::Partitions).get();
|
||||
_partitions_writer = std::make_unique<crc32_digest_file_writer>(std::move(out), _sst.sstable_buffer_size, component_name(_sst, component_type::Partitions));
|
||||
_partitions_writer = std::make_unique<file_writer>(output_stream<char>(std::move(out)), component_name(_sst, component_type::Partitions));
|
||||
_bti_partition_index_writer = trie::bti_partition_index_writer(*_partitions_writer);
|
||||
}
|
||||
if (_delayed_filter) {
|
||||
@@ -985,41 +982,6 @@ void writer::close_data_writer() {
|
||||
}
|
||||
}
|
||||
|
||||
void writer::close_index_writer() {
|
||||
if (_index_writer) {
|
||||
auto writer = close_writer(_index_writer);
|
||||
auto chksum_wr = static_cast<crc32_digest_file_writer*>(writer.get());
|
||||
_sst.get_components_digests().index_digest = chksum_wr->full_checksum();
|
||||
}
|
||||
}
|
||||
|
||||
void writer::close_partitions_writer() {
|
||||
if (_partitions_writer) {
|
||||
_sst._partitions_db_footer = std::move(*_bti_partition_index_writer).finish(
|
||||
_sst.get_version(),
|
||||
_first_key.value(),
|
||||
_last_key.value());
|
||||
auto writer = close_writer(_partitions_writer);
|
||||
auto chksum_wr = static_cast<crc32_digest_file_writer*>(writer.get());
|
||||
_sst.get_components_digests().partitions_digest = chksum_wr->full_checksum();
|
||||
}
|
||||
}
|
||||
|
||||
void writer::close_rows_writer() {
|
||||
if (_rows_writer) {
|
||||
// Append some garbage padding to the file just to ensure that it's never empty.
|
||||
// (Otherwise it would be empty if the sstable contains only small partitions).
|
||||
// This is a hack to work around some bad interactions between zero-sized files
|
||||
// and object storage. (It seems that e.g. minio considers a zero-sized file
|
||||
// upload to be a no-op, which breaks some assumptions).
|
||||
uint32_t garbage = seastar::cpu_to_be(0x13371337);
|
||||
_rows_writer->write(reinterpret_cast<const char*>(&garbage), sizeof(garbage));
|
||||
auto writer = close_writer(_rows_writer);
|
||||
auto chksum_wr = static_cast<crc32_digest_file_writer*>(writer.get());
|
||||
_sst.get_components_digests().rows_digest = chksum_wr->full_checksum();
|
||||
}
|
||||
}
|
||||
|
||||
void writer::consume_new_partition(const dht::decorated_key& dk) {
|
||||
_c_stats.start_offset = _data_writer->offset();
|
||||
_prev_row_start = _data_writer->offset();
|
||||
@@ -1668,10 +1630,27 @@ void writer::consume_end_of_stream() {
|
||||
_collector.add_compression_ratio(_sst._components->compression.compressed_file_length(), _sst._components->compression.uncompressed_file_length());
|
||||
}
|
||||
|
||||
close_index_writer();
|
||||
if (_index_writer) {
|
||||
close_writer(_index_writer);
|
||||
}
|
||||
|
||||
close_partitions_writer();
|
||||
close_rows_writer();
|
||||
if (_partitions_writer) {
|
||||
_sst._partitions_db_footer = std::move(*_bti_partition_index_writer).finish(
|
||||
_sst.get_version(),
|
||||
_first_key.value(),
|
||||
_last_key.value());
|
||||
close_writer(_partitions_writer);
|
||||
}
|
||||
if (_rows_writer) {
|
||||
// Append some garbage padding to the file just to ensure that it's never empty.
|
||||
// (Otherwise it would be empty if the sstable contains only small partitions).
|
||||
// This is a hack to work around some bad interactions between zero-sized files
|
||||
// and object storage. (It seems that e.g. minio considers a zero-sized file
|
||||
// upload to be a no-op, which breaks some assumptions).
|
||||
uint32_t garbage = seastar::cpu_to_be(0x13371337);
|
||||
_rows_writer->write(reinterpret_cast<const char*>(&garbage), sizeof(garbage));
|
||||
close_writer(_rows_writer);
|
||||
}
|
||||
|
||||
if (_hashes_writer) {
|
||||
close_writer(_hashes_writer);
|
||||
|
||||
@@ -44,7 +44,6 @@ sstable_version_constants::component_map_t sstable_version_constants::create_com
|
||||
{ component_type::Filter, "Filter.db" },
|
||||
{ component_type::Statistics, "Statistics.db" },
|
||||
{ component_type::Scylla, "Scylla.db" },
|
||||
{ component_type::TemporaryScylla, "Scylla.db.tmp" },
|
||||
{ component_type::TemporaryTOC, TEMPORARY_TOC_SUFFIX },
|
||||
{ component_type::TemporaryStatistics, "Statistics.db.tmp" }
|
||||
};
|
||||
|
||||
@@ -956,22 +956,16 @@ future<file_writer> sstable::make_component_file_writer(component_type c, file_o
|
||||
});
|
||||
}
|
||||
|
||||
future<std::unique_ptr<crc32_digest_file_writer>> sstable::make_digests_component_file_writer(component_type c, file_output_stream_options options, open_flags oflags) noexcept {
|
||||
return _storage->make_component_sink(*this, c, oflags, std::move(options)).then([this, comp = component_name(*this, c)] (data_sink sink) mutable {
|
||||
return std::make_unique<crc32_digest_file_writer>(std::move(sink), sstable_buffer_size, comp);
|
||||
});
|
||||
}
|
||||
|
||||
void sstable::open_sstable(const sstring& origin) {
|
||||
_origin = origin;
|
||||
generate_toc();
|
||||
_storage->open(*this);
|
||||
}
|
||||
|
||||
void sstable::write_toc(std::unique_ptr<crc32_digest_file_writer> w) {
|
||||
void sstable::write_toc(file_writer w) {
|
||||
sstlog.debug("Writing TOC file {} ", toc_filename());
|
||||
|
||||
do_write_simple(*w, [&] (version_types v, file_writer& w) {
|
||||
do_write_simple(std::move(w), [&] (version_types v, file_writer& w) {
|
||||
for (auto&& key : _recognized_components) {
|
||||
// new line character is appended to the end of each component name.
|
||||
auto value = sstable_version_constants::get_component_map(v).at(key) + "\n";
|
||||
@@ -979,8 +973,6 @@ void sstable::write_toc(std::unique_ptr<crc32_digest_file_writer> w) {
|
||||
write(v, w, b);
|
||||
}
|
||||
});
|
||||
|
||||
_components_digests.toc_digest = w->full_checksum();
|
||||
}
|
||||
|
||||
void sstable::write_crc(const checksum& c) {
|
||||
@@ -997,7 +989,6 @@ void sstable::write_digest(uint32_t full_checksum) {
|
||||
auto digest = to_sstring<bytes>(full_checksum);
|
||||
write(v, w, digest);
|
||||
}, buffer_size);
|
||||
_components_digests.data_digest = full_checksum;
|
||||
}
|
||||
|
||||
thread_local std::array<std::vector<int>, downsampling::BASE_SAMPLING_LEVEL> downsampling::_sample_pattern_cache;
|
||||
@@ -1054,7 +1045,7 @@ future<> sstable::read_simple(T& component) {
|
||||
});
|
||||
}
|
||||
|
||||
void sstable::do_write_simple(file_writer& writer,
|
||||
void sstable::do_write_simple(file_writer&& writer,
|
||||
noncopyable_function<void (version_types, file_writer&)> write_component) {
|
||||
write_component(_version, writer);
|
||||
_metadata_size_on_disk += writer.offset();
|
||||
@@ -1069,7 +1060,7 @@ void sstable::do_write_simple(component_type type,
|
||||
file_output_stream_options options;
|
||||
options.buffer_size = buffer_size;
|
||||
auto w = make_component_file_writer(type, std::move(options)).get();
|
||||
do_write_simple(w, std::move(write_component));
|
||||
do_write_simple(std::move(w), std::move(write_component));
|
||||
}
|
||||
|
||||
template <component_type Type, typename T>
|
||||
@@ -1079,30 +1070,10 @@ void sstable::write_simple(const T& component) {
|
||||
}, sstable_buffer_size);
|
||||
}
|
||||
|
||||
uint32_t sstable::do_write_simple_with_digest(component_type type,
|
||||
noncopyable_function<void (version_types version, file_writer& writer)> write_component, unsigned buffer_size) {
|
||||
auto file_path = filename(type);
|
||||
sstlog.debug("Writing {} file {}", sstable_version_constants::get_component_map(_version).at(type), file_path);
|
||||
|
||||
file_output_stream_options options;
|
||||
options.buffer_size = buffer_size;
|
||||
auto w = make_digests_component_file_writer(type, std::move(options)).get();
|
||||
do_write_simple(*w, std::move(write_component));
|
||||
return w->full_checksum();
|
||||
}
|
||||
|
||||
template <component_type Type, typename T>
|
||||
uint32_t sstable::write_simple_with_digest(const T& component) {
|
||||
return do_write_simple_with_digest(Type, [&component] (version_types v, file_writer& w) {
|
||||
write(v, w, component);
|
||||
}, sstable_buffer_size);
|
||||
}
|
||||
|
||||
template future<> sstable::read_simple<component_type::Filter>(sstables::filter& f);
|
||||
template void sstable::write_simple<component_type::Filter>(const sstables::filter& f);
|
||||
|
||||
template void sstable::write_simple<component_type::Summary>(const sstables::summary_ka&);
|
||||
template uint32_t sstable::write_simple_with_digest<component_type::Summary>(const sstables::summary_ka&);
|
||||
|
||||
future<> sstable::read_compression() {
|
||||
// FIXME: If there is no compression, we should expect a CRC file to be present.
|
||||
@@ -1121,8 +1092,7 @@ void sstable::write_compression() {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t digest = write_simple_with_digest<component_type::CompressionInfo>(_components->compression);
|
||||
_components_digests.compression_digest = digest;
|
||||
write_simple<component_type::CompressionInfo>(_components->compression);
|
||||
}
|
||||
|
||||
void sstable::validate_partitioner() {
|
||||
@@ -1347,8 +1317,7 @@ future<> sstable::read_partitions_db_footer() {
|
||||
}
|
||||
|
||||
void sstable::write_statistics() {
|
||||
auto digest = write_simple_with_digest<component_type::Statistics>(_components->statistics);
|
||||
_components_digests.statistics_digest = digest;
|
||||
write_simple<component_type::Statistics>(_components->statistics);
|
||||
}
|
||||
|
||||
void sstable::mark_as_being_repaired(const service::session_id& id) {
|
||||
@@ -1371,25 +1340,13 @@ int64_t sstable::update_repaired_at(int64_t repaired_at) {
|
||||
void sstable::rewrite_statistics() {
|
||||
sstlog.debug("Rewriting statistics component of sstable {}", get_filename());
|
||||
|
||||
auto lock = get_units(_mutate_sem, 1).get();
|
||||
file_output_stream_options options;
|
||||
options.buffer_size = sstable_buffer_size;
|
||||
auto w = make_digests_component_file_writer(component_type::TemporaryStatistics, std::move(options),
|
||||
auto w = make_component_file_writer(component_type::TemporaryStatistics, std::move(options),
|
||||
open_flags::wo | open_flags::create | open_flags::truncate).get();
|
||||
write(_version, *w, _components->statistics);
|
||||
w->close();
|
||||
|
||||
// When rewriting statistics, we also need to update the scylla component
|
||||
// because it contains the digest of the statistics component.
|
||||
if (has_scylla_component()) {
|
||||
_components_digests.statistics_digest = w->full_checksum();
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::ComponentsDigests>(components_digests{_components_digests});
|
||||
sstlog.debug("Rewriting scylla component of sstable {}", get_filename());
|
||||
write_simple<component_type::TemporaryScylla>(*_components->scylla_metadata);
|
||||
|
||||
// rename() guarantees atomicity when renaming a file into place.
|
||||
sstable_write_io_check(rename_file, fmt::to_string(filename(component_type::TemporaryScylla)), fmt::to_string(filename(component_type::Scylla))).get();
|
||||
}
|
||||
|
||||
write(_version, w, _components->statistics);
|
||||
w.close();
|
||||
// rename() guarantees atomicity when renaming a file into place.
|
||||
sstable_write_io_check(rename_file, fmt::to_string(filename(component_type::TemporaryStatistics)), fmt::to_string(filename(component_type::Statistics))).get();
|
||||
}
|
||||
@@ -1583,8 +1540,7 @@ void sstable::write_filter() {
|
||||
|
||||
auto&& bs = f->bits();
|
||||
auto filter_ref = sstables::filter_ref(f->num_hashes(), bs.get_storage());
|
||||
uint32_t digest = write_simple_with_digest<component_type::Filter>(filter_ref);
|
||||
_components_digests.filter_digest = digest;
|
||||
write_simple<component_type::Filter>(filter_ref);
|
||||
}
|
||||
|
||||
void sstable::maybe_rebuild_filter_from_index(uint64_t num_partitions) {
|
||||
@@ -2043,8 +1999,6 @@ sstable::read_scylla_metadata() noexcept {
|
||||
}
|
||||
return read_simple<component_type::Scylla>(*_components->scylla_metadata).then([this] {
|
||||
_features = _components->scylla_metadata->get_features();
|
||||
_components_digests = _components->scylla_metadata->get_components_digests();
|
||||
_components->digest = _components_digests.data_digest;
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -2134,7 +2088,6 @@ sstable::write_scylla_metadata(shard_id shard, struct run_identifier identifier,
|
||||
sstable_schema.columns.elements.push_back(sstable_column_description{to_sstable_column_kind(col.kind), {col.name()}, {to_bytes(col.type->name())}});
|
||||
}
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::Schema>(std::move(sstable_schema));
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::ComponentsDigests>(components_digests(_components_digests));
|
||||
|
||||
write_simple<component_type::Scylla>(*_components->scylla_metadata);
|
||||
}
|
||||
@@ -2536,15 +2489,19 @@ std::vector<std::pair<component_type, sstring>> sstable::all_components() const
|
||||
}
|
||||
|
||||
future<> sstable::snapshot(const sstring& dir) const {
|
||||
return _storage->snapshot(*this, dir, storage::absolute_path::yes);
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
co_await _storage->snapshot(*this, dir, storage::absolute_path::yes);
|
||||
}
|
||||
|
||||
future<> sstable::change_state(sstable_state to, delayed_commit_changes* delay_commit) {
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
co_await _storage->change_state(*this, to, _generation, delay_commit);
|
||||
_state = to;
|
||||
}
|
||||
|
||||
future<> sstable::pick_up_from_upload(sstable_state to, generation_type new_generation) {
|
||||
// just in case, not really needed as the sstable is not yet in use while in the upload dir
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
co_await _storage->change_state(*this, to, new_generation, nullptr);
|
||||
_generation = std::move(new_generation);
|
||||
_state = to;
|
||||
@@ -3118,31 +3075,6 @@ void sstable::set_sstable_level(uint32_t new_level) {
|
||||
s.sstable_level = new_level;
|
||||
}
|
||||
|
||||
std::optional<uint32_t> sstable::get_component_digest(component_type c) const {
|
||||
switch (c) {
|
||||
case component_type::Index:
|
||||
return _components_digests.index_digest;
|
||||
case component_type::Summary:
|
||||
return _components_digests.summary_digest;
|
||||
case component_type::TOC:
|
||||
return _components_digests.toc_digest;
|
||||
case component_type::CompressionInfo:
|
||||
return _components_digests.compression_digest;
|
||||
case component_type::Filter:
|
||||
return _components_digests.filter_digest;
|
||||
case component_type::Partitions:
|
||||
return _components_digests.partitions_digest;
|
||||
case component_type::Rows:
|
||||
return _components_digests.rows_digest;
|
||||
case component_type::Data:
|
||||
return _components_digests.data_digest;
|
||||
case component_type::Statistics:
|
||||
return _components_digests.statistics_digest;
|
||||
default:
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
future<> sstable::mutate_sstable_level(uint32_t new_level) {
|
||||
if (!has_component(component_type::Statistics)) {
|
||||
return make_ready_future<>();
|
||||
@@ -3479,6 +3411,9 @@ utils::hashed_key sstable::make_hashed_key(const schema& s, const partition_key&
|
||||
|
||||
future<>
|
||||
sstable::unlink(storage::sync_dir sync) noexcept {
|
||||
// Serialize with other calls to unlink or potentially ongoing mutations.
|
||||
auto lock = co_await get_units(_mutate_sem, 1);
|
||||
|
||||
_unlinked = true;
|
||||
_on_delete(*this);
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "sstables/writer.hh"
|
||||
#include "version.hh"
|
||||
#include "shared_sstable.hh"
|
||||
#include "open_info.hh"
|
||||
@@ -629,7 +628,9 @@ private:
|
||||
size_t _total_memory_reclaimed{0};
|
||||
bool _unlinked{false};
|
||||
|
||||
components_digests _components_digests;
|
||||
// The mutate semaphore is used to serialize operations like rewrite_statistics
|
||||
// with linking or moving the sstable between directories.
|
||||
mutable named_semaphore _mutate_sem{1, named_semaphore_exception_factory{"sstable mutate"}};
|
||||
public:
|
||||
bool has_component(component_type f) const;
|
||||
sstables_manager& manager() { return _manager; }
|
||||
@@ -650,18 +651,12 @@ private:
|
||||
|
||||
template <component_type Type, typename T>
|
||||
void write_simple(const T& comp);
|
||||
void do_write_simple(file_writer& writer,
|
||||
void do_write_simple(file_writer&& writer,
|
||||
noncopyable_function<void (version_types, file_writer&)> write_component);
|
||||
void do_write_simple(component_type type,
|
||||
noncopyable_function<void (version_types version, file_writer& writer)> write_component,
|
||||
unsigned buffer_size);
|
||||
|
||||
template <component_type Type, typename T>
|
||||
uint32_t write_simple_with_digest(const T& comp);
|
||||
uint32_t do_write_simple_with_digest(component_type type,
|
||||
noncopyable_function<void (version_types version, file_writer& writer)> write_component,
|
||||
unsigned buffer_size);
|
||||
|
||||
void write_crc(const checksum& c);
|
||||
void write_digest(uint32_t full_checksum);
|
||||
|
||||
@@ -672,9 +667,6 @@ private:
|
||||
future<file_writer> make_component_file_writer(component_type c, file_output_stream_options options,
|
||||
open_flags oflags = open_flags::wo | open_flags::create | open_flags::exclusive) noexcept;
|
||||
|
||||
future<std::unique_ptr<crc32_digest_file_writer>> make_digests_component_file_writer(component_type c, file_output_stream_options options,
|
||||
open_flags oflags = open_flags::wo | open_flags::create | open_flags::exclusive) noexcept;
|
||||
|
||||
void generate_toc();
|
||||
void open_sstable(const sstring& origin);
|
||||
|
||||
@@ -705,8 +697,7 @@ private:
|
||||
future<> read_summary() noexcept;
|
||||
|
||||
void write_summary() {
|
||||
uint32_t digest = write_simple_with_digest<component_type::Summary>(_components->summary);
|
||||
_components_digests.summary_digest = digest;
|
||||
write_simple<component_type::Summary>(_components->summary);
|
||||
}
|
||||
|
||||
// To be called when we try to load an SSTable that lacks a Summary. Could
|
||||
@@ -836,7 +827,7 @@ private:
|
||||
|
||||
future<> open_or_create_data(open_flags oflags, file_open_options options = {}) noexcept;
|
||||
// runs in async context (called from storage::open)
|
||||
void write_toc(std::unique_ptr<crc32_digest_file_writer> w);
|
||||
void write_toc(file_writer w);
|
||||
static future<uint32_t> read_digest_from_file(file f);
|
||||
static future<lw_shared_ptr<checksum>> read_checksum_from_file(file f);
|
||||
public:
|
||||
@@ -1026,12 +1017,6 @@ public:
|
||||
return _components->digest;
|
||||
}
|
||||
|
||||
components_digests& get_components_digests() {
|
||||
return _components_digests;
|
||||
}
|
||||
|
||||
std::optional<uint32_t> get_component_digest(component_type c) const;
|
||||
|
||||
// Gets ratio of droppable tombstone. A tombstone is considered droppable here
|
||||
// for cells and tombstones expired before the time point "GC before", which
|
||||
// is the point before which expiring data can be purged.
|
||||
|
||||
@@ -204,13 +204,13 @@ void filesystem_storage::open(sstable& sst) {
|
||||
open_flags::create |
|
||||
open_flags::exclusive,
|
||||
options).get();
|
||||
auto w = std::make_unique<crc32_digest_file_writer>(std::move(sink), sst.sstable_buffer_size, component_name(sst, component_type::TemporaryTOC));
|
||||
auto w = file_writer(output_stream<char>(std::move(sink)), component_name(sst, component_type::TemporaryTOC));
|
||||
|
||||
bool toc_exists = file_exists(fmt::to_string(sst.filename(component_type::TOC))).get();
|
||||
if (toc_exists) {
|
||||
// TOC will exist at this point if write_components() was called with
|
||||
// the generation of a sstable that exists.
|
||||
w->close();
|
||||
w.close();
|
||||
remove_file(fmt::to_string(sst.filename(component_type::TemporaryTOC))).get();
|
||||
throw std::runtime_error(format("SSTable write failed due to existence of TOC file for generation {} of {}.{}", sst._generation, sst._schema->ks_name(), sst._schema->cf_name()));
|
||||
}
|
||||
@@ -670,10 +670,15 @@ void object_storage_base::open(sstable& sst) {
|
||||
sst.manager().sstables_registry().create_entry(owner(), status_creating, sst._state, std::move(desc)).get();
|
||||
|
||||
memory_data_sink_buffers bufs;
|
||||
auto out = data_sink(std::make_unique<memory_data_sink>(bufs));
|
||||
auto w = std::make_unique<crc32_digest_file_writer>(std::move(out), sst.sstable_buffer_size, component_name(sst, component_type::TOC));
|
||||
|
||||
sst.write_toc(std::move(w));
|
||||
sst.write_toc(
|
||||
file_writer(
|
||||
output_stream<char>(
|
||||
data_sink(
|
||||
std::make_unique<memory_data_sink>(bufs)
|
||||
)
|
||||
)
|
||||
)
|
||||
);
|
||||
put_object(make_object_name(sst, component_type::TOC), std::move(bufs)).get();
|
||||
}
|
||||
|
||||
|
||||
@@ -547,7 +547,6 @@ enum class scylla_metadata_type : uint32_t {
|
||||
ExtTimestampStats = 9,
|
||||
SSTableIdentifier = 10,
|
||||
Schema = 11,
|
||||
ComponentsDigests = 12,
|
||||
};
|
||||
|
||||
// UUID is used for uniqueness across nodes, such that an imported sstable
|
||||
@@ -574,24 +573,6 @@ struct sstable_identifier_type {
|
||||
auto describe_type(sstable_version_types v, Describer f) { return f(value); }
|
||||
};
|
||||
|
||||
// Component digests stored in scylla metadata to track integrity of individual components
|
||||
struct components_digests {
|
||||
std::optional<uint32_t> data_digest;
|
||||
std::optional<uint32_t> compression_digest;
|
||||
std::optional<uint32_t> filter_digest;
|
||||
std::optional<uint32_t> statistics_digest;
|
||||
std::optional<uint32_t> summary_digest;
|
||||
std::optional<uint32_t> index_digest;
|
||||
std::optional<uint32_t> toc_digest;
|
||||
std::optional<uint32_t> partitions_digest;
|
||||
std::optional<uint32_t> rows_digest;
|
||||
|
||||
template <typename Describer>
|
||||
auto describe_type(sstable_version_types v, Describer f) {
|
||||
return f(data_digest,compression_digest, filter_digest, statistics_digest, summary_digest, index_digest, toc_digest, partitions_digest, rows_digest);
|
||||
}
|
||||
};
|
||||
|
||||
// Types of large data statistics.
|
||||
//
|
||||
// Note: For extensibility, never reuse an identifier,
|
||||
@@ -675,8 +656,7 @@ struct scylla_metadata {
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ScyllaVersion, scylla_version>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ExtTimestampStats, ext_timestamp_stats>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::SSTableIdentifier, sstable_identifier>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Schema, sstable_schema>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ComponentsDigests, components_digests>
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Schema, sstable_schema>
|
||||
> data;
|
||||
|
||||
sstable_enabled_features get_features() const {
|
||||
@@ -711,13 +691,6 @@ struct scylla_metadata {
|
||||
auto* sid = data.get<scylla_metadata_type::SSTableIdentifier, scylla_metadata::sstable_identifier>();
|
||||
return sid ? sid->value : sstable_id::create_null_id();
|
||||
}
|
||||
const components_digests get_components_digests() const {
|
||||
auto cd = data.get<scylla_metadata_type::ComponentsDigests, components_digests>();
|
||||
if (!cd) {
|
||||
return {};
|
||||
}
|
||||
return *cd;
|
||||
}
|
||||
|
||||
template <typename Describer>
|
||||
auto describe_type(sstable_version_types v, Describer f) { return f(data); }
|
||||
|
||||
@@ -65,7 +65,7 @@ serialized_size(sstable_version_types v, const T& object) {
|
||||
return size;
|
||||
}
|
||||
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
template <typename ChecksumType>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
class checksummed_file_data_sink_impl : public data_sink_impl {
|
||||
data_sink _out;
|
||||
@@ -92,9 +92,7 @@ public:
|
||||
|
||||
per_chunk_checksum = ChecksumType::checksum(per_chunk_checksum, buf.begin() + offset, size);
|
||||
_full_checksum = checksum_combine_or_feed<ChecksumType>(_full_checksum, per_chunk_checksum, buf.begin() + offset, size);
|
||||
if constexpr (calculate_chunk_checksums) {
|
||||
_c.checksums.push_back(per_chunk_checksum);
|
||||
}
|
||||
_c.checksums.push_back(per_chunk_checksum);
|
||||
}
|
||||
}
|
||||
return _out.put(std::move(bufs));
|
||||
@@ -114,29 +112,29 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
template <typename ChecksumType>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
class checksummed_file_data_sink : public data_sink {
|
||||
public:
|
||||
checksummed_file_data_sink(data_sink out, struct checksum& cinfo, uint32_t& full_file_checksum)
|
||||
: data_sink(std::make_unique<checksummed_file_data_sink_impl<ChecksumType, calculate_chunk_checksums>>(std::move(out), cinfo, full_file_checksum)) {}
|
||||
: data_sink(std::make_unique<checksummed_file_data_sink_impl<ChecksumType>>(std::move(out), cinfo, full_file_checksum)) {}
|
||||
};
|
||||
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
template <typename ChecksumType>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
inline
|
||||
output_stream<char> make_checksummed_file_output_stream(data_sink out, struct checksum& cinfo, uint32_t& full_file_checksum) {
|
||||
return output_stream<char>(checksummed_file_data_sink<ChecksumType, calculate_chunk_checksums>(std::move(out), cinfo, full_file_checksum));
|
||||
return output_stream<char>(checksummed_file_data_sink<ChecksumType>(std::move(out), cinfo, full_file_checksum));
|
||||
}
|
||||
|
||||
template <typename ChecksumType, bool calculate_chunk_checksums>
|
||||
template <typename ChecksumType>
|
||||
requires ChecksumUtils<ChecksumType>
|
||||
class checksummed_file_writer : public file_writer {
|
||||
checksum _c;
|
||||
uint32_t _full_checksum;
|
||||
public:
|
||||
checksummed_file_writer(data_sink out, size_t buffer_size, component_name c)
|
||||
: file_writer(make_checksummed_file_output_stream<ChecksumType, calculate_chunk_checksums>(std::move(out), _c, _full_checksum), std::move(c))
|
||||
: file_writer(make_checksummed_file_output_stream<ChecksumType>(std::move(out), _c, _full_checksum), std::move(c))
|
||||
, _c(uint32_t(std::min(size_t(DEFAULT_CHUNK_SIZE), buffer_size)), {})
|
||||
, _full_checksum(ChecksumType::init_checksum()) {}
|
||||
|
||||
@@ -154,10 +152,8 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
using adler32_checksummed_file_writer = checksummed_file_writer<adler32_utils, true>;
|
||||
using crc32_checksummed_file_writer = checksummed_file_writer<crc32_utils, true>;
|
||||
|
||||
using crc32_digest_file_writer = checksummed_file_writer<crc32_utils, false>;
|
||||
using adler32_checksummed_file_writer = checksummed_file_writer<adler32_utils>;
|
||||
using crc32_checksummed_file_writer = checksummed_file_writer<crc32_utils>;
|
||||
|
||||
template <typename T, typename W>
|
||||
requires Writer<W>
|
||||
|
||||
@@ -112,6 +112,7 @@ public:
|
||||
// Each virtual task needs to have its group.
|
||||
topology_change_group,
|
||||
tablets_group,
|
||||
global_topology_change_group,
|
||||
};
|
||||
|
||||
class task : public enable_lw_shared_from_this<task> {
|
||||
|
||||
5
test.py
5
test.py
@@ -228,7 +228,7 @@ def parse_cmd_line() -> argparse.Namespace:
|
||||
scylla_additional_options = parser.add_argument_group('Additional options for Scylla tests')
|
||||
scylla_additional_options.add_argument('--x-log2-compaction-groups', action="store", default="0", type=int,
|
||||
help="Controls number of compaction groups to be used by Scylla tests. Value of 3 implies 8 groups.")
|
||||
scylla_additional_options.add_argument('--extra-scylla-cmdline-options', action="store", default=[], type=str,
|
||||
scylla_additional_options.add_argument('--extra-scylla-cmdline-options', action="store", default="", type=str,
|
||||
help="Passing extra scylla cmdline options for all tests. Options should be space separated:"
|
||||
"'--logger-log-level raft=trace --default-log-level error'")
|
||||
|
||||
@@ -279,9 +279,6 @@ def parse_cmd_line() -> argparse.Namespace:
|
||||
args.tmpdir = os.path.abspath(args.tmpdir)
|
||||
prepare_dirs(tempdir_base=pathlib.Path(args.tmpdir), modes=args.modes, gather_metrics=args.gather_metrics, save_log_on_success=args.save_log_on_success)
|
||||
|
||||
if args.extra_scylla_cmdline_options:
|
||||
args.extra_scylla_cmdline_options = args.extra_scylla_cmdline_options.split()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
|
||||
@@ -152,7 +152,7 @@ def test_batch_write_nonduplicate_multiple_tables(test_table_s, test_table_s_2):
|
||||
p = random_string()
|
||||
# The batch_writer() function used in previous tests can't write to more
|
||||
# than one table. So we use the lower level interface boto3 gives us.
|
||||
reply = test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
test_table_s.name: [{'PutRequest': {'Item': {'p': p, 'a': 'hi'}}}],
|
||||
test_table_s_2.name: [{'PutRequest': {'Item': {'p': p, 'b': 'hello'}}}]
|
||||
})
|
||||
@@ -222,7 +222,7 @@ def test_batch_write_multiple_tables(test_table_s, test_table):
|
||||
# We use the low-level batch_write_item API for lack of a more convenient
|
||||
# API (the batch_writer() API can only write to one table). At least it
|
||||
# spares us the need to encode the key's types...
|
||||
reply = test_table.meta.client.batch_write_item(RequestItems = {
|
||||
test_table.meta.client.batch_write_item(RequestItems = {
|
||||
test_table.name: [{'PutRequest': {'Item': {'p': p1, 'c': c1, 'a': 'hi'}}}],
|
||||
test_table_s.name: [{'PutRequest': {'Item': {'p': p2, 'b': 'hello'}}}]
|
||||
})
|
||||
@@ -537,9 +537,8 @@ def test_batch_get_item_full_failure(scylla_only, dynamodb, rest_api, test_table
|
||||
for i in range(count):
|
||||
batch.put_item(Item={
|
||||
'p': p, 'c': i, 'content': content})
|
||||
responses = []
|
||||
to_read = { test_table_sn.name: {'Keys': [{'p': p, 'c': c} for c in range(count)], 'ConsistentRead': True } }
|
||||
# The error injection is permanent, so it will fire for each batch read.
|
||||
with scylla_inject_error(rest_api, "alternator_batch_get_item", one_shot=False):
|
||||
with pytest.raises(ClientError, match="InternalServerError"):
|
||||
reply = test_table_sn.meta.client.batch_get_item(RequestItems = to_read)
|
||||
test_table_sn.meta.client.batch_get_item(RequestItems = to_read)
|
||||
|
||||
@@ -376,7 +376,7 @@ def test_rbac_updateitem_read(dynamodb, cql, test_table_s):
|
||||
assert ret['Attributes'] == {'p': p, 'v': v1}
|
||||
# Just MODIFY permission, not SELECT permission, also allows
|
||||
# us to do a read-modify-write expression:
|
||||
ret = authorized(lambda: tab.update_item(Key={'p': p},
|
||||
authorized(lambda: tab.update_item(Key={'p': p},
|
||||
UpdateExpression='SET v = v + :val',
|
||||
ExpressionAttributeValues={':val': 1}))
|
||||
assert {'p': p, 'v': v2 + 1} == test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
@@ -903,7 +903,6 @@ def test_rbac_tagresource(dynamodb, cql):
|
||||
arn = table.meta.client.describe_table(TableName=table.name)['Table']['TableArn']
|
||||
with new_role(cql) as (role, key):
|
||||
with new_dynamodb(dynamodb, role, key) as d:
|
||||
tab = d.Table(table.name)
|
||||
# Without ALTER permission, TagResource and UntagResource
|
||||
# are refused
|
||||
tags = [{'Key': 'hello', 'Value': 'dog'},
|
||||
|
||||
@@ -80,18 +80,18 @@ def test_table_sn_with_data(test_table_sn):
|
||||
def test_filter_expression_partition_key_1(test_table_sn_with_data):
|
||||
table, p, items = test_table_sn_with_data
|
||||
with pytest.raises(ClientError, match='ValidationException.*Condition'):
|
||||
got_items = full_query(table, FilterExpression='p=:p', ExpressionAttributeValues={':p': p})
|
||||
full_query(table, FilterExpression='p=:p', ExpressionAttributeValues={':p': p})
|
||||
|
||||
def test_filter_expression_partition_key_2(test_table_sn_with_data):
|
||||
table, p, items = test_table_sn_with_data
|
||||
with pytest.raises(ClientError, match='ValidationException.* p'):
|
||||
got_items = full_query(table, KeyConditionExpression='p=:p', FilterExpression='p=:p', ExpressionAttributeValues={':p': p})
|
||||
full_query(table, KeyConditionExpression='p=:p', FilterExpression='p=:p', ExpressionAttributeValues={':p': p})
|
||||
|
||||
# FilterExpression is also not allowed on the sort key.
|
||||
def test_filter_expression_sort_key(test_table_sn_with_data):
|
||||
table, p, items = test_table_sn_with_data
|
||||
with pytest.raises(ClientError, match='ValidationException.* key '):
|
||||
got_items = full_query(table, KeyConditionExpression='p=:p', FilterExpression='c=:c',
|
||||
full_query(table, KeyConditionExpression='p=:p', FilterExpression='c=:c',
|
||||
ExpressionAttributeValues={':p': p, ':c': 3})
|
||||
|
||||
# Test the "=" operator on different types of attributes (numeric, string,
|
||||
@@ -387,7 +387,6 @@ def test_filter_expression_map_contains(test_table_sn_with_data):
|
||||
assert(got_items == expected_items)
|
||||
# One value from a map:
|
||||
i = next(iter(items[2]['m']))
|
||||
v = items[2]['m'][i]
|
||||
got_items = full_query(table, KeyConditionExpression='p=:p', FilterExpression='contains(m, :i)',
|
||||
ExpressionAttributeValues={':p': p, ':i': i})
|
||||
#The following could have made sense, but it's what DynamoDB does:
|
||||
|
||||
@@ -125,7 +125,6 @@ def test_basic_string_more_update(test_table):
|
||||
val1 = random_string()
|
||||
val2 = random_string()
|
||||
val3 = random_string()
|
||||
val4 = random_string()
|
||||
test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a3': {'Value': val1, 'Action': 'PUT'}})
|
||||
test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a1': {'Value': val1, 'Action': 'PUT'}})
|
||||
test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a2': {'Value': val2, 'Action': 'PUT'}})
|
||||
|
||||
@@ -304,7 +304,7 @@ def test_wcu_batch_write_item(test_table_s, metrics):
|
||||
with check_increases_operation(metrics, ['PutItem'], 'scylla_alternator_wcu_total', 3):
|
||||
p1 = random_string()
|
||||
p2 = random_string()
|
||||
response = test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
test_table_s.name: [{'PutRequest': {'Item': {'p': p1, 'a': 'hi'}}}, {'PutRequest': {'Item': {'p': p2, 'a': 'a' * KB}}}]
|
||||
})
|
||||
|
||||
|
||||
@@ -369,7 +369,6 @@ def test_query_exclusivestartkey(test_table_sn):
|
||||
# The ExclusiveStartKey option must indicate both partition key and
|
||||
# sort key. Note that the Python driver further converts this map
|
||||
# into the correct format for the request (including the key types).
|
||||
exclusivestartkey = { 'p': p, 'c': start }
|
||||
got_items = test_table_sn.query(
|
||||
KeyConditions={'p': { 'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}},
|
||||
ExclusiveStartKey= { 'p': p, 'c': start },
|
||||
|
||||
@@ -35,14 +35,12 @@ def test_invalid_consumed_capacity_type(test_table_sb):
|
||||
c = random_bytes()
|
||||
test_table_sb.put_item(Item={'p': p, 'c': c, 'att': val})
|
||||
with pytest.raises(ClientError):
|
||||
response = test_table_sb.get_item(Key={'p': p, 'c': c}, ConsistentRead=True, ReturnConsumedCapacity='DUMMY')
|
||||
test_table_sb.get_item(Key={'p': p, 'c': c}, ConsistentRead=True, ReturnConsumedCapacity='DUMMY')
|
||||
|
||||
# A missing Item, count as zero length item which require 1 or 0.5 RCU depends on the consistency
|
||||
def test_missing_get_item(test_table):
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
val = random_string()
|
||||
val2 = random_string()
|
||||
response = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True, ReturnConsumedCapacity='TOTAL')
|
||||
assert 'ConsumedCapacity' in response
|
||||
consumed_capacity = response['ConsumedCapacity']
|
||||
@@ -225,7 +223,6 @@ def test_simple_delete_item(test_table_sb):
|
||||
# we will get 1 WCU
|
||||
def test_delete_missing_item(test_table_sb):
|
||||
p = random_string()
|
||||
val = random_string()
|
||||
c = random_bytes()
|
||||
response = test_table_sb.delete_item(Key={'p': p, 'c': c}, ReturnConsumedCapacity='TOTAL')
|
||||
assert 'ConsumedCapacity' in response
|
||||
|
||||
@@ -99,7 +99,7 @@ def test_put_item_returnvalues_on_condition_check_failure(test_table_s):
|
||||
p = random_string()
|
||||
# Failed conditional on non existing item doesn't return values.
|
||||
with pytest.raises(test_table_s.meta.client.exceptions.ConditionalCheckFailedException) as err:
|
||||
ret=test_table_s.put_item(Item={'p': p, 's': 'cat'},
|
||||
test_table_s.put_item(Item={'p': p, 's': 'cat'},
|
||||
ReturnValuesOnConditionCheckFailure='ALL_OLD',
|
||||
ConditionExpression='s = :v1',
|
||||
ExpressionAttributeValues={':v1' : 'dog'})
|
||||
@@ -175,7 +175,7 @@ def test_delete_item_returnvalues_on_condition_check_failure(test_table_s):
|
||||
p = random_string()
|
||||
# Delete of non existing item doesn't return values.
|
||||
with pytest.raises(test_table_s.meta.client.exceptions.ConditionalCheckFailedException) as err:
|
||||
ret=test_table_s.delete_item(Key={'p': p},
|
||||
test_table_s.delete_item(Key={'p': p},
|
||||
ReturnValuesOnConditionCheckFailure='ALL_OLD',
|
||||
ConditionExpression='s = :v1',
|
||||
ExpressionAttributeValues={':v1' : 'dog'})
|
||||
@@ -566,7 +566,7 @@ def test_update_item_returnvalues_on_condition_check_failure(test_table_s):
|
||||
p = random_string()
|
||||
# Modification of non existing item doesn't return values.
|
||||
with pytest.raises(test_table_s.meta.client.exceptions.ConditionalCheckFailedException) as err:
|
||||
ret=test_table_s.update_item(Key={'p': p},
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
ReturnValuesOnConditionCheckFailure='ALL_OLD',
|
||||
ConditionExpression='s = :v1',
|
||||
UpdateExpression='SET s = :v2',
|
||||
|
||||
@@ -220,7 +220,6 @@ def test_scan_with_key_equality_filtering(dynamodb, filled_test_table):
|
||||
# without returning items at all.
|
||||
def test_scan_select(filled_test_table):
|
||||
test_table, items = filled_test_table
|
||||
got_items = full_scan(test_table)
|
||||
# By default, a scan returns all the items, with all their attributes:
|
||||
# query returns all attributes:
|
||||
got_items = full_scan(test_table)
|
||||
|
||||
@@ -135,7 +135,7 @@ def test_list_streams_create(dynamodb, dynamodbstreams):
|
||||
def test_list_streams_alter(dynamodb, dynamodbstreams):
|
||||
for type in stream_types:
|
||||
with create_stream_test_table(dynamodb, StreamViewType=None) as table:
|
||||
res = table.update(StreamSpecification={'StreamEnabled': True, 'StreamViewType': type});
|
||||
table.update(StreamSpecification={'StreamEnabled': True, 'StreamViewType': type});
|
||||
wait_for_active_stream(dynamodbstreams, table)
|
||||
|
||||
def test_list_streams_paged(dynamodb, dynamodbstreams):
|
||||
@@ -273,7 +273,7 @@ def test_describe_stream_create_time(dynamodb, dynamodbstreams):
|
||||
|
||||
def test_describe_nonexistent_stream(dynamodb, dynamodbstreams):
|
||||
with pytest.raises(ClientError, match='ResourceNotFoundException' if is_local_java(dynamodbstreams) else 'ValidationException'):
|
||||
streams = dynamodbstreams.describe_stream(StreamArn='sdfadfsdfnlfkajakfgjalksfgklasjklasdjfklasdfasdfgasf')
|
||||
dynamodbstreams.describe_stream(StreamArn='sdfadfsdfnlfkajakfgjalksfgklasjklasdjfklasdfasdfgasf')
|
||||
|
||||
def test_describe_stream_with_nonexistent_last_shard(dynamodb, dynamodbstreams):
|
||||
with create_stream_test_table(dynamodb, StreamViewType='KEYS_ONLY') as table:
|
||||
@@ -313,7 +313,7 @@ def test_get_shard_iterator(dynamodb, dynamodbstreams):
|
||||
for type in ['AT_SEQUENCE_NUMBER', 'AFTER_SEQUENCE_NUMBER']:
|
||||
# must have seq in these modes
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
iter = dynamodbstreams.get_shard_iterator(
|
||||
dynamodbstreams.get_shard_iterator(
|
||||
StreamArn=arn, ShardId=shard_id, ShardIteratorType=type
|
||||
)
|
||||
|
||||
@@ -326,7 +326,7 @@ def test_get_shard_iterator(dynamodb, dynamodbstreams):
|
||||
|
||||
# bad arn
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
iter = dynamodbstreams.get_shard_iterator(
|
||||
dynamodbstreams.get_shard_iterator(
|
||||
StreamArn='sdfadsfsdfsdgdfsgsfdabadfbabdadsfsdfsdfsdfsdfsdfsdfdfdssdffbdfdf', ShardId=shard_id, ShardIteratorType=type, SequenceNumber=seq
|
||||
)
|
||||
# bad shard id
|
||||
@@ -735,7 +735,6 @@ def compare_events(expected_events, output, mode, expected_region):
|
||||
assert not 'NewImage' in record
|
||||
if expected_old_image == None:
|
||||
assert not 'OldImage' in record
|
||||
pass
|
||||
else:
|
||||
old_image = {x:deserializer.deserialize(y) for (x,y) in record['OldImage'].items()}
|
||||
assert expected_old_image == old_image
|
||||
@@ -1642,7 +1641,6 @@ def test_table_stream_with_result(dynamodb, dynamodbstreams):
|
||||
# doing an UpdateTable to a table - because before this wait finishes we are
|
||||
# not allowed to update the same table again or delete it.
|
||||
def wait_for_status_active(table):
|
||||
start_time = time.time()
|
||||
for i in range(60):
|
||||
desc = table.meta.client.describe_table(TableName=table.name)
|
||||
if desc['Table']['TableStatus'] == 'ACTIVE':
|
||||
@@ -1919,15 +1917,15 @@ def test_get_records_too_high_limit(test_table_ss_keys_only, dynamodbstreams):
|
||||
shard_id = shard['ShardId']
|
||||
iter = dynamodbstreams.get_shard_iterator(StreamArn=arn, ShardId=shard_id, ShardIteratorType='LATEST')['ShardIterator']
|
||||
# Limit=1000 should be allowed:
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter, Limit=1000)
|
||||
dynamodbstreams.get_records(ShardIterator=iter, Limit=1000)
|
||||
# Limit=1001 should NOT be allowed
|
||||
with pytest.raises(ClientError, match='ValidationException.*[Ll]imit'):
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter, Limit=1001)
|
||||
dynamodbstreams.get_records(ShardIterator=iter, Limit=1001)
|
||||
# Limit must be >= 0:
|
||||
with pytest.raises(ClientError, match='ValidationException.*[Ll]imit'):
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter, Limit=0)
|
||||
dynamodbstreams.get_records(ShardIterator=iter, Limit=0)
|
||||
with pytest.raises(ClientError, match='ValidationException.*[Ll]imit'):
|
||||
response = dynamodbstreams.get_records(ShardIterator=iter, Limit=-1)
|
||||
dynamodbstreams.get_records(ShardIterator=iter, Limit=-1)
|
||||
|
||||
# padded_name() creates a unique name of given length by taking the
|
||||
# output of unique_table_name() and padding it with extra 'x' characters:
|
||||
|
||||
@@ -56,7 +56,6 @@ def test_page_break_over_range_tombstone_asan(scylla_only, dynamodb, rest_api, c
|
||||
while True:
|
||||
response = client.scan(TableName=qualified_name, Limit=10, **args)
|
||||
pos = response.get('LastEvaluatedKey', None)
|
||||
cnt = 0
|
||||
for i in response['Items']:
|
||||
if i['cf_id'] == 'eee7eb26-a372-4eb4-aeaa-72f224cf0000':
|
||||
items_found.append(i['schema_version'])
|
||||
@@ -101,10 +100,9 @@ def test_fetch_from_system_tables(scylla_only, dynamodb, rest_api):
|
||||
def test_block_access_to_non_system_tables_with_virtual_interface(scylla_only, test_table_s, dynamodb):
|
||||
client = dynamodb.meta.client
|
||||
with pytest.raises(ClientError, match='ResourceNotFoundException.*{}'.format(internal_prefix)):
|
||||
tables_response = client.scan(TableName="{}alternator_{}.{}".format(internal_prefix, test_table_s.name, test_table_s.name))
|
||||
client.scan(TableName="{}alternator_{}.{}".format(internal_prefix, test_table_s.name, test_table_s.name))
|
||||
|
||||
def test_block_creating_tables_with_reserved_prefix(scylla_only, dynamodb):
|
||||
client = dynamodb.meta.client
|
||||
for wrong_name_postfix in ['', 'a', 'xxx', 'system_auth.roles', 'table_name']:
|
||||
with pytest.raises(ClientError, match=internal_prefix):
|
||||
dynamodb.create_table(TableName=internal_prefix+wrong_name_postfix,
|
||||
@@ -200,7 +198,6 @@ def test_write_to_config(scylla_only, dynamodb):
|
||||
# Same test as above, just using the scylla_config_temporary() utility
|
||||
# function (also validating its correctness)
|
||||
def test_scylla_config_temporary(scylla_only, dynamodb):
|
||||
tbl = '.scylla.alternator.system.config'
|
||||
parameter = 'query_tombstone_page_limit'
|
||||
old_val = scylla_config_read(dynamodb, parameter)
|
||||
new_val = old_val + "1"
|
||||
|
||||
@@ -1021,7 +1021,7 @@ def test_transact_get_items_projection_expression(test_table_s):
|
||||
def test_transact_get_items_unused_expressionattributenames(test_table_s):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ValidationException.*unused.*#qq'):
|
||||
ret = test_table_s.meta.client.transact_get_items(TransactItems=[
|
||||
test_table_s.meta.client.transact_get_items(TransactItems=[
|
||||
{ 'Get': {
|
||||
'TableName': test_table_s.name,
|
||||
'Key': {'p': p},
|
||||
@@ -1034,7 +1034,7 @@ def test_transact_get_items_unused_expressionattributenames(test_table_s):
|
||||
def test_transact_get_items_missing_expressionattributenames(test_table_s):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ValidationException.*#zz'):
|
||||
ret = test_table_s.meta.client.transact_get_items(TransactItems=[
|
||||
test_table_s.meta.client.transact_get_items(TransactItems=[
|
||||
{ 'Get': {
|
||||
'TableName': test_table_s.name,
|
||||
'Key': {'p': p},
|
||||
@@ -1071,7 +1071,6 @@ def test_transact_get_items_100(test_table_s):
|
||||
# A transaction with 100 read actions is the limit, and 101 are not allowed:
|
||||
@pytest.mark.xfail(reason="#5064 - transactions not yet supported")
|
||||
def test_transact_get_items_101(test_table_s):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ValidationException.*[tT]ransactItems.*100'):
|
||||
test_table_s.meta.client.transact_get_items(TransactItems=[
|
||||
{ 'Get': {
|
||||
|
||||
@@ -638,12 +638,10 @@ def test_ttl_expiration_lsi_key(dynamodb, waits_for_expiration):
|
||||
assert response['TimeToLiveSpecification'] == ttl_spec
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
l = random_string()
|
||||
# expiration one minute in the past, so item should expire ASAP.
|
||||
expiration = int(time.time()) - 60
|
||||
table.put_item(Item={'p': p, 'c': c, 'l': expiration})
|
||||
start_time = time.time()
|
||||
gsi_was_alive = False
|
||||
while time.time() < start_time + max_duration:
|
||||
if 'Item' not in table.get_item(Key={'p': p, 'c': c}):
|
||||
# test is done - and successful:
|
||||
@@ -787,7 +785,7 @@ def test_ttl_expiration_long(dynamodb, waits_for_expiration):
|
||||
AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'N' },
|
||||
{ 'AttributeName': 'c', 'AttributeType': 'N' }]) as table:
|
||||
ttl_spec = {'AttributeName': 'expiration', 'Enabled': True}
|
||||
response = table.meta.client.update_time_to_live(TableName=table.name,
|
||||
table.meta.client.update_time_to_live(TableName=table.name,
|
||||
TimeToLiveSpecification=ttl_spec)
|
||||
with table.batch_writer() as batch:
|
||||
for p in range(N):
|
||||
|
||||
@@ -244,7 +244,7 @@ def get_region(dynamodb):
|
||||
# will trigger a test to be skipped if it cannot be executed.
|
||||
@contextmanager
|
||||
def scylla_inject_error(rest_api, err, one_shot=False):
|
||||
response = requests.post(f'{rest_api}/v2/error_injection/injection/{err}?one_shot={one_shot}')
|
||||
requests.post(f'{rest_api}/v2/error_injection/injection/{err}?one_shot={one_shot}')
|
||||
response = requests.get(f'{rest_api}/v2/error_injection/injection')
|
||||
print("Enabled error injections:", response.content.decode('utf-8'))
|
||||
if response.content.decode('utf-8') == "[]":
|
||||
@@ -253,7 +253,7 @@ def scylla_inject_error(rest_api, err, one_shot=False):
|
||||
yield
|
||||
finally:
|
||||
print("Disabling error injection", err)
|
||||
response = requests.delete(f'{rest_api}/v2/error_injection/injection/{err}')
|
||||
requests.delete(f'{rest_api}/v2/error_injection/injection/{err}')
|
||||
|
||||
# Send a message to the Scylla log. E.g., we can write a message to the log
|
||||
# indicating that a test has started, which will make it easier to see which
|
||||
@@ -306,7 +306,6 @@ def wait_for_gsi_gone(table, gsi_name):
|
||||
if 'GlobalSecondaryIndexes' in desc['Table']:
|
||||
index_desc = [x for x in desc['Table']['GlobalSecondaryIndexes'] if x['IndexName'] == gsi_name]
|
||||
if len(index_desc) != 0:
|
||||
index_status = index_desc[0]['IndexStatus']
|
||||
time.sleep(0.1)
|
||||
continue
|
||||
return
|
||||
|
||||
@@ -1055,14 +1055,16 @@ SEASTAR_TEST_CASE(test_rack_list_rejected_when_feature_not_enabled) {
|
||||
BOOST_REQUIRE_EQUAL(replication_factor_data(opts.at(loc.dc)).count(), 1);
|
||||
BOOST_REQUIRE(describe(e, "test2").contains(fmt::format("'{}': '1'", loc.dc)));
|
||||
|
||||
// When feature is enabled, rack list is accepted.
|
||||
e.get_feature_service().local().rack_list_rf.enable();
|
||||
e.execute_cql(create_stmt).get();
|
||||
|
||||
// Altering numeric RF to rack list is not supported yet.
|
||||
// Altering to rack list is not allowed when feature is disabled.
|
||||
BOOST_REQUIRE_THROW(e.execute_cql(fmt::format("ALTER KEYSPACE test2 WITH REPLICATION = {{'class': 'NetworkTopologyStrategy',"
|
||||
" '{}': ['{}']}}", loc.dc, loc.rack)).get(),
|
||||
exceptions::configuration_exception);
|
||||
|
||||
// When feature is enabled, rack list is accepted.
|
||||
e.get_feature_service().local().rack_list_rf.enable();
|
||||
e.execute_cql(create_stmt).get();
|
||||
e.execute_cql(fmt::format("ALTER KEYSPACE test2 WITH REPLICATION = {{'class': 'NetworkTopologyStrategy',"
|
||||
" '{}': ['{}']}}", loc.dc, loc.rack)).get();
|
||||
}, cfg);
|
||||
}
|
||||
|
||||
|
||||
@@ -1663,7 +1663,7 @@ SEASTAR_TEST_CASE(test_reader_concurrency_semaphore_memory_limit_engages) {
|
||||
db_cfg.reader_concurrency_semaphore_kill_limit_multiplier.set(4, utils::config_file::config_source::CommandLine);
|
||||
|
||||
return do_with_cql_env_thread([] (cql_test_env& env) {
|
||||
auto tbl = create_memory_limit_table(env, 64);
|
||||
auto tbl = create_memory_limit_table(env, 54);
|
||||
|
||||
auto& db = env.local_db();
|
||||
auto& semaphore = db.get_reader_concurrency_semaphore();
|
||||
|
||||
@@ -105,6 +105,28 @@ SEASTAR_THREAD_TEST_CASE(test_learn_schema_with_cdc) {
|
||||
BOOST_REQUIRE(s->cdc_schema()->registry_entry());
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_learn_loaded_schema_with_cdc) {
|
||||
dummy_init dummy;
|
||||
auto s_cdc = schema_builder("ks", "cdc_cf")
|
||||
.with_column("pk", bytes_type, column_kind::partition_key)
|
||||
.with_column("val", bytes_type)
|
||||
.build();
|
||||
auto s = schema_builder("ks", "cf")
|
||||
.with_column("pk", bytes_type, column_kind::partition_key)
|
||||
.with_column("val", bytes_type)
|
||||
.with_cdc_schema(s_cdc)
|
||||
.build();
|
||||
|
||||
local_schema_registry().get_or_load(s->version(), [s] (table_schema_version) {
|
||||
return make_ready_future<extended_frozen_schema>(s);
|
||||
}).get();
|
||||
|
||||
s = local_schema_registry().learn(s);
|
||||
|
||||
BOOST_REQUIRE(s->registry_entry());
|
||||
BOOST_REQUIRE(s->cdc_schema()->registry_entry());
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_async_loading) {
|
||||
return seastar::async([] {
|
||||
dummy_init dummy;
|
||||
|
||||
@@ -15,14 +15,11 @@
|
||||
#include <seastar/core/smp.hh>
|
||||
#include <seastar/util/closeable.hh>
|
||||
|
||||
#include "sstables/checksum_utils.hh"
|
||||
#include <seastar/util/short_streams.hh>
|
||||
#include "sstables/generation_type.hh"
|
||||
#include "sstables/sstables.hh"
|
||||
#include "sstables/key.hh"
|
||||
#include "sstables/open_info.hh"
|
||||
#include "sstables/version.hh"
|
||||
#include "test/lib/random_schema.hh"
|
||||
#include "test/lib/sstable_utils.hh"
|
||||
#include "test/lib/reader_concurrency_semaphore.hh"
|
||||
#include "test/lib/scylla_test_case.hh"
|
||||
@@ -35,7 +32,6 @@
|
||||
#include "partition_slice_builder.hh"
|
||||
#include "sstables/sstable_mutation_reader.hh"
|
||||
#include "sstables/binary_search.hh"
|
||||
#include "test/lib/random_utils.hh"
|
||||
|
||||
#include <boost/range/combine.hpp>
|
||||
|
||||
@@ -883,101 +879,3 @@ BOOST_AUTO_TEST_CASE(test_parse_path_bad) {
|
||||
BOOST_CHECK_THROW(parse_path(path), std::exception);
|
||||
}
|
||||
}
|
||||
|
||||
using compress_sstable = tests::random_schema_specification::compress_sstable;
|
||||
static future<> test_component_digest_persistence(component_type component, sstable::version_types version, compress_sstable compress = compress_sstable::no, bool rewrite_statistics = false) {
|
||||
return test_env::do_with_async([component, version, compress, rewrite_statistics] (test_env& env) mutable {
|
||||
auto random_spec = tests::make_random_schema_specification(
|
||||
"ks",
|
||||
std::uniform_int_distribution<size_t>(1, 4),
|
||||
std::uniform_int_distribution<size_t>(2, 4),
|
||||
std::uniform_int_distribution<size_t>(2, 8),
|
||||
std::uniform_int_distribution<size_t>(2, 8),
|
||||
compress);
|
||||
auto random_schema = tests::random_schema{tests::random::get_int<uint32_t>(), *random_spec};
|
||||
auto schema = random_schema.schema();
|
||||
|
||||
const auto muts = tests::generate_random_mutations(random_schema, 2).get();
|
||||
auto sst_original = make_sstable_containing(env.make_sstable(schema, version), muts);
|
||||
|
||||
auto& components = sstables::test(sst_original).get_components();
|
||||
bool has_component = components.find(component) != components.end();
|
||||
BOOST_REQUIRE(has_component);
|
||||
|
||||
auto toc_path = fmt::to_string(sst_original->toc_filename());
|
||||
auto entry_desc = sstables::parse_path(toc_path, schema->ks_name(), schema->cf_name());
|
||||
auto dir_path = std::filesystem::path(toc_path).parent_path().string();
|
||||
|
||||
std::optional<uint32_t> original_digest;
|
||||
if (rewrite_statistics) {
|
||||
original_digest = sst_original->get_component_digest(component);
|
||||
BOOST_REQUIRE(original_digest.has_value());
|
||||
|
||||
sst_original->mutate_sstable_level(10).get();
|
||||
|
||||
auto new_digest = sst_original->get_component_digest(component);
|
||||
BOOST_REQUIRE(new_digest.has_value());
|
||||
|
||||
BOOST_REQUIRE(original_digest.value() != new_digest.value());
|
||||
}
|
||||
|
||||
sst_original = nullptr;
|
||||
|
||||
auto sst_reopened = env.make_sstable(schema, dir_path, entry_desc.generation, entry_desc.version, entry_desc.format);
|
||||
sst_reopened->load(schema->get_sharder()).get();
|
||||
|
||||
auto loaded_digest = sst_reopened->get_component_digest(component);
|
||||
BOOST_REQUIRE(loaded_digest.has_value());
|
||||
|
||||
auto f = open_file_dma(sstables::test(sst_reopened).filename(component).native(), open_flags::ro).get();
|
||||
auto stream = make_file_input_stream(f);
|
||||
auto close_stream = deferred_close(stream);
|
||||
auto component_data = util::read_entire_stream_contiguous(stream).get();
|
||||
auto calculated_digest = crc32_utils::checksum(component_data.begin(), component_data.size());
|
||||
BOOST_REQUIRE_EQUAL(calculated_digest, loaded_digest.value());
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_digest_persistence_index) {
|
||||
return test_component_digest_persistence(component_type::Index, sstable::version_types::me);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_digest_persistence_partitions) {
|
||||
return test_component_digest_persistence(component_type::Partitions, sstable::version_types::ms);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_digest_persistence_rows) {
|
||||
return test_component_digest_persistence(component_type::Rows, sstable::version_types::ms);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_digest_persistence_summary) {
|
||||
return test_component_digest_persistence(component_type::Summary, sstable::version_types::me);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_digest_persistence_filter) {
|
||||
return test_component_digest_persistence(component_type::Filter, sstable::version_types::me);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_digest_persistence_compression) {
|
||||
return test_component_digest_persistence(component_type::CompressionInfo, sstable::version_types::me, compress_sstable::yes);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_digest_persistence_toc) {
|
||||
return test_component_digest_persistence(component_type::TOC, sstable::version_types::me);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_digest_persistence_statistics) {
|
||||
return test_component_digest_persistence(component_type::Statistics, sstable::version_types::me);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_digest_persistence_statistics_rewrite) {
|
||||
return test_component_digest_persistence(component_type::Statistics, sstable::version_types::me, compress_sstable::no, true);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_digest_persistence_data) {
|
||||
return test_component_digest_persistence(component_type::Data, sstable::version_types::me);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_digest_persistence_data_compressed) {
|
||||
return test_component_digest_persistence(component_type::Data, sstable::version_types::me, compress_sstable::yes);
|
||||
}
|
||||
|
||||
@@ -8,6 +8,8 @@
|
||||
|
||||
|
||||
|
||||
#include "utils/UUID.hh"
|
||||
#include <boost/test/tools/old/interface.hpp>
|
||||
#include <seastar/core/shard_id.hh>
|
||||
#include <seastar/coroutine/as_future.hh>
|
||||
#include <source_location>
|
||||
@@ -446,6 +448,36 @@ SEASTAR_THREAD_TEST_CASE(test_invalid_colocated_tables) {
|
||||
.get();
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_paused_rf_change_requests_persistence) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
topology_builder topo(e);
|
||||
|
||||
auto topology = e.get_system_keyspace().local().load_topology_state({}).get();
|
||||
|
||||
// Check scheduled_rf_change_requests.
|
||||
std::unordered_set<utils::UUID> current_requests;
|
||||
auto new_id1 = utils::make_random_uuid();
|
||||
topo.pause_rf_change_request(new_id1);
|
||||
current_requests.insert(new_id1);
|
||||
auto new_id2 = utils::make_random_uuid();
|
||||
topo.pause_rf_change_request(new_id2);
|
||||
current_requests.insert(new_id2);
|
||||
topology = e.get_system_keyspace().local().load_topology_state({}).get();
|
||||
BOOST_REQUIRE_EQUAL(current_requests.size(), topology.paused_rf_change_requests.size());
|
||||
for (const auto& request : current_requests) {
|
||||
BOOST_REQUIRE(topology.paused_rf_change_requests.contains(request));
|
||||
}
|
||||
|
||||
topo.resume_rf_change_request(current_requests, new_id1);
|
||||
current_requests.erase(new_id1);
|
||||
topology = e.get_system_keyspace().local().load_topology_state({}).get();
|
||||
BOOST_REQUIRE_EQUAL(current_requests.size(), topology.paused_rf_change_requests.size());
|
||||
for (const auto& request : current_requests) {
|
||||
BOOST_REQUIRE(topology.paused_rf_change_requests.contains(request));
|
||||
}
|
||||
}, tablet_cql_test_config());
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_tablet_metadata_persistence_with_colocated_tables) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
auto h1 = host_id(utils::UUID_gen::get_time_UUID());
|
||||
@@ -1611,7 +1643,7 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
|
||||
|
||||
// Reflects the plan in a given token metadata as if the migrations were fully executed.
|
||||
static
|
||||
future<> apply_plan(token_metadata& tm, const migration_plan& plan) {
|
||||
future<> apply_plan(token_metadata& tm, const migration_plan& plan, service::topology& topology) {
|
||||
for (auto&& mig : plan.migrations()) {
|
||||
co_await tm.tablets().mutate_tablet_map_async(mig.tablet.table, [&] (tablet_map& tmap) {
|
||||
auto tinfo = tmap.get_tablet_info(mig.tablet.tablet);
|
||||
@@ -1622,6 +1654,9 @@ future<> apply_plan(token_metadata& tm, const migration_plan& plan) {
|
||||
});
|
||||
}
|
||||
co_await apply_resize_plan(tm, plan);
|
||||
if (auto request_id = plan.rack_list_colocation_plan().request_to_resume(); request_id) {
|
||||
topology.paused_rf_change_requests.erase(request_id);
|
||||
}
|
||||
}
|
||||
|
||||
// Reflects the plan in a given token metadata as if the migrations were started but not yet executed.
|
||||
@@ -1662,13 +1697,15 @@ void do_rebalance_tablets(cql_test_env& e,
|
||||
{
|
||||
auto& talloc = e.get_tablet_allocator().local();
|
||||
auto& stm = e.shared_token_metadata().local();
|
||||
auto& sys_ks = e.get_system_keyspace().local();
|
||||
auto& topology = e.get_topology_state_machine().local()._topology;
|
||||
|
||||
// Sanity limit to avoid infinite loops.
|
||||
// The x10 factor is arbitrary, it's there to account for more complex schedules than direct migration.
|
||||
auto max_iterations = 1 + get_tablet_count(stm.get()->tablets()) * 10;
|
||||
|
||||
for (size_t i = 0; i < max_iterations; ++i) {
|
||||
auto plan = talloc.balance_tablets(stm.get(), load_stats ? load_stats->get() : nullptr, skiplist).get();
|
||||
auto plan = talloc.balance_tablets(stm.get(), &topology, &sys_ks, load_stats ? load_stats->get() : nullptr, skiplist).get();
|
||||
if (plan.empty()) {
|
||||
return;
|
||||
}
|
||||
@@ -1676,7 +1713,7 @@ void do_rebalance_tablets(cql_test_env& e,
|
||||
return;
|
||||
}
|
||||
stm.mutate_token_metadata([&] (token_metadata& tm) {
|
||||
return apply_plan(tm, plan);
|
||||
return apply_plan(tm, plan, e.get_topology_state_machine().local()._topology);
|
||||
}).get();
|
||||
|
||||
if (auto_split && load_stats) {
|
||||
@@ -1734,7 +1771,7 @@ void rebalance_tablets(cql_test_env& e,
|
||||
static
|
||||
void rebalance_tablets_as_in_progress(tablet_allocator& talloc, shared_token_metadata& stm, shared_load_stats& stats) {
|
||||
while (true) {
|
||||
auto plan = talloc.balance_tablets(stm.get(), stats.get()).get();
|
||||
auto plan = talloc.balance_tablets(stm.get(), nullptr, nullptr, stats.get()).get();
|
||||
if (plan.empty()) {
|
||||
break;
|
||||
}
|
||||
@@ -1885,7 +1922,7 @@ SEASTAR_THREAD_TEST_CASE(test_no_conflicting_migrations_in_the_plan) {
|
||||
auto& stm = e.shared_token_metadata().local();
|
||||
auto& talloc = e.get_tablet_allocator().local();
|
||||
talloc.set_load_stats(topo.get_load_stats());
|
||||
migration_plan plan = talloc.balance_tablets(stm.get()).get();
|
||||
migration_plan plan = talloc.balance_tablets(stm.get(), nullptr, nullptr).get();
|
||||
|
||||
BOOST_REQUIRE(!plan.empty());
|
||||
std::set<global_tablet_id> tablets;
|
||||
@@ -1976,7 +2013,7 @@ SEASTAR_THREAD_TEST_CASE(test_no_conflicting_internode_and_intra_merge_colocatio
|
||||
auto& stm = e.shared_token_metadata().local();
|
||||
auto& talloc = e.get_tablet_allocator().local();
|
||||
talloc.set_load_stats(topo.get_load_stats());
|
||||
migration_plan plan = talloc.balance_tablets(stm.get()).get();
|
||||
migration_plan plan = talloc.balance_tablets(stm.get(), nullptr, nullptr).get();
|
||||
|
||||
// The plan should contain non-conflicting migrations.
|
||||
BOOST_REQUIRE(!plan.empty());
|
||||
@@ -1989,6 +2026,101 @@ SEASTAR_THREAD_TEST_CASE(test_no_conflicting_internode_and_intra_merge_colocatio
|
||||
}, cfg).get();
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_rack_list_conversion) {
|
||||
do_with_cql_env_thread([] (auto& e) {
|
||||
topology_builder topo(e);
|
||||
|
||||
unsigned shard_count = 1;
|
||||
auto dc1 = topo.dc();
|
||||
auto rack1 = topo.rack();
|
||||
[[maybe_unused]] auto host1 = topo.add_node(node_state::normal, shard_count);
|
||||
[[maybe_unused]] auto host2 = topo.add_node(node_state::normal, shard_count);
|
||||
auto rack2 = topo.start_new_rack();
|
||||
[[maybe_unused]] auto host3 = topo.add_node(node_state::normal, shard_count);
|
||||
[[maybe_unused]] auto host4 = topo.add_node(node_state::normal, shard_count);
|
||||
auto rack3 = topo.start_new_rack();
|
||||
[[maybe_unused]] auto host5 = topo.add_node(node_state::normal, shard_count);
|
||||
[[maybe_unused]] auto host6 = topo.add_node(node_state::normal, shard_count);
|
||||
auto dc2 = topo.start_new_dc().dc;
|
||||
[[maybe_unused]] auto host7 = topo.add_node(node_state::normal, shard_count);
|
||||
[[maybe_unused]] auto host8 = topo.add_node(node_state::normal, shard_count);
|
||||
|
||||
auto ks_name = add_keyspace(e, {{dc1, 2}}, 4);
|
||||
auto table1 = add_table(e, ks_name).get();
|
||||
|
||||
// rack1: host1: A D host2: C
|
||||
// rack2: host3: A host4: B
|
||||
// rack3: host5: C host6: B D
|
||||
tablet_id A{0}, B{0};
|
||||
mutate_tablets(e, [&] (tablet_metadata& tmeta) -> future<> {
|
||||
tablet_map tmap(4);
|
||||
auto tid = tmap.first_tablet();
|
||||
A = tid;
|
||||
tmap.set_tablet(tid, tablet_info { // A
|
||||
tablet_replica_set {
|
||||
tablet_replica{host1, 0},
|
||||
tablet_replica{host3, 0},
|
||||
}
|
||||
});
|
||||
tid = *tmap.next_tablet(tid);
|
||||
B = tid;
|
||||
tmap.set_tablet(tid, tablet_info { // B
|
||||
tablet_replica_set {
|
||||
tablet_replica{host4, 0},
|
||||
tablet_replica{host6, 0},
|
||||
}
|
||||
});
|
||||
tid = *tmap.next_tablet(tid);
|
||||
tmap.set_tablet(tid, tablet_info { // C
|
||||
tablet_replica_set {
|
||||
tablet_replica{host2, 0},
|
||||
tablet_replica{host5, 0},
|
||||
}
|
||||
});
|
||||
tid = *tmap.next_tablet(tid);
|
||||
tmap.set_tablet(tid, tablet_info { // D
|
||||
tablet_replica_set {
|
||||
tablet_replica{host1, 0},
|
||||
tablet_replica{host6, 0},
|
||||
}
|
||||
});
|
||||
tmeta.set_tablet_map(table1, std::move(tmap));
|
||||
co_return;
|
||||
});
|
||||
|
||||
auto id = utils::UUID_gen::get_time_UUID();
|
||||
// Build the map literal for CQL
|
||||
auto rf_change_data_cql = format("{{'replication:class': 'NetworkTopologyStrategy', 'replication:{}:0': '{}', 'replication:{}:1': '{}'}}",
|
||||
dc1, rack1.rack, dc1, rack3.rack);
|
||||
|
||||
e.execute_cql(format("INSERT INTO system.topology_requests (id, request_type, done, new_keyspace_rf_change_ks_name, new_keyspace_rf_change_data) VALUES ({}, 'keyspace_rf_change', False, '{}', {})",
|
||||
id, ks_name, rf_change_data_cql)).get();
|
||||
auto& stm = e.shared_token_metadata().local();
|
||||
auto& talloc = e.get_tablet_allocator().local();
|
||||
talloc.set_load_stats(topo.get_load_stats());
|
||||
auto& sys_ks = e.get_system_keyspace().local();
|
||||
auto& topology = e.get_topology_state_machine().local()._topology;
|
||||
topology.paused_rf_change_requests.insert(id);
|
||||
migration_plan plan = talloc.balance_tablets(stm.get(), &topology, &sys_ks).get();
|
||||
|
||||
BOOST_REQUIRE(!plan.empty());
|
||||
// A : host3 -> host5 / host6
|
||||
// B : host4 -> host1 / host2
|
||||
for (auto& mig : plan.migrations()) {
|
||||
testlog.info("Rack list colocation migration: {}", mig);
|
||||
BOOST_REQUIRE(mig.kind == locator::tablet_transition_kind::migration);
|
||||
BOOST_REQUIRE(mig.src.host == host3 || mig.src.host == host4);
|
||||
if (mig.src.host == host3) {
|
||||
BOOST_REQUIRE(mig.tablet.tablet == A);
|
||||
BOOST_REQUIRE(mig.dst.host == host5 || mig.dst.host == host6);
|
||||
} else {
|
||||
BOOST_REQUIRE(mig.tablet.tablet == B);
|
||||
BOOST_REQUIRE(mig.dst.host == host1 || mig.dst.host == host2);
|
||||
}
|
||||
}
|
||||
}).get();
|
||||
}
|
||||
|
||||
// Throws if tablets have more than 1 replica in a given rack.
|
||||
// Run in seastar thread.
|
||||
void check_no_rack_overload(const token_metadata& tm) {
|
||||
@@ -2035,6 +2167,63 @@ void check_rack_list(const locator::topology& topo, const tablet_map& tmap, sstr
|
||||
}).get();
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_rack_list_conversion_with_two_replicas_in_rack) {
|
||||
do_with_cql_env_thread([] (auto& e) {
|
||||
topology_builder topo(e);
|
||||
|
||||
unsigned shard_count = 1;
|
||||
auto dc1 = topo.dc();
|
||||
auto rack1 = topo.rack();
|
||||
[[maybe_unused]] auto host1 = topo.add_node(node_state::normal, shard_count);
|
||||
[[maybe_unused]] auto host2 = topo.add_node(node_state::normal, shard_count);
|
||||
auto rack2 = topo.start_new_rack();
|
||||
[[maybe_unused]] auto host3 = topo.add_node(node_state::normal, shard_count);
|
||||
[[maybe_unused]] auto host4 = topo.add_node(node_state::normal, shard_count);
|
||||
auto rack3 = topo.start_new_rack();
|
||||
[[maybe_unused]] auto host5 = topo.add_node(node_state::normal, shard_count);
|
||||
[[maybe_unused]] auto host6 = topo.add_node(node_state::normal, shard_count);
|
||||
|
||||
auto ks_name = add_keyspace(e, {{dc1, 2}}, 2);
|
||||
auto table1 = add_table(e, ks_name).get();
|
||||
|
||||
tablet_id A{0}, B{0};
|
||||
mutate_tablets(e, [&] (tablet_metadata& tmeta) -> future<> {
|
||||
tablet_map tmap(2);
|
||||
auto tid = tmap.first_tablet();
|
||||
A = tid;
|
||||
tmap.set_tablet(tid, tablet_info {
|
||||
tablet_replica_set {
|
||||
tablet_replica{host1, 0},
|
||||
tablet_replica{host2, 0},
|
||||
}
|
||||
});
|
||||
tid = *tmap.next_tablet(tid);
|
||||
B = tid;
|
||||
tmap.set_tablet(tid, tablet_info {
|
||||
tablet_replica_set {
|
||||
tablet_replica{host5, 0},
|
||||
tablet_replica{host6, 0},
|
||||
}
|
||||
});
|
||||
tmeta.set_tablet_map(table1, std::move(tmap));
|
||||
co_return;
|
||||
});
|
||||
|
||||
auto id = utils::UUID_gen::get_time_UUID();
|
||||
// Build the map literal for CQL
|
||||
auto rf_change_data_cql = format("{{'replication:class': 'NetworkTopologyStrategy', 'replication:{}:0': '{}', 'replication:{}:1': '{}'}}",
|
||||
dc1, rack1.rack, dc1, rack2.rack);
|
||||
|
||||
e.execute_cql(format("INSERT INTO system.topology_requests (id, request_type, done, new_keyspace_rf_change_ks_name, new_keyspace_rf_change_data) VALUES ({}, 'keyspace_rf_change', False, '{}', {})",
|
||||
id, ks_name, rf_change_data_cql)).get();
|
||||
auto& stm = e.shared_token_metadata().local();
|
||||
auto& topology = e.get_topology_state_machine().local()._topology;
|
||||
topology.paused_rf_change_requests.insert(id);
|
||||
rebalance_tablets(e);
|
||||
check_rack_list(stm.get()->get_topology(), stm.get()->tablets().get_tablet_map(table1), dc1, {rack1.rack, rack2.rack});
|
||||
}).get();
|
||||
}
|
||||
|
||||
struct alter_result {
|
||||
tablet_map new_tablet_map;
|
||||
replication_strategy_config_options opts;
|
||||
@@ -2940,14 +3129,14 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancer_shuffle_mode) {
|
||||
rebalance_tablets(e, &topo.get_shared_load_stats());
|
||||
|
||||
auto& stm = e.shared_token_metadata().local();
|
||||
BOOST_REQUIRE(e.get_tablet_allocator().local().balance_tablets(stm.get(), topo.get_load_stats()).get().empty());
|
||||
BOOST_REQUIRE(e.get_tablet_allocator().local().balance_tablets(stm.get(), nullptr, nullptr, topo.get_load_stats()).get().empty());
|
||||
|
||||
utils::get_local_injector().enable("tablet_allocator_shuffle");
|
||||
auto disable_injection = seastar::defer([&] {
|
||||
utils::get_local_injector().disable("tablet_allocator_shuffle");
|
||||
});
|
||||
|
||||
BOOST_REQUIRE(!e.get_tablet_allocator().local().balance_tablets(stm.get(), topo.get_load_stats()).get().empty());
|
||||
BOOST_REQUIRE(!e.get_tablet_allocator().local().balance_tablets(stm.get(), nullptr, nullptr,topo.get_load_stats()).get().empty());
|
||||
}).get();
|
||||
}
|
||||
#endif
|
||||
@@ -3073,7 +3262,7 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancer_disabling) {
|
||||
});
|
||||
|
||||
{
|
||||
auto plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), topo.get_load_stats()).get();
|
||||
auto plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), nullptr, nullptr, topo.get_load_stats()).get();
|
||||
BOOST_REQUIRE(!plan.empty());
|
||||
}
|
||||
|
||||
@@ -3084,7 +3273,7 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancer_disabling) {
|
||||
}).get();
|
||||
|
||||
{
|
||||
auto plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), topo.get_load_stats()).get();
|
||||
auto plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), nullptr, nullptr, topo.get_load_stats()).get();
|
||||
BOOST_REQUIRE(plan.empty());
|
||||
}
|
||||
|
||||
@@ -3094,7 +3283,7 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancer_disabling) {
|
||||
}).get();
|
||||
|
||||
{
|
||||
auto plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), topo.get_load_stats()).get();
|
||||
auto plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), nullptr, nullptr, topo.get_load_stats()).get();
|
||||
BOOST_REQUIRE(plan.empty());
|
||||
}
|
||||
|
||||
@@ -3105,7 +3294,7 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancer_disabling) {
|
||||
}).get();
|
||||
|
||||
{
|
||||
auto plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), topo.get_load_stats()).get();
|
||||
auto plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), nullptr, nullptr, topo.get_load_stats()).get();
|
||||
BOOST_REQUIRE(!plan.empty());
|
||||
}
|
||||
|
||||
@@ -3115,7 +3304,7 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancer_disabling) {
|
||||
}).get();
|
||||
|
||||
{
|
||||
auto plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), topo.get_load_stats()).get();
|
||||
auto plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), nullptr, nullptr, topo.get_load_stats()).get();
|
||||
BOOST_REQUIRE(!plan.empty());
|
||||
}
|
||||
}).get();
|
||||
@@ -3147,7 +3336,7 @@ SEASTAR_THREAD_TEST_CASE(test_drained_node_is_not_balanced_internally) {
|
||||
co_return;
|
||||
});
|
||||
|
||||
migration_plan plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), topo.get_load_stats()).get();
|
||||
migration_plan plan = e.get_tablet_allocator().local().balance_tablets(stm.get(), nullptr, nullptr, topo.get_load_stats()).get();
|
||||
BOOST_REQUIRE(plan.has_nodes_to_drain());
|
||||
for (auto&& mig : plan.migrations()) {
|
||||
BOOST_REQUIRE(mig.kind != tablet_transition_kind::intranode_migration);
|
||||
@@ -4751,7 +4940,7 @@ SEASTAR_THREAD_TEST_CASE(test_ensure_node_for_load_sketch) {
|
||||
|
||||
auto& talloc = e.get_tablet_allocator().local();
|
||||
auto& stm = e.shared_token_metadata().local();
|
||||
talloc.balance_tablets(stm.get(), topo.get_shared_load_stats().get()).get();
|
||||
talloc.balance_tablets(stm.get(), nullptr, nullptr, topo.get_shared_load_stats().get()).get();
|
||||
}).get();
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
#
|
||||
# Copyright (C) 2025-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
#
|
||||
|
||||
import time
|
||||
import asyncio
|
||||
import logging
|
||||
import pytest
|
||||
from test.pylib.rest_client import read_barrier, get_host_api_address
|
||||
from test.pylib.util import unique_name, wait_for_cql_and_get_hosts
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.auth_cluster import extra_scylla_config_options as auth_config
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def __test_attach_service_level_to_user(request, manager: ManagerClient, is_raft: bool):
|
||||
user = f"test_user_{unique_name()}"
|
||||
|
||||
# Start nodes with correct topology
|
||||
if is_raft:
|
||||
servers = await manager.servers_add(3, config=auth_config)
|
||||
else:
|
||||
conf = {**auth_config, 'force_gossip_topology_changes': True, 'tablets_mode_for_new_keyspaces': 'disabled'}
|
||||
servers = [await manager.server_add(config=conf) for _ in range(3)]
|
||||
|
||||
cql = manager.get_cql()
|
||||
logging.info("Waiting until driver connects to every server")
|
||||
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
|
||||
ips = [get_host_api_address(host) for host in hosts]
|
||||
|
||||
logging.info("Creating User")
|
||||
await cql.run_async(f"CREATE ROLE {user} WITH login = true AND password='{user}' AND superuser = true")
|
||||
|
||||
connections = await cql.run_async(f"SELECT username, scheduling_group, shard_id FROM system.clients WHERE client_type='cql' AND username='{user}' ALLOW FILTERING")
|
||||
|
||||
verify_service_level = lambda sl : all([conn.scheduling_group == sl for conn in connections])
|
||||
assert verify_service_level("default"), "All connections should be in default service level"
|
||||
|
||||
logging.info("Creating service levels")
|
||||
sls = ["sl" + unique_name() for _ in range(2)]
|
||||
for i, sl in enumerate(sls):
|
||||
await cql.run_async(f"CREATE SERVICE LEVEL {sl} WITH shares = {100 * (i+1)}")
|
||||
|
||||
logging.info("Attach Service Levels to user")
|
||||
for sl in sls:
|
||||
await cql.run_async(f"ATTACH SERVICE LEVEL {sl} TO {user}")
|
||||
|
||||
#if we are not using raft we have to switch the tenant and wait for it to take effect
|
||||
if not is_raft:
|
||||
for ip in ips:
|
||||
await manager.api.client.post('/service_levels/switch_tenants', host=ip)
|
||||
# Switching tenants may be blocked if a connection is waiting for a request (see 'generic_server::connection::process_until_tenant_switch()').
|
||||
# Execute enough cheap statements, so that connection on each shard will process at one statement and update its tenant.
|
||||
for _ in range(100):
|
||||
read_barrier(manager.api, ip)
|
||||
|
||||
assert verify_service_level(sl), f"All connections should be in {sl} service level"
|
||||
await cql.run_async(f"DETACH SERVICE LEVEL FROM {user}")
|
||||
|
||||
await cql.run_async(f"DROP ROLE {user}")
|
||||
for sl in sls:
|
||||
await cql.run_async(f"DROP SERVICE LEVEL {sl}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_attach_service_level_with_raft(request, manager: ManagerClient):
|
||||
await __test_attach_service_level_to_user(request, manager, is_raft=True)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_attach_service_level_with_gossip(request, manager: ManagerClient):
|
||||
await __test_attach_service_level_to_user(request, manager, is_raft=False)
|
||||
@@ -604,18 +604,14 @@ async def test_driver_service_creation_failure(manager: ManagerClient) -> None:
|
||||
service_level_names = [sl.service_level for sl in service_levels]
|
||||
assert "driver" not in service_level_names
|
||||
|
||||
def get_processed_tasks_for_group(metrics, group):
|
||||
res = metrics.get("scylla_scheduler_tasks_processed", {'group': group})
|
||||
if res is None:
|
||||
return 0
|
||||
return res
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def _verify_tasks_processed_metrics(manager, server, used_group, unused_group, func):
|
||||
number_of_requests = 1000
|
||||
number_of_requests = 3000
|
||||
|
||||
def get_processed_tasks_for_group(metrics, group):
|
||||
res = metrics.get("scylla_scheduler_tasks_processed", {'group': group})
|
||||
logger.info(f"group={group}, tasks_processed={res}")
|
||||
|
||||
if res is None:
|
||||
return 0
|
||||
return res
|
||||
@@ -627,8 +623,10 @@ async def _verify_tasks_processed_metrics(manager, server, used_group, unused_gr
|
||||
await asyncio.gather(*[asyncio.to_thread(func) for i in range(number_of_requests)])
|
||||
|
||||
metrics = await manager.metrics.query(server.ip_addr)
|
||||
assert get_processed_tasks_for_group(metrics, used_group) - initial_tasks_processed_by_used_group > number_of_requests
|
||||
assert get_processed_tasks_for_group(metrics, unused_group) - initial_tasks_processed_by_unused_group < number_of_requests
|
||||
tasks_processed_by_used_group = get_processed_tasks_for_group(metrics, used_group)
|
||||
tasks_processed_by_unused_group = get_processed_tasks_for_group(metrics, unused_group)
|
||||
assert tasks_processed_by_used_group - initial_tasks_processed_by_used_group > number_of_requests
|
||||
assert tasks_processed_by_unused_group - initial_tasks_processed_by_unused_group < number_of_requests
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_driver_service_level_not_used_for_user_queries(manager: ManagerClient) -> None:
|
||||
|
||||
@@ -52,6 +52,18 @@ KNOWN_LOG_LEVELS = {
|
||||
"OFF": "info",
|
||||
}
|
||||
|
||||
# Captures the aggregate metric before the "[READ ..., WRITE ...]" block.
|
||||
STRESS_SUMMARY_PATTERN = re.compile(r'^\s*([\d\.\,]+\d?)\s*\[.*')
|
||||
|
||||
# Extracts the READ metric number inside the "[READ ..., WRITE ...]" block.
|
||||
STRESS_READ_PATTERN = re.compile(r'.*READ:\s*([\d\.\,]+\d?)[^\d].*')
|
||||
|
||||
# Extracts the WRITE metric number inside the "[READ ..., WRITE ...]" block.
|
||||
STRESS_WRITE_PATTERN = re.compile(r'.*WRITE:\s*([\d\.\,]+\d?)[^\d].*')
|
||||
|
||||
# Splits a "key : value" line into key and value.
|
||||
STRESS_KEY_VALUE_PATTERN = re.compile(r'^\s*([^:]+)\s*:\s*(\S.*)\s*$')
|
||||
|
||||
|
||||
class NodeError(Exception):
|
||||
def __init__(self, msg: str, process: int | None = None):
|
||||
@@ -528,6 +540,15 @@ class ScyllaNode:
|
||||
return self.cluster.manager.server_get_workdir(server_id=self.server_id)
|
||||
|
||||
def stress(self, stress_options: list[str], **kwargs):
|
||||
"""
|
||||
Run `cassandra-stress` against this node.
|
||||
This method does not do any result parsing.
|
||||
|
||||
:param stress_options: List of options to pass to `cassandra-stress`.
|
||||
:param kwargs: Additional arguments to pass to `subprocess.Popen()`.
|
||||
:return: Named tuple with `stdout`, `stderr`, and `rc` (return code).
|
||||
"""
|
||||
|
||||
cmd_args = ["cassandra-stress"] + stress_options
|
||||
|
||||
if not any(opt in cmd_args for opt in ("-d", "-node", "-cloudconf")):
|
||||
@@ -549,6 +570,73 @@ class ScyllaNode:
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
|
||||
|
||||
def _set_stress_val(self, key, val, res):
|
||||
"""
|
||||
Normalize a stress result string and populate aggregate/read/write metrics.
|
||||
|
||||
Removes comma-thousands separators from numbers, converts to float,
|
||||
stores the aggregate metric under `key`.
|
||||
If the value contains a "[READ ..., WRITE ...]" block, also stores the
|
||||
read and write metrics under `key:read` and `key:write`.
|
||||
|
||||
:param key: The metric name
|
||||
:param val: The metric value string
|
||||
:param res: The dictionary to populate
|
||||
"""
|
||||
|
||||
def parse_num(s):
|
||||
return float(s.replace(',', ''))
|
||||
|
||||
if "[" in val:
|
||||
p = STRESS_SUMMARY_PATTERN
|
||||
m = p.match(val)
|
||||
if m:
|
||||
res[key] = parse_num(m.group(1))
|
||||
p = STRESS_READ_PATTERN
|
||||
m = p.match(val)
|
||||
if m:
|
||||
res[key + ":read"] = parse_num(m.group(1))
|
||||
p = STRESS_WRITE_PATTERN
|
||||
m = p.match(val)
|
||||
if m:
|
||||
res[key + ":write"] = parse_num(m.group(1))
|
||||
else:
|
||||
try:
|
||||
res[key] = parse_num(val)
|
||||
except ValueError:
|
||||
res[key] = val
|
||||
|
||||
|
||||
def stress_object(self, stress_options=None, ignore_errors=None, **kwargs):
|
||||
"""
|
||||
Run stress test and return results as a structured metrics dictionary.
|
||||
|
||||
Runs `stress()`, finds the `Results:` section in `stdout`, and then
|
||||
processes each `key : value` line, putting it into a dictionary.
|
||||
|
||||
:param stress_options: List of stress options to pass to `stress()`.
|
||||
:param ignore_errors: Deprecated (no effect).
|
||||
:param kwargs: Additional arguments to pass to `stress()`.
|
||||
:return: Dictionary of stress test results.
|
||||
"""
|
||||
if ignore_errors:
|
||||
self.warning("passing `ignore_errors` to stress_object() is deprecated")
|
||||
ret = self.stress(stress_options, **kwargs)
|
||||
p = STRESS_KEY_VALUE_PATTERN
|
||||
res = {}
|
||||
start = False
|
||||
for line in (s.strip() for s in ret.stdout.splitlines()):
|
||||
if start:
|
||||
m = p.match(line)
|
||||
if m:
|
||||
self._set_stress_val(m.group(1).strip().lower(), m.group(2).strip(), res)
|
||||
else:
|
||||
if line == 'Results:':
|
||||
start = True
|
||||
return res
|
||||
|
||||
|
||||
def flush(self, ks: str | None = None, table: str | None = None, **kwargs) -> None:
|
||||
cmd = ["flush"]
|
||||
if ks:
|
||||
|
||||
690
test/cluster/dtest/schema_management_test.py
Normal file
690
test/cluster/dtest/schema_management_test.py
Normal file
@@ -0,0 +1,690 @@
|
||||
#
|
||||
# Copyright (C) 2015-present The Apache Software Foundation
|
||||
# Copyright (C) 2025-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
#
|
||||
|
||||
import functools
|
||||
import logging
|
||||
import string
|
||||
import threading
|
||||
import time
|
||||
from concurrent import futures
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
from cassandra import AlreadyExists, ConsistencyLevel, InvalidRequest
|
||||
from cassandra.concurrent import execute_concurrent_with_args
|
||||
from cassandra.query import SimpleStatement, dict_factory
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from dtest_class import Tester, create_cf, create_ks, read_barrier
|
||||
from tools.assertions import assert_all, assert_invalid
|
||||
from tools.cluster_topology import generate_cluster_topology
|
||||
from tools.data import create_c1c2_table, insert_c1c2, query_c1c2, rows_to_list
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class TestSchemaManagement(Tester):
|
||||
def prepare(self, racks_num: int, has_config: bool = True):
|
||||
cluster = self.cluster
|
||||
cluster_topology = generate_cluster_topology(rack_num=racks_num)
|
||||
|
||||
if has_config:
|
||||
config = {
|
||||
"ring_delay_ms": 5000,
|
||||
}
|
||||
cluster.set_configuration_options(values=config)
|
||||
|
||||
cluster.populate(cluster_topology)
|
||||
cluster.start(wait_other_notice=True)
|
||||
|
||||
return cluster
|
||||
|
||||
|
||||
def test_prepared_statements_work_after_node_restart_after_altering_schema_without_changing_columns(self):
|
||||
cluster = self.prepare(racks_num=3)
|
||||
|
||||
[node1, node2, node3] = cluster.nodelist()
|
||||
|
||||
session = self.patient_cql_connection(node1)
|
||||
|
||||
logger.debug("Creating schema...")
|
||||
create_ks(session, "ks", 3)
|
||||
session.execute(
|
||||
"""
|
||||
CREATE TABLE users (
|
||||
id int,
|
||||
firstname text,
|
||||
lastname text,
|
||||
PRIMARY KEY (id)
|
||||
);
|
||||
"""
|
||||
)
|
||||
|
||||
insert_statement = session.prepare("INSERT INTO users (id, firstname, lastname) VALUES (?, 'A', 'B')")
|
||||
insert_statement.consistency_level = ConsistencyLevel.ALL
|
||||
session.execute(insert_statement, [0])
|
||||
|
||||
logger.debug("Altering schema")
|
||||
session.execute("ALTER TABLE users WITH comment = 'updated'")
|
||||
|
||||
logger.debug("Restarting node2")
|
||||
node2.stop(gently=True)
|
||||
node2.start(wait_for_binary_proto=True)
|
||||
|
||||
logger.debug("Restarting node3")
|
||||
node3.stop(gently=True)
|
||||
node3.start(wait_for_binary_proto=True, wait_other_notice=True)
|
||||
|
||||
n_partitions = 20
|
||||
for i in range(n_partitions):
|
||||
session.execute(insert_statement, [i])
|
||||
|
||||
rows = session.execute("SELECT * FROM users")
|
||||
res = sorted(rows)
|
||||
assert len(res) == n_partitions
|
||||
for i in range(n_partitions):
|
||||
expected = [i, "A", "B"]
|
||||
assert list(res[i]) == expected, f"Expected {expected}, got {res[i]}"
|
||||
|
||||
def test_dropping_keyspace_with_many_columns(self):
|
||||
"""
|
||||
Exploits https://github.com/scylladb/scylla/issues/1484
|
||||
"""
|
||||
cluster = self.prepare(racks_num=1, has_config=False)
|
||||
|
||||
node1 = cluster.nodelist()[0]
|
||||
session = self.patient_cql_connection(node1)
|
||||
|
||||
session.execute("CREATE KEYSPACE testxyz WITH replication = { 'class' : 'NetworkTopologyStrategy', 'replication_factor' : 1 }")
|
||||
for i in range(8):
|
||||
session.execute(f"CREATE TABLE testxyz.test_{i} (k int, c int, PRIMARY KEY (k),)")
|
||||
session.execute("drop keyspace testxyz")
|
||||
|
||||
for node in cluster.nodelist():
|
||||
s = self.patient_cql_connection(node)
|
||||
s.execute("CREATE KEYSPACE testxyz WITH replication = { 'class' : 'NetworkTopologyStrategy', 'replication_factor' : 1 }")
|
||||
s.execute("drop keyspace testxyz")
|
||||
|
||||
def test_multiple_create_table_in_parallel(self):
    """
    Run multiple create table statements via different nodes
    1. Create a cluster of 3 nodes
    2. Run create table with different table names in parallel - check all complete
    3. Run create table with the same table name in parallel - check if they complete
    """
    logger.debug("1. Create a cluster of 3 nodes")
    nodes_count = 3
    cluster = self.prepare(racks_num=nodes_count)
    # One exclusive session per node so each CREATE TABLE goes through a different coordinator
    sessions = [self.patient_exclusive_cql_connection(node) for node in cluster.nodelist()]
    ks = "ks"
    create_ks(sessions[0], ks, nodes_count)

    def create_table(session, table_name):
        # Helper run concurrently from the thread pools below
        create_statement = f"CREATE TABLE {ks}.{table_name} (p int PRIMARY KEY, c0 text, c1 text, c2 text, c3 text, c4 text, c5 text, c6 text, c7 text, c8 text, c9 text);"
        logger.debug(f"create_statement {create_statement}")
        session.execute(create_statement)

    logger.debug("2. Run create table with different table names in parallel - check all complete")
    step2_tables = [f"t{i}" for i in range(nodes_count)]
    with ThreadPoolExecutor(max_workers=nodes_count) as executor:
        list(executor.map(create_table, sessions, step2_tables))

    # Every new table must be usable for writes and reads at CL=ALL
    for table in step2_tables:
        sessions[0].execute(SimpleStatement(f"INSERT INTO {ks}.{table} (p) VALUES (1)", consistency_level=ConsistencyLevel.ALL))
        rows = sessions[0].execute(SimpleStatement(f"SELECT * FROM {ks}.{table}", consistency_level=ConsistencyLevel.ALL))
        assert len(rows_to_list(rows)) == 1, f"Expected 1 row but got rows:{rows} instead"

    logger.debug("3. Run create table with the same table name in parallel - check if they complete")
    step3_table = "test"
    step3_tables = [step3_table for i in range(nodes_count)]
    with ThreadPoolExecutor(max_workers=nodes_count) as executor:
        res_futures = [executor.submit(create_table, *args) for args in zip(sessions, step3_tables)]
        for res_future in res_futures:
            try:
                res_future.result()
            except AlreadyExists as e:
                # Concurrent identical CREATEs race; losers legitimately see AlreadyExists
                logger.info(f"expected cassandra.AlreadyExists error {e}")

    sessions[0].execute(SimpleStatement(f"INSERT INTO {ks}.{step3_table} (p) VALUES (1)", consistency_level=ConsistencyLevel.ALL))
    sessions[0].execute(f"SELECT * FROM {ks}.{step3_table}")
    rows = sessions[0].execute(SimpleStatement(f"SELECT * FROM {ks}.{step3_table}", consistency_level=ConsistencyLevel.ALL))
    assert len(rows_to_list(rows)) == 1, f"Expected 1 row but got rows:{rows} instead"
|
||||
|
||||
@pytest.mark.parametrize("case", ("write", "read", "mixed"))
def test_alter_table_in_parallel_to_read_and_write(self, case):
    """
    Create a table and write into while altering the table
    1. Create a cluster of 3 nodes and populate a table
    2. Run write/read/read_and_write" statement in a loop
    3. Alter table while inserts are running
    """
    logger.debug("1. Create a cluster of 3 nodes and populate a table")
    cluster = self.prepare(racks_num=3)
    col_number = 20

    [node1, node2, node3] = cluster.nodelist()
    session = self.patient_exclusive_cql_connection(node1)

    def run_stress(stress_type, col=col_number - 2):
        # Default column count is col_number - 2: after the ALTER below drops two
        # columns, later stress runs must match the reduced schema
        node2.stress_object([stress_type, "n=10000", "cl=QUORUM", "-schema", "replication(factor=3)", "-col", f"n=FIXED({col})", "-rate", "threads=1"])

    logger.debug("Populate")
    run_stress("write", col_number)

    with ThreadPoolExecutor(max_workers=1) as executor:
        logger.debug(f"2. Run {case} statement in a loop")
        statement_future = executor.submit(functools.partial(run_stress, case))

        logger.debug(f"let's {case} statement work some time")
        time.sleep(2)

        logger.debug("3. Alter table while inserts are running")
        alter_statement = f'ALTER TABLE keyspace1.standard1 DROP ("C{col_number - 1}", "C{col_number - 2}")'
        logger.debug(f"alter_statement {alter_statement}")
        alter_result = session.execute(alter_statement)
        logger.debug(alter_result.all())

        logger.debug(f"wait till {case} statement finished")
        statement_future.result()

    # key column + (col_number - 2) remaining data columns == col_number - 1 columns
    rows = session.execute(SimpleStatement("SELECT * FROM keyspace1.standard1 LIMIT 1;", consistency_level=ConsistencyLevel.ALL))
    assert len(rows_to_list(rows)[0]) == col_number - 1, f"Expected {col_number - 1} columns but got rows:{rows} instead"

    logger.debug("read and check data")
    run_stress("read")
|
||||
|
||||
@pytest.mark.skip("unimplemented")
def commitlog_replays_after_schema_change(self):
    """
    Commitlog can be replayed even though schema has been changed
    1. Create a table and insert data
    2. Alter table
    3. Kill node
    4. Boot node and verify that commitlog have been replayed and that all data is restored
    """
    # Placeholder scenario — intentionally unimplemented (note: the name lacks the
    # test_ prefix, so pytest would not collect it even without the skip marker)
    raise NotImplementedError
|
||||
|
||||
@pytest.mark.parametrize("case", ("create_table", "alter_table", "drop_table"))
def test_update_schema_while_node_is_killed(self, case):
    """
    Check that a node that is killed during a table creation/alter/drop is able
    to rejoin and to synch on schema.

    :param case: which DDL operation runs concurrently with the node kill
    """

    logger.debug("1. Create a cluster and insert data")
    cluster = self.prepare(racks_num=3)

    [node1, node2, node3] = cluster.nodelist()

    session = self.patient_cql_connection(node1)

    def create_table_case():
        # Create ks.cf and populate it with 10 rows
        try:
            logger.debug("Creating table")
            create_c1c2_table(session)
            logger.debug("Populating")
            insert_c1c2(session, n=10)
        except AlreadyExists:
            # the CQL command can be called multiple times in case of retries
            pass

    def alter_table_case():
        try:
            session.execute("ALTER TABLE ks.cf ADD (c3 text);", timeout=180)
        except InvalidRequest as exc:
            # the CQL command can be called multiple times in case of retries
            assert "Invalid column name c3" in str(exc)

    def drop_table_case():
        try:
            session.execute("DROP TABLE cf;", timeout=180)
        except InvalidRequest as exc:
            # the CQL command can be called multiple times in case of retries
            assert "Cannot drop non existing table" in str(exc)

    logger.debug("Creating keyspace")
    create_ks(session, "ks", 3)
    if case != "create_table":
        # alter/drop need the table to exist before the concurrent DDL runs
        create_table_case()

    case_map = {
        "create_table": create_table_case,
        "alter_table": alter_table_case,
        "drop_table": drop_table_case,
    }
    with ThreadPoolExecutor(max_workers=1) as executor:
        logger.debug(f"2. kill node during {case}")
        kill_node_future = executor.submit(node2.stop, gently=False, wait_other_notice=True)
        case_map[case]()
        kill_node_future.result()

    logger.debug("3. Start the stopped node2")
    node2.start(wait_for_binary_proto=True)

    # Reconnect through the restarted node and wait for it to catch up on schema
    session = self.patient_exclusive_cql_connection(node2)
    read_barrier(session)

    def create_or_alter_table_expected_result(col_mun):
        # Verify the column count of ks.cf and that all 10 populated rows are readable
        rows = session.execute(SimpleStatement("SELECT * FROM ks.cf LIMIT 1;", consistency_level=ConsistencyLevel.QUORUM))
        assert len(rows_to_list(rows)[0]) == col_mun, f"Expected {col_mun} columns but got rows:{rows} instead"
        for key in range(10):
            query_c1c2(session=session, key=key, consistency=ConsistencyLevel.QUORUM)

    expected_case_result_map = {
        "create_table": functools.partial(create_or_alter_table_expected_result, 3),
        "alter_table": functools.partial(create_or_alter_table_expected_result, 4),
        # Fixed: verify the table that was actually dropped (ks.cf), not the unrelated
        # name "test1" — querying a never-existing table made this check pass trivially
        # regardless of whether the DROP propagated
        "drop_table": functools.partial(assert_invalid, session, "SELECT * FROM ks.cf"),
    }
    logger.debug("verify that commitlog has been replayed and that all data is restored")
    expected_case_result_map[case]()
|
||||
|
||||
@pytest.mark.parametrize("is_gently_stop", [True, False])
def test_nodes_rejoining_a_cluster_synch_on_schema(self, is_gently_stop):
    """
    Nodes rejoining the cluster synch on schema changes
    1. Create a cluster and insert data
    2. Stop a node
    3. Alter table
    4. Insert additional data
    5. Start the stopped node
    6. Verify the stopped node synchs on the updated schema

    :param is_gently_stop: whether the node is stopped gracefully or killed
    """

    logger.debug("1. Create a cluster and insert data")
    cluster = self.prepare(racks_num=3)

    [node1, node2, node3] = cluster.nodelist()

    session = self.patient_cql_connection(node1)

    logger.debug("Creating schema")
    create_ks(session, "ks", 3)
    # NOTE(review): create_c1c2_table and create_cf both appear to target a table
    # named 'cf' — confirm these two helpers do not conflict
    create_c1c2_table(session)
    create_cf(session, "cf", key_name="p", key_type="int", columns={"v": "text"})

    logger.debug("Populating")
    insert_c1c2(session, n=10, consistency=ConsistencyLevel.ALL)

    logger.debug("2 Stop a node1")
    node1.stop(gently=is_gently_stop, wait_other_notice=True)

    logger.debug("3 Alter table")
    # node1 is down, so drive the schema change through node2
    session = self.patient_cql_connection(node2)
    session.execute("ALTER TABLE ks.cf ADD (c3 text);", timeout=180)

    logger.debug("4 Insert additional data")
    session.execute(SimpleStatement("INSERT INTO ks.cf (key, c1, c2, c3) VALUES ('test', 'test', 'test', 'test')", consistency_level=ConsistencyLevel.QUORUM))

    logger.debug("5. Start the stopped node1")
    node1.start(wait_for_binary_proto=True)

    logger.debug("6. Verify the stopped node synchs on the updated schema")
    session = self.patient_exclusive_cql_connection(node1)
    # Barrier ensures node1 has applied the schema changes made while it was down
    read_barrier(session)

    rows = session.execute(SimpleStatement("SELECT * FROM ks.cf WHERE key='test'", consistency_level=ConsistencyLevel.ALL))
    expected = [["test", "test", "test", "test"]]
    assert rows_to_list(rows) == expected, f"Expected {expected} but got {rows} instead"
    for key in range(10):
        query_c1c2(session=session, key=key, consistency=ConsistencyLevel.ALL)
|
||||
|
||||
def test_reads_schema_recreated_while_node_down(self):
    """
    Drop and re-create a table with a different column set while one node is
    down; after that node restarts, CL=ALL reads must see the new, empty table.
    """
    cluster = self.prepare(racks_num=3)

    [node1, node2, node3] = cluster.nodelist()

    session = self.patient_cql_connection(node1)

    logger.debug("Creating schema")
    create_ks(session, "ks", 3)
    session.execute("CREATE TABLE cf (p int PRIMARY KEY, v text);")

    logger.debug("Populating")
    session.execute(SimpleStatement("INSERT INTO cf (p, v) VALUES (1, '1')", consistency_level=ConsistencyLevel.ALL))

    logger.debug("Stopping node2")
    node2.stop(gently=True)

    logger.debug("Re-creating schema")
    # node2 is down while cf is dropped and re-created with a different schema
    session.execute("DROP TABLE cf;")
    session.execute("CREATE TABLE cf (p int PRIMARY KEY, v1 bigint, v2 text);")

    logger.debug("Restarting node2")
    node2.start(wait_for_binary_proto=True)
    session2 = self.patient_cql_connection(node2)
    # Ensure node2 has caught up on the schema changes made while it was down
    read_barrier(session2)

    # The re-created table must be empty: the pre-drop row must not resurface
    rows = session.execute(SimpleStatement("SELECT * FROM cf", consistency_level=ConsistencyLevel.ALL))
    assert rows_to_list(rows) == [], f"Expected an empty result set, got {rows}"
|
||||
|
||||
def test_writes_schema_recreated_while_node_down(self):
    """
    Drop and re-create a table (same column set) while one node is down; after
    that node restarts, CL=ALL writes must succeed and only post-recreate data
    must be visible.
    """
    cluster = self.prepare(racks_num=3)

    [node1, node2, node3] = cluster.nodelist()

    session = self.patient_cql_connection(node1)

    logger.debug("Creating schema")
    create_ks(session, "ks", 3)
    session.execute("CREATE TABLE cf (p int PRIMARY KEY, v text);")

    logger.debug("Populating")
    session.execute(SimpleStatement("INSERT INTO cf (p, v) VALUES (1, '1')", consistency_level=ConsistencyLevel.ALL))

    logger.debug("Stopping node2")
    node2.stop(gently=True, wait_other_notice=True)

    logger.debug("Re-creating schema")
    # node2 is down while cf is dropped and re-created
    session.execute("DROP TABLE cf;")
    session.execute("CREATE TABLE cf (p int PRIMARY KEY, v text);")

    logger.debug("Restarting node2")
    node2.start(wait_for_binary_proto=True)
    session2 = self.patient_cql_connection(node2)
    # Ensure node2 has caught up on the schema changes made while it was down
    read_barrier(session2)

    # CL=ALL requires node2 to accept the write against the re-created table
    session.execute(SimpleStatement("INSERT INTO cf (p, v) VALUES (2, '2')", consistency_level=ConsistencyLevel.ALL))

    # Only the post-recreate row must exist; the pre-drop row must not resurface
    rows = session.execute(SimpleStatement("SELECT * FROM cf", consistency_level=ConsistencyLevel.ALL))
    expected = [[2, "2"]]
    assert rows_to_list(rows) == expected, f"Expected {expected}, got {rows_to_list(rows)}"
|
||||
|
||||
|
||||
class TestLargePartitionAlterSchema(Tester):
    # Issue scylladb/scylla: #5135:
    #
    # Issue: Cache reads may miss some writes if schema alter followed by a read happened concurrently with preempted
    # partition entry update
    # Affects only tables with multi-row partitions, which are the only ones that can experience the update of partition
    # entry being preempted.
    #
    # The scenario in which the problem could have happened has to involve:
    # - a large partition with many rows, large enough for preemption (every 0.5ms) to happen during the scan of the partition.
    # - appending writes to the partition (not overwrites)
    # - scans of the partition
    # - schema alter of that table. The issue is exposed only by adding or dropping a column, such that the added/dropped
    #   column lands in the middle (in alphabetical order) of the old column set.
    #
    # Memtable flush has to happen after a schema alter concurrently with a read.
    #
    # The bug could result in cache corruption which manifests as some past writes being missing (not visible to reads).

    # Number of distinct partition keys written by populate()
    PARTITIONS = 50
    # Payload stored in both text columns of every row
    STRING_VALUE = string.ascii_lowercase
|
||||
|
||||
def prepare(self, cluster_topology: dict[str, dict[str, int]], rf: int):
    """
    Start the cluster (first call only) with the given topology and create the
    test schema.

    :param cluster_topology: topology mapping passed to cluster.populate()
    :param rf: replication factor for the test keyspace
    :return: a CQL session connected to the first node
    """
    if not self.cluster.nodelist():
        # Populate/start only once; subsequent calls reuse the running cluster
        self.cluster.populate(cluster_topology)
        self.cluster.start(wait_other_notice=True)

    node1 = self.cluster.nodelist()[0]
    session = self.patient_cql_connection(node=node1)
    self.create_schema(session=session, rf=rf)

    return session
|
||||
|
||||
def create_schema(self, session, rf):
    """
    Create keyspace 'ks' with the given replication factor and the
    multi-row-partition table lp_table used by the tests in this class.
    """
    logger.debug("Creating schema")
    create_ks(session=session, name="ks", rf=rf)

    session.execute(
        """
        CREATE TABLE lp_table (
            pk int,
            ck1 int,
            val1 text,
            val2 text,
            PRIMARY KEY (pk, ck1)
        );
        """
    )
|
||||
|
||||
def populate(self, session, data, ck_start, ck_end=None, stop_populating: threading.Event = None):
    """
    Insert rows (pk, ck, STRING_VALUE, STRING_VALUE) into lp_table for every pk
    in range(PARTITIONS), advancing ck from ck_start until ck_end is reached or
    stop_populating is set. Each written row is also appended to `data`.

    :param session: CQL session used for the prepared inserts
    :param data: list accumulating the expected rows (mutated in place)
    :param ck_start: first clustering key value to write
    :param ck_end: exclusive upper bound for ck; None means "run until stopped"
    :param stop_populating: event that terminates an unbounded population loop
    :return: the (mutated) `data` list
    """
    ck = ck_start

    def _populate_loop():
        # Lazy generator of insert parameters, consumed by
        # execute_concurrent_with_args below; updates the enclosing ck counter.
        nonlocal ck
        while True:
            if stop_populating is not None and stop_populating.is_set():
                return
            if ck_end is not None and ck >= ck_end:
                return
            for pk in range(self.PARTITIONS):
                row = [pk, ck, self.STRING_VALUE, self.STRING_VALUE]
                data.append(row)
                yield tuple(row)
            ck += 1

    logger.debug(f"Start populate DB: {self.PARTITIONS} partitions with {ck_end - ck_start if ck_end else 'infinite'} records in each partition")

    parameters = _populate_loop()

    stmt = session.prepare("INSERT INTO lp_table (pk, ck1, val1, val2) VALUES (?, ?, ?, ?)")

    execute_concurrent_with_args(session=session, statement=stmt, parameters=parameters, concurrency=100)
    # Bug fix: compute the count only after the generator has been fully consumed.
    # Previously this was computed before execute_concurrent_with_args ran, so the
    # "Finish populate" message always reported 0 records written.
    records_written = ck - ck_start
    logger.debug(f"Finish populate DB: {self.PARTITIONS} partitions with {records_written} records in each partition")
    return data
|
||||
|
||||
def read(self, session, ck_max, stop_reading: threading.Event = None):
    """
    Point-read every (pk, ck1) pair with ck1 < ck_max across all partitions.

    With a stop_reading event supplied, sweeps repeatedly until the event is
    set; with stop_reading=None a single full sweep is performed.
    """
    def _read_loop():
        while True:
            for ck in range(ck_max):
                for pk in range(self.PARTITIONS):
                    if stop_reading is not None and stop_reading.is_set():
                        return
                    session.execute(f"select * from lp_table where pk = {pk} and ck1 = {ck}")
            if stop_reading is None:
                # One full sweep is enough when no stop event is supplied
                return

    logger.debug(f"Start reading..")
    _read_loop()
    logger.debug(f"Finish reading..")
|
||||
|
||||
def add_column(self, session, column_name, column_type):
    """Add a column of the given CQL type to lp_table via ALTER TABLE."""
    statement = f"ALTER TABLE lp_table ADD {column_name} {column_type}"
    logger.debug(f"Add {column_name} column")
    session.execute(statement)
|
||||
|
||||
def drop_column(self, session, column_name):
    """Drop the named column from lp_table via ALTER TABLE."""
    statement = f"ALTER TABLE lp_table DROP {column_name}"
    logger.debug(f"Drop {column_name} column")
    session.execute(statement)
|
||||
|
||||
def test_large_partition_with_add_column(self):
    """
    Reproducer for scylladb/scylla#5135 (ADD COLUMN variant): alter the table
    while large partitions are concurrently written and read, flush, then
    verify no previously written rows are missing.
    """
    cluster_topology = generate_cluster_topology()
    session = self.prepare(cluster_topology, rf=1)
    data = self.populate(session=session, data=[], ck_start=0, ck_end=10)

    threads = []
    timeout = 300
    ck_end = 5000
    if self.cluster.scylla_mode == "debug":
        # Debug builds are much slower — shrink the read range and extend the deadline
        timeout = 900
        ck_end = 500
    with ThreadPoolExecutor(max_workers=2) as executor:
        stop_populating = threading.Event()
        stop_reading = threading.Event()
        # Insert new rows in background
        threads.append(executor.submit(self.populate, session=session, data=data, ck_start=10, ck_end=None, stop_populating=stop_populating))
        threads.append(executor.submit(self.read, session=session, ck_max=ck_end, stop_reading=stop_reading))
        # Wait for running load
        time.sleep(10)
        self.add_column(session, "new_clmn", "int")

        # Memtable flush has to happen after a schema alter concurrently with a read
        logger.debug("Flush data")
        self.cluster.nodelist()[0].flush()

        # Stop populating and reading soon after flush
        time.sleep(1)
        logger.debug("Stop populating and reading")
        stop_populating.set()
        stop_reading.set()

    for future in futures.as_completed(threads, timeout=timeout):
        try:
            future.result()
        except Exception as exc:  # noqa: BLE001
            pytest.fail(f"Generated an exception: {exc}")

    # Add 'null' values for the new column `new_clmn` in the expected data
    for i, _ in enumerate(data):
        data[i].append(None)

    assert_all(session, f"select pk, ck1, val1, val2, new_clmn from lp_table", data, ignore_order=True, print_result_on_failure=False)
|
||||
|
||||
def test_large_partition_with_drop_column(self):
    """
    Reproducer for scylladb/scylla#5135 (DROP COLUMN variant): alter the table
    while large partitions are concurrently written and read, then flush.
    Background workers may legitimately fail with "Unknown identifier val1"
    once the column is gone; any other failure fails the test.
    """
    cluster_topology = generate_cluster_topology()
    session = self.prepare(cluster_topology, rf=1)
    data = self.populate(session=session, data=[], ck_start=0, ck_end=10)

    threads = []
    timeout = 300
    ck_end = 5000
    if self.cluster.scylla_mode == "debug":
        # Debug builds are much slower — shrink the read range and extend the deadline
        timeout = 900
        ck_end = 500
    with ThreadPoolExecutor(max_workers=2) as executor:
        stop_populating = threading.Event()
        stop_reading = threading.Event()
        # Insert new rows in background
        threads.append(executor.submit(self.populate, session=session, data=data, ck_start=10, ck_end=None, stop_populating=stop_populating))
        threads.append(executor.submit(self.read, session=session, ck_max=ck_end, stop_reading=stop_reading))
        # Wait for running load
        time.sleep(10)
        self.drop_column(session=session, column_name="val1")

        # Memtable flush has to happen after a schema alter concurrently with a read
        logger.debug("Flush data")
        self.cluster.nodelist()[0].flush()

        # Stop populating and reading soon after flush
        time.sleep(1)
        logger.debug("Stop populating and reading")
        stop_populating.set()
        stop_reading.set()

    result = []
    for future in futures.as_completed(threads, timeout=timeout):
        try:
            result.append(future.result())
        except Exception as exc:  # noqa: BLE001
            # "Unknown identifier val1" is expected error
            if not len(exc.args) or "Unknown identifier val1" not in exc.args[0]:
                pytest.fail(f"Generated an exception: {exc}")
|
||||
|
||||
|
||||
class HistoryVerifier:
    def __init__(self, table_name="table1", keyspace_name="lwt_load_ks"):
        """
        Initialize parameters for further verification of schema history.
        :param table_name: table whose schema we change and whose schema history we verify
        :param keyspace_name: keyspace containing that table
        """

        self.table_name = table_name
        self.keyspace_name = keyspace_name
        # Schema versions seen so far, in order of appearance (stringified UUIDs)
        self.versions = []
        # version -> {column_name: type} for regular columns of that version
        self.versions_dict = {}
        # The most recent DDL query, kept only for assertion messages
        self.query = ""

    def verify(self, session, expected_current_diff, expected_prev_diff, query):
        """
        Verify current schema history entry by comparing to previous schema entry.
        :param session: python cql session
        :param expected_current_diff: difference of current schema from previous schema
        :param expected_prev_diff: difference of previous schema from current schema
        :param query: The query that created new schema
        """

        def get_table_id(session, keyspace_name, table_name):
            # Resolve the table's UUID from system_schema.tables
            assert keyspace_name, f"Input kesyspcase should have value, keyspace_name={keyspace_name}"
            assert table_name, f"Input table_name should have value, table_name={table_name}"
            query = "select keyspace_name,table_name,id from system_schema.tables"
            query += f" WHERE keyspace_name='{keyspace_name}' AND table_name='{table_name}'"
            current_rows = session.execute(query).current_rows
            assert len(current_rows) == 1, f"Not found table description, ks={keyspace_name} table_name={table_name}"
            res = current_rows[0]
            return res["id"]

        def read_schema_history_table(session, cf_id):
            """
            read system.scylla_table_schema_history and verify current version diff from previous vesion
            :param session: python cql session
            :param cf_id: uuid of the table we changed it's schema
            """

            query = f"select * from system.scylla_table_schema_history WHERE cf_id={cf_id}"
            res = session.execute(query).current_rows
            # Exactly one schema version must be new since the last verify() call
            new_versions = list({
                entry["schema_version"]
                for entry in res
                if str(entry["schema_version"]) not in self.versions
            })
            msg = f"Expect 1, got len(new_versions)={len(new_versions)}"
            assert len(new_versions) == 1, msg
            current_version = str(new_versions[0])
            logger.debug(f"New schema_version {current_version} after executing '{self.query}'")
            # Collect the regular columns belonging to the new version
            columns_list = (
                {"column_name": entry["column_name"], "type": entry["type"]}
                for entry in res
                if entry["kind"] == "regular" and current_version == str(entry["schema_version"])
            )
            self.versions_dict[current_version] = {}
            for item in columns_list:
                self.versions_dict[current_version][item["column_name"]] = item["type"]

            self.versions.append(current_version)
            if len(self.versions) > 1:
                # Compare the new version's column set against the previous one
                # as sets of (column_name, type) pairs, in both directions
                current_id = self.versions[-1]
                previous_id = self.versions[-2]
                set_current = set(self.versions_dict[current_id].items())
                set_previous = set(self.versions_dict[previous_id].items())
                current_diff = set_current - set_previous
                previous_diff = set_previous - set_current
                msg1 = f"Expect diff(new schema,old schema) to be {expected_current_diff} got {current_diff}"
                msg2 = f" query is '{self.query}' versions={current_id},{previous_id}"
                if current_diff != expected_current_diff:
                    logger.debug(msg1 + msg2)
                assert current_diff == expected_current_diff, msg1 + msg2
                msg1 = f"Expect diff(old schema,new schema) to be {expected_prev_diff} got {previous_diff}"
                assert previous_diff == expected_prev_diff, msg1 + msg2

        self.query = query
        cf_id = get_table_id(session, keyspace_name=self.keyspace_name, table_name=self.table_name)
        read_schema_history_table(session, cf_id)
|
||||
|
||||
|
||||
class DDL(NamedTuple):
    """A DDL statement plus the schema-history diffs it is expected to produce."""

    # The CQL statement to execute
    ddl_command: str
    # Expected (column_name, type) pairs added in the new schema version; None skips the check
    expected_current_diff: set | None
    # Expected (column_name, type) pairs removed from the previous version; None skips the check
    expected_prev_diff: set | None
|
||||
|
||||
|
||||
class TestSchemaHistory(Tester):
    def prepare(self):
        """Start a 3-rack/3-node cluster and create the lwt_load_ks keyspace (rf=3)."""
        cluster = self.cluster
        # in case support tablets and rf-rack-valid-keyspaces
        # create cluster with 3 racks with 1 node in each rack
        cluster_topology = generate_cluster_topology(rack_num=3)
        rf = 3
        cluster.populate(cluster_topology).start(wait_other_notice=True)
        # dict_factory so schema-history rows can be accessed by column name
        self.session = self.patient_cql_connection(self.cluster.nodelist()[0], row_factory=dict_factory)
        create_ks(self.session, "lwt_load_ks", rf)

    def test_schema_history_alter_table(self):
        """test schema history changes following alter table cql commands"""
        self.prepare()
        verifier = HistoryVerifier(table_name="table2")
        # Each DDL carries the expected regular-column delta (added / removed
        # (name, type) pairs) between the new schema version and the previous one
        queries_and_expected_diffs = [
            DDL(ddl_command="CREATE TABLE IF NOT EXISTS lwt_load_ks.table2 (pk int PRIMARY KEY, v int, int_col int)", expected_current_diff=None, expected_prev_diff=None),
            DDL(ddl_command="ALTER TABLE lwt_load_ks.table2 ALTER v TYPE varint", expected_current_diff={("v", "varint")}, expected_prev_diff={("v", "int")}),
            DDL(ddl_command="ALTER TABLE lwt_load_ks.table2 ADD (v2 int, v3 int)", expected_current_diff={("v2", "int"), ("v3", "int")}, expected_prev_diff=set()),
            DDL(ddl_command="ALTER TABLE lwt_load_ks.table2 ALTER int_col TYPE varint", expected_current_diff={("int_col", "varint")}, expected_prev_diff={("int_col", "int")}),
            DDL(ddl_command="ALTER TABLE lwt_load_ks.table2 DROP int_col", expected_current_diff=set(), expected_prev_diff={("int_col", "varint")}),
            DDL(ddl_command="ALTER TABLE lwt_load_ks.table2 ADD int_col bigint", expected_current_diff={("int_col", "bigint")}, expected_prev_diff=set()),
            DDL(ddl_command="ALTER TABLE lwt_load_ks.table2 DROP (int_col,v)", expected_current_diff=set(), expected_prev_diff={("int_col", "bigint"), ("v", "varint")}),
        ]
        for ddl in queries_and_expected_diffs:
            self.session.execute(ddl.ddl_command)
            verifier.verify(self.session, ddl.expected_current_diff, ddl.expected_prev_diff, query=ddl.ddl_command)
|
||||
@@ -218,6 +218,18 @@ def assert_row_count_in_select_less(
|
||||
assert count < max_rows_expected, f'Expected a row count < of {max_rows_expected} in query "{query}", but got {count}'
|
||||
|
||||
|
||||
def assert_length_equal(object_with_length, expected_length):
    """
    Assert an object has a specific length.
    @param object_with_length The object whose length will be checked
    @param expected_length The expected length of the object

    Examples:
    assert_length_equal(res, nb_counter)
    """
    actual_length = len(object_with_length)
    assert actual_length == expected_length, f"Expected {object_with_length} to have length {expected_length}, but instead is of length {actual_length}"
|
||||
|
||||
|
||||
def assert_lists_equal_ignoring_order(list1, list2, sort_key=None):
|
||||
"""
|
||||
asserts that the contents of the two provided lists are equal
|
||||
|
||||
@@ -14,6 +14,7 @@ from cassandra.query import SimpleStatement
|
||||
from cassandra.concurrent import execute_concurrent_with_args
|
||||
|
||||
from test.cluster.dtest.dtest_class import create_cf
|
||||
from test.cluster.dtest.tools import assertions
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -51,6 +52,27 @@ def insert_c1c2( # noqa: PLR0913
|
||||
execute_concurrent_with_args(session, statement, [[f"k{k}"] for k in keys], concurrency=concurrency)
|
||||
|
||||
|
||||
def query_c1c2(  # noqa: PLR0913
    session,
    key,
    consistency=ConsistencyLevel.QUORUM,
    tolerate_missing=False,
    must_be_missing=False,
    c1_value="value1",
    c2_value="value2",
    ks="ks",
    cf="cf",
):
    """
    Read back row 'k{key}' from {ks}.{cf} and validate its c1/c2 columns.

    By default exactly one row with the expected values must exist; with
    tolerate_missing=True absence is accepted, and with must_be_missing=True
    the row is required to be absent.
    """
    statement = SimpleStatement(f"SELECT c1, c2 FROM {ks}.{cf} WHERE key='k{key}'", consistency_level=consistency)
    fetched = list(session.execute(statement))
    if must_be_missing:
        assertions.assert_length_equal(fetched, 0)
    if not (tolerate_missing or must_be_missing):
        assertions.assert_length_equal(fetched, 1)
        row = fetched[0]
        assert len(row) == 2 and row[0] == c1_value and row[1] == c2_value, row
|
||||
|
||||
|
||||
def rows_to_list(rows):
    """Convert an iterable of result rows into a list of plain lists."""
    converted = []
    for row in rows:
        converted.append(list(row))
    return converted
|
||||
|
||||
@@ -181,11 +181,14 @@ async def test_random_failures(manager: ManagerClient,
|
||||
LOGGER.info("Found following message in the coordinator's log:\n\t%s", matches[-1][0])
|
||||
await manager.server_stop(server_id=s_info.server_id)
|
||||
|
||||
BANNED_NOTIFICATION = "received notification of being banned from the cluster from"
|
||||
STARTUP_FAILED_PATTERN = f"init - Startup failed:|{BANNED_NOTIFICATION}"
|
||||
|
||||
if s_info in await manager.running_servers():
|
||||
LOGGER.info("Wait until the new node initialization completes or fails.")
|
||||
await server_log.wait_for("init - (Startup failed:|Scylla version .* initialization completed)", timeout=120)
|
||||
await server_log.wait_for(f"init - (Startup failed:|Scylla version .* initialization completed)|{BANNED_NOTIFICATION}", timeout=120)
|
||||
|
||||
if await server_log.grep("init - Startup failed:"):
|
||||
if await server_log.grep(STARTUP_FAILED_PATTERN):
|
||||
LOGGER.info("Check that the new node is dead.")
|
||||
expected_statuses = [psutil.STATUS_DEAD]
|
||||
else:
|
||||
@@ -216,7 +219,7 @@ async def test_random_failures(manager: ManagerClient,
|
||||
else:
|
||||
if s_info in await manager.running_servers():
|
||||
LOGGER.info("The new node is dead. Check if it failed to startup.")
|
||||
assert await server_log.grep("init - Startup failed:")
|
||||
assert await server_log.grep(STARTUP_FAILED_PATTERN)
|
||||
await manager.server_stop(server_id=s_info.server_id) # remove the node from the list of running servers
|
||||
|
||||
LOGGER.info("Try to remove the dead new node from the cluster.")
|
||||
|
||||
@@ -26,6 +26,7 @@ skip_in_release:
|
||||
- test_raft_cluster_features
|
||||
- test_cluster_features
|
||||
- dtest/limits_test
|
||||
- dtest/schema_management_test
|
||||
skip_in_debug:
|
||||
- test_shutdown_hang
|
||||
- test_replace
|
||||
|
||||
294
test/cluster/test_client_routes.py
Normal file
294
test/cluster/test_client_routes.py
Normal file
@@ -0,0 +1,294 @@
|
||||
# Copyright (C) 2025-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
import asyncio
|
||||
import pytest
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
|
||||
from test.cluster.conftest import skip_mode
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.rest_client import HTTPError
|
||||
from test.pylib.util import wait_for
|
||||
from test.cluster.util import trigger_snapshot
|
||||
|
||||
from cassandra.protocol import EventMessage
|
||||
import cassandra.protocol
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
CLIENT_ROUTES_CHANGE_EVENT_NAME = "CLIENT_ROUTES_CHANGE"
|
||||
|
||||
async def wait_for_expected_client_routes_size(cql, expected_routes_size):
    """Poll system.client_routes until it holds exactly expected_routes_size rows, with a 10-second deadline."""
    async def expected_client_routes_size(cql, expected_size):
        # wait_for retries while this returns a falsy value; returning the rows stops the wait
        client_routes = await cql.run_async("SELECT * FROM system.client_routes")
        logger.info(f"Got client routes, expected_size={expected_size}, res={client_routes}")
        if len(client_routes) == expected_size:
            return client_routes
        return None
    await wait_for(lambda: expected_client_routes_size(cql, expected_routes_size), time.time() + 10)
|
||||
|
||||
def generate_connection_id(i):
    """Return a deterministic connection id for index i."""
    # Make the string longer than 30 characters to make sure that in C++ the string has a heap allocation
    padding = "abc" * 10
    return f"connection_id_{i}_{padding}"
|
||||
|
||||
def generate_host_id(i):
    """Return a deterministic UUID string derived from index i (offset by 100)."""
    host_uuid = uuid.UUID(int=i + 100)
    return str(host_uuid)
|
||||
|
||||
def generate_client_routes_entry(i):
    """Build one client-routes REST payload entry for index i."""
    entry = {
        "connection_id": generate_connection_id(i),
        "host_id": generate_host_id(i),
        "address": "addr1.test",
    }
    entry["port"] = 8001
    entry["tls_port"] = 8002
    entry["alternator_port"] = 8003
    entry["alternator_https_port"] = 8004
    return entry
|
||||
|
||||
@pytest.mark.asyncio
async def test_client_routes(request, manager: ManagerClient):
    """
    Grow a cluster node by node while posting client-routes entries, then
    remove a node and verify the client_routes table still reflects POST and
    DELETE operations issued through a surviving node.
    """
    num_servers = 3
    cql = None
    # Run three nodes one by one
    for i in range(num_servers):
        # SMP=2 to verify that requests work properly even when a shard other than 0 receives them
        servers = await manager.servers_add(1, cmdline=['--smp=2'])
        cql, hosts = await manager.get_ready_cql(await manager.running_servers())
        await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(i)])
        await wait_for_expected_client_routes_size(cql, i+1)


    # Remove one node
    running_servers = await manager.running_servers()
    server_to_stop = running_servers[0]
    running_server = running_servers[1]
    await manager.server_stop(server_to_stop.server_id)
    await manager.remove_node(running_server.server_id, server_to_stop.server_id)
    # The three previously posted entries must survive node removal
    await wait_for_expected_client_routes_size(cql, num_servers)

    # Verify everything works
    await manager.api.client.post("/v2/client-routes", host=running_server.ip_addr, json=[generate_client_routes_entry(num_servers + 1)])
    await wait_for_expected_client_routes_size(cql, num_servers + 1)
    await manager.api.client.delete("/v2/client-routes", host=running_server.ip_addr, json=[generate_client_routes_entry(0)])
    await wait_for_expected_client_routes_size(cql, num_servers)
|
||||
|
||||
@pytest.mark.asyncio
async def test_client_routes_node_restart(request, manager: ManagerClient):
    """
    This test verifies that a node receives updates if client routes were updated
    when the node was down.
    """
    servers = await manager.servers_add(3)
    cql, hosts = await manager.get_ready_cql(servers)
    server_to_restart = servers[2]

    # Update the routes while one node is stopped.
    await manager.server_stop(server_to_restart.server_id)
    await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(1)])
    await wait_for_expected_client_routes_size(cql, 1)

    # After restart, query the restarted node exclusively: it must have
    # caught up with the update it missed while down.
    await manager.server_start(server_to_restart.server_id)
    cql = await manager.get_cql_exclusive(server_to_restart)
    await wait_for_expected_client_routes_size(cql, 1)
|
||||
|
||||
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_client_routes_upgrade(request, manager: ManagerClient):
    """
    This test verifies updating the system to a version with the CLIENT_ROUTES feature in the following steps:
    1. Create 2 nodes with the CLIENT_ROUTES feature disabled.
    2. Verify `/v2/client-routes` rejects requests.
    3. Enable the `CLIENT_ROUTES` feature after restart.
    4. Verify `/v2/client-routes` works.
    """
    num_servers = 2
    config = [
        {"name": "suppress_features", "value": "CLIENT_ROUTES"}
    ]
    servers = await manager.servers_add(num_servers, config={'error_injections_at_startup': config})
    cql, hosts = await manager.get_ready_cql(servers)
    # Empty `system.client_routes` is there even if the feature is disabled.
    # Fix: this coroutine was previously not awaited, so the check silently never ran.
    await wait_for_expected_client_routes_size(cql, 0)

    # While the feature is suppressed, every /v2/client-routes verb must be rejected.
    with pytest.raises(HTTPError):
        await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(0)])
    with pytest.raises(HTTPError):
        await manager.api.client.delete("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(0)])
    with pytest.raises(HTTPError):
        await manager.api.client.get("/v2/client-routes", host=servers[0].ip_addr)

    # Drop the suppression and restart every node so the feature can be
    # enabled cluster-wide.
    for server in servers:
        await manager.server_update_config(server.server_id, "error_injections_at_startup", [])
        await manager.server_restart(server.server_id)

    async def client_routes_ready():
        # True once all three verbs succeed; None while the cluster has not
        # finished enabling the feature (wait_for keeps polling on None).
        try:
            await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(0)])
            await manager.api.client.delete("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(0)])
            await manager.api.client.get("/v2/client-routes", host=servers[0].ip_addr)
            return True
        except HTTPError as exc:
            # Allow cluster to be not ready
            if "requires all nodes to support the CLIENT_ROUTES cluster feature" not in exc.message:
                raise exc
            return None

    # Fix: wait_for is a coroutine function and was previously not awaited,
    # so the test never actually waited for (or verified) readiness.
    await wait_for(client_routes_ready, time.time() + 10)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_client_routes_lost_quorum(request, manager: ManagerClient):
    """
    This test verifies that `/v2/client-routes` fails with a timeout if the Raft quorum cannot be reached.
    """
    num_servers = 3
    timeout = 10
    # Shorten the group0 Raft operation timeout so the failure is fast.
    config = {'group0_raft_op_timeout_in_ms': timeout * 1000}
    servers = await manager.servers_add(num_servers, config=config)
    cql, hosts = await manager.get_ready_cql(servers)

    # Sanity check: updates succeed while the quorum is intact.
    await wait_for_expected_client_routes_size(cql, 0)
    await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(0)], timeout=timeout + 60)
    await wait_for_expected_client_routes_size(cql, 1)

    # Stop two of the three nodes — the Raft quorum is now lost.
    for server in servers[1:]:
        await manager.server_stop(server.server_id)

    async def fail_req(f):
        # Issue a mutating request and expect it to time out on the read barrier.
        with pytest.raises(HTTPError) as exc:
            await f("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(0)], timeout=timeout + 60)
        assert "raft operation [read_barrier] timed out, there is no raft quorum" in exc.value.message

    await asyncio.gather(fail_req(manager.api.client.post), fail_req(manager.api.client.delete))
    # The failed requests must not have modified the table.
    await wait_for_expected_client_routes_size(cql, 1)
|
||||
|
||||
def setup_events_test(cql, received_events, monkeypatch):
    """Register a watcher that appends CLIENT_ROUTES_CHANGE events to
    `received_events`.

    If the installed python driver does not know the event type yet, the
    protocol decoder is monkeypatched in so the events can still be parsed.
    Consecutive duplicate events are dropped.
    """
    # scylla-driver >= 3.29.6 supports CLIENT_ROUTES_CHANGE events.
    # For older python driver, monkeypatching is necessary
    if CLIENT_ROUTES_CHANGE_EVENT_NAME not in cassandra.protocol.known_event_types:
        def _recv_client_routes_change(f, arg):
            # Decoder for the event payload: a change-type string followed by
            # a short-prefixed list of connection ids and one of host ids.
            logger.info(f"monkeypatch_driver recv_client_routes_change, f={f} arg={arg}")
            change_type = cassandra.protocol.read_string(f)
            connection_ids = [cassandra.protocol.read_string(f) for _ in range(cassandra.protocol.read_short(f))]
            host_ids = [cassandra.protocol.read_string(f) for _ in range(cassandra.protocol.read_short(f))]
            return {
                "change_type": change_type,
                "connection_ids": connection_ids,
                "host_ids": host_ids
            }
        # Teach the driver about the event type and install its decoder.
        # EventMessage looks up the decoder by event name ("recv_" + lowercased
        # event type) — NOTE(review): presumably, based on driver convention; verify against the driver version in use.
        monkeypatch.setattr(cassandra.protocol, "known_event_types", cassandra.protocol.known_event_types.union([CLIENT_ROUTES_CHANGE_EVENT_NAME]), raising=True)
        monkeypatch.setattr(EventMessage, "recv_client_routes_change", _recv_client_routes_change, raising=False)

    def on_event(event):
        logger.info(f"Received an event: {event}")
        # The server may deliver the same event more than once; keep only the
        # first of consecutive duplicates.
        if len(received_events) > 0 and received_events[-1] == event:
            logger.info(f"The received event is a duplicate: {event}")
        else:
            received_events.append(event)

    cql.cluster.control_connection._connection.register_watchers({CLIENT_ROUTES_CHANGE_EVENT_NAME: on_event})
|
||||
|
||||
async def wait_for_expected_event_num(expected_num, received_events):
    """Block until `received_events` holds exactly `expected_num` entries,
    raising via ``wait_for`` if that does not happen within 10 seconds.
    """
    async def check():
        # wait_for keeps polling while the callable returns None.
        logger.info(f"Checking if number of events is equal expected_num={expected_num}, events={received_events}")
        return expected_num if len(received_events) == expected_num else None

    await wait_for(check, time.time() + 10)
|
||||
|
||||
@pytest.mark.asyncio
async def test_events(request, manager: ManagerClient, monkeypatch):
    """
    This test verifies client routes change events in the following steps:
    1. Add one new entry to client_routes.
    2. Verify that the driver received one new event.
    3. Add two new entries to client_routes using one POST request.
    4. Verify that the driver received one new event with two updates.
    5. Delete an entry, and verify that the driver received the event.
    """

    servers = await manager.servers_add(2, cmdline=['--smp=2'])
    cql, hosts = await manager.get_ready_cql(servers)

    received_events = []
    setup_events_test(cql, received_events, monkeypatch)

    # Steps 1-2: one POST -> exactly one event describing the new entry.
    await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(0)])

    await wait_for_expected_event_num(1, received_events)
    assert received_events[0]["change_type"] == "UPDATE_NODES"
    assert received_events[0]["connection_ids"] == [generate_connection_id(0)]
    assert received_events[0]["host_ids"] == [generate_host_id(0)]

    # Steps 3-4: a single POST with two entries must be batched into one event.
    await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[
        generate_client_routes_entry(1),
        generate_client_routes_entry(2),
    ])
    await wait_for_expected_event_num(2, received_events)
    assert received_events[1]["change_type"] == "UPDATE_NODES"
    assert received_events[1]["connection_ids"] == [generate_connection_id(1), generate_connection_id(2)]
    assert received_events[1]["host_ids"] == [generate_host_id(1), generate_host_id(2)]

    # Step 5: DELETE also produces an UPDATE_NODES event for the removed entry.
    await manager.api.client.delete("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(0)])
    await wait_for_expected_event_num(3, received_events)
    assert received_events[2]["change_type"] == "UPDATE_NODES"
    assert received_events[2]["connection_ids"] == [generate_connection_id(0)]
    assert received_events[2]["host_ids"] == [generate_host_id(0)]
|
||||
|
||||
@pytest.mark.asyncio
@skip_mode("release", "error injections are not supported in release mode")
async def test_client_routes_snapshot_transfer(request, manager: ManagerClient, monkeypatch):
    """
    This test verifies that client routes change events are sent when client_routes
    data is propagated via snapshot transfer:
    1. Create a 3-node cluster.
    2. Enable `block_group0_transfer_snapshot` error injection on one node, and stop it.
    3. Change client routes with a POST request on other nodes, and trigger a snapshot.
    4. Start the stopped node, and send a message to stop waiting on `block_group0_transfer_snapshot`.
    5. Verify that an event was sent.
    """
    servers = await manager.servers_add(3, cmdline=['--smp=2'])
    cql, hosts = await manager.get_ready_cql(servers)
    server_to_restart = servers[2]
    error_to_inject = "block_group0_transfer_snapshot"

    # Arrange for the node to block during snapshot transfer on next startup,
    # then stop it so it misses the following update.
    await manager.server_update_config(server_to_restart.server_id, "error_injections_at_startup", [error_to_inject])
    await manager.server_stop(server_to_restart.server_id)

    # Change the routes while the node is down and force a Raft snapshot so
    # the change can only reach the node via snapshot transfer.
    await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(1)])
    await wait_for_expected_client_routes_size(cql, 1)
    await trigger_snapshot(manager, servers[0])

    # Start the node; it blocks on the injected error before applying the
    # snapshot, so its client_routes table is still empty at this point.
    await manager.server_start(server_to_restart.server_id)
    log = await manager.server_open_log(server_to_restart.server_id)
    await log.wait_for("block_group0_transfer_snapshot: waiting for message")
    cql = await manager.get_cql_exclusive(server_to_restart)
    await wait_for_expected_client_routes_size(cql, 0)

    received_events = []
    setup_events_test(cql, received_events, monkeypatch)

    # Unblock the snapshot transfer and verify that applying it both updates
    # the table and emits the client-routes-change event.
    await manager.api.message_injection(server_to_restart.ip_addr, error_to_inject)
    await wait_for_expected_client_routes_size(cql, 1)
    await wait_for_expected_event_num(1, received_events)
    assert received_events[0]["change_type"] == "UPDATE_NODES"
    assert received_events[0]["connection_ids"] == [generate_connection_id(1)]
    assert received_events[0]["host_ids"] == [generate_host_id(1)]
    # Confirm the data really arrived via the snapshot path.
    await log.wait_for("transfer snapshot: raft snapshot includes client_routes mutation")
|
||||
|
||||
@pytest.mark.asyncio
async def test_huge_event(request, manager: ManagerClient, monkeypatch):
    """
    This test verifies that an event can be sent to the driver even when it contains many host_ids and connection_ids.
    """
    servers = await manager.servers_add(2, cmdline=['--smp=2'])
    cql, hosts = await manager.get_ready_cql(servers)

    received_events = []
    setup_events_test(cql, received_events, monkeypatch)

    # One POST with 1000 entries -> a single large event.
    await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(i) for i in range(1000)])

    await wait_for_expected_event_num(1, received_events)
    # Ordering inside the event is not asserted; compare as sets.
    assert set(received_events[0]["connection_ids"]) == set([generate_connection_id(i) for i in range(1000)])
    assert set(received_events[0]["host_ids"]) == set([generate_host_id(i) for i in range(1000)])
|
||||
@@ -146,13 +146,13 @@ async def test_joining_old_node_fails(manager: ManagerClient) -> None:
|
||||
|
||||
# Try to add a node that doesn't support the feature - should fail
|
||||
new_server_info = await manager.server_add(start=False, property_file=servers[0].property_file())
|
||||
await manager.server_start(new_server_info.server_id, expected_error="Feature check failed")
|
||||
await manager.server_start(new_server_info.server_id, expected_error="Feature check failed|received notification of being banned from the cluster from")
|
||||
|
||||
# Try to replace with a node that doesn't support the feature - should fail
|
||||
await manager.server_stop_gracefully(servers[0].server_id)
|
||||
replace_cfg = ReplaceConfig(replaced_id=servers[0].server_id, reuse_ip_addr=False, use_host_id=False)
|
||||
new_server_info = await manager.server_add(start=False, replace_cfg=replace_cfg, property_file=servers[0].property_file())
|
||||
await manager.server_start(new_server_info.server_id, expected_error="Feature check failed")
|
||||
await manager.server_start(new_server_info.server_id, expected_error="Feature check failed|received notification of being banned from the cluster from")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user