mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-13 03:12:13 +00:00
Compare commits
1 Commits
master
...
scylladb_1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a3641077ce |
@@ -4,8 +4,6 @@ on:
|
||||
milestone:
|
||||
types: [created, closed]
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
sync-milestone-to-jira:
|
||||
uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
|
||||
|
||||
@@ -299,7 +299,6 @@ target_sources(scylla-main
|
||||
serializer.cc
|
||||
service/direct_failure_detector/failure_detector.cc
|
||||
sstables_loader.cc
|
||||
sstables_loader_helpers.cc
|
||||
table_helper.cc
|
||||
tasks/task_handler.cc
|
||||
tasks/task_manager.cc
|
||||
|
||||
@@ -247,18 +247,6 @@ bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2, bool v1_from
|
||||
if (!v1) {
|
||||
return false;
|
||||
}
|
||||
if (!v1->IsObject() || v1->MemberCount() != 1) {
|
||||
if (v1_from_query) {
|
||||
throw api_error::serialization("CONTAINS operator encountered malformed AttributeValue");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (!v2.IsObject() || v2.MemberCount() != 1) {
|
||||
if (v2_from_query) {
|
||||
throw api_error::serialization("CONTAINS operator encountered malformed AttributeValue");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
const auto& kv1 = *v1->MemberBegin();
|
||||
const auto& kv2 = *v2.MemberBegin();
|
||||
if (kv1.name == "S" && kv2.name == "S") {
|
||||
@@ -277,17 +265,9 @@ bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2, bool v1_from
|
||||
}
|
||||
}
|
||||
} else if (kv1.name == "L") {
|
||||
if (!kv1.value.IsArray()) {
|
||||
if (v1_from_query) {
|
||||
throw api_error::serialization("CONTAINS operator received a malformed list");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
for (auto i = kv1.value.Begin(); i != kv1.value.End(); ++i) {
|
||||
if (!i->IsObject() || i->MemberCount() != 1) {
|
||||
if (v1_from_query) {
|
||||
throw api_error::serialization("CONTAINS operator received a list whose element is malformed");
|
||||
}
|
||||
clogger.error("check_CONTAINS received a list whose element is malformed");
|
||||
return false;
|
||||
}
|
||||
const auto& el = *i->MemberBegin();
|
||||
|
||||
@@ -38,7 +38,6 @@ controller::controller(
|
||||
sharded<auth::service>& auth_service,
|
||||
sharded<qos::service_level_controller>& sl_controller,
|
||||
sharded<vector_search::vector_store_client>& vsc,
|
||||
sharded<updateable_timeout_config>& timeout_config,
|
||||
const db::config& config,
|
||||
seastar::scheduling_group sg)
|
||||
: protocol_server(sg)
|
||||
@@ -53,7 +52,6 @@ controller::controller(
|
||||
, _auth_service(auth_service)
|
||||
, _sl_controller(sl_controller)
|
||||
, _vsc(vsc)
|
||||
, _timeout_config(timeout_config)
|
||||
, _config(config)
|
||||
{
|
||||
}
|
||||
@@ -101,7 +99,7 @@ future<> controller::start_server() {
|
||||
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_ss), std::ref(_mm), std::ref(_sys_dist_ks), std::ref(_sys_ks),
|
||||
sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), std::ref(_vsc), _ssg.value(),
|
||||
sharded_parameter(get_timeout_in_ms, std::ref(_config))).get();
|
||||
_server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper), std::ref(_auth_service), std::ref(_sl_controller), std::ref(_timeout_config)).get();
|
||||
_server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper), std::ref(_auth_service), std::ref(_sl_controller)).get();
|
||||
// Note: from this point on, if start_server() throws for any reason,
|
||||
// it must first call stop_server() to stop the executor and server
|
||||
// services we just started - or Scylla will cause an assertion
|
||||
|
||||
@@ -48,8 +48,6 @@ namespace vector_search {
|
||||
class vector_store_client;
|
||||
}
|
||||
|
||||
class updateable_timeout_config;
|
||||
|
||||
namespace alternator {
|
||||
|
||||
// This is the official DynamoDB API version.
|
||||
@@ -74,7 +72,6 @@ class controller : public protocol_server {
|
||||
sharded<auth::service>& _auth_service;
|
||||
sharded<qos::service_level_controller>& _sl_controller;
|
||||
sharded<vector_search::vector_store_client>& _vsc;
|
||||
sharded<updateable_timeout_config>& _timeout_config;
|
||||
const db::config& _config;
|
||||
|
||||
std::vector<socket_address> _listen_addresses;
|
||||
@@ -95,7 +92,6 @@ public:
|
||||
sharded<auth::service>& auth_service,
|
||||
sharded<qos::service_level_controller>& sl_controller,
|
||||
sharded<vector_search::vector_store_client>& vsc,
|
||||
sharded<updateable_timeout_config>& timeout_config,
|
||||
const db::config& config,
|
||||
seastar::scheduling_group sg);
|
||||
|
||||
|
||||
@@ -485,7 +485,7 @@ std::optional<bytes> unwrap_bytes(const rjson::value& value, bool from_query) {
|
||||
return rjson::base64_decode(value);
|
||||
} catch (...) {
|
||||
if (from_query) {
|
||||
throw api_error::serialization("Invalid base64 data");
|
||||
throw api_error::serialization(format("Invalid base64 data"));
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
@@ -835,7 +835,7 @@ void server::set_routes(routes& r) {
|
||||
//FIXME: A way to immediately invalidate the cache should be considered,
|
||||
// e.g. when the system table which stores the keys is changed.
|
||||
// For now, this propagation may take up to 1 minute.
|
||||
server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& auth_service, qos::service_level_controller& sl_controller, updateable_timeout_config& timeout_config)
|
||||
server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& auth_service, qos::service_level_controller& sl_controller)
|
||||
: _http_server("http-alternator")
|
||||
, _https_server("https-alternator")
|
||||
, _executor(exec)
|
||||
@@ -847,7 +847,7 @@ server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gos
|
||||
, _max_users_query_size_in_trace_output(1024)
|
||||
, _enabled_servers{}
|
||||
, _pending_requests("alternator::server::pending_requests")
|
||||
, _timeout_config(timeout_config)
|
||||
, _timeout_config(_proxy.data_dictionary().get_config())
|
||||
, _callbacks{
|
||||
{"CreateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
|
||||
return e.create_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request), audit_info);
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
#include <seastar/net/tls.hh>
|
||||
#include <optional>
|
||||
#include "alternator/auth.hh"
|
||||
#include "timeout_config.hh"
|
||||
#include "service/qos/service_level_controller.hh"
|
||||
#include "utils/small_vector.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
@@ -54,8 +53,8 @@ class server : public peering_sharded_service<server> {
|
||||
named_gate _pending_requests;
|
||||
// In some places we will need a CQL updateable_timeout_config object even
|
||||
// though it isn't really relevant for Alternator which defines its own
|
||||
// timeouts separately.
|
||||
updateable_timeout_config& _timeout_config;
|
||||
// timeouts separately. We can create this object only once.
|
||||
updateable_timeout_config _timeout_config;
|
||||
client_options_cache_type _connection_options_keys_and_values;
|
||||
|
||||
alternator_callbacks_map _callbacks;
|
||||
@@ -99,7 +98,7 @@ class server : public peering_sharded_service<server> {
|
||||
utils::scoped_item_list<ongoing_request> _ongoing_requests;
|
||||
|
||||
public:
|
||||
server(executor& executor, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& service, qos::service_level_controller& sl_controller, updateable_timeout_config& timeout_config);
|
||||
server(executor& executor, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& service, qos::service_level_controller& sl_controller);
|
||||
|
||||
future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port,
|
||||
std::optional<uint16_t> port_proxy_protocol, std::optional<uint16_t> https_port_proxy_protocol,
|
||||
|
||||
@@ -974,54 +974,6 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/tablets/restore",
|
||||
"operations":[
|
||||
{
|
||||
"method":"POST",
|
||||
"summary":"Starts copying SSTables from a designated bucket in object storage to a specified keyspace",
|
||||
"type":"string",
|
||||
"nickname":"tablet_aware_restore",
|
||||
"produces":[
|
||||
"application/json"
|
||||
],
|
||||
"parameters":[
|
||||
{
|
||||
"name":"keyspace",
|
||||
"description":"Name of a keyspace to copy SSTables to",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
},
|
||||
{
|
||||
"name":"table",
|
||||
"description":"Name of a table to copy SSTables to",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
},
|
||||
{
|
||||
"name":"snapshot",
|
||||
"description":"Name of the snapshot to restore from",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"string",
|
||||
"paramType":"query"
|
||||
},
|
||||
{
|
||||
"name":"backup_location",
|
||||
"description":"JSON array of backup location objects. Each object must contain: 'datacenter' (string), 'endpoint' (string), 'bucket' (string), and 'manifests' (array of strings). Currently, the array must contain exactly one entry.",
|
||||
"required":true,
|
||||
"allowMultiple":false,
|
||||
"type":"array",
|
||||
"paramType":"body"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"path":"/storage_service/keyspace_compaction/{keyspace}",
|
||||
"operations":[
|
||||
|
||||
@@ -527,56 +527,11 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
|
||||
co_return json::json_return_type(fmt::to_string(task_id));
|
||||
});
|
||||
|
||||
ss::tablet_aware_restore.set(r, [&ctx, &sst_loader](std::unique_ptr<http::request> req) -> future<json_return_type> {
|
||||
std::string keyspace = req->get_query_param("keyspace");
|
||||
std::string table = req->get_query_param("table");
|
||||
std::string snapshot = req->get_query_param("snapshot");
|
||||
|
||||
rjson::chunked_content content = co_await util::read_entire_stream(*req->content_stream);
|
||||
rjson::value parsed = rjson::parse(std::move(content));
|
||||
if (!parsed.IsArray()) {
|
||||
throw httpd::bad_param_exception("backup locations (in body) must be a JSON array");
|
||||
}
|
||||
|
||||
const auto& locations = parsed.GetArray();
|
||||
if (locations.Size() != 1) {
|
||||
throw httpd::bad_param_exception("backup locations array (in body) must contain exactly one entry");
|
||||
}
|
||||
|
||||
const auto& location = locations[0];
|
||||
if (!location.IsObject()) {
|
||||
throw httpd::bad_param_exception("backup location (in body) must be a JSON object");
|
||||
}
|
||||
|
||||
auto endpoint = rjson::to_string_view(location["endpoint"]);
|
||||
auto bucket = rjson::to_string_view(location["bucket"]);
|
||||
auto dc = rjson::to_string_view(location["datacenter"]);
|
||||
|
||||
if (!location.HasMember("manifests") || !location["manifests"].IsArray()) {
|
||||
throw httpd::bad_param_exception("backup location entry must have 'manifests' array");
|
||||
}
|
||||
|
||||
auto manifests = location["manifests"].GetArray() |
|
||||
std::views::transform([] (const auto& m) { return sstring(rjson::to_string_view(m)); }) |
|
||||
std::ranges::to<utils::chunked_vector<sstring>>();
|
||||
|
||||
if (manifests.empty()) {
|
||||
throw httpd::bad_param_exception("backup location 'manifests' array must not be empty");
|
||||
}
|
||||
|
||||
apilog.info("Tablet restore for {}:{} called. Parameters: snapshot={} datacenter={} endpoint={} bucket={} manifests_count={}",
|
||||
keyspace, table, snapshot, dc, endpoint, bucket, manifests.size());
|
||||
|
||||
auto table_id = validate_table(ctx.db.local(), keyspace, table);
|
||||
auto task_id = co_await sst_loader.local().restore_tablets(table_id, keyspace, table, snapshot, sstring(endpoint), sstring(bucket), std::move(manifests));
|
||||
co_return json::json_return_type(fmt::to_string(task_id));
|
||||
});
|
||||
}
|
||||
|
||||
void unset_sstables_loader(http_context& ctx, routes& r) {
|
||||
ss::load_new_ss_tables.unset(r);
|
||||
ss::start_restore.unset(r);
|
||||
ss::tablet_aware_restore.unset(r);
|
||||
}
|
||||
|
||||
void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g) {
|
||||
|
||||
@@ -564,7 +564,6 @@ scylla_tests = set([
|
||||
'test/boost/crc_test',
|
||||
'test/boost/dict_trainer_test',
|
||||
'test/boost/dirty_memory_manager_test',
|
||||
'test/boost/tablet_aware_restore_test',
|
||||
'test/boost/double_decker_test',
|
||||
'test/boost/duration_test',
|
||||
'test/boost/dynamic_bitset_test',
|
||||
@@ -1173,8 +1172,6 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'index/secondary_index_manager.cc',
|
||||
'index/secondary_index.cc',
|
||||
'index/vector_index.cc',
|
||||
'index/fulltext_index.cc',
|
||||
'index/index_option_utils.cc',
|
||||
'utils/UUID_gen.cc',
|
||||
'utils/i_filter.cc',
|
||||
'utils/bloom_filter.cc',
|
||||
@@ -1337,7 +1334,6 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'ent/ldap/ldap_connection.cc',
|
||||
'reader_concurrency_semaphore.cc',
|
||||
'sstables_loader.cc',
|
||||
'sstables_loader_helpers.cc',
|
||||
'utils/utf8.cc',
|
||||
'utils/ascii.cc',
|
||||
'utils/like_matcher.cc',
|
||||
@@ -1477,7 +1473,6 @@ idls = ['idl/gossip_digest.idl.hh',
|
||||
'idl/frozen_mutation.idl.hh',
|
||||
'idl/reconcilable_result.idl.hh',
|
||||
'idl/streaming.idl.hh',
|
||||
'idl/sstables_loader.idl.hh',
|
||||
'idl/paging_state.idl.hh',
|
||||
'idl/frozen_schema.idl.hh',
|
||||
'idl/repair.idl.hh',
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
#include "cql3/prepare_context.hh"
|
||||
#include "cql3/expr/expr-utils.hh"
|
||||
#include "types/list.hh"
|
||||
#include "types/tuple.hh"
|
||||
#include <iterator>
|
||||
#include <ranges>
|
||||
|
||||
@@ -117,34 +116,6 @@ void validate_token_relation(const std::vector<const column_definition*> column_
|
||||
}
|
||||
}
|
||||
|
||||
void validate_tuples_size(const expression& rhs, size_t valid_size) {
|
||||
auto coll = as_if<collection_constructor>(&rhs);
|
||||
if (!coll) {
|
||||
// Pre-prepare, the IN list arrives as a collection_constructor.
|
||||
// After prepare it would be a constant of list type whose elements
|
||||
// are serialized; arity validation has already happened earlier in
|
||||
// that case, so nothing to do here.
|
||||
return;
|
||||
}
|
||||
for (const auto& expr : coll->elements) {
|
||||
size_t expr_size = 0;
|
||||
if (auto tuple = as_if<tuple_constructor>(&expr)) {
|
||||
expr_size = tuple->elements.size();
|
||||
} else {
|
||||
auto the_const = as_if<constant>(&expr);
|
||||
if (the_const && the_const->type->without_reversed().is_tuple()) {
|
||||
const tuple_type_impl* const_tuple = dynamic_cast<const tuple_type_impl*>(&the_const->type->without_reversed());
|
||||
expr_size = const_tuple->size();
|
||||
} else {
|
||||
continue; // not a tuple; perhaps we need to set expr_size to 1 here when #12554 is fixed
|
||||
}
|
||||
}
|
||||
if (expr_size != valid_size) {
|
||||
throw exceptions::invalid_request_exception(format("Expected {} elements in value tuple, but got {}: {}", valid_size, expr_size, expr));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void preliminary_binop_vaidation_checks(const binary_operator& binop) {
|
||||
if (binop.op == oper_t::NEQ) {
|
||||
throw exceptions::invalid_request_exception(format("Unsupported \"!=\" relation: {:user}", binop));
|
||||
@@ -171,10 +142,6 @@ void preliminary_binop_vaidation_checks(const binary_operator& binop) {
|
||||
throw exceptions::invalid_request_exception("LIKE cannot be used for Multi-column relations");
|
||||
}
|
||||
|
||||
if (binop.op == oper_t::IN) {
|
||||
validate_tuples_size(binop.rhs, lhs_tup->elements.size());
|
||||
}
|
||||
|
||||
if (auto rhs_tup = as_if<tuple_constructor>(&binop.rhs)) {
|
||||
if (lhs_tup->elements.size() != rhs_tup->elements.size()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
|
||||
@@ -343,102 +343,102 @@ to_predicates(
|
||||
auto cdef = col.col;
|
||||
auto type = &cdef->type->without_reversed();
|
||||
if (oper.op == oper_t::IS_NOT) {
|
||||
return to_vector(predicate{
|
||||
.solve_for = nullptr,
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_not_null_single_column = is_null_constant(oper.rhs),
|
||||
.op = oper.op,
|
||||
});
|
||||
return to_vector(predicate{
|
||||
.solve_for = nullptr,
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_not_null_single_column = is_null_constant(oper.rhs),
|
||||
.op = oper.op,
|
||||
});
|
||||
}
|
||||
if (is_compare(oper.op)) {
|
||||
auto solve = [oper] (const query_options& options) {
|
||||
managed_bytes_opt val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
return oper.op == oper_t::EQ ? value_set(value_list{*val})
|
||||
: to_range(oper.op, std::move(*val));
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = (oper.op == oper_t::EQ),
|
||||
.equality = (oper.op == oper_t::EQ),
|
||||
.is_slice = expr::is_slice(oper.op),
|
||||
.is_upper_bound = (oper.op == oper_t::LT || oper.op == oper_t::LTE),
|
||||
.is_lower_bound = (oper.op == oper_t::GT || oper.op == oper_t::GTE),
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
auto solve = [oper] (const query_options& options) {
|
||||
managed_bytes_opt val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
return oper.op == oper_t::EQ ? value_set(value_list{*val})
|
||||
: to_range(oper.op, std::move(*val));
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = (oper.op == oper_t::EQ),
|
||||
.equality = (oper.op == oper_t::EQ),
|
||||
.is_slice = expr::is_slice(oper.op),
|
||||
.is_upper_bound = (oper.op == oper_t::LT || oper.op == oper_t::LTE),
|
||||
.is_lower_bound = (oper.op == oper_t::GT || oper.op == oper_t::GTE),
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
} else if (oper.op == oper_t::IN) {
|
||||
auto solve = [oper, type, cdef] (const query_options& options) {
|
||||
return get_IN_values(oper.rhs, options, type->as_less_comparator(), cdef->name_as_text());
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = false,
|
||||
.is_in = true,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
auto solve = [oper, type, cdef] (const query_options& options) {
|
||||
return get_IN_values(oper.rhs, options, type->as_less_comparator(), cdef->name_as_text());
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = false,
|
||||
.is_in = true,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
} else if (oper.op == oper_t::CONTAINS || oper.op == oper_t::CONTAINS_KEY) {
|
||||
auto solve = [oper] (const query_options& options) {
|
||||
managed_bytes_opt val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
return value_set(value_list{*val});
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = false,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
auto solve = [oper] (const query_options& options) {
|
||||
managed_bytes_opt val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
return value_set(value_list{*val});
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = false,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
}
|
||||
return cannot_solve_on_column(oper, col.col);
|
||||
},
|
||||
[&] (const subscript& s) -> std::vector<predicate> {
|
||||
const column_value& col = get_subscripted_column(s);
|
||||
|
||||
if (oper.op == oper_t::EQ) {
|
||||
auto solve = [s, oper] (const query_options& options) {
|
||||
managed_bytes_opt sval = evaluate(s.sub, options).to_managed_bytes_opt();
|
||||
if (!sval) {
|
||||
return empty_value_set; // NULL can't be a map key
|
||||
}
|
||||
if (oper.op == oper_t::EQ) {
|
||||
auto solve = [s, oper] (const query_options& options) {
|
||||
managed_bytes_opt sval = evaluate(s.sub, options).to_managed_bytes_opt();
|
||||
if (!sval) {
|
||||
return empty_value_set; // NULL can't be a map key
|
||||
}
|
||||
|
||||
managed_bytes_opt rval = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!rval) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
managed_bytes_opt elements[] = {sval, rval};
|
||||
managed_bytes val = tuple_type_impl::build_value_fragmented(elements);
|
||||
return value_set(value_list{val});
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = true,
|
||||
.equality = true,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
.is_subscript = true,
|
||||
});
|
||||
}
|
||||
return cannot_solve_on_column(oper, col.col);
|
||||
managed_bytes_opt rval = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!rval) {
|
||||
return empty_value_set; // All NULL comparisons fail; no column values match.
|
||||
}
|
||||
managed_bytes_opt elements[] = {sval, rval};
|
||||
managed_bytes val = tuple_type_impl::build_value_fragmented(elements);
|
||||
return value_set(value_list{val});
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_column{col.col},
|
||||
.is_singleton = true,
|
||||
.equality = true,
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
.is_subscript = true,
|
||||
});
|
||||
}
|
||||
return cannot_solve_on_column(oper, col.col);
|
||||
},
|
||||
[&] (const tuple_constructor& tuple) -> std::vector<predicate> {
|
||||
auto columns = tuple.elements
|
||||
| std::views::transform([] (const expression& e) { return as<column_value>(e).col; })
|
||||
| std::ranges::to<std::vector>();
|
||||
| std::views::transform([] (const expression& e) { return as<column_value>(e).col; })
|
||||
| std::ranges::to<std::vector>();
|
||||
for (unsigned i = 0; i < columns.size(); ++i) {
|
||||
if (!columns[i]->is_clustering_key() || columns[i]->position() != i) {
|
||||
on_internal_error(rlogger, "to_predicates: multi-column relation not on a clustering key prefix");
|
||||
@@ -481,42 +481,42 @@ to_predicates(
|
||||
if (!(oper.op == oper_t::EQ || is_slice(oper.op))) {
|
||||
return cannot_solve(oper);
|
||||
}
|
||||
auto solve = [oper] (const query_options& options) -> value_set {
|
||||
auto val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no token values match.
|
||||
}
|
||||
if (oper.op == oper_t::EQ) {
|
||||
return value_list{*val};
|
||||
} else if (oper.op == oper_t::GT) {
|
||||
return interval<managed_bytes>::make_starting_with(interval_bound(std::move(*val), exclusive));
|
||||
} else if (oper.op == oper_t::GTE) {
|
||||
return interval<managed_bytes>::make_starting_with(interval_bound(std::move(*val), inclusive));
|
||||
}
|
||||
static const managed_bytes MININT = managed_bytes(serialized(std::numeric_limits<int64_t>::min())),
|
||||
MAXINT = managed_bytes(serialized(std::numeric_limits<int64_t>::max()));
|
||||
// Undocumented feature: when the user types `token(...) < MININT`, we interpret
|
||||
// that as MAXINT for some reason.
|
||||
const auto adjusted_val = (*val == MININT) ? MAXINT : *val;
|
||||
if (oper.op == oper_t::LT) {
|
||||
return interval<managed_bytes>::make_ending_with(interval_bound(std::move(adjusted_val), exclusive));
|
||||
} else if (oper.op == oper_t::LTE) {
|
||||
return interval<managed_bytes>::make_ending_with(interval_bound(std::move(adjusted_val), inclusive));
|
||||
}
|
||||
throw std::logic_error(format("get_token_interval unexpected operator {}", oper.op));
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_partition_key_token{table_schema_opt},
|
||||
.is_singleton = (oper.op == oper_t::EQ),
|
||||
.equality = (oper.op == oper_t::EQ),
|
||||
.is_slice = expr::is_slice(oper.op),
|
||||
.is_upper_bound = (oper.op == oper_t::LT || oper.op == oper_t::LTE),
|
||||
.is_lower_bound = (oper.op == oper_t::GT || oper.op == oper_t::GTE),
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
auto solve = [oper] (const query_options& options) -> value_set {
|
||||
auto val = evaluate(oper.rhs, options).to_managed_bytes_opt();
|
||||
if (!val) {
|
||||
return empty_value_set; // All NULL comparisons fail; no token values match.
|
||||
}
|
||||
if (oper.op == oper_t::EQ) {
|
||||
return value_list{*val};
|
||||
} else if (oper.op == oper_t::GT) {
|
||||
return interval<managed_bytes>::make_starting_with(interval_bound(std::move(*val), exclusive));
|
||||
} else if (oper.op == oper_t::GTE) {
|
||||
return interval<managed_bytes>::make_starting_with(interval_bound(std::move(*val), inclusive));
|
||||
}
|
||||
static const managed_bytes MININT = managed_bytes(serialized(std::numeric_limits<int64_t>::min())),
|
||||
MAXINT = managed_bytes(serialized(std::numeric_limits<int64_t>::max()));
|
||||
// Undocumented feature: when the user types `token(...) < MININT`, we interpret
|
||||
// that as MAXINT for some reason.
|
||||
const auto adjusted_val = (*val == MININT) ? MAXINT : *val;
|
||||
if (oper.op == oper_t::LT) {
|
||||
return interval<managed_bytes>::make_ending_with(interval_bound(std::move(adjusted_val), exclusive));
|
||||
} else if (oper.op == oper_t::LTE) {
|
||||
return interval<managed_bytes>::make_ending_with(interval_bound(std::move(adjusted_val), inclusive));
|
||||
}
|
||||
throw std::logic_error(format("get_token_interval unexpected operator {}", oper.op));
|
||||
};
|
||||
return to_vector(predicate{
|
||||
.solve_for = std::move(solve),
|
||||
.filter = oper,
|
||||
.on = on_partition_key_token{table_schema_opt},
|
||||
.is_singleton = (oper.op == oper_t::EQ),
|
||||
.equality = (oper.op == oper_t::EQ),
|
||||
.is_slice = expr::is_slice(oper.op),
|
||||
.is_upper_bound = (oper.op == oper_t::LT || oper.op == oper_t::LTE),
|
||||
.is_lower_bound = (oper.op == oper_t::GT || oper.op == oper_t::GTE),
|
||||
.order = oper.order,
|
||||
.op = oper.op,
|
||||
});
|
||||
},
|
||||
[&] (const binary_operator&) -> std::vector<predicate> {
|
||||
return cannot_solve(oper);
|
||||
@@ -555,7 +555,7 @@ to_predicates(
|
||||
return cannot_solve(oper);
|
||||
},
|
||||
}, oper.lhs);
|
||||
},
|
||||
},
|
||||
[] (const column_value& cv) -> std::vector<predicate> {
|
||||
return cannot_solve(cv);
|
||||
},
|
||||
@@ -806,26 +806,26 @@ bool is_empty_restriction(const expression& e) {
|
||||
static
|
||||
std::function<bytes_opt (const query_options&)>
|
||||
build_value_for_fn(const column_definition& cdef, const expression& e, const schema& s) {
|
||||
auto ac = to_predicate_on_column(e, &cdef, &s);
|
||||
return [ac] (const query_options& options) -> bytes_opt {
|
||||
value_set possible_vals = solve(ac, options);
|
||||
return std::visit(overloaded_functor {
|
||||
[&](const value_list& val_list) -> bytes_opt {
|
||||
if (val_list.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
if (val_list.size() != 1) {
|
||||
on_internal_error(expr_logger, format("expr::value_for - multiple possible values for column: {}", ac.filter));
|
||||
}
|
||||
|
||||
return to_bytes(val_list.front());
|
||||
},
|
||||
[&](const interval<managed_bytes>&) -> bytes_opt {
|
||||
on_internal_error(expr_logger, format("expr::value_for - possible values are a range: {}", ac.filter));
|
||||
auto ac = to_predicate_on_column(e, &cdef, &s);
|
||||
return [ac] (const query_options& options) -> bytes_opt {
|
||||
value_set possible_vals = solve(ac, options);
|
||||
return std::visit(overloaded_functor {
|
||||
[&](const value_list& val_list) -> bytes_opt {
|
||||
if (val_list.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
}, possible_vals);
|
||||
};
|
||||
|
||||
if (val_list.size() != 1) {
|
||||
on_internal_error(expr_logger, format("expr::value_for - multiple possible values for column: {}", ac.filter));
|
||||
}
|
||||
|
||||
return to_bytes(val_list.front());
|
||||
},
|
||||
[&](const interval<managed_bytes>&) -> bytes_opt {
|
||||
on_internal_error(expr_logger, format("expr::value_for - possible values are a range: {}", ac.filter));
|
||||
}
|
||||
}, possible_vals);
|
||||
};
|
||||
}
|
||||
|
||||
bool contains_multi_column_restriction(const expression& e) {
|
||||
@@ -1337,11 +1337,11 @@ statement_restrictions::ck_restrictions_need_filtering() const {
|
||||
}
|
||||
|
||||
return has_partition_key_unrestricted_components()
|
||||
|| clustering_key_restrictions_need_filtering()
|
||||
// If token restrictions are present in an indexed query, then all other restrictions need to be filtered.
|
||||
// A single token restriction can have multiple matching partition key values.
|
||||
// Because of this we can't create a clustering prefix with more than token restriction.
|
||||
|| (_uses_secondary_indexing && has_token_restrictions());
|
||||
|| clustering_key_restrictions_need_filtering()
|
||||
// If token restrictions are present in an indexed query, then all other restrictions need to be filtered.
|
||||
// A single token restriction can have multiple matching partition key values.
|
||||
// Because of this we can't create a clustering prefix with more than token restriction.
|
||||
|| (_uses_secondary_indexing && has_token_restrictions());
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -1705,28 +1705,28 @@ dht::partition_range_vector statement_restrictions::get_partition_key_ranges(con
|
||||
get_partition_key_ranges_fn_t
|
||||
statement_restrictions::build_partition_key_ranges_fn() const {
|
||||
return std::visit(overloaded_functor{
|
||||
[&] (const no_partition_range_restrictions&) -> get_partition_key_ranges_fn_t {
|
||||
return [] (const query_options& options) -> dht::partition_range_vector{
|
||||
return {dht::partition_range::make_open_ended_both_sides()};
|
||||
[&] (const no_partition_range_restrictions&) -> get_partition_key_ranges_fn_t {
|
||||
return [] (const query_options& options) -> dht::partition_range_vector{
|
||||
return {dht::partition_range::make_open_ended_both_sides()};
|
||||
};
|
||||
},
|
||||
[&] (const token_range_restrictions& r) -> get_partition_key_ranges_fn_t {
|
||||
return [&] (const query_options& options) -> dht::partition_range_vector {
|
||||
return partition_ranges_from_token(r.token_restrictions, options, *_schema);
|
||||
};
|
||||
},
|
||||
[&] (const single_column_partition_range_restrictions& r) -> get_partition_key_ranges_fn_t {
|
||||
if (_partition_range_is_simple) {
|
||||
return [&] (const query_options& options) {
|
||||
// Special case to avoid extra allocations required for a Cartesian product.
|
||||
return partition_ranges_from_EQs(r.per_column_restrictions, options, *_schema);
|
||||
};
|
||||
},
|
||||
[&] (const token_range_restrictions& r) -> get_partition_key_ranges_fn_t {
|
||||
return [&] (const query_options& options) -> dht::partition_range_vector {
|
||||
return partition_ranges_from_token(r.token_restrictions, options, *_schema);
|
||||
} else {
|
||||
return [&] (const query_options& options) {
|
||||
return partition_ranges_from_singles(r.per_column_restrictions, options, *_schema);
|
||||
};
|
||||
},
|
||||
[&] (const single_column_partition_range_restrictions& r) -> get_partition_key_ranges_fn_t {
|
||||
if (_partition_range_is_simple) {
|
||||
return [&] (const query_options& options) {
|
||||
// Special case to avoid extra allocations required for a Cartesian product.
|
||||
return partition_ranges_from_EQs(r.per_column_restrictions, options, *_schema);
|
||||
};
|
||||
} else {
|
||||
return [&] (const query_options& options) {
|
||||
return partition_ranges_from_singles(r.per_column_restrictions, options, *_schema);
|
||||
};
|
||||
}
|
||||
}}, _partition_range_restrictions);
|
||||
}
|
||||
}}, _partition_range_restrictions);
|
||||
}
|
||||
|
||||
namespace {
|
||||
@@ -1970,28 +1970,28 @@ build_get_multi_column_clustering_bounds_fn(
|
||||
}
|
||||
});
|
||||
}
|
||||
return [schema, range_builders, all_natural, all_reverse] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
multi_column_range_accumulator acc;
|
||||
for (auto& builder : range_builders) {
|
||||
builder(acc, options);
|
||||
}
|
||||
auto bounds = std::move(acc.ranges);
|
||||
return [schema, range_builders, all_natural, all_reverse] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
multi_column_range_accumulator acc;
|
||||
for (auto& builder : range_builders) {
|
||||
builder(acc, options);
|
||||
}
|
||||
auto bounds = std::move(acc.ranges);
|
||||
|
||||
if (!all_natural && !all_reverse) {
|
||||
std::vector<query::clustering_range> bounds_in_clustering_order;
|
||||
for (const auto& b : bounds) {
|
||||
const auto eqv = get_equivalent_ranges(b, *schema);
|
||||
bounds_in_clustering_order.insert(bounds_in_clustering_order.end(), eqv.cbegin(), eqv.cend());
|
||||
}
|
||||
return bounds_in_clustering_order;
|
||||
if (!all_natural && !all_reverse) {
|
||||
std::vector<query::clustering_range> bounds_in_clustering_order;
|
||||
for (const auto& b : bounds) {
|
||||
const auto eqv = get_equivalent_ranges(b, *schema);
|
||||
bounds_in_clustering_order.insert(bounds_in_clustering_order.end(), eqv.cbegin(), eqv.cend());
|
||||
}
|
||||
if (all_reverse) {
|
||||
for (auto& crange : bounds) {
|
||||
crange = query::clustering_range(crange.end(), crange.start());
|
||||
}
|
||||
return bounds_in_clustering_order;
|
||||
}
|
||||
if (all_reverse) {
|
||||
for (auto& crange : bounds) {
|
||||
crange = query::clustering_range(crange.end(), crange.start());
|
||||
}
|
||||
return bounds;
|
||||
};
|
||||
}
|
||||
return bounds;
|
||||
};
|
||||
}
|
||||
|
||||
/// Reverses the range if the type is reversed. Why don't we have interval::reverse()??
|
||||
@@ -2288,17 +2288,17 @@ build_range_from_raw_bounds_fn(
|
||||
std::vector<std::function<query::clustering_range (const query_options&)>> range_builders;
|
||||
for (const auto& e : exprs | std::views::transform(&predicate::filter)) {
|
||||
if (auto b = find_clustering_order(e)) {
|
||||
range_builders.emplace_back([bb = *b, &schema] (const query_options& options) {
|
||||
auto* b = &bb;
|
||||
cql3::raw_value tup_val = expr::evaluate(b->rhs, options);
|
||||
if (tup_val.is_null()) {
|
||||
on_internal_error(rlogger, format("range_from_raw_bounds: unexpected atom {}", *b));
|
||||
}
|
||||
range_builders.emplace_back([bb = *b, &schema] (const query_options& options) {
|
||||
auto* b = &bb;
|
||||
cql3::raw_value tup_val = expr::evaluate(b->rhs, options);
|
||||
if (tup_val.is_null()) {
|
||||
on_internal_error(rlogger, format("range_from_raw_bounds: unexpected atom {}", *b));
|
||||
}
|
||||
|
||||
const auto r = to_range(
|
||||
const auto r = to_range(
|
||||
b->op, clustering_key_prefix::from_optional_exploded(schema, expr::get_tuple_elements(tup_val, *type_of(b->rhs))));
|
||||
return r;
|
||||
});
|
||||
return r;
|
||||
});
|
||||
}
|
||||
}
|
||||
return [range_builders] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
@@ -2322,9 +2322,9 @@ build_range_from_raw_bounds_fn(
|
||||
get_clustering_bounds_fn_t
|
||||
statement_restrictions::build_get_clustering_bounds_fn() const {
|
||||
if (_clustering_prefix_restrictions.empty()) {
|
||||
return [&] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
return {query::clustering_range::make_open_ended_both_sides()};
|
||||
};
|
||||
return [&] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
return {query::clustering_range::make_open_ended_both_sides()};
|
||||
};
|
||||
}
|
||||
if (_clustering_prefix_restrictions[0].is_multi_column) {
|
||||
bool all_natural = true, all_reverse = true; ///< Whether column types are reversed or natural.
|
||||
@@ -2342,14 +2342,14 @@ statement_restrictions::build_get_clustering_bounds_fn() const {
|
||||
}
|
||||
}
|
||||
}
|
||||
return build_get_multi_column_clustering_bounds_fn(_schema, _clustering_prefix_restrictions,
|
||||
all_natural, all_reverse);
|
||||
} else {
|
||||
return [&] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
return get_single_column_clustering_bounds(options, *_schema, _clustering_prefix_restrictions);
|
||||
};
|
||||
}
|
||||
return build_get_multi_column_clustering_bounds_fn(_schema, _clustering_prefix_restrictions,
|
||||
all_natural, all_reverse);
|
||||
} else {
|
||||
return [&] (const query_options& options) -> std::vector<query::clustering_range> {
|
||||
return get_single_column_clustering_bounds(options, *_schema, _clustering_prefix_restrictions);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<query::clustering_range> statement_restrictions::get_clustering_bounds(const query_options& options) const {
|
||||
return _get_clustering_bounds_fn(options);
|
||||
@@ -2475,11 +2475,11 @@ void statement_restrictions::prepare_indexed_global(const schema& idx_tbl_schema
|
||||
_idx_tbl_ck_prefix->reserve(_idx_tbl_ck_prefix->size() + idx_tbl_schema.clustering_key_size());
|
||||
auto *single_column_partition_key_restrictions = std::get_if<single_column_partition_range_restrictions>(&_partition_range_restrictions);
|
||||
if (single_column_partition_key_restrictions) {
|
||||
for (const auto& e : single_column_partition_key_restrictions->per_column_restrictions) {
|
||||
const auto col = require_on_single_column(e);
|
||||
const auto pos = _schema->position(*col) + 1;
|
||||
(*_idx_tbl_ck_prefix)[pos] = replace_column_def(e, &idx_tbl_schema.clustering_column_at(pos));
|
||||
}
|
||||
for (const auto& e : single_column_partition_key_restrictions->per_column_restrictions) {
|
||||
const auto col = require_on_single_column(e);
|
||||
const auto pos = _schema->position(*col) + 1;
|
||||
(*_idx_tbl_ck_prefix)[pos] = replace_column_def(e, &idx_tbl_schema.clustering_column_at(pos));
|
||||
}
|
||||
}
|
||||
|
||||
if (std::ranges::any_of(*_idx_tbl_ck_prefix | std::views::drop(1) | std::views::transform(&predicate::filter), is_empty_restriction)) {
|
||||
@@ -2621,10 +2621,10 @@ statement_restrictions::build_get_global_index_clustering_ranges_fn() const {
|
||||
return {};
|
||||
}
|
||||
|
||||
return [&] (const query_options& options) {
|
||||
// Multi column restrictions are not added to _idx_tbl_ck_prefix, they are handled later by filtering.
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
return [&] (const query_options& options) {
|
||||
// Multi column restrictions are not added to _idx_tbl_ck_prefix, they are handled later by filtering.
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
}
|
||||
|
||||
std::vector<query::clustering_range> statement_restrictions::get_global_index_clustering_ranges(
|
||||
@@ -2643,14 +2643,14 @@ statement_restrictions::build_get_global_index_token_clustering_ranges_fn() cons
|
||||
// In old indexes the token column was of type blob.
|
||||
// This causes problems with sorting and must be handled separately.
|
||||
if (token_column.type != long_type) {
|
||||
return [&] (const query_options& options) {
|
||||
return get_index_v1_token_range_clustering_bounds(options, token_column, _idx_tbl_ck_prefix->at(0));
|
||||
};
|
||||
return [&] (const query_options& options) {
|
||||
return get_index_v1_token_range_clustering_bounds(options, token_column, _idx_tbl_ck_prefix->at(0));
|
||||
};
|
||||
}
|
||||
|
||||
return [&] (const query_options& options) {
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
return [&] (const query_options& options) {
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
}
|
||||
|
||||
std::vector<query::clustering_range> statement_restrictions::get_global_index_token_clustering_ranges(
|
||||
@@ -2664,10 +2664,10 @@ statement_restrictions::build_get_local_index_clustering_ranges_fn() const {
|
||||
return {};
|
||||
}
|
||||
|
||||
return [&] (const query_options& options) {
|
||||
// Multi column restrictions are not added to _idx_tbl_ck_prefix, they are handled later by filtering.
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
return [&] (const query_options& options) {
|
||||
// Multi column restrictions are not added to _idx_tbl_ck_prefix, they are handled later by filtering.
|
||||
return get_single_column_clustering_bounds(options, *_view_schema, *_idx_tbl_ck_prefix);
|
||||
};
|
||||
}
|
||||
|
||||
std::vector<query::clustering_range> statement_restrictions::get_local_index_clustering_ranges(
|
||||
|
||||
@@ -351,9 +351,6 @@ public:
|
||||
if (agg.state_to_result_function) {
|
||||
ret.push_back(agg.state_to_result_function);
|
||||
}
|
||||
if (agg.state_reduction_function) {
|
||||
ret.push_back(agg.state_reduction_function);
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
|
||||
@@ -71,7 +71,7 @@ future<shared_ptr<result_message>> modification_statement::execute_without_check
|
||||
using namespace service::strong_consistency;
|
||||
if (const auto* redirect = get_if<need_redirect>(&mutate_result)) {
|
||||
bool is_write = true;
|
||||
co_return co_await redirect_statement(qp, options, redirect->target, timeout, is_write, coordinator.get().get_stats());
|
||||
co_return co_await redirect_statement(qp, options, redirect->target, timeout, is_write);
|
||||
}
|
||||
utils::get_local_injector().inject("sc_modification_statement_timeout", [&] {
|
||||
throw exceptions::mutation_write_timeout_exception{"", "", options.get_consistency(), 0, 0, db::write_type::SIMPLE};
|
||||
|
||||
@@ -47,7 +47,7 @@ future<::shared_ptr<result_message>> select_statement::do_execute(query_processo
|
||||
using namespace service::strong_consistency;
|
||||
if (const auto* redirect = get_if<need_redirect>(&query_result)) {
|
||||
bool is_write = false;
|
||||
co_return co_await redirect_statement(qp, options, redirect->target, timeout, is_write, coordinator.get().get_stats());
|
||||
co_return co_await redirect_statement(qp, options, redirect->target, timeout, is_write);
|
||||
}
|
||||
|
||||
co_return co_await process_results(get<lw_shared_ptr<query::result>>(std::move(query_result)),
|
||||
|
||||
@@ -12,23 +12,19 @@
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "replica/database.hh"
|
||||
#include "locator/tablet_replication_strategy.hh"
|
||||
#include "service/strong_consistency/coordinator.hh"
|
||||
|
||||
namespace cql3::statements::strong_consistency {
|
||||
future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement(query_processor& qp,
|
||||
const query_options& options,
|
||||
const locator::tablet_replica& target,
|
||||
db::timeout_clock::time_point timeout,
|
||||
bool is_write,
|
||||
service::strong_consistency::stats& stats)
|
||||
bool is_write)
|
||||
{
|
||||
auto&& func_values_cache = const_cast<cql3::query_options&>(options).take_cached_pk_function_calls();
|
||||
const auto my_host_id = qp.db().real_database().get_token_metadata().get_topology().my_host_id();
|
||||
if (target.host != my_host_id) {
|
||||
++(is_write ? stats.write_node_bounces : stats.read_node_bounces);
|
||||
co_return qp.bounce_to_node(target, std::move(func_values_cache), timeout, is_write);
|
||||
}
|
||||
++(is_write ? stats.write_shard_bounces : stats.read_shard_bounces);
|
||||
co_return qp.bounce_to_shard(target.shard, std::move(func_values_cache));
|
||||
}
|
||||
|
||||
|
||||
@@ -11,8 +11,6 @@
|
||||
#include "cql3/cql_statement.hh"
|
||||
#include "locator/tablets.hh"
|
||||
|
||||
namespace service::strong_consistency { struct stats; }
|
||||
|
||||
namespace cql3::statements::strong_consistency {
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement(
|
||||
@@ -20,8 +18,7 @@ future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement
|
||||
const query_options& options,
|
||||
const locator::tablet_replica& target,
|
||||
db::timeout_clock::time_point timeout,
|
||||
bool is_write,
|
||||
service::strong_consistency::stats& stats);
|
||||
bool is_write);
|
||||
|
||||
bool is_strongly_consistent(data_dictionary::database db, std::string_view ks_name);
|
||||
|
||||
|
||||
@@ -1429,13 +1429,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, enable_shard_aware_drivers(this, "enable_shard_aware_drivers", value_status::Used, true, "Enable native transport drivers to use connection-per-shard for better performance.")
|
||||
, enable_ipv6_dns_lookup(this, "enable_ipv6_dns_lookup", value_status::Used, false, "Use IPv6 address resolution")
|
||||
, abort_on_internal_error(this, "abort_on_internal_error", liveness::LiveUpdate, value_status::Used, false, "Abort the server instead of throwing exception when internal invariants are violated.")
|
||||
, abort_on_malformed_sstable_error(this, "abort_on_malformed_sstable_error", liveness::LiveUpdate, value_status::Used,
|
||||
#if defined(DEBUG) || defined(DEVEL)
|
||||
true,
|
||||
#else
|
||||
false,
|
||||
#endif
|
||||
"Abort the server and generate a coredump instead of throwing an exception when any sstable parse error is detected (malformed_sstable_exception, bufsize_mismatch_exception, parse_assert() failures, or BTI parse errors). Intended for debugging memory corruption that may manifest as sstable corruption. Defaults to true in debug and dev builds.")
|
||||
, max_partition_key_restrictions_per_query(this, "max_partition_key_restrictions_per_query", liveness::LiveUpdate, value_status::Used, 100,
|
||||
"Maximum number of distinct partition keys restrictions per query. This limit places a bound on the size of IN tuples, "
|
||||
"especially when multiple partition key columns have IN restrictions. Increasing this value can result in server instability.")
|
||||
|
||||
@@ -456,7 +456,6 @@ public:
|
||||
named_value<bool> enable_shard_aware_drivers;
|
||||
named_value<bool> enable_ipv6_dns_lookup;
|
||||
named_value<bool> abort_on_internal_error;
|
||||
named_value<bool> abort_on_malformed_sstable_error;
|
||||
named_value<uint32_t> max_partition_key_restrictions_per_query;
|
||||
named_value<uint32_t> max_clustering_key_restrictions_per_query;
|
||||
named_value<uint64_t> max_memory_for_unlimited_query_soft_limit;
|
||||
|
||||
@@ -29,9 +29,6 @@ class large_data_handler {
|
||||
public:
|
||||
struct stats {
|
||||
int64_t partitions_bigger_than_threshold = 0; // number of large partition updates exceeding threshold_bytes
|
||||
int64_t rows_bigger_than_threshold = 0; // number of large row updates exceeding row_threshold_bytes
|
||||
int64_t cells_bigger_than_threshold = 0; // number of large cell updates exceeding cell_threshold_bytes
|
||||
int64_t collections_bigger_than_threshold = 0; // number of large collection updates exceeding collection_elements_count_threshold
|
||||
};
|
||||
|
||||
private:
|
||||
@@ -85,7 +82,6 @@ public:
|
||||
const clustering_key_prefix* clustering_key, uint64_t row_size) {
|
||||
SCYLLA_ASSERT(running());
|
||||
if (row_size > _row_threshold_bytes) [[unlikely]] {
|
||||
++_stats.rows_bigger_than_threshold;
|
||||
return with_sem([&sst, &partition_key, clustering_key, row_size, this] {
|
||||
return record_large_rows(sst, partition_key, clustering_key, row_size);
|
||||
}).then([] {
|
||||
@@ -106,8 +102,6 @@ public:
|
||||
const clustering_key_prefix* clustering_key, const column_definition& cdef, uint64_t cell_size, uint64_t collection_elements) {
|
||||
SCYLLA_ASSERT(running());
|
||||
above_threshold_result above_threshold{.size = cell_size > _cell_threshold_bytes, .elements = collection_elements > _collection_elements_count_threshold};
|
||||
_stats.cells_bigger_than_threshold += above_threshold.size;
|
||||
_stats.collections_bigger_than_threshold += above_threshold.elements;
|
||||
if (above_threshold.size || above_threshold.elements) [[unlikely]] {
|
||||
return with_sem([&sst, &partition_key, clustering_key, &cdef, cell_size, collection_elements, this] {
|
||||
return record_large_cells(sst, partition_key, clustering_key, cdef, cell_size, collection_elements);
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "db/snapshot-ctl.hh"
|
||||
#include "db/snapshot/backup_task.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "sstables/exceptions.hh"
|
||||
#include "sstables/sstables.hh"
|
||||
#include "sstables/sstable_directory.hh"
|
||||
#include "sstables/sstables_manager.hh"
|
||||
@@ -163,23 +164,22 @@ future<> backup_task_impl::process_snapshot_dir() {
|
||||
auto file_path = _snapshot_dir / name;
|
||||
auto st = co_await file_stat(directory, name);
|
||||
total += st.size;
|
||||
auto result = sstables::parse_path(file_path, "", "");
|
||||
if (!result) {
|
||||
_files.emplace_back(name);
|
||||
continue;
|
||||
}
|
||||
auto desc = std::move(*result);
|
||||
const auto& gen = desc.generation;
|
||||
_sstable_comps[gen].emplace_back(name);
|
||||
_sstables_in_snapshot.insert(desc.generation);
|
||||
++num_sstable_comps;
|
||||
try {
|
||||
auto desc = sstables::parse_path(file_path, "", "");
|
||||
const auto& gen = desc.generation;
|
||||
_sstable_comps[gen].emplace_back(name);
|
||||
_sstables_in_snapshot.insert(desc.generation);
|
||||
++num_sstable_comps;
|
||||
|
||||
// When the SSTable is only linked-to by the snapshot directory,
|
||||
// it is already deleted from the table's base directory, and
|
||||
// therefore it better be uploaded earlier to free-up its capacity.
|
||||
if (desc.component == sstables::component_type::Data && st.number_of_links == 1) {
|
||||
snap_log.debug("backup_task: SSTable with generation {} is already deleted from the table", gen);
|
||||
_deleted_sstables.push_back(gen);
|
||||
// When the SSTable is only linked-to by the snapshot directory,
|
||||
// it is already deleted from the table's base directory, and
|
||||
// therefore it better be uploaded earlier to free-up its capacity.
|
||||
if (desc.component == sstables::component_type::Data && st.number_of_links == 1) {
|
||||
snap_log.debug("backup_task: SSTable with generation {} is already deleted from the table", gen);
|
||||
_deleted_sstables.push_back(gen);
|
||||
}
|
||||
} catch (const sstables::malformed_sstable_exception&) {
|
||||
_files.emplace_back(name);
|
||||
}
|
||||
}
|
||||
_total_progress.total = total;
|
||||
|
||||
@@ -96,38 +96,6 @@ schema_ptr cdc_timestamps() {
|
||||
|
||||
static const sstring CDC_TIMESTAMPS_KEY = "timestamps";
|
||||
|
||||
schema_ptr snapshot_sstables() {
|
||||
static thread_local auto schema = [] {
|
||||
auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::SNAPSHOT_SSTABLES);
|
||||
return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::SNAPSHOT_SSTABLES, std::make_optional(id))
|
||||
// Name of the snapshot
|
||||
.with_column("snapshot_name", utf8_type, column_kind::partition_key)
|
||||
// Keyspace where the snapshot was taken
|
||||
.with_column("keyspace", utf8_type, column_kind::partition_key)
|
||||
// Table within the keyspace
|
||||
.with_column("table", utf8_type, column_kind::partition_key)
|
||||
// Datacenter where this SSTable is located
|
||||
.with_column("datacenter", utf8_type, column_kind::partition_key)
|
||||
// Rack where this SSTable is located
|
||||
.with_column("rack", utf8_type, column_kind::partition_key)
|
||||
// First token in the token range covered by this SSTable
|
||||
.with_column("first_token", long_type, column_kind::clustering_key)
|
||||
// Unique identifier for the SSTable (UUID)
|
||||
.with_column("sstable_id", uuid_type, column_kind::clustering_key)
|
||||
// Last token in the token range covered by this SSTable
|
||||
.with_column("last_token", long_type)
|
||||
// TOC filename of the SSTable
|
||||
.with_column("toc_name", utf8_type)
|
||||
// Prefix path in object storage where the SSTable was backed up
|
||||
.with_column("prefix", utf8_type)
|
||||
// Flag if the SSTable was downloaded already
|
||||
.with_column("downloaded", boolean_type)
|
||||
.with_hash_version()
|
||||
.build();
|
||||
}();
|
||||
return schema;
|
||||
}
|
||||
|
||||
// This is the set of tables which this node ensures to exist in the cluster.
|
||||
// It does that by announcing the creation of these schemas on initialization
|
||||
// of the `system_distributed_keyspace` service (see `start()`), unless it first
|
||||
@@ -143,12 +111,11 @@ static std::vector<schema_ptr> ensured_tables() {
|
||||
view_build_status(),
|
||||
cdc_desc(),
|
||||
cdc_timestamps(),
|
||||
snapshot_sstables(),
|
||||
};
|
||||
}
|
||||
|
||||
std::vector<schema_ptr> system_distributed_keyspace::all_distributed_tables() {
|
||||
return {view_build_status(), cdc_desc(), cdc_timestamps(), snapshot_sstables()};
|
||||
return {view_build_status(), cdc_desc(), cdc_timestamps()};
|
||||
}
|
||||
|
||||
system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm, service::storage_proxy& sp)
|
||||
@@ -433,83 +400,4 @@ system_distributed_keyspace::cdc_current_generation_timestamp(context ctx) {
|
||||
co_return timestamp_cql->one().get_as<db_clock::time_point>("time");
|
||||
}
|
||||
|
||||
future<> system_distributed_keyspace::insert_snapshot_sstable(sstring snapshot_name, sstring ks, sstring table, sstring dc, sstring rack, sstables::sstable_id sstable_id, dht::token first_token, dht::token last_token, sstring toc_name, sstring prefix, db::consistency_level cl) {
|
||||
// Not inserting the downloaded column so that re-populating on restore
|
||||
// retry doesn't overwrite downloaded=true set by a previous attempt
|
||||
static const sstring query = format("INSERT INTO {}.{} (snapshot_name, \"keyspace\", \"table\", datacenter, rack, first_token, sstable_id, last_token, toc_name, prefix) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) USING TTL {}", NAME, SNAPSHOT_SSTABLES, SNAPSHOT_SSTABLES_TTL_SECONDS);
|
||||
|
||||
return _qp.execute_internal(
|
||||
query,
|
||||
cl,
|
||||
internal_distributed_query_state(),
|
||||
{ std::move(snapshot_name), std::move(ks), std::move(table), std::move(dc), std::move(rack),
|
||||
dht::token::to_int64(first_token), sstable_id.uuid(), dht::token::to_int64(last_token), std::move(toc_name), std::move(prefix) },
|
||||
cql3::query_processor::cache_internal::yes).discard_result();
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<snapshot_sstable_entry>>
|
||||
system_distributed_keyspace::get_snapshot_sstables(sstring snapshot_name, sstring ks, sstring table, sstring dc, sstring rack, db::consistency_level cl, std::optional<dht::token> start_token, std::optional<dht::token> end_token) const {
|
||||
utils::chunked_vector<snapshot_sstable_entry> sstables;
|
||||
|
||||
static const sstring base_query = format("SELECT toc_name, prefix, sstable_id, first_token, last_token, downloaded FROM {}.{}"
|
||||
" WHERE snapshot_name = ? AND \"keyspace\" = ? AND \"table\" = ? AND datacenter = ? AND rack = ?", NAME, SNAPSHOT_SSTABLES);
|
||||
|
||||
auto read_row = [&] (const cql3::untyped_result_set_row& row) {
|
||||
sstables.emplace_back(sstables::sstable_id(row.get_as<utils::UUID>("sstable_id")), dht::token::from_int64(row.get_as<int64_t>("first_token")), dht::token::from_int64(row.get_as<int64_t>("last_token")), row.get_as<sstring>("toc_name"), row.get_as<sstring>("prefix"), is_downloaded(row.get_or<bool>("downloaded", false)));
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
};
|
||||
|
||||
if (start_token && end_token) {
|
||||
co_await _qp.query_internal(
|
||||
base_query + " AND first_token >= ? AND first_token <= ?",
|
||||
cl,
|
||||
{ snapshot_name, ks, table, dc, rack, dht::token::to_int64(*start_token), dht::token::to_int64(*end_token) },
|
||||
1000,
|
||||
read_row);
|
||||
} else if (start_token) {
|
||||
co_await _qp.query_internal(
|
||||
base_query + " AND first_token >= ?",
|
||||
cl,
|
||||
{ snapshot_name, ks, table, dc, rack, dht::token::to_int64(*start_token) },
|
||||
1000,
|
||||
read_row);
|
||||
} else if (end_token) {
|
||||
co_await _qp.query_internal(
|
||||
base_query + " AND first_token <= ?",
|
||||
cl,
|
||||
{ snapshot_name, ks, table, dc, rack, dht::token::to_int64(*end_token) },
|
||||
1000,
|
||||
read_row);
|
||||
} else {
|
||||
co_await _qp.query_internal(
|
||||
base_query,
|
||||
cl,
|
||||
{ snapshot_name, ks, table, dc, rack },
|
||||
1000,
|
||||
read_row);
|
||||
}
|
||||
|
||||
co_return sstables;
|
||||
}
|
||||
|
||||
future<> system_distributed_keyspace::update_sstable_download_status(sstring snapshot_name,
|
||||
sstring ks,
|
||||
sstring table,
|
||||
sstring dc,
|
||||
sstring rack,
|
||||
sstables::sstable_id sstable_id,
|
||||
dht::token start_token,
|
||||
is_downloaded downloaded) const {
|
||||
static const sstring update_query = format("UPDATE {}.{} USING TTL {} SET downloaded = ? WHERE snapshot_name = ? AND \"keyspace\" = ? AND \"table\" = ? AND "
|
||||
"datacenter = ? AND rack = ? AND first_token = ? AND sstable_id = ?",
|
||||
NAME,
|
||||
SNAPSHOT_SSTABLES,
|
||||
SNAPSHOT_SSTABLES_TTL_SECONDS);
|
||||
co_await _qp.execute_internal(update_query,
|
||||
consistency_level::ONE,
|
||||
internal_distributed_query_state(),
|
||||
{downloaded == is_downloaded::yes ? true : false, snapshot_name, ks, table, dc, rack, dht::token::to_int64(start_token), sstable_id.uuid()},
|
||||
cql3::query_processor::cache_internal::no);
|
||||
}
|
||||
|
||||
} // namespace db
|
||||
|
||||
@@ -9,17 +9,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "db/consistency_level_type.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "dht/token.hh"
|
||||
#include "sstables/types.hh"
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <seastar/util/bool_class.hh>
|
||||
|
||||
#include <optional>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace cql3 {
|
||||
@@ -36,20 +30,8 @@ namespace service {
|
||||
class migration_manager;
|
||||
}
|
||||
|
||||
|
||||
namespace db {
|
||||
|
||||
using is_downloaded = bool_class<class is_downloaded_tag>;
|
||||
|
||||
struct snapshot_sstable_entry {
|
||||
sstables::sstable_id sstable_id;
|
||||
dht::token first_token;
|
||||
dht::token last_token;
|
||||
sstring toc_name;
|
||||
sstring prefix;
|
||||
is_downloaded downloaded{is_downloaded::no};
|
||||
};
|
||||
|
||||
class system_distributed_keyspace {
|
||||
public:
|
||||
static constexpr auto NAME = "system_distributed";
|
||||
@@ -67,12 +49,6 @@ public:
|
||||
* in the old table also appear in the new table, if necessary. */
|
||||
static constexpr auto CDC_DESC_V1 = "cdc_streams_descriptions";
|
||||
|
||||
/* This table is used by the backup and restore code to store per-sstable metadata.
|
||||
* The data the coordinator node puts in this table comes from the snapshot manifests. */
|
||||
static constexpr auto SNAPSHOT_SSTABLES = "snapshot_sstables";
|
||||
|
||||
static constexpr uint64_t SNAPSHOT_SSTABLES_TTL_SECONDS = std::chrono::seconds(std::chrono::days(3)).count();
|
||||
|
||||
/* Information required to modify/query some system_distributed tables, passed from the caller. */
|
||||
struct context {
|
||||
/* How many different token owners (endpoints) are there in the token ring? */
|
||||
@@ -111,26 +87,6 @@ public:
|
||||
// NOTE: currently used only by alternator
|
||||
future<db_clock::time_point> cdc_current_generation_timestamp(context);
|
||||
|
||||
/* Inserts a single SSTable entry for a given snapshot, keyspace, table, datacenter,
|
||||
* and rack. The row is written with the specified TTL (in seconds). Uses consistency
|
||||
* level `EACH_QUORUM` by default.*/
|
||||
future<> insert_snapshot_sstable(sstring snapshot_name, sstring ks, sstring table, sstring dc, sstring rack, sstables::sstable_id sstable_id, dht::token first_token, dht::token last_token, sstring toc_name, sstring prefix, db::consistency_level cl = db::consistency_level::EACH_QUORUM);
|
||||
|
||||
/* Retrieves all SSTable entries for a given snapshot, keyspace, table, datacenter, and rack.
|
||||
* If `start_token` and `end_token` are provided, only entries whose `first_token` is in the range [`start_token`, `end_token`] will be returned.
|
||||
* Returns a vector of `snapshot_sstable_entry` structs containing `sstable_id`, `first_token`, `last_token`,
|
||||
* `toc_name`, and `prefix`. Uses consistency level `LOCAL_QUORUM` by default. */
|
||||
future<utils::chunked_vector<snapshot_sstable_entry>> get_snapshot_sstables(sstring snapshot_name, sstring ks, sstring table, sstring dc, sstring rack, db::consistency_level cl = db::consistency_level::LOCAL_QUORUM, std::optional<dht::token> start_token = std::nullopt, std::optional<dht::token> end_token = std::nullopt) const;
|
||||
|
||||
future<> update_sstable_download_status(sstring snapshot_name,
|
||||
sstring ks,
|
||||
sstring table,
|
||||
sstring dc,
|
||||
sstring rack,
|
||||
sstables::sstable_id sstable_id,
|
||||
dht::token start_token,
|
||||
is_downloaded downloaded) const;
|
||||
|
||||
private:
|
||||
future<> create_tables(std::vector<schema_ptr> tables);
|
||||
};
|
||||
|
||||
@@ -1146,8 +1146,7 @@ schema_ptr system_keyspace::sstables_registry() {
|
||||
static thread_local auto schema = [] {
|
||||
auto id = generate_legacy_id(NAME, SSTABLES_REGISTRY);
|
||||
return schema_builder(NAME, SSTABLES_REGISTRY, id)
|
||||
.with_column("table_id", uuid_type, column_kind::partition_key)
|
||||
.with_column("node_owner", uuid_type, column_kind::partition_key)
|
||||
.with_column("owner", uuid_type, column_kind::partition_key)
|
||||
.with_column("generation", timeuuid_type, column_kind::clustering_key)
|
||||
.with_column("status", utf8_type)
|
||||
.with_column("state", utf8_type)
|
||||
@@ -1310,7 +1309,6 @@ schema_ptr system_keyspace::view_building_tasks() {
|
||||
return schema_builder(NAME, VIEW_BUILDING_TASKS, std::make_optional(id))
|
||||
.with_column("key", utf8_type, column_kind::partition_key)
|
||||
.with_column("id", timeuuid_type, column_kind::clustering_key)
|
||||
.with_column("min_task_id", timeuuid_type, column_kind::static_column)
|
||||
.with_column("type", utf8_type)
|
||||
.with_column("aborted", boolean_type)
|
||||
.with_column("base_id", uuid_type)
|
||||
@@ -2751,36 +2749,12 @@ future<mutation> system_keyspace::make_remove_view_build_status_on_host_mutation
|
||||
|
||||
static constexpr auto VIEW_BUILDING_KEY = "view_building";
|
||||
|
||||
future<std::pair<db::view::building_tasks, std::optional<utils::UUID>>> system_keyspace::get_view_building_tasks() {
|
||||
future<db::view::building_tasks> system_keyspace::get_view_building_tasks() {
|
||||
static const sstring query = format("SELECT id, type, aborted, base_id, view_id, last_token, host_id, shard FROM {}.{} WHERE key = '{}'", NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
|
||||
using namespace db::view;
|
||||
|
||||
// When the VIEW_BUILDING_TASKS_MIN_TASK_ID feature is active, read the static
|
||||
// column min_task_id first and use it as a lower bound for the clustering row
|
||||
// scan. This skips tombstoned rows below the boundary, avoiding dead-cell
|
||||
// warnings from the tombstone_warn_threshold check.
|
||||
std::optional<utils::UUID> min_task_id;
|
||||
if (_db.features().view_building_tasks_min_task_id) {
|
||||
auto schema = view_building_tasks();
|
||||
auto pk = partition_key::from_single_value(*schema, data_value(VIEW_BUILDING_KEY).serialize_nonnull());
|
||||
auto dk = dht::decorate_key(*schema, pk);
|
||||
auto col_id = schema->get_column_definition("min_task_id")->id;
|
||||
query::partition_slice slice(
|
||||
query::clustering_row_ranges{},
|
||||
{col_id},
|
||||
{},
|
||||
query::partition_slice::option_set::of<query::partition_slice::option::always_return_static_content>());
|
||||
auto cmd = query::read_command(schema->id(), schema->version(), slice,
|
||||
_db.get_query_max_result_size(), query::tombstone_limit::max);
|
||||
auto [qr, _cache_temp] = co_await _db.query(schema, cmd, query::result_options::only_result(),
|
||||
{dht::partition_range::make_singular(dk)}, nullptr, db::no_timeout);
|
||||
auto rs = query::result_set::from_raw_result(schema, slice, *qr);
|
||||
if (!rs.empty()) {
|
||||
min_task_id = rs.row(0).get<utils::UUID>("min_task_id");
|
||||
}
|
||||
}
|
||||
|
||||
building_tasks tasks;
|
||||
auto process_row = [&] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
|
||||
co_await _qp.query_internal(query, [&] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
|
||||
auto id = row.get_as<utils::UUID>("id");
|
||||
auto type = task_type_from_string(row.get_as<sstring>("type"));
|
||||
auto aborted = row.get_as<bool>("aborted");
|
||||
@@ -2805,18 +2779,8 @@ future<std::pair<db::view::building_tasks, std::optional<utils::UUID>>> system_k
|
||||
break;
|
||||
}
|
||||
co_return stop_iteration::no;
|
||||
};
|
||||
|
||||
if (min_task_id) {
|
||||
static const sstring bounded_query = format("SELECT id, type, aborted, base_id, view_id, last_token, host_id, shard FROM {}.{} WHERE key = '{}' AND id >= ?",
|
||||
NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
|
||||
co_await _qp.query_internal(bounded_query, db::consistency_level::LOCAL_ONE, {*min_task_id}, 1000, std::move(process_row));
|
||||
} else {
|
||||
static const sstring full_query = format("SELECT id, type, aborted, base_id, view_id, last_token, host_id, shard FROM {}.{} WHERE key = '{}'",
|
||||
NAME, VIEW_BUILDING_TASKS, VIEW_BUILDING_KEY);
|
||||
co_await _qp.query_internal(full_query, std::move(process_row));
|
||||
}
|
||||
co_return std::pair{std::move(tasks), std::move(min_task_id)};
|
||||
});
|
||||
co_return tasks;
|
||||
}
|
||||
|
||||
future<mutation> system_keyspace::make_view_building_task_mutation(api::timestamp_type ts, const db::view::view_building_task& task) {
|
||||
@@ -3509,37 +3473,37 @@ system_keyspace::read_cdc_generation_opt(utils::UUID id) {
|
||||
co_return cdc::topology_description{std::move(entries)};
|
||||
}
|
||||
|
||||
future<> system_keyspace::sstables_registry_create_entry(table_id tid, locator::host_id node_owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc) {
|
||||
static const auto req = format("INSERT INTO system.{} (table_id, node_owner, generation, status, state, version, format) VALUES (?, ?, ?, ?, ?, ?, ?)", SSTABLES_REGISTRY);
|
||||
slogger.trace("Inserting {}.{}.{} into {}", tid, node_owner, desc.generation, SSTABLES_REGISTRY);
|
||||
co_await execute_cql(req, tid.id, node_owner.uuid(), desc.generation, status, sstables::state_to_dir(state), fmt::to_string(desc.version), fmt::to_string(desc.format)).discard_result();
|
||||
future<> system_keyspace::sstables_registry_create_entry(table_id owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc) {
|
||||
static const auto req = format("INSERT INTO system.{} (owner, generation, status, state, version, format) VALUES (?, ?, ?, ?, ?, ?)", SSTABLES_REGISTRY);
|
||||
slogger.trace("Inserting {}.{} into {}", owner, desc.generation, SSTABLES_REGISTRY);
|
||||
co_await execute_cql(req, owner.id, desc.generation, status, sstables::state_to_dir(state), fmt::to_string(desc.version), fmt::to_string(desc.format)).discard_result();
|
||||
}
|
||||
|
||||
future<> system_keyspace::sstables_registry_update_entry_status(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstring status) {
|
||||
static const auto req = format("UPDATE system.{} SET status = ? WHERE table_id = ? AND node_owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Updating {}.{}.{} -> status={} in {}", tid, node_owner, gen, status, SSTABLES_REGISTRY);
|
||||
co_await execute_cql(req, status, tid.id, node_owner.uuid(), gen).discard_result();
|
||||
future<> system_keyspace::sstables_registry_update_entry_status(table_id owner, sstables::generation_type gen, sstring status) {
|
||||
static const auto req = format("UPDATE system.{} SET status = ? WHERE owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Updating {}.{} -> status={} in {}", owner, gen, status, SSTABLES_REGISTRY);
|
||||
co_await execute_cql(req, status, owner.id, gen).discard_result();
|
||||
}
|
||||
|
||||
future<> system_keyspace::sstables_registry_update_entry_state(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstables::sstable_state state) {
|
||||
static const auto req = format("UPDATE system.{} SET state = ? WHERE table_id = ? AND node_owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
future<> system_keyspace::sstables_registry_update_entry_state(table_id owner, sstables::generation_type gen, sstables::sstable_state state) {
|
||||
static const auto req = format("UPDATE system.{} SET state = ? WHERE owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
auto new_state = sstables::state_to_dir(state);
|
||||
slogger.trace("Updating {}.{}.{} -> state={} in {}", tid, node_owner, gen, new_state, SSTABLES_REGISTRY);
|
||||
co_await execute_cql(req, new_state, tid.id, node_owner.uuid(), gen).discard_result();
|
||||
slogger.trace("Updating {}.{} -> state={} in {}", owner, gen, new_state, SSTABLES_REGISTRY);
|
||||
co_await execute_cql(req, new_state, owner.id, gen).discard_result();
|
||||
}
|
||||
|
||||
future<> system_keyspace::sstables_registry_delete_entry(table_id tid, locator::host_id node_owner, sstables::generation_type gen) {
|
||||
static const auto req = format("DELETE FROM system.{} WHERE table_id = ? AND node_owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Removing {}.{}.{} from {}", tid, node_owner, gen, SSTABLES_REGISTRY);
|
||||
co_await execute_cql(req, tid.id, node_owner.uuid(), gen).discard_result();
|
||||
future<> system_keyspace::sstables_registry_delete_entry(table_id owner, sstables::generation_type gen) {
|
||||
static const auto req = format("DELETE FROM system.{} WHERE owner = ? AND generation = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Removing {}.{} from {}", owner, gen, SSTABLES_REGISTRY);
|
||||
co_await execute_cql(req, owner.id, gen).discard_result();
|
||||
|
||||
}
|
||||
|
||||
future<> system_keyspace::sstables_registry_list(table_id tid, locator::host_id node_owner, sstable_registry_entry_consumer consumer) {
|
||||
static const auto req = format("SELECT status, state, generation, version, format FROM system.{} WHERE table_id = ? AND node_owner = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Listing {}.{} entries from {}", tid, node_owner, SSTABLES_REGISTRY);
|
||||
future<> system_keyspace::sstables_registry_list(table_id owner, sstable_registry_entry_consumer consumer) {
|
||||
static const auto req = format("SELECT status, state, generation, version, format FROM system.{} WHERE owner = ?", SSTABLES_REGISTRY);
|
||||
slogger.trace("Listing {} entries from {}", owner, SSTABLES_REGISTRY);
|
||||
|
||||
co_await _qp.query_internal(req, db::consistency_level::ONE, { tid.id, node_owner.uuid() }, 1000, [ consumer = std::move(consumer) ] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
|
||||
co_await _qp.query_internal(req, db::consistency_level::ONE, { owner.id }, 1000, [ consumer = std::move(consumer) ] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
|
||||
auto status = row.get_as<sstring>("status");
|
||||
auto state = sstables::state_from_dir(row.get_as<sstring>("state"));
|
||||
auto gen = sstables::generation_type(row.get_as<utils::UUID>("generation"));
|
||||
|
||||
@@ -572,7 +572,7 @@ public:
|
||||
future<mutation> make_remove_view_build_status_on_host_mutation(api::timestamp_type ts, system_keyspace_view_name view_name, locator::host_id host_id);
|
||||
|
||||
// system.view_building_tasks
|
||||
future<std::pair<db::view::building_tasks, std::optional<utils::UUID>>> get_view_building_tasks();
|
||||
future<db::view::building_tasks> get_view_building_tasks();
|
||||
future<mutation> make_view_building_task_mutation(api::timestamp_type ts, const db::view::view_building_task& task);
|
||||
future<mutation> make_remove_view_building_task_mutation(api::timestamp_type ts, utils::UUID id);
|
||||
|
||||
@@ -671,12 +671,12 @@ public:
|
||||
future<mutation> make_view_builder_version_mutation(api::timestamp_type ts, view_builder_version_t version);
|
||||
future<view_builder_version_t> get_view_builder_version();
|
||||
|
||||
future<> sstables_registry_create_entry(table_id tid, locator::host_id node_owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc);
|
||||
future<> sstables_registry_update_entry_status(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstring status);
|
||||
future<> sstables_registry_update_entry_state(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstables::sstable_state state);
|
||||
future<> sstables_registry_delete_entry(table_id tid, locator::host_id node_owner, sstables::generation_type gen);
|
||||
future<> sstables_registry_create_entry(table_id owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc);
|
||||
future<> sstables_registry_update_entry_status(table_id owner, sstables::generation_type gen, sstring status);
|
||||
future<> sstables_registry_update_entry_state(table_id owner, sstables::generation_type gen, sstables::sstable_state state);
|
||||
future<> sstables_registry_delete_entry(table_id owner, sstables::generation_type gen);
|
||||
using sstable_registry_entry_consumer = sstables::sstables_registry::entry_consumer;
|
||||
future<> sstables_registry_list(table_id tid, locator::host_id node_owner, sstable_registry_entry_consumer consumer);
|
||||
future<> sstables_registry_list(table_id owner, sstable_registry_entry_consumer consumer);
|
||||
|
||||
future<std::optional<sstring>> load_group0_upgrade_state();
|
||||
future<> save_group0_upgrade_state(sstring);
|
||||
|
||||
@@ -15,24 +15,24 @@ class system_keyspace_sstables_registry : public sstables::sstables_registry {
|
||||
public:
|
||||
system_keyspace_sstables_registry(system_keyspace& keyspace) : _keyspace(keyspace.shared_from_this()) {}
|
||||
|
||||
virtual seastar::future<> create_entry(table_id tid, locator::host_id node_owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc) override {
|
||||
return _keyspace->sstables_registry_create_entry(tid, node_owner, status, state, desc);
|
||||
virtual seastar::future<> create_entry(table_id owner, sstring status, sstables::sstable_state state, sstables::entry_descriptor desc) override {
|
||||
return _keyspace->sstables_registry_create_entry(owner, status, state, desc);
|
||||
}
|
||||
|
||||
virtual seastar::future<> update_entry_status(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstring status) override {
|
||||
return _keyspace->sstables_registry_update_entry_status(tid, node_owner, gen, status);
|
||||
virtual seastar::future<> update_entry_status(table_id owner, sstables::generation_type gen, sstring status) override {
|
||||
return _keyspace->sstables_registry_update_entry_status(owner, gen, status);
|
||||
}
|
||||
|
||||
virtual seastar::future<> update_entry_state(table_id tid, locator::host_id node_owner, sstables::generation_type gen, sstables::sstable_state state) override {
|
||||
return _keyspace->sstables_registry_update_entry_state(tid, node_owner, gen, state);
|
||||
virtual seastar::future<> update_entry_state(table_id owner, sstables::generation_type gen, sstables::sstable_state state) override {
|
||||
return _keyspace->sstables_registry_update_entry_state(owner, gen, state);
|
||||
}
|
||||
|
||||
virtual seastar::future<> delete_entry(table_id tid, locator::host_id node_owner, sstables::generation_type gen) override {
|
||||
return _keyspace->sstables_registry_delete_entry(tid, node_owner, gen);
|
||||
virtual seastar::future<> delete_entry(table_id owner, sstables::generation_type gen) override {
|
||||
return _keyspace->sstables_registry_delete_entry(owner, gen);
|
||||
}
|
||||
|
||||
virtual seastar::future<> sstables_registry_list(table_id tid, locator::host_id node_owner, entry_consumer consumer) override {
|
||||
return _keyspace->sstables_registry_list(tid, node_owner, std::move(consumer));
|
||||
virtual seastar::future<> sstables_registry_list(table_id owner, entry_consumer consumer) override {
|
||||
return _keyspace->sstables_registry_list(owner, std::move(consumer));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include <exception>
|
||||
#include <ranges>
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include <seastar/coroutine/parallel_for_each.hh>
|
||||
#include <seastar/core/on_internal_error.hh>
|
||||
#include "db/view/view_building_coordinator.hh"
|
||||
@@ -180,10 +179,7 @@ future<> view_building_coordinator::clean_finished_tasks() {
|
||||
co_return;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
|
||||
// Collect tasks eligible for deletion: must still be in state and not aborted.
|
||||
std::vector<utils::UUID> tasks_to_delete;
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
for (auto& [replica, tasks]: _finished_tasks) {
|
||||
for (auto& task_id: tasks) {
|
||||
// The task might be aborted in the meantime. In this case we cannot remove it because we need it to create a new task.
|
||||
@@ -193,65 +189,15 @@ future<> view_building_coordinator::clean_finished_tasks() {
|
||||
// If yes, we can just remove it instead of aborting it.
|
||||
auto task_opt = _vb_sm.building_state.get_task(*_vb_sm.building_state.currently_processed_base_table, replica, task_id);
|
||||
if (task_opt && !task_opt->get().aborted) {
|
||||
tasks_to_delete.push_back(task_id);
|
||||
builder.del_task(task_id);
|
||||
vbc_logger.debug("Removing finished task with ID: {}", task_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!tasks_to_delete.empty()) {
|
||||
// Find the minimum UUID (by timeuuid ordering) among tasks that are NOT being
|
||||
// deleted — i.e., alive tasks that must remain in the table.
|
||||
// Everything strictly below this boundary is safe to cover with one range tombstone.
|
||||
const std::unordered_set<utils::UUID> to_delete_set(tasks_to_delete.begin(), tasks_to_delete.end());
|
||||
std::optional<utils::UUID> min_alive_uuid;
|
||||
for (auto& [base_id, base_tasks] : _vb_sm.building_state.tasks_state) {
|
||||
for (auto& [replica, rep_tasks] : base_tasks) {
|
||||
auto check = [&](const utils::UUID& id) {
|
||||
if (!to_delete_set.contains(id)
|
||||
&& (!min_alive_uuid || timeuuid_tri_compare(id, *min_alive_uuid) < 0)) {
|
||||
min_alive_uuid = id;
|
||||
}
|
||||
};
|
||||
for (auto& [id, task] : rep_tasks.staging_tasks) {
|
||||
check(id);
|
||||
}
|
||||
for (auto& [view_id, task_m] : rep_tasks.view_tasks) {
|
||||
for (auto& [id, task] : task_m) {
|
||||
check(id);
|
||||
}
|
||||
}
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
}
|
||||
|
||||
if (min_alive_uuid) {
|
||||
vbc_logger.debug("Removing finished tasks before ID: {} using range tombstone", *min_alive_uuid);
|
||||
builder.del_tasks_before(*min_alive_uuid);
|
||||
for (auto& task_id : tasks_to_delete) {
|
||||
// Tasks below min_alive_uuid are already covered by the range tombstone.
|
||||
if (timeuuid_tri_compare(task_id, *min_alive_uuid) < 0) {
|
||||
continue;
|
||||
}
|
||||
vbc_logger.debug("Removing finished task with ID: {}", task_id);
|
||||
builder.del_task(task_id);
|
||||
}
|
||||
} else {
|
||||
// No alive tasks remain — one range tombstone covers everything.
|
||||
vbc_logger.debug("No alive tasks remain, removing all finished tasks using range tombstone");
|
||||
builder.del_all_tasks();
|
||||
}
|
||||
|
||||
if (_db.features().view_building_tasks_min_task_id) {
|
||||
// If min_alive_uuid == std::nullopt, set min_task_id to a fresh UUID,
|
||||
// so future scans start past all the just-deleted rows (new tasks created
|
||||
// later will have larger UUIDs).
|
||||
builder.set_min_task_id(min_alive_uuid ? *min_alive_uuid : utils::UUID_gen::get_time_UUID());
|
||||
}
|
||||
|
||||
co_await commit_mutations(std::move(guard), {builder.build()}, "remove finished view building tasks");
|
||||
for (auto& [_, tasks_set]: _finished_tasks) {
|
||||
tasks_set.clear();
|
||||
}
|
||||
co_await commit_mutations(std::move(guard), {builder.build()}, "remove finished view building tasks");
|
||||
for (auto& [_, tasks_set]: _finished_tasks) {
|
||||
tasks_set.clear();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -587,7 +533,7 @@ void view_building_coordinator::generate_tablet_migration_updates(utils::chunked
|
||||
}
|
||||
|
||||
auto last_token = tmap.get_last_token(gid.tablet);
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
|
||||
auto create_task_copy_on_pending_replica = [&] (const view_building_task& task) {
|
||||
auto new_id = builder.new_id();
|
||||
@@ -655,7 +601,7 @@ void view_building_coordinator::generate_tablet_resize_updates(utils::chunked_ve
|
||||
return;
|
||||
}
|
||||
bool is_split = old_tmap.tablet_count() < new_tmap.tablet_count();
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
|
||||
auto create_task_copy = [&] (const view_building_task& task, dht::token last_token) -> utils::UUID {
|
||||
auto new_id = builder.new_id();
|
||||
@@ -725,7 +671,7 @@ void view_building_coordinator::abort_tasks(utils::chunked_vector<canonical_muta
|
||||
}
|
||||
vbc_logger.debug("Generating abort mutations for tasks for table {}", table_id);
|
||||
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
auto abort_task_map = [&] (const task_map& task_map) {
|
||||
for (auto& [id, _]: task_map) {
|
||||
vbc_logger.debug("Aborting task {}", id);
|
||||
@@ -754,7 +700,7 @@ void abort_view_building_tasks(const view_building_state_machine& vb_sm,
|
||||
}
|
||||
vbc_logger.debug("Generating abort mutations for tasks for table {} on replica {} and last token {}", table_id, replica, last_token);
|
||||
|
||||
view_building_task_mutation_builder builder(write_timestamp, vb_sm.building_state.make_task_uuid_generator(write_timestamp));
|
||||
view_building_task_mutation_builder builder(write_timestamp);
|
||||
auto abort_task_map = [&] (const task_map& task_map) {
|
||||
for (auto& [id, task]: task_map) {
|
||||
if (task.last_token == last_token) {
|
||||
@@ -796,7 +742,7 @@ void view_building_coordinator::rollback_aborted_tasks(utils::chunked_vector<can
|
||||
return;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
auto& base_tasks = _vb_sm.building_state.tasks_state.at(table_id);
|
||||
for (auto& [_, replica_tasks]: base_tasks) {
|
||||
for (auto& [_, building_task_map]: replica_tasks.view_tasks) {
|
||||
@@ -813,7 +759,7 @@ void view_building_coordinator::rollback_aborted_tasks(utils::chunked_vector<can
|
||||
return;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp(), _vb_sm.building_state.make_task_uuid_generator(guard.write_timestamp()));
|
||||
view_building_task_mutation_builder builder(guard.write_timestamp());
|
||||
auto& replica_tasks = _vb_sm.building_state.tasks_state.at(table_id).at(replica);
|
||||
for (auto& [_, building_task_map]: replica_tasks.view_tasks) {
|
||||
rollback_task_map(builder, building_task_map);
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
*/
|
||||
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "utils/UUID_gen.hh"
|
||||
|
||||
namespace db {
|
||||
|
||||
@@ -23,10 +22,9 @@ view_building_task::view_building_task(utils::UUID id, task_type type, bool abor
|
||||
, replica(replica)
|
||||
, last_token(last_token) {}
|
||||
|
||||
view_building_state::view_building_state(building_tasks tasks_state, std::optional<table_id> processed_base_table, std::optional<utils::UUID> min_alive_uuid)
|
||||
view_building_state::view_building_state(building_tasks tasks_state, std::optional<table_id> processed_base_table)
|
||||
: tasks_state(std::move(tasks_state))
|
||||
, currently_processed_base_table(std::move(processed_base_table))
|
||||
, min_alive_uuid(std::move(min_alive_uuid)) {}
|
||||
, currently_processed_base_table(std::move(processed_base_table)) {}
|
||||
|
||||
views_state::views_state(std::map<table_id, std::vector<table_id>> views_per_base, view_build_status_map status_map)
|
||||
: views_per_base(std::move(views_per_base))
|
||||
@@ -129,24 +127,6 @@ std::map<dht::token, std::vector<view_building_task>> view_building_state::colle
|
||||
return tasks;
|
||||
}
|
||||
|
||||
task_uuid_generator::task_uuid_generator(api::timestamp_type base_ts)
|
||||
: _next_ts(base_ts) {}
|
||||
|
||||
utils::UUID task_uuid_generator::operator()() {
|
||||
return utils::UUID_gen::get_random_time_UUID_from_micros(
|
||||
std::chrono::microseconds{_next_ts++});
|
||||
}
|
||||
|
||||
task_uuid_generator view_building_state::make_task_uuid_generator(api::timestamp_type ts) const {
|
||||
if (min_alive_uuid) {
|
||||
auto lower_bound = utils::UUID_gen::micros_timestamp(*min_alive_uuid);
|
||||
if (ts <= lower_bound) {
|
||||
ts = lower_bound + 1;
|
||||
}
|
||||
}
|
||||
return task_uuid_generator{ts};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
#include "db/view/view_build_status.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include <fmt/base.h>
|
||||
#include "schema/schema_fwd.hh"
|
||||
@@ -65,16 +64,6 @@ struct replica_tasks {
|
||||
using base_table_tasks = std::map<locator::tablet_replica, replica_tasks>;
|
||||
using building_tasks = std::map<table_id, base_table_tasks>;
|
||||
|
||||
// Generates unique timeuuids with strictly increasing microsecond timestamps.
|
||||
// Each call to operator() returns a new timeuuid whose timestamp is one
|
||||
// microsecond greater than the previous one.
|
||||
class task_uuid_generator {
|
||||
api::timestamp_type _next_ts;
|
||||
public:
|
||||
explicit task_uuid_generator(api::timestamp_type base_ts);
|
||||
utils::UUID operator()();
|
||||
};
|
||||
|
||||
// Represents cluster-wide view building state (only for tablet-based views).
|
||||
// The state stores all unfinished view building tasks for all tablet-based views
|
||||
// and table_id of currently processed base table by view building coordinator.
|
||||
@@ -84,22 +73,14 @@ public:
|
||||
struct view_building_state {
|
||||
building_tasks tasks_state;
|
||||
std::optional<table_id> currently_processed_base_table;
|
||||
std::optional<utils::UUID> min_alive_uuid;
|
||||
|
||||
view_building_state(building_tasks tasks_state, std::optional<table_id> processed_base_table, std::optional<utils::UUID> min_alive_uuid);
|
||||
view_building_state(building_tasks tasks_state, std::optional<table_id> processed_base_table);
|
||||
view_building_state() = default;
|
||||
|
||||
std::optional<std::reference_wrapper<const view_building_task>> get_task(table_id base_id, locator::tablet_replica replica, utils::UUID id) const;
|
||||
std::vector<std::reference_wrapper<const view_building_task>> get_tasks_for_host(table_id base_id, locator::host_id host) const;
|
||||
std::map<dht::token, std::vector<view_building_task>> collect_tasks_by_last_token(table_id base_table_id) const;
|
||||
std::map<dht::token, std::vector<view_building_task>> collect_tasks_by_last_token(table_id base_table_id, const locator::tablet_replica& replica) const;
|
||||
|
||||
// Creates a generator that produces unique timeuuids suitable for view
|
||||
// building task IDs. The generated uuids have strictly increasing
|
||||
// microsecond timestamps starting from write_timestamp. If min_alive_uuid
|
||||
// is set, all generated uuids are guaranteed to be greater than
|
||||
// *min_alive_uuid in timeuuid order.
|
||||
task_uuid_generator make_task_uuid_generator(api::timestamp_type write_timestamp) const;
|
||||
};
|
||||
|
||||
// Represents global state of tablet-based views.
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace db {
|
||||
namespace view {
|
||||
|
||||
utils::UUID view_building_task_mutation_builder::new_id() {
|
||||
return _uuid_gen();
|
||||
return utils::UUID_gen::get_time_UUID();
|
||||
}
|
||||
|
||||
clustering_key view_building_task_mutation_builder::get_ck(utils::UUID id) {
|
||||
@@ -52,30 +52,6 @@ view_building_task_mutation_builder& view_building_task_mutation_builder::del_ta
|
||||
return *this;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder& view_building_task_mutation_builder::del_tasks_before(utils::UUID id) {
|
||||
auto ck = get_ck(id);
|
||||
range_tombstone rt(
|
||||
position_in_partition::before_all_clustered_rows(),
|
||||
position_in_partition_view(ck, bound_weight::before_all_prefixed),
|
||||
tombstone{_ts, gc_clock::now()});
|
||||
_m.partition().apply_row_tombstone(*_s, std::move(rt));
|
||||
return *this;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder& view_building_task_mutation_builder::del_all_tasks() {
|
||||
range_tombstone rt(
|
||||
position_in_partition::before_all_clustered_rows(),
|
||||
position_in_partition::after_all_clustered_rows(),
|
||||
tombstone{_ts, gc_clock::now()});
|
||||
_m.partition().apply_row_tombstone(*_s, std::move(rt));
|
||||
return *this;
|
||||
}
|
||||
|
||||
view_building_task_mutation_builder& view_building_task_mutation_builder::set_min_task_id(utils::UUID id) {
|
||||
_m.set_static_cell("min_task_id", data_value(id), _ts);
|
||||
return *this;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "db/view/view_building_state.hh"
|
||||
#include "mutation/mutation.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "mutation/timestamp.hh"
|
||||
@@ -20,19 +19,17 @@ namespace view {
|
||||
// Factory for mutations to `system.view_building_tasks` table.
|
||||
class view_building_task_mutation_builder {
|
||||
api::timestamp_type _ts;
|
||||
task_uuid_generator _uuid_gen;
|
||||
schema_ptr _s;
|
||||
mutation _m;
|
||||
|
||||
public:
|
||||
view_building_task_mutation_builder(api::timestamp_type ts, task_uuid_generator uuid_gen)
|
||||
view_building_task_mutation_builder(api::timestamp_type ts)
|
||||
: _ts(ts)
|
||||
, _uuid_gen(std::move(uuid_gen))
|
||||
, _s(db::system_keyspace::view_building_tasks())
|
||||
, _m(_s, partition_key::from_single_value(*_s, data_value("view_building").serialize_nonnull()))
|
||||
{ }
|
||||
|
||||
utils::UUID new_id();
|
||||
static utils::UUID new_id();
|
||||
|
||||
view_building_task_mutation_builder& set_type(utils::UUID id, db::view::view_building_task::task_type type);
|
||||
view_building_task_mutation_builder& set_aborted(utils::UUID id, bool aborted);
|
||||
@@ -41,12 +38,6 @@ public:
|
||||
view_building_task_mutation_builder& set_last_token(utils::UUID id, dht::token last_token);
|
||||
view_building_task_mutation_builder& set_replica(utils::UUID id, const locator::tablet_replica& replica);
|
||||
view_building_task_mutation_builder& del_task(utils::UUID id);
|
||||
// Deletes all tasks with clustering key < id using a range tombstone.
|
||||
view_building_task_mutation_builder& del_tasks_before(utils::UUID id);
|
||||
// Deletes all tasks using a range tombstone covering the entire clustering range.
|
||||
view_building_task_mutation_builder& del_all_tasks();
|
||||
// Sets the static column min_task_id to `id`.
|
||||
view_building_task_mutation_builder& set_min_task_id(utils::UUID id);
|
||||
|
||||
mutation build() {
|
||||
return std::move(_m);
|
||||
|
||||
@@ -275,12 +275,11 @@ future<> view_building_worker::create_staging_sstable_tasks() {
|
||||
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
auto guard = co_await _group0.client().start_operation(_as);
|
||||
auto uuid_gen = _vb_state_machine.building_state.make_task_uuid_generator(guard.write_timestamp());
|
||||
auto my_host_id = _db.get_token_metadata().get_topology().my_host_id();
|
||||
for (auto& [table_id, sst_infos]: _sstables_to_register) {
|
||||
for (auto& sst_info: sst_infos) {
|
||||
view_building_task task {
|
||||
uuid_gen(), view_building_task::task_type::process_staging, false,
|
||||
utils::UUID_gen::get_time_UUID(), view_building_task::task_type::process_staging, false,
|
||||
table_id, ::table_id{}, {my_host_id, sst_info.shard}, sst_info.last_token
|
||||
};
|
||||
auto mut = co_await _sys_ks.make_view_building_task_mutation(guard.write_timestamp(), task);
|
||||
|
||||
16
dist/common/supervisor/scylla-server.sh
vendored
16
dist/common/supervisor/scylla-server.sh
vendored
@@ -9,22 +9,6 @@ for f in "$etcdir"/scylla.d/*.conf; do
|
||||
done
|
||||
|
||||
if is_privileged; then
    # A core_pattern that starts with "|" pipes coredumps to a helper program
    # (e.g. apport on an Ubuntu host) that may not work inside a container.
    # A file-based pattern is resolved inside the container's mount namespace,
    # so coredumps land in the right place.
    # The target directory is <workdir>/coredump, with workdir read from
    # scylla.yaml (default /var/lib/scylla), matching the Python entrypoint logic.
    _workdir=$(python3 -c "import yaml; cfg=yaml.safe_load(open('/etc/scylla/scylla.yaml')); print(cfg.get('workdir') or '/var/lib/scylla')" 2>/dev/null || echo "/var/lib/scylla")
    _coredump_dir="${_workdir}/coredump"
    core_pattern=$(cat /proc/sys/kernel/core_pattern 2>/dev/null || true)
    case "$core_pattern" in
        "|"*)
            # Pipe-based pattern: create the directory, then try to override the pattern.
            if ! mkdir -p "$_coredump_dir" 2>/dev/null; then
                echo "WARNING: could not create coredump directory $_coredump_dir" >&2
            elif echo "${_coredump_dir}/core.%e.%p.%t" > /proc/sys/kernel/core_pattern 2>/dev/null; then
                echo "kernel.core_pattern overridden to file-based pattern: ${_coredump_dir}/core.%e.%p.%t" >&2
            else
                echo "WARNING: pipe-based core_pattern detected but could not override. Coredumps may be lost." >&2
            fi
            ;;
    esac
    "$scriptsdir"/scylla_prepare
fi
|
||||
execsudo /usr/bin/env SCYLLA_HOME=$SCYLLA_HOME SCYLLA_CONF=$SCYLLA_CONF "$bindir"/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE $CPUSET $SCYLLA_DOCKER_ARGS
|
||||
|
||||
1
dist/docker/docker-entrypoint.py
vendored
1
dist/docker/docker-entrypoint.py
vendored
@@ -24,7 +24,6 @@ try:
|
||||
setup.developerMode()
|
||||
setup.cpuSet()
|
||||
setup.io()
|
||||
setup.coredumpSetup()
|
||||
setup.cqlshrc()
|
||||
setup.write_rackdc_properties()
|
||||
setup.arguments()
|
||||
|
||||
66
dist/docker/scyllasetup.py
vendored
66
dist/docker/scyllasetup.py
vendored
@@ -3,7 +3,6 @@ import logging
|
||||
import yaml
|
||||
import os
|
||||
import socket
|
||||
import errno
|
||||
|
||||
def is_bind_mount(path):
|
||||
# Check if the file or its parent is a mount point (bind mount or otherwise)
|
||||
@@ -48,7 +47,6 @@ class ScyllaSetup:
|
||||
self._dc = arguments.dc
|
||||
self._rack = arguments.rack
|
||||
self._blocked_reactor_notify_ms = arguments.blocked_reactor_notify_ms
|
||||
self._coredump_dir = None
|
||||
|
||||
def _run(self, *args, **kwargs):
|
||||
logging.info('running: {}'.format(args))
|
||||
@@ -134,70 +132,6 @@ class ScyllaSetup:
|
||||
f.write(f"dc={dc}\n")
|
||||
f.write(f"rack={rack}\n")
|
||||
|
||||
CORE_PATTERN_PATH = '/proc/sys/kernel/core_pattern'
|
||||
|
||||
def _get_coredump_dir(self):
|
||||
"""Return the coredump directory, deriving it from scylla.yaml workdir if needed."""
|
||||
if self._coredump_dir is not None:
|
||||
return self._coredump_dir
|
||||
conf_dir = "/etc/scylla"
|
||||
try:
|
||||
with open(os.path.join(conf_dir, "scylla.yaml")) as f:
|
||||
cfg = yaml.safe_load(f) or {}
|
||||
except Exception:
|
||||
cfg = {}
|
||||
workdir = cfg.get('workdir') or '/var/lib/scylla'
|
||||
self._coredump_dir = os.path.join(workdir, 'coredump')
|
||||
return self._coredump_dir
|
||||
|
||||
def coredumpSetup(self):
    """Replace a pipe-based kernel.core_pattern with a file-based one.

    Steps, each of which ends the method early on failure:

    1. Create the coredump directory (a warning is logged if that fails).
    2. Read the current core_pattern (a debug message is logged if that fails).
    3. Do nothing unless the current pattern starts with ``|``.
    4. Write ``<coredump_dir>/core.%e.%p.%t`` as the new pattern.

    When the write is refused with EACCES, EPERM or EROFS, a warning tells
    the operator how to fix it; any other error is logged at debug level.
    No exception from the write step is propagated to the caller.
    """
    target_dir = self._get_coredump_dir()

    try:
        os.makedirs(target_dir, exist_ok=True)
    except OSError as err:
        logging.warning('Could not create coredump directory %s: %s',
                        target_dir, err)
        return

    try:
        with open(self.CORE_PATTERN_PATH) as pattern_file:
            active_pattern = pattern_file.read().strip()
    except Exception as err:
        logging.debug('Could not read %s: %s', self.CORE_PATTERN_PATH, err)
        return

    # Only a pattern that starts with '|' is overridden; anything else is kept.
    if active_pattern[:1] != '|':
        return

    file_pattern = f'{target_dir}/core.%e.%p.%t'
    permission_errnos = (errno.EACCES, errno.EPERM, errno.EROFS)
    try:
        with open(self.CORE_PATTERN_PATH, 'w') as pattern_file:
            pattern_file.write(file_pattern + '\n')
        logging.info('kernel.core_pattern set to %s', file_pattern)
    except OSError as err:
        if err.errno not in permission_errnos:
            logging.debug('Unexpected OSError setting core_pattern: %s', err)
        else:
            logging.warning(
                'kernel.core_pattern pipes to a program that may not work '
                'inside the container, and we lack permission to override it. '
                'To fix this, either run with --privileged or set on the host: '
                'sysctl -w kernel.core_pattern="%s"', file_pattern)
    except Exception as err:
        logging.debug('Unexpected error in coredumpSetup: %s', err)
|
||||
|
||||
def arguments(self):
|
||||
args = []
|
||||
if self._memory is not None:
|
||||
|
||||
@@ -1,11 +1,5 @@
|
||||
# Alternator Vector Search
|
||||
|
||||
```{admonition} Availability
|
||||
:class: important
|
||||
|
||||
The Vector Search feature is only available in [ScyllaDB Cloud](https://cloud.docs.scylladb.com/) - a fully managed DBaaS running ScyllaDB.
|
||||
```
|
||||
|
||||
## Introduction
|
||||
|
||||
Alternator vector search is a ScyllaDB extension to the DynamoDB-compatible
|
||||
|
||||
@@ -71,7 +71,7 @@ used. If it is used, the statement will be a no-op if the materialized view alre
|
||||
MV Select Statement
|
||||
...................
|
||||
|
||||
The select statement of a materialized view creation defines which of the base table columns are included in the view. That
|
||||
The select statement of a materialized view creation defines which columns of the base table are included in the view. That
|
||||
statement is limited in a number of ways:
|
||||
|
||||
- The :ref:`selection <selection-clause>` is limited to those that only select columns of the base table. In other
|
||||
|
||||
@@ -167,11 +167,6 @@ All tables in a keyspace are uploaded, the destination object names will look li
|
||||
or
|
||||
`gs://bucket/some/prefix/to/store/data/.../sstable`
|
||||
|
||||
# System tables
|
||||
There are a few system tables that object storage related code needs to touch in order to operate.
|
||||
* [system_distributed.snapshot_sstables](docs/dev/snapshot_sstables.md) - Used during restore by worker nodes to get the list of SSTables that need to be downloaded from object storage and restored locally.
|
||||
* [system.sstables](docs/dev/system_keyspace.md#systemsstables) - Used to keep track of SSTables on object storage when a keyspace is created with object storage storage_options.
|
||||
|
||||
# Manipulating S3 data
|
||||
|
||||
This section intends to give an overview of where, when and how we store data in S3 and provide a quick set of commands
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
# system\_distributed.snapshot\_sstables
|
||||
|
||||
## Purpose
|
||||
|
||||
This table is used during tablet-aware restore to exchange per-SSTable metadata between
|
||||
the coordinator and worker nodes. When the restore process starts, the coordinator node
|
||||
populates this table with information about each SSTable extracted from the snapshot
|
||||
manifests. Worker nodes then read from this table to determine which SSTables need to
|
||||
be downloaded from object storage and restored locally.
|
||||
|
||||
Rows are inserted with a TTL so that stale restore metadata is automatically cleaned up.
|
||||
|
||||
## Schema
|
||||
|
||||
~~~
|
||||
CREATE TABLE system_distributed.snapshot_sstables (
|
||||
snapshot_name text,
|
||||
"keyspace" text,
|
||||
"table" text,
|
||||
datacenter text,
|
||||
rack text,
|
||||
first_token bigint,
|
||||
sstable_id uuid,
|
||||
last_token bigint,
|
||||
toc_name text,
|
||||
prefix text,
|
||||
PRIMARY KEY ((snapshot_name, "keyspace", "table", datacenter, rack), first_token, sstable_id)
|
||||
)
|
||||
~~~
|
||||
|
||||
Column descriptions:
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `snapshot_name` | text (partition key) | Name of the snapshot |
|
||||
| `keyspace` | text (partition key) | Keyspace the snapshot was taken from |
|
||||
| `table` | text (partition key) | Table within the keyspace |
|
||||
| `datacenter` | text (partition key) | Datacenter where the SSTable is located |
|
||||
| `rack` | text (partition key) | Rack where the SSTable is located |
|
||||
| `first_token` | bigint (clustering key) | First token in the token range covered by this SSTable |
|
||||
| `sstable_id` | uuid (clustering key) | Unique identifier for the SSTable |
|
||||
| `last_token` | bigint | Last token in the token range covered by this SSTable |
|
||||
| `toc_name` | text | TOC filename of the SSTable (e.g. `me-3gdq_0bki_2cvk01yl83nj0tp5gh-big-TOC.txt`) |
|
||||
| `prefix` | text | Prefix path in object storage where the SSTable was backed up |
|
||||
|
||||
## APIs
|
||||
|
||||
The following C++ APIs are provided in `db::system_distributed_keyspace`:
|
||||
|
||||
- insert\_snapshot\_sstable
|
||||
|
||||
- get\_snapshot\_sstables
|
||||
@@ -274,8 +274,6 @@ globally driven by the topology change coordinator and serialized per-tablet. Tr
|
||||
|
||||
- repair - tablet replicas are repaired
|
||||
|
||||
- restore - tablet replicas download SSTables from object storage during cluster-wide backup restore
|
||||
|
||||
Each tablet has its own state machine for keeping state of transition stored in group0 which is part of the tablet state. It involves
|
||||
these properties of a tablet:
|
||||
|
||||
@@ -392,9 +390,6 @@ stateDiagram-v2
|
||||
|
||||
The repair tablet transition kind is different. It only goes through the repair and end_repair stages because token ownership does not change.
|
||||
|
||||
The restore tablet transition kind is also simple. It uses a single `restore` stage and does not change token
|
||||
ownership. See the [Tablet-aware restore](#tablet-aware-restore) section below for details.
|
||||
|
||||
The behavioral difference between "migration" and "intranode_migration" transitions is in the way "streaming" stage
|
||||
is performed. In case of intra-node migration, streaming is done by fast duplication of data by creating hard links to
|
||||
sstable files on the destination shard. Original sstable files on the source shard will be removed by the standard "cleanup" stage.
|
||||
@@ -989,18 +984,3 @@ Losing a committed entry can be observed by external systems. For example, the l
|
||||
schema version in the cluster can go back in time from the driver's perspective. This
|
||||
is outside the scope of the recovery procedure, though, and it shouldn't cause
|
||||
problems in practice.
|
||||
|
||||
# Tablet restore transition
|
||||
|
||||
The `restore` tablet transition kind is used by the tablet-aware restore to download SSTables
|
||||
from object storage. The transition contains `restore_config` with snapshot name, endpoint and
|
||||
bucket.
|
||||
|
||||
Like `repair`, the `restore` transition does not change token ownership — replicas remain intact.
|
||||
The topology coordinator processes a tablet in this stage by calling the `RESTORE_TABLET` RPC on
|
||||
all tablet replicas. Each replica then downloads and attaches the SSTables that are contained in
|
||||
the tablet's token range. Whether the operation succeeds or fails, the transition is cleared and the
|
||||
failure to download SSTables is propagated back to the user by the API handler itself.
|
||||
|
||||
Restore transitions are serialized per-tablet like any other transition (invariant [INV-TABL-2]),
|
||||
so they do not run concurrently with migrations or repairs on the same tablet.
|
||||
|
||||
@@ -106,7 +106,6 @@ The most important table is `system.view_building_tasks`, which stores all unfin
|
||||
CREATE TABLE system.view_building_tasks (
|
||||
key text,
|
||||
id timeuuid,
|
||||
min_task_id timeuuid STATIC, -- lower bound for task scans; see "Tombstone avoidance" below
|
||||
type text,
|
||||
aborted boolean,
|
||||
base_id uuid,
|
||||
@@ -118,26 +117,6 @@ CREATE TABLE system.view_building_tasks (
|
||||
)
|
||||
```
|
||||
|
||||
### Tombstone avoidance
|
||||
|
||||
`system.view_building_tasks` is a single partition. When `finished_task_gc_fiber()` removes
|
||||
finished tasks in batches, the deleted rows remain as tombstones in SSTables until compaction,
|
||||
causing `tombstone_warn_threshold` warnings on subsequent reloads in large clusters.
|
||||
|
||||
Two mechanisms address this:
|
||||
|
||||
**Range tombstone on GC.** Instead of one row tombstone per deleted task, the coordinator emits
|
||||
a single range tombstone `[before_all, min_alive_uuid)` where `min_alive_uuid` is the smallest
|
||||
timeuuid among surviving tasks. Tasks above the boundary (rare) still get individual row tombstones.
|
||||
When all tasks are deleted, a single full-partition range tombstone is used.
|
||||
|
||||
**Bounded scan on reload.** Physical rows remain until compaction and are still counted as dead cells.
|
||||
After each GC batch, `min_task_id = min_alive_uuid` is written atomically as a static cell (same Raft
|
||||
batch as the range tombstone). On reload, `min_task_id` is read using a **static-only partition slice**
|
||||
(empty `_row_ranges` + `always_return_static_content`) — this makes the SSTable reader stop immediately
|
||||
after the static row, before any clustering tombstones, so zero dead cells are counted. The value is
|
||||
then used as `AND id >= min_task_id` to skip all tombstoned rows in the main scan.
|
||||
|
||||
The view building coordinator stores the base table that is currently being processed in `system.scylla_local`
|
||||
under `view_building_processing_base` key.
|
||||
The entry is managed by group0.
|
||||
|
||||
@@ -45,7 +45,7 @@ Example:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
nodetool removenode 675ed9f4-6564-6dbd-ca08-43fddce952de
|
||||
nodetool removenode 675ed9f4-6564-6dbd-can8-43fddce952gy
|
||||
|
||||
To only mark the node as permanently down without doing actual removal, use :doc:`nodetool excludenode </operating-scylla/nodetool-commands/excludenode>`:
|
||||
|
||||
@@ -79,6 +79,6 @@ Example:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
nodetool removenode --ignore-dead-nodes 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1db0-aac8-43fddce9123e 675ed9f4-6564-6dbd-ca08-43fddce952de
|
||||
nodetool removenode --ignore-dead-nodes 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e 675ed9f4-6564-6dbd-can8-43fddce952gy
|
||||
|
||||
.. include:: nodetool-index.rst
|
||||
|
||||
@@ -74,7 +74,7 @@ Procedure
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
UJ 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
UJ 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
Nodes in the cluster finished streaming data to the new node:
|
||||
|
||||
@@ -86,7 +86,7 @@ Procedure
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
#. When the new node status is Up Normal (UN), run the :doc:`nodetool cleanup </operating-scylla/nodetool-commands/cleanup>` command on all nodes in the cluster except for the new node that has just been added. Cleanup removes keys that were streamed to the newly added node and are no longer owned by the node.
|
||||
|
||||
|
||||
@@ -192,7 +192,7 @@ Adding new nodes
|
||||
-- Address Load Tokens Owns Host ID Rack
|
||||
UN 192.168.1.10 500 MB 256 33.3% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
|
||||
UN 192.168.1.11 500 MB 256 33.3% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
|
||||
UN 192.168.1.12 500 MB 256 33.3% 675ed9f4-6564-6dbd-ca08-43fddce952de RACK2
|
||||
UN 192.168.1.12 500 MB 256 33.3% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
|
||||
UJ 192.168.2.10 250 MB 256 ? a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
|
||||
|
||||
**Example output after bootstrap completes:**
|
||||
@@ -205,7 +205,7 @@ Adding new nodes
|
||||
-- Address Load Tokens Owns Host ID Rack
|
||||
UN 192.168.1.10 400 MB 256 25.0% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
|
||||
UN 192.168.1.11 400 MB 256 25.0% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
|
||||
UN 192.168.1.12 400 MB 256 25.0% 675ed9f4-6564-6dbd-ca08-43fddce952de RACK2
|
||||
UN 192.168.1.12 400 MB 256 25.0% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
|
||||
UN 192.168.2.10 400 MB 256 25.0% a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
|
||||
|
||||
#. For tablets-enabled clusters, wait for tablet load balancing to complete.
|
||||
|
||||
@@ -163,5 +163,5 @@ This example shows how to install and configure a three-node cluster using Gossi
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c 43
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e 44
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de 45
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy 45
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ Prerequisites
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-lac8-23fddce9123e B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
Datacenter: ASIA-DC
|
||||
Status=Up/Down
|
||||
@@ -165,7 +165,7 @@ Procedure
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
Datacenter: EUROPE-DC
|
||||
Status=Up/Down
|
||||
|
||||
@@ -18,7 +18,7 @@ Removing a Running Node
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
#. If the node status is **Up Normal (UN)**, run the :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` command
|
||||
to remove the node you are connected to. Using ``nodetool decommission`` is the recommended method for cluster scale-down operations. It prevents data loss
|
||||
@@ -75,7 +75,7 @@ command providing the Host ID of the node you are removing. See :doc:`nodetool r
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
nodetool removenode 675ed9f4-6564-6dbd-ca08-43fddce952de
|
||||
nodetool removenode 675ed9f4-6564-6dbd-can8-43fddce952gy
|
||||
|
||||
The ``nodetool removenode`` command notifies other nodes that the token range it owns needs to be moved and
|
||||
the nodes should redistribute the data using streaming. Using the command does not guarantee the consistency of the rebalanced data if
|
||||
|
||||
@@ -23,7 +23,7 @@ Prerequisites
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
DN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
Login to one of the nodes in the cluster with (UN) status, collect the following info from the node:
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ Down (DN), and the node can be replaced.
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
Remove the Data
|
||||
==================
|
||||
@@ -72,7 +72,7 @@ Procedure
|
||||
|
||||
For example (using the Host ID of the failed node from above):
|
||||
|
||||
``replace_node_first_boot: 675ed9f4-6564-6dbd-ca08-43fddce952de``
|
||||
``replace_node_first_boot: 675ed9f4-6564-6dbd-can8-43fddce952gy``
|
||||
|
||||
#. Start the new node.
|
||||
|
||||
@@ -90,7 +90,7 @@ Procedure
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
|
||||
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
|
||||
DN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-can8-43fddce952gy B1
|
||||
|
||||
``192.168.1.203`` is the dead node.
|
||||
|
||||
@@ -121,7 +121,7 @@ Procedure
|
||||
/192.168.1.203
|
||||
generation:1553759866
|
||||
heartbeat:2147483647
|
||||
HOST_ID:675ed9f4-6564-6dbd-ca08-43fddce952de
|
||||
HOST_ID:675ed9f4-6564-6dbd-can8-43fddce952gy
|
||||
STATUS:shutdown,true
|
||||
RELEASE_VERSION:3.0.8
|
||||
X3:3
|
||||
@@ -178,7 +178,7 @@ In this case, the node's data will be cleaned after restart. To remedy this, you
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
echo 'replace_node_first_boot: 675ed9f4-6564-6dbd-ca08-43fddce952de' | sudo tee --append /etc/scylla/scylla.yaml
|
||||
echo 'replace_node_first_boot: 675ed9f4-6564-6dbd-can8-43fddce952gy' | sudo tee --append /etc/scylla/scylla.yaml
|
||||
|
||||
#. Run the following command to re-setup RAID
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
Migrate a Keyspace from Vnodes to Tablets :label-caution:`Experimental`
|
||||
=========================================================================
|
||||
Migrate a Keyspace from Vnodes to Tablets
|
||||
==========================================
|
||||
|
||||
This procedure describes how to migrate an existing keyspace from vnodes
|
||||
to tablets. Tablets are designed to be the long-term replacement for vnodes,
|
||||
@@ -8,9 +8,6 @@ balancing, automatic cleanups, and improved streaming performance. Migrating to
|
||||
tablets is strongly recommended. See :doc:`Data Distribution with Tablets </architecture/tablets/>`
|
||||
for details.
|
||||
|
||||
ℹ️ This feature is experimental and will change in future releases, including
|
||||
the removal of current limitations.
|
||||
|
||||
.. note::
|
||||
|
||||
The migration is an online operation. This means that the keyspace remains
|
||||
|
||||
@@ -16,7 +16,7 @@ Cluster and Node Limits
|
||||
* - Nodes per cluster
|
||||
- Low hundreds
|
||||
* - Node size
|
||||
- 4096 CPUs
|
||||
- 256 vcpu
|
||||
|
||||
See :ref:`Hardware Requirements <system-requirements-hardware>` for storage
|
||||
and memory requirements and limits.
|
||||
|
||||
@@ -289,8 +289,8 @@ private:
|
||||
|
||||
sstring _host;
|
||||
host_options& _options;
|
||||
std::optional<output_stream<char>> _output;
|
||||
std::optional<input_stream<char>> _input;
|
||||
output_stream<char> _output;
|
||||
input_stream<char> _input;
|
||||
seastar::connected_socket _socket;
|
||||
std::optional<temporary_buffer<char>> _in_buffer;
|
||||
std::optional<future<>> _pending;
|
||||
@@ -347,8 +347,8 @@ future<> kmip_host::impl::connection::connect() {
|
||||
// #998 Set keepalive to try avoiding connection going stale in between commands.
|
||||
s.set_keepalive_parameters(net::tcp_keepalive_params{60s, 60s, 10});
|
||||
s.set_keepalive(true);
|
||||
_input.emplace(s.input());
|
||||
_output.emplace(s.output());
|
||||
_input = s.input();
|
||||
_output = s.output();
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -367,9 +367,9 @@ int kmip_host::impl::connection::send(void* data, unsigned int len, unsigned int
|
||||
}
|
||||
kmip_log.trace("{}: Sending {} bytes", *this, len);
|
||||
|
||||
auto f = _output->write(reinterpret_cast<char *>(data), len).then([this] {
|
||||
auto f = _output.write(reinterpret_cast<char *>(data), len).then([this] {
|
||||
kmip_log.trace("{}: send done. flushing...", *this);
|
||||
return _output->flush();
|
||||
return _output.flush();
|
||||
});
|
||||
// if the call failed already, we still want to
|
||||
// drop back to "wait_for_io()", because we cannot throw
|
||||
@@ -405,7 +405,7 @@ int kmip_host::impl::connection::recv(void* data, unsigned int len, unsigned int
|
||||
}
|
||||
|
||||
kmip_log.trace("{}: issue read", *this);
|
||||
auto f = _input->read().then([this](temporary_buffer<char> buf) {
|
||||
auto f = _input.read().then([this](temporary_buffer<char> buf) {
|
||||
kmip_log.trace("{}: got {} bytes", *this, buf.size());
|
||||
_in_buffer = std::move(buf);
|
||||
});
|
||||
@@ -462,8 +462,8 @@ void kmip_host::impl::connection::attach(KMIP_CMD* cmd) {
|
||||
}
|
||||
|
||||
future<> kmip_host::impl::connection::close() {
|
||||
return _output->close().finally([this] {
|
||||
return _input->close();
|
||||
return _output.close().finally([this] {
|
||||
return _input.close();
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -182,7 +182,6 @@ public:
|
||||
gms::feature arbitrary_tablet_boundaries { *this, "ARBITRARY_TABLET_BOUNDARIES"sv };
|
||||
gms::feature large_data_virtual_tables { *this, "LARGE_DATA_VIRTUAL_TABLES"sv };
|
||||
gms::feature keyspace_multi_rf_change { *this, "KEYSPACE_MULTI_RF_CHANGE"sv };
|
||||
gms::feature view_building_tasks_min_task_id { *this, "VIEW_BUILDING_TASKS_MIN_TASK_ID"sv };
|
||||
public:
|
||||
|
||||
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
|
||||
|
||||
@@ -53,7 +53,6 @@ set(idl_headers
|
||||
group0.idl.hh
|
||||
hinted_handoff.idl.hh
|
||||
sstables.idl.hh
|
||||
sstables_loader.idl.hh
|
||||
storage_proxy.idl.hh
|
||||
storage_service.idl.hh
|
||||
strong_consistency/state_machine.idl.hh
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
class restore_result {
|
||||
};
|
||||
|
||||
verb [[]] restore_tablet (raft::server_id dst_id, locator::global_tablet_id gid) -> restore_result;
|
||||
@@ -72,7 +72,6 @@ struct raft_topology_cmd_result {
|
||||
success
|
||||
};
|
||||
service::raft_topology_cmd_result::command_status status;
|
||||
sstring error_message [[version 2026.2]];
|
||||
};
|
||||
|
||||
struct raft_snapshot {
|
||||
|
||||
@@ -5,8 +5,6 @@ target_sources(index
|
||||
PRIVATE
|
||||
secondary_index.cc
|
||||
secondary_index_manager.cc
|
||||
fulltext_index.cc
|
||||
index_option_utils.cc
|
||||
vector_index.cc)
|
||||
target_include_directories(index
|
||||
PUBLIC
|
||||
|
||||
@@ -1,96 +0,0 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#include "cql3/statements/index_target.hh"
|
||||
#include "cql3/util.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "schema/schema.hh"
|
||||
#include "index/fulltext_index.hh"
|
||||
#include "index/index_option_utils.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "utils/UUID_gen.hh"
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
namespace secondary_index {
|
||||
|
||||
// Supported text analyzers for fulltext indexing.
|
||||
// This list corresponds to analyzers expected to be provided
|
||||
// by the backend search engine (Tantivy).
|
||||
static const std::vector<sstring> analyzer_values = {
|
||||
"standard", "english", "german", "french", "spanish", "italian", "portuguese", "russian", "chinese", "japanese", "korean", "simple", "whitespace"};
|
||||
|
||||
const static std::unordered_map<sstring, std::function<void(std::string_view, const sstring&, const sstring&)>> fulltext_index_options = {
|
||||
// 'analyzer' specifies the built-in text analyzer to use for tokenization.
|
||||
{"analyzer", std::bind_front(util::validate_enumerated_option, analyzer_values)},
|
||||
// 'positions' controls whether token positions are stored in the index.
|
||||
// Required for phrase queries. Set to false to save space.
|
||||
{"positions", std::bind_front(util::validate_enumerated_option, util::boolean_values)},
|
||||
};
|
||||
|
||||
bool fulltext_index::view_should_exist() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::optional<cql3::description> fulltext_index::describe(const index_metadata& im, const schema& base_schema) const {
|
||||
auto target = im.options().at(cql3::statements::index_target::target_option_name);
|
||||
auto target_column = cql3::statements::index_target::column_name_from_target_string(target);
|
||||
return describe_with_target(im, base_schema, cql3::util::maybe_quote(target_column));
|
||||
}
|
||||
|
||||
void fulltext_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {
|
||||
using cql3::statements::index_target;
|
||||
|
||||
if (targets.size() != 1) {
|
||||
throw exceptions::invalid_request_exception("Fulltext index must have exactly one target column");
|
||||
}
|
||||
|
||||
auto& target = targets[0];
|
||||
if (!std::holds_alternative<index_target::single_column>(target->value)) {
|
||||
throw exceptions::invalid_request_exception("Fulltext index target must be a single column");
|
||||
}
|
||||
|
||||
auto& column = std::get<index_target::single_column>(target->value);
|
||||
auto c_name = column->to_string();
|
||||
auto const* c_def = schema.get_column_definition(column->name());
|
||||
if (c_def == nullptr) {
|
||||
throw exceptions::invalid_request_exception(format("Column {} not found in schema", c_name));
|
||||
}
|
||||
|
||||
auto kind = c_def->type->get_kind();
|
||||
if (kind != abstract_type::kind::utf8 && kind != abstract_type::kind::ascii) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
format("Fulltext index is only supported on text, varchar, or ascii columns, but column {} has an incompatible type", c_name));
|
||||
}
|
||||
}
|
||||
|
||||
void fulltext_index::check_index_options(const cql3::statements::index_specific_prop_defs& properties) const {
|
||||
for (auto option : properties.get_raw_options()) {
|
||||
auto it = fulltext_index_options.find(option.first);
|
||||
if (it == fulltext_index_options.end()) {
|
||||
throw exceptions::invalid_request_exception(format("Unsupported option {} for fulltext index", option.first));
|
||||
}
|
||||
it->second(index_type_name(), option.first, option.second);
|
||||
}
|
||||
}
|
||||
|
||||
/// Validates a fulltext index definition: first its target, then its options.
///
/// The feature service and database arguments are accepted to satisfy the
/// `custom_index` interface but are not used here.
void fulltext_index::validate(const schema& schema, const cql3::statements::index_specific_prop_defs& properties,
        const std::vector<::shared_ptr<cql3::statements::index_target>>& targets, const gms::feature_service& /*fs*/,
        const data_dictionary::database& /*db*/) const {
    // The target is checked before the options, so a bad target is reported first.
    check_target(schema, targets);
    check_index_options(properties);
}
|
||||
|
||||
/// Produces the version identifier for the index: a newly generated
/// time-based UUID. The `schema` argument is not consulted.
utils::UUID fulltext_index::index_version(const schema& schema) {
    const auto version = utils::UUID_gen::get_time_UUID();
    return version;
}
|
||||
|
||||
std::unique_ptr<secondary_index::custom_index> fulltext_index_factory() {
|
||||
return std::make_unique<fulltext_index>();
|
||||
}
|
||||
|
||||
} // namespace secondary_index
|
||||
@@ -1,43 +0,0 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "schema/schema.hh"
|
||||
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
#include "cql3/statements/index_target.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace secondary_index {
|
||||
|
||||
class fulltext_index : public custom_index {
|
||||
public:
|
||||
std::string_view index_type_name() const override {
|
||||
return "fulltext";
|
||||
}
|
||||
|
||||
fulltext_index() = default;
|
||||
~fulltext_index() override = default;
|
||||
std::optional<cql3::description> describe(const index_metadata& im, const schema& base_schema) const override;
|
||||
bool view_should_exist() const override;
|
||||
void validate(const schema& schema, const cql3::statements::index_specific_prop_defs& properties,
|
||||
const std::vector<::shared_ptr<cql3::statements::index_target>>& targets, const gms::feature_service& fs,
|
||||
const data_dictionary::database& db) const override;
|
||||
utils::UUID index_version(const schema& schema) override;
|
||||
|
||||
private:
|
||||
void check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const;
|
||||
void check_index_options(const cql3::statements::index_specific_prop_defs& properties) const;
|
||||
};
|
||||
|
||||
std::unique_ptr<secondary_index::custom_index> fulltext_index_factory();
|
||||
|
||||
} // namespace secondary_index
|
||||
@@ -1,70 +0,0 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#include "index/index_option_utils.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <fmt/ranges.h>
|
||||
#include <seastar/core/format.hh>
|
||||
|
||||
namespace secondary_index::util {
|
||||
|
||||
void validate_enumerated_option(
|
||||
const std::vector<sstring>& supported_values, std::string_view index_type_name, const sstring& value_name, const sstring& value) {
|
||||
bool is_valid = std::any_of(supported_values.begin(), supported_values.end(), [&](const std::string& v) {
|
||||
return boost::iequals(value, v);
|
||||
});
|
||||
|
||||
if (!is_valid) {
|
||||
throw exceptions::invalid_request_exception(seastar::format("Invalid value in option '{}' for {} index: '{}'."
|
||||
" Supported are case-insensitive: {}",
|
||||
value_name, index_type_name, value, fmt::join(supported_values, ", ")));
|
||||
}
|
||||
}
|
||||
|
||||
void validate_positive_option(int max, std::string_view index_type_name, const sstring& value_name, const sstring& value) {
|
||||
int num_value;
|
||||
size_t len;
|
||||
try {
|
||||
num_value = std::stoi(value, &len);
|
||||
} catch (...) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for {} index: '{}' is not an integer", value_name, index_type_name, value));
|
||||
}
|
||||
if (len != value.size()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for {} index: '{}' is not an integer", value_name, index_type_name, value));
|
||||
}
|
||||
|
||||
if (num_value <= 0 || num_value > max) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for {} index: '{}' is out of valid range [1 - {}]", value_name, index_type_name, value, max));
|
||||
}
|
||||
}
|
||||
|
||||
void validate_factor_option(float min, float max, std::string_view index_type_name, const sstring& value_name, const sstring& value) {
|
||||
float num_value;
|
||||
size_t len;
|
||||
try {
|
||||
num_value = std::stof(value, &len);
|
||||
} catch (...) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for {} index: '{}' is not a float", value_name, index_type_name, value));
|
||||
}
|
||||
if (len != value.size()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for {} index: '{}' is not a float", value_name, index_type_name, value));
|
||||
}
|
||||
|
||||
if (!(num_value >= min && num_value <= max)) {
|
||||
throw exceptions::invalid_request_exception(seastar::format(
|
||||
"Invalid value in option '{}' for {} index: '{}' is out of valid range [{} - {}]", value_name, index_type_name, value, min, max));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace secondary_index::util
|
||||
@@ -1,26 +0,0 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <seastar/core/sstring.hh>
|
||||
|
||||
namespace secondary_index::util {
|
||||
|
||||
inline const std::vector<seastar::sstring> boolean_values = {"false", "true"};
|
||||
|
||||
void validate_enumerated_option(const std::vector<seastar::sstring>& supported_values, std::string_view index_type_name, const seastar::sstring& value_name,
|
||||
const seastar::sstring& value);
|
||||
|
||||
void validate_positive_option(int max, std::string_view index_type_name, const seastar::sstring& value_name, const seastar::sstring& value);
|
||||
|
||||
void validate_factor_option(float min, float max, std::string_view index_type_name, const seastar::sstring& value_name, const seastar::sstring& value);
|
||||
|
||||
} // namespace secondary_index::util
|
||||
@@ -9,21 +9,17 @@
|
||||
*/
|
||||
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <optional>
|
||||
#include <ranges>
|
||||
#include <seastar/core/shared_ptr.hh>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
#include "index/fulltext_index.hh"
|
||||
#include "index/vector_index.hh"
|
||||
|
||||
#include "cql3/expr/expression.hh"
|
||||
#include "cql3/util.hh"
|
||||
#include "index/target_parser.hh"
|
||||
#include "schema/schema.hh"
|
||||
#include "utils/histogram_metrics_helper.hh"
|
||||
@@ -215,7 +211,6 @@ std::optional<std::function<std::unique_ptr<custom_index>()>> secondary_index_ma
|
||||
std::transform(lower_class_name.begin(), lower_class_name.end(), lower_class_name.begin(), ::tolower);
|
||||
|
||||
const static std::unordered_map<std::string_view, std::function<std::unique_ptr<custom_index>()>> classes = {
|
||||
{"fulltext_index", fulltext_index_factory},
|
||||
{"vector_index", vector_index_factory},
|
||||
};
|
||||
|
||||
@@ -238,49 +233,6 @@ std::optional<std::unique_ptr<custom_index>> secondary_index_manager::get_custom
|
||||
return (*custom_class_factory)();
|
||||
}
|
||||
|
||||
std::optional<cql3::description> custom_index::describe_with_target(
|
||||
const index_metadata& im,
|
||||
const schema& base_schema,
|
||||
const sstring& target_cql) const {
|
||||
static const std::unordered_set<sstring> system_options = {
|
||||
cql3::statements::index_target::target_option_name,
|
||||
db::index::secondary_index::custom_class_option_name,
|
||||
db::index::secondary_index::index_version_option_name,
|
||||
};
|
||||
|
||||
fragmented_ostringstream os;
|
||||
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON "
|
||||
<< cql3::util::maybe_quote(base_schema.ks_name()) << "."
|
||||
<< cql3::util::maybe_quote(base_schema.cf_name()) << "(" << target_cql << ")"
|
||||
<< " USING '" << index_type_name() << "_index'";
|
||||
|
||||
std::map<sstring, sstring> user_options;
|
||||
for (const auto& [key, value] : im.options()) {
|
||||
if (!system_options.contains(key)) {
|
||||
user_options.emplace(key, value);
|
||||
}
|
||||
}
|
||||
if (!user_options.empty()) {
|
||||
os << " WITH OPTIONS = {";
|
||||
bool first = true;
|
||||
for (const auto& [key, value] : user_options) {
|
||||
if (!first) {
|
||||
os << ", ";
|
||||
}
|
||||
os << "'" << key << "': '" << value << "'";
|
||||
first = false;
|
||||
}
|
||||
os << "}";
|
||||
}
|
||||
|
||||
return cql3::description{
|
||||
.keyspace = base_schema.ks_name(),
|
||||
.type = "index",
|
||||
.name = im.name(),
|
||||
.create_statement = std::move(os).to_managed_string(),
|
||||
};
|
||||
}
|
||||
|
||||
stats::stats(const sstring& ks_name, const sstring& index_name) {
|
||||
metrics.add_group("index",
|
||||
{seastar::metrics::make_histogram("query_latencies", seastar::metrics::description("Index query latencies"), {idx(index_name), ks(ks_name)},
|
||||
|
||||
@@ -100,7 +100,6 @@ public:
|
||||
class custom_index {
|
||||
public:
|
||||
virtual ~custom_index() = default;
|
||||
virtual std::string_view index_type_name() const = 0;
|
||||
/// Returns a custom description of the index, or std::nullopt if the default index description logic should be used instead.
|
||||
virtual std::optional<cql3::description> describe(const index_metadata& im, const schema& base_schema) const = 0;
|
||||
virtual bool view_should_exist() const = 0;
|
||||
@@ -108,12 +107,6 @@ public:
|
||||
const std::vector<::shared_ptr<cql3::statements::index_target>> &targets, const gms::feature_service& fs,
|
||||
const data_dictionary::database& db) const = 0;
|
||||
virtual utils::UUID index_version(const schema& schema) = 0;
|
||||
|
||||
protected:
|
||||
std::optional<cql3::description> describe_with_target(
|
||||
const index_metadata& im,
|
||||
const schema& base_schema,
|
||||
const sstring& target_cql) const;
|
||||
};
|
||||
|
||||
struct stats {
|
||||
|
||||
@@ -14,19 +14,66 @@
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "schema/schema.hh"
|
||||
#include "index/vector_index.hh"
|
||||
#include "index/index_option_utils.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "index/target_parser.hh"
|
||||
#include "types/concrete_types.hh"
|
||||
#include "utils/UUID_gen.hh"
|
||||
#include "types/types.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
#include <ranges>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
namespace secondary_index {
|
||||
|
||||
static void validate_positive_option(int max, const sstring& value_name, const sstring& value) {
|
||||
int num_value;
|
||||
size_t len;
|
||||
try {
|
||||
num_value = std::stoi(value, &len);
|
||||
} catch (...) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is not an integer", value_name, value));
|
||||
}
|
||||
if (len != value.size()) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is not an integer", value_name, value));
|
||||
}
|
||||
|
||||
if (num_value <= 0 || num_value > max) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is out of valid range [1 - {}]", value_name, value, max));
|
||||
}
|
||||
}
|
||||
|
||||
static void validate_factor_option(float min, float max, const sstring& value_name, const sstring& value) {
|
||||
float num_value;
|
||||
size_t len;
|
||||
try {
|
||||
num_value = std::stof(value, &len);
|
||||
} catch (...) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is not a float", value_name, value));
|
||||
}
|
||||
if (len != value.size()) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is not a float", value_name, value));
|
||||
}
|
||||
|
||||
if (!(num_value >= min && num_value <= max)) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid value in option '{}' for vector index: '{}' is out of valid range [{} - {}]", value_name, value, min, max));
|
||||
}
|
||||
}
|
||||
|
||||
static void validate_enumerated_option(const std::vector<sstring>& supported_values, const sstring& value_name, const sstring& value) {
|
||||
bool is_valid = std::any_of(supported_values.begin(), supported_values.end(),
|
||||
[&](const std::string& func) { return boost::iequals(value, func); });
|
||||
|
||||
if (!is_valid) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
seastar::format("Invalid value in option '{}' for vector index: '{}'. Supported are case-insensitive: {}",
|
||||
value_name,
|
||||
value,
|
||||
fmt::join(supported_values, ", ")));
|
||||
}
|
||||
}
|
||||
|
||||
static const std::vector<sstring> similarity_function_values = {
|
||||
"cosine", "euclidean", "dot_product"
|
||||
};
|
||||
@@ -35,29 +82,33 @@ static const std::vector<sstring> quantization_values = {
|
||||
"f32", "f16", "bf16", "i8", "b1"
|
||||
};
|
||||
|
||||
const static std::unordered_map<sstring, std::function<void(std::string_view, const sstring&, const sstring&)>> vector_index_options = {
|
||||
static const std::vector<sstring> boolean_values = {
|
||||
"false", "true"
|
||||
};
|
||||
|
||||
const static std::unordered_map<sstring, std::function<void(const sstring&, const sstring&)>> vector_index_options = {
|
||||
// `similarity_function` defines method of calculating similarity between vectors
|
||||
// Used internally by vector store during both indexing and querying
|
||||
// CQL implements corresponding functions in cql3/functions/similarity_functions.hh
|
||||
{"similarity_function", std::bind_front(util::validate_enumerated_option, similarity_function_values)},
|
||||
{"similarity_function", std::bind_front(validate_enumerated_option, similarity_function_values)},
|
||||
// 'maximum_node_connections', 'construction_beam_width', 'search_beam_width' define HNSW index parameters
|
||||
// Used internally by vector store.
|
||||
{"maximum_node_connections", std::bind_front(util::validate_positive_option, 512)},
|
||||
{"construction_beam_width", std::bind_front(util::validate_positive_option, 4096)},
|
||||
{"search_beam_width", std::bind_front(util::validate_positive_option, 4096)},
|
||||
{"maximum_node_connections", std::bind_front(validate_positive_option, 512)},
|
||||
{"construction_beam_width", std::bind_front(validate_positive_option, 4096)},
|
||||
{"search_beam_width", std::bind_front(validate_positive_option, 4096)},
|
||||
// 'quantization' enables compression of vectors in vector store (not in base table!)
|
||||
// Used internally by vector store. Scylla only checks it to enable rescoring.
|
||||
{"quantization", std::bind_front(util::validate_enumerated_option, quantization_values)},
|
||||
{"quantization", std::bind_front(validate_enumerated_option, quantization_values)},
|
||||
// 'oversampling' defines factor by which number of candidates retrieved from vector store is multiplied.
|
||||
// It can improve accuracy of ANN queries, especially for quantized vectors when combined with rescoring.
|
||||
// Used by Scylla during query processing to increase query limit sent to vector store.
|
||||
{"oversampling", std::bind_front(util::validate_factor_option, 1.0f, 100.0f)},
|
||||
{"oversampling", std::bind_front(validate_factor_option, 1.0f, 100.0f)},
|
||||
// 'rescoring' enables recalculating of similarity scores of candidates retrieved from vector store when quantization is used.
|
||||
{"rescoring", std::bind_front(util::validate_enumerated_option, util::boolean_values)},
|
||||
{"rescoring", std::bind_front(validate_enumerated_option, boolean_values)},
|
||||
// 'source_model' is a Cassandra SAI option specifying the embedding model name.
|
||||
// Used by Cassandra libraries (e.g., CassIO) to tag indexes with the model that produced the vectors.
|
||||
// Accepted for compatibility but not used by ScyllaDB.
|
||||
{"source_model", [](std::string_view, const sstring&, const sstring&) { /* accepted for Cassandra compatibility */ }},
|
||||
{"source_model", [](const sstring&, const sstring&) { /* accepted for Cassandra compatibility */ }},
|
||||
};
|
||||
|
||||
static constexpr auto TC_TARGET_KEY = "tc";
|
||||
@@ -204,8 +255,43 @@ bool vector_index::view_should_exist() const {
|
||||
}
|
||||
|
||||
std::optional<cql3::description> vector_index::describe(const index_metadata& im, const schema& base_schema) const {
|
||||
return describe_with_target(im, base_schema,
|
||||
targets_to_cql(im.options().at(cql3::statements::index_target::target_option_name)));
|
||||
static const std::unordered_set<sstring> system_options = {
|
||||
cql3::statements::index_target::target_option_name,
|
||||
db::index::secondary_index::custom_class_option_name,
|
||||
db::index::secondary_index::index_version_option_name,
|
||||
};
|
||||
|
||||
fragmented_ostringstream os;
|
||||
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " << cql3::util::maybe_quote(base_schema.ks_name()) << "."
|
||||
<< cql3::util::maybe_quote(base_schema.cf_name()) << "(" << targets_to_cql(im.options().at(cql3::statements::index_target::target_option_name)) << ")"
|
||||
<< " USING 'vector_index'";
|
||||
|
||||
// Collect user-provided options (excluding system keys like target, class_name, index_version).
|
||||
std::map<sstring, sstring> user_options;
|
||||
for (const auto& [key, value] : im.options()) {
|
||||
if (!system_options.contains(key)) {
|
||||
user_options.emplace(key, value);
|
||||
}
|
||||
}
|
||||
if (!user_options.empty()) {
|
||||
os << " WITH OPTIONS = {";
|
||||
bool first = true;
|
||||
for (const auto& [key, value] : user_options) {
|
||||
if (!first) {
|
||||
os << ", ";
|
||||
}
|
||||
os << "'" << key << "': '" << value << "'";
|
||||
first = false;
|
||||
}
|
||||
os << "}";
|
||||
}
|
||||
|
||||
return cql3::description{
|
||||
.keyspace = base_schema.ks_name(),
|
||||
.type = "index",
|
||||
.name = im.name(),
|
||||
.create_statement = std::move(os).to_managed_string(),
|
||||
};
|
||||
}
|
||||
|
||||
void vector_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {
|
||||
@@ -343,7 +429,7 @@ void vector_index::check_index_options(const cql3::statements::index_specific_pr
|
||||
if (it == vector_index_options.end()) {
|
||||
throw exceptions::invalid_request_exception(format("Unsupported option {} for vector index", option.first));
|
||||
}
|
||||
it->second(index_type_name(), option.first, option.second);
|
||||
it->second(option.first, option.second);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -20,8 +20,6 @@ namespace secondary_index {
|
||||
|
||||
class vector_index: public custom_index {
|
||||
public:
|
||||
std::string_view index_type_name() const override { return "vector"; }
|
||||
|
||||
// The minimal TTL for the CDC used by Vector Search.
|
||||
// Required to ensure that the data is not deleted until the vector index is fully built.
|
||||
static constexpr int VS_TTL_SECONDS = 86400; // 24 hours
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
#include <ranges>
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/serialization.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include <seastar/util/backtrace.hh>
|
||||
|
||||
enum class allow_prefixes { no, yes };
|
||||
@@ -104,12 +103,7 @@ public:
|
||||
static managed_bytes serialize_value(RangeOfSerializedComponents&& values) {
|
||||
auto size = serialized_size(values);
|
||||
if (size > std::numeric_limits<size_type>::max()) {
|
||||
// Matches Cassandra's wording so CQL-level compatibility tests
|
||||
// (and client-visible error messages) line up.
|
||||
// Issues #10366 (SELECT) and #12247 (INSERT) both require a
|
||||
// clean InvalidRequest here rather than a generic server error.
|
||||
throw exceptions::invalid_request_exception(format("Key length of {:d} is longer than maximum of {:d}",
|
||||
size, std::numeric_limits<size_type>::max()));
|
||||
throw std::runtime_error(format("Key size too large: {:d} > {:d}", size, std::numeric_limits<size_type>::max()));
|
||||
}
|
||||
managed_bytes b(managed_bytes::initialized_later(), size);
|
||||
serialize_value(values, managed_bytes_mutable_view(b));
|
||||
|
||||
@@ -90,8 +90,6 @@ write_replica_set_selector get_selector_for_writes(tablet_transition_stage stage
|
||||
return write_replica_set_selector::previous;
|
||||
case tablet_transition_stage::end_migration:
|
||||
return write_replica_set_selector::next;
|
||||
case tablet_transition_stage::restore:
|
||||
return write_replica_set_selector::previous;
|
||||
}
|
||||
on_internal_error(tablet_logger, format("Invalid tablet transition stage: {}", static_cast<int>(stage)));
|
||||
}
|
||||
@@ -125,8 +123,6 @@ read_replica_set_selector get_selector_for_reads(tablet_transition_stage stage)
|
||||
return read_replica_set_selector::previous;
|
||||
case tablet_transition_stage::end_migration:
|
||||
return read_replica_set_selector::next;
|
||||
case tablet_transition_stage::restore:
|
||||
return read_replica_set_selector::previous;
|
||||
}
|
||||
on_internal_error(tablet_logger, format("Invalid tablet transition stage: {}", static_cast<int>(stage)));
|
||||
}
|
||||
@@ -135,14 +131,12 @@ tablet_transition_info::tablet_transition_info(tablet_transition_stage stage,
|
||||
tablet_transition_kind transition,
|
||||
tablet_replica_set next,
|
||||
std::optional<tablet_replica> pending_replica,
|
||||
service::session_id session_id,
|
||||
std::optional<locator::restore_config> restore_cfg)
|
||||
service::session_id session_id)
|
||||
: stage(stage)
|
||||
, transition(transition)
|
||||
, next(std::move(next))
|
||||
, pending_replica(std::move(pending_replica))
|
||||
, session_id(session_id)
|
||||
, restore_cfg(std::move(restore_cfg))
|
||||
, writes(get_selector_for_writes(stage))
|
||||
, reads(get_selector_for_reads(stage))
|
||||
{ }
|
||||
@@ -192,20 +186,12 @@ tablet_migration_streaming_info get_migration_streaming_info(const locator::topo
|
||||
|
||||
return result;
|
||||
}
|
||||
case tablet_transition_kind::repair: {
|
||||
case tablet_transition_kind::repair:
|
||||
auto s = std::unordered_set<tablet_replica>(tinfo.replicas.begin(), tinfo.replicas.end());
|
||||
result.stream_weight = locator::tablet_migration_stream_weight_repair;
|
||||
result.read_from = s;
|
||||
result.written_to = std::move(s);
|
||||
return result;
|
||||
}
|
||||
case tablet_transition_kind::restore: {
|
||||
auto s = std::unordered_set<tablet_replica>(tinfo.replicas.begin(), tinfo.replicas.end());
|
||||
result.stream_weight = locator::tablet_migration_stream_weight_restore;
|
||||
result.read_from = s;
|
||||
result.written_to = std::move(s);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
on_internal_error(tablet_logger, format("Invalid tablet transition kind: {}", static_cast<int>(trinfo.transition)));
|
||||
}
|
||||
@@ -861,7 +847,6 @@ static const std::unordered_map<tablet_transition_stage, sstring> tablet_transit
|
||||
{tablet_transition_stage::cleanup_target, "cleanup_target"},
|
||||
{tablet_transition_stage::revert_migration, "revert_migration"},
|
||||
{tablet_transition_stage::end_migration, "end_migration"},
|
||||
{tablet_transition_stage::restore, "restore"},
|
||||
};
|
||||
|
||||
static const std::unordered_map<sstring, tablet_transition_stage> tablet_transition_stage_from_name = std::invoke([] {
|
||||
@@ -895,7 +880,6 @@ static const std::unordered_map<tablet_transition_kind, sstring> tablet_transiti
|
||||
{tablet_transition_kind::rebuild, "rebuild"},
|
||||
{tablet_transition_kind::rebuild_v2, "rebuild_v2"},
|
||||
{tablet_transition_kind::repair, "repair"},
|
||||
{tablet_transition_kind::restore, "restore"},
|
||||
};
|
||||
|
||||
static const std::unordered_map<sstring, tablet_transition_kind> tablet_transition_kind_from_name = std::invoke([] {
|
||||
@@ -1142,8 +1126,6 @@ std::optional<uint64_t> load_stats::get_tablet_size_in_transition(host_id host,
|
||||
}
|
||||
case tablet_transition_kind::intranode_migration:
|
||||
[[fallthrough]];
|
||||
case tablet_transition_kind::restore:
|
||||
[[fallthrough]];
|
||||
case tablet_transition_kind::repair:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -268,13 +268,6 @@ struct tablet_task_info {
|
||||
static std::unordered_set<sstring> deserialize_repair_dcs_filter(sstring filter);
|
||||
};
|
||||
|
||||
struct restore_config {
|
||||
sstring snapshot_name;
|
||||
sstring endpoint;
|
||||
sstring bucket;
|
||||
bool operator==(const restore_config&) const = default;
|
||||
};
|
||||
|
||||
/// Stores information about a single tablet.
|
||||
struct tablet_info {
|
||||
tablet_replica_set replicas;
|
||||
@@ -330,7 +323,6 @@ enum class tablet_transition_stage {
|
||||
end_migration,
|
||||
repair,
|
||||
end_repair,
|
||||
restore,
|
||||
};
|
||||
|
||||
enum class tablet_transition_kind {
|
||||
@@ -353,9 +345,6 @@ enum class tablet_transition_kind {
|
||||
|
||||
// Repair the tablet replicas
|
||||
repair,
|
||||
|
||||
// Download sstables for tablet
|
||||
restore,
|
||||
};
|
||||
|
||||
tablet_transition_kind choose_rebuild_transition_kind(const gms::feature_service& features);
|
||||
@@ -379,7 +368,6 @@ struct tablet_transition_info {
|
||||
tablet_replica_set next;
|
||||
std::optional<tablet_replica> pending_replica; // Optimization (next - tablet_info::replicas)
|
||||
service::session_id session_id;
|
||||
std::optional<locator::restore_config> restore_cfg;
|
||||
write_replica_set_selector writes;
|
||||
read_replica_set_selector reads;
|
||||
|
||||
@@ -387,8 +375,7 @@ struct tablet_transition_info {
|
||||
tablet_transition_kind kind,
|
||||
tablet_replica_set next,
|
||||
std::optional<tablet_replica> pending_replica,
|
||||
service::session_id session_id = {},
|
||||
std::optional<locator::restore_config> rcfg = std::nullopt);
|
||||
service::session_id session_id = {});
|
||||
|
||||
bool operator==(const tablet_transition_info&) const = default;
|
||||
};
|
||||
@@ -419,7 +406,6 @@ tablet_transition_info migration_to_transition_info(const tablet_info&, const ta
|
||||
/// Describes streaming required for a given tablet transition.
|
||||
constexpr int tablet_migration_stream_weight_default = 1;
|
||||
constexpr int tablet_migration_stream_weight_repair = 2;
|
||||
constexpr int tablet_migration_stream_weight_restore = 2;
|
||||
struct tablet_migration_streaming_info {
|
||||
std::unordered_set<tablet_replica> read_from;
|
||||
std::unordered_set<tablet_replica> written_to;
|
||||
|
||||
23
main.cc
23
main.cc
@@ -30,7 +30,6 @@
|
||||
#include "utils/build_id.hh"
|
||||
#include "utils/only_on_shard0.hh"
|
||||
#include "supervisor.hh"
|
||||
#include "timeout_config.hh"
|
||||
#include "replica/database.hh"
|
||||
#include <seastar/core/reactor.hh>
|
||||
#include <seastar/core/app-template.hh>
|
||||
@@ -87,7 +86,6 @@
|
||||
#include "service/cache_hitrate_calculator.hh"
|
||||
#include "compaction/compaction_manager.hh"
|
||||
#include "sstables/sstables.hh"
|
||||
#include "sstables/exceptions.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
#include "replica/distributed_loader.hh"
|
||||
#include "sstables_loader.hh"
|
||||
@@ -1044,11 +1042,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
});
|
||||
set_abort_on_internal_error(cfg->abort_on_internal_error());
|
||||
|
||||
auto abort_on_malformed_sstable_error_observer = cfg->abort_on_malformed_sstable_error.observe([] (bool val) {
|
||||
sstables::set_abort_on_malformed_sstable_error(val);
|
||||
});
|
||||
sstables::set_abort_on_malformed_sstable_error(cfg->abort_on_malformed_sstable_error());
|
||||
|
||||
checkpoint(stop_signal, "creating snitch");
|
||||
debug::the_snitch = &snitch;
|
||||
snitch_config snitch_cfg;
|
||||
@@ -1375,11 +1368,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
spcfg.hints_write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
|
||||
spcfg.write_ack_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
|
||||
static db::view::node_update_backlog node_backlog(smp::count, 10ms, cfg->view_flow_control_delay_limit_in_ms);
|
||||
|
||||
static sharded<updateable_timeout_config> timeout_cfg;
|
||||
timeout_cfg.start(std::ref(*cfg)).get();
|
||||
auto stop_timeout_cfg = defer_verbose_shutdown("updateable timeout config", [] { timeout_cfg.stop().get(); });
|
||||
|
||||
scheduling_group_key_config storage_proxy_stats_cfg =
|
||||
make_scheduling_group_key_config<service::storage_proxy_stats::stats>();
|
||||
storage_proxy_stats_cfg.constructor = [plain_constructor = storage_proxy_stats_cfg.constructor] (void* ptr) {
|
||||
@@ -1393,8 +1381,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
};
|
||||
proxy.start(std::ref(db), spcfg, std::ref(node_backlog),
|
||||
scheduling_group_key_create(storage_proxy_stats_cfg).get(),
|
||||
std::ref(feature_service), std::ref(token_metadata), std::ref(erm_factory),
|
||||
std::ref(timeout_cfg)).get();
|
||||
std::ref(feature_service), std::ref(token_metadata), std::ref(erm_factory)).get();
|
||||
|
||||
// #293 - do not stop anything
|
||||
// engine().at_exit([&proxy] { return proxy.stop(); });
|
||||
@@ -2199,7 +2186,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
auth::make_maintenance_socket_role_manager_factory(qp, group0_client, mm, auth_cache),
|
||||
maintenance_socket_enabled::yes, std::ref(auth_cache)).get();
|
||||
|
||||
cql_maintenance_server_ctl.emplace(maintenance_auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, messaging, timeout_cfg, *cfg, maintenance_cql_sg_stats_key, maintenance_socket_enabled::yes, dbcfg.statement_scheduling_group);
|
||||
cql_maintenance_server_ctl.emplace(maintenance_auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, messaging, *cfg, maintenance_cql_sg_stats_key, maintenance_socket_enabled::yes, dbcfg.statement_scheduling_group);
|
||||
|
||||
start_auth_service(maintenance_auth_service, stop_maintenance_auth_service, "maintenance auth service");
|
||||
}
|
||||
@@ -2268,7 +2255,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
});
|
||||
|
||||
checkpoint(stop_signal, "starting sstables loader");
|
||||
sst_loader.start(std::ref(db), std::ref(ss), std::ref(messaging), std::ref(view_builder), std::ref(view_building_worker), std::ref(task_manager), std::ref(sstm), std::ref(sys_dist_ks), dbcfg.streaming_scheduling_group).get();
|
||||
sst_loader.start(std::ref(db), std::ref(ss), std::ref(messaging), std::ref(view_builder), std::ref(view_building_worker), std::ref(task_manager), std::ref(sstm), dbcfg.streaming_scheduling_group).get();
|
||||
auto stop_sst_loader = defer_verbose_shutdown("sstables loader", [&sst_loader] {
|
||||
sst_loader.stop().get();
|
||||
});
|
||||
@@ -2631,11 +2618,11 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
// after drain stops them in stop_transport()
|
||||
// Register controllers after drain_on_shutdown() below, so that even on start
|
||||
// failure drain is called and stops controllers
|
||||
cql_transport::controller cql_server_ctl(auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, messaging, timeout_cfg, *cfg, cql_sg_stats_key, maintenance_socket_enabled::no, dbcfg.statement_scheduling_group);
|
||||
cql_transport::controller cql_server_ctl(auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, messaging, *cfg, cql_sg_stats_key, maintenance_socket_enabled::no, dbcfg.statement_scheduling_group);
|
||||
|
||||
api::set_server_service_levels(ctx, cql_server_ctl, qp).get();
|
||||
|
||||
alternator::controller alternator_ctl(gossiper, proxy, ss, mm, sys_dist_ks, sys_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, vector_store_client, timeout_cfg, *cfg, dbcfg.statement_scheduling_group);
|
||||
alternator::controller alternator_ctl(gossiper, proxy, ss, mm, sys_dist_ks, sys_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, vector_store_client, *cfg, dbcfg.statement_scheduling_group);
|
||||
|
||||
// Register at_exit last, so that storage_service::drain_on_shutdown will be called first
|
||||
auto do_drain = defer_verbose_shutdown("local storage", [&ss] {
|
||||
|
||||
@@ -24,7 +24,6 @@
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/qos/service_level_controller.hh"
|
||||
#include "streaming/prepare_message.hh"
|
||||
#include "sstables_loader.hh"
|
||||
#include "gms/gossip_digest_syn.hh"
|
||||
#include "gms/gossip_digest_ack.hh"
|
||||
#include "gms/gossip_digest_ack2.hh"
|
||||
@@ -140,7 +139,6 @@
|
||||
#include "idl/tasks.dist.impl.hh"
|
||||
#include "idl/forward_cql.dist.impl.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
#include "idl/sstables_loader.dist.impl.hh"
|
||||
|
||||
namespace netw {
|
||||
|
||||
@@ -736,7 +734,6 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
|
||||
case messaging_verb::TABLE_LOAD_STATS:
|
||||
case messaging_verb::WORK_ON_VIEW_BUILDING_TASKS:
|
||||
case messaging_verb::SNAPSHOT_WITH_TABLETS:
|
||||
case messaging_verb::RESTORE_TABLET:
|
||||
return 1;
|
||||
case messaging_verb::CLIENT_ID:
|
||||
case messaging_verb::MUTATION:
|
||||
|
||||
@@ -214,8 +214,7 @@ enum class messaging_verb : int32_t {
|
||||
RAFT_READ_BARRIER = 85,
|
||||
FORWARD_CQL_EXECUTE = 86,
|
||||
FORWARD_CQL_PREPARE = 87,
|
||||
RESTORE_TABLET = 88,
|
||||
LAST = 89,
|
||||
LAST = 88,
|
||||
};
|
||||
|
||||
} // namespace netw
|
||||
|
||||
@@ -1279,9 +1279,6 @@ future<int> repair_service::do_repair_start(gms::gossip_address_map& addr_map, s
|
||||
}
|
||||
|
||||
if (!options.start_token.empty() || !options.end_token.empty()) {
|
||||
if (!options.start_token.empty() && !options.end_token.empty() && options.start_token == options.end_token) {
|
||||
throw std::invalid_argument("Start and end tokens must be different.");
|
||||
}
|
||||
// Intersect the list of local ranges with the given token range,
|
||||
// dropping ranges with no intersection.
|
||||
std::optional<::wrapping_interval<dht::token>::bound> tok_start;
|
||||
|
||||
@@ -206,7 +206,6 @@ public:
|
||||
|
||||
lw_shared_ptr<memtable_list>& memtables() noexcept;
|
||||
size_t memtable_count() const noexcept;
|
||||
bool memtable_empty() const noexcept;
|
||||
// Returns minimum timestamp from memtable list
|
||||
api::timestamp_type min_memtable_timestamp() const;
|
||||
// Returns maximum timestamp from memtable list
|
||||
@@ -290,9 +289,6 @@ public:
|
||||
seastar::named_gate& sstable_add_gate() noexcept {
|
||||
return _sstable_add_gate;
|
||||
}
|
||||
const seastar::named_gate& sstable_add_gate() const noexcept {
|
||||
return _sstable_add_gate;
|
||||
}
|
||||
|
||||
compaction::compaction_manager& get_compaction_manager() noexcept;
|
||||
const compaction::compaction_manager& get_compaction_manager() const noexcept;
|
||||
@@ -530,13 +526,3 @@ public:
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
template <> struct fmt::formatter<replica::compaction_group> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const replica::compaction_group&, fmt::format_context& ctx) const -> decltype(ctx.out());
|
||||
};
|
||||
|
||||
template <> struct fmt::formatter<replica::storage_group> {
|
||||
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
|
||||
auto format(const replica::storage_group&, fmt::format_context& ctx) const -> decltype(ctx.out());
|
||||
};
|
||||
|
||||
@@ -726,18 +726,6 @@ database::setup_metrics() {
|
||||
sm::description("Number of large partitions exceeding compaction_large_partition_warning_threshold_mb. "
|
||||
"Large partitions have performance impact and should be avoided, check the documentation for details.")),
|
||||
|
||||
sm::make_counter("large_rows_exceeding_threshold", [this] { return _large_data_handler->stats().rows_bigger_than_threshold; },
|
||||
sm::description("Number of large rows exceeding compaction_large_row_warning_threshold_mb. "
|
||||
"Large rows have performance impact and should be avoided, check the documentation for details.")),
|
||||
|
||||
sm::make_counter("large_cell_exceeding_threshold", [this] { return _large_data_handler->stats().cells_bigger_than_threshold; },
|
||||
sm::description("Number of large cells exceeding compaction_large_cell_warning_threshold_mb. "
|
||||
"Large cells have performance impact and should be avoided, check the documentation for details.")),
|
||||
|
||||
sm::make_counter("large_collection_exceeding_threshold", [this] { return _large_data_handler->stats().collections_bigger_than_threshold; },
|
||||
sm::description("Number of large collections exceeding compaction_collection_elements_count_warning_threshold. "
|
||||
"Large collections have performance impact and should be avoided, check the documentation for details.")),
|
||||
|
||||
sm::make_total_operations("total_view_updates_pushed_local", _cf_stats.total_view_updates_pushed_local,
|
||||
sm::description("Total number of view updates generated for tables and applied locally."))(basic_level),
|
||||
|
||||
|
||||
@@ -1413,15 +1413,6 @@ compaction_group& table::compaction_group_for_key(partition_key_view key, const
|
||||
return _sg_manager->compaction_group_for_key(key, s);
|
||||
}
|
||||
|
||||
static sstring sstable_desc(const sstables::shared_sstable& sst) {
|
||||
auto& identifier_opt = sst->sstable_identifier();
|
||||
auto& originating_host_id_opt = sst->get_stats_metadata().originating_host_id;
|
||||
return format("{} (originated from {} with id {} on host {})",
|
||||
sst->get_filename(), sst->get_origin(),
|
||||
identifier_opt ? identifier_opt->to_sstring() : "unknown",
|
||||
originating_host_id_opt ? originating_host_id_opt->to_sstring() : "unknown");
|
||||
}
|
||||
|
||||
compaction_group& tablet_storage_group_manager::compaction_group_for_token_range(sstring desc, dht::token first_token, dht::token last_token) const {
|
||||
auto first_id = storage_group_of(first_token);
|
||||
auto last_id = storage_group_of(last_token);
|
||||
@@ -1455,6 +1446,15 @@ compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(con
|
||||
auto first_token = sst->get_first_decorated_key().token();
|
||||
auto last_token = sst->get_last_decorated_key().token();
|
||||
|
||||
auto sstable_desc = [] (const sstables::shared_sstable& sst) {
|
||||
auto& identifier_opt = sst->sstable_identifier();
|
||||
auto& originating_host_id_opt = sst->get_stats_metadata().originating_host_id;
|
||||
return format("{} (originated from {} with id {} on host {})",
|
||||
sst->get_filename(), sst->get_origin(),
|
||||
identifier_opt ? identifier_opt->to_sstring() : "unknown",
|
||||
originating_host_id_opt ? originating_host_id_opt->to_sstring() : "unknown");
|
||||
};
|
||||
|
||||
return compaction_group_for_token_range(sstable_desc(sst), first_token, last_token);
|
||||
}
|
||||
|
||||
@@ -3313,8 +3313,7 @@ bool has_size_on_leaving (locator::tablet_transition_stage stage) {
|
||||
case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::rebuild_repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::restore:
|
||||
case locator::tablet_transition_stage::end_repair:
|
||||
return true;
|
||||
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_migration:
|
||||
@@ -3337,8 +3336,7 @@ bool has_size_on_pending (locator::tablet_transition_stage stage) {
|
||||
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::restore:
|
||||
case locator::tablet_transition_stage::end_repair:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -3447,8 +3445,8 @@ void tablet_storage_group_manager::handle_tablet_split_completion(const locator:
|
||||
for (auto& [id, sg] : _storage_groups) {
|
||||
if (!sg->split_unready_groups_are_empty()) {
|
||||
on_internal_error(tlogger, format("Found that storage of group {} for table {} wasn't split correctly, " \
|
||||
"therefore groups cannot be remapped with the new tablet count.\nDiagnostics: {}",
|
||||
id, table_id, *sg));
|
||||
"therefore groups cannot be remapped with the new tablet count.",
|
||||
id, table_id));
|
||||
}
|
||||
// Remove old empty groups, they're unused, but they need to be deregistered properly
|
||||
// FIXME: indent.
|
||||
@@ -4529,10 +4527,6 @@ size_t compaction_group::memtable_count() const noexcept {
|
||||
return _memtables->size();
|
||||
}
|
||||
|
||||
bool compaction_group::memtable_empty() const noexcept {
|
||||
return _memtables->empty();
|
||||
}
|
||||
|
||||
size_t storage_group::memtable_count() const {
|
||||
size_t count = 0;
|
||||
for_each_compaction_group([&count] (const compaction_group_ptr& cg) {
|
||||
@@ -5771,43 +5765,3 @@ tombstone_gc_state table::get_tombstone_gc_state() const {
|
||||
}
|
||||
|
||||
} // namespace replica
|
||||
|
||||
auto fmt::formatter<replica::compaction_group>::format(const replica::compaction_group& cg, fmt::format_context& ctx) const -> decltype(ctx.out()) {
|
||||
auto out = ctx.out();
|
||||
out = fmt::format_to(out, "[sstables=[");
|
||||
bool first = true;
|
||||
for (const auto& sst : cg.all_sstables()) {
|
||||
if (!first) {
|
||||
out = fmt::format_to(out, ", ");
|
||||
}
|
||||
out = fmt::format_to(out, "{}", replica::sstable_desc(sst));
|
||||
first = false;
|
||||
}
|
||||
return fmt::format_to(out, "], memtable_empty={}, sstable_add_gate={}]",
|
||||
cg.memtable_empty(),
|
||||
cg.sstable_add_gate().get_count());
|
||||
}
|
||||
|
||||
auto fmt::formatter<replica::storage_group>::format(const replica::storage_group& sg, fmt::format_context& ctx) const -> decltype(ctx.out()) {
|
||||
auto out = ctx.out();
|
||||
out = fmt::format_to(out, "main={}", *sg.main_compaction_group());
|
||||
out = fmt::format_to(out, ", merging=[");
|
||||
bool first = true;
|
||||
for (const auto& cg : sg.merging_groups()) {
|
||||
if (!first) {
|
||||
out = fmt::format_to(out, ", ");
|
||||
}
|
||||
out = fmt::format_to(out, "{}", *cg);
|
||||
first = false;
|
||||
}
|
||||
out = fmt::format_to(out, "], split_ready=[");
|
||||
first = true;
|
||||
for (const auto& cg : sg.split_ready_compaction_groups()) {
|
||||
if (!first) {
|
||||
out = fmt::format_to(out, ", ");
|
||||
}
|
||||
out = fmt::format_to(out, "{}", *cg);
|
||||
first = false;
|
||||
}
|
||||
return fmt::format_to(out, "]");
|
||||
}
|
||||
|
||||
@@ -50,9 +50,6 @@ public:
|
||||
tablet_mutation_builder& set_resize_task_info(locator::tablet_task_info info, const gms::feature_service& features);
|
||||
tablet_mutation_builder& del_resize_task_info(const gms::feature_service& features);
|
||||
tablet_mutation_builder& set_base_table(table_id base_table);
|
||||
tablet_mutation_builder& set_restore_config(dht::token last_token, locator::restore_config rcfg);
|
||||
tablet_mutation_builder& del_restore_config(dht::token last_token);
|
||||
|
||||
|
||||
mutation build() {
|
||||
return std::move(_m);
|
||||
|
||||
@@ -40,9 +40,6 @@ static thread_local auto tablet_task_info_type = user_type_impl::get_instance(
|
||||
static thread_local auto replica_type = tuple_type_impl::get_instance({uuid_type, int32_type});
|
||||
static thread_local auto replica_set_type = list_type_impl::get_instance(replica_type, false);
|
||||
static thread_local auto tablet_info_type = tuple_type_impl::get_instance({long_type, long_type, replica_set_type});
|
||||
static thread_local auto restore_config_type = user_type_impl::get_instance(
|
||||
"system", "restore_config", {"snapshot_name", "endpoint", "bucket"},
|
||||
{utf8_type, utf8_type, utf8_type}, false);
|
||||
|
||||
data_type get_replica_set_type() {
|
||||
return replica_set_type;
|
||||
@@ -55,7 +52,6 @@ data_type get_tablet_info_type() {
|
||||
void tablet_add_repair_scheduler_user_types(const sstring& ks, replica::database& db) {
|
||||
db.find_keyspace(ks).add_user_type(repair_scheduler_config_type);
|
||||
db.find_keyspace(ks).add_user_type(tablet_task_info_type);
|
||||
db.find_keyspace(ks).add_user_type(restore_config_type);
|
||||
}
|
||||
|
||||
static bool strongly_consistent_tables_enabled = false;
|
||||
@@ -91,8 +87,7 @@ schema_ptr make_tablets_schema() {
|
||||
.with_column("repair_incremental_mode", utf8_type)
|
||||
.with_column("migration_task_info", tablet_task_info_type)
|
||||
.with_column("resize_task_info", tablet_task_info_type, column_kind::static_column)
|
||||
.with_column("base_table", uuid_type, column_kind::static_column)
|
||||
.with_column("restore_config", restore_config_type);
|
||||
.with_column("base_table", uuid_type, column_kind::static_column);
|
||||
|
||||
if (strongly_consistent_tables_enabled) {
|
||||
builder
|
||||
@@ -226,15 +221,6 @@ data_value tablet_task_info_to_data_value(const locator::tablet_task_info& info)
|
||||
return result;
|
||||
};
|
||||
|
||||
data_value restore_config_to_data_value(const locator::restore_config& cfg) {
|
||||
data_value result = make_user_value(restore_config_type, {
|
||||
data_value(cfg.snapshot_name),
|
||||
data_value(cfg.endpoint),
|
||||
data_value(cfg.bucket),
|
||||
});
|
||||
return result;
|
||||
};
|
||||
|
||||
data_value repair_scheduler_config_to_data_value(const locator::repair_scheduler_config& config) {
|
||||
data_value result = make_user_value(repair_scheduler_config_type, {
|
||||
data_value(config.auto_repair_enabled),
|
||||
@@ -458,12 +444,6 @@ tablet_mutation_builder::set_repair_task_info(dht::token last_token, locator::ta
|
||||
return *this;
|
||||
}
|
||||
|
||||
tablet_mutation_builder&
|
||||
tablet_mutation_builder::set_restore_config(dht::token last_token, locator::restore_config rcfg) {
|
||||
_m.set_clustered_cell(get_ck(last_token), "restore_config", restore_config_to_data_value(rcfg), _ts);
|
||||
return *this;
|
||||
}
|
||||
|
||||
tablet_mutation_builder&
|
||||
tablet_mutation_builder::del_repair_task_info(dht::token last_token, const gms::feature_service& features) {
|
||||
auto col = _s->get_column_definition("repair_task_info");
|
||||
@@ -475,13 +455,6 @@ tablet_mutation_builder::del_repair_task_info(dht::token last_token, const gms::
|
||||
return *this;
|
||||
}
|
||||
|
||||
tablet_mutation_builder&
|
||||
tablet_mutation_builder::del_restore_config(dht::token last_token) {
|
||||
auto col = _s->get_column_definition("restore_config");
|
||||
_m.set_clustered_cell(get_ck(last_token), *col, atomic_cell::make_dead(_ts, gc_clock::now()));
|
||||
return *this;
|
||||
}
|
||||
|
||||
tablet_mutation_builder&
|
||||
tablet_mutation_builder::set_migration_task_info(dht::token last_token, locator::tablet_task_info migration_task_info, const gms::feature_service& features) {
|
||||
if (features.tablet_migration_virtual_task) {
|
||||
@@ -572,22 +545,6 @@ locator::tablet_task_info deserialize_tablet_task_info(cql3::untyped_result_set_
|
||||
tablet_task_info_type->deserialize_value(raw_value));
|
||||
}
|
||||
|
||||
locator::restore_config restore_config_from_cell(const data_value& v) {
|
||||
std::vector<data_value> dv = value_cast<user_type_impl::native_type>(v);
|
||||
auto result = locator::restore_config{
|
||||
value_cast<sstring>(dv[0]),
|
||||
value_cast<sstring>(dv[1]),
|
||||
value_cast<sstring>(dv[2]),
|
||||
};
|
||||
return result;
|
||||
}
|
||||
|
||||
static
|
||||
locator::restore_config deserialize_restore_config(cql3::untyped_result_set_row::view_type raw_value) {
|
||||
return restore_config_from_cell(
|
||||
restore_config_type->deserialize_value(raw_value));
|
||||
}
|
||||
|
||||
locator::repair_scheduler_config repair_scheduler_config_from_cell(const data_value& v) {
|
||||
std::vector<data_value> dv = value_cast<user_type_impl::native_type>(v);
|
||||
auto result = locator::repair_scheduler_config{
|
||||
@@ -789,11 +746,6 @@ tablet_id process_one_row(replica::database* db, table_id table, tablet_map& map
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<locator::restore_config> restore_cfg;
|
||||
if (row.has("restore_config")) {
|
||||
restore_cfg = deserialize_restore_config(row.get_view("restore_config"));
|
||||
}
|
||||
|
||||
locator::tablet_task_info migration_task_info;
|
||||
if (row.has("migration_task_info")) {
|
||||
migration_task_info = deserialize_tablet_task_info(row.get_view("migration_task_info"));
|
||||
@@ -817,7 +769,7 @@ tablet_id process_one_row(replica::database* db, table_id table, tablet_map& map
|
||||
session_id = service::session_id(row.get_as<utils::UUID>("session"));
|
||||
}
|
||||
map.set_tablet_transition_info(tid, tablet_transition_info{stage, transition,
|
||||
std::move(new_tablet_replicas), pending_replica, session_id, std::move(restore_cfg)});
|
||||
std::move(new_tablet_replicas), pending_replica, session_id});
|
||||
}
|
||||
|
||||
tablet_logger.debug("Set sstables_repaired_at={} table={} tablet={}", sstables_repaired_at, table, tid);
|
||||
|
||||
@@ -1227,7 +1227,7 @@ fragmented_ostringstream& schema::schema_properties(const schema_describe_helper
|
||||
map_as_cql_param(os, caching_options().to_map());
|
||||
os << "}";
|
||||
os << "\n AND comment = " << cql3::util::single_quote(comment());
|
||||
os << "\n AND compaction = {'class': '" << compaction::compaction_strategy::name(configured_compaction_strategy()) << "'";
|
||||
os << "\n AND compaction = {'class': '" << compaction::compaction_strategy::name(compaction_strategy()) << "'";
|
||||
map_as_cql_param(os, compaction_strategy_options(), false) << "}";
|
||||
os << "\n AND compression = {";
|
||||
map_as_cql_param(os, get_compressor_params().get_options());
|
||||
|
||||
@@ -5677,21 +5677,6 @@ class scylla_sstable_summary(gdb.Command):
|
||||
position: 0}
|
||||
|
||||
Keys are printed in the hexadecimal notation.
|
||||
|
||||
For ms-format (trie-based) sstables, displays data from the
|
||||
partitions db footer instead:
|
||||
|
||||
(gdb) scylla sstable-summary $sst
|
||||
sstable uses ms format (trie-based index).
|
||||
first_key: 63617373616e647261
|
||||
last_key: 63617373616e647261
|
||||
partition_count: 42
|
||||
trie_root_position: 12345
|
||||
|
||||
If the partitions db footer has not been lazily loaded yet (e.g. the
|
||||
sstable was opened but never read from), the command will report:
|
||||
|
||||
sstable uses ms format but partitions db footer is not loaded.
|
||||
"""
|
||||
def __init__(self):
|
||||
gdb.Command.__init__(self, 'scylla sstable-summary', gdb.COMMAND_USER, gdb.COMPLETE_NONE, True)
|
||||
@@ -5713,16 +5698,7 @@ class scylla_sstable_summary(gdb.Command):
|
||||
sst = arg
|
||||
ms_version = int(gdb.parse_and_eval('sstables::sstable_version_types::ms'))
|
||||
if int(sst['_version']) >= ms_version:
|
||||
footer_opt = std_optional(sst['_partitions_db_footer'])
|
||||
if not footer_opt:
|
||||
gdb.write("sstable uses ms format but partitions db footer is not loaded.\n")
|
||||
return
|
||||
footer = footer_opt.get()
|
||||
gdb.write("sstable uses ms format (trie-based index).\n")
|
||||
gdb.write("first_key: {}\n".format(sstring(footer['first_key']['_bytes'])))
|
||||
gdb.write("last_key: {}\n".format(sstring(footer['last_key']['_bytes'])))
|
||||
gdb.write("partition_count: {}\n".format(footer['partition_count']))
|
||||
gdb.write("trie_root_position: {}\n".format(footer['trie_root_position']))
|
||||
gdb.write("sstable uses ms format (trie-based index); summary is not populated.\n")
|
||||
return
|
||||
summary = seastar_lw_shared_ptr(sst['_components']['_value']).get().dereference()['summary']
|
||||
|
||||
|
||||
@@ -793,18 +793,16 @@ static future<> add_view_building_tasks_mutations(storage_proxy& sp, view_ptr vi
|
||||
|
||||
auto& db = sp.local_db();
|
||||
auto& sys_ks = sp.system_keyspace();
|
||||
auto& vb_sm = sp.view_building_state_machine();
|
||||
|
||||
auto base_id = view->view_info()->base_id();
|
||||
auto& base_cf = db.find_column_family(base_id);
|
||||
auto erm = base_cf.get_effective_replication_map();
|
||||
auto& tablet_map = erm->get_token_metadata().tablets().get_tablet_map(base_id);
|
||||
auto uuid_gen = vb_sm.building_state.make_task_uuid_generator(ts);
|
||||
|
||||
co_await tablet_map.for_each_tablet([&] (auto tid, const auto& tablet_info) -> future<> {
|
||||
auto last_token = tablet_map.get_last_token(tid);
|
||||
for (auto& replica: tablet_info.replicas) {
|
||||
auto id = uuid_gen();
|
||||
auto id = utils::UUID_gen::get_time_UUID();
|
||||
view_building_task task {
|
||||
id, view_building_task::task_type::build_range, false,
|
||||
base_id, view->id(), replica, last_token
|
||||
|
||||
@@ -590,7 +590,7 @@ private:
|
||||
|
||||
storage_proxy::clock_type::time_point timeout;
|
||||
if (!t) {
|
||||
auto timeout_in_ms = _sp._timeout_config.write_timeout_in_ms();
|
||||
auto timeout_in_ms = _sp._db.local().get_config().write_request_timeout_in_ms();
|
||||
timeout = clock_type::now() + std::chrono::milliseconds(timeout_in_ms);
|
||||
} else {
|
||||
timeout = *t;
|
||||
@@ -3321,8 +3321,7 @@ storage_proxy::~storage_proxy() {
|
||||
}
|
||||
|
||||
storage_proxy::storage_proxy(sharded<replica::database>& db, storage_proxy::config cfg, db::view::node_update_backlog& max_view_update_backlog,
|
||||
scheduling_group_key stats_key, gms::feature_service& feat, const locator::shared_token_metadata& stm, locator::effective_replication_map_factory& erm_factory,
|
||||
updateable_timeout_config& timeout_config)
|
||||
scheduling_group_key stats_key, gms::feature_service& feat, const locator::shared_token_metadata& stm, locator::effective_replication_map_factory& erm_factory)
|
||||
: _db(db)
|
||||
, _shared_token_metadata(stm)
|
||||
, _erm_factory(erm_factory)
|
||||
@@ -3342,7 +3341,6 @@ storage_proxy::storage_proxy(sharded<replica::database>& db, storage_proxy::conf
|
||||
, _background_write_throttle_threahsold(cfg.available_memory / 10)
|
||||
, _mutate_stage{"storage_proxy_mutate", &storage_proxy::do_mutate}
|
||||
, _max_view_update_backlog(max_view_update_backlog)
|
||||
, _timeout_config(timeout_config)
|
||||
, _cancellable_write_handlers_list(std::make_unique<cancellable_write_handlers_list>())
|
||||
{
|
||||
namespace sm = seastar::metrics;
|
||||
@@ -3972,7 +3970,7 @@ future<result<>> storage_proxy::mutate_begin(unique_response_handler_vector ids,
|
||||
// frozen_mutation copy, or manage handler live time differently.
|
||||
hint_to_dead_endpoints(response_id, cl);
|
||||
|
||||
auto timeout = timeout_opt.value_or(clock_type::now() + std::chrono::milliseconds(_timeout_config.write_timeout_in_ms()));
|
||||
auto timeout = timeout_opt.value_or(clock_type::now() + std::chrono::milliseconds(_db.local().get_config().write_request_timeout_in_ms()));
|
||||
// call before send_to_live_endpoints() for the same reason as above
|
||||
auto f = response_wait(response_id, timeout);
|
||||
send_to_live_endpoints(protected_response.release(), timeout); // response is now running and it will either complete or timeout
|
||||
@@ -5944,7 +5942,7 @@ public:
|
||||
// occur within write_timeout of a write, as these are the cases where repair is most
|
||||
// beneficial.
|
||||
if (is_datacenter_local(exec->_cl) && exec->_cmd->read_timestamp >= 0 && digest_resolver->last_modified() >= 0) {
|
||||
auto write_timeout = exec->_proxy->_timeout_config.write_timeout_in_ms() * 1000;
|
||||
auto write_timeout = exec->_proxy->_db.local().get_config().write_request_timeout_in_ms() * 1000;
|
||||
auto delta = int64_t(digest_resolver->last_modified()) - int64_t(exec->_cmd->read_timestamp);
|
||||
if (std::abs(delta) <= write_timeout) {
|
||||
exec->_proxy->get_stats().global_read_repairs_canceled_due_to_concurrent_write++;
|
||||
@@ -6068,7 +6066,7 @@ public:
|
||||
});
|
||||
auto& sr = _schema->speculative_retry();
|
||||
auto t = (sr.get_type() == speculative_retry::type::PERCENTILE) ?
|
||||
std::min(_cf->get_coordinator_read_latency_percentile(sr.get_value()), std::chrono::milliseconds(_proxy->_timeout_config.read_timeout_in_ms()/2)) :
|
||||
std::min(_cf->get_coordinator_read_latency_percentile(sr.get_value()), std::chrono::milliseconds(_proxy->get_db().local().get_config().read_request_timeout_in_ms()/2)) :
|
||||
std::chrono::milliseconds(unsigned(sr.get_value()));
|
||||
_speculate_timer.arm(t);
|
||||
resolver->set_on_disconnect([this] {
|
||||
@@ -6786,7 +6784,7 @@ storage_proxy::do_query_with_paxos(schema_ptr s,
|
||||
db::timeout_clock::time_point timeout = query_options.timeout(*this);
|
||||
// When to give up due to contention
|
||||
db::timeout_clock::time_point cas_timeout = db::timeout_clock::now() +
|
||||
std::chrono::milliseconds(_timeout_config.cas_timeout_in_ms());
|
||||
std::chrono::milliseconds(_db.local().get_config().cas_contention_timeout_in_ms());
|
||||
|
||||
struct read_cas_request : public cas_request {
|
||||
foreign_ptr<lw_shared_ptr<query::result>> res;
|
||||
|
||||
@@ -41,7 +41,6 @@
|
||||
#include "service/storage_service.hh"
|
||||
#include "service/cas_shard.hh"
|
||||
#include "service/maintenance_mode.hh"
|
||||
#include "timeout_config.hh"
|
||||
#include "service/storage_proxy_fwd.hh"
|
||||
|
||||
class reconcilable_result;
|
||||
@@ -320,7 +319,6 @@ private:
|
||||
lw_shared_ptr<cdc::operation_result_tracker>,
|
||||
coordinator_mutate_options> _mutate_stage;
|
||||
db::view::node_update_backlog& _max_view_update_backlog;
|
||||
updateable_timeout_config& _timeout_config;
|
||||
std::unordered_map<locator::host_id, view_update_backlog_timestamped> _view_update_backlogs;
|
||||
|
||||
//NOTICE(sarna): This opaque pointer is here just to avoid moving write handler class definitions from .cc to .hh. It's slow path.
|
||||
@@ -530,7 +528,7 @@ private:
|
||||
public:
|
||||
storage_proxy(sharded<replica::database>& db, config cfg, db::view::node_update_backlog& max_view_update_backlog,
|
||||
scheduling_group_key stats_key, gms::feature_service& feat, const locator::shared_token_metadata& stm,
|
||||
locator::effective_replication_map_factory& erm_factory, updateable_timeout_config& timeout_config);
|
||||
locator::effective_replication_map_factory& erm_factory);
|
||||
~storage_proxy();
|
||||
|
||||
const sharded<replica::database>& get_db() const {
|
||||
|
||||
@@ -806,7 +806,7 @@ future<> storage_service::view_building_state_load() {
|
||||
};
|
||||
|
||||
|
||||
auto [vb_tasks, min_alive_uuid] = co_await _sys_ks.local().get_view_building_tasks();
|
||||
auto vb_tasks = co_await _sys_ks.local().get_view_building_tasks();
|
||||
auto processing_base_table = co_await _sys_ks.local().get_view_building_processing_base_id();
|
||||
|
||||
std::map<table_id, std::vector<table_id>> views_per_base;
|
||||
@@ -825,7 +825,7 @@ future<> storage_service::view_building_state_load() {
|
||||
})
|
||||
| std::ranges::to<db::view::views_state::view_build_status_map>();
|
||||
|
||||
db::view::view_building_state building_state {std::move(vb_tasks), std::move(processing_base_table), std::move(min_alive_uuid)};
|
||||
db::view::view_building_state building_state {std::move(vb_tasks), std::move(processing_base_table)};
|
||||
db::view::views_state views_state {std::move(views_per_base), std::move(status_map)};
|
||||
|
||||
_view_building_state_machine.building_state = std::move(building_state);
|
||||
@@ -2731,23 +2731,13 @@ future<> storage_service::decommission(sharded<db::snapshot_ctl>& snapshot_ctl)
|
||||
throw std::runtime_error(::format("Node in {} state; wait for status to become normal or restart", ss._operation_mode));
|
||||
}
|
||||
|
||||
ss.raft_decommission().get();
|
||||
|
||||
// SCYLLADB-1693. In case we abort, the snapshot/backup mechanism need
|
||||
// to remain open. Move it to after raft_decommission.
|
||||
// In the case of a cluster snapshot, our nodes ownership
|
||||
// or not of tables will be serialized by raft anyway, so
|
||||
// should remain consistent. In that case we at worst coordinate
|
||||
// from a node in "leave" status
|
||||
// In the case of a local snapshot, ownership matters less,
|
||||
// only sstables on disk, which should not change.
|
||||
// In the case of backup, this operates on a snapshot, state of which
|
||||
// is not affected.
|
||||
snapshot_ctl.invoke_on_all([](auto& sctl) {
|
||||
return sctl.disable_all_operations();
|
||||
}).get();
|
||||
slogger.info("DECOMMISSIONING: disabled backup and snapshots");
|
||||
|
||||
ss.raft_decommission().get();
|
||||
|
||||
ss.stop_transport().get();
|
||||
slogger.info("DECOMMISSIONING: stopped transport");
|
||||
|
||||
@@ -4813,13 +4803,8 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
}
|
||||
} catch (const raft::request_aborted& e) {
|
||||
rtlogger.warn("raft_topology_cmd {} failed with: {}", cmd.cmd, e);
|
||||
result.error_message = e.what();
|
||||
} catch (const std::exception& e) {
|
||||
rtlogger.error("raft_topology_cmd {} failed with: {}", cmd.cmd, e);
|
||||
result.error_message = e.what();
|
||||
} catch (...) {
|
||||
rtlogger.error("raft_topology_cmd {} failed with: {}", cmd.cmd, std::current_exception());
|
||||
result.error_message = "unknown error";
|
||||
}
|
||||
|
||||
rtlogger.info("topology cmd rpc {} completed with status={} index={}",
|
||||
@@ -5637,71 +5622,6 @@ future<> storage_service::del_tablet_replica(table_id table, dht::token token, l
|
||||
});
|
||||
}
|
||||
|
||||
// Puts every tablet of `table` into the `restore` transition, passing along
// the snapshot name, endpoint and bucket, and resolves once no tablet of the
// table has transition info any more.
//
// The call is forwarded to shard 0 when invoked on any other shard.
future<> storage_service::restore_tablets(table_id table, sstring snap_name, sstring endpoint, sstring bucket) {
    auto gate_holder = _async_gate.hold();

    if (this_shard_id() != 0) {
        // group0 is only set on shard 0.
        co_await container().invoke_on(0, [&] (auto& ss) {
            return ss.restore_tablets(table, snap_name, endpoint, bucket);
        });
        co_return;
    }

    // Holding tm around transit_tablet() can lead to deadlock, if state machine is busy
    // with something which executes a barrier. The barrier will wait for tm to die, and
    // transit_tablet() will wait for the barrier to finish.
    // Due to that, we first collect tablet boundaries, then prepare and submit transition
    // mutations. Since this code is called with equal min:max tokens set for the table,
    // the tablet map cannot split and merge and, thus, the static vector of tokens should
    // map to correct tablet boundaries throughout the whole operation
    utils::chunked_vector<std::pair<locator::tablet_id, dht::token>> tablets;
    {
        // The token metadata pointer is scoped to this block so it is released
        // before any transition is submitted (see the comment above).
        const auto tm = get_token_metadata_ptr();
        const auto& tmap = tm->tablets().get_tablet_map(table);
        co_await tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info&) {
            tablets.emplace_back(tid, tmap.get_last_token(tid));
            return make_ready_future<>();
        });
    }

    // Resolves once the given tablet no longer has transition info.
    auto await_transition_end = [this] (locator::global_tablet_id gid) {
        return _topology_state_machine.event.wait([this, gid] {
            return !get_token_metadata().tablets().get_tablet_map(gid.table).get_tablet_transition_info(gid.tablet);
        });
    };

    // One future per successfully submitted restore transition.
    std::vector<future<>> pending;
    co_await coroutine::parallel_for_each(tablets, [&] (const auto& tablet) -> future<> {
        const locator::tablet_id tid = tablet.first;
        const dht::token last_token = tablet.second;
        const auto gid = locator::global_tablet_id{table, tid};
        for (;;) {
            const bool started = co_await try_transit_tablet(table, last_token, [&] (const locator::tablet_map& tmap, api::timestamp_type write_timestamp) {
                utils::chunked_vector<canonical_mutation> updates;
                updates.emplace_back(tablet_mutation_builder_for_base_table(write_timestamp, table)
                    .set_stage(last_token, locator::tablet_transition_stage::restore)
                    .set_new_replicas(last_token, tmap.get_tablet_info(tid).replicas)
                    .set_restore_config(last_token, locator::restore_config{ snap_name, endpoint, bucket })
                    .set_transition(last_token, locator::tablet_transition_kind::restore)
                    .build());

                sstring reason = format("Restoring tablet {}", gid);
                return std::make_tuple(std::move(updates), std::move(reason));
            });
            if (!started) {
                // The attempt was refused: wait for the tablet's current
                // transition to clear, then try again.
                slogger.debug("Tablet is in transition, waiting");
                co_await await_transition_end(gid);
                continue;
            }
            pending.emplace_back(await_transition_end(gid));
            break;
        }
    });

    co_await when_all_succeed(pending.begin(), pending.end()).discard_result();
    slogger.info("Restoring {} finished", table);
}
|
||||
|
||||
future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables() {
|
||||
auto holder = _async_gate.hold();
|
||||
|
||||
@@ -5784,21 +5704,6 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
|
||||
}
|
||||
|
||||
// Starts a tablet transition via try_transit_tablet() and waits until the
// tablet owning `token` has no transition info left.
//
// Throws std::runtime_error when try_transit_tablet() reports failure.
future<> storage_service::transit_tablet(table_id table, dht::token token, noncopyable_function<std::tuple<utils::chunked_vector<canonical_mutation>, sstring>(const locator::tablet_map&, api::timestamp_type)> prepare_mutations) {
    const bool started = co_await try_transit_tablet(table, token, std::move(prepare_mutations));
    if (started) {
        // Wait for transition to finish.
        co_await _topology_state_machine.event.when([&] {
            auto& current_map = get_token_metadata().tablets().get_tablet_map(table);
            const auto tablet = current_map.get_tablet_id(token);
            return !current_map.get_tablet_transition_info(tablet);
        });
        co_return;
    }

    // Not started: report which tablet is busy.
    auto& tmap = get_token_metadata().tablets().get_tablet_map(table);
    const auto tid = tmap.get_tablet_id(token);
    throw std::runtime_error(fmt::format("Tablet {} is in transition", locator::global_tablet_id{table, tid}));
}
|
||||
|
||||
future<bool> storage_service::try_transit_tablet(table_id table, dht::token token, noncopyable_function<std::tuple<utils::chunked_vector<canonical_mutation>, sstring>(const locator::tablet_map&, api::timestamp_type)> prepare_mutations) {
|
||||
while (true) {
|
||||
auto guard = co_await _group0->client().start_operation(_group0_as, raft_timeout{});
|
||||
bool topology_busy;
|
||||
@@ -5818,7 +5723,7 @@ future<bool> storage_service::try_transit_tablet(table_id table, dht::token toke
|
||||
auto& tmap = get_token_metadata().tablets().get_tablet_map(table);
|
||||
auto tid = tmap.get_tablet_id(token);
|
||||
if (tmap.get_tablet_transition_info(tid)) {
|
||||
co_return false;
|
||||
throw std::runtime_error(fmt::format("Tablet {} is in transition", locator::global_tablet_id{table, tid}));
|
||||
}
|
||||
|
||||
auto [ updates, reason ] = prepare_mutations(tmap, guard.write_timestamp());
|
||||
@@ -5848,7 +5753,11 @@ future<bool> storage_service::try_transit_tablet(table_id table, dht::token toke
|
||||
}
|
||||
}
|
||||
|
||||
co_return true;
|
||||
// Wait for transition to finish.
|
||||
co_await _topology_state_machine.event.when([&] {
|
||||
auto& tmap = get_token_metadata().tablets().get_tablet_map(table);
|
||||
return !tmap.get_tablet_transition_info(tmap.get_tablet_id(token));
|
||||
});
|
||||
}
|
||||
|
||||
future<> storage_service::set_tablet_balancing_enabled(bool enabled) {
|
||||
@@ -6255,15 +6164,6 @@ node_state storage_service::get_node_state(locator::host_id id) {
|
||||
return p->second.state;
|
||||
}
|
||||
|
||||
// Validates an incoming raft RPC addressed to `dst_id`.
//
// Throws std::runtime_error if group 0 is not set or has not been joined yet,
// and raft_destination_id_not_correct if `dst_id` differs from this node's id.
void storage_service::check_raft_rpc(raft::server_id dst_id) {
    const bool joined = _group0 && _group0->joined_group0();
    if (!joined) {
        throw std::runtime_error("The node did not join group 0 yet");
    }

    const bool addressed_elsewhere = _group0->load_my_id() != dst_id;
    if (addressed_elsewhere) {
        throw raft_destination_id_not_correct(_group0->load_my_id(), dst_id);
    }
}
|
||||
|
||||
void storage_service::init_messaging_service() {
|
||||
ser::node_ops_rpc_verbs::register_node_ops_cmd(&_messaging.local(), [this] (const rpc::client_info& cinfo, node_ops_cmd_request req) {
|
||||
auto coordinator = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
|
||||
@@ -6275,6 +6175,17 @@ void storage_service::init_messaging_service() {
|
||||
return ss.node_ops_cmd_handler(coordinator, coordinator_host_id, std::move(req));
|
||||
});
|
||||
});
|
||||
auto handle_raft_rpc = [this] (raft::server_id dst_id, auto handler) {
|
||||
return container().invoke_on(0, [dst_id, handler = std::move(handler)] (auto& ss) mutable {
|
||||
if (!ss._group0 || !ss._group0->joined_group0()) {
|
||||
throw std::runtime_error("The node did not join group 0 yet");
|
||||
}
|
||||
if (ss._group0->load_my_id() != dst_id) {
|
||||
throw raft_destination_id_not_correct(ss._group0->load_my_id(), dst_id);
|
||||
}
|
||||
return handler(ss);
|
||||
});
|
||||
};
|
||||
ser::streaming_rpc_verbs::register_tablet_stream_files(&_messaging.local(),
|
||||
[this] (const rpc::client_info& cinfo, streaming::stream_files_request req) -> future<streaming::stream_files_response> {
|
||||
streaming::stream_files_response resp;
|
||||
@@ -6286,13 +6197,13 @@ void storage_service::init_messaging_service() {
|
||||
std::plus<size_t>());
|
||||
co_return resp;
|
||||
});
|
||||
ser::storage_service_rpc_verbs::register_raft_topology_cmd(&_messaging.local(), [this] (raft::server_id dst_id, raft::term_t term, uint64_t cmd_index, raft_topology_cmd cmd) {
|
||||
ser::storage_service_rpc_verbs::register_raft_topology_cmd(&_messaging.local(), [handle_raft_rpc] (raft::server_id dst_id, raft::term_t term, uint64_t cmd_index, raft_topology_cmd cmd) {
|
||||
return handle_raft_rpc(dst_id, [cmd = std::move(cmd), term, cmd_index] (auto& ss) {
|
||||
check_raft_rpc_scheduling_group(ss._db.local(), ss._feature_service, "raft_topology_cmd");
|
||||
return ss.raft_topology_cmd_handler(term, cmd_index, cmd);
|
||||
});
|
||||
});
|
||||
ser::storage_service_rpc_verbs::register_raft_pull_snapshot(&_messaging.local(), [this] (raft::server_id dst_id, raft_snapshot_pull_params params) {
|
||||
ser::storage_service_rpc_verbs::register_raft_pull_snapshot(&_messaging.local(), [handle_raft_rpc] (raft::server_id dst_id, raft_snapshot_pull_params params) {
|
||||
return handle_raft_rpc(dst_id, [params = std::move(params)] (storage_service& ss) -> future<raft_snapshot> {
|
||||
check_raft_rpc_scheduling_group(ss._db.local(), ss._feature_service, "raft_pull_snapshot");
|
||||
utils::chunked_vector<canonical_mutation> mutations;
|
||||
@@ -6387,28 +6298,28 @@ void storage_service::init_messaging_service() {
|
||||
};
|
||||
});
|
||||
});
|
||||
ser::storage_service_rpc_verbs::register_tablet_stream_data(&_messaging.local(), [this] (raft::server_id dst_id, locator::global_tablet_id tablet) {
|
||||
ser::storage_service_rpc_verbs::register_tablet_stream_data(&_messaging.local(), [handle_raft_rpc] (raft::server_id dst_id, locator::global_tablet_id tablet) {
|
||||
return handle_raft_rpc(dst_id, [tablet] (auto& ss) {
|
||||
return ss.stream_tablet(tablet);
|
||||
});
|
||||
});
|
||||
ser::storage_service_rpc_verbs::register_tablet_repair(&_messaging.local(), [this] (raft::server_id dst_id, locator::global_tablet_id tablet, rpc::optional<service::session_id> session_id) {
|
||||
ser::storage_service_rpc_verbs::register_tablet_repair(&_messaging.local(), [handle_raft_rpc] (raft::server_id dst_id, locator::global_tablet_id tablet, rpc::optional<service::session_id> session_id) {
|
||||
return handle_raft_rpc(dst_id, [tablet, session_id = session_id.value_or(service::session_id::create_null_id())] (auto& ss) -> future<service::tablet_operation_repair_result> {
|
||||
auto res = co_await ss.repair_tablet(tablet, session_id);
|
||||
co_return res;
|
||||
});
|
||||
});
|
||||
ser::storage_service_rpc_verbs::register_tablet_cleanup(&_messaging.local(), [this] (raft::server_id dst_id, locator::global_tablet_id tablet) {
|
||||
ser::storage_service_rpc_verbs::register_tablet_cleanup(&_messaging.local(), [handle_raft_rpc] (raft::server_id dst_id, locator::global_tablet_id tablet) {
|
||||
return handle_raft_rpc(dst_id, [tablet] (auto& ss) {
|
||||
return ss.cleanup_tablet(tablet);
|
||||
});
|
||||
});
|
||||
ser::storage_service_rpc_verbs::register_table_load_stats(&_messaging.local(), [this] (raft::server_id dst_id) {
|
||||
ser::storage_service_rpc_verbs::register_table_load_stats(&_messaging.local(), [handle_raft_rpc] (raft::server_id dst_id) {
|
||||
return handle_raft_rpc(dst_id, [] (auto& ss) mutable {
|
||||
return ss.load_stats_for_tablet_based_tables();
|
||||
});
|
||||
});
|
||||
ser::storage_service_rpc_verbs::register_table_load_stats_v1(&_messaging.local(), [this] (raft::server_id dst_id) {
|
||||
ser::storage_service_rpc_verbs::register_table_load_stats_v1(&_messaging.local(), [handle_raft_rpc] (raft::server_id dst_id) {
|
||||
return handle_raft_rpc(dst_id, [] (auto& ss) mutable {
|
||||
return ss.load_stats_for_tablet_based_tables().then([] (auto stats) {
|
||||
return locator::load_stats_v1{ .tables = std::move(stats.tables) };
|
||||
@@ -6429,7 +6340,7 @@ void storage_service::init_messaging_service() {
|
||||
ser::storage_service_rpc_verbs::register_sample_sstables(&_messaging.local(), [this] (table_id table, uint64_t chunk_size, uint64_t n_chunks) -> future<utils::chunked_vector<temporary_buffer<char>>> {
|
||||
return _db.local().sample_data_files(table, chunk_size, n_chunks);
|
||||
});
|
||||
ser::join_node_rpc_verbs::register_join_node_request(&_messaging.local(), [this] (raft::server_id dst_id, service::join_node_request_params params) {
|
||||
ser::join_node_rpc_verbs::register_join_node_request(&_messaging.local(), [handle_raft_rpc] (raft::server_id dst_id, service::join_node_request_params params) {
|
||||
return handle_raft_rpc(dst_id, [params = std::move(params)] (auto& ss) mutable {
|
||||
check_raft_rpc_scheduling_group(ss._db.local(), ss._feature_service, "join_node_request");
|
||||
return ss.join_node_request_handler(std::move(params));
|
||||
@@ -6445,7 +6356,7 @@ void storage_service::init_messaging_service() {
|
||||
co_return co_await ss.join_node_response_handler(std::move(params));
|
||||
});
|
||||
});
|
||||
ser::join_node_rpc_verbs::register_join_node_query(&_messaging.local(), [this] (raft::server_id dst_id, service::join_node_query_params) {
|
||||
ser::join_node_rpc_verbs::register_join_node_query(&_messaging.local(), [handle_raft_rpc] (raft::server_id dst_id, service::join_node_query_params) {
|
||||
return handle_raft_rpc(dst_id, [] (auto& ss) -> future<join_node_query_result> {
|
||||
check_raft_rpc_scheduling_group(ss._db.local(), ss._feature_service, "join_node_query");
|
||||
auto result = join_node_query_result{
|
||||
|
||||
@@ -230,6 +230,9 @@ private:
|
||||
shared_ptr<service::topo::task_manager_module> _global_topology_requests_module;
|
||||
shared_ptr<service::vnodes_to_tablets::task_manager_module> _vnodes_to_tablets_migration_module;
|
||||
gms::gossip_address_map& _address_map;
|
||||
future<service::tablet_operation_result> do_tablet_operation(locator::global_tablet_id tablet,
|
||||
sstring op_name,
|
||||
std::function<future<service::tablet_operation_result>(locator::tablet_metadata_guard&)> op);
|
||||
future<service::tablet_operation_repair_result> repair_tablet(locator::global_tablet_id, service::session_id);
|
||||
future<> stream_tablet(locator::global_tablet_id);
|
||||
// Clones storage of leaving tablet into pending one. Done in the context of intra-node migration,
|
||||
@@ -241,20 +244,7 @@ private:
|
||||
future<> process_tablet_split_candidate(table_id) noexcept;
|
||||
void register_tablet_split_candidate(table_id) noexcept;
|
||||
future<> run_tablet_split_monitor();
|
||||
void check_raft_rpc(raft::server_id dst);
|
||||
public:
|
||||
future<service::tablet_operation_result> do_tablet_operation(locator::global_tablet_id tablet,
|
||||
sstring op_name,
|
||||
std::function<future<service::tablet_operation_result>(locator::tablet_metadata_guard&)> op);
|
||||
|
||||
template <typename Func>
|
||||
auto handle_raft_rpc(raft::server_id dst_id, Func&& handler) {
|
||||
return container().invoke_on(0, [dst_id, handler = std::forward<Func>(handler)] (auto& ss) mutable {
|
||||
ss.check_raft_rpc(dst_id);
|
||||
return handler(ss);
|
||||
});
|
||||
};
|
||||
|
||||
storage_service(abort_source& as, sharded<replica::database>& db,
|
||||
gms::gossiper& gossiper,
|
||||
sharded<db::system_keyspace>&,
|
||||
@@ -961,7 +951,6 @@ private:
|
||||
future<> _upgrade_to_topology_coordinator_fiber = make_ready_future<>();
|
||||
|
||||
future<> transit_tablet(table_id, dht::token, noncopyable_function<std::tuple<utils::chunked_vector<canonical_mutation>, sstring>(const locator::tablet_map& tmap, api::timestamp_type)> prepare_mutations);
|
||||
future<bool> try_transit_tablet(table_id, dht::token, noncopyable_function<std::tuple<utils::chunked_vector<canonical_mutation>, sstring>(const locator::tablet_map& tmap, api::timestamp_type)> prepare_mutations);
|
||||
future<service::group0_guard> get_guard_for_tablet_update();
|
||||
future<bool> exec_tablet_update(service::group0_guard guard, utils::chunked_vector<canonical_mutation> updates, sstring reason);
|
||||
public:
|
||||
@@ -971,7 +960,6 @@ public:
|
||||
future<> move_tablet(table_id, dht::token, locator::tablet_replica src, locator::tablet_replica dst, loosen_constraints force = loosen_constraints::no);
|
||||
future<> add_tablet_replica(table_id, dht::token, locator::tablet_replica dst, loosen_constraints force = loosen_constraints::no);
|
||||
future<> del_tablet_replica(table_id, dht::token, locator::tablet_replica dst, loosen_constraints force = loosen_constraints::no);
|
||||
future<> restore_tablets(table_id, sstring snap_name, sstring endpoint, sstring bucket);
|
||||
future<> set_tablet_balancing_enabled(bool);
|
||||
|
||||
future<> await_topology_quiesced();
|
||||
|
||||
@@ -19,10 +19,10 @@
|
||||
#include "idl/strong_consistency/state_machine.dist.hh"
|
||||
#include "idl/strong_consistency/state_machine.dist.impl.hh"
|
||||
#include "gms/gossiper.hh"
|
||||
#include "utils/histogram_metrics_helper.hh"
|
||||
|
||||
namespace service::strong_consistency {
|
||||
|
||||
|
||||
static logging::logger logger("sc_coordinator");
|
||||
|
||||
// FIXME: Once the drivers support new error codes corresponding
|
||||
@@ -49,68 +49,6 @@ struct read_timeout : public exceptions::read_timeout_exception {
|
||||
{}
|
||||
};
|
||||
|
||||
void stats::register_stats() {
|
||||
namespace sm = seastar::metrics;
|
||||
sm::label reason_label("reason");
|
||||
|
||||
_metrics.add_group("strong_consistency_coordinator", {
|
||||
sm::make_summary("write_latency_summary", sm::description("Strong consistency write latency summary"),
|
||||
[this] { return to_metrics_summary(write.summary()); }).set_skip_when_empty(),
|
||||
|
||||
sm::make_histogram("write_latency", sm::description("Strong consistency write latency histogram"),
|
||||
{}, [this] { return to_metrics_histogram(write.histogram()); })
|
||||
.aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
|
||||
|
||||
sm::make_counter("write_errors", write_errors_timeout,
|
||||
sm::description("number of strong consistency write requests that failed"),
|
||||
{reason_label("timeout")})
|
||||
.set_skip_when_empty(),
|
||||
|
||||
sm::make_counter("write_errors", write_errors_status_unknown,
|
||||
sm::description("number of strong consistency write requests that failed"),
|
||||
{reason_label("status_unknown")})
|
||||
.set_skip_when_empty(),
|
||||
|
||||
sm::make_counter("write_errors", write_errors_other,
|
||||
sm::description("number of strong consistency write requests that failed"),
|
||||
{reason_label("other")})
|
||||
.set_skip_when_empty(),
|
||||
|
||||
sm::make_counter("write_node_bounces", write_node_bounces,
|
||||
sm::description("number of strong consistency write requests bounced to another node"))
|
||||
.set_skip_when_empty(),
|
||||
|
||||
sm::make_counter("write_shard_bounces", write_shard_bounces,
|
||||
sm::description("number of strong consistency write requests bounced to another shard"))
|
||||
.set_skip_when_empty(),
|
||||
|
||||
sm::make_summary("read_latency_summary", sm::description("Strong consistency read latency summary"),
|
||||
[this] { return to_metrics_summary(read.summary()); }).set_skip_when_empty(),
|
||||
|
||||
sm::make_histogram("read_latency", sm::description("Strong consistency read latency histogram"),
|
||||
{}, [this] { return to_metrics_histogram(read.histogram()); })
|
||||
.aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
|
||||
|
||||
sm::make_counter("read_errors", read_errors_timeout,
|
||||
sm::description("number of strong consistency read requests that failed"),
|
||||
{reason_label("timeout")})
|
||||
.set_skip_when_empty(),
|
||||
|
||||
sm::make_counter("read_errors", read_errors_other,
|
||||
sm::description("number of strong consistency read requests that failed"),
|
||||
{reason_label("other")})
|
||||
.set_skip_when_empty(),
|
||||
|
||||
sm::make_counter("read_node_bounces", read_node_bounces,
|
||||
sm::description("number of strong consistency read requests bounced to another node"))
|
||||
.set_skip_when_empty(),
|
||||
|
||||
sm::make_counter("read_shard_bounces", read_shard_bounces,
|
||||
sm::description("number of strong consistency read requests bounced to another shard"))
|
||||
.set_skip_when_empty(),
|
||||
});
|
||||
}
|
||||
|
||||
static const locator::tablet_replica* find_replica(const locator::tablet_info& tinfo, locator::host_id id) {
|
||||
const auto it = std::ranges::find_if(tinfo.replicas,
|
||||
[&] (const locator::tablet_replica& r) {
|
||||
@@ -232,7 +170,6 @@ coordinator::coordinator(groups_manager& groups_manager, replica::database& db,
|
||||
, _db(db)
|
||||
, _gossiper(gossiper)
|
||||
{
|
||||
_stats.register_stats();
|
||||
}
|
||||
|
||||
future<value_or_redirect<>> coordinator::mutate(schema_ptr schema,
|
||||
@@ -244,11 +181,6 @@ future<value_or_redirect<>> coordinator::mutate(schema_ptr schema,
|
||||
auto aoe = abort_on_expiry<timeout_clock>(timeout);
|
||||
[[maybe_unused]] const auto subs = chain_abort_sources(aoe.abort_source(), as);
|
||||
|
||||
utils::latency_counter lc;
|
||||
lc.start();
|
||||
auto mark_write_latency = defer([this, &lc] { _stats.write.mark(lc.stop().latency()); });
|
||||
bool commit_status_unknown_ex = false;
|
||||
|
||||
try {
|
||||
auto op_result = co_await create_operation_ctx(*schema, token, aoe.abort_source());
|
||||
if (const auto* redirect = get_if<need_redirect>(&op_result)) {
|
||||
@@ -313,11 +245,7 @@ future<value_or_redirect<>> coordinator::mutate(schema_ptr schema,
|
||||
logger.debug("mutate(): add_entry, got commit_status_unknown {}, table {}.{}, tablet {}, term {}",
|
||||
ex, schema->ks_name(), schema->cf_name(), op.tablet_id, term);
|
||||
|
||||
++_stats.write_errors_status_unknown;
|
||||
// FIXME: use a dedicated ERROR_CODE instead of SERVER_ERROR
|
||||
// FIXME: when a dedicated ERROR_CODE will be used,
|
||||
// we can get rid of the boolean flag
|
||||
commit_status_unknown_ex = true;
|
||||
throw exceptions::server_exception(
|
||||
"The outcome of this statement is unknown. It may or may not have been applied. "
|
||||
"Retrying the statement may be necessary.");
|
||||
@@ -343,12 +271,8 @@ future<value_or_redirect<>> coordinator::mutate(schema_ptr schema,
|
||||
|| try_catch<seastar::timed_out_error>(ex) || try_catch<seastar::condition_variable_timed_out>(ex)) {
|
||||
logger.trace("mutate(): request timed out with error {}, table {}.{}, token {}",
|
||||
ex, schema->ks_name(), schema->cf_name(), token);
|
||||
++_stats.write_errors_timeout;
|
||||
co_return coroutine::return_exception(write_timeout(schema->ks_name(), schema->cf_name()));
|
||||
} else {
|
||||
if (!commit_status_unknown_ex) {
|
||||
++_stats.write_errors_other;
|
||||
}
|
||||
logger.trace("mutate(): unknown exception {}, table {}.{}, token {}",
|
||||
ex, schema->ks_name(), schema->cf_name(), token);
|
||||
// We know nothing about other errors. Let the CQL server convert them to SERVER_ERROR.
|
||||
@@ -368,10 +292,6 @@ auto coordinator::query(schema_ptr schema,
|
||||
auto aoe = abort_on_expiry<timeout_clock>(timeout);
|
||||
[[maybe_unused]] const auto subs = chain_abort_sources(aoe.abort_source(), as);
|
||||
|
||||
utils::latency_counter lc;
|
||||
lc.start();
|
||||
auto mark_read_latency = defer([this, &lc] { _stats.read.mark(lc.stop().latency()); });
|
||||
|
||||
try {
|
||||
auto op_result = co_await create_operation_ctx(*schema, ranges[0].start()->value().token(), aoe.abort_source());
|
||||
if (const auto* redirect = get_if<need_redirect>(&op_result)) {
|
||||
@@ -403,12 +323,10 @@ auto coordinator::query(schema_ptr schema,
|
||||
|| try_catch<timed_out_error>(ex)) {
|
||||
logger.trace("query(): request timed out with error {}, table {}.{}, read cmd {}",
|
||||
ex, schema->ks_name(), schema->cf_name(), cmd);
|
||||
++_stats.read_errors_timeout;
|
||||
co_return coroutine::return_exception(read_timeout(schema->ks_name(), schema->cf_name()));
|
||||
} else {
|
||||
logger.trace("mutate(): unknown exception {}, table {}.{}, read cmd {}",
|
||||
ex, schema->ks_name(), schema->cf_name(), cmd);
|
||||
++_stats.read_errors_other;
|
||||
// We know nothing about other errors. Let the CQL server convert them to SERVER_ERROR.
|
||||
throw;
|
||||
}
|
||||
|
||||
@@ -10,8 +10,6 @@
|
||||
|
||||
#include "mutation/mutation.hh"
|
||||
#include "query/query-result.hh"
|
||||
#include "utils/histogram.hh"
|
||||
#include <seastar/core/metrics.hh>
|
||||
|
||||
namespace gms {
|
||||
|
||||
@@ -29,25 +27,6 @@ struct need_redirect {
|
||||
template <typename T = std::monostate>
|
||||
using value_or_redirect = std::variant<T, need_redirect>;
|
||||
|
||||
struct stats {
|
||||
utils::timed_rate_moving_average_summary_and_histogram write;
|
||||
uint64_t write_errors_timeout = 0;
|
||||
uint64_t write_errors_status_unknown = 0;
|
||||
uint64_t write_errors_other = 0;
|
||||
uint64_t write_node_bounces = 0;
|
||||
uint64_t write_shard_bounces = 0;
|
||||
|
||||
utils::timed_rate_moving_average_summary_and_histogram read;
|
||||
uint64_t read_errors_timeout = 0;
|
||||
uint64_t read_errors_other = 0;
|
||||
uint64_t read_node_bounces = 0;
|
||||
uint64_t read_shard_bounces = 0;
|
||||
|
||||
seastar::metrics::metric_groups _metrics;
|
||||
|
||||
void register_stats();
|
||||
};
|
||||
|
||||
class coordinator : public peering_sharded_service<coordinator> {
|
||||
public:
|
||||
using timeout_clock = typename db::timeout_clock;
|
||||
@@ -56,7 +35,6 @@ private:
|
||||
groups_manager& _groups_manager;
|
||||
replica::database& _db;
|
||||
gms::gossiper& _gossiper;
|
||||
stats _stats;
|
||||
|
||||
struct operation_ctx;
|
||||
future<value_or_redirect<operation_ctx>> create_operation_ctx(const schema& schema,
|
||||
@@ -65,8 +43,6 @@ private:
|
||||
public:
|
||||
coordinator(groups_manager& groups_manager, replica::database& db, gms::gossiper& gossiper);
|
||||
|
||||
stats& get_stats() { return _stats; }
|
||||
|
||||
using mutation_gen = noncopyable_function<mutation(api::timestamp_type)>;
|
||||
future<value_or_redirect<>> mutate(schema_ptr schema,
|
||||
const dht::token& token,
|
||||
|
||||
@@ -57,8 +57,6 @@ void load_balancer_stats_manager::setup_metrics(const dc_name& dc, load_balancer
|
||||
stats.migrations_skipped)(dc_lb),
|
||||
sm::make_counter("cross_rack_collocations", sm::description("number of co-locating migrations which move replica across racks"),
|
||||
stats.cross_rack_collocations)(dc_lb),
|
||||
sm::make_counter("rebuilds_produced", sm::description("number of rebuilds produced by the load balancer"),
|
||||
stats.rebuilds_produced)(dc_lb),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -85,9 +83,7 @@ void load_balancer_stats_manager::setup_metrics(load_balancer_cluster_stats& sta
|
||||
sm::make_counter("auto_repair_needs_repair_nr", sm::description("number of tablets with auto repair enabled that currently needs repair"),
|
||||
stats.auto_repair_needs_repair_nr),
|
||||
sm::make_counter("auto_repair_enabled_nr", sm::description("number of tablets with auto repair enabled"),
|
||||
stats.auto_repair_enabled_nr),
|
||||
sm::make_counter("repairs_produced", sm::description("number of repairs produced by the load balancer"),
|
||||
stats.repairs_produced),
|
||||
stats.auto_repair_enabled_nr)
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1014,8 +1010,6 @@ private:
|
||||
return true;
|
||||
case tablet_transition_stage::repair:
|
||||
return true;
|
||||
case tablet_transition_stage::restore:
|
||||
return false;
|
||||
case tablet_transition_stage::end_repair:
|
||||
return false;
|
||||
case tablet_transition_stage::write_both_read_new:
|
||||
@@ -1350,7 +1344,6 @@ public:
|
||||
auto range = tmap.get_token_range(id);
|
||||
auto last_token = tmap.get_last_token(id);
|
||||
plans.push_back(repair_plan{gid, info, range, last_token, diff, is_user_reuqest});
|
||||
++_stats.for_cluster().repairs_produced;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -3962,10 +3955,6 @@ public:
|
||||
_current_stats->migrations_produced++;
|
||||
mark_as_scheduled(mig);
|
||||
plan.add(std::move(mig));
|
||||
|
||||
if (kind == tablet_transition_kind::rebuild || kind == tablet_transition_kind::rebuild_v2) {
|
||||
++_current_stats->rebuilds_produced;
|
||||
}
|
||||
} else {
|
||||
// Shards are overloaded with streaming. Do not include the migration in the plan, but
|
||||
// continue as if it was in the hope that we will find a migration which can be executed without
|
||||
@@ -4263,10 +4252,10 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
// Only excluded nodes are allowed to have incomplete tablet stats
|
||||
// For size based balancing, only excluded nodes are allowed to have incomplete tablet stats
|
||||
for (auto& [host, node] : nodes) {
|
||||
if (!_load_sketch->has_complete_data(host)) {
|
||||
if (node.drained && node.node->is_excluded()) {
|
||||
if (!_force_capacity_based_balancing && node.drained && node.node->is_excluded()) {
|
||||
_load_sketch->ignore_incomplete_data(host);
|
||||
} else {
|
||||
lblogger.info("Cannot balance because node {} (or more) has incomplete tablet stats", host);
|
||||
|
||||
@@ -48,7 +48,6 @@ struct load_balancer_dc_stats {
|
||||
uint64_t stop_skip_limit = 0;
|
||||
uint64_t stop_batch_size = 0;
|
||||
uint64_t cross_rack_collocations = 0;
|
||||
uint64_t rebuilds_produced = 0;
|
||||
|
||||
load_balancer_dc_stats operator-(const load_balancer_dc_stats& other) const {
|
||||
return {
|
||||
@@ -68,7 +67,6 @@ struct load_balancer_dc_stats {
|
||||
stop_skip_limit - other.stop_skip_limit,
|
||||
stop_batch_size - other.stop_batch_size,
|
||||
cross_rack_collocations - other.cross_rack_collocations,
|
||||
rebuilds_produced - other.rebuilds_produced,
|
||||
};
|
||||
}
|
||||
};
|
||||
@@ -96,8 +94,6 @@ struct load_balancer_cluster_stats {
|
||||
uint64_t resizes_finalized = 0;
|
||||
uint64_t auto_repair_needs_repair_nr = 0;
|
||||
uint64_t auto_repair_enabled_nr = 0;
|
||||
|
||||
uint64_t repairs_produced = 0;
|
||||
};
|
||||
|
||||
using dc_name = sstring;
|
||||
|
||||
@@ -63,7 +63,6 @@
|
||||
#include "utils/stall_free.hh"
|
||||
#include "utils/to_string.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
#include "sstables_loader.hh"
|
||||
|
||||
#include "idl/join_node.dist.hh"
|
||||
#include "idl/storage_service.dist.hh"
|
||||
@@ -73,7 +72,6 @@
|
||||
#include "utils/updateable_value.hh"
|
||||
#include "repair/repair.hh"
|
||||
#include "idl/repair.dist.hh"
|
||||
#include "idl/sstables_loader.dist.hh"
|
||||
|
||||
#include "service/topology_coordinator.hh"
|
||||
|
||||
@@ -445,11 +443,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
co_await ser::storage_service_rpc_verbs::send_raft_topology_cmd(
|
||||
&_messaging, to_host_id(id), id, _term, cmd_index, cmd);
|
||||
if (result.status == raft_topology_cmd_result::command_status::fail) {
|
||||
auto msg = result.error_message.empty()
|
||||
? ::format("failed status returned from {}", id)
|
||||
: ::format("failed status returned from {}: {}", id, result.error_message);
|
||||
co_await coroutine::exception(std::make_exception_ptr(
|
||||
std::runtime_error(std::move(msg))));
|
||||
std::runtime_error(::format("failed status returned from {}", id))));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1558,7 +1553,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
background_action_holder cleanup;
|
||||
background_action_holder repair;
|
||||
background_action_holder repair_update_compaction_ctrl;
|
||||
background_action_holder restore;
|
||||
std::unordered_map<locator::tablet_transition_stage, background_action_holder> barriers;
|
||||
// Record the repair_time returned by the repair_tablet rpc call
|
||||
db_clock::time_point repair_time;
|
||||
@@ -2331,33 +2325,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
}
|
||||
}
|
||||
break;
|
||||
case locator::tablet_transition_stage::restore: {
|
||||
if (!trinfo.restore_cfg.has_value()) {
|
||||
on_internal_error(rtlogger, format("Cannot handle restore transition without config for tablet {}", gid));
|
||||
}
|
||||
if (action_failed(tablet_state.restore)) {
|
||||
rtlogger.debug("Clearing restore transition for {} due to error", gid);
|
||||
updates.emplace_back(get_mutation_builder().del_transition(last_token).del_restore_config(last_token).build());
|
||||
break;
|
||||
}
|
||||
if (advance_in_background(gid, tablet_state.restore, "restore", [this, gid, &tmap] () -> future<> {
|
||||
auto& tinfo = tmap.get_tablet_info(gid.tablet);
|
||||
auto replicas = tinfo.replicas;
|
||||
|
||||
rtlogger.info("Restoring tablet={} on {}", gid, replicas);
|
||||
co_await coroutine::parallel_for_each(replicas, [this, gid] (locator::tablet_replica r) -> future<> {
|
||||
auto dst = raft::server_id(r.host.uuid());
|
||||
if (!is_excluded(dst)) {
|
||||
co_await ser::sstables_loader_rpc_verbs::send_restore_tablet(&_messaging, r.host, dst, gid);
|
||||
rtlogger.debug("Tablet {} restored on {}", gid, r.host);
|
||||
}
|
||||
});
|
||||
})) {
|
||||
rtlogger.debug("Clearing restore transition for {}", gid);
|
||||
updates.emplace_back(get_mutation_builder().del_transition(last_token).del_restore_config(last_token).build());
|
||||
}
|
||||
}
|
||||
break;
|
||||
case locator::tablet_transition_stage::end_repair: {
|
||||
if (do_barrier()) {
|
||||
if (tablet_state.session_id.uuid().is_null()) {
|
||||
@@ -2544,8 +2511,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
break;
|
||||
case locator::tablet_transition_kind::repair:
|
||||
[[fallthrough]];
|
||||
case locator::tablet_transition_kind::restore:
|
||||
[[fallthrough]];
|
||||
case locator::tablet_transition_kind::intranode_migration:
|
||||
break;
|
||||
}
|
||||
@@ -3846,9 +3811,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
on_internal_error(rtlogger, ::format("Leaving node {} doesn't own tokens", node.id));
|
||||
}
|
||||
|
||||
// Leave break point. For testing decommission
|
||||
co_await utils::get_local_injector().inject("topology_coordinator_before_leave", utils::wait_for_message(std::chrono::minutes(2)));
|
||||
|
||||
auto validation_result = validate_removing_node(_db, to_host_id(node.id));
|
||||
if (std::holds_alternative<node_validation_failure>(validation_result)) {
|
||||
builder.with_node(node.id)
|
||||
@@ -3947,15 +3909,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
throw;
|
||||
} catch (seastar::abort_requested_exception&) {
|
||||
throw;
|
||||
} catch (const std::exception& e) {
|
||||
rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
|
||||
" (node state is rebuilding): {}", e);
|
||||
rtbuilder.done(e.what());
|
||||
retake = true;
|
||||
} catch (...) {
|
||||
rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
|
||||
" (node state is rebuilding): {}", std::current_exception());
|
||||
rtbuilder.done("unknown error");
|
||||
rtbuilder.done("streaming failed");
|
||||
retake = true;
|
||||
}
|
||||
if (retake) {
|
||||
|
||||
@@ -318,9 +318,6 @@ struct raft_topology_cmd_result {
|
||||
success
|
||||
};
|
||||
command_status status = command_status::fail;
|
||||
// Carries the error description back to the topology coordinator
|
||||
// when the command fails.
|
||||
sstring error_message;
|
||||
};
|
||||
|
||||
// This class is used in RPC's signatures to hold the topology_version of the caller.
|
||||
|
||||
@@ -115,7 +115,7 @@ public:
|
||||
if (buf.size() != chunk_size) {
|
||||
auto actual_end = _underlying_pos + buf.size();
|
||||
if (chunk_index + 1 < _checksum.checksums.size()) {
|
||||
throw_malformed_sstable_exception(seastar::format("Checksummed reader hit premature end-of-file at file offset {}: expected {} chunks of size {} but data file has {}",
|
||||
throw malformed_sstable_exception(seastar::format("Checksummed reader hit premature end-of-file at file offset {}: expected {} chunks of size {} but data file has {}",
|
||||
actual_end, _checksum.checksums.size(), chunk_size, chunk_index + 1));
|
||||
} else if (actual_end < _file_len) {
|
||||
// Truncation on last chunk. Update _end_pos so that future
|
||||
@@ -124,7 +124,7 @@ public:
|
||||
}
|
||||
}
|
||||
if (chunk_index >= _checksum.checksums.size()) {
|
||||
throw_malformed_sstable_exception(seastar::format("Chunk count mismatch between CRC and Data.db: expected {} but data file has more", _checksum.checksums.size()));
|
||||
throw malformed_sstable_exception(seastar::format("Chunk count mismatch between CRC and Data.db: expected {} but data file has more", _checksum.checksums.size()));
|
||||
}
|
||||
auto expected_checksum = _checksum.checksums[chunk_index];
|
||||
auto actual_checksum = ChecksumType::checksum(buf.get(), buf.size());
|
||||
@@ -231,7 +231,7 @@ input_stream<char> make_checksummed_file_m_format_input_stream(
|
||||
}
|
||||
|
||||
void throwing_integrity_error_handler(sstring msg) {
|
||||
throw_malformed_sstable_exception(msg);
|
||||
throw sstables::malformed_sstable_exception(msg);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -158,7 +158,7 @@ void compression::segmented_offsets::state::update_position_trackers(std::size_t
|
||||
|
||||
void compression::segmented_offsets::init(uint32_t chunk_size) {
|
||||
if (chunk_size == 0) {
|
||||
throw_malformed_sstable_exception("Segmented offsets chunk size is zero.");
|
||||
throw sstables::malformed_sstable_exception("Segmented offsets chunk size is zero.");
|
||||
}
|
||||
|
||||
_chunk_size = chunk_size;
|
||||
@@ -373,11 +373,11 @@ public:
|
||||
throw std::runtime_error(format("compressed reader not aligned to chunk boundary: pos={} offset={}", _pos, addr.offset));
|
||||
}
|
||||
if (!addr.chunk_len) {
|
||||
sstables::throw_malformed_sstable_exception(format("compressed chunk_len must be greater than zero, chunk_start={}", addr.chunk_start));
|
||||
throw sstables::malformed_sstable_exception(format("compressed chunk_len must be greater than zero, chunk_start={}", addr.chunk_start));
|
||||
}
|
||||
auto buf = co_await _input_stream->read_exactly(addr.chunk_len);
|
||||
if (buf.size() != addr.chunk_len) {
|
||||
sstables::throw_malformed_sstable_exception(format("compressed reader hit premature end-of-file at file offset {}, expected chunk_len={}, actual={}", _underlying_pos, addr.chunk_len, buf.size()));
|
||||
throw sstables::malformed_sstable_exception(format("compressed reader hit premature end-of-file at file offset {}, expected chunk_len={}, actual={}", _underlying_pos, addr.chunk_len, buf.size()));
|
||||
}
|
||||
auto res_units = co_await _permit.request_memory(_compression_metadata->uncompressed_chunk_length());
|
||||
// The last 4 bytes of the chunk are the adler32/crc32 checksum
|
||||
@@ -388,7 +388,7 @@ public:
|
||||
auto expected_checksum = read_be<uint32_t>(buf.get() + compressed_len);
|
||||
auto actual_checksum = ChecksumType::checksum(buf.get(), compressed_len);
|
||||
if (expected_checksum != actual_checksum) {
|
||||
sstables::throw_malformed_sstable_exception(format("compressed chunk of size {} at file offset {} failed checksum, expected={}, actual={}", addr.chunk_len, _underlying_pos, expected_checksum, actual_checksum));
|
||||
throw sstables::malformed_sstable_exception(format("compressed chunk of size {} at file offset {} failed checksum, expected={}, actual={}", addr.chunk_len, _underlying_pos, expected_checksum, actual_checksum));
|
||||
}
|
||||
|
||||
if constexpr (check_digest) {
|
||||
@@ -420,7 +420,7 @@ public:
|
||||
if (_digests.can_calculate_digest
|
||||
&& _pos == _compression_metadata->uncompressed_file_length()
|
||||
&& _digests.expected_digest != _digests.actual_digest) {
|
||||
sstables::throw_malformed_sstable_exception(seastar::format("Digest mismatch: expected={}, actual={}", _digests.expected_digest, _digests.actual_digest));
|
||||
throw sstables::malformed_sstable_exception(seastar::format("Digest mismatch: expected={}, actual={}", _digests.expected_digest, _digests.actual_digest));
|
||||
}
|
||||
}
|
||||
co_return make_tracked_temporary_buffer(std::move(out), std::move(res_units));
|
||||
@@ -511,20 +511,20 @@ public:
|
||||
|
||||
auto chunk_len = get_chunk_len(_current_chunk_index);
|
||||
if (!chunk_len) {
|
||||
sstables::throw_malformed_sstable_exception(format("compressed raw reader chunk_len must be greater than zero, pos={}", _pos));
|
||||
throw sstables::malformed_sstable_exception(format("compressed raw reader chunk_len must be greater than zero, pos={}", _pos));
|
||||
}
|
||||
|
||||
auto res_units = co_await _permit.request_memory(chunk_len);
|
||||
auto buf = co_await _input_stream->read_exactly(chunk_len);
|
||||
if (buf.size() != chunk_len) {
|
||||
sstables::throw_malformed_sstable_exception(format("compressed raw reader hit premature end-of-file at file offset {}, expected chunk_len={}, actual={}", _pos, chunk_len, buf.size()));
|
||||
throw sstables::malformed_sstable_exception(format("compressed raw reader hit premature end-of-file at file offset {}, expected chunk_len={}, actual={}", _pos, chunk_len, buf.size()));
|
||||
}
|
||||
|
||||
auto compressed_len = chunk_len - 4;
|
||||
auto expected_checksum = read_be<uint32_t>(buf.get() + compressed_len);
|
||||
auto actual_checksum = crc32_utils::checksum(buf.get(), compressed_len);
|
||||
if (expected_checksum != actual_checksum) {
|
||||
sstables::throw_malformed_sstable_exception(format("compressed chunk of size {} at file offset {} failed checksum, expected={}, actual={}", chunk_len, _pos, expected_checksum, actual_checksum));
|
||||
throw sstables::malformed_sstable_exception(format("compressed chunk of size {} at file offset {} failed checksum, expected={}, actual={}", chunk_len, _pos, expected_checksum, actual_checksum));
|
||||
}
|
||||
|
||||
if constexpr (check_digest) {
|
||||
@@ -543,7 +543,7 @@ public:
|
||||
if (_digests.can_calculate_digest
|
||||
&& _current_chunk_index == _compression_metadata->offsets.size()
|
||||
&& _digests.expected_digest != _digests.actual_digest) {
|
||||
sstables::throw_malformed_sstable_exception(seastar::format("Digest mismatch: expected={}, actual={}", _digests.expected_digest, _digests.actual_digest));
|
||||
throw sstables::malformed_sstable_exception(seastar::format("Digest mismatch: expected={}, actual={}", _digests.expected_digest, _digests.actual_digest));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -363,7 +363,7 @@ static std::optional<std::vector<std::byte>> dict_from_options(const sstables::c
|
||||
auto i = std::stoi(k_str.substr(DICTIONARY_OPTION.size()));
|
||||
parts.emplace(i, v.value);
|
||||
} catch (const std::exception& e) {
|
||||
sstables::throw_malformed_sstable_exception(fmt::format("Corrupted dictionary option: {}", k_str));
|
||||
throw sstables::malformed_sstable_exception(fmt::format("Corrupted dictionary option: {}", k_str));
|
||||
}
|
||||
}
|
||||
auto v_str = sstring(v.value.begin(), v.value.end());
|
||||
@@ -372,7 +372,7 @@ static std::optional<std::vector<std::byte>> dict_from_options(const sstables::c
|
||||
int i = 0;
|
||||
for (const auto& [k, v] : parts) {
|
||||
if (k != i) {
|
||||
sstables::throw_malformed_sstable_exception(fmt::format("Missing dictionary part: expected {}, got {}", i, k));
|
||||
throw sstables::malformed_sstable_exception(fmt::format("Missing dictionary part: expected {}, got {}", i, k));
|
||||
}
|
||||
++i;
|
||||
auto s = std::as_bytes(std::span(v));
|
||||
|
||||
@@ -48,30 +48,4 @@ struct bufsize_mismatch_exception : malformed_sstable_exception {
|
||||
{}
|
||||
};
|
||||
|
||||
// Controls whether malformed sstable errors abort the process (generating a coredump) or throw an
|
||||
// exception. Aborting is useful when the malformed sstable error is caused by memory corruption
|
||||
// rather than actual sstable corruption, as it allows post-mortem analysis of the coredump.
|
||||
// Controlled by the --abort-on-malformed-sstable-error command-line option.
|
||||
// Returns the previous value of the flag.
|
||||
bool set_abort_on_malformed_sstable_error(bool value) noexcept;
|
||||
bool abort_on_malformed_sstable_error() noexcept;
|
||||
|
||||
// Use these helpers instead of directly throwing malformed_sstable_exception or
|
||||
// bufsize_mismatch_exception. They check the abort_on_malformed_sstable_error flag and either
|
||||
// abort the process (with logging) or throw the appropriate exception.
|
||||
[[noreturn]] void throw_malformed_sstable_exception(sstring msg);
|
||||
[[noreturn]] void throw_malformed_sstable_exception(sstring msg, component_name filename);
|
||||
[[noreturn]] void throw_bufsize_mismatch_exception(size_t size, size_t expected);
|
||||
|
||||
// Disables aborting on malformed sstable errors for a scope.
|
||||
//
|
||||
// Intended for tests which intentionally corrupt sstables and expect
|
||||
// malformed_sstable_exception to be thrown rather than the process aborting.
|
||||
class scoped_no_abort_on_malformed_sstable_error {
|
||||
bool _prev;
|
||||
public:
|
||||
scoped_no_abort_on_malformed_sstable_error() noexcept;
|
||||
~scoped_no_abort_on_malformed_sstable_error();
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -191,10 +191,10 @@ private:
|
||||
public:
|
||||
void verify_end_state() const {
|
||||
if (this->_remain > 0) {
|
||||
throw_malformed_sstable_exception(fmt::format("index_consume_entry_context (state={}): parsing ended but there is unconsumed data", _state), _sst.index_filename());
|
||||
throw malformed_sstable_exception(fmt::format("index_consume_entry_context (state={}): parsing ended but there is unconsumed data", _state), _sst.index_filename());
|
||||
}
|
||||
if (_state != state::KEY_SIZE && _state != state::START) {
|
||||
throw_malformed_sstable_exception(fmt::format("index_consume_entry_context (state={}): cannot finish parsing current entry, no more data", _state), _sst.index_filename());
|
||||
throw malformed_sstable_exception(fmt::format("index_consume_entry_context (state={}): cannot finish parsing current entry, no more data", _state), _sst.index_filename());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -544,7 +544,7 @@ private:
|
||||
bound.current_index_idx = 0;
|
||||
bound.current_pi_idx = 0;
|
||||
if (bound.current_list->empty()) {
|
||||
throw_malformed_sstable_exception(format("missing index entry for summary index {} (bound {})", summary_idx, fmt::ptr(&bound)), _sstable->index_filename());
|
||||
throw malformed_sstable_exception(format("missing index entry for summary index {} (bound {})", summary_idx, fmt::ptr(&bound)), _sstable->index_filename());
|
||||
}
|
||||
bound.data_file_position = bound.current_list->_entries[0].position();
|
||||
bound.element = indexable_element::partition;
|
||||
|
||||
@@ -176,7 +176,7 @@ public:
|
||||
} else if (clustering.size() == (expected_normal + 1)) {
|
||||
return true;
|
||||
}
|
||||
throw_malformed_sstable_exception(format("Found {:d} clustering elements in column name. Was not expecting that!", clustering.size()));
|
||||
throw malformed_sstable_exception(format("Found {:d} clustering elements in column name. Was not expecting that!", clustering.size()));
|
||||
}
|
||||
|
||||
static bool check_static(const schema& schema, bytes_view col) {
|
||||
@@ -210,12 +210,12 @@ public:
|
||||
if (is_static) {
|
||||
for (auto& e: clustering) {
|
||||
if (e.size() != 0) {
|
||||
throw_malformed_sstable_exception("Static row has clustering key information. I didn't expect that!");
|
||||
throw malformed_sstable_exception("Static row has clustering key information. I didn't expect that!");
|
||||
}
|
||||
}
|
||||
}
|
||||
if (is_present && is_static != cdef->is_static()) {
|
||||
throw_malformed_sstable_exception(seastar::format("Mismatch between {} cell and {} column definition",
|
||||
throw malformed_sstable_exception(seastar::format("Mismatch between {} cell and {} column definition",
|
||||
is_static ? "static" : "non-static", cdef->is_static() ? "static" : "non-static"));
|
||||
}
|
||||
}
|
||||
@@ -577,20 +577,20 @@ public:
|
||||
[] (const collection_type_impl& ctype) -> const abstract_type& { return *ctype.value_comparator(); },
|
||||
[&] (const user_type_impl& utype) -> const abstract_type& {
|
||||
if (col.collection_extra_data.size() != sizeof(int16_t)) {
|
||||
throw_malformed_sstable_exception(format("wrong size of field index while reading UDT column: expected {}, got {}",
|
||||
throw malformed_sstable_exception(format("wrong size of field index while reading UDT column: expected {}, got {}",
|
||||
sizeof(int16_t), col.collection_extra_data.size()));
|
||||
}
|
||||
|
||||
auto field_idx = deserialize_field_index(col.collection_extra_data);
|
||||
if (field_idx >= utype.size()) {
|
||||
throw_malformed_sstable_exception(format("field index too big while reading UDT column: type has {} fields, got {}",
|
||||
throw malformed_sstable_exception(format("field index too big while reading UDT column: type has {} fields, got {}",
|
||||
utype.size(), field_idx));
|
||||
}
|
||||
|
||||
return *utype.type(field_idx);
|
||||
},
|
||||
[] (const abstract_type& o) -> const abstract_type& {
|
||||
throw_malformed_sstable_exception(format("attempted to read multi-cell column, but expected type was {}", o.name()));
|
||||
throw malformed_sstable_exception(format("attempted to read multi-cell column, but expected type was {}", o.name()));
|
||||
}
|
||||
));
|
||||
auto ac = make_atomic_cell(value_type,
|
||||
@@ -708,7 +708,7 @@ public:
|
||||
case composite::eoc::end:
|
||||
return bound_kind::excl_start;
|
||||
}
|
||||
throw_malformed_sstable_exception(format("Unexpected start composite marker {:d}", uint16_t(uint8_t(found))));
|
||||
throw malformed_sstable_exception(format("Unexpected start composite marker {:d}", uint16_t(uint8_t(found))));
|
||||
}
|
||||
|
||||
static bound_kind end_marker_to_bound_kind(bytes_view component) {
|
||||
@@ -723,7 +723,7 @@ public:
|
||||
case composite::eoc::end:
|
||||
return bound_kind::incl_end;
|
||||
}
|
||||
throw_malformed_sstable_exception(format("Unexpected end composite marker {:d}", uint16_t(uint8_t(found))));
|
||||
throw malformed_sstable_exception(format("Unexpected end composite marker {:d}", uint16_t(uint8_t(found))));
|
||||
}
|
||||
|
||||
// Consume one range tombstone.
|
||||
@@ -1050,7 +1050,7 @@ private:
|
||||
} else {
|
||||
// FIXME: see ColumnSerializer.java:deserializeColumnBody
|
||||
if ((mask & column_mask::counter_update) != column_mask::none) {
|
||||
throw_malformed_sstable_exception("FIXME COUNTER_UPDATE_MASK");
|
||||
throw malformed_sstable_exception("FIXME COUNTER_UPDATE_MASK");
|
||||
}
|
||||
_ttl = _expiration = 0;
|
||||
_deleted = (mask & column_mask::deletion) != column_mask::none;
|
||||
@@ -1062,7 +1062,7 @@ private:
|
||||
mp_row_consumer_k_l::proceed ret;
|
||||
if (_deleted) {
|
||||
if (_val_fragmented.size_bytes() != 4) {
|
||||
throw_malformed_sstable_exception("deleted cell expects local_deletion_time value");
|
||||
throw malformed_sstable_exception("deleted cell expects local_deletion_time value");
|
||||
}
|
||||
_val = temporary_buffer<char>(4);
|
||||
auto v = fragmented_temporary_buffer::view(_val_fragmented);
|
||||
@@ -1110,7 +1110,7 @@ public:
|
||||
return;
|
||||
}
|
||||
if (_state != state::ROW_START || data_consumer::primitive_consumer::active()) {
|
||||
throw_malformed_sstable_exception("end of input, but not end of row");
|
||||
throw malformed_sstable_exception("end of input, but not end of row");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1249,7 +1249,7 @@ private:
|
||||
}
|
||||
|
||||
if (!_consumer.is_mutation_end()) {
|
||||
throw_malformed_sstable_exception(format("consumer not at partition boundary, position: {}",
|
||||
throw malformed_sstable_exception(format("consumer not at partition boundary, position: {}",
|
||||
position_in_partition_view::printer(*_schema, _consumer.position())), _sst->get_filename());
|
||||
}
|
||||
|
||||
@@ -1442,7 +1442,7 @@ public:
|
||||
try {
|
||||
f.get();
|
||||
} catch(sstables::malformed_sstable_exception& e) {
|
||||
throw_malformed_sstable_exception(format("Failed to read partition from SSTable {} due to {}", _sst->get_filename(), e.what()));
|
||||
throw sstables::malformed_sstable_exception(format("Failed to read partition from SSTable {} due to {}", _sst->get_filename(), e.what()));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ namespace sstables {
|
||||
|
||||
static void check_buf_size(temporary_buffer<char>& buf, size_t expected) {
|
||||
if (buf.size() < expected) {
|
||||
throw_bufsize_mismatch_exception(buf.size(), expected);
|
||||
throw bufsize_mismatch_exception(buf.size(), expected);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -46,10 +46,10 @@ inline api::timestamp_type parse_timestamp(const serialization_header& header,
|
||||
inline gc_clock::duration parse_ttl(int64_t value) {
|
||||
if (!is_expired_liveness_ttl(value)) {
|
||||
if (value < 0) {
|
||||
throw_malformed_sstable_exception(format("Negative ttl: {}", value));
|
||||
throw malformed_sstable_exception(format("Negative ttl: {}", value));
|
||||
}
|
||||
if (value > max_ttl.count()) {
|
||||
throw_malformed_sstable_exception(format("Too big ttl: {}", value));
|
||||
throw malformed_sstable_exception(format("Too big ttl: {}", value));
|
||||
}
|
||||
}
|
||||
return gc_clock::duration(value);
|
||||
|
||||
@@ -79,7 +79,7 @@ public:
|
||||
data_consumer::proceed consume_range_tombstone_start(clustering_key_prefix ck, bound_kind k, tombstone t) {
|
||||
sstlog.trace("mp_row_consumer_m {}: consume_range_tombstone_start(ck={}, k={}, t={})", fmt::ptr(this), ck, k, t);
|
||||
if (_mf_filter->current_tombstone()) {
|
||||
throw_malformed_sstable_exception(
|
||||
throw sstables::malformed_sstable_exception(
|
||||
format("Range tombstones have to be disjoint: current opened range tombstone {}, new tombstone {}",
|
||||
_mf_filter->current_tombstone(), t));
|
||||
}
|
||||
@@ -90,12 +90,12 @@ public:
|
||||
data_consumer::proceed consume_range_tombstone_end(clustering_key_prefix ck, bound_kind k, tombstone t) {
|
||||
sstlog.trace("mp_row_consumer_m {}: consume_range_tombstone_end(ck={}, k={}, t={})", fmt::ptr(this), ck, k, t);
|
||||
if (!_mf_filter->current_tombstone()) {
|
||||
throw_malformed_sstable_exception(
|
||||
throw sstables::malformed_sstable_exception(
|
||||
format("Closing range tombstone that wasn't opened: clustering {}, kind {}, tombstone {}",
|
||||
ck, k, t));
|
||||
}
|
||||
if (_mf_filter->current_tombstone() != t) {
|
||||
throw_malformed_sstable_exception(
|
||||
throw sstables::malformed_sstable_exception(
|
||||
format("Range tombstone with ck {} and two different tombstones at ends: {}, {}",
|
||||
ck, _mf_filter->current_tombstone(), t));
|
||||
}
|
||||
@@ -106,11 +106,11 @@ public:
|
||||
data_consumer::proceed consume_range_tombstone_boundary(position_in_partition pos, tombstone left, tombstone right) {
|
||||
sstlog.trace("mp_row_consumer_m {}: consume_range_tombstone_boundary(pos={}, left={}, right={})", fmt::ptr(this), pos, left, right);
|
||||
if (!_mf_filter->current_tombstone()) {
|
||||
throw_malformed_sstable_exception(
|
||||
throw sstables::malformed_sstable_exception(
|
||||
format("Closing range tombstone that wasn't opened: pos {}, tombstone {}", pos, left));
|
||||
}
|
||||
if (_mf_filter->current_tombstone() != left) {
|
||||
throw_malformed_sstable_exception(
|
||||
throw sstables::malformed_sstable_exception(
|
||||
format("Range tombstone at {} and two different tombstones at ends: {}, {}",
|
||||
pos, _mf_filter->current_tombstone(), left));
|
||||
}
|
||||
@@ -166,7 +166,7 @@ public:
|
||||
|
||||
void check_schema_mismatch(const column_translation::column_info& column_info, const column_definition& column_def) const {
|
||||
if (column_info.schema_mismatch) {
|
||||
throw_malformed_sstable_exception(
|
||||
throw malformed_sstable_exception(
|
||||
format("{} definition in serialization header does not match schema. Expected {} but got {}",
|
||||
column_def.name_as_text(),
|
||||
column_def.type->name(),
|
||||
@@ -180,7 +180,7 @@ public:
|
||||
sstring name = sstring(to_string_view(*column_info.name));
|
||||
auto it = _schema->dropped_columns().find(name);
|
||||
if (it == _schema->dropped_columns().end() || timestamp > it->second.timestamp) {
|
||||
throw_malformed_sstable_exception(format("Column {} missing in current schema", name));
|
||||
throw malformed_sstable_exception(format("Column {} missing in current schema", name));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -399,20 +399,20 @@ public:
|
||||
[] (const collection_type_impl& ctype) -> const abstract_type& { return *ctype.value_comparator(); },
|
||||
[&] (const user_type_impl& utype) -> const abstract_type& {
|
||||
if (cell_path.size() != sizeof(int16_t)) {
|
||||
throw_malformed_sstable_exception(format("wrong size of field index while reading UDT column: expected {}, got {}",
|
||||
throw malformed_sstable_exception(format("wrong size of field index while reading UDT column: expected {}, got {}",
|
||||
sizeof(int16_t), cell_path.size()));
|
||||
}
|
||||
|
||||
auto field_idx = deserialize_field_index(cell_path);
|
||||
if (field_idx >= utype.size()) {
|
||||
throw_malformed_sstable_exception(format("field index too big while reading UDT column: type has {} fields, got {}",
|
||||
throw malformed_sstable_exception(format("field index too big while reading UDT column: type has {} fields, got {}",
|
||||
utype.size(), field_idx));
|
||||
}
|
||||
|
||||
return *utype.type(field_idx);
|
||||
},
|
||||
[] (const abstract_type& o) -> const abstract_type& {
|
||||
throw_malformed_sstable_exception(format("attempted to read multi-cell column, but expected type was {}", o.name()));
|
||||
throw malformed_sstable_exception(format("attempted to read multi-cell column, but expected type was {}", o.name()));
|
||||
}
|
||||
));
|
||||
auto ac = is_deleted ? atomic_cell::make_dead(timestamp, local_deletion_time)
|
||||
@@ -559,7 +559,7 @@ public:
|
||||
sstlog.trace("mp_row_consumer_m {}: on_end_of_stream()", fmt::ptr(this));
|
||||
if (_mf_filter && _mf_filter->current_tombstone()) {
|
||||
if (_mf_filter->out_of_range()) {
|
||||
throw_malformed_sstable_exception("Unclosed range tombstone.");
|
||||
throw sstables::malformed_sstable_exception("Unclosed range tombstone.");
|
||||
}
|
||||
auto result = _mf_filter->apply(position_in_partition_view::after_all_clustered_rows(), {});
|
||||
for (auto&& rt : result.rts) {
|
||||
@@ -904,7 +904,7 @@ private:
|
||||
_is_first_unfiltered = false;
|
||||
goto row_body_label;
|
||||
} else {
|
||||
throw_malformed_sstable_exception("static row should be a first unfiltered in a partition");
|
||||
throw malformed_sstable_exception("static row should be a first unfiltered in a partition");
|
||||
}
|
||||
}
|
||||
start_row(_regular_row);
|
||||
@@ -924,7 +924,7 @@ private:
|
||||
continue;
|
||||
}
|
||||
if (_null_component_occured) {
|
||||
throw_malformed_sstable_exception("non-null component after null component");
|
||||
throw malformed_sstable_exception("non-null component after null component");
|
||||
}
|
||||
if (is_block_empty()) {
|
||||
_row_key.push_back({});
|
||||
@@ -971,7 +971,7 @@ private:
|
||||
}
|
||||
if (_extended_flags.is_static()) {
|
||||
if (_flags.has_timestamp() || _flags.has_ttl() || _flags.has_deletion()) {
|
||||
throw_malformed_sstable_exception(format("Static row has unexpected flags: timestamp={}, ttl={}, deletion={}",
|
||||
throw malformed_sstable_exception(format("Static row has unexpected flags: timestamp={}, ttl={}, deletion={}",
|
||||
_flags.has_timestamp(), _flags.has_ttl(), _flags.has_deletion()));
|
||||
}
|
||||
} else {
|
||||
@@ -994,7 +994,7 @@ private:
|
||||
}
|
||||
if (_extended_flags.has_scylla_shadowable_deletion()) {
|
||||
if (!_has_shadowable_tombstones) {
|
||||
throw_malformed_sstable_exception("Scylla shadowable tombstone flag is set but not supported on this SSTables");
|
||||
throw malformed_sstable_exception("Scylla shadowable tombstone flag is set but not supported on this SSTables");
|
||||
}
|
||||
co_yield this->read_unsigned_vint(*_processing_data);
|
||||
_row_shadowable_tombstone.timestamp = parse_timestamp(_header, this->_u64);
|
||||
@@ -1155,7 +1155,7 @@ private:
|
||||
_left_range_tombstone.deletion_time = parse_expiry(_header, this->_u64);
|
||||
if (!is_boundary_between_adjacent_intervals(_range_tombstone_kind)) {
|
||||
if (!is_bound_kind(_range_tombstone_kind)) {
|
||||
throw_malformed_sstable_exception(
|
||||
throw sstables::malformed_sstable_exception(
|
||||
format("Corrupted range tombstone: invalid boundary type {}", _range_tombstone_kind));
|
||||
}
|
||||
_sst->get_stats().on_range_tombstone_read();
|
||||
@@ -1219,7 +1219,7 @@ public:
|
||||
// is the first state corresponding to the contents of a new partition.
|
||||
if (_state != state::DELETION_TIME
|
||||
&& (_state != state::PARTITION_START || data_consumer::primitive_consumer::active())) {
|
||||
throw_malformed_sstable_exception("end of input, but not end of partition");
|
||||
throw malformed_sstable_exception("end of input, but not end of partition");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1410,7 +1410,7 @@ private:
|
||||
}
|
||||
|
||||
if (!_consumer.is_mutation_end()) {
|
||||
throw_malformed_sstable_exception(format("consumer not at partition boundary, position: {}",
|
||||
throw malformed_sstable_exception(format("consumer not at partition boundary, position: {}",
|
||||
position_in_partition_view::printer(*_schema, _consumer.position())), _sst->get_filename());
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user