Compare commits

..

2 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
9e806cb3f7 Fix critical bugs and issues found in alternator code review
Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
2026-01-29 22:54:57 +00:00
copilot-swe-agent[bot]
f267af38bd Initial plan 2026-01-29 22:49:31 +00:00
267 changed files with 4308 additions and 4846 deletions

View File

@@ -1,22 +0,0 @@
name: Sync Jira Based on PR Milestone Events
on:
pull_request_target:
types: [milestoned, demilestoned]
permissions:
contents: read
pull-requests: read
jobs:
jira-sync-milestone-set:
if: github.event.action == 'milestoned'
uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_set.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
jira-sync-milestone-removed:
if: github.event.action == 'demilestoned'
uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_removed.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,4 +1,4 @@
name: Call Jira release creation for new milestone
name: Call Jira release creation for new milestone
on:
milestone:
@@ -9,6 +9,6 @@ jobs:
uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
with:
# Comma-separated list of Jira project keys
jira_project_keys: "SCYLLADB,CUSTOMER,SMI"
jira_project_keys: "SCYLLADB,CUSTOMER"
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,62 +0,0 @@
name: Close issues created by Scylla associates
on:
issues:
types: [opened, reopened]
permissions:
issues: write
jobs:
comment-and-close:
runs-on: ubuntu-latest
steps:
- name: Comment and close if author email is scylladb.com
uses: actions/github-script@v7
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const issue = context.payload.issue;
const actor = context.actor;
// Get user data (only public email is available)
const { data: user } = await github.rest.users.getByUsername({
username: actor,
});
const email = user.email || "";
console.log(`Actor: ${actor}, public email: ${email || "<none>"}`);
// Only continue if email exists and ends with @scylladb.com
if (!email || !email.toLowerCase().endsWith("@scylladb.com")) {
console.log("User is not a scylladb.com email (or email not public); skipping.");
return;
}
const owner = context.repo.owner;
const repo = context.repo.repo;
const issue_number = issue.number;
const body = "Issues in this repository are closed automatically. Scylla associates should use Jira to manage issues.\nPlease move this issue to Jira https://scylladb.atlassian.net/jira/software/c/projects/SCYLLADB/list";
// Add the comment
await github.rest.issues.createComment({
owner,
repo,
issue_number,
body,
});
console.log(`Comment added to #${issue_number}`);
// Close the issue
await github.rest.issues.update({
owner,
repo,
issue_number,
state: "closed",
state_reason: "not_planned"
});
console.log(`Issue #${issue_number} closed.`);

View File

@@ -9,34 +9,16 @@ on:
jobs:
trigger-jenkins:
if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
runs-on: ubuntu-latest
steps:
- name: Validate Comment Trigger
if: github.event_name == 'issue_comment'
id: verify_comment
shell: bash
run: |
BODY=$(cat << 'EOF'
${{ github.event.comment.body }}
EOF
)
CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
echo "trigger=true" >> $GITHUB_OUTPUT
else
echo "trigger=false" >> $GITHUB_OUTPUT
fi
- name: Trigger Scylla-CI-Route Jenkins Job
if: github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true'
env:
JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
JENKINS_URL: "https://jenkins.scylladb.com"
run: |
PR_NUMBER=${{ github.event.issue.number || github.event.pull_request.number }}
PR_NUMBER=${{ github.event.issue.number }}
PR_REPO_NAME=${{ github.event.repository.full_name }}
curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
--user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v

View File

@@ -43,7 +43,7 @@ For further information, please see:
[developer documentation]: HACKING.md
[build documentation]: docs/dev/building.md
[docker image build documentation]: dist/docker/redhat/README.md
[docker image build documentation]: dist/docker/debian/README.md
## Running Scylla

View File

@@ -244,7 +244,10 @@ static bool is_set_of(const rjson::value& type1, const rjson::value& type2) {
// Check if two JSON-encoded values match with the CONTAINS relation
bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query) {
if (!v1) {
if (!v1 || !v1->IsObject() || v1->MemberCount() == 0) {
return false;
}
if (!v2.IsObject() || v2.MemberCount() == 0) {
return false;
}
const auto& kv1 = *v1->MemberBegin();
@@ -618,7 +621,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
// Check if the existing values of the item (previous_item) match the
// conditions given by the Expected and ConditionalOperator parameters
// (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
// This function can throw a ValidationException API error if there
// This function can throw a ValidationException API error if there
// are errors in the format of the condition itself.
bool verify_expected(const rjson::value& req, const rjson::value* previous_item) {
const rjson::value* expected = rjson::find(req, "Expected");

View File

@@ -53,7 +53,9 @@ void consumed_capacity_counter::add_consumed_capacity_to_response_if_needed(rjso
}
static uint64_t calculate_half_units(uint64_t unit_block_size, uint64_t total_bytes, bool is_quorum) {
uint64_t half_units = (total_bytes + unit_block_size -1) / unit_block_size; //divide by unit_block_size and round up
// Avoid potential integer overflow when total_bytes is close to UINT64_MAX
// by using division with modulo instead of addition before division
uint64_t half_units = total_bytes / unit_block_size + (total_bytes % unit_block_size != 0 ? 1 : 0);
if (is_quorum) {
half_units *= 2;

View File

@@ -237,7 +237,7 @@ static void validate_is_object(const rjson::value& value, const char* caller) {
}
// This function assumes the given value is an object and returns requested member value.
// If it is not possible, an api_error::validation is thrown.
// If it is not possible, an api_error::validation is thrown.
static const rjson::value& get_member(const rjson::value& obj, const char* member_name, const char* caller) {
validate_is_object(obj, caller);
const rjson::value* ret = rjson::find(obj, member_name);
@@ -249,7 +249,7 @@ static const rjson::value& get_member(const rjson::value& obj, const char* membe
// This function assumes the given value is an object with a single member, and returns this member.
// In case the requirements are not met, an api_error::validation is thrown.
// In case the requirements are not met, an api_error::validation is thrown.
static const rjson::value::Member& get_single_member(const rjson::value& v, const char* caller) {
if (!v.IsObject() || v.MemberCount() != 1) {
throw api_error::validation(format("{}: expected an object with a single member.", caller));
@@ -682,7 +682,7 @@ static std::optional<int> get_int_attribute(const rjson::value& value, std::stri
}
// Sets a KeySchema object inside the given JSON parent describing the key
// attributes of the given schema as being either HASH or RANGE keys.
// attributes of the given schema as being either HASH or RANGE keys.
// Additionally, adds to a given map mappings between the key attribute
// names and their type (as a DynamoDB type string).
void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>* attribute_types, const std::map<sstring, sstring> *tags) {
@@ -834,11 +834,13 @@ future<> executor::fill_table_size(rjson::value &table_description, schema_ptr s
total_size = co_await _ss.estimate_total_sstable_volume(schema->id(), service::storage_service::ignore_errors::yes);
const auto expiry = std::chrono::seconds{ _proxy.data_dictionary().get_config().alternator_describe_table_info_cache_validity_in_seconds() };
// Note: we don't care when the notification of other shards will finish, as long as it will be done
// it's possible to get into race condition (next DescribeTable comes to other shard, that new shard doesn't have
// the size yet, so it will calculate it again) - this is not a problem, because it will call cache_newly_calculated_size_on_all_shards
// with expiry, which is extremely unlikely to be exactly the same as the previous one, all shards will keep the size coming with expiry that is further into the future.
// In case of the same expiry, some shards will have different size, which means DescribeTable will return different values depending on the shard
// which is also fine, as the specification doesn't give precision guarantees of any kind.
// A race condition is possible: if a DescribeTable request arrives on a different shard before
// that shard receives the cached size, it will recalculate independently. This is acceptable because:
// 1. Both calculations will cache their results with an expiry time
// 2. Expiry times are unlikely to be identical, so eventually all shards converge to the most recent value
// 3. Even if expiry times match, different shards may briefly return different table sizes
// 4. This temporary inconsistency is acceptable per DynamoDB specification, which doesn't guarantee
// exact precision for DescribeTable size information
co_await cache_newly_calculated_size_on_all_shards(schema, total_size, expiry);
}
}
@@ -916,7 +918,7 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
sstring index_name = cf_name.substr(delim_it + 1);
rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
// Add index's KeySchema and collect types for AttributeDefinitions:
// Add index's KeySchema and collect types for AttributeDefinitions:
executor::describe_key_schema(view_entry, *vptr, key_attribute_types, db::get_tags_of_table(vptr));
// Add projection type
rjson::value projection = rjson::empty_object();
@@ -2435,7 +2437,7 @@ std::unordered_map<bytes, std::string> si_key_attributes(data_dictionary::table
// case, this function simply won't be called for this attribute.)
//
// This function checks if the given attribute update is an update to some
// GSI's key, and if the value is unsuitable, an api_error::validation is
// GSI's key, and if the value is unsuitable, an api_error::validation is
// thrown. The checking here is similar to the checking done in
// get_key_from_typed_value() for the base table's key columns.
//
@@ -3548,7 +3550,7 @@ static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>
return true;
}
// Add a path to an attribute_path_map. Throws a validation error if the path
// Add a path to an attribute_path_map. Throws a validation error if the path
// "overlaps" with one already in the filter (one is a sub-path of the other)
// or "conflicts" with it (both a member and index is requested).
template<typename T>

View File

@@ -50,7 +50,7 @@ public:
_operators.emplace_back(i);
check_depth_limit();
}
void add_dot(std::string name) {
void add_dot(std::string(name)) {
_operators.emplace_back(std::move(name));
check_depth_limit();
}
@@ -85,7 +85,7 @@ struct constant {
}
};
// "value" is a value used in the right hand side of an assignment
// "value" is a value used in the right hand side of an assignment
// expression, "SET a = ...". It can be a constant (a reference to a value
// included in the request, e.g., ":val"), a path to an attribute from the
// existing item (e.g., "a.b[3].c"), or a function of other such values.
@@ -205,7 +205,7 @@ public:
// The supported primitive conditions are:
// 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
// v1 and v2 are values - from the item (an attribute path), the query
// (a ":val" reference), or a function of the above (only the size()
// (a ":val" reference), or a function of the above (only the size()
// function is supported).
// 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
// 3. N-ary operator - v1 IN ( v2, v3, ... )

View File

@@ -55,7 +55,7 @@ partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
clustering_key ck_from_json(const rjson::value& item, schema_ptr schema);
position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema);
// If v encodes a number (i.e., it is a {"N": [...]}), returns an object representing it. Otherwise,
// If v encodes a number (i.e., it is a {"N": [...]}), returns an object representing it. Otherwise,
// raises ValidationException with diagnostic.
big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic);

View File

@@ -141,7 +141,7 @@ future<executor::request_return_type> executor::describe_time_to_live(client_sta
// expiration_service is a sharded service responsible for cleaning up expired
// items in all tables with per-item expiration enabled. Currently, this means
// Alternator tables with TTL configured via an UpdateTimeToLive request.
// Alternator tables with TTL configured via an UpdateTimeToLive request.
//
// Here is a brief overview of how the expiration service works:
//
@@ -593,7 +593,7 @@ static future<> scan_table_ranges(
if (retries >= 10) {
// Don't get stuck forever asking the same page, maybe there's
// a bug or a real problem in several replicas. Give up on
// this scan and retry the scan from a random position later,
// this scan and retry the scan from a random position later,
// in the next scan period.
throw runtime_exception("scanner thread failed after too many timeouts for the same page");
}

View File

@@ -30,7 +30,7 @@ namespace alternator {
// expiration_service is a sharded service responsible for cleaning up expired
// items in all tables with per-item expiration enabled. Currently, this means
// Alternator tables with TTL configured via an UpdateTimeToLive request.
// Alternator tables with TTL configured via an UpdateTimeToLive request.
class expiration_service final : public seastar::peering_sharded_service<expiration_service> {
public:
// Object holding per-shard statistics related to the expiration service.
@@ -52,7 +52,7 @@ private:
data_dictionary::database _db;
service::storage_proxy& _proxy;
gms::gossiper& _gossiper;
// _end is set by start(), and resolves when the background service
// _end is set by start(), and resolves when the background service
// started by it ends. To ask the background service to end, _abort_source
// should be triggered. stop() below uses both _abort_source and _end.
std::optional<future<>> _end;

View File

@@ -12,7 +12,7 @@
"operations":[
{
"method":"POST",
"summary":"Resets authorized prepared statements cache",
"summary":"Reset cache",
"type":"void",
"nickname":"authorization_cache_reset",
"produces":[

View File

@@ -23,6 +23,31 @@
namespace api {
template<class T>
std::vector<T> map_to_key_value(const std::map<sstring, sstring>& map) {
std::vector<T> res;
res.reserve(map.size());
for (const auto& [key, value] : map) {
res.push_back(T());
res.back().key = key;
res.back().value = value;
}
return res;
}
// Appends one T per entry of `map` to `res` and returns `res` itself.
// Each entry's key and value are converted to text with fmt::to_string
// before being assigned to T's `key` and `value` members. Elements already
// present in `res` are left untouched.
template<class T, class MAP>
std::vector<T>& map_to_key_value(const MAP& map, std::vector<T>& res) {
    // Grow once for the existing elements plus everything about to be added.
    res.reserve(res.size() + std::size(map));
    for (const auto& [k, v] : map) {
        T entry;
        entry.key = fmt::to_string(k);
        entry.value = fmt::to_string(v);
        res.push_back(entry);
    }
    return res;
}
template <typename T, typename S = T>
T map_sum(T&& dest, const S& src) {
for (const auto& i : src) {

View File

@@ -515,15 +515,6 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
auto sstables = parsed.GetArray() |
std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
std::ranges::to<std::vector>();
apilog.info("Restore invoked with following parameters: keyspace={}, table={}, endpoint={}, bucket={}, prefix={}, sstables_count={}, scope={}, primary_replica_only={}",
keyspace,
table,
endpoint,
bucket,
prefix,
sstables.size(),
scope,
primary_replica_only);
auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
co_return json::json_return_type(fmt::to_string(task_id));
});
@@ -536,15 +527,13 @@ void unset_sstables_loader(http_context& ctx, routes& r) {
}
void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g) {
ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) {
auto keyspace = validate_keyspace(ctx, req);
auto view = req->get_path_param("view");
co_return json::json_return_type(stream_range_as_array(co_await vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()), [] (const auto& i) {
storage_service_json::mapper res;
res.key = i.first;
res.value = i.second;
return res;
}));
return vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()).then([] (std::unordered_map<sstring, sstring> status) {
std::vector<storage_service_json::mapper> res;
return make_ready_future<json::json_return_type>(map_to_key_value(std::move(status), res));
});
});
cf::get_built_indexes.set(r, [&vb](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
@@ -582,16 +571,6 @@ static future<json::json_return_type> describe_ring_as_json_for_table(const shar
co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring_for_table(keyspace, table), token_range_endpoints_to_json));
}
namespace {

// Builds a storage_service_json::mapper from a (key, value) pair, storing
// the fmt::to_string() rendering of each side in the matching field.
template <typename Key, typename Value>
storage_service_json::mapper map_to_json(const std::pair<Key, Value>& i) {
    const auto& [key, value] = i;
    storage_service_json::mapper entry;
    entry.key = fmt::to_string(key);
    entry.value = fmt::to_string(value);
    return entry;
}

}
static
future<json::json_return_type>
rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -609,7 +588,12 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
throw bad_param_exception("Either provide both keyspace and table (for tablet table) or neither (for vnodes)");
}
co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
co_return json::json_return_type(stream_range_as_array(token_endpoints, [](const auto& i) {
storage_service_json::mapper val;
val.key = fmt::to_string(i.first);
val.value = fmt::to_string(i.second);
return val;
}));
}
static
@@ -693,6 +677,7 @@ rest_get_range_to_endpoint_map(http_context& ctx, sharded<service::storage_servi
table_id = validate_table(ctx.db.local(), keyspace, table);
}
std::vector<ss::maplist_mapper> res;
co_return stream_range_as_array(co_await ss.local().get_range_to_address_map(keyspace, table_id),
[](const std::pair<dht::token_range, inet_address_vector_replica_set>& entry){
ss::maplist_mapper m;
@@ -1323,7 +1308,10 @@ rest_get_ownership(http_context& ctx, sharded<service::storage_service>& ss, std
throw httpd::bad_param_exception("storage_service/ownership cannot be used when a keyspace uses tablets");
}
co_return json::json_return_type(stream_range_as_array(co_await ss.local().get_ownership(), &map_to_json<gms::inet_address, float>));
return ss.local().get_ownership().then([] (auto&& ownership) {
std::vector<storage_service_json::mapper> res;
return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
});
}
static
@@ -1340,7 +1328,10 @@ rest_get_effective_ownership(http_context& ctx, sharded<service::storage_service
}
}
co_return json::json_return_type(stream_range_as_array(co_await ss.local().effective_ownership(keyspace_name, table_name), &map_to_json<gms::inet_address, float>));
return ss.local().effective_ownership(keyspace_name, table_name).then([] (auto&& ownership) {
std::vector<storage_service_json::mapper> res;
return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
});
}
static
@@ -1350,7 +1341,7 @@ rest_estimate_compression_ratios(http_context& ctx, sharded<service::storage_ser
apilog.warn("estimate_compression_ratios: called before the cluster feature was enabled");
throw std::runtime_error("estimate_compression_ratios requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
}
auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
auto cf = api::req_param<sstring>(*req, "cf", {}).value;
apilog.debug("estimate_compression_ratios: called with ks={} cf={}", ks, cf);
@@ -1416,7 +1407,7 @@ rest_retrain_dict(http_context& ctx, sharded<service::storage_service>& ss, serv
apilog.warn("retrain_dict: called before the cluster feature was enabled");
throw std::runtime_error("retrain_dict requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
}
auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
auto cf = api::req_param<sstring>(*req, "cf", {}).value;
apilog.debug("retrain_dict: called with ks={} cf={}", ks, cf);

View File

@@ -17,6 +17,7 @@ target_sources(scylla_auth
password_authenticator.cc
passwords.cc
permission.cc
permissions_cache.cc
resource.cc
role_or_anonymous.cc
roles-metadata.cc

View File

@@ -8,7 +8,6 @@
#include "auth/cache.hh"
#include "auth/common.hh"
#include "auth/role_or_anonymous.hh"
#include "auth/roles-metadata.hh"
#include "cql3/query_processor.hh"
#include "cql3/untyped_result_set.hh"
@@ -19,8 +18,6 @@
#include <seastar/core/abort_source.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/core/format.hh>
#include <seastar/core/metrics.hh>
#include <seastar/core/do_with.hh>
namespace auth {
@@ -30,21 +27,7 @@ cache::cache(cql3::query_processor& qp, abort_source& as) noexcept
: _current_version(0)
, _qp(qp)
, _loading_sem(1)
, _as(as)
, _permission_loader(nullptr)
, _permission_loader_sem(8) {
namespace sm = seastar::metrics;
_metrics.add_group("auth_cache", {
sm::make_gauge("roles", [this] { return _roles.size(); },
sm::description("Number of roles currently cached")),
sm::make_gauge("permissions", [this] {
return _cached_permissions_count;
}, sm::description("Total number of permission sets currently cached across all roles"))
});
}
void cache::set_permission_loader(permission_loader_func loader) {
_permission_loader = std::move(loader);
, _as(as) {
}
lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
@@ -55,75 +38,6 @@ lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) cons
return it->second;
}
// Returns the permission set that `role` has on resource `r`.
//
// The answer is served from a cache when possible: the anonymous user has a
// dedicated map (_anonymous_permissions), every other role uses the
// cached_permissions map of its role_record. On a cache miss the work is
// delegated to load_permissions(). A named role that is not present in _roles
// gets permissions::NONE.
//
// NOTE(review): the continuation below captures `role` and `r` by reference,
// so they presumably must stay alive until the returned future resolves -
// confirm against callers.
future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
std::unordered_map<resource, permission_set>* perms_cache;
lw_shared_ptr<role_record> role_ptr;
if (is_anonymous(role)) {
// No role_record exists for the anonymous user; use the dedicated map.
perms_cache = &_anonymous_permissions;
} else {
const auto& role_name = *role.name;
auto role_it = _roles.find(role_name);
if (role_it == _roles.end()) {
// Role might have been deleted but there are some connections
// left which reference it. They should no longer have access to anything.
return make_ready_future<permission_set>(permissions::NONE);
}
role_ptr = role_it->second;
perms_cache = &role_ptr->cached_permissions;
}
// Fast path: the permissions for this resource are already cached.
if (auto it = perms_cache->find(r); it != perms_cache->end()) {
return make_ready_future<permission_set>(it->second);
}
// keep alive role_ptr as it holds perms_cache (except anonymous)
return do_with(std::move(role_ptr), [this, &role, &r, perms_cache] (auto& role_ptr) {
return load_permissions(role, r, perms_cache);
});
}
// Loads the permissions of `role` on resource `r` through _permission_loader,
// records them in `perms_cache` via add_permissions() and returns them.
//
// Asserts that a permission loader has been configured. Before loading, it
// waits for one unit of _permission_loader_sem (the wait is tied to the
// abort source _as); the result is kept in `units` until the coroutine ends.
future<permission_set> cache::load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache) {
SCYLLA_ASSERT(_permission_loader);
auto units = co_await get_units(_permission_loader_sem, 1, _as);
// Check again, perhaps we were blocked and other call loaded
// the permissions already. This is a protection against misses storm.
if (auto it = perms_cache->find(r); it != perms_cache->end()) {
co_return it->second;
}
auto perms = co_await _permission_loader(role, r);
add_permissions(*perms_cache, r, perms);
co_return perms;
}
// Drops every cached permission entry for resource `r`: first from the
// anonymous permissions map, then from the cached_permissions of each role
// in _roles. Waits for one unit of _loading_sem before touching the maps and
// may yield between roles.
future<> cache::prune(const resource& r) {
auto units = co_await get_units(_loading_sem, 1, _as);
_anonymous_permissions.erase(r);
for (auto& it : _roles) {
// Pruning can run concurrently with other functions but it
// can only cause cached_permissions extra reload via get_permissions.
remove_permissions(it.second->cached_permissions, r);
co_await coroutine::maybe_yield();
}
}
// Re-runs _permission_loader for every (role, resource) pair that currently
// has a cached entry and overwrites the cached value with the fresh result.
// Only existing entries are refreshed; no entries are added or removed here.
//
// Asserts that a permission loader has been configured and waits for one
// unit of _loading_sem before iterating.
future<> cache::reload_all_permissions() noexcept {
SCYLLA_ASSERT(_permission_loader);
auto units = co_await get_units(_loading_sem, 1, _as);
// A default-constructed role_or_anonymous is passed to the loader for the
// entries of the anonymous map.
const role_or_anonymous anon;
for (auto& [res, perms] : _anonymous_permissions) {
perms = co_await _permission_loader(anon, res);
}
for (auto& [role, entry] : _roles) {
auto& perms_cache = entry->cached_permissions;
auto r = role_or_anonymous(role);
for (auto& [res, perms] : perms_cache) {
perms = co_await _permission_loader(r, res);
}
}
logger.debug("Reloaded auth cache with {} entries", _roles.size());
}
future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& role) const {
auto rec = make_lw_shared<role_record>();
rec->version = _current_version;
@@ -191,7 +105,7 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
future<> cache::prune_all() noexcept {
for (auto it = _roles.begin(); it != _roles.end(); ) {
if (it->second->version != _current_version) {
remove_role(it++);
_roles.erase(it++);
co_await coroutine::maybe_yield();
} else {
++it;
@@ -215,7 +129,7 @@ future<> cache::load_all() {
const auto name = r.get_as<sstring>("role");
auto role = co_await fetch_role(name);
if (role) {
add_role(name, role);
_roles[name] = role;
}
co_return stop_iteration::no;
};
@@ -233,26 +147,6 @@ future<> cache::load_all() {
});
}
// Collects into `roles` the names of all members of `role`, followed
// transitively through the members of those members as found in _roles.
// A member name already present in `roles` is not visited again, which also
// stops the recursion on cycles. A null `role` is a no-op.
//
// `name` is not read by the body shown here.
future<> cache::gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name) {
if (!role) {
// Role might have been removed or not yet added, either way
// their members will be handled by another top call to this function.
co_return;
}
for (const auto& member_name : role->members) {
bool is_new = roles.insert(member_name).second;
if (!is_new) {
continue;
}
// Look the member up; it stays null when it is not in _roles, in which
// case the recursive call returns immediately.
lw_shared_ptr<cache::role_record> member_role;
auto r = _roles.find(member_name);
if (r != _roles.end()) {
member_role = r->second;
}
co_await gather_inheriting_roles(roles, member_role, member_name);
}
}
future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
if (legacy_mode(_qp)) {
co_return;
@@ -260,40 +154,27 @@ future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
SCYLLA_ASSERT(this_shard_id() == 0);
auto units = co_await get_units(_loading_sem, 1, _as);
std::unordered_set<role_name_t> roles_to_clear_perms;
for (const auto& name : roles) {
logger.info("Loading role {}", name);
auto role = co_await fetch_role(name);
if (role) {
add_role(name, role);
co_await gather_inheriting_roles(roles_to_clear_perms, role, name);
_roles[name] = role;
} else {
if (auto it = _roles.find(name); it != _roles.end()) {
auto old_role = it->second;
remove_role(it);
co_await gather_inheriting_roles(roles_to_clear_perms, old_role, name);
}
_roles.erase(name);
}
co_await distribute_role(name, role);
}
co_await container().invoke_on_all([&roles_to_clear_perms] (cache& c) -> future<> {
for (const auto& name : roles_to_clear_perms) {
c.clear_role_permissions(name);
co_await coroutine::maybe_yield();
}
});
}
future<> cache::distribute_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
auto role_ptr = role.get();
co_await container().invoke_on_others([&name, role_ptr](cache& c) {
if (!role_ptr) {
c.remove_role(name);
c._roles.erase(name);
return;
}
auto role_copy = make_lw_shared<role_record>(*role_ptr);
c.add_role(name, std::move(role_copy));
c._roles[name] = std::move(role_copy);
});
}
@@ -304,40 +185,4 @@ bool cache::includes_table(const table_id& id) noexcept {
|| id == db::system_keyspace::role_permissions()->id();
}
// Stores `role` under `name`, replacing any record already registered for
// that name. _cached_permissions_count is adjusted by subtracting the
// replaced record's cached entries (if any) and adding the new record's.
void cache::add_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
    auto existing = _roles.find(name);
    if (existing != _roles.end()) {
        // The record being replaced stops contributing to the total.
        _cached_permissions_count -= existing->second->cached_permissions.size();
    }
    _cached_permissions_count += role->cached_permissions.size();
    _roles[name] = std::move(role);
}
// Removes the role registered under `name`, if there is one; otherwise does
// nothing. The actual removal is done by the iterator overload.
void cache::remove_role(const role_name_t& name) {
    auto pos = _roles.find(name);
    if (pos == _roles.end()) {
        return;
    }
    remove_role(pos);
}
// Erases the role that `it` points at and subtracts its cached permission
// entries from _cached_permissions_count. `it` must be a valid iterator
// into _roles.
void cache::remove_role(roles_map::iterator it) {
    // Read the size first: the record is not reachable through `it` once erased.
    const auto dropped = it->second->cached_permissions.size();
    _roles.erase(it);
    _cached_permissions_count -= dropped;
}
// Empties the cached permissions of the role registered under `name` and
// subtracts the removed entries from _cached_permissions_count. Does nothing
// when no such role is stored.
void cache::clear_role_permissions(const role_name_t& name) {
    auto pos = _roles.find(name);
    if (pos == _roles.end()) {
        return;
    }
    auto& cached = pos->second->cached_permissions;
    _cached_permissions_count -= cached.size();
    cached.clear();
}
// Inserts (`r`, `perms`) into the permissions map `cache`. An existing entry
// for `r` is left as it is; _cached_permissions_count is incremented only
// when a new entry was actually inserted.
void cache::add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms) {
    const bool inserted = cache.emplace(r, perms).second;
    if (inserted) {
        ++_cached_permissions_count;
    }
}
// Erases the entry for `r` from the permissions map `cache` and lowers
// _cached_permissions_count by the number of entries erased (0 or 1).
void cache::remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r) {
    const auto erased = cache.erase(r);
    _cached_permissions_count -= erased;
}
} // namespace auth

View File

@@ -17,14 +17,11 @@
#include <seastar/core/sharded.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/core/metrics_registration.hh>
#include <absl/container/flat_hash_map.h>
#include "auth/permission.hh"
#include "auth/common.hh"
#include "auth/resource.hh"
#include "auth/role_or_anonymous.hh"
namespace cql3 { class query_processor; }
@@ -34,7 +31,6 @@ class cache : public peering_sharded_service<cache> {
public:
using role_name_t = sstring;
using version_tag_t = char;
using permission_loader_func = std::function<future<permission_set>(const role_or_anonymous&, const resource&)>;
struct role_record {
bool can_login = false;
@@ -44,19 +40,11 @@ public:
sstring salted_hash;
std::unordered_map<sstring, sstring> attributes;
std::unordered_map<sstring, permission_set> permissions;
private:
friend cache;
// cached permissions include effects of role's inheritance
std::unordered_map<resource, permission_set> cached_permissions;
version_tag_t version; // used for seamless cache reloads
};
explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
void set_permission_loader(permission_loader_func loader);
future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
future<> prune(const resource& r);
future<> reload_all_permissions() noexcept;
future<> load_all();
future<> load_roles(std::unordered_set<role_name_t> roles);
static bool includes_table(const table_id&) noexcept;
@@ -64,31 +52,14 @@ public:
private:
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
roles_map _roles;
// anonymous permissions map exists mainly due to compatibility with
// higher layers which use role_or_anonymous to get permissions.
std::unordered_map<resource, permission_set> _anonymous_permissions;
version_tag_t _current_version;
cql3::query_processor& _qp;
semaphore _loading_sem; // protects iteration of _roles map
semaphore _loading_sem;
abort_source& _as;
permission_loader_func _permission_loader;
semaphore _permission_loader_sem; // protects against reload storms on a single role change
metrics::metric_groups _metrics;
size_t _cached_permissions_count = 0;
future<lw_shared_ptr<role_record>> fetch_role(const role_name_t& role) const;
future<> prune_all() noexcept;
future<> distribute_role(const role_name_t& name, const lw_shared_ptr<role_record> role);
future<> gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name);
void add_role(const role_name_t& name, lw_shared_ptr<role_record> role);
void remove_role(const role_name_t& name);
void remove_role(roles_map::iterator it);
void clear_role_permissions(const role_name_t& name);
void add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms);
void remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r);
future<permission_set> load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache);
};
} // namespace auth

View File

@@ -88,16 +88,10 @@ static const class_registrator<
ldap_role_manager::ldap_role_manager(
std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
uint32_t permissions_update_interval_in_ms,
utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
: _std_mgr(qp, rg0c, mm, cache), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
, _bind_password(bind_password)
, _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
, _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
, _cache(cache)
, _cache_pruner(make_ready_future<>()) {
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this))) {
}
ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
@@ -106,8 +100,6 @@ ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_
qp.db().get_config().ldap_attr_role(),
qp.db().get_config().ldap_bind_dn(),
qp.db().get_config().ldap_bind_passwd(),
qp.db().get_config().permissions_update_interval_in_ms(),
qp.db().get_config().permissions_update_interval_in_ms.observe([this] (const uint32_t& v) { _permissions_update_interval_in_ms = v; }),
qp,
rg0c,
mm,
@@ -127,22 +119,6 @@ future<> ldap_role_manager::start() {
return make_exception_future(
std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
}
_cache_pruner = futurize_invoke([this] () -> future<> {
while (true) {
try {
co_await seastar::sleep_abortable(std::chrono::milliseconds(_permissions_update_interval_in_ms), _as);
} catch (const seastar::sleep_aborted&) {
co_return; // ignore
}
co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
try {
co_await c.reload_all_permissions();
} catch (...) {
mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
}
});
}
});
return _std_mgr.start();
}
@@ -199,11 +175,7 @@ future<conn_ptr> ldap_role_manager::reconnect() {
future<> ldap_role_manager::stop() {
_as.request_abort();
return std::move(_cache_pruner).then([this] {
return _std_mgr.stop();
}).then([this] {
return _connection_factory.stop();
});
return _std_mgr.stop().then([this] { return _connection_factory.stop(); });
}
future<> ldap_role_manager::create(std::string_view name, const role_config& config, ::service::group0_batch& mc) {

View File

@@ -10,7 +10,6 @@
#pragma once
#include <seastar/core/abort_source.hh>
#include <seastar/core/future.hh>
#include <stdexcept>
#include "ent/ldap/ldap_connection.hh"
@@ -35,22 +34,14 @@ class ldap_role_manager : public role_manager {
seastar::sstring _target_attr; ///< LDAP entry attribute containing the Scylla role name.
seastar::sstring _bind_name; ///< Username for LDAP simple bind.
seastar::sstring _bind_password; ///< Password for LDAP simple bind.
uint32_t _permissions_update_interval_in_ms;
utils::observer<uint32_t> _permissions_update_interval_in_ms_observer;
mutable ldap_reuser _connection_factory; // Potentially modified by query_granted().
seastar::abort_source _as;
cache& _cache;
seastar::future<> _cache_pruner;
public:
ldap_role_manager(
std::string_view query_template, ///< LDAP query template as described in Scylla documentation.
std::string_view target_attr, ///< LDAP entry attribute containing the Scylla role name.
std::string_view bind_name, ///< LDAP bind credentials.
std::string_view bind_password, ///< LDAP bind credentials.
uint32_t permissions_update_interval_in_ms,
utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
cql3::query_processor& qp, ///< Passed to standard_role_manager.
::service::raft_group0_client& rg0c, ///< Passed to standard_role_manager.
::service::migration_manager& mm, ///< Passed to standard_role_manager.

38
auth/permissions_cache.cc Normal file
View File

@@ -0,0 +1,38 @@
/*
* Copyright (C) 2017-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "auth/permissions_cache.hh"
#include <fmt/ranges.h>
#include "auth/authorizer.hh"
#include "auth/service.hh"
namespace auth {
// Builds the underlying loading cache with configuration `c`.
// The loader installed here logs the role being refreshed and then asks the
// auth service for the uncached permissions of the (role, resource) key.
// `ser` and `log` are captured by reference by the loader.
permissions_cache::permissions_cache(const utils::loading_cache_config& c, service& ser, logging::logger& log)
        : _cache(c, log, [&ser, &log](const key_type& k) {
            const auto& [role, res] = k;
            log.debug("Refreshing permissions for {}", role);
            return ser.get_uncached_permissions(role, res);
        }) {
}
// Hands the new configuration to the underlying loading cache and reports
// back whatever the cache returned for it.
bool permissions_cache::update_config(utils::loading_cache_config c) {
    const bool applied = _cache.update_config(std::move(c));
    return applied;
}
// Resets the underlying loading cache by forwarding to its reset().
void permissions_cache::reset() {
    _cache.reset();
}
// Looks up the permissions for (maybe_role, r) in the underlying cache.
// The key is built from copies of the arguments and kept alive with do_with()
// until the future returned by the cache lookup resolves.
future<permission_set> permissions_cache::get(const role_or_anonymous& maybe_role, const resource& r) {
    key_type lookup_key(maybe_role, r);
    return do_with(std::move(lookup_key), [this](const key_type& held_key) {
        return _cache.get(held_key);
    });
}
}

66
auth/permissions_cache.hh Normal file
View File

@@ -0,0 +1,66 @@
/*
* Copyright (C) 2017-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <iostream>
#include <utility>
#include <fmt/core.h>
#include <seastar/core/future.hh>
#include "auth/permission.hh"
#include "auth/resource.hh"
#include "auth/role_or_anonymous.hh"
#include "utils/log.hh"
#include "utils/hash.hh"
#include "utils/loading_cache.hh"
namespace std {
// Writes a (role, resource) pair to `os` as "{role: <role>, resource: <resource>}"
// and returns the stream.
// NOTE(review): this declares a new function overload inside namespace std for
// a std::pair instantiation; presumably it is here so the permissions-cache key
// type is printable -- confirm it is still needed and whether it could live
// outside namespace std.
inline std::ostream& operator<<(std::ostream& os, const pair<auth::role_or_anonymous, auth::resource>& p) {
    fmt::print(os, "{{role: {}, resource: {}}}", p.first, p.second);
    return os;
}
}
namespace db {
class config;
}
namespace auth {
class service;
// Cache of permission sets keyed by (role-or-anonymous, resource).
// This is a thin wrapper over utils::loading_cache: every member function
// forwards to the wrapped cache instance.
class permissions_cache final {
    // Key: (role_or_anonymous, resource) pair; value: permission_set.
    // NOTE(review): the meaning of the literal `1` and of the remaining policy
    // arguments is defined by utils::loading_cache -- confirm there.
    using cache_type = utils::loading_cache<
            std::pair<role_or_anonymous, resource>,
            permission_set,
            1,
            utils::loading_cache_reload_enabled::yes,
            utils::simple_entry_size<permission_set>,
            utils::tuple_hash>;
    using key_type = typename cache_type::key_type;
    cache_type _cache;
public:
    // Creates the cache with the given configuration; the service and logger
    // are used by the loader that fetches values for missing keys.
    explicit permissions_cache(const utils::loading_cache_config&, service&, logging::logger&);
    // Stops the underlying cache; the returned future is the one produced by
    // the wrapped cache's stop().
    future <> stop() {
        return _cache.stop();
    }
    // Applies a new configuration to the underlying cache and returns its result.
    bool update_config(utils::loading_cache_config);
    // Resets the underlying cache.
    void reset();
    // Returns the permission set cached for the given role and resource.
    future<permission_set> get(const role_or_anonymous&, const resource&);
};
}

View File

@@ -64,11 +64,11 @@ static const sstring superuser_col_name("super");
static logging::logger log("auth_service");
class auth_migration_listener final : public ::service::migration_listener {
service& _service;
authorizer& _authorizer;
cql3::query_processor& _qp;
public:
explicit auth_migration_listener(service& s, cql3::query_processor& qp) : _service(s), _qp(qp) {
explicit auth_migration_listener(authorizer& a, cql3::query_processor& qp) : _authorizer(a), _qp(qp) {
}
private:
@@ -92,14 +92,14 @@ private:
return;
}
// Do it in the background.
(void)do_with(auth::make_data_resource(ks_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
return _service.revoke_all(r, mc);
(void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
return _authorizer.revoke_all(auth::make_data_resource(ks_name), mc);
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on dropped keyspace: {}", e);
});
(void)do_with(auth::make_functions_resource(ks_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
return _service.revoke_all(r, mc);
(void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
return _authorizer.revoke_all(auth::make_functions_resource(ks_name), mc);
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on functions in dropped keyspace: {}", e);
});
@@ -111,8 +111,9 @@ private:
return;
}
// Do it in the background.
(void)do_with(auth::make_data_resource(ks_name, cf_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
return _service.revoke_all(r, mc);
(void)do_with(::service::group0_batch::unused(), [this, &ks_name, &cf_name] (auto& mc) mutable {
return _authorizer.revoke_all(
auth::make_data_resource(ks_name, cf_name), mc);
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on dropped table: {}", e);
});
@@ -125,8 +126,9 @@ private:
return;
}
// Do it in the background.
(void)do_with(auth::make_functions_resource(ks_name, function_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
return _service.revoke_all(r, mc);
(void)do_with(::service::group0_batch::unused(), [this, &ks_name, &function_name] (auto& mc) mutable {
return _authorizer.revoke_all(
auth::make_functions_resource(ks_name, function_name), mc);
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on dropped function: {}", e);
});
@@ -136,8 +138,9 @@ private:
// in non legacy path revoke is part of schema change statement execution
return;
}
(void)do_with(auth::make_functions_resource(ks_name, aggregate_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
return _service.revoke_all(r, mc);
(void)do_with(::service::group0_batch::unused(), [this, &ks_name, &aggregate_name] (auto& mc) mutable {
return _authorizer.revoke_all(
auth::make_functions_resource(ks_name, aggregate_name), mc);
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on dropped aggregate: {}", e);
});
@@ -154,6 +157,7 @@ static future<> validate_role_exists(const service& ser, std::string_view role_n
}
service::service(
utils::loading_cache_config c,
cache& cache,
cql3::query_processor& qp,
::service::raft_group0_client& g0,
@@ -162,17 +166,25 @@ service::service(
std::unique_ptr<authenticator> a,
std::unique_ptr<role_manager> r,
maintenance_socket_enabled used_by_maintenance_socket)
: _cache(cache)
: _loading_cache_config(std::move(c))
, _permissions_cache(nullptr)
, _cache(cache)
, _qp(qp)
, _group0_client(g0)
, _mnotifier(mn)
, _authorizer(std::move(z))
, _authenticator(std::move(a))
, _role_manager(std::move(r))
, _migration_listener(std::make_unique<auth_migration_listener>(*this, qp))
, _migration_listener(std::make_unique<auth_migration_listener>(*_authorizer, qp))
, _permissions_cache_cfg_cb([this] (uint32_t) { (void) _permissions_cache_config_action.trigger_later(); })
, _permissions_cache_config_action([this] { update_cache_config(); return make_ready_future<>(); })
, _permissions_cache_max_entries_observer(_qp.db().get_config().permissions_cache_max_entries.observe(_permissions_cache_cfg_cb))
, _permissions_cache_update_interval_in_ms_observer(_qp.db().get_config().permissions_update_interval_in_ms.observe(_permissions_cache_cfg_cb))
, _permissions_cache_validity_in_ms_observer(_qp.db().get_config().permissions_validity_in_ms.observe(_permissions_cache_cfg_cb))
, _used_by_maintenance_socket(used_by_maintenance_socket) {}
service::service(
utils::loading_cache_config c,
cql3::query_processor& qp,
::service::raft_group0_client& g0,
::service::migration_notifier& mn,
@@ -181,6 +193,7 @@ service::service(
maintenance_socket_enabled used_by_maintenance_socket,
cache& cache)
: service(
std::move(c),
cache,
qp,
g0,
@@ -244,14 +257,7 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
co_await _role_manager->ensure_superuser_is_created();
}
co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
if (!_used_by_maintenance_socket) {
// Maintenance socket mode can't cache permissions because it has
// different authorizer. We can't mix cached permissions, they could be
// different in normal mode.
_cache.set_permission_loader(std::bind(
&service::get_uncached_permissions,
this, std::placeholders::_1, std::placeholders::_2));
}
_permissions_cache = std::make_unique<permissions_cache>(_loading_cache_config, *this, log);
co_await once_among_shards([this] {
_mnotifier.register_listener(_migration_listener.get());
return make_ready_future<>();
@@ -263,7 +269,9 @@ future<> service::stop() {
// Only one of the shards has the listener registered, but let's try to
// unregister on each one just to make sure.
return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
_cache.set_permission_loader(nullptr);
if (_permissions_cache) {
return _permissions_cache->stop();
}
return make_ready_future<>();
}).then([this] {
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
@@ -275,8 +283,21 @@ future<> service::ensure_superuser_is_created() {
co_await _authenticator->ensure_superuser_is_created();
}
// Rebuilds the permissions-cache configuration from the current values of
// permissions_cache_max_entries, permissions_validity_in_ms and
// permissions_update_interval_in_ms and applies it to the permissions cache.
// An error is logged if the cache rejects the new configuration.
//
// The config observers that trigger this function are registered in the
// constructor, whereas _permissions_cache is only created in start(). A config
// change arriving in between finds no cache yet, so the update is skipped
// instead of dereferencing a null pointer.
void service::update_cache_config() {
    if (!_permissions_cache) {
        log.debug("Permissions cache is not created yet, ignoring configuration update");
        return;
    }
    auto db = _qp.db();
    const auto& cfg = db.get_config();
    utils::loading_cache_config perm_cache_config;
    perm_cache_config.max_size = cfg.permissions_cache_max_entries();
    perm_cache_config.expiry = std::chrono::milliseconds(cfg.permissions_validity_in_ms());
    perm_cache_config.refresh = std::chrono::milliseconds(cfg.permissions_update_interval_in_ms());
    if (!_permissions_cache->update_config(std::move(perm_cache_config))) {
        log.error("Failed to apply permissions cache changes. Please read the documentation of these parameters");
    }
}
// Drops cached authorization state: resets the permissions cache (when it
// exists) and then resets the query processor's cache.
//
// _permissions_cache is null until start() has created it, so it is checked
// before use; the query processor cache is reset unconditionally.
void service::reset_authorization_cache() {
    if (_permissions_cache) {
        _permissions_cache->reset();
    }
    _qp.reset_cache();
}
@@ -301,10 +322,7 @@ service::get_uncached_permissions(const role_or_anonymous& maybe_role, const res
}
future<permission_set> service::get_permissions(const role_or_anonymous& maybe_role, const resource& r) const {
if (legacy_mode(_qp) || _used_by_maintenance_socket) {
return get_uncached_permissions(maybe_role, r);
}
return _cache.get_permissions(maybe_role, r);
return _permissions_cache->get(maybe_role, r);
}
future<bool> service::has_superuser(std::string_view role_name, const role_set& roles) const {
@@ -429,11 +447,6 @@ future<bool> service::exists(const resource& r) const {
return make_ready_future<bool>(false);
}
// Revokes all permissions on resource `r` through the underlying authorizer
// (passing the group0 batch `mc` along), then prunes `r` from the auth cache.
// The prune step is reached only if the authorizer's revoke_all completed
// without throwing.
future<> service::revoke_all(const resource& r, ::service::group0_batch& mc) const {
    co_await _authorizer->revoke_all(r, mc);
    co_await _cache.prune(r);
}
future<std::vector<cql3::description>> service::describe_roles(bool with_hashed_passwords) {
std::vector<cql3::description> result{};
@@ -788,7 +801,7 @@ future<> revoke_permissions(
}
future<> revoke_all(const service& ser, const resource& r, ::service::group0_batch& mc) {
return ser.revoke_all(r, mc);
return ser.underlying_authorizer().revoke_all(r, mc);
}
future<std::vector<permission_details>> list_filtered_permissions(

View File

@@ -20,6 +20,7 @@
#include "auth/authenticator.hh"
#include "auth/authorizer.hh"
#include "auth/permission.hh"
#include "auth/permissions_cache.hh"
#include "auth/cache.hh"
#include "auth/role_manager.hh"
#include "auth/common.hh"
@@ -74,6 +75,8 @@ public:
/// peering_sharded_service inheritance is needed to be able to access shard local authentication service
/// given an object from another shard. Used for bouncing lwt requests to correct shard.
class service final : public seastar::peering_sharded_service<service> {
utils::loading_cache_config _loading_cache_config;
std::unique_ptr<permissions_cache> _permissions_cache;
cache& _cache;
cql3::query_processor& _qp;
@@ -91,12 +94,20 @@ class service final : public seastar::peering_sharded_service<service> {
// Only one of these should be registered, so we end up with some unused instances. Not the end of the world.
std::unique_ptr<::service::migration_listener> _migration_listener;
std::function<void(uint32_t)> _permissions_cache_cfg_cb;
serialized_action _permissions_cache_config_action;
utils::observer<uint32_t> _permissions_cache_max_entries_observer;
utils::observer<uint32_t> _permissions_cache_update_interval_in_ms_observer;
utils::observer<uint32_t> _permissions_cache_validity_in_ms_observer;
maintenance_socket_enabled _used_by_maintenance_socket;
abort_source _as;
public:
service(
utils::loading_cache_config,
cache& cache,
cql3::query_processor&,
::service::raft_group0_client&,
@@ -112,6 +123,7 @@ public:
/// of the instances themselves.
///
service(
utils::loading_cache_config,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_notifier&,
@@ -126,6 +138,8 @@ public:
future<> ensure_superuser_is_created();
void update_cache_config();
void reset_authorization_cache();
///
@@ -167,13 +181,6 @@ public:
future<bool> exists(const resource&) const;
///
/// Revoke all permissions granted to any role for a particular resource.
///
/// \throws \ref unsupported_authorization_operation if revoking permissions is not supported.
///
future<> revoke_all(const resource&, ::service::group0_batch&) const;
///
/// Produces descriptions that can be used to restore the state of auth. That encompasses
/// roles, role grants, and permission grants.

View File

@@ -814,7 +814,8 @@ generation_service::generation_service(
config cfg, gms::gossiper& g, sharded<db::system_distributed_keyspace>& sys_dist_ks,
sharded<db::system_keyspace>& sys_ks,
abort_source& abort_src, const locator::shared_token_metadata& stm, gms::feature_service& f,
replica::database& db)
replica::database& db,
std::function<bool()> raft_topology_change_enabled)
: _cfg(std::move(cfg))
, _gossiper(g)
, _sys_dist_ks(sys_dist_ks)
@@ -823,6 +824,7 @@ generation_service::generation_service(
, _token_metadata(stm)
, _feature_service(f)
, _db(db)
, _raft_topology_change_enabled(std::move(raft_topology_change_enabled))
{
}
@@ -876,7 +878,16 @@ future<> generation_service::on_join(gms::inet_address ep, locator::host_id id,
future<> generation_service::on_change(gms::inet_address ep, locator::host_id id, const gms::application_state_map& states, gms::permit_id pid) {
assert_shard_zero(__PRETTY_FUNCTION__);
return make_ready_future<>();
if (_raft_topology_change_enabled()) {
return make_ready_future<>();
}
return on_application_state_change(ep, id, states, gms::application_state::CDC_GENERATION_ID, pid, [this] (gms::inet_address ep, locator::host_id id, const gms::versioned_value& v, gms::permit_id) {
auto gen_id = gms::versioned_value::cdc_generation_id_from_string(v.value());
cdc_log.debug("Endpoint: {}, CDC generation ID change: {}", ep, gen_id);
return legacy_handle_cdc_generation(gen_id);
});
}
future<> generation_service::check_and_repair_cdc_streams() {

View File

@@ -79,12 +79,17 @@ private:
std::optional<cdc::generation_id> _gen_id;
future<> _cdc_streams_rewrite_complete = make_ready_future<>();
/* Returns true if raft topology changes are enabled.
* Can only be called from shard 0.
*/
std::function<bool()> _raft_topology_change_enabled;
public:
generation_service(config cfg, gms::gossiper&,
sharded<db::system_distributed_keyspace>&,
sharded<db::system_keyspace>& sys_ks,
abort_source&, const locator::shared_token_metadata&,
gms::feature_service&, replica::database& db);
gms::feature_service&, replica::database& db,
std::function<bool()> raft_topology_change_enabled);
future<> stop();
~generation_service();

View File

@@ -299,11 +299,13 @@ batch_size_fail_threshold_in_kb: 1024
# max_hint_window_in_ms: 10800000 # 3 hours
# Validity period for authorized statements cache. Defaults to 10000, set to 0 to disable.
# Validity period for permissions cache (fetching permissions can be an
# expensive operation depending on the authorizer, CassandraAuthorizer is
# one example). Defaults to 10000, set to 0 to disable.
# Will be disabled automatically for AllowAllAuthorizer.
# permissions_validity_in_ms: 10000
# Refresh interval for authorized statements cache.
# Refresh interval for permissions cache (if enabled).
# After this interval, cache entries become eligible for refresh. Upon next
# access, an async reload is scheduled and the old value returned until it
# completes. If permissions_validity_in_ms is non-zero, then this also must have
@@ -564,16 +566,15 @@ commitlog_total_space_in_mb: -1
# prometheus_address: 1.2.3.4
# audit settings
# Table audit is enabled by default.
# By default, Scylla does not audit anything.
# 'audit' config option controls if and where to output audited events:
# - "none": auditing is disabled
# - "table": save audited events in audit.audit_log column family (default)
# - "none": auditing is disabled (default)
# - "table": save audited events in audit.audit_log column family
# - "syslog": send audited events via syslog (depends on OS, but usually to /dev/log)
audit: "table"
#
# List of statement categories that should be audited.
# Possible categories are: QUERY, DML, DCL, DDL, AUTH, ADMIN
audit_categories: "DCL,AUTH,ADMIN"
audit_categories: "DCL,DDL,AUTH,ADMIN"
#
# List of tables that should be audited.
# audit_tables: "<keyspace_name>.<table_name>,<keyspace_name>.<table_name>"

View File

@@ -730,6 +730,28 @@ vector_search_tests = set([
'test/vector_search/rescoring_test'
])
vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
vector_search_validator_deps = set([
'test/vector_search_validator/build-validator',
'test/vector_search_validator/Cargo.toml',
'test/vector_search_validator/crates/validator/Cargo.toml',
'test/vector_search_validator/crates/validator/src/main.rs',
'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
])
vector_store_bin = 'vector-search-validator/bin/vector-store'
vector_store_deps = set([
'test/vector_search_validator/build-env',
'test/vector_search_validator/build-vector-store',
])
vector_search_validator_bins = set([
vector_search_validator_bin,
vector_store_bin,
])
wasms = set([
'wasm/return_input.wat',
'wasm/test_complex_null_values.wat',
@@ -763,7 +785,7 @@ other = set([
'iotune',
])
all_artifacts = apps | cpp_apps | tests | other | wasms
all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins
arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -1174,7 +1196,6 @@ scylla_core = (['message/messaging_service.cc',
'utils/gz/crc_combine.cc',
'utils/gz/crc_combine_table.cc',
'utils/http.cc',
'utils/http_client_error_processing.cc',
'utils/rest/client.cc',
'utils/s3/aws_error.cc',
'utils/s3/client.cc',
@@ -1276,6 +1297,7 @@ scylla_core = (['message/messaging_service.cc',
'auth/passwords.cc',
'auth/password_authenticator.cc',
'auth/permission.cc',
'auth/permissions_cache.cc',
'auth/service.cc',
'auth/standard_role_manager.cc',
'auth/ldap_role_manager.cc',
@@ -1645,7 +1667,6 @@ for t in sorted(perf_tests):
deps['test/boost/combined_tests'] += [
'test/boost/aggregate_fcts_test.cc',
'test/boost/auth_cache_test.cc',
'test/boost/auth_test.cc',
'test/boost/batchlog_manager_test.cc',
'test/boost/cache_algorithm_test.cc',
@@ -2564,10 +2585,11 @@ def write_build_file(f,
description = RUST_LIB $out
''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
f.write(
'build {mode}-build: phony {artifacts} {wasms}\n'.format(
'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
mode=mode,
artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
)
)
if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2597,7 +2619,7 @@ def write_build_file(f,
continue
profile_dep = modes[mode].get('profile_target', "")
if binary in other or binary in wasms:
if binary in other or binary in wasms or binary in vector_search_validator_bins:
continue
srcs = deps[binary]
# 'scylla'
@@ -2708,10 +2730,11 @@ def write_build_file(f,
)
f.write(
'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
mode=mode,
test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
)
)
f.write(
@@ -2879,6 +2902,19 @@ def write_build_file(f,
'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
)
f.write(textwrap.dedent(f'''\
rule build-vector-search-validator
command = test/vector_search_validator/build-validator $builddir
rule build-vector-store
command = test/vector_search_validator/build-vector-store $builddir
'''))
f.write(
'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
)
f.write(
'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
)
f.write(textwrap.dedent(f'''\
build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
build dist-unified: phony dist-unified-tar

View File

@@ -389,10 +389,8 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
bool is_ann_ordering = false;
}
: K_SELECT (
( (K_JSON K_DISTINCT)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
| (K_JSON selectClause K_FROM)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
)?
( (K_DISTINCT selectClause K_FROM)=> K_DISTINCT { is_distinct = true; } )?
( K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; } )?
( K_DISTINCT { is_distinct = true; } )?
sclause=selectClause
)
K_FROM (
@@ -427,13 +425,13 @@ selector returns [shared_ptr<raw_selector> s]
unaliasedSelector returns [uexpression tmp]
: ( c=cident { tmp = unresolved_identifier{std::move(c)}; }
| v=value { tmp = std::move(v); }
| K_COUNT '(' countArgument ')' { tmp = make_count_rows_function_expression(); }
| K_WRITETIME '(' c=cident ')' { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::writetime,
unresolved_identifier{std::move(c)}}; }
| K_TTL '(' c=cident ')' { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::ttl,
unresolved_identifier{std::move(c)}}; }
| f=functionName args=selectionFunctionArgs { tmp = function_call{std::move(f), std::move(args)}; }
| f=similarityFunctionName args=vectorSimilarityArgs { tmp = function_call{std::move(f), std::move(args)}; }
| K_CAST '(' arg=unaliasedSelector K_AS t=native_type ')' { tmp = cast{.style = cast::cast_style::sql, .arg = std::move(arg), .type = std::move(t)}; }
)
( '.' fi=cident { tmp = field_selection{std::move(tmp), std::move(fi)}; }
@@ -448,9 +446,23 @@ selectionFunctionArgs returns [std::vector<expression> a]
')'
;
vectorSimilarityArgs returns [std::vector<expression> a]
: '(' ')'
| '(' v1=vectorSimilarityArg { a.push_back(std::move(v1)); }
( ',' vn=vectorSimilarityArg { a.push_back(std::move(vn)); } )*
')'
;
vectorSimilarityArg returns [uexpression a]
: s=unaliasedSelector { a = std::move(s); }
| v=value { a = std::move(v); }
;
countArgument
: '*'
/* COUNT(1) is also allowed, it is recognized via the general function(args) path */
| i=INTEGER { if (i->getText() != "1") {
add_recognition_error("Only COUNT(1) is supported, got COUNT(" + i->getText() + ")");
} }
;
whereClause returns [uexpression clause]
@@ -1694,6 +1706,10 @@ functionName returns [cql3::functions::function_name s]
: (ks=keyspaceName '.')? f=allowedFunctionName { $s.keyspace = std::move(ks); $s.name = std::move(f); }
;
similarityFunctionName returns [cql3::functions::function_name s]
: f=allowedSimilarityFunctionName { $s = cql3::functions::function_name::native_function(std::move(f)); }
;
allowedFunctionName returns [sstring s]
: f=IDENT { $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
| f=QUOTED_NAME { $s = $f.text; }
@@ -1702,6 +1718,11 @@ allowedFunctionName returns [sstring s]
| K_COUNT { $s = "count"; }
;
allowedSimilarityFunctionName returns [sstring s]
: f=(K_SIMILARITY_COSINE | K_SIMILARITY_EUCLIDEAN | K_SIMILARITY_DOT_PRODUCT)
{ $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
;
functionArgs returns [std::vector<expression> a]
: '(' ')'
| '(' t1=term { a.push_back(std::move(t1)); }
@@ -2398,6 +2419,10 @@ K_MUTATION_FRAGMENTS: M U T A T I O N '_' F R A G M E N T S;
K_VECTOR_SEARCH_INDEXING: V E C T O R '_' S E A R C H '_' I N D E X I N G;
K_SIMILARITY_EUCLIDEAN: S I M I L A R I T Y '_' E U C L I D E A N;
K_SIMILARITY_COSINE: S I M I L A R I T Y '_' C O S I N E;
K_SIMILARITY_DOT_PRODUCT: S I M I L A R I T Y '_' D O T '_' P R O D U C T;
// Case-insensitive alpha characters
fragment A: ('a'|'A');
fragment B: ('b'|'B');

View File

@@ -10,7 +10,6 @@
#include "expr-utils.hh"
#include "evaluate.hh"
#include "cql3/functions/functions.hh"
#include "cql3/functions/aggregate_fcts.hh"
#include "cql3/functions/castas_fcts.hh"
#include "cql3/functions/scalar_function.hh"
#include "cql3/column_identifier.hh"
@@ -1048,47 +1047,8 @@ prepare_function_args_for_type_inference(std::span<const expression> args, data_
return partially_prepared_args;
}
// Special case for count(1) - recognize it as the countRows() function. Note it is quite
// artificial and we might relax it to the more general count(expression) later.
//
// Returns the prepared countRows() call when `fc` is an unprepared call to the
// native count() with the single untyped integer literal 1 as its argument.
// Returns std::nullopt when `fc` is already prepared, is not count(), does not
// have exactly one argument, or its argument is not an untyped constant.
// Throws invalid_request_exception when count() is given any other untyped
// constant.
//
// db, keyspace, schema_opt and receiver are unused here.
static
std::optional<expression>
try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
    return std::visit(overloaded_functor{
        [&] (const functions::function_name& name) -> std::optional<expression> {
            // An unqualified name is looked up as a native function.
            auto native_name = name;
            if (!native_name.has_keyspace()) {
                native_name = name.as_native_function();
            }
            const bool is_count = native_name == functions::function_name::native_function("count");
            if (!is_count || fc.args.size() != 1) {
                return std::nullopt;
            }
            auto uc_arg = expr::as_if<expr::untyped_constant>(&fc.args[0]);
            if (!uc_arg) {
                // Not a literal: leave it to the general function(args) path.
                return std::nullopt;
            }
            const bool is_literal_one = uc_arg->partial_type == expr::untyped_constant::type_class::integer
                    && uc_arg->raw_text == "1";
            if (!is_literal_one) {
                // The message has no placeholders, so it is passed as-is rather than through format().
                throw exceptions::invalid_request_exception("count() expects a column or the literal 1 as an argument");
            }
            // Collapse count(1) into countRows()
            return expr::function_call{
                .func = functions::aggregate_fcts::make_count_rows_function(),
                .args = {},
            };
        },
        [] (const shared_ptr<functions::function>&) -> std::optional<expression> {
            // Already prepared, nothing to do
            return std::nullopt;
        },
    }, fc.func);
}
std::optional<expression>
prepare_function_call(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
if (auto prepared = try_prepare_count_rows(fc, db, keyspace, schema_opt, receiver)) {
return prepared;
}
// Try to extract a column family name from the available information.
// Most functions can be prepared without information about the column family, usually just the keyspace is enough.
// One exception is the token() function - in order to prepare system.token() we have to know the partition key of the table,

View File

@@ -10,41 +10,9 @@
#include "types/types.hh"
#include "types/vector.hh"
#include "exceptions/exceptions.hh"
#include <span>
#include <bit>
namespace cql3 {
namespace functions {
namespace detail {
// Decodes a serialized float vector of `dimension` elements into host floats.
// Throws invalid_request_exception when `param` is null or when its byte length
// is not exactly dimension * sizeof(float).
std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension) {
    if (!param) {
        throw exceptions::invalid_request_exception("Cannot extract float vector from null parameter");
    }

    const size_t wanted_bytes = sizeof(float) * dimension;
    if (wanted_bytes != param->size()) {
        throw exceptions::invalid_request_exception(
                fmt::format("Invalid vector size: expected {} bytes for {} floats, got {} bytes",
                        wanted_bytes, dimension, param->size()));
    }

    std::vector<float> floats(dimension);
    bytes_view remaining(*param);
    for (float& slot : floats) {
        // read_simple takes care of the network byte order (big-endian) conversion;
        // the resulting 32-bit pattern is then reinterpreted as a float.
        slot = std::bit_cast<float>(read_simple<uint32_t>(remaining));
    }
    return floats;
}
} // namespace detail
namespace {
// The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
@@ -54,14 +22,14 @@ namespace {
// You should only use this function if you need to preserve the original vectors and cannot normalize
// them in advance.
float compute_cosine_similarity(std::span<const float> v1, std::span<const float> v2) {
float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
double dot_product = 0.0;
double squared_norm_a = 0.0;
double squared_norm_b = 0.0;
for (size_t i = 0; i < v1.size(); ++i) {
double a = v1[i];
double b = v2[i];
double a = value_cast<float>(v1[i]);
double b = value_cast<float>(v2[i]);
dot_product += a * b;
squared_norm_a += a * a;
@@ -78,12 +46,12 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
}
float compute_euclidean_similarity(std::span<const float> v1, std::span<const float> v2) {
float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
double sum = 0.0;
for (size_t i = 0; i < v1.size(); ++i) {
double a = v1[i];
double b = v2[i];
double a = value_cast<float>(v1[i]);
double b = value_cast<float>(v2[i]);
double diff = a - b;
sum += diff * diff;
@@ -97,12 +65,12 @@ float compute_euclidean_similarity(std::span<const float> v1, std::span<const fl
// Assumes that both vectors are L2-normalized.
// This similarity is intended as an optimized way to perform cosine similarity calculation.
float compute_dot_product_similarity(std::span<const float> v1, std::span<const float> v2) {
float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
double dot_product = 0.0;
for (size_t i = 0; i < v1.size(); ++i) {
double a = v1[i];
double b = v2[i];
double a = value_cast<float>(v1[i]);
double b = value_cast<float>(v2[i]);
dot_product += a * b;
}
@@ -168,15 +136,13 @@ bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters)
return std::nullopt;
}
// Extract dimension from the vector type
const auto& type = static_cast<const vector_type_impl&>(*arg_types()[0]);
size_t dimension = type.get_dimension();
const auto& type = arg_types()[0];
data_value v1 = type->deserialize(*parameters[0]);
data_value v2 = type->deserialize(*parameters[1]);
const auto& v1_elements = value_cast<std::vector<data_value>>(v1);
const auto& v2_elements = value_cast<std::vector<data_value>>(v2);
// Optimized path: extract floats directly from bytes, bypassing data_value overhead
std::vector<float> v1 = detail::extract_float_vector(parameters[0], dimension);
std::vector<float> v2 = detail::extract_float_vector(parameters[1], dimension);
float result = SIMILARITY_FUNCTIONS.at(_name)(v1, v2);
float result = SIMILARITY_FUNCTIONS.at(_name)(v1_elements, v2_elements);
return float_type->decompose(result);
}

View File

@@ -11,7 +11,6 @@
#include "native_scalar_function.hh"
#include "cql3/assignment_testable.hh"
#include "cql3/functions/function_name.hh"
#include <span>
namespace cql3 {
namespace functions {
@@ -20,7 +19,7 @@ static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::nati
static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");
using similarity_function_t = float (*)(std::span<const float>, std::span<const float>);
using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;
std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
@@ -34,14 +33,5 @@ public:
virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
};
namespace detail {
// Extract float vector directly from serialized bytes, bypassing data_value overhead.
// This is an internal API exposed for testing purposes.
// Vector<float, N> wire format: N floats as big-endian uint32_t values, 4 bytes each.
std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension);
} // namespace detail
} // namespace functions
} // namespace cql3

View File

@@ -137,15 +137,6 @@ public:
return value_type();
}
bool update_result_metadata_id(const key_type& key, cql3::cql_metadata_id_type metadata_id) {
cache_value_ptr vp = _cache.find(key.key());
if (!vp) {
return false;
}
(*vp)->update_result_metadata_id(std::move(metadata_id));
return true;
}
template <typename Pred>
requires std::is_invocable_r_v<bool, Pred, ::shared_ptr<cql_statement>>
void remove_if(Pred&& pred) {

View File

@@ -481,12 +481,6 @@ public:
void update_authorized_prepared_cache_config();
/// Update the result metadata_id of a cached prepared statement.
/// Forwards the request to the prepared statements cache.
/// \param cache_key key identifying the cached prepared statement
/// \param metadata_id new result metadata id, moved into the cache call
/// Returns true if the entry was found and updated, false if it was evicted.
bool update_prepared_result_metadata_id(const cql3::prepared_cache_key_type& cache_key, cql3::cql_metadata_id_type metadata_id) {
return _prepared_cache.update_result_metadata_id(cache_key, std::move(metadata_id));
}
void reset_cache();
bool topology_global_queue_empty();

View File

@@ -23,7 +23,6 @@
#include "index/vector_index.hh"
#include "schema/schema.hh"
#include "service/client_state.hh"
#include "service/paxos/paxos_state.hh"
#include "types/types.hh"
#include "cql3/query_processor.hh"
#include "cql3/cql_statement.hh"
@@ -330,19 +329,6 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
"*/",
*table_desc.create_statement);
table_desc.create_statement = std::move(os).to_managed_string();
} else if (service::paxos::paxos_store::try_get_base_table(name)) {
// Paxos state table is internally managed by Scylla and it shouldn't be exposed to the user.
// The table is allowed to be described as a comment to ease administrative work but it's hidden from all listings.
fragmented_ostringstream os{};
fmt::format_to(os.to_iter(),
"/* Do NOT execute this statement! It's only for informational purposes.\n"
" A paxos state table is created automatically when enabling LWT on a base table.\n"
"\n{}\n"
"*/",
*table_desc.create_statement);
table_desc.create_statement = std::move(os).to_managed_string();
}
result.push_back(std::move(table_desc));
@@ -378,7 +364,7 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
future<std::vector<description>> tables(const data_dictionary::database& db, const lw_shared_ptr<keyspace_metadata>& ks, std::optional<bool> with_internals = std::nullopt) {
auto& replica_db = db.real_database();
auto tables = ks->tables() | std::views::filter([&replica_db] (const schema_ptr& s) {
return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name()) && !service::paxos::paxos_store::try_get_base_table(s->cf_name());
return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name());
}) | std::ranges::to<std::vector<schema_ptr>>();
std::ranges::sort(tables, std::ranges::less(), std::mem_fn(&schema::cf_name));

View File

@@ -52,7 +52,6 @@ public:
std::vector<sstring> warnings;
private:
cql_metadata_id_type _metadata_id;
bool _result_metadata_is_empty;
public:
prepared_statement(audit::audit_info_ptr&& audit_info, seastar::shared_ptr<cql_statement> statement_, std::vector<seastar::lw_shared_ptr<column_specification>> bound_names_,
@@ -72,15 +71,6 @@ public:
void calculate_metadata_id();
cql_metadata_id_type get_metadata_id() const;
// Returns the stored flag telling whether this statement's result metadata
// is currently considered empty.
bool result_metadata_is_empty() const {
return _result_metadata_is_empty;
}
// Installs a new result metadata id and marks the result metadata as
// no longer empty.
void update_result_metadata_id(cql_metadata_id_type metadata_id) {
_metadata_id = std::move(metadata_id);
_result_metadata_is_empty = false;
}
};
}

View File

@@ -49,7 +49,6 @@ prepared_statement::prepared_statement(
, partition_key_bind_indices(std::move(partition_key_bind_indices))
, warnings(std::move(warnings))
, _metadata_id(bytes{})
, _result_metadata_is_empty(statement->get_result_metadata()->flags().contains<metadata::flag::NO_METADATA>())
{
statement->set_audit_info(std::move(audit_info));
}

View File

@@ -259,9 +259,11 @@ uint32_t select_statement::get_bound_terms() const {
future<> select_statement::check_access(query_processor& qp, const service::client_state& state) const {
try {
auto cdc = qp.db().get_cdc_base_table(*_schema);
auto& cf_name = _schema->is_view()
? _schema->view_info()->base_name()
const data_dictionary::database db = qp.db();
auto&& s = db.find_schema(keyspace(), column_family());
auto cdc = db.get_cdc_base_table(*s);
auto& cf_name = s->is_view()
? s->view_info()->base_name()
: (cdc ? cdc->cf_name() : column_family());
const schema_ptr& base_schema = cdc ? cdc : _schema;
bool is_vector_indexed = secondary_index::vector_index::has_vector_index(*base_schema);

View File

@@ -1986,13 +1986,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
}
continue;
} catch (shutdown_marker&) {
_reserve_segments.abort(std::current_exception());
break;
} catch (...) {
clogger.warn("Exception in segment reservation: {}", std::current_exception());
}
co_await sleep(100ms);
}
_reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
}
future<std::vector<db::commitlog::descriptor>>

View File

@@ -1201,13 +1201,13 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"* org.apache.cassandra.auth.CassandraRoleManager: Stores role data in the system_auth keyspace;\n"
"* com.scylladb.auth.LDAPRoleManager: Fetches role data from an LDAP server.")
, permissions_validity_in_ms(this, "permissions_validity_in_ms", liveness::LiveUpdate, value_status::Used, 10000,
"How long authorized statements cache entries remain valid. The cached value is considered valid as long as both its value is not older than the permissions_validity_in_ms "
"How long permissions in cache remain valid. Depending on the authorizer, such as CassandraAuthorizer, fetching permissions can be resource intensive. Permissions caching is disabled when this property is set to 0 or when AllowAllAuthorizer is used. The cached value is considered valid as long as both its value is not older than the permissions_validity_in_ms "
"and the cached value has been read at least once during the permissions_validity_in_ms time frame. If any of these two conditions doesn't hold the cached value is going to be evicted from the cache.\n"
"\n"
"Related information: Object permissions")
, permissions_update_interval_in_ms(this, "permissions_update_interval_in_ms", liveness::LiveUpdate, value_status::Used, 2000,
"Refresh interval for authorized statements cache. After this interval, cache entries become eligible for refresh. An async reload is scheduled every permissions_update_interval_in_ms time period and the old value is returned until it completes. If permissions_validity_in_ms has a non-zero value, then this property must also have a non-zero value. It's recommended to set this value to be at least 3 times smaller than the permissions_validity_in_ms. This option additionally controls the permissions refresh interval for LDAP.")
, permissions_cache_max_entries(this, "permissions_cache_max_entries", liveness::LiveUpdate, value_status::Unused, 1000,
"Refresh interval for permissions cache (if enabled). After this interval, cache entries become eligible for refresh. An async reload is scheduled every permissions_update_interval_in_ms time period and the old value is returned until it completes. If permissions_validity_in_ms has a non-zero value, then this property must also have a non-zero value. It's recommended to set this value to be at least 3 times smaller than the permissions_validity_in_ms.")
, permissions_cache_max_entries(this, "permissions_cache_max_entries", liveness::LiveUpdate, value_status::Used, 1000,
"Maximum cached permission entries. Must have a non-zero value if permissions caching is enabled (see a permissions_validity_in_ms description).")
, server_encryption_options(this, "server_encryption_options", value_status::Used, {/*none*/},
"Enable or disable inter-node encryption. You must also generate keys and provide the appropriate key and trust store locations and passwords. The available options are:\n"
@@ -1272,7 +1272,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, ignore_dead_nodes_for_replace(this, "ignore_dead_nodes_for_replace", value_status::Used, "", "List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
, override_decommission(this, "override_decommission", value_status::Deprecated, false, "Set true to force a decommissioned node to join the cluster (cannot be set if consistent-cluster-management is enabled).")
, enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based.")
, allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
, allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild,bootstrap,decommission", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
, enable_compacting_data_for_streaming_and_repair(this, "enable_compacting_data_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, true, "Enable the compacting reader, which compacts the data for streaming and repair (load'n'stream included) before sending it to, or synchronizing it with peers. Can reduce the amount of data to be processed by removing dead data, but adds CPU overhead.")
, enable_tombstone_gc_for_streaming_and_repair(this, "enable_tombstone_gc_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, false,
"If the compacting reader is enabled for streaming and repair (see enable_compacting_data_for_streaming_and_repair), allow it to garbage-collect tombstones."
@@ -1498,7 +1498,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, index_cache_fraction(this, "index_cache_fraction", liveness::LiveUpdate, value_status::Used, 0.2,
"The maximum fraction of cache memory permitted for use by index cache. Clamped to the [0.0; 1.0] range. Must be small enough to not deprive the row cache of memory, but should be big enough to fit a large fraction of the index. The default value 0.2 means that at least 80\% of cache memory is reserved for the row cache, while at most 20\% is usable by the index cache.")
, consistent_cluster_management(this, "consistent_cluster_management", value_status::Deprecated, true, "Use RAFT for cluster management and DDL.")
, force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Deprecated, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing. Note: gossip topology changes are incompatible with tablets.")
, force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Used, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing. Note: gossip topology changes are incompatible with tablets.")
, recovery_leader(this, "recovery_leader", liveness::LiveUpdate, value_status::Used, utils::null_uuid(), "Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
, wasm_cache_memory_fraction(this, "wasm_cache_memory_fraction", value_status::Used, 0.01, "Maximum total size of all WASM instances stored in the cache as fraction of total shard memory.")
, wasm_cache_timeout_in_ms(this, "wasm_cache_timeout_in_ms", value_status::Used, 5000, "Time after which an instance is evicted from the cache.")
@@ -1527,21 +1527,17 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"Allows target tablet size to be configured. Defaults to 5G (in bytes). Maintaining tablets at reasonable sizes is important to be able to " \
"redistribute load. A higher value means tablet migration throughput can be reduced. A lower value may cause number of tablets to increase significantly, " \
"potentially resulting in performance drawbacks.")
, tablet_streaming_read_concurrency_per_shard(this, "tablet_streaming_read_concurrency_per_shard", liveness::LiveUpdate, value_status::Used, 2,
"Maximum number of tablets which may be leaving a shard at the same time. Effecting only on topology coordinator. Set to the same value on all nodes.")
, tablet_streaming_write_concurrency_per_shard(this, "tablet_streaming_write_concurrency_per_shard", liveness::LiveUpdate, value_status::Used, 2,
"Maximum number of tablets which may be pending on a shard at the same time. Effecting only on topology coordinator. Set to the same value on all nodes.")
, replication_strategy_warn_list(this, "replication_strategy_warn_list", liveness::LiveUpdate, value_status::Used, {locator::replication_strategy_type::simple}, "Controls which replication strategies to warn about when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
, replication_strategy_fail_list(this, "replication_strategy_fail_list", liveness::LiveUpdate, value_status::Used, {}, "Controls which replication strategies are disallowed to be used when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
, service_levels_interval(this, "service_levels_interval_ms", liveness::LiveUpdate, value_status::Used, 10000, "Controls how often service levels module polls configuration table")
, audit(this, "audit", value_status::Used, "table",
, audit(this, "audit", value_status::Used, "none",
"Controls the audit feature:\n"
"\n"
"\tnone : No auditing enabled.\n"
"\tsyslog : Audit messages sent to Syslog.\n"
"\ttable : Audit messages written to column family named audit.audit_log.\n")
, audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,AUTH,ADMIN", "Comma separated list of operation categories that should be audited.")
, audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,DDL,AUTH", "Comma separated list of operation categories that should be audited.")
, audit_tables(this, "audit_tables", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of table names (<keyspace>.<table>) that will be audited.")
, audit_keyspaces(this, "audit_keyspaces", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of keyspaces that will be audited. All tables in those keyspaces will be audited")
, audit_unix_socket_path(this, "audit_unix_socket_path", value_status::Used, "/dev/log", "The path to the unix socket used for writing to syslog. Only applicable when audit is set to syslog.")

View File

@@ -542,8 +542,6 @@ public:
named_value<double> tablets_initial_scale_factor;
named_value<unsigned> tablets_per_shard_goal;
named_value<uint64_t> target_tablet_size_in_bytes;
named_value<unsigned> tablet_streaming_read_concurrency_per_shard;
named_value<unsigned> tablet_streaming_write_concurrency_per_shard;
named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_warn_list;
named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_fail_list;

View File

@@ -1714,9 +1714,7 @@ std::unordered_set<dht::token> decode_tokens(const set_type_impl::native_type& t
std::unordered_set<dht::token> tset;
for (auto& t: tokens) {
auto str = value_cast<sstring>(t);
if (str != dht::token::from_sstring(str).to_sstring()) {
on_internal_error(slogger, format("decode_tokens: invalid token string '{}'", str));
}
SCYLLA_ASSERT(str == dht::token::from_sstring(str).to_sstring());
tset.insert(dht::token::from_sstring(str));
}
return tset;
@@ -3193,7 +3191,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
};
}
} else if (must_have_tokens(nstate)) {
on_internal_error(slogger, format(
on_fatal_internal_error(slogger, format(
"load_topology_state: node {} in {} state but missing ring slice", host_id, nstate));
}
}
@@ -3275,7 +3273,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
// Currently, at most one node at a time can be in transitioning state.
if (!map->empty()) {
const auto& [other_id, other_rs] = *map->begin();
on_internal_error(slogger, format(
on_fatal_internal_error(slogger, format(
"load_topology_state: found two nodes in transitioning state: {} in {} state and {} in {} state",
other_id, other_rs.state, host_id, nstate));
}
@@ -3333,7 +3331,8 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
format("SELECT count(range_end) as cnt FROM {}.{} WHERE key = '{}' AND id = ?",
NAME, CDC_GENERATIONS_V3, cdc::CDC_GENERATIONS_V3_KEY),
gen_id.id);
if (!gen_rows || gen_rows->empty()) {
SCYLLA_ASSERT(gen_rows);
if (gen_rows->empty()) {
on_internal_error(slogger, format(
"load_topology_state: last committed CDC generation time UUID ({}) present, but data missing", gen_id.id));
}

View File

@@ -215,8 +215,6 @@ public:
static constexpr auto BUILT_VIEWS = "built_views";
static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
static constexpr auto CDC_LOCAL = "cdc_local";
static constexpr auto CDC_TIMESTAMPS = "cdc_timestamps";
static constexpr auto CDC_STREAMS = "cdc_streams";
// auth
static constexpr auto ROLES = "roles";

View File

@@ -588,7 +588,11 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
utils::get_local_injector().inject("do_build_range_fail",
[] { throw std::runtime_error("do_build_range failed due to error injection"); });
return seastar::async([this, base_id, views_ids = std::move(views_ids), last_token, &as] {
// Run the view building in the streaming scheduling group
// so that it doesn't impact other tasks with higher priority.
seastar::thread_attributes attr;
attr.sched_group = _db.get_streaming_scheduling_group();
return seastar::async(std::move(attr), [this, base_id, views_ids = std::move(views_ids), last_token, &as] {
gc_clock::time_point now = gc_clock::now();
auto base_cf = _db.find_column_family(base_id).shared_from_this();
reader_permit permit = _db.get_reader_concurrency_semaphore().make_tracking_only_permit(nullptr, "build_views_range", db::no_timeout, {});

View File

@@ -67,7 +67,6 @@ public:
return schema_builder(system_keyspace::NAME, "cluster_status", std::make_optional(id))
.with_column("peer", inet_addr_type, column_kind::partition_key)
.with_column("dc", utf8_type)
.with_column("rack", utf8_type)
.with_column("up", boolean_type)
.with_column("draining", boolean_type)
.with_column("excluded", boolean_type)
@@ -112,9 +111,7 @@ public:
// Not all entries in gossiper are present in the topology
auto& node = tm.get_topology().get_node(hostid);
sstring dc = node.dc_rack().dc;
sstring rack = node.dc_rack().rack;
set_cell(cr, "dc", dc);
set_cell(cr, "rack", rack);
set_cell(cr, "draining", node.is_draining());
set_cell(cr, "excluded", node.is_excluded());
}
@@ -1348,8 +1345,8 @@ public:
private:
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS);
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS, std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_timestamps");
return schema_builder(system_keyspace::NAME, "cdc_timestamps", std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::partition_key)
.with_column("timestamp", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
@@ -1431,8 +1428,8 @@ public:
}
private:
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_STREAMS);
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_STREAMS, std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_streams");
return schema_builder(system_keyspace::NAME, "cdc_streams", std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::partition_key)
.with_column("timestamp", timestamp_type, column_kind::clustering_key)

View File

@@ -12,6 +12,5 @@ namespace debug {
seastar::sharded<replica::database>* volatile the_database = nullptr;
seastar::scheduling_group streaming_scheduling_group;
seastar::scheduling_group gossip_scheduling_group;
}

View File

@@ -18,7 +18,6 @@ namespace debug {
extern seastar::sharded<replica::database>* volatile the_database;
extern seastar::scheduling_group streaming_scheduling_group;
extern seastar::scheduling_group gossip_scheduling_group;
}

View File

@@ -12,7 +12,7 @@ Do the following in the top-level Scylla source directory:
2. Run `ninja dist-dev` (with the same mode name as above) to prepare
the distribution artifacts.
3. Run `./dist/docker/redhat/build_docker.sh --mode dev`
3. Run `./dist/docker/debian/build_docker.sh --mode dev`
This creates a docker image as a **file**, in the OCI format, and prints
its name, looking something like:

View File

@@ -70,7 +70,7 @@ bcp() { buildah copy "$container" "$@"; }
# Run a command inside the working container: run <command> [args...]
run() { buildah run "$container" "$@"; }
# Apply `buildah config` options to the working container: bconfig <options...>
bconfig() { buildah config "$@" "$container"; }
container="$(buildah from --pull=always docker.io/redhat/ubi9-minimal:latest)"
container="$(buildah from docker.io/redhat/ubi9-minimal:latest)"
packages=(
"build/dist/$config/redhat/RPMS/$arch/$product-$version-$release.$arch.rpm"

View File

@@ -1,10 +1,6 @@
### a dictionary of redirections
#old path: new path
# Move the OS Support page
/stable/getting-started/os-support.html: https://docs.scylladb.com/stable/versioning/os-support-per-version.html
# Remove an outdated KB
/stable/kb/perftune-modes-sync.html: /stable/kb/index.html

View File

@@ -25,8 +25,6 @@ Querying data from data is done using a ``SELECT`` statement:
: | CAST '(' `selector` AS `cql_type` ')'
: | `function_name` '(' [ `selector` ( ',' `selector` )* ] ')'
: | COUNT '(' '*' ')'
: | literal
: | bind_marker
: )
: ( '.' `field_name` | '[' `term` ']' )*
where_clause: `relation` ( AND `relation` )*
@@ -37,8 +35,6 @@ Querying data from data is done using a ``SELECT`` statement:
operator: '=' | '<' | '>' | '<=' | '>=' | IN | NOT IN | CONTAINS | CONTAINS KEY
ordering_clause: `column_name` [ ASC | DESC ] ( ',' `column_name` [ ASC | DESC ] )*
timeout: `duration`
literal: number | 'string' | boolean | NULL | tuple_literal | list_literal | map_literal
bind_marker: '?' | ':' `identifier`
For instance::
@@ -85,13 +81,6 @@ A :token:`selector` can be one of the following:
- A casting, which allows you to convert a nested selector to a (compatible) type.
- A function call, where the arguments are selectors themselves.
- A call to the :ref:`COUNT function <count-function>`, which counts all non-null results.
- A literal value (constant).
- A bind variable (`?` or `:name`).
Note that due to a quirk of the type system, literals and bind markers cannot be
used as top-level selectors, as the parser cannot infer their type. However, they can be used
when nested inside functions, as the function formal parameter types provide the
necessary context.
Aliases
```````
@@ -292,8 +281,7 @@ For example::
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
or columns provided in a definition of the index.
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
For example::

View File

@@ -140,83 +140,17 @@ Vector Index :label-note:`ScyllaDB Cloud`
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/>`_.
ScyllaDB supports creating vector indexes on tables, allowing queries on the table to use those indexes for efficient
similarity search on vector data. Vector indexes can be a global index for indexing vectors per table or a local
index for indexing vectors per partition.
similarity search on vector data.
The vector index is the only custom type index supported in ScyllaDB. It is created using
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. It is also possible to
add additional columns to the index for filtering the search results. The partition column
specified in the global vector index definition must be the vector column, and any subsequent
columns are treated as filtering columns. The local vector index requires that the partition key
of the base table is also the partition key of the index and the vector column is the first one
from the following columns.
Example of a simple index:
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. Example:
.. code-block:: cql
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
The vector column (``embedding``) is indexed to enable similarity search using
a global vector index. Additional filtering can be performed on the primary key
columns of the base table.
Example of a global vector index with additional filtering:
.. code-block:: cql
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding, category, info)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
The vector column (``embedding``) is indexed to enable similarity search using
a global index. Additional columns are added for filtering the search results.
The filtering is possible on ``category``, ``info`` and all primary key columns
of the base table.
Example of a local vector index:
.. code-block:: cql
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings ((id, created_at), embedding, category, info)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
The vector column (``embedding``) is indexed for similarity search (a local
index) and additional columns are added for filtering the search results. The
filtering is possible on ``category``, ``info`` and all primary key columns of
the base table. The columns ``id`` and ``created_at`` must be the partition key
of the base table.
Vector indexes support additional filtering columns of native data types
(excluding counter and duration). The indexed column itself must be a vector
column, while the extra columns can be used to filter search results.
The supported types are:
* ``ascii``
* ``bigint``
* ``blob``
* ``boolean``
* ``date``
* ``decimal``
* ``double``
* ``float``
* ``inet``
* ``int``
* ``smallint``
* ``text``
* ``varchar``
* ``time``
* ``timestamp``
* ``timeuuid``
* ``tinyint``
* ``uuid``
* ``varint``
The following options are supported for vector indexes. All of them are optional.
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+

View File

@@ -108,4 +108,6 @@ check the statement and throw if it is disallowed, similar to what
Obviously, an audit definition must survive a server restart and stay
consistent among all nodes in a cluster. We'll accomplish both by
storing audits in a system table.
storing audits in a system table. They will be cached in memory the
same way `permissions_cache` caches table contents in `permission_set`
objects resident in memory.

View File

@@ -39,17 +39,6 @@ Both client and server use the same string identifiers for the keys to determine
negotiated extension set, judging by the presence of a particular key in the
SUPPORTED/STARTUP messages.
## Client options
`client_options` column in `system.clients` table stores all data sent by the
client in STARTUP request, as a `map<text, text>`. This column may be useful
for debugging and monitoring purposes.
Drivers can send additional data in STARTUP, e.g. load balancing policy, retry
policy, timeouts, and other configuration.
Such data should be sent in `CLIENT_OPTIONS` key, as JSON. The recommended
structure of this JSON will be decided in the future.
## Intranode sharding
This extension allows the driver to discover how Scylla internally
@@ -85,6 +74,8 @@ The keys and values are:
as an indicator to which shard client wants to connect. The desired shard number
is calculated as: `desired_shard_no = client_port % SCYLLA_NR_SHARDS`.
Its value is a decimal representation of type `uint16_t`, by default `19142`.
- `CLIENT_OPTIONS` is a string containing a JSON object representation that
contains CQL Driver configuration, e.g. load balancing policy, retry policy, timeouts, etc.
Currently, one `SCYLLA_SHARDING_ALGORITHM` is defined,
`biased-token-round-robin`. To apply the algorithm,

View File

@@ -563,18 +563,17 @@ CREATE TABLE system.clients (
address inet,
port int,
client_type text,
client_options frozen<map<text, text>>,
connection_stage text,
driver_name text,
driver_version text,
hostname text,
protocol_version int,
scheduling_group text,
shard_id int,
ssl_cipher_suite text,
ssl_enabled boolean,
ssl_protocol text,
username text,
scheduling_group text,
PRIMARY KEY (address, port, client_type)
) WITH CLUSTERING ORDER BY (port ASC, client_type ASC)
~~~
@@ -582,7 +581,4 @@ CREATE TABLE system.clients (
Currently only CQL clients are tracked. The table used to be present on disk (in data
directory) before and including version 4.5.
`client_options` column stores all data sent by the client in the STARTUP request.
This column is useful for debugging and monitoring purposes.
## TODO: the rest

View File

@@ -156,7 +156,7 @@ How do I check the current version of ScyllaDB that I am running?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* On a regular system or VM (running Ubuntu, CentOS, or RedHat Enterprise): :code:`$ scylla --version`
Check the `Operating System Support Guide <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_ for a list of supported operating systems and versions.
Check the :doc:`Operating System Support Guide </getting-started/os-support>` for a list of supported operating systems and versions.
* On a docker node: :code:`$ docker exec -it Node_Z scylla --version`

View File

@@ -3,9 +3,9 @@
Automatic Repair
================
Traditionally, launching :doc:`repairs </operating-scylla/procedures/maintenance/repair>` in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.
Traditionally, launching `repairs </operating-scylla/procedures/maintenance/repair>`_ in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.
Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the :doc:`tablet table </architecture/tablets>` automatically.
Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the `tablet table </architecture/tablets>`_ automatically.
Repairs are spread over time and among nodes and shards, to avoid load spikes or any adverse effects on user workloads.
To enable automatic repair, add this to the configuration (``scylla.yaml``):
@@ -20,4 +20,4 @@ More featureful configuration methods will be implemented in the future.
To disable, set ``auto_repair_enabled_default: false``.
Automatic repair relies on :doc:`Incremental Repair </features/incremental-repair>` and as such it only works with :doc:`tablet </architecture/tablets>` tables.
Automatic repair relies on `Incremental Repair </features/incremental-repair>`_ and as such it only works with `tablet </architecture/tablets>`_ tables.

View File

@@ -3,7 +3,7 @@
Incremental Repair
==================
ScyllaDB's standard :doc:`repair </operating-scylla/procedures/maintenance/repair>` process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
ScyllaDB's standard `repair </operating-scylla/procedures/maintenance/repair>`_ process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
The core idea of incremental repair is to repair only the data that has been written or changed since the last repair was run. It intelligently skips data that has already been verified, dramatically reducing the time, I/O, and CPU resources required for the repair operation.
@@ -51,7 +51,7 @@ Benefits of Incremental Repair
* **Reduced Resource Usage:** Consumes significantly less CPU, I/O, and network bandwidth compared to a full repair.
* **More Frequent Repairs:** The efficiency of incremental repair allows you to run it more frequently, ensuring a higher level of data consistency across your cluster at all times.
Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with :doc:`Automatic Repair </features/automatic-repair>`.
Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with `Automatic Repair </features/automatic-repair>`_.
Notes
-----

View File

@@ -18,7 +18,7 @@ Getting Started
:class: my-panel
* :doc:`ScyllaDB System Requirements Guide</getting-started/system-requirements/>`
* `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
* :doc:`OS Support by Platform and Version</getting-started/os-support/>`
.. panel-box::
:title: Install and Configure ScyllaDB

View File

@@ -17,7 +17,7 @@ This article will help you install ScyllaDB on Linux using platform-specific pac
Prerequisites
----------------
* Ubuntu, Debian, CentOS, or RHEL (see `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
* Ubuntu, Debian, CentOS, or RHEL (see :doc:`OS Support by Platform and Version </getting-started/os-support>`
for details about supported versions and architecture)
* Root or ``sudo`` access to the system
* Open :ref:`ports used by ScyllaDB <networking-ports>`

View File

@@ -10,7 +10,7 @@ Prerequisites
--------------
Ensure that your platform is supported by the ScyllaDB version you want to install.
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_.
See :doc:`OS Support by Platform and Version </getting-started/os-support/>`.
Install ScyllaDB with Web Installer
---------------------------------------

View File

@@ -12,8 +12,7 @@ the package manager (dnf and apt).
Prerequisites
---------------
Ensure your platform is supported by the ScyllaDB version you want to install.
See `OS Support <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
for information about supported Linux distributions and versions.
See :doc:`OS Support </getting-started/os-support>` for information about supported Linux distributions and versions.
Note that if you're on CentOS 7, only root offline installation is supported.

View File

@@ -0,0 +1,26 @@
OS Support by Linux Distributions and Version
==============================================
The following matrix shows which Linux distributions, containers, and images
are :ref:`supported <os-support-definition>` with which versions of ScyllaDB.
.. datatemplate:json:: /_static/data/os-support.json
:template: platforms.tmpl
``*`` 2024.1.9 and later
All releases are available as a Docker container, EC2 AMI, GCP, and Azure images.
.. _os-support-definition:
*Supported* means that:
- A binary installation package is available.
- The download and install procedures are tested as part of the ScyllaDB release process for each version.
- An automated install is included from :doc:`ScyllaDB Web Installer for Linux tool </getting-started/installation-common/scylla-web-installer>` (for the latest versions).
You can `build ScyllaDB from source <https://github.com/scylladb/scylladb#build-prerequisites>`_
on other x86_64 or aarch64 platforms, without any guarantees.

View File

@@ -8,12 +8,12 @@ ScyllaDB Requirements
:hidden:
system-requirements
OS Support <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>
OS Support <os-support>
Cloud Instance Recommendations <cloud-instance-recommendations>
scylla-in-a-shared-environment
* :doc:`System Requirements</getting-started/system-requirements/>`
* `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
* :doc:`OS Support by Platform and Version</getting-started/os-support/>`
* :doc:`Cloud Instance Recommendations AWS, GCP, and Azure </getting-started/cloud-instance-recommendations>`
* :doc:`Running ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`

View File

@@ -8,7 +8,7 @@ Supported Platforms
===================
ScyllaDB runs on 64-bit Linux. The x86_64 and AArch64 architectures are supported (AArch64 support includes AWS EC2 Graviton).
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_ for information about
See :doc:`OS Support by Platform and Version </getting-started/os-support>` for information about
supported operating systems, distros, and versions.
See :doc:`Cloud Instance Recommendations for AWS, GCP, and Azure </getting-started/cloud-instance-recommendations>` for information

View File

@@ -0,0 +1,43 @@
====================================================
Increase Permission Cache to Avoid Non-paged Queries
====================================================
**Topic: Mitigate non-paged queries coming from connection authentications**
**Audience: ScyllaDB administrators**
Issue
-----
If you create lots of roles and give them lots of permissions, your nodes might experience spikes in non-paged queries.
Root Cause
----------
``permissions_cache_max_entries`` is set to 1000 by default. This setting may not be high enough for bigger deployments with lots of tables, users, and roles with permissions.
Solution
--------
Open the scylla.yaml configuration for editing and adjust the following parameters:
``permissions_cache_max_entries`` - increase this value to suit your needs. See the example below.
``permissions_update_interval_in_ms``
``permissions_validity_in_ms``
.. note:: ``permissions_update_interval_in_ms`` and ``permissions_validity_in_ms`` can also be set so that authentication records are served from the cache instead of from lookups, which generate non-paged queries.
Example
-------
There is no maximum value for ``permissions_cache_max_entries``; it is limited only by your available memory.
The cache consumes memory as it caches all records from the list of users and their associated roles (similar to a cartesian product).
Every user, role, and permission (7 types) is cached on a per-table basis.
For example, if you have 1 user with 1 role and 1 table, the table will have 7 permission types, which results in 1 * 1 * 1 * 7 = 7 entries.
When expanded to 5 users, 5 roles, and 10 tables this will be 5 * 5 * 10 * 7 = 1750 entries, which is above the default cache value of 1000. The entries that go over the max value (750 entries) will be non-paged queries for every new connection from the client (and clients tend to reconnect often).
In cases like this, you may want to consider trading your memory for not stressing the entire cluster with ``auth`` queries.

View File

@@ -38,6 +38,7 @@ Knowledge Base
* :doc:`If a query does not reveal enough results </kb/cqlsh-results>`
* :doc:`How to Change gc_grace_seconds for a Table </kb/gc-grace-seconds>` - How to change the ``gc_grace_seconds`` parameter and prevent data resurrection.
* :doc:`How to flush old tombstones from a table </kb/tombstones-flush>` - How to remove old tombstones from SSTables.
* :doc:`Increase Cache to Avoid Non-paged Queries </kb/increase-permission-cache>` - How to increase the ``permissions_cache_max_entries`` setting.
* :doc:`How to Safely Increase the Replication Factor </kb/rf-increase>`
* :doc:`Facts about TTL, Compaction, and gc_grace_seconds <ttl-facts>`
* :doc:`Efficient Tombstone Garbage Collection in ICS <garbage-collection-ics>`

View File

@@ -25,8 +25,7 @@ Before you run ``nodetool decommission``:
starting the removal procedure.
* Make sure that the number of nodes remaining in the DC after you decommission a node
will be the same or higher than the Replication Factor configured for the keyspace
in this DC. Please mind that e.g. audit feature, which is enabled by default, may require
adjusting ``audit`` keyspace. If the number of remaining nodes is lower than the RF, the decommission
in this DC. If the number of remaining nodes is lower than the RF, the decommission
request may fail.
In such a case, ALTER the keyspace to reduce the RF before running ``nodetool decommission``.

View File

@@ -73,17 +73,6 @@ Procedure
cqlsh> ALTER KEYSPACE nba WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3};
.. note::
If table audit is enabled, the ``audit`` keyspace is automatically created with ``NetworkTopologyStrategy``.
You must also alter the ``audit`` keyspace to remove replicas from the decommissioned data-center. For example:
.. code-block:: shell
cqlsh> ALTER KEYSPACE audit WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3};
Failure to do so will result in decommission errors such as "zero replica after the removal".
#. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.

View File

@@ -52,14 +52,18 @@ Row-level repair improves ScyllaDB in two ways:
* keeping the data in a temporary buffer.
* using the cached data to calculate the checksum and send it to the replicas.
See also the `ScyllaDB Manager documentation <https://manager.docs.scylladb.com/>`_.
See also
* `ScyllaDB Manager documentation <https://manager.docs.scylladb.com/>`_
* `Blog: ScyllaDB Open Source 3.1: Efficiently Maintaining Consistency with Row-Level Repair <https://www.scylladb.com/2019/08/13/scylla-open-source-3-1-efficiently-maintaining-consistency-with-row-level-repair/>`_
Incremental Repair
------------------
Built on top of :ref:`Row-level Repair <row-level-repair>` and :doc:`Tablets </architecture/tablets>`, Incremental Repair enables frequent and quick repairs. For more details, see :doc:`Incremental Repair </features/incremental-repair>`.
Built on top of `Row-level Repair <row-level-repair_>`_ and `Tablets </architecture/tablets>`_, Incremental Repair enables frequent and quick repairs. For more details, see `Incremental Repair </features/incremental-repair>`_.
Automatic Repair
----------------
Built on top of :doc:`Incremental Repair </features/incremental-repair>`, :doc:`Automatic Repair </features/automatic-repair>` offers repair scheduling and execution directly in ScyllaDB, without external processes.
Built on top of `Incremental Repair </features/incremental-repair>`_, `Automatic Repair </features/automatic-repair>`_ offers repair scheduling and execution directly in ScyllaDB, without external processes.

View File

@@ -14,11 +14,11 @@ Enable ScyllaDB :doc:`Authentication </operating-scylla/security/authentication>
Enabling Audit
---------------
By default, table auditing is **enabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
By default, auditing is **disabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
You can set the following options:
* ``none`` - Audit is disabled.
* ``table`` - Audit is enabled, and messages are stored in a Scylla table (default).
* ``none`` - Audit is disabled (default).
* ``table`` - Audit is enabled, and messages are stored in a Scylla table.
* ``syslog`` - Audit is enabled, and messages are sent to Syslog.
* ``syslog,table`` - Audit is enabled, and messages are stored in a Scylla table and sent to Syslog.
@@ -32,7 +32,7 @@ The audit can be tuned using the following flags or ``scylla.yaml`` entries:
================== ================================== ========================================================================================================================
Flag Default Value Description
================== ================================== ========================================================================================================================
audit_categories "DCL,AUTH,ADMIN" Comma-separated list of statement categories that should be audited
audit_categories "DCL,DDL,AUTH,ADMIN" Comma-separated list of statement categories that should be audited
------------------ ---------------------------------- ------------------------------------------------------------------------------------------------------------------------
audit_tables “” Comma-separated list of table names that should be audited, in the format of <keyspacename>.<tablename>
------------------ ---------------------------------- ------------------------------------------------------------------------------------------------------------------------
@@ -86,7 +86,9 @@ Storing Audit Messages in Syslog
.. code-block:: shell
# audit setting
# 'audit' config option controls if and where to output audited events:
# by default, Scylla does not audit anything.
# It is possible to enable auditing to the following places:
# - audit.audit_log column family by setting the flag to "table"
audit: "syslog"
#
# List of statement categories that should be audited.
@@ -157,7 +159,9 @@ For example:
.. code-block:: shell
# audit setting
# 'audit' config option controls if and where to output audited events:
# by default, Scylla does not audit anything.
# It is possible to enable auditing to the following places:
# - audit.audit_log column family by setting the flag to "table"
audit: "table"
#
# List of statement categories that should be audited.
@@ -211,8 +215,8 @@ Handling Audit Failures
In some cases, auditing may not be possible, for example, when:
* A table is used as the audits backend, and the partitions where the audit rows are saved are unavailable because the nodes holding those partitions are down or unreachable due to network issues.
* Syslog is used as the audits backend, and the Syslog sink (a regular Unix socket) is unresponsive or unavailable.
* A table is used as the audits backend, and the audit partition where the audit row is saved is not available because the node that holds this partition is down.
* Syslog is used as the audits backend, and the Syslog sink (a regular unix socket) is unresponsive/unavailable.
If the audit fails and audit messages are not stored in the configured audits backend, you can still review the audit log in the regular ScyllaDB logs.

View File

@@ -14,7 +14,7 @@ if necessary.
This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL),
CentOS, Debian, and Ubuntu.
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
See :doc:`OS Support by Platform and Version </getting-started/os-support>`
for information about supported versions.
It also applies to the ScyllaDB official image on EC2, GCP, or Azure.

View File

@@ -17,7 +17,7 @@ This document describes a step-by-step procedure for upgrading from |SCYLLA_NAME
to |SCYLLA_NAME| |NEW_VERSION| and rollback to version |SRC_VERSION| if necessary.
This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL), CentOS, Debian,
and Ubuntu. See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
and Ubuntu. See :doc:`OS Support by Platform and Version </getting-started/os-support>`
for information about supported versions.
It also applies when using the ScyllaDB official image on EC2, GCP, or Azure.

View File

@@ -284,7 +284,6 @@ future<rjson::value> encryption::gcp_host::impl::gcp_auth_post_with_retry(std::s
}
[[fallthrough]];
case httpclient::reply_status::request_timeout:
case httpclient::reply_status::too_many_requests:
if (retry < max_retries) {
// service unavailable etc -> backoff + retry
do_backoff = true;

View File

@@ -2424,8 +2424,8 @@ bool gossiper::is_enabled() const {
// Records `expire_time` as the moment at which `endpoint` is to be removed
// from gossip: logs the expiry together with the current time and their
// difference in seconds, then stores the expiry in _expire_time_endpoint_map.
// NOTE(review): this text is rendered from a diff, so both the pre-change and
// the post-change forms of the logger.info() call appear below.
void gossiper::add_expire_time_for_endpoint(locator::host_id endpoint, clk::time_point expire_time) {
auto now_ = now();
// Distance from now to the expiry, converted to a count of seconds.
auto diff = std::chrono::duration_cast<std::chrono::seconds>(expire_time - now_).count();
// Pre-change form: the expiry is formatted through fmt::gmtime and the format
// spec ends with %z.
logger.info("Node {} will be removed from gossip at [{:%Y-%m-%d %T %z}]: (expire = {}, now = {}, diff = {} seconds)",
endpoint, fmt::gmtime(clk::to_time_t(expire_time)), expire_time.time_since_epoch().count(),
// Post-change form: the expiry is formatted through fmt::localtime and %z is
// dropped, so the logged timestamp carries no zone information.
// NOTE(review): presumably this makes logs harder to correlate across nodes
// running in different time zones — confirm this is intended.
logger.info("Node {} will be removed from gossip at [{:%Y-%m-%d %T}]: (expire = {}, now = {}, diff = {} seconds)",
endpoint, fmt::localtime(clk::to_time_t(expire_time)), expire_time.time_since_epoch().count(),
now_.time_since_epoch().count(), diff);
_expire_time_endpoint_map[endpoint] = expire_time;
}

View File

@@ -153,8 +153,6 @@ public:
}
const std::set<inet_address>& get_seeds() const noexcept;
seastar::scheduling_group get_scheduling_group() const noexcept { return _gcfg.gossip_scheduling_group; }
public:
static clk::time_point inline now() noexcept { return clk::now(); }
public:

View File

@@ -23,11 +23,11 @@ static_assert(std::is_nothrow_move_constructible_v<gms::inet_address>);
// Resolves `name` through seastar::net::dns::get_host_by_name (restricted to
// `family`) and converts one of the returned addresses to gms::inet_address.
// The first address is returned when `preferred` is unset; otherwise the first
// address whose family equals `preferred` is returned, falling back to the
// first address in the list when none matches.
// NOTE(review): this text is rendered from a diff, so the `addr_entries` and
// the `addr_list` forms of the loop and of the fallback both appear below.
future<gms::inet_address> gms::inet_address::lookup(sstring name, opt_family family, opt_family preferred) {
return seastar::net::dns::get_host_by_name(std::move(name), family).then([preferred](seastar::net::hostent&& h) {
for (auto& ent : h.addr_entries) {
if (!preferred || ent.addr.in_family() == preferred) {
return gms::inet_address(ent.addr);
for (auto& addr : h.addr_list) {
if (!preferred || addr.in_family() == preferred) {
return gms::inet_address(addr);
}
}
// Fallback when no address matched the preferred family.
// NOTE(review): front() is called without an emptiness check; presumably a
// successful lookup never yields an empty list — confirm.
return gms::inet_address(h.addr_entries.front().addr);
return gms::inet_address(h.addr_list.front());
});
}

View File

@@ -17,11 +17,11 @@
#include "index/secondary_index.hh"
#include "index/secondary_index_manager.hh"
#include "types/concrete_types.hh"
#include "types/types.hh"
#include "utils/managed_string.hh"
#include <seastar/core/sstring.hh>
#include <boost/algorithm/string.hpp>
namespace secondary_index {
static void validate_positive_option(int max, const sstring& value_name, const sstring& value) {
@@ -147,88 +147,17 @@ std::optional<cql3::description> vector_index::describe(const index_metadata& im
}
// Validates the index targets of a vector index against `schema`, throwing
// exceptions::invalid_request_exception when a target is not acceptable.
// NOTE(review): this text is rendered from a diff, so two implementations are
// interleaved below: a visitor-based one that accepts a vector column followed
// by filtering columns, and a single-column one that starts at the
// `targets.size() != 1` check.
void vector_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {
// Visitor applied to each target's `value` through std::visit. `is_vector` is
// a reference to a flag owned by the caller, so it is shared by all visits.
struct validate_visitor {
const class schema& schema;
bool& is_vector;
/// Vector indexes support filtering on native types that can be used as primary key columns.
/// There is no counter (it cannot be used with vector columns)
/// and no duration (it cannot be used as a primary key or in secondary indexes).
static bool is_supported_filtering_column(abstract_type const & kind_type) {
switch (kind_type.get_kind()) {
case abstract_type::kind::ascii:
case abstract_type::kind::boolean:
case abstract_type::kind::byte:
case abstract_type::kind::bytes:
case abstract_type::kind::date:
case abstract_type::kind::decimal:
case abstract_type::kind::double_kind:
case abstract_type::kind::float_kind:
case abstract_type::kind::inet:
case abstract_type::kind::int32:
case abstract_type::kind::long_kind:
case abstract_type::kind::short_kind:
case abstract_type::kind::simple_date:
case abstract_type::kind::time:
case abstract_type::kind::timestamp:
case abstract_type::kind::timeuuid:
case abstract_type::kind::utf8:
case abstract_type::kind::uuid:
case abstract_type::kind::varint:
return true;
default:
break;
}
return false;
}
// Checks one column: it must exist in the schema; when `is_vector` is true it
// must be a vector whose element kind is float_kind, otherwise its type must
// be one of the supported filtering types listed above.
void validate(cql3::column_identifier const& column, bool is_vector) const {
auto const& c_name = column.to_string();
auto const* c_def = schema.get_column_definition(column.name());
if (c_def == nullptr) {
throw exceptions::invalid_request_exception(format("Column {} not found in schema", c_name));
}
auto type = c_def->type;
if (is_vector) {
auto const* vector_type = dynamic_cast<const vector_type_impl*>(type.get());
if (vector_type == nullptr) {
throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
}
auto elements_type = vector_type->get_elements_type();
if (elements_type->get_kind() != abstract_type::kind::float_kind) {
throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
}
return;
}
if (!is_supported_filtering_column(*type)) {
throw exceptions::invalid_request_exception(format("Unsupported vector index filtering column {} type", c_name));
}
}
// Multi-column target: every column is validated as a non-vector column.
void operator()(const std::vector<::shared_ptr<cql3::column_identifier>>& columns) const {
for (const auto& column : columns) {
// CQL restricts the secondary local index to have multiple columns with partition key only.
// Vectors shouldn't be partition key columns and they aren't supported as a filtering column,
// so we can assume here that these are non-vectors filtering columns.
validate(*column, false);
}
}
// Single-column target: validated with the current value of the shared flag,
// which is then cleared for all later targets.
void operator()(const ::shared_ptr<cql3::column_identifier>& column) {
validate(*column, is_vector);
// The first column is the vector column, the rest mustn't be vectors.
is_vector = false;
}
};
bool is_vector = true;
for (const auto& target : targets) {
std::visit(validate_visitor{.schema = schema, .is_vector = is_vector}, target->value);
// Single-column implementation: exactly one target is accepted, its column
// must exist in the schema, and its type must be a vector whose element kind
// is float_kind.
if (targets.size() != 1) {
throw exceptions::invalid_request_exception("Vector index can only be created on a single column");
}
auto target = targets[0];
auto c_def = schema.get_column_definition(to_bytes(target->column_name()));
if (!c_def) {
throw exceptions::invalid_request_exception(format("Column {} not found in schema", target->column_name()));
}
auto type = c_def->type;
// NOTE(review): the format string below has no `{}` placeholder, so the
// column name passed as its argument never appears in the error message.
if (!type->is_vector() || static_cast<const vector_type_impl*>(type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
throw exceptions::invalid_request_exception(format("Vector indexes are only supported on columns of vectors of floats", target->column_name()));
}
}
}

17
init.cc
View File

@@ -11,6 +11,7 @@
#include "seastarx.hh"
#include "db/config.hh"
#include <boost/algorithm/string/trim.hpp>
#include <seastar/core/coroutine.hh>
#include "sstables/sstable_compressor_factory.hh"
#include "gms/feature_service.hh"
@@ -29,7 +30,11 @@ std::set<gms::inet_address> get_seeds_from_db_config(const db::config& cfg,
std::set<gms::inet_address> seeds;
if (seed_provider.parameters.contains("seeds")) {
for (const auto& seed : utils::split_comma_separated_list(seed_provider.parameters.at("seeds"))) {
size_t begin = 0;
size_t next = 0;
sstring seeds_str = seed_provider.parameters.find("seeds")->second;
while (begin < seeds_str.length() && begin != (next=seeds_str.find(",",begin))) {
auto seed = boost::trim_copy(seeds_str.substr(begin,next-begin));
try {
seeds.emplace(gms::inet_address::lookup(seed, family, preferred).get());
} catch (...) {
@@ -41,10 +46,11 @@ std::set<gms::inet_address> get_seeds_from_db_config(const db::config& cfg,
seed,
std::current_exception());
}
begin = next+1;
}
}
if (seeds.empty()) {
seeds.emplace("127.0.0.1");
seeds.emplace(gms::inet_address("127.0.0.1"));
}
startlog.info("seeds={{{}}}, listen_address={}, broadcast_address={}",
fmt::join(seeds, ", "), listen, broadcast_address);
@@ -96,6 +102,13 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
if (!cfg.check_experimental(db::experimental_features_t::feature::STRONGLY_CONSISTENT_TABLES)) {
disabled.insert("STRONGLY_CONSISTENT_TABLES"s);
}
if (cfg.force_gossip_topology_changes()) {
if (cfg.enable_tablets_by_default()) {
throw std::runtime_error("Tablets cannot be enabled with gossip topology changes. Use either --tablets-mode-for-new-keyspaces=enabled|enforced or --force-gossip-topology-changes, but not both.");
}
startlog.warn("The tablets feature is disabled due to forced gossip topology changes");
disabled.insert("TABLETS"s);
}
if (!cfg.table_digest_insensitive_to_expiry()) {
disabled.insert("TABLE_DIGEST_INSENSITIVE_TO_EXPIRY"s);
}

View File

@@ -150,6 +150,7 @@ fedora_packages=(
llvm
openldap-servers
openldap-devel
toxiproxy
cyrus-sasl
fipscheck
cpp-jwt-devel
@@ -157,10 +158,7 @@ fedora_packages=(
podman
buildah
# for cassandra-stress
java-openjdk-headless
snappy
https://github.com/scylladb/cassandra-stress/releases/download/v3.18.1/cassandra-stress-java21-3.18.1-1.noarch.rpm
elfutils
jq
@@ -297,7 +295,6 @@ print_usage() {
echo " --print-pip-runtime-packages Print required pip packages for Scylla"
echo " --print-pip-symlinks Print list of pip provided commands which need to install to /usr/bin"
echo " --print-node-exporter-filename Print node_exporter filename"
echo " --future Install dependencies for future toolchain (Fedora rawhide based)"
exit 1
}
@@ -305,7 +302,6 @@ PRINT_PYTHON3=false
PRINT_PIP=false
PRINT_PIP_SYMLINK=false
PRINT_NODE_EXPORTER=false
FUTURE=false
while [ $# -gt 0 ]; do
case "$1" in
"--print-python3-runtime-packages")
@@ -324,10 +320,6 @@ while [ $# -gt 0 ]; do
PRINT_NODE_EXPORTER=true
shift 1
;;
"--future")
FUTURE=true
shift 1
;;
*)
print_usage
;;
@@ -358,10 +350,6 @@ if $PRINT_NODE_EXPORTER; then
exit 0
fi
if ! $FUTURE; then
fedora_packages+=(toxiproxy)
fi
umask 0022
./seastar/install-dependencies.sh
@@ -389,10 +377,6 @@ elif [ "$ID" = "fedora" ]; then
exit 1
fi
dnf install -y "${fedora_packages[@]}" "${fedora_python3_packages[@]}"
# Fedora 45 tightened key checks, and cassandra-stress is not signed yet.
dnf install --no-gpgchecks -y https://github.com/scylladb/cassandra-stress/releases/download/v3.18.1/cassandra-stress-java21-3.18.1-1.noarch.rpm
PIP_DEFAULT_ARGS="--only-binary=:all: -v"
pip_constrained_packages=""
for package in "${!pip_packages[@]}"
@@ -463,11 +447,3 @@ if [ ! -z "${CURL_ARGS}" ]; then
else
echo "Minio server and client are up-to-date, skipping download"
fi
if $FUTURE ; then
toxyproxy_version="v2.12.0"
for bin in toxiproxy-cli toxiproxy-server; do
curl -fSL -o "/usr/local/bin/${bin}" "https://github.com/Shopify/toxiproxy/releases/download/${toxyproxy_version}/${bin}-linux-$(go_arch)"
chmod +x "/usr/local/bin/${bin}"
done
fi

View File

@@ -8,7 +8,6 @@
#include <boost/date_time/gregorian/greg_date.hpp>
#include <boost/date_time/posix_time/posix_time.hpp>
#include <random>
#include "lua.hh"
#include "lang/lua_scylla_types.hh"
#include "exceptions/exceptions.hh"
@@ -29,14 +28,6 @@
# define LUA_504_PLUS(x...)
#endif
// Lua 5.5 added a seed parameter to lua_newstate
#if LUA_VERSION_NUM >= 505
# define LUA_505_PLUS(x...) x
#else
# define LUA_505_PLUS(x...)
#endif
using namespace seastar;
using namespace lua;
@@ -135,11 +126,7 @@ static void debug_hook(lua_State* l, lua_Debug* ar) {
static lua_slice_state new_lua(const lua::runtime_config& cfg) {
auto a_state = std::make_unique<alloc_state>(cfg.max_bytes, cfg.max_contiguous);
#if LUA_VERSION_NUM >= 505
static thread_local std::default_random_engine rng{std::random_device{}()};
auto seed = rng();
#endif
std::unique_ptr<lua_State, lua_closer> l{lua_newstate(lua_alloc, a_state.get() LUA_505_PLUS(, seed))};
std::unique_ptr<lua_State, lua_closer> l{lua_newstate(lua_alloc, a_state.get())};
if (!l) {
throw std::runtime_error("could not create lua state");
}

View File

@@ -1170,17 +1170,6 @@ token_metadata::set_version_tracker(version_tracker_t tracker) {
_impl->set_version_tracker(std::move(tracker));
}
// Builds a tracker bound to a specific token_metadata instance: takes over the
// phased-barrier operation, records the version reported by tm, and remembers
// the address of tm (used later by version_use_count()).
version_tracker::version_tracker(utils::phased_barrier::operation op, const token_metadata& tm)
    : _op(std::move(op)), _version(tm.get_version()), _tm(&tm) {}
// Returns the use count of the token_metadata this tracker was created for.
// A default-constructed tracker has no token_metadata attached (_tm is null),
// so report zero users instead of dereferencing a null pointer.
long version_tracker::version_use_count() const {
    return _tm ? _tm->use_count() : 0;
}
version_tracker::~version_tracker() {
if (_expired_at) {
auto now = std::chrono::steady_clock::now();
@@ -1192,8 +1181,8 @@ version_tracker::~version_tracker() {
}
}
version_tracker shared_token_metadata::new_tracker(const token_metadata& tm) {
auto tracker = version_tracker(_versions_barrier.start(), tm);
version_tracker shared_token_metadata::new_tracker(token_metadata::version_t version) {
auto tracker = version_tracker(_versions_barrier.start(), version);
_trackers.push_front(tracker);
return tracker;
}
@@ -1209,18 +1198,6 @@ void shared_token_metadata::clear_and_dispose(std::unique_ptr<token_metadata_imp
}
}
std::unordered_map<service::topology::version_t, int> shared_token_metadata::describe_stale_versions() {
std::unordered_map<service::topology::version_t, int> result;
const auto active_version = _shared.get()->get_version();
for (const auto& t: _trackers) {
const auto v = t.version();
if (v < active_version) {
result.emplace(v, t.version_use_count());
}
}
return result;
}
void shared_token_metadata::set(mutable_token_metadata_ptr tmptr) noexcept {
if (_shared->get_ring_version() >= tmptr->get_ring_version()) {
on_internal_error(tlogger, format("shared_token_metadata: must not set non-increasing ring_version: {} -> {}", _shared->get_ring_version(), tmptr->get_ring_version()));
@@ -1234,7 +1211,7 @@ void shared_token_metadata::set(mutable_token_metadata_ptr tmptr) noexcept {
tmptr->set_shared_token_metadata(*this);
_shared = std::move(tmptr);
_shared->set_version_tracker(new_tracker(*_shared));
_shared->set_version_tracker(new_tracker(_shared->get_version()));
for (auto&& v : _trackers) {
if (v.version() != _shared->get_version()) {

View File

@@ -112,7 +112,6 @@ public:
private:
utils::phased_barrier::operation _op;
service::topology::version_t _version;
const token_metadata* _tm = nullptr;
link_type _link;
// When engaged it means the version is no longer latest and should be released soon as to
@@ -121,7 +120,8 @@ private:
std::chrono::steady_clock::duration _log_threshold;
public:
version_tracker() = default;
version_tracker(utils::phased_barrier::operation op, const token_metadata& tm);
version_tracker(utils::phased_barrier::operation op, service::topology::version_t version)
: _op(std::move(op)), _version(version) {}
version_tracker(version_tracker&&) noexcept = default;
version_tracker& operator=(version_tracker&& o) noexcept {
if (this != &o) {
@@ -137,8 +137,6 @@ public:
return _version;
}
long version_use_count() const;
void mark_expired(std::chrono::steady_clock::duration log_threshold) {
if (!_expired_at) {
_expired_at = std::chrono::steady_clock::now();
@@ -174,7 +172,7 @@ private:
friend class token_metadata_impl;
};
class token_metadata final: public enable_lw_shared_from_this<token_metadata> {
class token_metadata final {
shared_token_metadata* _shared_token_metadata = nullptr;
std::unique_ptr<token_metadata_impl> _impl;
private:
@@ -412,7 +410,7 @@ class shared_token_metadata : public peering_sharded_service<shared_token_metada
boost::intrusive::constant_time_size<false>>;
version_tracker_list_type _trackers;
private:
version_tracker new_tracker(const token_metadata& tm);
version_tracker new_tracker(token_metadata::version_t);
public:
// used to construct the shared object as a sharded<> instance
// lock_func returns semaphore_units<>
@@ -421,7 +419,7 @@ public:
, _lock_func(std::move(lock_func))
, _versions_barrier("shared_token_metadata::versions_barrier")
{
_shared->set_version_tracker(new_tracker(*_shared));
_shared->set_version_tracker(new_tracker(_shared->get_version()));
}
shared_token_metadata(const shared_token_metadata& x) = delete;
@@ -448,9 +446,6 @@ public:
_stall_detector_threshold = threshold;
}
// Returns a map version -> use_count
std::unordered_map<service::topology::version_t, int> describe_stale_versions();
future<> stale_versions_in_use() const {
return _stale_versions_in_use.get_future();
}

14
main.cc
View File

@@ -1150,7 +1150,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
dbcfg.memtable_scheduling_group = create_scheduling_group("memtable", "mt", 1000).get();
dbcfg.memtable_to_cache_scheduling_group = create_scheduling_group("memtable_to_cache", "mt2c", 200).get();
dbcfg.gossip_scheduling_group = create_scheduling_group("gossip", "gms", 1000).get();
debug::gossip_scheduling_group = dbcfg.gossip_scheduling_group;
dbcfg.commitlog_scheduling_group = create_scheduling_group("commitlog", "clog", 1000).get();
dbcfg.schema_commitlog_scheduling_group = create_scheduling_group("schema_commitlog", "sclg", 1000).get();
dbcfg.available_memory = memory::stats().total_memory();
@@ -2042,7 +2041,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
cdc_config.ring_delay = std::chrono::milliseconds(cfg->ring_delay_ms());
cdc_config.dont_rewrite_streams = cfg->cdc_dont_rewrite_streams();
cdc_generation_service.start(std::move(cdc_config), std::ref(gossiper), std::ref(sys_dist_ks), std::ref(sys_ks),
std::ref(stop_signal.as_sharded_abort_source()), std::ref(token_metadata), std::ref(feature_service), std::ref(db)).get();
std::ref(stop_signal.as_sharded_abort_source()), std::ref(token_metadata), std::ref(feature_service), std::ref(db),
[&ss] () -> bool { return ss.local().raft_topology_change_enabled(); }).get();
auto stop_cdc_generation_service = defer_verbose_shutdown("CDC Generation Management service", [] {
cdc_generation_service.stop().get();
});
@@ -2071,7 +2071,13 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
gossiper.local().unregister_(mm.local().shared_from_this()).get();
});
utils::loading_cache_config perm_cache_config;
perm_cache_config.max_size = cfg->permissions_cache_max_entries();
perm_cache_config.expiry = std::chrono::milliseconds(cfg->permissions_validity_in_ms());
perm_cache_config.refresh = std::chrono::milliseconds(cfg->permissions_update_interval_in_ms());
auto start_auth_service = [&mm] (sharded<auth::service>& auth_service, std::any& stop_auth_service, const char* what) {
supervisor::notify(fmt::format("starting {}", what));
auth_service.invoke_on_all(&auth::service::start, std::ref(mm), std::ref(sys_ks)).get();
stop_auth_service = defer_verbose_shutdown(what, [&auth_service] {
@@ -2099,7 +2105,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
maintenance_auth_config.authenticator_java_name = sstring{auth::allow_all_authenticator_name};
maintenance_auth_config.role_manager_java_name = sstring{auth::maintenance_socket_role_manager_name};
maintenance_auth_service.start(std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), maintenance_auth_config, maintenance_socket_enabled::yes, std::ref(auth_cache)).get();
maintenance_auth_service.start(perm_cache_config, std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), maintenance_auth_config, maintenance_socket_enabled::yes, std::ref(auth_cache)).get();
cql_maintenance_server_ctl.emplace(maintenance_auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, maintenance_cql_sg_stats_key, maintenance_socket_enabled::yes, dbcfg.statement_scheduling_group);
@@ -2366,7 +2372,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
auth_config.authenticator_java_name = qualified_authenticator_name;
auth_config.role_manager_java_name = qualified_role_manager_name;
auth_service.start(std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), auth_config, maintenance_socket_enabled::no, std::ref(auth_cache)).get();
auth_service.start(std::move(perm_cache_config), std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), auth_config, maintenance_socket_enabled::no, std::ref(auth_cache)).get();
std::any stop_auth_service;
// Has to be called after node joined the cluster (join_cluster())

View File

@@ -272,27 +272,25 @@ private:
bool can_purge_tombstone(const tombstone& t, is_shadowable is_shadowable, const gc_clock::time_point deletion_time) {
max_purgeable::can_purge_result purge_res { };
std::optional<bool> expired;
if (_tombstone_gc_state.cheap_to_get_gc_before(_schema)) {
// if retrieval of grace period is cheap, can_gc() will only be
// called for tombstones that are older than grace period, in
// order to avoid unnecessary bloom filter checks when calculating
// max purgeable timestamp.
expired = purge_res.can_purge = satisfy_grace_period(deletion_time);
purge_res.can_purge = satisfy_grace_period(deletion_time);
if (purge_res.can_purge) {
purge_res = can_gc(t, is_shadowable);
}
} else {
purge_res = can_gc(t, is_shadowable);
if (purge_res.can_purge) {
expired = purge_res.can_purge = satisfy_grace_period(deletion_time);
purge_res.can_purge = satisfy_grace_period(deletion_time);
}
}
if constexpr (sstable_compaction()) {
// Tombstone GC stats only account for expired tombstones (those eligible for GC).
if (!_tombstone_stats || !t || !expired.value_or(satisfy_grace_period(deletion_time))) {
if (!_tombstone_stats || !t) {
return purge_res.can_purge;
}

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9034610470ff645fab03da5ad6c690e5b41f3307ea4b529c7e63b0786a1289ed
size 6539600
oid sha256:a4710f1f0b0bb329721c21d133618e811e820f2e70553b0aca28fb278bff89c9
size 6492280

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0c4bbf51dbe01d684ea5b9a9157781988ed499604d2fde90143bad0b9a5594f0
size 6543944
oid sha256:2433f7a1fc5cda0dd990ab59587eb6046dca0fe1ae48d599953d1936fe014ed9
size 6492176

View File

@@ -459,7 +459,7 @@ future<> server_impl::wait_for_state_change(seastar::abort_source* as) {
}
try {
co_await (as ? _state_change_promise->get_shared_future(*as) : _state_change_promise->get_shared_future());
return as ? _state_change_promise->get_shared_future(*as) : _state_change_promise->get_shared_future();
} catch (abort_requested_exception&) {
throw request_aborted(fmt::format(
"Aborted while waiting for state change on server: {}, latest applied entry: {}, current state: {}", _id, _applied_idx, _fsm->current_state()));

View File

@@ -252,10 +252,6 @@ public:
//
// The caller may pass a pointer to an abort_source to make the function abortable.
// If it passes nullptr, the function is unabortable.
//
// Exceptions:
// raft::request_aborted
// Thrown if abort is requested before the operation finishes.
virtual future<> wait_for_state_change(seastar::abort_source* as) = 0;
// The returned future is resolved when a leader is elected for the current term.
@@ -266,10 +262,6 @@ public:
//
// The caller may pass a pointer to an abort_source to make the function abortable.
// If it passes nullptr, the function is unabortable.
//
// Exceptions:
// raft::request_aborted
// Thrown if abort is requested before the operation finishes.
virtual future<> wait_for_leader(seastar::abort_source* as) = 0;
// Manually trigger snapshot creation and log truncation.

View File

@@ -103,8 +103,8 @@ thread_local dirty_memory_manager default_dirty_memory_manager;
inline
flush_controller
make_flush_controller(const db::config& cfg, const database_config& dbcfg, std::function<double()> fn) {
return flush_controller(dbcfg.memtable_scheduling_group, cfg.memtable_flush_static_shares(), 50ms, cfg.unspooled_dirty_soft_limit(), std::move(fn));
make_flush_controller(const db::config& cfg, backlog_controller::scheduling_group& sg, std::function<double()> fn) {
return flush_controller(sg, cfg.memtable_flush_static_shares(), 50ms, cfg.unspooled_dirty_soft_limit(), std::move(fn));
}
keyspace::keyspace(config cfg, locator::effective_replication_map_factory& erm_factory)
@@ -394,7 +394,8 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
, _system_dirty_memory_manager(*this, 10 << 20, cfg.unspooled_dirty_soft_limit(), default_scheduling_group())
, _dirty_memory_manager(*this, dbcfg.available_memory * 0.50, cfg.unspooled_dirty_soft_limit(), dbcfg.statement_scheduling_group)
, _dbcfg(dbcfg)
, _memtable_controller(make_flush_controller(_cfg, _dbcfg, [this, limit = float(_dirty_memory_manager.throttle_threshold())] {
, _flush_sg(dbcfg.memtable_scheduling_group)
, _memtable_controller(make_flush_controller(_cfg, _flush_sg, [this, limit = float(_dirty_memory_manager.throttle_threshold())] {
auto backlog = (_dirty_memory_manager.unspooled_dirty_memory()) / limit;
if (_dirty_memory_manager.has_extraneous_flushes_requested()) {
backlog = std::max(backlog, _memtable_controller.backlog_of_shares(200));
@@ -1503,10 +1504,12 @@ keyspace::make_column_family_config(const schema& s, const database& db) const {
cfg.compaction_concurrency_semaphore = _config.compaction_concurrency_semaphore;
cfg.cf_stats = _config.cf_stats;
cfg.enable_incremental_backups = _config.enable_incremental_backups;
cfg.compaction_scheduling_group = _config.compaction_scheduling_group;
cfg.memory_compaction_scheduling_group = _config.memory_compaction_scheduling_group;
cfg.memtable_scheduling_group = _config.memtable_scheduling_group;
cfg.memtable_to_cache_scheduling_group = _config.memtable_to_cache_scheduling_group;
cfg.streaming_scheduling_group = _config.streaming_scheduling_group;
cfg.statement_scheduling_group = _config.statement_scheduling_group;
cfg.enable_metrics_reporting = db_config.enable_keyspace_column_family_metrics();
cfg.enable_node_aggregated_table_metrics = db_config.enable_node_aggregated_table_metrics();
cfg.tombstone_warn_threshold = db_config.tombstone_warn_threshold();
@@ -2450,10 +2453,12 @@ database::make_keyspace_config(const keyspace_metadata& ksm, system_keyspace is_
cfg.cf_stats = &_cf_stats;
cfg.enable_incremental_backups = _enable_incremental_backups;
cfg.compaction_scheduling_group = _dbcfg.compaction_scheduling_group;
cfg.memory_compaction_scheduling_group = _dbcfg.memory_compaction_scheduling_group;
cfg.memtable_scheduling_group = _dbcfg.memtable_scheduling_group;
cfg.memtable_to_cache_scheduling_group = _dbcfg.memtable_to_cache_scheduling_group;
cfg.streaming_scheduling_group = _dbcfg.streaming_scheduling_group;
cfg.statement_scheduling_group = _dbcfg.statement_scheduling_group;
cfg.enable_metrics_reporting = _cfg.enable_keyspace_column_family_metrics();
cfg.view_update_memory_semaphore_limit = max_memory_pending_view_updates();
@@ -3777,7 +3782,7 @@ future<utils::chunked_vector<temporary_buffer<char>>> database::sample_data_file
&result,
chunk_size
] (database& local_db, state_by_shard& local_state) -> future<> {
auto ticket = co_await get_units(local_db._sample_data_files_local_concurrency_limiter, 1);
auto ticket = get_units(local_db._sample_data_files_local_concurrency_limiter, 1);
// In `chosen_chunks`, the sorted array of chosen chunk offsets (in the "global chunk list"),
// find the range of offsets which belongs to us.

View File

@@ -466,7 +466,9 @@ public:
replica::cf_stats* cf_stats = nullptr;
seastar::scheduling_group memtable_scheduling_group;
seastar::scheduling_group memtable_to_cache_scheduling_group;
seastar::scheduling_group compaction_scheduling_group;
seastar::scheduling_group memory_compaction_scheduling_group;
seastar::scheduling_group statement_scheduling_group;
seastar::scheduling_group streaming_scheduling_group;
bool enable_metrics_reporting = false;
bool enable_node_aggregated_table_metrics = true;
@@ -1403,7 +1405,9 @@ public:
replica::cf_stats* cf_stats = nullptr;
seastar::scheduling_group memtable_scheduling_group;
seastar::scheduling_group memtable_to_cache_scheduling_group;
seastar::scheduling_group compaction_scheduling_group;
seastar::scheduling_group memory_compaction_scheduling_group;
seastar::scheduling_group statement_scheduling_group;
seastar::scheduling_group streaming_scheduling_group;
bool enable_metrics_reporting = false;
size_t view_update_memory_semaphore_limit;
@@ -1613,6 +1617,7 @@ private:
dirty_memory_manager _dirty_memory_manager;
database_config _dbcfg;
backlog_controller::scheduling_group _flush_sg;
flush_controller _memtable_controller;
drain_progress _drain_progress {};
@@ -1791,6 +1796,8 @@ public:
return &_cf_stats;
}
seastar::scheduling_group get_streaming_scheduling_group() const { return _dbcfg.streaming_scheduling_group; }
seastar::scheduling_group get_gossip_scheduling_group() const { return _dbcfg.gossip_scheduling_group; }
compaction::compaction_manager& get_compaction_manager() {

View File

@@ -2754,7 +2754,7 @@ public:
return _cg.get_backlog_tracker();
}
// Returns the compaction group's id rendered as a string.
// (A second, unreachable return that formatted the same value via
// fmt::format("{}", ...) was dropped; the produced string is identical.)
const std::string get_group_id() const noexcept override {
    return fmt::to_string(_cg.group_id());
}
seastar::condition_variable& get_staging_done_condition() noexcept override {
@@ -4964,6 +4964,7 @@ future<> table::cleanup_tablet(database& db, db::system_keyspace& sys_ks, locato
co_await stop_compaction_groups(sg);
co_await utils::get_local_injector().inject("delay_tablet_compaction_groups_cleanup", std::chrono::seconds(5));
co_await cleanup_compaction_groups(db, sys_ks, tid, sg);
co_await utils::get_local_injector().inject("tablet_cleanup_completion_wait", utils::wait_for_message(std::chrono::seconds(5)));
}
future<> table::cleanup_tablet_without_deallocation(database& db, db::system_keyspace& sys_ks, locator::tablet_id tid) {

View File

@@ -20,7 +20,7 @@ set -e
trap 'echo "error $? in $0 line $LINENO"' ERR
SCRIPT_NAME=$(basename $0)
SCYLLA_S3_RELOC_SERVER_DEFAULT_URL=https://api.backtrace.scylladb.com
SCYLLA_S3_RELOC_SERVER_DEFAULT_URL=http://backtrace.scylladb.com
function print_usage {
cat << EOF
@@ -284,8 +284,7 @@ then
log "Build id: ${BUILD_ID}"
# https://api.backtrace.scylladb.com/api/docs#/default/search_by_build_id_search_build_id_get
BUILD=$(curl "${SCYLLA_S3_RELOC_SERVER_URL}/api/search/build_id?build_id=${BUILD_ID}" -H 'accept: application/json')
BUILD=$(curl -s -X GET "${SCYLLA_S3_RELOC_SERVER_URL}/build.json?build_id=${BUILD_ID}")
if [[ -z "$BUILD" ]]
then
@@ -294,16 +293,12 @@ then
fi
RESPONSE_BUILD_ID=$(get_json_field "$BUILD" "build_id")
BUILD_MODE=$(get_json_field "$BUILD" "build_type")
PACKAGE_URL=$(get_json_field "$BUILD" "unstripped_url")
BUILD_DATA=$(get_json_field "$BUILD" "build_data")
VERSION=$(get_json_field "$BUILD_DATA" "version")
PRODUCT=$(get_json_field "$BUILD_DATA" "product")
RELEASE=$(get_json_field "$BUILD_DATA" "release")
ARCH=$(get_json_field "$BUILD_DATA" "platform")
TIMESTAMP=$(get_json_field "$BUILD_DATA" "timestamp")
VERSION=$(get_json_field "$BUILD" "version")
PRODUCT=$(get_json_field "$BUILD" "product")
RELEASE=$(get_json_field "$BUILD" "release")
ARCH=$(get_json_field "$BUILD" "arch")
BUILD_MODE=$(get_json_field "$BUILD" "build_mode")
PACKAGE_URL=$(get_json_field "$BUILD" "package_url" 1)
if [[ "$RESPONSE_BUILD_ID" != "$BUILD_ID" ]]
then
@@ -311,7 +306,7 @@ then
exit 1
fi
log "Matching build is ${PRODUCT}-${VERSION} ${RELEASE} ${BUILD_MODE}-${ARCH} from ${TIMESTAMP}"
log "Matching build is ${PRODUCT}-${VERSION} ${RELEASE} ${BUILD_MODE}-${ARCH}"
fi
if ! [[ -d ${ARTIFACT_DIR}/scylla.package ]]

Submodule seastar updated: d2953d2ad1...f55dc7ebed

View File

@@ -217,8 +217,6 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
static const std::unordered_set<auth::resource> vector_search_system_resources = {
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_STREAMS),
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_TIMESTAMPS),
};
if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||

View File

@@ -56,9 +56,6 @@ static future<schema_ptr> get_schema_definition(table_schema_version v, locator:
migration_manager::migration_manager(migration_notifier& notifier, gms::feature_service& feat, netw::messaging_service& ms,
service::storage_proxy& storage_proxy, gms::gossiper& gossiper, service::raft_group0_client& group0_client, sharded<db::system_keyspace>& sysks) :
_notifier(notifier)
, _background_tasks("migration_manager::background_tasks")
, _feat(feat), _messaging(ms), _storage_proxy(storage_proxy), _ss("migration_manager::storage_service"), _gossiper(gossiper), _group0_client(group0_client)
, _sys_ks(sysks)
, _group0_barrier(this_shard_id() == 0 ?
std::function<future<>()>([this] () -> future<> {
if ((co_await _group0_client.get_group0_upgrade_state()).second == group0_upgrade_state::use_pre_raft_procedures) {
@@ -66,7 +63,7 @@ migration_manager::migration_manager(migration_notifier& notifier, gms::feature_
}
// This will run raft barrier and will sync schema with the leader
co_await with_scheduling_group(_gossiper.get_scheduling_group(), [this] {
co_await with_scheduling_group(_storage_proxy.get_db().local().get_gossip_scheduling_group(), [this] {
return start_group0_operation().discard_result();
});
}) :
@@ -77,6 +74,9 @@ migration_manager::migration_manager(migration_notifier& notifier, gms::feature_
});
})
)
, _background_tasks("migration_manager::background_tasks")
, _feat(feat), _messaging(ms), _storage_proxy(storage_proxy), _ss("migration_manager::storage_service"), _gossiper(gossiper), _group0_client(group0_client)
, _sys_ks(sysks)
, _schema_push([this] { return passive_announce(); })
, _concurrent_ddl_retries{10}
{

View File

@@ -57,6 +57,7 @@ private:
migration_notifier& _notifier;
std::unordered_map<locator::host_id, serialized_action> _schema_pulls;
serialized_action _group0_barrier;
std::vector<gms::feature::listener_registration> _feature_listeners;
seastar::named_gate _background_tasks;
static const std::chrono::milliseconds migration_delay;
@@ -68,7 +69,6 @@ private:
seastar::abort_source _as;
service::raft_group0_client& _group0_client;
sharded<db::system_keyspace>& _sys_ks;
serialized_action _group0_barrier;
serialized_action _schema_push;
table_schema_version _schema_version_to_publish;

View File

@@ -72,7 +72,7 @@ void group0_state_id_handler::refresh() {
const auto min_state_id = std::ranges::min(group0_members_state_ids, [](auto a, auto b) {
if (!a || !b) {
// This should never happen, but if it does, it's a bug.
on_internal_error(slogger, "unexpected empty state_id");
on_fatal_internal_error(slogger, "unexpected empty state_id");
}
return utils::timeuuid_tri_compare(a, b) < 0;
});

View File

@@ -149,31 +149,19 @@ public:
const auto& node = nodes_info.at(voter_id);
if (node.is_alive) {
if (_alive_nodes_remaining == 0) {
on_internal_error(rvlogger,
format("rack_info: no alive nodes remaining, but node {} is alive", voter_id));
}
SCYLLA_ASSERT(_alive_nodes_remaining > 0);
--_alive_nodes_remaining;
if (node.is_leader) {
if (!_owns_alive_leader) {
on_internal_error(rvlogger,
format("rack_info: rack doesn't own a live leader, but leader {} is alive", voter_id));
}
SCYLLA_ASSERT(_owns_alive_leader);
_owns_alive_leader = false;
}
}
if (node.is_voter) {
if (node.is_alive) {
if (_existing_alive_voters_remaining == 0) {
on_internal_error(rvlogger,
format("rack_info: no live voters remaining, but voter {} is alive", voter_id));
}
SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
--_existing_alive_voters_remaining;
} else {
if (_existing_dead_voters_remaining == 0) {
on_internal_error(rvlogger,
format("rack_info: no dead voters remaining, but voter {} is dead", voter_id));
}
SCYLLA_ASSERT(_existing_dead_voters_remaining > 0);
--_existing_dead_voters_remaining;
}
}
@@ -291,25 +279,16 @@ public:
if (node.is_alive) {
if (node.is_voter) {
if (_existing_alive_voters_remaining == 0) {
on_internal_error(rvlogger,
format("datacenter_info: no live voters remaining, but voter {} is alive", *voter_id));
}
SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
--_existing_alive_voters_remaining;
}
if (node.is_leader) {
if (!_owns_alive_leader) {
on_internal_error(rvlogger,
format("datacenter_info: DC doesn't own a live leader, but leader {} is alive", *voter_id));
}
SCYLLA_ASSERT(_owns_alive_leader);
_owns_alive_leader = false;
}
}
if (_nodes_remaining == 0) {
on_internal_error(rvlogger,
format("datacenter_info: no nodes remaining, but voter {} belongs to this DC", *voter_id));
}
SCYLLA_ASSERT(_nodes_remaining > 0);
--_nodes_remaining;
++_assigned_voters_count;

View File

@@ -123,7 +123,12 @@ utils::small_vector<locator::host_id, N> addr_vector_to_id(const gms::gossiper&
// Check the effective replication map consistency:
// we have an inconsistent effective replication map in case the number of
// read replicas is higher than the replication factor.
[[maybe_unused]] void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
// Skip for non-debug builds.
if constexpr (!tools::build_info::is_debug_build()) {
return;
}
const sstring error = erm.get_replication_strategy().sanity_check_read_replicas(erm, read_replicas);
if (!error.empty()) {
on_internal_error(slogger, error);
@@ -612,13 +617,8 @@ private:
try {
// FIXME: get_schema_for_write() doesn't timeout
schema_ptr s = co_await get_schema_for_write(schema_version, reply_to_host_id, shard, timeout);
// This erm ensures that tablet migrations wait for replica requests,
// even if the coordinator is no longer available.
const auto erm = s->table().get_effective_replication_map();
// Note: blocks due to execution_stage in replica::database::apply()
co_await p->run_fenceable_write(erm->get_replication_strategy(),
co_await p->run_fenceable_write(s->table().get_effective_replication_map()->get_replication_strategy(),
fence, src_addr,
[&] { return apply_fn(p, trace_state_ptr, std::move(s), m, timeout); });
// We wait for send_mutation_done to complete, otherwise, if reply_to is busy, we will accumulate
@@ -868,10 +868,6 @@ private:
slogger.info("storage_proxy::handle_read injection done");
});
// This erm ensures that tablet migrations wait for replica requests,
// even if the coordinator is no longer available.
auto erm = s->table().get_effective_replication_map();
auto pr2 = ::compat::unwrap(std::move(pr), *s);
auto do_query = [&]() {
if constexpr (verb == read_verb::read_data) {
@@ -879,6 +875,7 @@ private:
// this function assumes singular queries but doesn't validate
throw std::runtime_error("READ_DATA called with wrapping range");
}
auto erm = s->table().get_effective_replication_map();
p->get_stats().replica_data_reads++;
if (!oda) {
throw std::runtime_error("READ_DATA called without digest algorithm");
@@ -995,12 +992,6 @@ private:
auto schema = co_await get_schema_for_read(cmd.schema_version, src_addr, src_shard, *timeout);
dht::token token = dht::get_token(*schema, key);
// This guard ensures that tablet migrations wait for replica requests,
// even if the LWT coordinator is no longer available.
locator::token_metadata_guard guard(schema->table(), token);
co_await _sp.apply_fence(fence_opt, src_addr);
unsigned shard = schema->table().shard_for_reads(token);
bool local = shard == this_shard_id();
_sp.get_stats().replica_cross_shard_ops += !local;
@@ -1041,12 +1032,6 @@ private:
});
auto schema = co_await get_schema_for_read(proposal.update.schema_version(), src_addr, src_shard, *timeout);
dht::token token = proposal.update.decorated_key(*schema).token();
// This guard ensures that tablet migrations wait for replica requests,
// even if the LWT coordinator is no longer available.
locator::token_metadata_guard guard(schema->table(), token);
co_await _sp.apply_fence(fence_opt, src_addr);
unsigned shard = schema->table().shard_for_reads(token);
bool local = shard == this_shard_id();
_sp.get_stats().replica_cross_shard_ops += !local;
@@ -1088,12 +1073,6 @@ private:
auto d = defer([] { pruning--; });
auto schema = co_await get_schema_for_read(schema_id, src_addr, src_shard, *timeout);
dht::token token = dht::get_token(*schema, key);
// This guard ensures that tablet migrations wait for replica requests,
// even if the LWT coordinator is no longer available.
locator::token_metadata_guard guard(schema->table(), token);
co_await _sp.apply_fence(fence_opt, src_addr);
unsigned shard = schema->table().shard_for_reads(token);
bool local = shard == this_shard_id();
_sp.get_stats().replica_cross_shard_ops += !local;
@@ -6993,12 +6972,7 @@ host_id_vector_replica_set storage_proxy::get_endpoints_for_reading(const schema
return host_id_vector_replica_set{my_host_id(erm)};
}
auto endpoints = erm.get_replicas_for_reading(token);
// Skip for non-debug builds and maintenance mode.
if constexpr (tools::build_info::is_debug_build()) {
if (!_db.local().get_config().maintenance_mode()) {
validate_read_replicas(erm, endpoints);
}
}
validate_read_replicas(erm, endpoints);
auto it = std::ranges::remove_if(endpoints, std::not_fn(std::bind_front(&storage_proxy::is_alive, this, std::cref(erm)))).begin();
endpoints.erase(it, endpoints.end());
sort_endpoints_by_proximity(erm, endpoints);

Some files were not shown because too many files have changed in this diff Show More