Compare commits

..

2 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
af2d1adcdc docs: document alternator_streams_increased_compatibility option in compatibility.md
Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
2026-03-09 16:37:52 +00:00
copilot-swe-agent[bot]
029712dc56 Initial plan 2026-03-09 16:35:57 +00:00
2653 changed files with 16222 additions and 18808 deletions

View File

@@ -55,26 +55,22 @@ ninja build/<mode>/test/boost/<test_name>
ninja build/<mode>/scylla
# Run all tests in a file
./test.py --mode=<mode> test/<suite>/<test_name>.py
./test.py --mode=<mode> <test_path>
# Run a single test case from a file
./test.py --mode=<mode> test/<suite>/<test_name>.py::<test_function_name>
# Run all tests in a directory
./test.py --mode=<mode> test/<suite>/
./test.py --mode=<mode> <test_path>::<test_function_name>
# Examples
./test.py --mode=dev test/alternator/
./test.py --mode=dev test/cluster/test_raft_voters.py::test_raft_limited_voters_retain_coordinator
./test.py --mode=dev test/cqlpy/test_json.py
./test.py --mode=dev alternator/
./test.py --mode=dev cluster/test_raft_voters::test_raft_limited_voters_retain_coordinator
# Optional flags
./test.py --mode=dev test/cluster/test_raft_no_quorum.py -v # Verbose output
./test.py --mode=dev test/cluster/test_raft_no_quorum.py --repeat 5 # Repeat test 5 times
./test.py --mode=dev cluster/test_raft_no_quorum -v # Verbose output
./test.py --mode=dev cluster/test_raft_no_quorum --repeat 5 # Repeat test 5 times
```
**Important:**
- Use full path with `.py` extension (e.g., `test/cluster/test_raft_no_quorum.py`, not `cluster/test_raft_no_quorum`)
- Use path without `.py` extension (e.g., `cluster/test_raft_no_quorum`, not `cluster/test_raft_no_quorum.py`)
- To run a single test case, append `::<test_function_name>` to the file path
- Add `-v` for verbose output
- Add `--repeat <num>` to repeat a test multiple times

View File

@@ -1,6 +1,6 @@
version: 2
updates:
- package-ecosystem: "uv"
- package-ecosystem: "pip"
directory: "/docs"
schedule:
interval: "daily"

View File

@@ -8,9 +8,6 @@ on:
jobs:
check-fixes-prefix:
runs-on: ubuntu-latest
permissions:
contents: read
issues: write
steps:
- name: Check PR body for "Fixes" prefix patterns
uses: actions/github-script@v7

View File

@@ -1,8 +1,8 @@
name: Sync Jira Based on PR Events
name: Sync Jira Based on PR Events
on:
pull_request_target:
types: [opened, edited, ready_for_review, review_requested, labeled, unlabeled, closed]
types: [opened, ready_for_review, review_requested, labeled, unlabeled, closed]
permissions:
contents: read
@@ -10,9 +10,32 @@ permissions:
issues: write
jobs:
jira-sync:
uses: scylladb/github-automation/.github/workflows/main_pr_events_jira_sync.yml@main
with:
caller_action: ${{ github.event.action }}
jira-sync-pr-opened:
if: github.event.action == 'opened'
uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_opened.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
jira-sync-in-review:
if: github.event.action == 'ready_for_review' || github.event.action == 'review_requested'
uses: scylladb/github-automation/.github/workflows/main_jira_sync_in_review.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
jira-sync-add-label:
if: github.event.action == 'labeled'
uses: scylladb/github-automation/.github/workflows/main_jira_sync_add_label.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
jira-status-remove-label:
if: github.event.action == 'unlabeled'
uses: scylladb/github-automation/.github/workflows/main_jira_sync_remove_label.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
jira-status-pr-closed:
if: github.event.action == 'closed'
uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_closed.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,14 +1,14 @@
name: Call Jira release creation for new milestone
name: Call Jira release creation for new milestone
on:
milestone:
types: [created, closed]
types: [created]
jobs:
sync-milestone-to-jira:
uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
with:
# Comma-separated list of Jira project keys
jira_project_keys: "SCYLLADB,CUSTOMER,SMI,RELENG,VECTOR"
jira_project_keys: "SCYLLADB,CUSTOMER,SMI,RELENG"
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -19,8 +19,6 @@ on:
jobs:
release:
permissions:
pages: write
id-token: write
contents: write
runs-on: ubuntu-latest
steps:
@@ -33,9 +31,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install uv
uses: astral-sh/setup-uv@v6
python-version: "3.10"
- name: Set up env
run: make -C docs FLAG="${{ env.FLAG }}" setupenv
- name: Build docs

View File

@@ -29,9 +29,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install uv
uses: astral-sh/setup-uv@v6
python-version: "3.10"
- name: Set up env
run: make -C docs FLAG="${{ env.FLAG }}" setupenv
- name: Build docs

View File

@@ -1,6 +1,4 @@
name: Trigger Scylla CI Route
permissions:
contents: read
on:
issue_comment:

View File

@@ -1,8 +1,5 @@
name: Trigger next gating
permissions:
contents: read
on:
push:
branches:

View File

@@ -13,8 +13,7 @@
#include <string_view>
#include "alternator/auth.hh"
#include <fmt/format.h>
#include "db/consistency_level_type.hh"
#include "db/system_keyspace.hh"
#include "auth/password_authenticator.hh"
#include "service/storage_proxy.hh"
#include "alternator/executor.hh"
#include "cql3/selection/selection.hh"
@@ -26,8 +25,8 @@ namespace alternator {
static logging::logger alogger("alternator-auth");
future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::string username) {
schema_ptr schema = proxy.data_dictionary().find_schema(db::system_keyspace::NAME, "roles");
future<std::string> get_key_from_roles(service::storage_proxy& proxy, auth::service& as, std::string username) {
schema_ptr schema = proxy.data_dictionary().find_schema(auth::get_auth_ks_name(as.query_processor()), "roles");
partition_key pk = partition_key::from_single_value(*schema, utf8_type->decompose(username));
dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*schema, pk))};
std::vector<query::clustering_range> bounds{query::clustering_range::make_open_ended_both_sides()};
@@ -40,7 +39,7 @@ future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::strin
auto partition_slice = query::partition_slice(std::move(bounds), {}, query::column_id_vector{salted_hash_col->id, can_login_col->id}, selection->get_query_options());
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice,
proxy.get_max_result_size(partition_slice), query::tombstone_limit(proxy.get_tombstone_limit()));
auto cl = db::consistency_level::LOCAL_ONE;
auto cl = auth::password_authenticator::consistency_for_user(username);
service::client_state client_state{service::client_state::internal_tag()};
service::storage_proxy::coordinator_query_result qr = co_await proxy.query(schema, std::move(command), std::move(partition_ranges), cl,

View File

@@ -20,6 +20,6 @@ namespace alternator {
using key_cache = utils::loading_cache<std::string, std::string, 1>;
future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::string username);
future<std::string> get_key_from_roles(service::storage_proxy& proxy, auth::service& as, std::string username);
}

View File

@@ -3463,11 +3463,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
if (should_add_wcu) {
rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
}
auto duration = std::chrono::steady_clock::now() - start_time;
_stats.api_operations.batch_write_item_latency.mark(duration);
for (const auto& w : per_table_wcu) {
w.first->api_operations.batch_write_item_latency.mark(duration);
}
_stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
co_return rjson::print(std::move(ret));
}
@@ -4978,12 +4974,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
if (!some_succeeded && eptr) {
co_await coroutine::return_exception_ptr(std::move(eptr));
}
auto duration = std::chrono::steady_clock::now() - start_time;
_stats.api_operations.batch_get_item_latency.mark(duration);
for (const table_requests& rs : requests) {
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
per_table_stats->api_operations.batch_get_item_latency.mark(duration);
}
_stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
if (is_big(response)) {
co_return make_streamed(std::move(response));
} else {

View File

@@ -411,8 +411,8 @@ future<std::string> server::verify_signature(const request& req, const chunked_c
}
}
auto cache_getter = [&proxy = _proxy] (std::string username) {
return get_key_from_roles(proxy, std::move(username));
auto cache_getter = [&proxy = _proxy, &as = _auth_service] (std::string username) {
return get_key_from_roles(proxy, as, std::move(username));
};
return _key_cache.get_ptr(user, cache_getter).then_wrapped([this, &req, &content,
user = std::move(user),
@@ -771,7 +771,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
if (!username.empty()) {
client_state.set_login(auth::authenticated_user(username));
}
client_state.maybe_update_per_service_level_params();
co_await client_state.maybe_update_per_service_level_params();
tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content, _max_users_query_size_in_trace_output.get());
tracing::trace(trace_state, "{}", op);

View File

@@ -14,6 +14,20 @@
namespace alternator {
const char* ALTERNATOR_METRICS = "alternator";
// Convert a utils::estimated_histogram into a seastar::metrics::histogram.
// Seastar/Prometheus-style histogram buckets carry *cumulative* counts, so
// this folds the source histogram's per-bucket counts into a running total
// while copying each bucket's upper bound from bucket_offsets.
// NOTE(review): assumes histogram.buckets has at least bucket_offsets.size()
// entries — confirm against utils::estimated_histogram's invariants.
static seastar::metrics::histogram estimated_histogram_to_metrics(const utils::estimated_histogram& histogram) {
seastar::metrics::histogram res;
res.buckets.resize(histogram.bucket_offsets.size());
uint64_t cumulative_count = 0;
// Total sample count and sum are carried over directly from the source.
res.sample_count = histogram._count;
res.sample_sum = histogram._sample_sum;
for (size_t i = 0; i < res.buckets.size(); i++) {
auto& v = res.buckets[i];
v.upper_bound = histogram.bucket_offsets[i];
// Running total: each output bucket's count includes all earlier buckets.
cumulative_count += histogram.buckets[i];
v.count = cumulative_count;
}
return res;
}
static seastar::metrics::label column_family_label("cf");
static seastar::metrics::label keyspace_label("ks");
@@ -137,21 +151,21 @@ static void register_metrics_with_optional_table(seastar::metrics::metric_groups
seastar::metrics::make_counter("batch_item_count", seastar::metrics::description("The total number of items processed across all batches"), labels,
stats.api_operations.batch_get_item_batch_total)(op("BatchGetItem")).aggregate(aggregate_labels).set_skip_when_empty(),
seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"), labels,
[&stats]{ return to_metrics_histogram(stats.api_operations.batch_get_item_histogram);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_get_item_histogram);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"), labels,
[&stats]{ return to_metrics_histogram(stats.api_operations.batch_write_item_histogram);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_write_item_histogram);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.get_item_op_size_kb);})(op("GetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.get_item_op_size_kb);})(op("GetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.put_item_op_size_kb);})(op("PutItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.put_item_op_size_kb);})(op("PutItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.delete_item_op_size_kb);})(op("DeleteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.delete_item_op_size_kb);})(op("DeleteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.update_item_op_size_kb);})(op("UpdateItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.update_item_op_size_kb);})(op("UpdateItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.batch_get_item_op_size_kb);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.batch_get_item_op_size_kb);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.batch_write_item_op_size_kb);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.batch_write_item_op_size_kb);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
});
seastar::metrics::label expression_label("expression");

View File

@@ -16,8 +16,6 @@
#include "cql3/stats.hh"
namespace alternator {
using batch_histogram = utils::estimated_histogram_with_max<128>;
using op_size_histogram = utils::estimated_histogram_with_max<512>;
// Object holding per-shard statistics related to Alternator.
// While this object is alive, these metrics are also registered to be
@@ -78,34 +76,34 @@ public:
utils::timed_rate_moving_average_summary_and_histogram batch_get_item_latency;
utils::timed_rate_moving_average_summary_and_histogram get_records_latency;
batch_histogram batch_get_item_histogram;
batch_histogram batch_write_item_histogram;
utils::estimated_histogram batch_get_item_histogram{22}; // a histogram that covers the range 1 - 100
utils::estimated_histogram batch_write_item_histogram{22}; // a histogram that covers the range 1 - 100
} api_operations;
// Operation size metrics
struct {
// Item size statistics collected per table and aggregated per node.
// Each histogram covers the range 0 - 512. Resolves #25143.
// Each histogram covers the range 0 - 446. Resolves #25143.
// A size is the retrieved item's size.
op_size_histogram get_item_op_size_kb;
utils::estimated_histogram get_item_op_size_kb{30};
// A size is the maximum of the new item's size and the old item's size.
op_size_histogram put_item_op_size_kb;
utils::estimated_histogram put_item_op_size_kb{30};
// A size is the deleted item's size. If the deleted item's size is
// unknown (i.e. read-before-write wasn't necessary and it wasn't
// forced by a configuration option), it won't be recorded on the
// histogram.
op_size_histogram delete_item_op_size_kb;
utils::estimated_histogram delete_item_op_size_kb{30};
// A size is the maximum of existing item's size and the estimated size
// of the update. This will be changed to the maximum of the existing item's
// size and the new item's size in a subsequent PR.
op_size_histogram update_item_op_size_kb;
utils::estimated_histogram update_item_op_size_kb{30};
// A size is the sum of the sizes of all items per table. This means
// that a single BatchGetItem / BatchWriteItem updates the histogram
// for each table that it has items in.
// The sizes are the retrieved items' sizes grouped per table.
op_size_histogram batch_get_item_op_size_kb;
utils::estimated_histogram batch_get_item_op_size_kb{30};
// The sizes are the the written items' sizes grouped per table.
op_size_histogram batch_write_item_op_size_kb;
utils::estimated_histogram batch_write_item_op_size_kb{30};
} operation_sizes;
// Count of authentication and authorization failures, counted if either
// alternator_enforce_authorization or alternator_warn_authorization are
@@ -142,7 +140,7 @@ public:
cql3::cql_stats cql_stats;
// Enumeration of expression types only for stats
// if needed it can be extended e.g. per operation
// if needed it can be extended e.g. per operation
enum expression_types {
UPDATE_EXPRESSION,
CONDITION_EXPRESSION,
@@ -166,7 +164,7 @@ struct table_stats {
void register_metrics(seastar::metrics::metric_groups& metrics, const stats& stats);
inline uint64_t bytes_to_kb_ceil(uint64_t bytes) {
return (bytes) / 1024;
return (bytes + 1023) / 1024;
}
}

View File

@@ -33,8 +33,6 @@
#include "data_dictionary/data_dictionary.hh"
#include "utils/rjson.hh"
static logging::logger elogger("alternator-streams");
/**
* Base template type to implement rapidjson::internal::TypeHelper<...>:s
* for types that are ostreamable/string constructible/castable.
@@ -430,25 +428,6 @@ using namespace std::chrono_literals;
// Dynamo docs says no data shall live longer than 24h.
static constexpr auto dynamodb_streams_max_window = 24h;
// find the parent shard in previous generation for the given child shard
// takes care of wrap-around case in vnodes
// prev_streams must be sorted by token
// Locate, in the previous CDC generation's stream list (sorted by token),
// the stream that acts as the parent of `child`: the first stream whose
// token is >= the child's token, wrapping around to the first stream when
// the child's token is past the last entry (vnode ring wrap-around).
// `prev_timestamp` is only used for the error message below.
const cdc::stream_id& find_parent_shard_in_previous_generation(db_clock::time_point prev_timestamp, const utils::chunked_vector<cdc::stream_id> &prev_streams, const cdc::stream_id &child) {
if (prev_streams.empty()) {
// something is really wrong - streams are empty
// let's try internal_error in hope it will be notified and fixed
on_internal_error(elogger, fmt::format("streams are empty for cdc generation at {} ({})", prev_timestamp, prev_timestamp.time_since_epoch().count()));
}
// Binary search: first stream whose token is not less than the child's token.
auto it = std::lower_bound(prev_streams.begin(), prev_streams.end(), child.token(), [](const cdc::stream_id& id, const dht::token& t) {
return id.token() < t;
});
if (it == prev_streams.end()) {
// wrap around case - take first
it = prev_streams.begin();
}
return *it;
}
future<executor::request_return_type> executor::describe_stream(client_state& client_state, service_permit permit, rjson::value request) {
_stats.api_operations.describe_stream++;
@@ -599,8 +578,16 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
auto shard = rjson::empty_object();
if (prev != e) {
auto &pid = find_parent_shard_in_previous_generation(prev->first, prev->second.streams, id);
rjson::add(shard, "ParentShardId", shard_id(prev->first, pid));
auto& pids = prev->second.streams;
auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
return t < id.token();
});
if (pid != pids.begin()) {
pid = std::prev(pid);
}
if (pid != pids.end()) {
rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
}
}
last.emplace(ts, id);

View File

@@ -243,7 +243,7 @@
"GOSSIP_DIGEST_SYN",
"GOSSIP_DIGEST_ACK2",
"GOSSIP_SHUTDOWN",
"UNUSED__DEFINITIONS_UPDATE",
"DEFINITIONS_UPDATE",
"TRUNCATE",
"UNUSED__REPLICATION_FINISHED",
"MIGRATION_REQUEST",

View File

@@ -1295,45 +1295,6 @@
}
]
},
{
"path":"/storage_service/logstor_compaction",
"operations":[
{
"method":"POST",
"summary":"Trigger compaction of the key-value storage",
"type":"void",
"nickname":"logstor_compaction",
"produces":[
"application/json"
],
"parameters":[
{
"name":"major",
"description":"When true, perform a major compaction",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/logstor_flush",
"operations":[
{
"method":"POST",
"summary":"Trigger flush of logstor storage",
"type":"void",
"nickname":"logstor_flush",
"produces":[
"application/json"
],
"parameters":[]
}
]
},
{
"path":"/storage_service/active_repair/",
"operations":[
@@ -3268,38 +3229,6 @@
}
]
},
{
"path":"/storage_service/logstor_info",
"operations":[
{
"method":"GET",
"summary":"Logstor segment information for one table",
"type":"table_logstor_info",
"nickname":"logstor_info",
"produces":[
"application/json"
],
"parameters":[
{
"name":"keyspace",
"description":"The keyspace",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"table",
"description":"table name",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/retrain_dict",
"operations":[
@@ -3708,47 +3637,6 @@
}
}
},
"logstor_hist_bucket":{
"id":"logstor_hist_bucket",
"properties":{
"bucket":{
"type":"long"
},
"count":{
"type":"long"
},
"min_data_size":{
"type":"long"
},
"max_data_size":{
"type":"long"
}
}
},
"table_logstor_info":{
"id":"table_logstor_info",
"description":"Per-table logstor segment distribution",
"properties":{
"keyspace":{
"type":"string"
},
"table":{
"type":"string"
},
"compaction_groups":{
"type":"long"
},
"segments":{
"type":"long"
},
"data_size_histogram":{
"type":"array",
"items":{
"$ref":"logstor_hist_bucket"
}
}
}
},
"tablet_repair_result":{
"id":"tablet_repair_result",
"description":"Tablet repair result",

View File

@@ -209,21 +209,6 @@
"parameters":[]
}
]
},
{
"path":"/system/chosen_sstable_version",
"operations":[
{
"method":"GET",
"summary":"Get sstable version currently chosen for use in new sstables",
"type":"string",
"nickname":"get_chosen_sstable_version",
"produces":[
"application/json"
],
"parameters":[]
}
]
}
]
}

View File

@@ -122,9 +122,9 @@ future<> unset_thrift_controller(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_thrift_controller(ctx, r); });
}
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
return ctx.http_server.set_routes([&ctx, &ss, &ssc, &group0_client] (routes& r) {
set_storage_service(ctx, r, ss, ssc, group0_client);
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
return ctx.http_server.set_routes([&ctx, &ss, &group0_client] (routes& r) {
set_storage_service(ctx, r, ss, group0_client);
});
}

View File

@@ -98,7 +98,7 @@ future<> set_server_config(http_context& ctx, db::config& cfg);
future<> unset_server_config(http_context& ctx);
future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snitch);
future<> unset_server_snitch(http_context& ctx);
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>&, service::raft_group0_client&);
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
future<> unset_server_storage_service(http_context& ctx);
future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr);
future<> unset_server_client_routes(http_context& ctx);

View File

@@ -18,9 +18,7 @@
#include "utils/assert.hh"
#include "utils/estimated_histogram.hh"
#include <algorithm>
#include <sstream>
#include "db/data_listeners.hh"
#include "utils/hash.hh"
#include "storage_service.hh"
#include "compaction/compaction_manager.hh"
#include "unimplemented.hh"
@@ -344,56 +342,6 @@ uint64_t accumulate_on_active_memtables(replica::table& t, noncopyable_function<
return ret;
}
// REST handler for the generic "top partitions" endpoint: samples the
// hottest partitions for a bounded time window, optionally restricted by
// the comma-separated "table_filters" (fully-qualified ks:cf names) and/or
// "keyspace_filters" query parameters.
static
future<json::json_return_type>
rest_toppartitions_generic(sharded<replica::database>& db, std::unique_ptr<http::request> req) {
bool filters_provided = false;
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
filters_provided = true;
// Split the comma-separated list into <keyspace, table> tuples.
// NOTE(review): `filters` is never modified inside the loop, so the
// !filters.empty() test is constant here; termination relies on ss.good().
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
table_filters.emplace(parse_fully_qualified_cf_name(filter));
}
}
std::unordered_set<sstring> keyspace_filters {};
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
filters_provided = true;
// Same comma-splitting as above, but for bare keyspace names.
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
keyspace_filters.emplace(std::move(filter));
}
}
// when the query is empty return immediately
// (filter parameters were supplied but yielded no entries - answer with an
// empty result set instead of sampling everything).
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
apilog.debug("toppartitions query: processing results");
cf::toppartitions_query_results results;
results.read_cardinality = 0;
results.write_cardinality = 0;
return make_ready_future<json::json_return_type>(results);
}
// Tunable parameters with defaults: sampling window (ms), sampler
// capacity, and the number of entries to return.
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
api::req_param<unsigned> capacity(*req, "capacity", 256);
api::req_param<unsigned> list_size(*req, "list_size", 10);
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
// do_with keeps the query object alive for the whole asynchronous run.
return seastar::do_with(db::toppartitions_query(db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
return run_toppartitions_query(q);
});
}
void set_column_family(http_context& ctx, routes& r, sharded<replica::database>& db) {
cf::get_column_family_name.set(r, [&db] (const_req req){
std::vector<sstring> res;
@@ -1099,10 +1047,6 @@ void set_column_family(http_context& ctx, routes& r, sharded<replica::database>&
});
});
ss::toppartitions_generic.set(r, [&db] (std::unique_ptr<http::request> req) {
return rest_toppartitions_generic(db, std::move(req));
});
cf::force_major_compaction.set(r, [&ctx, &db](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
if (!req->get_query_param("split_output").empty()) {
fail(unimplemented::cause::API);
@@ -1269,7 +1213,6 @@ void unset_column_family(http_context& ctx, routes& r) {
cf::get_sstable_count_per_level.unset(r);
cf::get_sstables_for_key.unset(r);
cf::toppartitions.unset(r);
ss::toppartitions_generic.unset(r);
cf::force_major_compaction.unset(r);
ss::get_load.unset(r);
ss::get_metrics_load.unset(r);

View File

@@ -17,7 +17,9 @@
#include "gms/feature_service.hh"
#include "schema/schema_builder.hh"
#include "sstables/sstables_manager.hh"
#include "utils/hash.hh"
#include <optional>
#include <sstream>
#include <stdexcept>
#include <time.h>
#include <algorithm>
@@ -610,6 +612,56 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
}
// REST handler for the generic "top partitions" endpoint: samples the
// hottest partitions for a bounded time window, optionally restricted by
// the comma-separated "table_filters" (fully-qualified ks:cf names) and/or
// "keyspace_filters" query parameters. Uses the database via http_context.
static
future<json::json_return_type>
rest_toppartitions_generic(http_context& ctx, std::unique_ptr<http::request> req) {
bool filters_provided = false;
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
filters_provided = true;
// Split the comma-separated list into <keyspace, table> tuples.
// NOTE(review): `filters` is never modified inside the loop, so the
// !filters.empty() test is constant here; termination relies on ss.good().
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
table_filters.emplace(parse_fully_qualified_cf_name(filter));
}
}
std::unordered_set<sstring> keyspace_filters {};
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
filters_provided = true;
// Same comma-splitting as above, but for bare keyspace names.
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
std::getline(ss, filter, ',');
keyspace_filters.emplace(std::move(filter));
}
}
// when the query is empty return immediately
// (filter parameters were supplied but yielded no entries - answer with an
// empty result set instead of sampling everything).
if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
apilog.debug("toppartitions query: processing results");
httpd::column_family_json::toppartitions_query_results results;
results.read_cardinality = 0;
results.write_cardinality = 0;
return make_ready_future<json::json_return_type>(results);
}
// Tunable parameters with defaults: sampling window (ms), sampler
// capacity, and the number of entries to return.
api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
api::req_param<unsigned> capacity(*req, "capacity", 256);
api::req_param<unsigned> list_size(*req, "list_size", 10);
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
// do_with keeps the query object alive for the whole asynchronous run.
return seastar::do_with(db::toppartitions_query(ctx.db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
return run_toppartitions_query(q);
});
}
static
json::json_return_type
rest_get_release_version(sharded<service::storage_service>& ss, const_req& req) {
@@ -783,31 +835,9 @@ rest_force_keyspace_flush(http_context& ctx, std::unique_ptr<http::request> req)
// REST handler: trigger a compaction of the logstor (key-value) storage on
// all shards. The optional "major" boolean query parameter requests a major
// compaction; it defaults to false when absent.
static
future<json::json_return_type>
rest_logstor_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
bool major = false;
if (auto major_param = req->get_query_param("major"); !major_param.empty()) {
// validate_bool rejects anything that is not a valid boolean string.
major = validate_bool(major_param);
}
apilog.info("logstor_compaction: major={}", major);
auto& db = ctx.db;
co_await replica::database::trigger_logstor_compaction_on_all_shards(db, major);
co_return json_void();
}
// REST handler: flush the logstor separator on all shards. Takes no query
// parameters and returns an empty JSON body on completion.
static
future<json::json_return_type>
rest_logstor_flush(http_context& ctx, std::unique_ptr<http::request> req) {
apilog.info("logstor_flush");
auto& db = ctx.db;
co_await replica::database::flush_logstor_separator_on_all_shards(db);
co_return json_void();
}
static
future<json::json_return_type>
rest_decommission(sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, std::unique_ptr<http::request> req) {
rest_decommission(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
apilog.info("decommission");
return ss.local().decommission(ssc).then([] {
return ss.local().decommission().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
}
@@ -1523,54 +1553,6 @@ rest_sstable_info(http_context& ctx, std::unique_ptr<http::request> req) {
});
}
// REST handler: report per-table logstor segment statistics. Requires
// "keyspace" and "table" (or legacy "cf") query parameters, rejects tables
// that do not use logstor, then map-reduces the per-shard segment stats
// across all shards into a single merged result streamed back as JSON.
static
future<json::json_return_type>
rest_logstor_info(http_context& ctx, std::unique_ptr<http::request> req) {
auto keyspace = api::req_param<sstring>(*req, "keyspace", {}).value;
auto table = api::req_param<sstring>(*req, "table", {}).value;
if (table.empty()) {
// Fall back to the legacy "cf" parameter name for the table.
table = api::req_param<sstring>(*req, "cf", {}).value;
}
if (keyspace.empty()) {
throw bad_param_exception("The query parameter 'keyspace' is required");
}
if (table.empty()) {
throw bad_param_exception("The query parameter 'table' is required");
}
// Validate the names and resolve the table id before touching shards.
keyspace = validate_keyspace(ctx, keyspace);
auto tid = validate_table(ctx.db.local(), keyspace, table);
auto& cf = ctx.db.local().find_column_family(tid);
if (!cf.uses_logstor()) {
throw bad_param_exception(fmt::format("Table {}.{} does not use logstor", keyspace, table));
}
// do_with keeps the merged accumulator alive across the map-reduce.
return do_with(replica::logstor::table_segment_stats{}, [keyspace = std::move(keyspace), table = std::move(table), tid, &ctx] (replica::logstor::table_segment_stats& merged_stats) {
return ctx.db.map_reduce([&merged_stats](replica::logstor::table_segment_stats&& shard_stats) {
// Reduce step: fold each shard's stats into the accumulator.
merged_stats += shard_stats;
}, [tid](const replica::database& db) {
// Map step: collect this shard's stats for the table.
return db.get_logstor_table_segment_stats(tid);
}).then([&merged_stats, keyspace = std::move(keyspace), table = std::move(table)] {
ss::table_logstor_info result;
result.keyspace = keyspace;
result.table = table;
result.compaction_groups = merged_stats.compaction_group_count;
result.segments = merged_stats.segment_count;
for (const auto& bucket : merged_stats.histogram) {
// NOTE(review): only count and max_data_size are populated here;
// the model's bucket/min_data_size fields are left default -
// confirm this is intentional.
ss::logstor_hist_bucket hist;
hist.count = bucket.count;
hist.max_data_size = bucket.max_data_size;
result.data_size_histogram.push(std::move(hist));
}
return make_ready_future<json::json_return_type>(stream_object(result));
});
});
}
static
future<json::json_return_type>
rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::raft_group0_client& group0_client, std::unique_ptr<http::request> req) {
@@ -1590,7 +1572,10 @@ rest_upgrade_to_raft_topology(sharded<service::storage_service>& ss, std::unique
static
future<json::json_return_type>
rest_raft_topology_upgrade_status(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
co_return sstring("done");
const auto ustate = co_await ss.invoke_on(0, [] (auto& ss) {
return ss.get_topology_upgrade_state();
});
co_return sstring(format("{}", ustate));
}
static
@@ -1800,8 +1785,9 @@ rest_bind(FuncType func, BindArgs&... args) {
return std::bind_front(func, std::ref(args)...);
}
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
ss::toppartitions_generic.set(r, rest_bind(rest_toppartitions_generic, ctx));
ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
@@ -1816,9 +1802,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
ss::decommission.set(r, rest_bind(rest_decommission, ss));
ss::move.set(r, rest_bind(rest_move, ss));
ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
@@ -1867,7 +1851,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
@@ -1884,6 +1867,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
void unset_storage_service(http_context& ctx, routes& r) {
ss::get_token_endpoint.unset(r);
ss::toppartitions_generic.unset(r);
ss::get_release_version.unset(r);
ss::get_scylla_release_version.unset(r);
ss::get_schema_version.unset(r);
@@ -1897,8 +1881,6 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::reset_cleanup_needed.unset(r);
ss::force_flush.unset(r);
ss::force_keyspace_flush.unset(r);
ss::logstor_compaction.unset(r);
ss::logstor_flush.unset(r);
ss::decommission.unset(r);
ss::move.unset(r);
ss::remove_node.unset(r);
@@ -1946,7 +1928,6 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::get_ownership.unset(r);
ss::get_effective_ownership.unset(r);
ss::sstable_info.unset(r);
ss::logstor_info.unset(r);
ss::reload_raft_topology_state.unset(r);
ss::upgrade_to_raft_topology.unset(r);
ss::raft_topology_upgrade_status.unset(r);
@@ -2163,7 +2144,6 @@ void unset_snapshot(http_context& ctx, routes& r) {
ss::start_backup.unset(r);
cf::get_true_snapshots_size.unset(r);
cf::get_all_true_snapshots_size.unset(r);
ss::decommission.unset(r);
}
}

View File

@@ -66,7 +66,7 @@ struct scrub_info {
scrub_info parse_scrub_options(const http_context& ctx, std::unique_ptr<http::request> req);
void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>&, service::raft_group0_client&);
void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, service::raft_group0_client&);
void unset_storage_service(http_context& ctx, httpd::routes& r);
void set_sstables_loader(http_context& ctx, httpd::routes& r, sharded<sstables_loader>& sst_loader);
void unset_sstables_loader(http_context& ctx, httpd::routes& r);

View File

@@ -190,13 +190,6 @@ void set_system(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
});
});
hs::get_chosen_sstable_version.set(r, [&ctx] (std::unique_ptr<request> req) {
return smp::submit_to(0, [&ctx] {
auto format = ctx.db.local().get_user_sstables_manager().get_preferred_sstable_version();
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
});
});
}
}

View File

@@ -19,6 +19,7 @@ target_sources(scylla_auth
permission.cc
resource.cc
role_or_anonymous.cc
roles-metadata.cc
sasl_challenge.cc
saslauthd_authenticator.cc
service.cc
@@ -48,4 +49,4 @@ if (Scylla_USE_PRECOMPILED_HEADER_USE)
target_precompile_headers(scylla_auth REUSE_FROM scylla-precompiled-header)
endif()
check_headers(check-headers scylla_auth
GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)

View File

@@ -26,7 +26,7 @@ extern const std::string_view allow_all_authorizer_name;
class allow_all_authorizer final : public authorizer {
public:
allow_all_authorizer(cql3::query_processor&) {
allow_all_authorizer(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&) {
}
virtual future<> start() override {

View File

@@ -47,7 +47,7 @@ void cache::set_permission_loader(permission_loader_func loader) {
_permission_loader = std::move(loader);
}
lw_shared_ptr<const cache::role_record> cache::get(std::string_view role) const noexcept {
lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
auto it = _roles.find(role);
if (it == _roles.end()) {
return {};
@@ -55,16 +55,6 @@ lw_shared_ptr<const cache::role_record> cache::get(std::string_view role) const
return it->second;
}
void cache::for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const {
for (const auto& [name, record] : _roles) {
func(name, *record);
}
}
size_t cache::roles_count() const noexcept {
return _roles.size();
}
future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
std::unordered_map<resource, permission_set>* perms_cache;
lw_shared_ptr<role_record> role_ptr;
@@ -219,6 +209,9 @@ future<> cache::prune_all() noexcept {
}
future<> cache::load_all() {
if (legacy_mode(_qp)) {
co_return;
}
SCYLLA_ASSERT(this_shard_id() == 0);
auto units = co_await get_units(_loading_sem, 1, _as);
@@ -270,6 +263,9 @@ future<> cache::gather_inheriting_roles(std::unordered_set<role_name_t>& roles,
}
future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
if (legacy_mode(_qp)) {
co_return;
}
SCYLLA_ASSERT(this_shard_id() == 0);
auto units = co_await get_units(_loading_sem, 1, _as);

View File

@@ -9,7 +9,6 @@
#pragma once
#include <seastar/core/abort_source.hh>
#include <string_view>
#include <unordered_set>
#include <unordered_map>
@@ -20,7 +19,7 @@
#include <seastar/core/semaphore.hh>
#include <seastar/core/metrics_registration.hh>
#include "absl-flat_hash_map.hh"
#include <absl/container/flat_hash_map.h>
#include "auth/permission.hh"
#include "auth/common.hh"
@@ -43,8 +42,8 @@ public:
std::unordered_set<role_name_t> member_of;
std::unordered_set<role_name_t> members;
sstring salted_hash;
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
std::unordered_map<sstring, sstring> attributes;
std::unordered_map<sstring, permission_set> permissions;
private:
friend cache;
// cached permissions include effects of role's inheritance
@@ -53,7 +52,7 @@ public:
};
explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
lw_shared_ptr<const role_record> get(std::string_view role) const noexcept;
lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
void set_permission_loader(permission_loader_func loader);
future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
future<> prune(const resource& r);
@@ -62,15 +61,8 @@ public:
future<> load_roles(std::unordered_set<role_name_t> roles);
static bool includes_table(const table_id&) noexcept;
// Returns the number of roles in the cache.
size_t roles_count() const noexcept;
// The callback doesn't suspend (no co_await) so it observes the state
// of the cache atomically.
void for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const;
private:
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>, sstring_hash, sstring_eq>;
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
roles_map _roles;
// anonymous permissions map exists mainly due to compatibility with
// higher layers which use role_or_anonymous to get permissions.

View File

@@ -14,11 +14,18 @@
#include <seastar/core/sharded.hh>
#include "mutation/canonical_mutation.hh"
#include "schema/schema_fwd.hh"
#include "mutation/timestamp.hh"
#include "utils/assert.hh"
#include "utils/exponential_backoff_retry.hh"
#include "cql3/query_processor.hh"
#include "cql3/statements/create_table_statement.hh"
#include "schema/schema_builder.hh"
#include "service/migration_manager.hh"
#include "service/raft/group0_state_machine.hh"
#include "timeout_config.hh"
#include "utils/error_injection.hh"
#include "db/system_keyspace.hh"
namespace auth {
@@ -26,14 +33,22 @@ namespace meta {
namespace legacy {
constinit const std::string_view AUTH_KS("system_auth");
constinit const std::string_view USERS_CF("users");
} // namespace legacy
constinit const std::string_view AUTH_PACKAGE_NAME("org.apache.cassandra.auth.");
} // namespace meta
static logging::logger auth_log("auth");
std::string default_superuser(cql3::query_processor& qp) {
return qp.db().get_config().auth_superuser_name();
bool legacy_mode(cql3::query_processor& qp) {
return qp.auth_version < db::auth_version_t::v2;
}
std::string_view get_auth_ks_name(cql3::query_processor& qp) {
if (legacy_mode(qp)) {
return meta::legacy::AUTH_KS;
}
return db::system_keyspace::NAME;
}
// Func must support being invoked more than once.
@@ -50,6 +65,47 @@ future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_f
}).discard_result();
}
static future<> create_legacy_metadata_table_if_missing_impl(
std::string_view table_name,
cql3::query_processor& qp,
std::string_view cql,
::service::migration_manager& mm) {
SCYLLA_ASSERT(this_shard_id() == 0); // once_among_shards makes sure a function is executed on shard 0 only
auto db = qp.db();
auto parsed_statement = cql3::query_processor::parse_statement(cql, cql3::dialect{});
auto& parsed_cf_statement = static_cast<cql3::statements::raw::cf_statement&>(*parsed_statement);
parsed_cf_statement.prepare_keyspace(meta::legacy::AUTH_KS);
auto statement = static_pointer_cast<cql3::statements::create_table_statement>(
parsed_cf_statement.prepare(db, qp.get_cql_stats())->statement);
const auto schema = statement->get_cf_meta_data(qp.db());
const auto uuid = generate_legacy_id(schema->ks_name(), schema->cf_name());
schema_builder b(schema);
b.set_uuid(uuid);
schema_ptr table = b.build();
if (!db.has_schema(table->ks_name(), table->cf_name())) {
auto group0_guard = co_await mm.start_group0_operation();
auto ts = group0_guard.write_timestamp();
try {
co_return co_await mm.announce(co_await ::service::prepare_new_column_family_announcement(qp.proxy(), table, ts),
std::move(group0_guard), format("auth: create {} metadata table", table->cf_name()));
} catch (const exceptions::already_exists_exception&) {}
}
}
future<> create_legacy_metadata_table_if_missing(
std::string_view table_name,
cql3::query_processor& qp,
std::string_view cql,
::service::migration_manager& mm) noexcept {
return futurize_invoke(create_legacy_metadata_table_if_missing_impl, table_name, qp, cql, mm);
}
::service::query_state& internal_distributed_query_state() noexcept {
#ifdef DEBUG
// Give the much slower debug tests more headroom for completing auth queries.
@@ -84,6 +140,56 @@ static future<> announce_mutations_with_guard(
return group0_client.add_entry(std::move(group0_cmd), std::move(group0_guard), as, timeout);
}
future<> announce_mutations_with_batching(
::service::raft_group0_client& group0_client,
start_operation_func_t start_operation_func,
std::function<::service::mutations_generator(api::timestamp_type t)> gen,
seastar::abort_source& as,
std::optional<::service::raft_timeout> timeout) {
// account for command's overhead, it's better to use smaller threshold than constantly bounce off the limit
size_t memory_threshold = group0_client.max_command_size() * 0.75;
utils::get_local_injector().inject("auth_announce_mutations_command_max_size",
[&memory_threshold] {
memory_threshold = 1000;
});
size_t memory_usage = 0;
utils::chunked_vector<canonical_mutation> muts;
// guard has to be taken before we execute code in gen as
// it can do read-before-write and we want announce_mutations
// operation to be linearizable with other such calls,
// for instance if we do select and then delete in gen
// we want both to operate on the same data or fail
// if someone else modified it in the middle
std::optional<::service::group0_guard> group0_guard;
group0_guard = co_await start_operation_func(as);
auto timestamp = group0_guard->write_timestamp();
auto g = gen(timestamp);
while (auto mut = co_await g()) {
muts.push_back(canonical_mutation{*mut});
memory_usage += muts.back().representation().size();
if (memory_usage >= memory_threshold) {
if (!group0_guard) {
group0_guard = co_await start_operation_func(as);
timestamp = group0_guard->write_timestamp();
}
co_await announce_mutations_with_guard(group0_client, std::move(muts), std::move(*group0_guard), as, timeout);
group0_guard = std::nullopt;
memory_usage = 0;
muts = {};
}
}
if (!muts.empty()) {
if (!group0_guard) {
group0_guard = co_await start_operation_func(as);
timestamp = group0_guard->write_timestamp();
}
co_await announce_mutations_with_guard(group0_client, std::move(muts), std::move(*group0_guard), as, timeout);
}
}
future<> announce_mutations(
cql3::query_processor& qp,
::service::raft_group0_client& group0_client,

View File

@@ -21,7 +21,12 @@
using namespace std::chrono_literals;
namespace replica {
class database;
}
namespace service {
class migration_manager;
class query_state;
}
@@ -35,8 +40,10 @@ namespace meta {
namespace legacy {
extern constinit const std::string_view AUTH_KS;
extern constinit const std::string_view USERS_CF;
} // namespace legacy
constexpr std::string_view DEFAULT_SUPERUSER_NAME("cassandra");
extern constinit const std::string_view AUTH_PACKAGE_NAME;
} // namespace meta
@@ -45,7 +52,12 @@ constexpr std::string_view PERMISSIONS_CF = "role_permissions";
constexpr std::string_view ROLE_MEMBERS_CF = "role_members";
constexpr std::string_view ROLE_ATTRIBUTES_CF = "role_attributes";
std::string default_superuser(cql3::query_processor& qp);
// This is a helper to check whether auth-v2 is on.
bool legacy_mode(cql3::query_processor& qp);
// We have legacy implementation using different keyspace
// and need to parametrize depending on runtime feature.
std::string_view get_auth_ks_name(cql3::query_processor& qp);
template <class Task>
future<> once_among_shards(Task&& f) {
@@ -59,6 +71,12 @@ future<> once_among_shards(Task&& f) {
// Func must support being invoked more than once.
future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_function<future<>()> func);
future<> create_legacy_metadata_table_if_missing(
std::string_view table_name,
cql3::query_processor&,
std::string_view cql,
::service::migration_manager&) noexcept;
///
/// Time-outs for internal, non-local CQL queries.
///
@@ -66,6 +84,20 @@ future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_f
::service::raft_timeout get_raft_timeout() noexcept;
// Execute update query via group0 mechanism, mutations will be applied on all nodes.
// Use this function when need to perform read before write on a single guard or if
// you have more than one mutation and potentially exceed single command size limit.
using start_operation_func_t = std::function<future<::service::group0_guard>(abort_source&)>;
future<> announce_mutations_with_batching(
::service::raft_group0_client& group0_client,
// since we can operate also in topology coordinator context where we need stronger
// guarantees than start_operation from group0_client gives we allow to inject custom
// function here
start_operation_func_t start_operation_func,
std::function<::service::mutations_generator(api::timestamp_type t)> gen,
seastar::abort_source& as,
std::optional<::service::raft_timeout> timeout);
// Execute update query via group0 mechanism, mutations will be applied on all nodes.
future<> announce_mutations(
cql3::query_processor& qp,

View File

@@ -39,14 +39,103 @@ static constexpr std::string_view PERMISSIONS_NAME = "permissions";
static logging::logger alogger("default_authorizer");
default_authorizer::default_authorizer(cql3::query_processor& qp)
: _qp(qp) {
default_authorizer::default_authorizer(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
: _qp(qp)
, _migration_manager(mm) {
}
default_authorizer::~default_authorizer() {
}
static const sstring legacy_table_name{"permissions"};
bool default_authorizer::legacy_metadata_exists() const {
return _qp.db().has_schema(meta::legacy::AUTH_KS, legacy_table_name);
}
future<bool> default_authorizer::legacy_any_granted() const {
static const sstring query = seastar::format("SELECT * FROM {}.{} LIMIT 1", meta::legacy::AUTH_KS, PERMISSIONS_CF);
return _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
{},
cql3::query_processor::cache_internal::yes).then([](::shared_ptr<cql3::untyped_result_set> results) {
return !results->empty();
});
}
future<> default_authorizer::migrate_legacy_metadata() {
alogger.info("Starting migration of legacy permissions metadata.");
static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);
return _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
cql3::query_processor::cache_internal::no).then([this](::shared_ptr<cql3::untyped_result_set> results) {
return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
return do_with(
row.get_as<sstring>("username"),
parse_resource(row.get_as<sstring>(RESOURCE_NAME)),
::service::group0_batch::unused(),
[this, &row](const auto& username, const auto& r, auto& mc) {
const permission_set perms = permissions::from_strings(row.get_set<sstring>(PERMISSIONS_NAME));
return grant(username, perms, r, mc);
});
}).finally([results] {});
}).then([] {
alogger.info("Finished migrating legacy permissions metadata.");
}).handle_exception([](std::exception_ptr ep) {
alogger.error("Encountered an error during migration!");
std::rethrow_exception(ep);
});
}
future<> default_authorizer::start_legacy() {
static const sstring create_table = fmt::format(
"CREATE TABLE {}.{} ("
"{} text,"
"{} text,"
"{} set<text>,"
"PRIMARY KEY({}, {})"
") WITH gc_grace_seconds={}",
meta::legacy::AUTH_KS,
PERMISSIONS_CF,
ROLE_NAME,
RESOURCE_NAME,
PERMISSIONS_NAME,
ROLE_NAME,
RESOURCE_NAME,
90 * 24 * 60 * 60); // 3 months.
return once_among_shards([this] {
return create_legacy_metadata_table_if_missing(
PERMISSIONS_CF,
_qp,
create_table,
_migration_manager).then([this] {
_finished = do_after_system_ready(_as, [this] {
return async([this] {
_migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get();
if (legacy_metadata_exists()) {
if (!legacy_any_granted().get()) {
migrate_legacy_metadata().get();
return;
}
alogger.warn("Ignoring legacy permissions metadata since role permissions exist.");
}
});
});
});
});
}
future<> default_authorizer::start() {
if (legacy_mode(_qp)) {
return start_legacy();
}
return make_ready_future<>();
}
@@ -63,7 +152,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? AND {} = ?",
PERMISSIONS_NAME,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
ROLE_NAME,
RESOURCE_NAME);
@@ -87,13 +176,21 @@ default_authorizer::modify(
std::string_view op,
::service::group0_batch& mc) {
const sstring query = seastar::format("UPDATE {}.{} SET {} = {} {} ? WHERE {} = ? AND {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
PERMISSIONS_NAME,
PERMISSIONS_NAME,
op,
ROLE_NAME,
RESOURCE_NAME);
if (legacy_mode(_qp)) {
co_return co_await _qp.execute_internal(
query,
db::consistency_level::ONE,
internal_distributed_query_state(),
{permissions::to_strings(set), sstring(role_name), resource.name()},
cql3::query_processor::cache_internal::no).discard_result();
}
co_await collect_mutations(_qp, mc, query,
{permissions::to_strings(set), sstring(role_name), resource.name()});
}
@@ -112,7 +209,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
ROLE_NAME,
RESOURCE_NAME,
PERMISSIONS_NAME,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF);
const auto results = co_await _qp.execute_internal(
@@ -137,16 +234,74 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
future<> default_authorizer::revoke_all(std::string_view role_name, ::service::group0_batch& mc) {
try {
const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
ROLE_NAME);
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(
query,
db::consistency_level::ONE,
internal_distributed_query_state(),
{sstring(role_name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
}
} catch (const exceptions::request_execution_exception& e) {
alogger.warn("CassandraAuthorizer failed to revoke all permissions of {}: {}", role_name, e);
}
}
future<> default_authorizer::revoke_all_legacy(const resource& resource) {
static const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? ALLOW FILTERING",
ROLE_NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
RESOURCE_NAME);
return _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
{resource.name()},
cql3::query_processor::cache_internal::no).then_wrapped([this, resource](future<::shared_ptr<cql3::untyped_result_set>> f) {
try {
auto res = f.get();
return parallel_for_each(
res->begin(),
res->end(),
[this, res, resource](const cql3::untyped_result_set::row& r) {
static const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
get_auth_ks_name(_qp),
PERMISSIONS_CF,
ROLE_NAME,
RESOURCE_NAME);
return _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
{r.get_as<sstring>(ROLE_NAME), resource.name()},
cql3::query_processor::cache_internal::no).discard_result().handle_exception(
[resource](auto ep) {
try {
std::rethrow_exception(ep);
} catch (const exceptions::request_execution_exception& e) {
alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
}
});
});
} catch (const exceptions::request_execution_exception& e) {
alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
return make_ready_future();
}
});
}
future<> default_authorizer::revoke_all(const resource& resource, ::service::group0_batch& mc) {
if (legacy_mode(_qp)) {
co_return co_await revoke_all_legacy(resource);
}
if (resource.kind() == resource_kind::data &&
data_resource_view(resource).is_keyspace()) {
revoke_all_keyspace_resources(resource, mc);
@@ -157,7 +312,7 @@ future<> default_authorizer::revoke_all(const resource& resource, ::service::gro
auto gen = [this, name] (api::timestamp_type t) -> ::service::mutations_generator {
const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? ALLOW FILTERING",
ROLE_NAME,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
RESOURCE_NAME);
auto res = co_await _qp.execute_internal(
@@ -167,7 +322,7 @@ future<> default_authorizer::revoke_all(const resource& resource, ::service::gro
cql3::query_processor::cache_internal::no);
for (const auto& r : *res) {
const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
ROLE_NAME,
RESOURCE_NAME);
@@ -192,7 +347,7 @@ void default_authorizer::revoke_all_keyspace_resources(const resource& ks_resour
const sstring query = seastar::format("SELECT {}, {} FROM {}.{}",
ROLE_NAME,
RESOURCE_NAME,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF);
auto res = co_await _qp.execute_internal(
query,
@@ -207,7 +362,7 @@ void default_authorizer::revoke_all_keyspace_resources(const resource& ks_resour
continue;
}
const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
ROLE_NAME,
RESOURCE_NAME);

View File

@@ -27,12 +27,14 @@ namespace auth {
class default_authorizer : public authorizer {
cql3::query_processor& _qp;
::service::migration_manager& _migration_manager;
abort_source _as{};
future<> _finished{make_ready_future<>()};
public:
default_authorizer(cql3::query_processor&);
default_authorizer(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
~default_authorizer();
@@ -57,6 +59,16 @@ public:
virtual const resource_set& protected_resources() const override;
private:
future<> start_legacy();
bool legacy_metadata_exists() const;
future<> revoke_all_legacy(const resource&);
future<bool> legacy_any_granted() const;
future<> migrate_legacy_metadata();
future<> modify(std::string_view, permission_set, const resource&, std::string_view, ::service::group0_batch&);
void revoke_all_keyspace_resources(const resource& ks_resource, ::service::group0_batch& mc);

View File

@@ -32,7 +32,7 @@ namespace {
logger mylog{"ldap_role_manager"}; // `log` is taken by math.
struct url_desc_deleter {
void operator()(LDAPURLDesc* p) {
void operator()(LDAPURLDesc *p) {
ldap_free_urldesc(p);
}
};
@@ -40,7 +40,7 @@ struct url_desc_deleter {
using url_desc_ptr = std::unique_ptr<LDAPURLDesc, url_desc_deleter>;
url_desc_ptr parse_url(std::string_view url) {
LDAPURLDesc* desc = nullptr;
LDAPURLDesc *desc = nullptr;
if (ldap_url_parse(url.data(), &desc)) {
mylog.error("error in ldap_url_parse({})", url);
}
@@ -53,12 +53,8 @@ std::vector<sstring> get_attr_values(LDAP* ld, LDAPMessage* res, const char* att
mylog.debug("Analyzing search results");
for (auto e = ldap_first_entry(ld, res); e; e = ldap_next_entry(ld, e)) {
struct deleter {
void operator()(berval** p) {
ldap_value_free_len(p);
}
void operator()(char* p) {
ldap_memfree(p);
}
void operator()(berval** p) { ldap_value_free_len(p); }
void operator()(char* p) { ldap_memfree(p); }
};
const std::unique_ptr<char, deleter> dname(ldap_get_dn(ld, e));
mylog.debug("Analyzing entry {}", dname.get());
@@ -79,29 +75,32 @@ std::vector<sstring> get_attr_values(LDAP* ld, LDAPMessage* res, const char* att
namespace auth {
ldap_role_manager::ldap_role_manager(std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
uint32_t permissions_update_interval_in_ms, utils::observer<uint32_t> permissions_update_interval_in_ms_observer, cql3::query_processor& qp,
::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
: _std_mgr(qp, rg0c, mm, cache)
, _group0_client(rg0c)
, _query_template(query_template)
, _target_attr(target_attr)
, _bind_name(bind_name)
, _bind_password(bind_password)
, _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
, _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
, _cache(cache)
, _cache_pruner(make_ready_future<>()) {
ldap_role_manager::ldap_role_manager(
std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
uint32_t permissions_update_interval_in_ms,
utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
: _std_mgr(qp, rg0c, mm, cache), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
, _bind_password(bind_password)
, _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
, _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
, _cache(cache)
, _cache_pruner(make_ready_future<>()) {
}
ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
: ldap_role_manager(qp.db().get_config().ldap_url_template(), qp.db().get_config().ldap_attr_role(), qp.db().get_config().ldap_bind_dn(),
qp.db().get_config().ldap_bind_passwd(), qp.db().get_config().permissions_update_interval_in_ms(),
qp.db().get_config().permissions_update_interval_in_ms.observe([this](const uint32_t& v) {
_permissions_update_interval_in_ms = v;
}),
qp, rg0c, mm, cache) {
: ldap_role_manager(
qp.db().get_config().ldap_url_template(),
qp.db().get_config().ldap_attr_role(),
qp.db().get_config().ldap_bind_dn(),
qp.db().get_config().ldap_bind_passwd(),
qp.db().get_config().permissions_update_interval_in_ms(),
qp.db().get_config().permissions_update_interval_in_ms.observe([this] (const uint32_t& v) { _permissions_update_interval_in_ms = v; }),
qp,
rg0c,
mm,
cache) {
}
std::string_view ldap_role_manager::qualified_java_name() const noexcept {
@@ -114,16 +113,17 @@ const resource_set& ldap_role_manager::protected_resources() const {
future<> ldap_role_manager::start() {
if (!parse_url(get_url("dummy-user"))) { // Just need host and port -- any user should do.
return make_exception_future(std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
return make_exception_future(
std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
}
_cache_pruner = futurize_invoke([this]() -> future<> {
_cache_pruner = futurize_invoke([this] () -> future<> {
while (true) {
try {
co_await seastar::sleep_abortable(std::chrono::milliseconds(_permissions_update_interval_in_ms), _as);
} catch (const seastar::sleep_aborted&) {
co_return; // ignore
}
co_await _cache.container().invoke_on_all([](cache& c) -> future<> {
co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
try {
co_await c.reload_all_permissions();
} catch (...) {
@@ -165,7 +165,7 @@ future<conn_ptr> ldap_role_manager::connect() {
future<conn_ptr> ldap_role_manager::reconnect() {
unsigned retries_left = 5;
using namespace std::literals::chrono_literals;
conn_ptr conn = co_await exponential_backoff_retry::do_until_value(1s, 32s, _as, [this, &retries_left]() -> future<std::optional<conn_ptr>> {
conn_ptr conn = co_await exponential_backoff_retry::do_until_value(1s, 32s, _as, [this, &retries_left] () -> future<std::optional<conn_ptr>> {
if (!retries_left) {
co_return conn_ptr{};
}
@@ -188,13 +188,11 @@ future<conn_ptr> ldap_role_manager::reconnect() {
future<> ldap_role_manager::stop() {
_as.request_abort();
return std::move(_cache_pruner)
.then([this] {
return _std_mgr.stop();
})
.then([this] {
return _connection_factory.stop();
});
return std::move(_cache_pruner).then([this] {
return _std_mgr.stop();
}).then([this] {
return _connection_factory.stop();
});
}
future<> ldap_role_manager::create(std::string_view name, const role_config& config, ::service::group0_batch& mc) {
@@ -223,42 +221,43 @@ future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name,
if (!desc) {
return make_exception_future<role_set>(std::runtime_error(format("Error parsing URL {}", url)));
}
return _connection_factory.with_connection(
[this, desc = std::move(desc), grantee_name_ = sstring(grantee_name)](ldap_connection& conn) -> future<role_set> {
sstring grantee_name = std::move(grantee_name_);
ldap_msg_ptr res = co_await conn.search(desc->lud_dn, desc->lud_scope, desc->lud_filter, desc->lud_attrs,
/*attrsonly=*/0, /*serverctrls=*/nullptr, /*clientctrls=*/nullptr,
/*timeout=*/nullptr, /*sizelimit=*/0);
mylog.trace("query_granted: got search results");
const auto mtype = ldap_msgtype(res.get());
if (mtype != LDAP_RES_SEARCH_ENTRY && mtype != LDAP_RES_SEARCH_RESULT && mtype != LDAP_RES_SEARCH_REFERENCE) {
mylog.error("ldap search yielded result {} of type {}", static_cast<const void*>(res.get()), mtype);
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error("ldap_role_manager: search result has wrong type")));
return _connection_factory.with_connection([this, desc = std::move(desc), grantee_name_ = sstring(grantee_name)]
(ldap_connection& conn) -> future<role_set> {
sstring grantee_name = std::move(grantee_name_);
ldap_msg_ptr res = co_await conn.search(desc->lud_dn, desc->lud_scope, desc->lud_filter, desc->lud_attrs,
/*attrsonly=*/0, /*serverctrls=*/nullptr, /*clientctrls=*/nullptr,
/*timeout=*/nullptr, /*sizelimit=*/0);
mylog.trace("query_granted: got search results");
const auto mtype = ldap_msgtype(res.get());
if (mtype != LDAP_RES_SEARCH_ENTRY && mtype != LDAP_RES_SEARCH_RESULT && mtype != LDAP_RES_SEARCH_REFERENCE) {
mylog.error("ldap search yielded result {} of type {}", static_cast<const void*>(res.get()), mtype);
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error("ldap_role_manager: search result has wrong type")));
}
std::vector<sstring> values = get_attr_values(conn.get_ldap(), res.get(), _target_attr.c_str());
auth::role_set valid_roles{grantee_name};
// Each value is a role to be granted.
co_await parallel_for_each(values, [this, &valid_roles] (const sstring& ldap_role) {
return _std_mgr.exists(ldap_role).then([&valid_roles, &ldap_role] (bool exists) {
if (exists) {
valid_roles.insert(ldap_role);
} else {
mylog.error("unrecognized role received from LDAP: {}", ldap_role);
}
std::vector<sstring> values = get_attr_values(conn.get_ldap(), res.get(), _target_attr.c_str());
auth::role_set valid_roles{grantee_name};
// Each value is a role to be granted.
co_await parallel_for_each(values, [this, &valid_roles](const sstring& ldap_role) {
return _std_mgr.exists(ldap_role).then([&valid_roles, &ldap_role](bool exists) {
if (exists) {
valid_roles.insert(ldap_role);
} else {
mylog.error("unrecognized role received from LDAP: {}", ldap_role);
}
});
});
co_return std::move(valid_roles);
});
});
co_return std::move(valid_roles);
});
}
future<role_to_directly_granted_map> ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
future<role_to_directly_granted_map>
ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
role_to_directly_granted_map result;
auto roles = co_await query_all(qs);
for (auto& role : roles) {
for (auto& role: roles) {
auto granted_set = co_await query_granted(role, recursive_role_query::no);
for (auto& granted : granted_set) {
for (auto& granted: granted_set) {
if (granted != role) {
result.insert({role, granted});
}
@@ -272,7 +271,7 @@ future<role_set> ldap_role_manager::query_all(::service::query_state& qs) {
}
future<> ldap_role_manager::create_role(std::string_view role_name) {
return smp::submit_to(0, [this, role_name]() -> future<> {
return smp::submit_to(0, [this, role_name] () -> future<> {
int retries = 10;
while (true) {
auto guard = co_await _group0_client.start_operation(_as, ::service::raft_timeout{});
@@ -284,8 +283,8 @@ future<> ldap_role_manager::create_role(std::string_view role_name) {
} catch (const role_already_exists&) {
// ok
} catch (const ::service::group0_concurrent_modification& ex) {
mylog.warn("Failed to auto-create role \"{}\" due to guard conflict.{}.", role_name,
retries ? " Retrying" : " Number of retries exceeded, giving up");
mylog.warn("Failed to auto-create role \"{}\" due to guard conflict.{}.",
role_name, retries ? " Retrying" : " Number of retries exceeded, giving up");
if (retries--) {
continue;
}
@@ -330,7 +329,8 @@ future<bool> ldap_role_manager::can_login(std::string_view role_name) {
return _std_mgr.can_login(role_name);
}
future<std::optional<sstring>> ldap_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
future<std::optional<sstring>> ldap_role_manager::get_attribute(
std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
return _std_mgr.get_attribute(role_name, attribute_name, qs);
}

View File

@@ -1,37 +0,0 @@
/*
* Copyright (C) 2026-present ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#pragma once
#include "auth/default_authorizer.hh"
#include "auth/permission.hh"
namespace auth {
// maintenance_socket_authorizer is used for clients connecting to the
// maintenance socket. It grants all permissions unconditionally (like
// AllowAllAuthorizer) while still supporting grant/revoke operations
// (delegated to the underlying CassandraAuthorizer / default_authorizer).
// Authorizer for clients connected through the local maintenance socket.
// authorize() unconditionally grants the full permission set (like
// AllowAllAuthorizer), while grant/revoke bookkeeping is still inherited
// unchanged from default_authorizer (the CassandraAuthorizer backend).
class maintenance_socket_authorizer : public default_authorizer {
public:
    using default_authorizer::default_authorizer;
    ~maintenance_socket_authorizer() override = default;
    // No startup work: returns immediately without invoking the base
    // class's start().
    future<> start() override {
        return make_ready_future<>();
    }
    // Every role (or anonymous user) gets every permission on every resource.
    future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
        return make_ready_future<permission_set>(permissions::ALL);
    }
};
} // namespace auth

View File

@@ -26,9 +26,9 @@
#include "cql3/untyped_result_set.hh"
#include "utils/log.hh"
#include "service/migration_manager.hh"
#include "replica/database.hh"
#include "cql3/query_processor.hh"
#include "db/config.hh"
#include "db/system_keyspace.hh"
namespace auth {
@@ -36,10 +36,20 @@ constexpr std::string_view password_authenticator_name("org.apache.cassandra.aut
// name of the hash column.
static constexpr std::string_view SALTED_HASH = "salted_hash";
static const sstring DEFAULT_USER_PASSWORD = sstring(meta::DEFAULT_SUPERUSER_NAME);
static logging::logger plogger("password_authenticator");
static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());
std::string password_authenticator::default_superuser(cql3::query_processor& qp) {
if (legacy_mode(qp)) {
return std::string(meta::DEFAULT_SUPERUSER_NAME);
}
return qp.db().get_config().auth_superuser_name();
}
password_authenticator::~password_authenticator() {
}
@@ -57,18 +67,82 @@ static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
sstring password_authenticator::update_row_query() const {
return seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
SALTED_HASH,
meta::roles_table::role_col_name);
}
static const sstring legacy_table_name{"credentials"};
// Returns true when the old-format "credentials" table still exists in the
// legacy auth keyspace, i.e. there is legacy metadata left to migrate.
bool password_authenticator::legacy_metadata_exists() const {
    return _qp.db().has_schema(meta::legacy::AUTH_KS, legacy_table_name);
}
// Copies salted password hashes from the legacy "credentials" table into the
// roles table of the legacy auth keyspace, one row per user. The bulk read
// runs at QUORUM; each per-user upsert uses consistency_for_user() (QUORUM
// for the hardcoded default superuser, LOCAL_ONE otherwise). Any failure is
// logged and the exception rethrown to the caller.
future<> password_authenticator::migrate_legacy_metadata() const {
    plogger.info("Starting migration of legacy authentication metadata.");
    static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);

    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_query_state(),
            cql3::query_processor::cache_internal::no).then([this](::shared_ptr<cql3::untyped_result_set> results) {
        return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
            auto username = row.get_as<sstring>("username");
            auto salted_hash = row.get_as<sstring>(SALTED_HASH);
            // Upsert the hash into the roles table keyed by the same user name.
            static const auto query = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
                    meta::legacy::AUTH_KS,
                    meta::roles_table::name,
                    SALTED_HASH,
                    meta::roles_table::role_col_name);
            return _qp.execute_internal(
                    query,
                    consistency_for_user(username),
                    internal_distributed_query_state(),
                    {std::move(salted_hash), username},
                    cql3::query_processor::cache_internal::no).discard_result();
        // Keep the result set alive until do_for_each finishes iterating it.
        }).finally([results] {});
    }).then([] {
        plogger.info("Finished migrating legacy authentication metadata.");
    }).handle_exception([](std::exception_ptr ep) {
        plogger.error("Encountered an error during migration!");
        std::rethrow_exception(ep);
    });
}
// Ensures the default superuser has a password-hash row in the legacy roles
// table. If a row with a salted hash already exists, this is a no-op.
// Otherwise the hash is taken from the auth_superuser_salted_password config
// option when set, or computed by hashing the built-in default password with
// a fresh salt.
future<> password_authenticator::legacy_create_default_if_missing() {
    if (_superuser.empty()) {
        on_internal_error(plogger, "Legacy auth default superuser name is empty");
    }
    // A row only "exists" for our purposes if it already carries a hash.
    const auto exists = co_await legacy::default_role_row_satisfies(_qp, &has_salted_hash, _superuser);
    if (exists) {
        co_return;
    }
    std::string salted_pwd(_qp.db().get_config().auth_superuser_salted_password());
    if (salted_pwd.empty()) {
        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
    }
    const auto query = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
            meta::legacy::AUTH_KS,
            meta::roles_table::name,
            SALTED_HASH,
            meta::roles_table::role_col_name);
    co_await _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_query_state(),
            {salted_pwd, _superuser},
            cql3::query_processor::cache_internal::no);
    plogger.info("Created default superuser authentication record.");
}
future<> password_authenticator::maybe_create_default_password() {
auto needs_password = [this] () -> future<bool> {
if (default_superuser(_qp).empty()) {
if (_superuser.empty()) {
co_return false;
}
const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", db::system_keyspace::NAME, meta::roles_table::name);
const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
auto results = co_await _qp.execute_internal(query,
db::consistency_level::LOCAL_ONE,
internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
@@ -78,7 +152,7 @@ future<> password_authenticator::maybe_create_default_password() {
bool has_default = false;
bool has_superuser_with_password = false;
for (auto& result : *results) {
if (result.get_as<sstring>(meta::roles_table::role_col_name) == default_superuser(_qp)) {
if (result.get_as<sstring>(meta::roles_table::role_col_name) == _superuser) {
has_default = true;
}
if (has_salted_hash(result)) {
@@ -104,7 +178,7 @@ future<> password_authenticator::maybe_create_default_password() {
co_return;
}
const auto update_query = update_row_query();
co_await collect_mutations(_qp, batch, update_query, {salted_pwd, default_superuser(_qp)});
co_await collect_mutations(_qp, batch, update_query, {salted_pwd, _superuser});
co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
plogger.info("Created default superuser authentication record.");
}
@@ -131,20 +205,72 @@ future<> password_authenticator::maybe_create_default_password_with_retries() {
future<> password_authenticator::start() {
return once_among_shards([this] {
_superuser = default_superuser(_qp);
// Verify that at least one hashing scheme is supported.
passwords::detail::verify_scheme(_scheme);
plogger.info("Using password hashing scheme: {}", passwords::detail::prefix_for_scheme(_scheme));
_stopped = do_after_system_ready(_as, [this] {
return async([this] {
if (legacy_mode(_qp)) {
if (_superuser.empty()) {
on_internal_error(plogger, "Legacy auth default superuser name is empty");
}
if (!_superuser_created_promise.available()) {
// Counterintuitively, we mark promise as ready before any startup work
// because wait_for_schema_agreement() below will block indefinitely
// without cluster majority. In that case, blocking node startup
// would lead to a cluster deadlock.
_superuser_created_promise.set_value();
}
_migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get();
if (legacy::any_nondefault_role_row_satisfies(_qp, &has_salted_hash, _superuser).get()) {
if (legacy_metadata_exists()) {
plogger.warn("Ignoring legacy authentication metadata since nondefault data already exist.");
}
return;
}
if (legacy_metadata_exists()) {
migrate_legacy_metadata().get();
return;
}
legacy_create_default_if_missing().get();
}
utils::get_local_injector().inject("password_authenticator_start_pause", utils::wait_for_message(5min)).get();
maybe_create_default_password_with_retries().get();
if (!_superuser_created_promise.available()) {
_superuser_created_promise.set_value();
if (!legacy_mode(_qp)) {
maybe_create_default_password_with_retries().get();
if (!_superuser_created_promise.available()) {
_superuser_created_promise.set_value();
}
}
});
});
if (legacy_mode(_qp)) {
if (_superuser.empty()) {
on_internal_error(plogger, "Legacy auth default superuser name is empty");
}
static const sstring create_roles_query = fmt::format(
"CREATE TABLE {}.{} ("
" {} text PRIMARY KEY,"
" can_login boolean,"
" is_superuser boolean,"
" member_of set<text>,"
" salted_hash text"
")",
meta::legacy::AUTH_KS,
meta::roles_table::name,
meta::roles_table::role_col_name);
return create_legacy_metadata_table_if_missing(
meta::roles_table::name,
_qp,
create_roles_query,
_migration_manager);
}
return make_ready_future<>();
});
}
@@ -154,6 +280,15 @@ future<> password_authenticator::stop() {
return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
}
// Chooses the consistency level used when reading or writing a role's
// credentials row.
// TODO: this is plain dung. Why treat hardcoded default special, but for
// example a user-created super user uses plain LOCAL_ONE?
db::consistency_level password_authenticator::consistency_for_user(std::string_view role_name) {
    const bool is_builtin_default = (role_name == meta::DEFAULT_SUPERUSER_NAME);
    return is_builtin_default ? db::consistency_level::QUORUM
                              : db::consistency_level::LOCAL_ONE;
}
// Returns the Cassandra-compatible fully-qualified class name this
// authenticator is registered under.
std::string_view password_authenticator::qualified_java_name() const {
    return password_authenticator_name;
}
@@ -183,12 +318,20 @@ future<authenticated_user> password_authenticator::authenticate(
const sstring password = credentials.at(PASSWORD_KEY);
try {
auto role = _cache.get(username);
if (!role || role->salted_hash.empty()) {
throw exceptions::authentication_exception("Username and/or password are incorrect");
std::optional<sstring> salted_hash;
if (legacy_mode(_qp)) {
salted_hash = co_await get_password_hash(username);
if (!salted_hash) {
throw exceptions::authentication_exception("Username and/or password are incorrect");
}
} else {
auto role = _cache.get(username);
if (!role || role->salted_hash.empty()) {
throw exceptions::authentication_exception("Username and/or password are incorrect");
}
salted_hash = role->salted_hash;
}
const auto& salted_hash = role->salted_hash;
const bool password_match = co_await passwords::check(password, salted_hash);
const bool password_match = co_await passwords::check(password, *salted_hash);
if (!password_match) {
throw exceptions::authentication_exception("Username and/or password are incorrect");
}
@@ -227,7 +370,16 @@ future<> password_authenticator::create(std::string_view role_name, const authen
}
const auto query = update_row_query();
co_await collect_mutations(_qp, mc, query, {std::move(*maybe_hash), sstring(role_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(
query,
consistency_for_user(role_name),
internal_distributed_query_state(),
{std::move(*maybe_hash), sstring(role_name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {std::move(*maybe_hash), sstring(role_name)});
}
}
future<> password_authenticator::alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) {
@@ -238,21 +390,38 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
const auto password = std::get<password_option>(*options.credentials).password;
const sstring query = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
SALTED_HASH,
meta::roles_table::role_col_name);
co_await collect_mutations(_qp, mc, query,
{passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(
query,
consistency_for_user(role_name),
internal_distributed_query_state(),
{passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
co_await collect_mutations(_qp, mc, query,
{passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)});
}
}
future<> password_authenticator::drop(std::string_view name, ::service::group0_batch& mc) {
const sstring query = seastar::format("DELETE {} FROM {}.{} WHERE {} = ?",
SALTED_HASH,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
meta::roles_table::role_col_name);
co_await collect_mutations(_qp, mc, query, {sstring(name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(
query, consistency_for_user(name),
internal_distributed_query_state(),
{sstring(name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {sstring(name)});
}
}
future<custom_options> password_authenticator::query_custom_options(std::string_view role_name) const {
@@ -271,13 +440,13 @@ future<std::optional<sstring>> password_authenticator::get_password_hash(std::st
// that a map lookup string->statement is not gonna kill us much.
const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ?",
SALTED_HASH,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
meta::roles_table::role_col_name);
const auto res = co_await _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
consistency_for_user(role_name),
internal_distributed_query_state(),
{role_name},
cql3::query_processor::cache_internal::yes);

View File

@@ -13,6 +13,7 @@
#include <seastar/core/abort_source.hh>
#include <seastar/core/shared_future.hh>
#include "db/consistency_level_type.hh"
#include "auth/authenticator.hh"
#include "auth/passwords.hh"
#include "auth/cache.hh"
@@ -43,11 +44,15 @@ class password_authenticator : public authenticator {
cache& _cache;
future<> _stopped;
abort_source _as;
std::string _superuser; // default superuser name from the config (may or may not be present in roles table)
shared_promise<> _superuser_created_promise;
// We used to also support bcrypt, SHA-256, and MD5 (ref. scylladb#24524).
constexpr static auth::passwords::scheme _scheme = passwords::scheme::sha_512;
public:
static db::consistency_level consistency_for_user(std::string_view role_name);
static std::string default_superuser(cql3::query_processor& qp);
password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);
~password_authenticator();
@@ -85,6 +90,12 @@ public:
virtual future<> ensure_superuser_is_created() const override;
private:
bool legacy_metadata_exists() const;
future<> migrate_legacy_metadata() const;
future<> legacy_create_default_if_missing();
future<> maybe_create_default_password();
future<> maybe_create_default_password_with_retries();

68
auth/roles-metadata.cc Normal file
View File

@@ -0,0 +1,68 @@
/*
* Copyright (C) 2018-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "auth/roles-metadata.hh"
#include <seastar/core/format.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/sstring.hh>
#include "auth/common.hh"
#include "cql3/query_processor.hh"
#include "cql3/untyped_result_set.hh"
namespace auth {
namespace legacy {
// Fetches the default role's row (or the explicitly given role's row) from
// the legacy roles table and applies the predicate to it. The lookup is first
// attempted at CL=ONE and, if no row is returned, retried at CL=QUORUM.
// Yields false when the row is absent at both consistency levels.
future<bool> default_role_row_satisfies(
        cql3::query_processor& qp,
        std::function<bool(const cql3::untyped_result_set_row&)> p,
        std::optional<std::string> rolename) {
    const sstring select_stmt = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
            auth::meta::legacy::AUTH_KS,
            meta::roles_table::name,
            meta::roles_table::role_col_name);
    // The bound key: caller-supplied role name, or the built-in default.
    const std::string target_role = rolename.value_or(std::string(auth::meta::DEFAULT_SUPERUSER_NAME));
    for (const auto cl : { db::consistency_level::ONE, db::consistency_level::QUORUM }) {
        auto rows = co_await qp.execute_internal(select_stmt, cl,
                internal_distributed_query_state(),
                {target_role},
                cql3::query_processor::cache_internal::yes);
        if (!rows->empty()) {
            co_return p(rows->one());
        }
    }
    co_return false;
}
// Scans every row of the legacy roles table (at QUORUM) and reports whether
// any row other than the default/superuser role satisfies the predicate.
// Yields false when the table is empty or only the default role matches.
future<bool> any_nondefault_role_row_satisfies(
        cql3::query_processor& qp,
        std::function<bool(const cql3::untyped_result_set_row&)> p,
        std::optional<std::string> rolename) {
    const sstring select_all = seastar::format("SELECT * FROM {}.{}", auth::meta::legacy::AUTH_KS, meta::roles_table::name);
    auto rows = co_await qp.execute_internal(select_all, db::consistency_level::QUORUM,
            internal_distributed_query_state(), cql3::query_processor::cache_internal::no);
    if (rows->empty()) {
        co_return false;
    }
    const sstring role_column = sstring(meta::roles_table::role_col_name);
    // The name identifying the "default" row; invariant across rows, so it is
    // computed once instead of per row.
    const std::string_view default_name = rolename ? std::string_view(*rolename) : meta::DEFAULT_SUPERUSER_NAME;
    for (const auto& row : *rows) {
        const bool is_nondefault = row.get_as<sstring>(role_column) != default_name;
        if (is_nondefault && p(row)) {
            co_return true;
        }
    }
    co_return false;
}
} // namespace legacy
} // namespace auth

View File

@@ -8,7 +8,18 @@
#pragma once
#include <optional>
#include <string_view>
#include <functional>
#include <seastar/core/future.hh>
#include "seastarx.hh"
namespace cql3 {
class query_processor;
class untyped_result_set_row;
}
namespace auth {
@@ -24,4 +35,26 @@ constexpr std::string_view role_col_name{"role", 4};
} // namespace meta
namespace legacy {
///
/// Checks whether the default role's row satisfies a predicate; yields `false` if the default role does not exist.
///
future<bool> default_role_row_satisfies(
cql3::query_processor&,
std::function<bool(const cql3::untyped_result_set_row&)>,
std::optional<std::string> rolename = {}
);
///
/// Check that any nondefault role satisfies a predicate. `false` if no nondefault roles exist.
///
future<bool> any_nondefault_role_row_satisfies(
cql3::query_processor&,
std::function<bool(const cql3::untyped_result_set_row&)>,
std::optional<std::string> rolename = {}
);
} // namespace legacy
} // namespace auth

View File

@@ -30,7 +30,6 @@
#include "auth/default_authorizer.hh"
#include "auth/ldap_role_manager.hh"
#include "auth/maintenance_socket_authenticator.hh"
#include "auth/maintenance_socket_authorizer.hh"
#include "auth/maintenance_socket_role_manager.hh"
#include "auth/password_authenticator.hh"
#include "auth/role_or_anonymous.hh"
@@ -74,6 +73,88 @@ static const sstring superuser_col_name("super");
static logging::logger log("auth_service");
// Schema-change listener that keeps auth metadata consistent with DDL: when
// a keyspace, table, function, or aggregate is dropped, all permissions
// granted on the corresponding auth resources are revoked. The revocation
// here runs only in legacy auth mode; in the non-legacy path it is performed
// as part of schema-change statement execution itself, so each handler
// returns early when !legacy_mode(_qp).
class auth_migration_listener final : public ::service::migration_listener {
    service& _service;           // auth service used to perform revoke_all()
    cql3::query_processor& _qp;  // only consulted for the legacy_mode() check

public:
    explicit auth_migration_listener(service& s, cql3::query_processor& qp) : _service(s), _qp(qp) {
    }

private:
    // Create/update events need no auth bookkeeping - intentionally empty.
    void on_create_keyspace(const sstring& ks_name) override {}
    void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {}
    void on_create_user_type(const sstring& ks_name, const sstring& type_name) override {}
    void on_create_function(const sstring& ks_name, const sstring& function_name) override {}
    void on_create_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
    void on_create_view(const sstring& ks_name, const sstring& view_name) override {}

    void on_update_keyspace(const sstring& ks_name) override {}
    void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool) override {}
    void on_update_user_type(const sstring& ks_name, const sstring& type_name) override {}
    void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
    void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
    void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}

    // Revokes permissions on both the data-resource and the functions-resource
    // trees rooted at the dropped keyspace. Runs in the background; failures
    // are logged, not propagated.
    void on_drop_keyspace(const sstring& ks_name) override {
        if (!legacy_mode(_qp)) {
            // in non legacy path revoke is part of schema change statement execution
            return;
        }
        // Do it in the background.
        (void)do_with(auth::make_data_resource(ks_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
            return _service.revoke_all(r, mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped keyspace: {}", e);
        });
        (void)do_with(auth::make_functions_resource(ks_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
            return _service.revoke_all(r, mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on functions in dropped keyspace: {}", e);
        });
    }

    // Revokes permissions on the dropped table's data resource (background).
    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
        if (!legacy_mode(_qp)) {
            // in non legacy path revoke is part of schema change statement execution
            return;
        }
        // Do it in the background.
        (void)do_with(auth::make_data_resource(ks_name, cf_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
            return _service.revoke_all(r, mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped table: {}", e);
        });
    }

    void on_drop_user_type(const sstring& ks_name, const sstring& type_name) override {}

    // Revokes permissions on the dropped function's resource (background).
    void on_drop_function(const sstring& ks_name, const sstring& function_name) override {
        if (!legacy_mode(_qp)) {
            // in non legacy path revoke is part of schema change statement execution
            return;
        }
        // Do it in the background.
        (void)do_with(auth::make_functions_resource(ks_name, function_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
            return _service.revoke_all(r, mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped function: {}", e);
        });
    }

    // Revokes permissions on the dropped aggregate's resource (background).
    void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {
        if (!legacy_mode(_qp)) {
            // in non legacy path revoke is part of schema change statement execution
            return;
        }
        (void)do_with(auth::make_functions_resource(ks_name, aggregate_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
            return _service.revoke_all(r, mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped aggregate: {}", e);
        });
    }

    void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}
};
static future<> validate_role_exists(const service& ser, std::string_view role_name) {
return ser.underlying_role_manager().exists(role_name).then([role_name](bool exists) {
if (!exists) {
@@ -86,6 +167,7 @@ service::service(
cache& cache,
cql3::query_processor& qp,
::service::raft_group0_client& g0,
::service::migration_notifier& mn,
std::unique_ptr<authorizer> z,
std::unique_ptr<authenticator> a,
std::unique_ptr<role_manager> r,
@@ -93,14 +175,17 @@ service::service(
: _cache(cache)
, _qp(qp)
, _group0_client(g0)
, _mnotifier(mn)
, _authorizer(std::move(z))
, _authenticator(std::move(a))
, _role_manager(std::move(r))
, _migration_listener(std::make_unique<auth_migration_listener>(*this, qp))
, _used_by_maintenance_socket(used_by_maintenance_socket) {}
service::service(
cql3::query_processor& qp,
::service::raft_group0_client& g0,
::service::migration_notifier& mn,
authorizer_factory authorizer_factory,
authenticator_factory authenticator_factory,
role_manager_factory role_manager_factory,
@@ -110,6 +195,7 @@ service::service(
cache,
qp,
g0,
mn,
authorizer_factory(),
authenticator_factory(),
role_manager_factory(),
@@ -145,6 +231,9 @@ future<> service::create_legacy_keyspace_if_missing(::service::migration_manager
}
future<> service::start(::service::migration_manager& mm, db::system_keyspace& sys_ks) {
auto auth_version = co_await sys_ks.get_auth_version();
// version is set in query processor to be easily available in various places we call auth::legacy_mode check.
_qp.auth_version = auth_version;
if (this_shard_id() == 0) {
co_await _cache.load_all();
}
@@ -174,12 +263,22 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
&service::get_uncached_permissions,
this, std::placeholders::_1, std::placeholders::_2));
}
co_await once_among_shards([this] {
_mnotifier.register_listener(_migration_listener.get());
return make_ready_future<>();
});
}
future<> service::stop() {
_as.request_abort();
_cache.set_permission_loader(nullptr);
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
// Only one of the shards has the listener registered, but let's try to
// unregister on each one just to make sure.
return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
_cache.set_permission_loader(nullptr);
return make_ready_future<>();
}).then([this] {
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
});
}
future<> service::ensure_superuser_is_created() {
@@ -213,7 +312,7 @@ service::get_uncached_permissions(const role_or_anonymous& maybe_role, const res
}
future<permission_set> service::get_permissions(const role_or_anonymous& maybe_role, const resource& r) const {
if (_used_by_maintenance_socket) {
if (legacy_mode(_qp) || _used_by_maintenance_socket) {
return get_uncached_permissions(maybe_role, r);
}
return _cache.get_permissions(maybe_role, r);
@@ -279,6 +378,11 @@ future<> service::create_role(std::string_view name,
ep = std::current_exception();
}
if (ep) {
// Rollback only in legacy mode as normally mutations won't be
// applied in case exception is raised
if (legacy_mode(_qp)) {
co_await underlying_role_manager().drop(name, mc);
}
std::rethrow_exception(std::move(ep));
}
}
@@ -357,11 +461,11 @@ future<std::vector<cql3::description>> service::describe_roles(bool with_hashed_
const bool authenticator_uses_password_hashes = _authenticator->uses_password_hashes();
const auto default_su = cql3::util::maybe_quote(default_superuser(_qp));
auto produce_create_statement = [&default_su, with_hashed_passwords] (const sstring& formatted_role_name,
auto produce_create_statement = [with_hashed_passwords] (const sstring& formatted_role_name,
const std::optional<sstring>& maybe_hashed_password, bool can_login, bool is_superuser) {
const sstring role_part = formatted_role_name == default_su
// Even after applying formatting to a role, `formatted_role_name` can only equal `meta::DEFAULT_SUPER_NAME`
// if the original identifier was equal to it.
const sstring role_part = formatted_role_name == meta::DEFAULT_SUPERUSER_NAME
? seastar::format("IF NOT EXISTS {}", formatted_role_name)
: formatted_role_name;
@@ -772,6 +876,85 @@ future<> commit_mutations(service& ser, ::service::group0_batch&& mc) {
return ser.commit_mutations(std::move(mc));
}
// Migrates auth data from the legacy `system_auth` keyspace into the
// raft-managed system tables and bumps the auth version to v2.
//
// For each legacy table (roles, role_members, role_attributes,
// role_permissions) every row is read with CL=ALL and converted into an
// INSERT mutation against the corresponding system table, all stamped with
// the single timestamp `ts` supplied by the generator protocol. The
// mutations are announced through group0 via
// announce_mutations_with_batching, and the final yielded mutation flips
// the auth version to v2.
//
// Parameters:
//   sys_ks              - system keyspace accessor (source of the query
//                         processor and the auth-version mutation).
//   g0                  - raft group0 client used to announce the mutations.
//   start_operation_func - callback used by the batching helper to start a
//                         group0 operation.
//   as                  - abort source checked while announcing.
future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_client& g0, start_operation_func_t start_operation_func, abort_source& as) {
    // FIXME: if this function fails it may leave partial data in the new tables
    // that should be cleared
    auto gen = [&sys_ks] (api::timestamp_type ts) -> ::service::mutations_generator {
        auto& qp = sys_ks.query_processor();
        for (const auto& cf_name : std::vector<sstring>{
                "roles", "role_members", "role_attributes", "role_permissions"}) {
            schema_ptr schema;
            try {
                schema = qp.db().find_schema(meta::legacy::AUTH_KS, cf_name);
            } catch (const data_dictionary::no_such_column_family&) {
                continue; // some tables might not have been created if they were not used
            }
            // Column list and the matching "?, ?, ..." binder string for the
            // INSERT statement, built from the legacy table's schema.
            std::vector<sstring> col_names;
            for (const auto& col : schema->all_columns()) {
                col_names.push_back(col.name_as_cql_string());
            }
            sstring val_binders_str = "?";
            for (size_t i = 1; i < col_names.size(); ++i) {
                val_binders_str += ", ?";
            }
            std::vector<mutation> collected;
            // use longer than usual timeout as we scan the whole table
            // but not infinite or very long as we want to fail reasonably fast
            const auto t = 5min;
            const timeout_config tc{t, t, t, t, t, t, t};
            ::service::client_state cs(::service::client_state::internal_tag{}, tc);
            ::service::query_state qs(cs, empty_service_permit());
            // Page through the legacy table (1000 rows per page) at CL=ALL so
            // the migration sees every replica's data.
            co_await qp.query_internal(
                    seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, cf_name),
                    db::consistency_level::ALL,
                    {},
                    1000,
                    [&qp, &cf_name, &col_names, &val_binders_str, &schema, ts, &collected] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
                        // Deserialize each present column; absent columns are
                        // bound as "unset" so they are not written at all.
                        std::vector<data_value_or_unset> values;
                        for (const auto& col : schema->all_columns()) {
                            if (row.has(col.name_as_text())) {
                                values.push_back(
                                        col.type->deserialize(row.get_blob_unfragmented(col.name_as_text())));
                            } else {
                                values.push_back(unset_value{});
                            }
                        }
                        auto muts = co_await qp.get_mutations_internal(
                                seastar::format("INSERT INTO {}.{} ({}) VALUES ({})",
                                        db::system_keyspace::NAME,
                                        cf_name,
                                        fmt::join(col_names, ", "),
                                        val_binders_str),
                                internal_distributed_query_state(),
                                ts,
                                std::move(values));
                        // A single-partition INSERT must produce exactly one
                        // mutation; anything else indicates a logic error.
                        if (muts.size() != 1) {
                            on_internal_error(log,
                                    format("expecting single insert mutation, got {}", muts.size()));
                        }
                        collected.push_back(std::move(muts[0]));
                        co_return stop_iteration::no;
                    },
                    std::move(qs));
            // Yield the mutations only after the whole table scan finished,
            // so a partially-read table is never announced.
            for (auto& m : collected) {
                co_yield std::move(m);
            }
        }
        // Last mutation: mark auth as migrated by setting the version to v2.
        co_yield co_await sys_ks.make_auth_version_mutation(ts,
                db::system_keyspace::auth_version_t::v2);
    };
    co_await announce_mutations_with_batching(g0,
            start_operation_func,
            std::move(gen),
            as,
            std::nullopt);
}
namespace {
std::string_view get_short_name(std::string_view name) {
@@ -786,20 +969,22 @@ std::string_view get_short_name(std::string_view name) {
authorizer_factory make_authorizer_factory(
std::string_view name,
sharded<cql3::query_processor>& qp) {
sharded<cql3::query_processor>& qp,
::service::raft_group0_client& g0,
sharded<::service::migration_manager>& mm) {
std::string_view short_name = get_short_name(name);
if (boost::iequals(short_name, "AllowAllAuthorizer")) {
return [&qp] {
return std::make_unique<allow_all_authorizer>(qp.local());
return [&qp, &g0, &mm] {
return std::make_unique<allow_all_authorizer>(qp.local(), g0, mm.local());
};
} else if (boost::iequals(short_name, "CassandraAuthorizer")) {
return [&qp] {
return std::make_unique<default_authorizer>(qp.local());
return [&qp, &g0, &mm] {
return std::make_unique<default_authorizer>(qp.local(), g0, mm.local());
};
} else if (boost::iequals(short_name, "TransitionalAuthorizer")) {
return [&qp] {
return std::make_unique<transitional_authorizer>(qp.local());
return [&qp, &g0, &mm] {
return std::make_unique<transitional_authorizer>(qp.local(), g0, mm.local());
};
}
throw std::invalid_argument(fmt::format("Unknown authorizer: {}", name));
@@ -867,12 +1052,6 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
};
}
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp) {
return [&qp] {
return std::make_unique<maintenance_socket_authorizer>(qp.local());
};
}
role_manager_factory make_maintenance_socket_role_manager_factory(
sharded<cql3::query_processor>& qp,
::service::raft_group0_client& g0,

View File

@@ -12,7 +12,6 @@
#include <memory>
#include <optional>
#include <seastar/core/coroutine.hh>
#include <seastar/core/future.hh>
#include <seastar/core/sstring.hh>
#include <seastar/util/bool_class.hh>
@@ -37,6 +36,8 @@ class query_processor;
namespace service {
class migration_manager;
class migration_notifier;
class migration_listener;
}
namespace auth {
@@ -78,12 +79,17 @@ class service final : public seastar::peering_sharded_service<service> {
::service::raft_group0_client& _group0_client;
::service::migration_notifier& _mnotifier;
authorizer::ptr_type _authorizer;
authenticator::ptr_type _authenticator;
role_manager::ptr_type _role_manager;
// Only one of these should be registered, so we end up with some unused instances. Not the end of the world.
std::unique_ptr<::service::migration_listener> _migration_listener;
maintenance_socket_enabled _used_by_maintenance_socket;
abort_source _as;
@@ -93,6 +99,7 @@ public:
cache& cache,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_notifier&,
std::unique_ptr<authorizer>,
std::unique_ptr<authenticator>,
std::unique_ptr<role_manager>,
@@ -106,6 +113,7 @@ public:
service(
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_notifier&,
authorizer_factory,
authenticator_factory,
role_manager_factory,
@@ -194,9 +202,12 @@ public:
return *_role_manager;
}
cql3::query_processor& query_processor() const noexcept {
return _qp;
}
future<> commit_mutations(::service::group0_batch&& mc) {
co_await std::move(mc).commit(_group0_client, _as, ::service::raft_timeout{});
co_await _group0_client.send_group0_read_barrier_to_live_members();
return std::move(mc).commit(_group0_client, _as, ::service::raft_timeout{});
}
private:
@@ -396,6 +407,9 @@ future<std::vector<permission_details>> list_filtered_permissions(
// Finalizes write operations performed in auth by committing mutations via raft group0.
future<> commit_mutations(service& ser, ::service::group0_batch&& mc);
// Migrates data from old keyspace to new one which supports linearizable writes via raft.
future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_client& g0, start_operation_func_t start_operation_func, abort_source& as);
///
/// Factory helper functions for creating auth module instances.
/// These are intended for use with sharded<service>::start() where copyable arguments are required.
@@ -406,7 +420,9 @@ future<> commit_mutations(service& ser, ::service::group0_batch&& mc);
/// @param name The authorizer class name (e.g., "CassandraAuthorizer", "AllowAllAuthorizer")
authorizer_factory make_authorizer_factory(
std::string_view name,
sharded<cql3::query_processor>& qp);
sharded<cql3::query_processor>& qp,
::service::raft_group0_client& g0,
sharded<::service::migration_manager>& mm);
/// Creates an authenticator factory for config-selectable authenticator types.
/// @param name The authenticator class name (e.g., "PasswordAuthenticator", "AllowAllAuthenticator")
@@ -434,11 +450,6 @@ authenticator_factory make_maintenance_socket_authenticator_factory(
sharded<::service::migration_manager>& mm,
sharded<cache>& cache);
/// Creates a factory for the maintenance socket authorizer.
/// This authorizer is not config-selectable and is only used for the maintenance socket.
/// It grants all permissions unconditionally while delegating grant/revoke to the default authorizer.
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp);
/// Creates a factory for the maintenance socket role manager.
/// This role manager is not config-selectable and is only used for the maintenance socket.
role_manager_factory make_maintenance_socket_role_manager_factory(

View File

@@ -28,7 +28,6 @@
#include "cql3/untyped_result_set.hh"
#include "cql3/util.hh"
#include "db/consistency_level_type.hh"
#include "db/system_keyspace.hh"
#include "exceptions/exceptions.hh"
#include "utils/error_injection.hh"
#include "utils/log.hh"
@@ -36,6 +35,7 @@
#include <seastar/coroutine/maybe_yield.hh>
#include "service/raft/raft_group0_client.hh"
#include "service/migration_manager.hh"
#include "password_authenticator.hh"
#include "utils/managed_string.hh"
namespace auth {
@@ -43,13 +43,51 @@ namespace auth {
static logging::logger log("standard_role_manager");
// Consistency level used for legacy (pre-raft) role reads/writes: rows for
// the default superuser are accessed at QUORUM so all nodes agree on them,
// while any other role settles for LOCAL_ONE.
static db::consistency_level consistency_for_role(std::string_view role_name) noexcept {
    const bool is_default_superuser = role_name == meta::DEFAULT_SUPERUSER_NAME;
    return is_default_superuser ? db::consistency_level::QUORUM
                                : db::consistency_level::LOCAL_ONE;
}
// Looks up a role directly in the roles table (legacy, pre-raft path),
// bypassing the cache.
//
// Reads the row keyed by `role_name` using the role-dependent consistency
// level (QUORUM for the default superuser, LOCAL_ONE otherwise) and maps it
// to a `record`. Missing boolean columns default to false and a missing
// `member_of` column maps to an empty role set.
//
// Returns std::nullopt when no row exists for the given role.
future<std::optional<standard_role_manager::record>> standard_role_manager::legacy_find_record(std::string_view role_name) {
    const sstring query = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
            get_auth_ks_name(_qp),
            meta::roles_table::name,
            meta::roles_table::role_col_name);
    const auto results = co_await _qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_query_state(),
            {sstring(role_name)},
            cql3::query_processor::cache_internal::yes);
    if (results->empty()) {
        co_return std::optional<record>();
    }
    const cql3::untyped_result_set_row& row = results->one();
    co_return std::make_optional(record{
            row.get_as<sstring>(sstring(meta::roles_table::role_col_name)),
            row.get_or<bool>("is_superuser", false),
            row.get_or<bool>("can_login", false),
            (row.has("member_of")
                    ? row.get_set<sstring>("member_of")
                    : role_set())});
}
future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
auto role = _cache.get(role_name);
if (legacy_mode(_qp)) {
return legacy_find_record(role_name);
}
auto name = sstring(role_name);
auto role = _cache.get(name);
if (!role) {
return make_ready_future<std::optional<record>>(std::nullopt);
}
return make_ready_future<std::optional<record>>(std::make_optional(record{
.name = sstring(role_name),
.name = std::move(name),
.is_superuser = role->is_superuser,
.can_login = role->can_login,
.member_of = role->member_of
@@ -90,12 +128,85 @@ const resource_set& standard_role_manager::protected_resources() const {
return resources;
}
// Creates the three legacy auth tables (roles, role_members,
// role_attributes) in the legacy auth keyspace if they do not exist yet.
// Only used in legacy (pre-raft) mode; the three creations run concurrently
// and the returned future resolves once all of them are done.
//
// NOTE(review): the first two query strings are built with fmt::format and
// the third with seastar::format — presumably equivalent for these
// placeholders; consider unifying.
future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const {
    static const sstring create_roles_query = fmt::format(
            "CREATE TABLE {}.{} ("
            " {} text PRIMARY KEY,"
            " can_login boolean,"
            " is_superuser boolean,"
            " member_of set<text>,"
            " salted_hash text"
            ")",
            meta::legacy::AUTH_KS,
            meta::roles_table::name,
            meta::roles_table::role_col_name);
    static const sstring create_role_members_query = fmt::format(
            "CREATE TABLE {}.{} ("
            " role text,"
            " member text,"
            " PRIMARY KEY (role, member)"
            ")",
            meta::legacy::AUTH_KS,
            ROLE_MEMBERS_CF);
    static const sstring create_role_attributes_query = seastar::format(
            "CREATE TABLE {}.{} ("
            " role text,"
            " name text,"
            " value text,"
            " PRIMARY KEY(role, name)"
            ")",
            meta::legacy::AUTH_KS,
            ROLE_ATTRIBUTES_CF);
    return when_all_succeed(
            create_legacy_metadata_table_if_missing(
                    meta::roles_table::name,
                    _qp,
                    create_roles_query,
                    _migration_manager),
            create_legacy_metadata_table_if_missing(
                    ROLE_MEMBERS_CF,
                    _qp,
                    create_role_members_query,
                    _migration_manager),
            create_legacy_metadata_table_if_missing(
                    ROLE_ATTRIBUTES_CF,
                    _qp,
                    create_role_attributes_query,
                    _migration_manager)).discard_result();
}
// Ensures the default superuser role row exists in the legacy roles table
// (pre-raft path). If a default-superuser row satisfying has_can_login is
// already present this is a no-op; otherwise a superuser/can-login row is
// inserted at QUORUM.
//
// Throws exceptions::unavailable_exception (after logging a warning) when
// not enough nodes are up to perform the QUORUM operations — the caller is
// expected to retry later.
future<> standard_role_manager::legacy_create_default_role_if_missing() {
    if (_superuser.empty()) {
        on_internal_error(log, "Legacy auth default superuser name is empty");
    }
    try {
        // Nothing to do if the default role row already has can_login set.
        const auto exists = co_await legacy::default_role_row_satisfies(_qp, &has_can_login, _superuser);
        if (exists) {
            co_return;
        }
        const sstring query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
                meta::legacy::AUTH_KS,
                meta::roles_table::name,
                meta::roles_table::role_col_name);
        co_await _qp.execute_internal(
                query,
                db::consistency_level::QUORUM,
                internal_distributed_query_state(),
                {_superuser},
                cql3::query_processor::cache_internal::no).discard_result();
        log.info("Created default superuser role '{}'.", _superuser);
    } catch (const exceptions::unavailable_exception&) {
        log.warn("Skipped default role setup: some nodes were not ready; will retry");
        // Rethrow the original exception in place; `throw e;` would
        // copy-construct a fresh exception object instead.
        throw;
    }
}
future<> standard_role_manager::maybe_create_default_role() {
if (default_superuser(_qp).empty()) {
if (_superuser.empty()) {
co_return;
}
auto has_superuser = [this] () -> future<bool> {
const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", db::system_keyspace::NAME, meta::roles_table::name);
const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
auto results = co_await _qp.execute_internal(query, db::consistency_level::LOCAL_ONE,
internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
for (const auto& result : *results) {
@@ -119,12 +230,12 @@ future<> standard_role_manager::maybe_create_default_role() {
// There is no superuser which has can_login field - create default role.
// Note that we don't check if can_login is set to true.
const sstring insert_query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
meta::roles_table::role_col_name);
co_await collect_mutations(_qp, batch, insert_query, {default_superuser(_qp)});
co_await collect_mutations(_qp, batch, insert_query, {_superuser});
co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
log.info("Created default superuser role '{}'.", default_superuser(_qp));
log.info("Created default superuser role '{}'.", _superuser);
}
future<> standard_role_manager::maybe_create_default_role_with_retries() {
@@ -147,12 +258,80 @@ future<> standard_role_manager::maybe_create_default_role_with_retries() {
}
}
// Name of the pre-roles users table in the legacy auth keyspace; its
// presence signals that user metadata from an older release may need
// migration.
static const sstring legacy_table_name{"users"};

// Returns true if the legacy "users" table is present in the schema, i.e.
// there is old user metadata that migrate_legacy_metadata() should import.
bool standard_role_manager::legacy_metadata_exists() {
    return _qp.db().has_schema(meta::legacy::AUTH_KS, legacy_table_name);
}
// Imports every row of the legacy "users" table into the legacy roles
// table: each user becomes a role with can_login=true and is_superuser
// taken from the old "super" column (defaulting to false). Any failure is
// logged and propagated to the caller.
future<> standard_role_manager::migrate_legacy_metadata() {
    log.info("Starting migration of legacy user metadata.");
    static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);
    try {
        auto results = co_await _qp.execute_internal(
                query,
                db::consistency_level::QUORUM,
                internal_distributed_query_state(),
                cql3::query_processor::cache_internal::no);
        for (const auto& row : *results) {
            role_config config;
            config.is_superuser = row.get_or<bool>("super", false);
            config.can_login = true;
            // Keep the name alive for the duration of the call below.
            const auto name = row.get_as<sstring>("name");
            auto mc = ::service::group0_batch::unused();
            co_await create_or_replace(meta::legacy::AUTH_KS, name, config, mc);
        }
    } catch (...) {
        log.error("Encountered an error during migration!");
        throw;
    }
    log.info("Finished migrating legacy user metadata.");
}
future<> standard_role_manager::start() {
return once_among_shards([this] () -> future<> {
_superuser = password_authenticator::default_superuser(_qp);
if (legacy_mode(_qp)) {
co_await create_legacy_metadata_tables_if_missing();
}
auto handler = [this] () -> future<> {
co_await maybe_create_default_role_with_retries();
if (!_superuser_created_promise.available()) {
_superuser_created_promise.set_value();
const bool legacy = legacy_mode(_qp);
if (legacy) {
if (!_superuser_created_promise.available()) {
// Counterintuitively, we mark promise as ready before any startup work
// because wait_for_schema_agreement() below will block indefinitely
// without cluster majority. In that case, blocking node startup
// would lead to a cluster deadlock.
_superuser_created_promise.set_value();
}
co_await _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as);
if (co_await legacy::any_nondefault_role_row_satisfies(_qp, &has_can_login)) {
if (legacy_metadata_exists()) {
log.warn("Ignoring legacy user metadata since nondefault roles already exist.");
}
co_return;
}
if (legacy_metadata_exists()) {
co_await migrate_legacy_metadata();
co_return;
}
co_await legacy_create_default_role_if_missing();
}
if (!legacy) {
co_await maybe_create_default_role_with_retries();
if (!_superuser_created_promise.available()) {
_superuser_created_promise.set_value();
}
}
};
@@ -171,12 +350,21 @@ future<> standard_role_manager::ensure_superuser_is_created() {
return _superuser_created_promise.get_shared_future();
}
future<> standard_role_manager::create_or_replace(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
future<> standard_role_manager::create_or_replace(std::string_view auth_ks_name, std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
const sstring query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, ?, ?)",
db::system_keyspace::NAME,
auth_ks_name,
meta::roles_table::name,
meta::roles_table::role_col_name);
co_await collect_mutations(_qp, mc, query, {sstring(role_name), c.is_superuser, c.can_login});
if (auth_ks_name == meta::legacy::AUTH_KS) {
co_await _qp.execute_internal(
query,
consistency_for_role(role_name),
internal_distributed_query_state(),
{sstring(role_name), c.is_superuser, c.can_login},
cql3::query_processor::cache_internal::yes).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {sstring(role_name), c.is_superuser, c.can_login});
}
}
future<>
@@ -186,7 +374,7 @@ standard_role_manager::create(std::string_view role_name, const role_config& c,
throw role_already_exists(role_name);
}
return create_or_replace(role_name, c, mc);
return create_or_replace(get_auth_ks_name(_qp), role_name, c, mc);
});
}
@@ -211,11 +399,20 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
return make_ready_future<>();
}
const sstring query = seastar::format("UPDATE {}.{} SET {} WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
build_column_assignments(u),
meta::roles_table::role_col_name);
return collect_mutations(_qp, mc, std::move(query), {sstring(role_name)});
if (legacy_mode(_qp)) {
return _qp.execute_internal(
std::move(query),
consistency_for_role(role_name),
internal_distributed_query_state(),
{sstring(role_name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
return collect_mutations(_qp, mc, std::move(query), {sstring(role_name)});
}
});
}
@@ -226,11 +423,11 @@ future<> standard_role_manager::drop(std::string_view role_name, ::service::grou
// First, revoke this role from all roles that are members of it.
const auto revoke_from_members = [this, role_name, &mc] () -> future<> {
const sstring query = seastar::format("SELECT member FROM {}.{} WHERE role = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_MEMBERS_CF);
const auto members = co_await _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
consistency_for_role(role_name),
internal_distributed_query_state(),
{sstring(role_name)},
cql3::query_processor::cache_internal::no);
@@ -258,33 +455,102 @@ future<> standard_role_manager::drop(std::string_view role_name, ::service::grou
// Delete all attributes for that role
const auto remove_attributes_of = [this, role_name, &mc] () -> future<> {
const sstring query = seastar::format("DELETE FROM {}.{} WHERE role = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_ATTRIBUTES_CF);
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(query, {sstring(role_name)},
cql3::query_processor::cache_internal::yes).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
}
};
// Finally, delete the role itself.
const auto delete_role = [this, role_name, &mc] () -> future<> {
const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
meta::roles_table::role_col_name);
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(
query,
consistency_for_role(role_name),
internal_distributed_query_state(),
{sstring(role_name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
}
};
co_await when_all_succeed(revoke_from_members, revoke_members_of, remove_attributes_of);
co_await delete_role();
}
// Grants or revokes role membership on the legacy (pre-raft) path, writing
// directly to the auth tables instead of collecting group0 mutations.
//
// Two tables are updated concurrently:
//   - the grantee's row in the roles table, whose member_of set gains or
//     loses `role_name` (consistency chosen by the grantee's name);
//   - the role_members lookup table, which gains or loses the
//     (role_name, grantee_name) pair (consistency chosen by the role's name).
future<>
standard_role_manager::legacy_modify_membership(
        std::string_view grantee_name,
        std::string_view role_name,
        membership_change ch) {
    // Add/remove `role_name` in the grantee's member_of set.
    const auto modify_roles = [this, role_name, grantee_name, ch] () -> future<> {
        const auto query = seastar::format(
                "UPDATE {}.{} SET member_of = member_of {} ? WHERE {} = ?",
                get_auth_ks_name(_qp),
                meta::roles_table::name,
                (ch == membership_change::add ? '+' : '-'),
                meta::roles_table::role_col_name);
        co_await _qp.execute_internal(
                query,
                consistency_for_role(grantee_name),
                internal_distributed_query_state(),
                {role_set{sstring(role_name)}, sstring(grantee_name)},
                cql3::query_processor::cache_internal::no).discard_result();
    };
    // Insert or delete the (role, member) pair in the role_members table.
    const auto modify_role_members = [this, role_name, grantee_name, ch] () -> future<> {
        switch (ch) {
        case membership_change::add: {
            const sstring insert_query = seastar::format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
                    get_auth_ks_name(_qp),
                    ROLE_MEMBERS_CF);
            co_return co_await _qp.execute_internal(
                    insert_query,
                    consistency_for_role(role_name),
                    internal_distributed_query_state(),
                    {sstring(role_name), sstring(grantee_name)},
                    cql3::query_processor::cache_internal::no).discard_result();
        }
        case membership_change::remove: {
            const sstring delete_query = seastar::format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
                    get_auth_ks_name(_qp),
                    ROLE_MEMBERS_CF);
            co_return co_await _qp.execute_internal(
                    delete_query,
                    consistency_for_role(role_name),
                    internal_distributed_query_state(),
                    {sstring(role_name), sstring(grantee_name)},
                    cql3::query_processor::cache_internal::no).discard_result();
        }
        }
    };
    co_await when_all_succeed(modify_roles, modify_role_members).discard_result();
}
future<>
standard_role_manager::modify_membership(
std::string_view grantee_name,
std::string_view role_name,
membership_change ch,
::service::group0_batch& mc) {
if (legacy_mode(_qp)) {
co_return co_await legacy_modify_membership(grantee_name, role_name, ch);
}
const auto modify_roles = seastar::format(
"UPDATE {}.{} SET member_of = member_of {} ? WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
(ch == membership_change::add ? '+' : '-'),
meta::roles_table::role_col_name);
@@ -295,12 +561,12 @@ standard_role_manager::modify_membership(
switch (ch) {
case membership_change::add:
modify_role_members = seastar::format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_MEMBERS_CF);
break;
case membership_change::remove:
modify_role_members = seastar::format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_MEMBERS_CF);
break;
default:
@@ -392,21 +658,57 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
}
future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
const sstring query = seastar::format("SELECT * FROM {}.{}",
get_auth_ks_name(_qp),
ROLE_MEMBERS_CF);
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::ONE,
qs,
cql3::query_processor::cache_internal::yes);
role_to_directly_granted_map roles_map;
_cache.for_each_role([&roles_map] (const cache::role_name_t& name, const cache::role_record& record) {
for (const auto& granted_role : record.member_of) {
roles_map.emplace(name, granted_role);
}
});
std::transform(
results->begin(),
results->end(),
std::inserter(roles_map, roles_map.begin()),
[] (const cql3::untyped_result_set_row& row) {
return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
);
co_return roles_map;
}
future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
const sstring query = seastar::format("SELECT {} FROM {}.{}",
meta::roles_table::role_col_name,
get_auth_ks_name(_qp),
meta::roles_table::name);
// To avoid many copies of a view.
static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
if (utils::get_local_injector().enter("standard_role_manager_fail_legacy_query")) {
if (legacy_mode(_qp)) {
throw std::runtime_error("standard_role_manager::query_all: failed due to error injection");
}
}
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::QUORUM,
qs,
cql3::query_processor::cache_internal::yes);
role_set roles;
roles.reserve(_cache.roles_count());
_cache.for_each_role([&roles] (const cache::role_name_t& name, const cache::role_record&) {
roles.insert(name);
});
std::transform(
results->begin(),
results->end(),
std::inserter(roles, roles.begin()),
[] (const cql3::untyped_result_set_row& row) {
return row.get_as<sstring>(role_col_name_string);}
);
co_return roles;
}
@@ -429,26 +731,31 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
}
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
auto role = _cache.get(role_name);
if (!role) {
co_return std::nullopt;
const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
get_auth_ks_name(_qp),
ROLE_ATTRIBUTES_CF);
const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
if (!result_set->empty()) {
const cql3::untyped_result_set_row &row = result_set->one();
co_return std::optional<sstring>(row.get_as<sstring>("value"));
}
auto it = role->attributes.find(attribute_name);
if (it != role->attributes.end()) {
co_return it->second;
}
co_return std::nullopt;
co_return std::optional<sstring>{};
}
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
attribute_vals result;
_cache.for_each_role([&result, attribute_name] (const cache::role_name_t& name, const cache::role_record& record) {
auto it = record.attributes.find(attribute_name);
if (it != record.attributes.end()) {
result.emplace(name, it->second);
}
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
if (att_val) {
role_to_att_val.emplace(std::move(role), std::move(*att_val));
}
});
}).then([&role_to_att_val] () {
return make_ready_future<attribute_vals>(std::move(role_to_att_val));
});
});
});
co_return result;
}
future<> standard_role_manager::set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) {
@@ -456,10 +763,14 @@ future<> standard_role_manager::set_attribute(std::string_view role_name, std::s
throw auth::nonexistant_role(role_name);
}
const sstring query = seastar::format("INSERT INTO {}.{} (role, name, value) VALUES (?, ?, ?)",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_ATTRIBUTES_CF);
co_await collect_mutations(_qp, mc, query,
{sstring(role_name), sstring(attribute_name), sstring(attribute_value)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name), sstring(attribute_value)}, cql3::query_processor::cache_internal::yes).discard_result();
} else {
co_await collect_mutations(_qp, mc, query,
{sstring(role_name), sstring(attribute_name), sstring(attribute_value)});
}
}
future<> standard_role_manager::remove_attribute(std::string_view role_name, std::string_view attribute_name, ::service::group0_batch& mc) {
@@ -467,10 +778,14 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
throw auth::nonexistant_role(role_name);
}
const sstring query = seastar::format("DELETE FROM {}.{} WHERE role = ? AND name = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_ATTRIBUTES_CF);
co_await collect_mutations(_qp, mc, query,
{sstring(role_name), sstring(attribute_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes).discard_result();
} else {
co_await collect_mutations(_qp, mc, query,
{sstring(role_name), sstring(attribute_name)});
}
}
future<std::vector<cql3::description>> standard_role_manager::describe_role_grants() {

View File

@@ -40,6 +40,7 @@ class standard_role_manager final : public role_manager {
cache& _cache;
future<> _stopped;
abort_source _as;
std::string _superuser;
shared_promise<> _superuser_created_promise;
public:
@@ -96,13 +97,24 @@ private:
role_set member_of;
};
future<> create_legacy_metadata_tables_if_missing() const;
bool legacy_metadata_exists();
future<> migrate_legacy_metadata();
future<> legacy_create_default_role_if_missing();
future<> maybe_create_default_role();
future<> maybe_create_default_role_with_retries();
future<> create_or_replace(std::string_view role_name, const role_config&, ::service::group0_batch&);
future<> create_or_replace(std::string_view auth_ks_name, std::string_view role_name, const role_config&, ::service::group0_batch&);
future<> legacy_modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change);
future<> modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change, ::service::group0_batch& mc);
future<std::optional<record>> legacy_find_record(std::string_view role_name);
future<std::optional<record>> find_record(std::string_view role_name);
future<record> require_record(std::string_view role_name);
future<> collect_roles(

View File

@@ -145,8 +145,8 @@ future<> transitional_authenticator::ensure_superuser_is_created() const {
return _authenticator->ensure_superuser_is_created();
}
transitional_authorizer::transitional_authorizer(cql3::query_processor& qp)
: transitional_authorizer(std::make_unique<default_authorizer>(qp)) {
transitional_authorizer::transitional_authorizer(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
: transitional_authorizer(std::make_unique<default_authorizer>(qp, g0, mm)) {
}
transitional_authorizer::transitional_authorizer(std::unique_ptr<authorizer> a)

View File

@@ -62,7 +62,7 @@ class transitional_authorizer : public authorizer {
std::unique_ptr<authorizer> _authorizer;
public:
transitional_authorizer(cql3::query_processor& qp);
transitional_authorizer(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm);
transitional_authorizer(std::unique_ptr<authorizer> a);
~transitional_authorizer();

View File

@@ -10,15 +10,24 @@
#include <random>
#include <unordered_set>
#include <algorithm>
#include <seastar/core/sleep.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/util/later.hh>
#include "gms/endpoint_state.hh"
#include "gms/versioned_value.hh"
#include "keys/keys.hh"
#include "replica/database.hh"
#include "db/system_keyspace.hh"
#include "db/system_distributed_keyspace.hh"
#include "dht/token-sharding.hh"
#include "locator/token_metadata.hh"
#include "types/set.hh"
#include "gms/application_state.hh"
#include "gms/inet_address.hh"
#include "gms/gossiper.hh"
#include "gms/feature_service.hh"
#include "utils/assert.hh"
#include "utils/error_injection.hh"
#include "utils/UUID_gen.hh"
@@ -32,6 +41,16 @@
extern logging::logger cdc_log;
static int get_shard_count(const locator::host_id& endpoint, const gms::gossiper& g) {
auto ep_state = g.get_application_state_ptr(endpoint, gms::application_state::SHARD_COUNT);
return ep_state ? std::stoi(ep_state->value()) : -1;
}
static unsigned get_sharding_ignore_msb(const locator::host_id& endpoint, const gms::gossiper& g) {
auto ep_state = g.get_application_state_ptr(endpoint, gms::application_state::IGNORE_MSB_BITS);
return ep_state ? std::stoi(ep_state->value()) : 0;
}
namespace db {
extern thread_local data_type cdc_streams_set_type;
}
@@ -206,6 +225,12 @@ static std::vector<stream_id> create_stream_ids(
return result;
}
bool should_propose_first_generation(const locator::host_id& my_host_id, const gms::gossiper& g) {
return g.for_each_endpoint_state_until([&] (const gms::endpoint_state& eps) {
return stop_iteration(my_host_id < eps.get_host_id());
}) == stop_iteration::no;
}
bool is_cdc_generation_optimal(const cdc::topology_description& gen, const locator::token_metadata& tm) {
if (tm.sorted_tokens().size() != gen.entries().size()) {
// We probably have garbage streams from old generations
@@ -305,6 +330,38 @@ future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
co_return co_await get_common_cdc_generation_mutations(s, pkey, std::move(get_ckey), desc, mutation_size_threshold, ts);
}
// non-static for testing
size_t limit_of_streams_in_topology_description() {
// Each stream takes 16B and we don't want to exceed 4MB so we can have
// at most 262144 streams but not less than 1 per vnode.
return 4 * 1024 * 1024 / 16;
}
// non-static for testing
topology_description limit_number_of_streams_if_needed(topology_description&& desc) {
uint64_t streams_count = 0;
for (auto& tr_desc : desc.entries()) {
streams_count += tr_desc.streams.size();
}
size_t limit = std::max(limit_of_streams_in_topology_description(), desc.entries().size());
if (limit >= streams_count) {
return std::move(desc);
}
size_t streams_per_vnode_limit = limit / desc.entries().size();
auto entries = std::move(desc).entries();
auto start = entries.back().token_range_end;
for (size_t idx = 0; idx < entries.size(); ++idx) {
auto end = entries[idx].token_range_end;
if (entries[idx].streams.size() > streams_per_vnode_limit) {
entries[idx].streams =
create_stream_ids(idx, start, end, streams_per_vnode_limit, entries[idx].sharding_ignore_msb);
}
start = end;
}
return topology_description(std::move(entries));
}
// Compute a set of tokens that split the token ring into vnodes.
static auto get_tokens(const std::unordered_set<dht::token>& bootstrap_tokens, const locator::token_metadata_ptr tmptr) {
auto tokens = tmptr->sorted_tokens();
@@ -362,6 +419,364 @@ db_clock::time_point new_generation_timestamp(bool add_delay, std::chrono::milli
return ts;
}
future<cdc::generation_id> generation_service::legacy_make_new_generation(const std::unordered_set<dht::token>& bootstrap_tokens, bool add_delay) {
const locator::token_metadata_ptr tmptr = _token_metadata.get();
// Fetch sharding parameters for a node that owns vnode ending with this token
// using gossiped application states.
auto get_sharding_info = [&] (dht::token end) -> std::pair<size_t, uint8_t> {
if (bootstrap_tokens.contains(end)) {
return {smp::count, _cfg.ignore_msb_bits};
} else {
auto endpoint = tmptr->get_endpoint(end);
if (!endpoint) {
throw std::runtime_error(
format("Can't find endpoint for token {}", end));
}
auto sc = get_shard_count(*endpoint, _gossiper);
return {sc > 0 ? sc : 1, get_sharding_ignore_msb(*endpoint, _gossiper)};
}
};
auto uuid = utils::make_random_uuid();
auto gen = make_new_generation_description(bootstrap_tokens, get_sharding_info, tmptr);
// Our caller should ensure that there are normal tokens in the token ring.
auto normal_token_owners = tmptr->count_normal_token_owners();
SCYLLA_ASSERT(normal_token_owners);
if (_feature_service.cdc_generations_v2) {
cdc_log.info("Inserting new generation data at UUID {}", uuid);
// This may take a while.
co_await _sys_dist_ks.local().insert_cdc_generation(uuid, gen, { normal_token_owners });
// Begin the race.
cdc::generation_id_v2 gen_id{new_generation_timestamp(add_delay, _cfg.ring_delay), uuid};
cdc_log.info("New CDC generation: {}", gen_id);
co_return gen_id;
}
// The CDC_GENERATIONS_V2 feature is not enabled: some nodes may still not understand the V2 format.
// We must create a generation in the old format.
// If the cluster is large we may end up with a generation that contains
// large number of streams. This is problematic because we store the
// generation in a single row (V1 format). For a generation with large number of rows
// this will lead to a row that can be as big as 32MB. This is much more
// than the limit imposed by commitlog_segment_size_in_mb. If the size of
// the row that describes a new generation grows above
// commitlog_segment_size_in_mb, the write will fail and the new node won't
// be able to join. To avoid such problem we make sure that such row is
// always smaller than 4MB. We do that by removing some CDC streams from
// each vnode if the total number of streams is too large.
gen = limit_number_of_streams_if_needed(std::move(gen));
cdc_log.warn(
"Creating a new CDC generation in the old storage format due to a partially upgraded cluster:"
" the CDC_GENERATIONS_V2 feature is known by this node, but not enabled in the cluster."
" The old storage format forces us to create a suboptimal generation."
" It is recommended to finish the upgrade and then create a new generation either by bootstrapping"
" a new node or running the checkAndRepairCdcStreams nodetool command.");
// Begin the race.
cdc::generation_id_v1 gen_id{new_generation_timestamp(add_delay, _cfg.ring_delay)};
co_await _sys_dist_ks.local().insert_cdc_topology_description(gen_id, std::move(gen), { normal_token_owners });
cdc_log.info("New CDC generation: {}", gen_id);
co_return gen_id;
}
/* Retrieves CDC streams generation timestamp from the given endpoint's application state (broadcasted through gossip).
* We might be during a rolling upgrade, so the timestamp might not be there (if the other node didn't upgrade yet),
* but if the cluster already supports CDC, then every newly joining node will propose a new CDC generation,
* which means it will gossip the generation's timestamp.
*/
static std::optional<cdc::generation_id> get_generation_id_for(const locator::host_id& endpoint, const gms::endpoint_state& eps) {
const auto* gen_id_ptr = eps.get_application_state_ptr(gms::application_state::CDC_GENERATION_ID);
if (!gen_id_ptr) {
return std::nullopt;
}
auto gen_id_string = gen_id_ptr->value();
cdc_log.trace("endpoint={}, gen_id_string={}", endpoint, gen_id_string);
return gms::versioned_value::cdc_generation_id_from_string(gen_id_string);
}
static future<std::optional<cdc::topology_description>> retrieve_generation_data_v2(
cdc::generation_id_v2 id,
db::system_keyspace& sys_ks,
db::system_distributed_keyspace& sys_dist_ks) {
auto cdc_gen = co_await sys_dist_ks.read_cdc_generation(id.id);
if (!cdc_gen && id.id.is_timestamp()) {
// If we entered legacy mode due to recovery, we (or some other node)
// might gossip about a generation that was previously propagated
// through raft. If that's the case, it will sit in
// the system.cdc_generations_v3 table.
//
// If the provided id is not a timeuuid, we don't want to query
// the system.cdc_generations_v3 table. This table stores generation
// ids as timeuuids. If the provided id is not a timeuuid, the
// generation cannot be in system.cdc_generations_v3. Also, the query
// would fail with a marshaling error.
cdc_gen = co_await sys_ks.read_cdc_generation_opt(id.id);
}
co_return cdc_gen;
}
static future<std::optional<cdc::topology_description>> retrieve_generation_data(
cdc::generation_id gen_id,
db::system_keyspace& sys_ks,
db::system_distributed_keyspace& sys_dist_ks,
db::system_distributed_keyspace::context ctx) {
return std::visit(make_visitor(
[&] (const cdc::generation_id_v1& id) {
return sys_dist_ks.read_cdc_topology_description(id, ctx);
},
[&] (const cdc::generation_id_v2& id) {
return retrieve_generation_data_v2(id, sys_ks, sys_dist_ks);
}
), gen_id);
}
static future<> do_update_streams_description(
cdc::generation_id gen_id,
db::system_keyspace& sys_ks,
db::system_distributed_keyspace& sys_dist_ks,
db::system_distributed_keyspace::context ctx) {
if (co_await sys_dist_ks.cdc_desc_exists(get_ts(gen_id), ctx)) {
cdc_log.info("Generation {}: streams description table already updated.", gen_id);
co_return;
}
// We might race with another node also inserting the description, but that's ok. It's an idempotent operation.
auto topo = co_await retrieve_generation_data(gen_id, sys_ks, sys_dist_ks, ctx);
if (!topo) {
throw no_generation_data_exception(gen_id);
}
co_await sys_dist_ks.create_cdc_desc(get_ts(gen_id), *topo, ctx);
cdc_log.info("CDC description table successfully updated with generation {}.", gen_id);
}
/* Inform CDC users about a generation of streams (identified by the given timestamp)
* by inserting it into the cdc_streams table.
*
* Assumes that the cdc_generation_descriptions table contains this generation.
*
* Returning from this function does not mean that the table update was successful: the function
* might run an asynchronous task in the background.
*/
static future<> update_streams_description(
cdc::generation_id gen_id,
db::system_keyspace& sys_ks,
shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
noncopyable_function<unsigned()> get_num_token_owners,
abort_source& abort_src) {
try {
co_await do_update_streams_description(gen_id, sys_ks, *sys_dist_ks, { get_num_token_owners() });
} catch (...) {
cdc_log.warn(
"Could not update CDC description table with generation {}: {}. Will retry in the background.",
gen_id, std::current_exception());
// It is safe to discard this future: we keep system distributed keyspace alive.
(void)(([] (cdc::generation_id gen_id,
db::system_keyspace& sys_ks,
shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
noncopyable_function<unsigned()> get_num_token_owners,
abort_source& abort_src) -> future<> {
while (true) {
try {
co_await sleep_abortable(std::chrono::seconds(60), abort_src);
} catch (seastar::sleep_aborted&) {
cdc_log.warn( "Aborted update CDC description table with generation {}", gen_id);
co_return;
}
try {
co_await do_update_streams_description(gen_id, sys_ks, *sys_dist_ks, { get_num_token_owners() });
co_return;
} catch (...) {
cdc_log.warn(
"Could not update CDC description table with generation {}: {}. Will try again.",
gen_id, std::current_exception());
}
}
})(gen_id, sys_ks, std::move(sys_dist_ks), std::move(get_num_token_owners), abort_src));
}
}
static db_clock::time_point as_timepoint(const utils::UUID& uuid) {
return db_clock::time_point(utils::UUID_gen::unix_timestamp(uuid));
}
static future<std::vector<db_clock::time_point>> get_cdc_desc_v1_timestamps(
db::system_distributed_keyspace& sys_dist_ks,
abort_source& abort_src,
const noncopyable_function<unsigned()>& get_num_token_owners) {
while (true) {
try {
co_return co_await sys_dist_ks.get_cdc_desc_v1_timestamps({ get_num_token_owners() });
} catch (...) {
cdc_log.warn(
"Failed to retrieve generation timestamps for rewriting: {}. Retrying in 60s.",
std::current_exception());
}
co_await sleep_abortable(std::chrono::seconds(60), abort_src);
}
}
// Contains a CDC log table's creation time (extracted from its schema's id)
// and its CDC TTL setting.
struct time_and_ttl {
db_clock::time_point creation_time;
int ttl;
};
/*
* See `maybe_rewrite_streams_descriptions`.
* This is the long-running-in-the-background part of that function.
* It returns the timestamp of the last rewritten generation (if any).
*/
static future<std::optional<cdc::generation_id_v1>> rewrite_streams_descriptions(
std::vector<time_and_ttl> times_and_ttls,
db::system_keyspace& sys_ks,
shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
noncopyable_function<unsigned()> get_num_token_owners,
abort_source& abort_src) {
cdc_log.info("Retrieving generation timestamps for rewriting...");
auto tss = co_await get_cdc_desc_v1_timestamps(*sys_dist_ks, abort_src, get_num_token_owners);
cdc_log.info("Generation timestamps retrieved.");
// Find first generation timestamp such that some CDC log table may contain data before this timestamp.
// This predicate is monotonic w.r.t the timestamps.
auto now = db_clock::now();
std::sort(tss.begin(), tss.end());
auto first = std::partition_point(tss.begin(), tss.end(), [&] (db_clock::time_point ts) {
// partition_point finds first element that does *not* satisfy the predicate.
return std::none_of(times_and_ttls.begin(), times_and_ttls.end(),
[&] (const time_and_ttl& tat) {
// In this CDC log table there are no entries older than the table's creation time
// or (now - the table's ttl). We subtract 10s to account for some possible clock drift.
// If ttl is set to 0 then entries in this table never expire. In that case we look
// only at the table's creation time.
auto no_entries_older_than =
(tat.ttl == 0 ? tat.creation_time : std::max(tat.creation_time, now - std::chrono::seconds(tat.ttl)))
- std::chrono::seconds(10);
return no_entries_older_than < ts;
});
});
// Find first generation timestamp such that some CDC log table may contain data in this generation.
// This and all later generations need to be written to the new streams table.
if (first != tss.begin()) {
--first;
}
if (first == tss.end()) {
cdc_log.info("No generations to rewrite.");
co_return std::nullopt;
}
cdc_log.info("First generation to rewrite: {}", *first);
bool each_success = true;
co_await max_concurrent_for_each(first, tss.end(), 10, [&] (db_clock::time_point ts) -> future<> {
while (true) {
try {
co_return co_await do_update_streams_description(cdc::generation_id_v1{ts}, sys_ks, *sys_dist_ks, { get_num_token_owners() });
} catch (const no_generation_data_exception& e) {
cdc_log.error("Failed to rewrite streams for generation {}: {}. Giving up.", ts, e);
each_success = false;
co_return;
} catch (...) {
cdc_log.warn("Failed to rewrite streams for generation {}: {}. Retrying in 60s.", ts, std::current_exception());
}
co_await sleep_abortable(std::chrono::seconds(60), abort_src);
}
});
if (each_success) {
cdc_log.info("Rewriting stream tables finished successfully.");
} else {
cdc_log.info("Rewriting stream tables finished, but some generations could not be rewritten (check the logs).");
}
if (first != tss.end()) {
co_return cdc::generation_id_v1{*std::prev(tss.end())};
}
co_return std::nullopt;
}
future<> generation_service::maybe_rewrite_streams_descriptions() {
if (!_db.has_schema(_sys_dist_ks.local().NAME, _sys_dist_ks.local().CDC_DESC_V1)) {
// This cluster never went through a Scylla version which used this table
// or the user deleted the table. Nothing to do.
co_return;
}
if (co_await _sys_ks.local().cdc_is_rewritten()) {
co_return;
}
if (_cfg.dont_rewrite_streams) {
cdc_log.warn("Stream rewriting disabled. Manual administrator intervention may be required...");
co_return;
}
// For each CDC log table get the TTL setting (from CDC options) and the table's creation time
std::vector<time_and_ttl> times_and_ttls;
_db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> t) {
auto& s = *t->schema();
auto base = cdc::get_base_table(_db, s.ks_name(), s.cf_name());
if (!base) {
// Not a CDC log table.
return;
}
auto& cdc_opts = base->cdc_options();
if (!cdc_opts.enabled()) {
// This table is named like a CDC log table but it's not one.
return;
}
times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id().uuid()), cdc_opts.ttl()});
});
if (times_and_ttls.empty()) {
// There's no point in rewriting old generations' streams (they don't contain any data).
cdc_log.info("No CDC log tables present, not rewriting stream tables.");
co_return co_await _sys_ks.local().cdc_set_rewritten(std::nullopt);
}
auto get_num_token_owners = [tm = _token_metadata.get()] { return tm->count_normal_token_owners(); };
// This code is racing with node startup. At this point, we're most likely still waiting for gossip to settle
// and some nodes that are UP may still be marked as DOWN by us.
// Let's sleep a bit to increase the chance that the first attempt at rewriting succeeds (it's still ok if
// it doesn't - we'll retry - but it's nice if we succeed without any warnings).
co_await sleep_abortable(std::chrono::seconds(10), _abort_src);
cdc_log.info("Rewriting stream tables in the background...");
auto last_rewritten = co_await rewrite_streams_descriptions(
std::move(times_and_ttls),
_sys_ks.local(),
_sys_dist_ks.local_shared(),
std::move(get_num_token_owners),
_abort_src);
co_await _sys_ks.local().cdc_set_rewritten(last_rewritten);
}
static void assert_shard_zero(const sstring& where) {
if (this_shard_id() != 0) {
on_internal_error(cdc_log, format("`{}`: must be run on shard 0", where));
}
}
class and_reducer {
private:
bool _result = true;
@@ -388,26 +803,195 @@ public:
}
};
class generation_handling_nonfatal_exception : public std::runtime_error {
using std::runtime_error::runtime_error;
};
constexpr char could_not_retrieve_msg_template[]
= "Could not retrieve CDC streams with timestamp {} upon gossip event. Reason: \"{}\". Action: {}.";
generation_service::generation_service(
config cfg,
config cfg, gms::gossiper& g, sharded<db::system_distributed_keyspace>& sys_dist_ks,
sharded<db::system_keyspace>& sys_ks,
abort_source& abort_src, const locator::shared_token_metadata& stm, gms::feature_service& f,
replica::database& db)
: _cfg(std::move(cfg))
, _gossiper(g)
, _sys_dist_ks(sys_dist_ks)
, _sys_ks(sys_ks)
, _abort_src(abort_src)
, _token_metadata(stm)
, _feature_service(f)
, _db(db)
{
}
future<> generation_service::stop() {
try {
co_await std::move(_cdc_streams_rewrite_complete);
} catch (...) {
cdc_log.error("CDC stream rewrite failed: ", std::current_exception());
}
if (_joined && (this_shard_id() == 0)) {
co_await leave_ring();
}
_stopped = true;
return make_ready_future<>();
}
generation_service::~generation_service() {
SCYLLA_ASSERT(_stopped);
}
future<> generation_service::handle_cdc_generation(cdc::generation_id gen_id) {
future<> generation_service::after_join(std::optional<cdc::generation_id>&& startup_gen_id) {
assert_shard_zero(__PRETTY_FUNCTION__);
_gen_id = std::move(startup_gen_id);
_gossiper.register_(shared_from_this());
_joined = true;
// Retrieve the latest CDC generation seen in gossip (if any).
co_await legacy_scan_cdc_generations();
// Ensure that the new CDC stream description table has all required streams.
// See the function's comment for details.
//
// Since this depends on the entire cluster (and therefore we cannot guarantee
// timely completion), run it in the background and wait for it in stop().
_cdc_streams_rewrite_complete = maybe_rewrite_streams_descriptions();
}
future<> generation_service::leave_ring() {
assert_shard_zero(__PRETTY_FUNCTION__);
_joined = false;
co_await _gossiper.unregister_(shared_from_this());
}
future<> generation_service::on_join(gms::inet_address ep, locator::host_id id, gms::endpoint_state_ptr ep_state, gms::permit_id pid) {
return on_change(ep, id, ep_state->get_application_state_map(), pid);
}
future<> generation_service::on_change(gms::inet_address ep, locator::host_id id, const gms::application_state_map& states, gms::permit_id pid) {
assert_shard_zero(__PRETTY_FUNCTION__);
return make_ready_future<>();
}
future<> generation_service::check_and_repair_cdc_streams() {
// FIXME: support Raft group 0-based topology changes
if (!_joined) {
throw std::runtime_error("check_and_repair_cdc_streams: node not initialized yet");
}
std::optional<cdc::generation_id> latest = _gen_id;
_gossiper.for_each_endpoint_state([&] (const gms::endpoint_state& state) {
auto addr = state.get_host_id();
if (_gossiper.is_left(addr)) {
cdc_log.info("check_and_repair_cdc_streams ignored node {} because it is in LEFT state", addr);
return;
}
if (!_gossiper.is_normal(addr)) {
throw std::runtime_error(fmt::format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
" ({} is in state {})", addr, _gossiper.get_gossip_status(state)));
}
const auto gen_id = get_generation_id_for(addr, state);
if (!latest || (gen_id && get_ts(*gen_id) > get_ts(*latest))) {
latest = gen_id;
}
});
auto tmptr = _token_metadata.get();
auto sys_dist_ks = get_sys_dist_ks();
bool should_regenerate = false;
if (!latest) {
cdc_log.warn("check_and_repair_cdc_streams: no generation observed in gossip");
should_regenerate = true;
} else if (std::holds_alternative<cdc::generation_id_v1>(*latest)
&& _feature_service.cdc_generations_v2) {
cdc_log.info(
"Cluster still using CDC generation storage format V1 (id: {}), even though it already understands the V2 format."
" Creating a new generation using V2.", *latest);
should_regenerate = true;
} else {
cdc_log.info("check_and_repair_cdc_streams: last generation observed in gossip: {}", *latest);
static const auto timeout_msg = "Timeout while fetching CDC topology description";
static const auto topology_read_error_note = "Note: this is likely caused by"
" node(s) being down or unreachable. It is recommended to check the network and"
" restart/remove the failed node(s), then retry checkAndRepairCdcStreams command";
static const auto exception_translating_msg = "Translating the exception to `request_execution_exception`";
std::optional<topology_description> gen;
try {
gen = co_await retrieve_generation_data(*latest, _sys_ks.local(), *sys_dist_ks, { tmptr->count_normal_token_owners() });
} catch (exceptions::request_timeout_exception& e) {
cdc_log.error("{}: \"{}\". {}.", timeout_msg, e.what(), exception_translating_msg);
throw exceptions::request_execution_exception(exceptions::exception_code::READ_TIMEOUT,
format("{}. {}.", timeout_msg, topology_read_error_note));
} catch (exceptions::unavailable_exception& e) {
static const auto unavailable_msg = "Node(s) unavailable while fetching CDC topology description";
cdc_log.error("{}: \"{}\". {}.", unavailable_msg, e.what(), exception_translating_msg);
throw exceptions::request_execution_exception(exceptions::exception_code::UNAVAILABLE,
format("{}. {}.", unavailable_msg, topology_read_error_note));
} catch (...) {
const auto ep = std::current_exception();
if (is_timeout_exception(ep)) {
cdc_log.error("{}: \"{}\". {}.", timeout_msg, ep, exception_translating_msg);
throw exceptions::request_execution_exception(exceptions::exception_code::READ_TIMEOUT,
format("{}. {}.", timeout_msg, topology_read_error_note));
}
// On exotic errors proceed with regeneration
cdc_log.error("Exception while reading CDC topology description: \"{}\". Regenerating streams anyway.", ep);
should_regenerate = true;
}
if (!gen) {
cdc_log.error(
"Could not find CDC generation with timestamp {} in distributed system tables (current time: {}),"
" even though some node gossiped about it.",
latest, db_clock::now());
should_regenerate = true;
} else if (!is_cdc_generation_optimal(*gen, *tmptr)) {
should_regenerate = true;
cdc_log.info("CDC generation {} needs repair, regenerating", latest);
}
}
if (!should_regenerate) {
if (latest != _gen_id) {
co_await legacy_do_handle_cdc_generation(*latest);
}
cdc_log.info("CDC generation {} does not need repair", latest);
co_return;
}
const auto new_gen_id = co_await legacy_make_new_generation({}, true);
// Need to artificially update our STATUS so other nodes handle the generation ID change
// FIXME: after 0e0282cd nodes do not require a STATUS update to react to CDC generation changes.
// The artificial STATUS update here should eventually be removed (in a few releases).
auto status = _gossiper.get_this_endpoint_state_ptr()->get_application_state_ptr(gms::application_state::STATUS);
if (!status) {
cdc_log.error("Our STATUS is missing");
cdc_log.error("Aborting CDC generation repair due to missing STATUS");
co_return;
}
// Update _gen_id first, so that legacy_do_handle_cdc_generation (which will get called due to the status update)
// won't try to update the gossiper, which would result in a deadlock inside add_local_application_state
_gen_id = new_gen_id;
co_await _gossiper.add_local_application_state(
std::pair(gms::application_state::CDC_GENERATION_ID, gms::versioned_value::cdc_generation_id(new_gen_id)),
std::pair(gms::application_state::STATUS, *status)
);
co_await _sys_ks.local().update_cdc_generation_id(new_gen_id);
}
future<> generation_service::handle_cdc_generation(cdc::generation_id_v2 gen_id) {
auto ts = get_ts(gen_id);
if (co_await container().map_reduce(and_reducer(), [ts] (generation_service& svc) {
return !svc._cdc_metadata.prepare(ts);
@@ -429,8 +1013,171 @@ future<> generation_service::handle_cdc_generation(cdc::generation_id gen_id) {
}
}
future<> generation_service::legacy_handle_cdc_generation(std::optional<cdc::generation_id> gen_id) {
assert_shard_zero(__PRETTY_FUNCTION__);
if (!gen_id) {
co_return;
}
if (!_sys_dist_ks.local_is_initialized() || !_sys_dist_ks.local().started()) {
on_internal_error(cdc_log, "Legacy handle CDC generation with sys.dist.ks. down");
}
// The service should not be listening for generation changes until after the node
// is bootstrapped and since the node leaves the ring on decommission
if (co_await container().map_reduce(and_reducer(), [ts = get_ts(*gen_id)] (generation_service& svc) {
return !svc._cdc_metadata.prepare(ts);
})) {
co_return;
}
bool using_this_gen = false;
try {
using_this_gen = co_await legacy_do_handle_cdc_generation_intercept_nonfatal_errors(*gen_id);
} catch (generation_handling_nonfatal_exception& e) {
cdc_log.warn(could_not_retrieve_msg_template, gen_id, e.what(), "retrying in the background");
legacy_async_handle_cdc_generation(*gen_id);
co_return;
} catch (...) {
cdc_log.error(could_not_retrieve_msg_template, gen_id, std::current_exception(), "not retrying");
co_return; // Exotic ("fatal") exception => do not retry
}
if (using_this_gen) {
cdc_log.info("Starting to use generation {}", *gen_id);
co_await update_streams_description(*gen_id, _sys_ks.local(), get_sys_dist_ks(),
[&tm = _token_metadata] { return tm.get()->count_normal_token_owners(); },
_abort_src);
}
}
void generation_service::legacy_async_handle_cdc_generation(cdc::generation_id gen_id) {
assert_shard_zero(__PRETTY_FUNCTION__);
(void)(([] (cdc::generation_id gen_id, shared_ptr<generation_service> svc) -> future<> {
while (true) {
co_await sleep_abortable(std::chrono::seconds(5), svc->_abort_src);
try {
bool using_this_gen = co_await svc->legacy_do_handle_cdc_generation_intercept_nonfatal_errors(gen_id);
if (using_this_gen) {
cdc_log.info("Starting to use generation {}", gen_id);
co_await update_streams_description(gen_id, svc->_sys_ks.local(), svc->get_sys_dist_ks(),
[&tm = svc->_token_metadata] { return tm.get()->count_normal_token_owners(); },
svc->_abort_src);
}
co_return;
} catch (generation_handling_nonfatal_exception& e) {
cdc_log.warn(could_not_retrieve_msg_template, gen_id, e.what(), "continuing to retry in the background");
} catch (...) {
cdc_log.error(could_not_retrieve_msg_template, gen_id, std::current_exception(), "not retrying anymore");
co_return; // Exotic ("fatal") exception => do not retry
}
if (co_await svc->container().map_reduce(and_reducer(), [ts = get_ts(gen_id)] (generation_service& svc) {
return svc._cdc_metadata.known_or_obsolete(ts);
})) {
co_return;
}
}
})(gen_id, shared_from_this()));
}
future<> generation_service::legacy_scan_cdc_generations() {
assert_shard_zero(__PRETTY_FUNCTION__);
std::optional<cdc::generation_id> latest;
_gossiper.for_each_endpoint_state([&] (const gms::endpoint_state& eps) {
auto gen_id = get_generation_id_for(eps.get_host_id(), eps);
if (!latest || (gen_id && get_ts(*gen_id) > get_ts(*latest))) {
latest = gen_id;
}
});
if (latest) {
cdc_log.info("Latest generation seen during startup: {}", *latest);
co_await legacy_handle_cdc_generation(latest);
} else {
cdc_log.info("No generation seen during startup.");
}
}
future<bool> generation_service::legacy_do_handle_cdc_generation_intercept_nonfatal_errors(cdc::generation_id gen_id) {
assert_shard_zero(__PRETTY_FUNCTION__);
// Use futurize_invoke to catch all exceptions from legacy_do_handle_cdc_generation.
return futurize_invoke([this, gen_id] {
return legacy_do_handle_cdc_generation(gen_id);
}).handle_exception([] (std::exception_ptr ep) -> future<bool> {
try {
std::rethrow_exception(ep);
} catch (exceptions::request_timeout_exception& e) {
throw generation_handling_nonfatal_exception(e.what());
} catch (exceptions::unavailable_exception& e) {
throw generation_handling_nonfatal_exception(e.what());
} catch (exceptions::read_failure_exception& e) {
throw generation_handling_nonfatal_exception(e.what());
} catch (...) {
const auto ep = std::current_exception();
if (is_timeout_exception(ep)) {
throw generation_handling_nonfatal_exception(format("{}", ep));
}
throw;
}
});
}
// Legacy (gossiper-based topology changes) path for adopting a CDC generation.
//
// Retrieves the generation identified by `gen_id` from the distributed system
// tables; if it carries the greatest timestamp seen so far, persists it locally
// and starts gossiping it as our CDC_GENERATION_ID application state.
//
// Returns `true` iff the generation was inserted into `_cdc_metadata` on at
// least one shard, i.e. this node might write CDC log entries using streams
// from this generation.
//
// Throws generation_handling_nonfatal_exception when the generation data is
// not (yet) available — callers treat this as retryable.
// Must be called on shard 0.
future<bool> generation_service::legacy_do_handle_cdc_generation(cdc::generation_id gen_id) {
    assert_shard_zero(__PRETTY_FUNCTION__);

    // Pin the sys_dist_ks instance so it stays alive for the duration of this
    // coroutine (deinitialization order is broken; see get_sys_dist_ks()).
    auto sys_dist_ks = get_sys_dist_ks();
    auto gen = co_await retrieve_generation_data(gen_id, _sys_ks.local(), *sys_dist_ks, { _token_metadata.get()->count_normal_token_owners() });
    if (!gen) {
        // This may happen during raft upgrade when a node gossips about a generation that
        // was propagated through raft and we didn't apply it yet.
        throw generation_handling_nonfatal_exception(fmt::format(
            "Could not find CDC generation {} in distributed system tables (current time: {}),"
            " even though some node gossiped about it.",
            gen_id, db_clock::now()));
    }

    // We always gossip about the generation with the greatest timestamp. Specific nodes may remember older generations,
    // but eventually they forget when their clocks move past the latest generation's timestamp.
    // The cluster as a whole is only interested in the last generation so restarting nodes may learn what it is.
    // We assume that generation changes don't happen ``too often'' so every node can learn about a generation
    // before it is superseded by a newer one which causes nodes to start gossiping the about the newer one.
    // The assumption follows from the requirement of bootstrapping nodes sequentially.
    if (!_gen_id || get_ts(*_gen_id) < get_ts(gen_id)) {
        _gen_id = gen_id;
        // Persist first, then gossip: if we crash in between, we restart with the
        // persisted ID and resume gossiping it; the reverse order could gossip an
        // ID we'd forget after a restart.
        co_await _sys_ks.local().update_cdc_generation_id(gen_id);
        co_await _gossiper.add_local_application_state(
                gms::application_state::CDC_GENERATION_ID, gms::versioned_value::cdc_generation_id(gen_id));
    }

    // Return `true` iff the generation was inserted on any of our shards.
    co_return co_await container().map_reduce(or_reducer(),
            [ts = get_ts(gen_id), &gen] (generation_service& svc) -> future<bool> {
        // We need to copy it here before awaiting anything to avoid destruction of the captures.
        const auto timestamp = ts;
        topology_description gen_copy = co_await gen->clone_async();
        co_return svc._cdc_metadata.insert(timestamp, std::move(gen_copy));
    });
}
// Return a shared pointer to the local system_distributed_keyspace instance.
// Returning the shared pointer (rather than a reference) keeps the instance
// alive for the caller — generation_service code may race with sys_dist_ks
// deinitialization, so background tasks must pin it this way.
// Must be called on shard 0; throws if the keyspace was never initialized.
shared_ptr<db::system_distributed_keyspace> generation_service::get_sys_dist_ks() {
    assert_shard_zero(__PRETTY_FUNCTION__);
    if (_sys_dist_ks.local_is_initialized()) {
        return _sys_dist_ks.local_shared();
    }
    throw std::runtime_error("system distributed keyspace not initialized");
}
// Extract the timestamp from a generation ID of either version.
db_clock::time_point get_ts(const generation_id& gen_id) {
    // `generation_id` is a std::variant of generation_id_v1/generation_id_v2;
    // both alternatives carry a `ts` member, so one generic visitor covers both.
    // (An unconditional `return gen_id.ts;` preceded this visit in the previous
    // revision; on the variant type it is both dead code and a compile error.)
    return std::visit([] (auto& id) { return id.ts; }, gen_id);
}
future<mutation> create_table_streams_mutation(table_id table, db_clock::time_point stream_ts, const locator::tablet_map& map, api::timestamp_type ts) {

View File

@@ -34,6 +34,16 @@ namespace seastar {
class abort_source;
} // namespace seastar
namespace db {
class config;
class system_distributed_keyspace;
} // namespace db
namespace gms {
class inet_address;
class gossiper;
} // namespace gms
namespace locator {
class tablet_map;
} // namespace locator
@@ -143,6 +153,23 @@ struct cdc_stream_diff {
using table_streams = std::map<api::timestamp_type, committed_stream_set>;
// Thrown when the stream/topology data of a CDC generation cannot be found
// for the given generation ID.
class no_generation_data_exception : public std::runtime_error {
public:
    // NOTE(review): the parameter is a full generation ID, not just a
    // timestamp, despite the `generation_ts` name — consider renaming.
    no_generation_data_exception(cdc::generation_id generation_ts)
        : std::runtime_error(fmt::format("could not find generation data for timestamp {}", generation_ts))
    {}
};
/* Should be called when we're restarting and we noticed that we didn't save any streams timestamp in our local tables,
* which means that we're probably upgrading from a non-CDC/old CDC version (another reason could be
* that there's a bug, or the user messed with our local tables).
*
* It checks whether we should be the node to propose the first generation of CDC streams.
* The chosen condition is arbitrary, it only tries to make sure that no two nodes propose a generation of streams
* when upgrading, and nothing bad happens if they for some reason do (it's mostly an optimization).
*/
bool should_propose_first_generation(const locator::host_id& me, const gms::gossiper&);
/*
* Checks if the CDC generation is optimal, which is true if its `topology_description` is consistent
* with `token_metadata`.

View File

@@ -15,22 +15,48 @@
namespace cdc {
// Version 1 CDC generation ID: identified solely by the generation's timestamp.
struct generation_id_v1 {
    db_clock::time_point ts;

    bool operator==(const generation_id_v1&) const = default;
};
struct generation_id {
struct generation_id_v2 {
db_clock::time_point ts;
utils::UUID id;
bool operator==(const generation_id&) const = default;
bool operator==(const generation_id_v2&) const = default;
};
using generation_id = std::variant<generation_id_v1, generation_id_v2>;
db_clock::time_point get_ts(const generation_id&);
} // namespace cdc
// fmt formatter for a v1 generation ID: prints the bare timestamp.
template <>
struct fmt::formatter<cdc::generation_id_v1> {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    template <typename Ctx>
    auto format(const cdc::generation_id_v1& gen_id, Ctx& fc) const {
        const auto& timestamp = gen_id.ts;
        return fmt::format_to(fc.out(), "{}", timestamp);
    }
};
// fmt formatter for a v2 generation ID: prints "(timestamp, uuid)".
template <>
struct fmt::formatter<cdc::generation_id_v2> {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    template <typename Ctx>
    auto format(const cdc::generation_id_v2& gen_id, Ctx& fc) const {
        const auto& timestamp = gen_id.ts;
        const auto& uuid = gen_id.id;
        return fmt::format_to(fc.out(), "({}, {})", timestamp, uuid);
    }
};
// fmt formatter for a generation ID of either version: dispatches to the
// formatter of the active variant alternative (v1 prints "ts", v2 "(ts, id)").
template <>
struct fmt::formatter<cdc::generation_id> {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    template <typename FormatContext>
    auto format(const cdc::generation_id& gen_id, FormatContext& ctx) const {
        // `generation_id` is a std::variant; direct member access
        // (`gen_id.ts`, as the previous revision's dead line did) would not
        // compile, so visit the active alternative and delegate to its formatter.
        return std::visit([&ctx] (auto& id) {
            return fmt::format_to(ctx.out(), "{}", id);
        }, gen_id);
    }
};

View File

@@ -11,51 +11,135 @@
#include <seastar/core/sharded.hh>
#include "cdc/metadata.hh"
#include "cdc/generation_id.hh"
#include "gms/i_endpoint_state_change_subscriber.hh"
namespace db {
class system_distributed_keyspace;
class system_keyspace;
}
namespace gms {
class gossiper;
class feature_service;
}
namespace seastar {
class abort_source;
}
namespace locator {
class shared_token_metadata;
class tablet_map;
}
namespace cdc {
class generation_service : public peering_sharded_service<generation_service>
, public async_sharded_service<generation_service> {
, public async_sharded_service<generation_service>
, public gms::i_endpoint_state_change_subscriber {
public:
struct config {
unsigned ignore_msb_bits;
std::chrono::milliseconds ring_delay;
bool dont_rewrite_streams = false;
};
private:
bool _stopped = false;
// The node has joined the token ring. Set to `true` on `after_join` call.
bool _joined = false;
config _cfg;
gms::gossiper& _gossiper;
sharded<db::system_distributed_keyspace>& _sys_dist_ks;
sharded<db::system_keyspace>& _sys_ks;
abort_source& _abort_src;
const locator::shared_token_metadata& _token_metadata;
gms::feature_service& _feature_service;
replica::database& _db;
/* Maintains the set of known CDC generations used to pick streams for log writes (i.e., the partition keys of these log writes). */
/* Maintains the set of known CDC generations used to pick streams for log writes (i.e., the partition keys of these log writes).
* Updated in response to certain gossip events (see the handle_cdc_generation function).
*/
cdc::metadata _cdc_metadata;
/* The latest known generation timestamp and the timestamp that we're currently gossiping
* (as CDC_GENERATION_ID application state).
*
* Only shard 0 manages this, hence it will be std::nullopt on all shards other than 0.
* This timestamp is also persisted in the system.cdc_local table.
*
* On shard 0 this may be nullopt only in one special case: rolling upgrade, when we upgrade
* from an old version of Scylla that didn't support CDC. In that case one node in the cluster
* will create the first generation and start gossiping it; it may be us, or it may be some
* different node. In any case, eventually - after one of the nodes gossips the first timestamp
* - we'll catch on and this variable will be updated with that generation.
*/
std::optional<cdc::generation_id> _gen_id;
future<> _cdc_streams_rewrite_complete = make_ready_future<>();
public:
generation_service(config cfg,
generation_service(config cfg, gms::gossiper&,
sharded<db::system_distributed_keyspace>&,
sharded<db::system_keyspace>& sys_ks,
replica::database& db);
abort_source&, const locator::shared_token_metadata&,
gms::feature_service&, replica::database& db);
future<> stop();
~generation_service();
/* After the node bootstraps and creates a new CDC generation, or restarts and loads the last
* known generation timestamp from persistent storage, this function should be called with
* that generation timestamp moved in as the `startup_gen_id` parameter.
* This passes the responsibility of managing generations from the node startup code to this service;
* until then, the service remains dormant.
* The startup code is in `storage_service::join_topology`, hence
* `after_join` should be called at the end of that function.
* Precondition: the node has completed bootstrapping and system_distributed_keyspace is initialized.
* Must be called on shard 0 - that's where the generation management happens.
*/
future<> after_join(std::optional<cdc::generation_id>&& startup_gen_id);
future<> leave_ring();
cdc::metadata& get_cdc_metadata() {
return _cdc_metadata;
}
virtual future<> on_join(gms::inet_address, locator::host_id id, gms::endpoint_state_ptr, gms::permit_id) override;
virtual future<> on_change(gms::inet_address, locator::host_id id, const gms::application_state_map&, gms::permit_id) override;
future<> check_and_repair_cdc_streams();
/* Generate a new set of CDC streams and insert it into the internal distributed CDC generations table.
* Returns the ID of this new generation.
*
* Should be called when starting the node for the first time (i.e., joining the ring).
*
* Assumes that the system_distributed_keyspace service is initialized.
* `cluster_supports_generations_v2` must be `true` if and only if the `CDC_GENERATIONS_V2` feature is enabled.
*
* If `CDC_GENERATIONS_V2` is enabled, the new generation will be inserted into
* `system_distributed_everywhere.cdc_generation_descriptions_v2` and the returned ID will be in the v2 format.
* Otherwise the new generation will be limited in size, causing suboptimal stream distribution, it will be inserted
* into `system_distributed.cdc_generation_descriptions` and the returned ID will be in the v1 format.
* The second case should happen only when we create new generations in a mixed cluster.
*
* The caller of this function is expected to insert the ID into the gossiper as fast as possible,
* so that other nodes learn about the generation before their clocks cross the generation's timestamp
* (not guaranteed in the current implementation, but expected to be the common case;
* we assume that `ring_delay` is enough for other nodes to learn about the new generation).
*
* Legacy: used for gossiper-based topology changes.
*/
future<cdc::generation_id> legacy_make_new_generation(
const std::unordered_set<dht::token>& bootstrap_tokens, bool add_delay);
/* Retrieve the CDC generation with the given ID from local tables
* and start using it for CDC log writes if it's not obsolete.
* Precondition: the generation was committed using group 0 and locally applied.
*/
future<> handle_cdc_generation(cdc::generation_id);
future<> handle_cdc_generation(cdc::generation_id_v2);
future<> load_cdc_tablet_streams(std::optional<std::unordered_set<table_id>> changed_tables);
@@ -67,6 +151,56 @@ public:
future<utils::chunked_vector<mutation>> garbage_collect_cdc_streams_for_table(table_id table, std::optional<std::chrono::seconds> ttl, api::timestamp_type ts);
future<> garbage_collect_cdc_streams(utils::chunked_vector<canonical_mutation>& muts, api::timestamp_type ts);
private:
/* Retrieve the CDC generation which starts at the given timestamp (from a distributed table created for this purpose)
* and start using it for CDC log writes if it's not obsolete.
*
* Legacy: used for gossiper-based topology changes.
*/
future<> legacy_handle_cdc_generation(std::optional<cdc::generation_id>);
/* If `legacy_handle_cdc_generation` fails, it schedules an asynchronous retry in the background
* using `legacy_async_handle_cdc_generation`.
*
* Legacy: used for gossiper-based topology changes.
*/
void legacy_async_handle_cdc_generation(cdc::generation_id);
/* Wrapper around `legacy_do_handle_cdc_generation` which intercepts timeout/unavailability exceptions.
* Returns: legacy_do_handle_cdc_generation(ts).
*
* Legacy: used for gossiper-based topology changes.
*/
future<bool> legacy_do_handle_cdc_generation_intercept_nonfatal_errors(cdc::generation_id);
/* Returns `true` iff we started using the generation (it was not obsolete or already known),
* which means that this node might write some CDC log entries using streams from this generation.
*
* Legacy: used for gossiper-based topology changes.
*/
future<bool> legacy_do_handle_cdc_generation(cdc::generation_id);
/* Scan CDC generation timestamps gossiped by other nodes and retrieve the latest one.
* This function should be called once at the end of the node startup procedure
* (after the node is started and running normally, it will retrieve generations on gossip events instead).
*
* Legacy: used for gossiper-based topology changes.
*/
future<> legacy_scan_cdc_generations();
/* generation_service code might be racing with system_distributed_keyspace deinitialization
* (the deinitialization order is broken).
* Therefore, whenever we want to access sys_dist_ks in a background task,
* we need to check if the instance is still there. Storing the shared pointer will keep it alive.
*/
shared_ptr<db::system_distributed_keyspace> get_sys_dist_ks();
/* Part of the upgrade procedure. Useful in case where the version of Scylla that we're upgrading from
* used the "cdc_streams_descriptions" table. This procedure ensures that the new "cdc_streams_descriptions_v2"
* table contains streams of all generations that were present in the old table and may still contain data
* (i.e. there exist CDC log tables that may contain rows with partition keys being the stream IDs from
* these generations). */
future<> maybe_rewrite_streams_descriptions();
};
} // namespace cdc

View File

@@ -76,14 +76,14 @@ struct partition_deletion {
using clustered_column_set = std::map<clustering_key, cdc::one_kind_column_set, clustering_key::less_compare>;
template <typename Container>
template<typename Container>
concept EntryContainer = requires(Container& container) {
// Parenthesized due to https://bugs.llvm.org/show_bug.cgi?id=45088
{ (container.atomic_entries) } -> std::same_as<std::vector<atomic_column_update>&>;
{ (container.nonatomic_entries) } -> std::same_as<std::vector<nonatomic_column_update>&>;
};
template <EntryContainer Container>
template<EntryContainer Container>
static void add_columns_affected_by_entries(cdc::one_kind_column_set& cset, const Container& cont) {
for (const auto& entry : cont.atomic_entries) {
cset.set(entry.id);
@@ -134,7 +134,7 @@ struct batch {
ret.emplace(clustering_key::make_empty(), all_columns);
}
auto process_change_type = [&](const auto& changes) {
auto process_change_type = [&] (const auto& changes) {
for (const auto& change : changes) {
auto& cset = ret[change.key];
cset.resize(s.regular_columns_count());
@@ -211,9 +211,7 @@ private:
public:
extract_collection_visitor(column_id id, std::map<change_key_t, row_update>& updates)
: _id(id)
, _updates(updates) {
}
: _id(id), _updates(updates) {}
void collection_tombstone(const tombstone& t) {
auto& entry = get_or_append_entry(t.timestamp + 1, gc_clock::duration(0));
@@ -228,9 +226,7 @@ public:
cell(key, c);
}
constexpr bool finished() const {
return false;
}
constexpr bool finished() const { return false; }
};
/* Visits all cells and tombstones in a row, putting the encountered changes into buckets
@@ -253,46 +249,41 @@ struct extract_row_visitor {
void collection_column(const column_definition& cdef, auto&& visit_collection) {
visit(*cdef.type, make_visitor(
[&](const collection_type_impl& ctype) {
struct collection_visitor : public extract_collection_visitor<collection_visitor> {
data_type _value_type;
[&] (const collection_type_impl& ctype) {
struct collection_visitor : public extract_collection_visitor<collection_visitor> {
data_type _value_type;
collection_visitor(column_id id, std::map<change_key_t, row_update>& updates, const collection_type_impl& ctype)
: extract_collection_visitor<collection_visitor>(id, updates)
, _value_type(ctype.value_comparator()) {
}
collection_visitor(column_id id, std::map<change_key_t, row_update>& updates, const collection_type_impl& ctype)
: extract_collection_visitor<collection_visitor>(id, updates), _value_type(ctype.value_comparator()) {}
data_type get_value_type(bytes_view) {
return _value_type;
}
} v(cdef.id, _updates, ctype);
data_type get_value_type(bytes_view) {
return _value_type;
}
} v(cdef.id, _updates, ctype);
visit_collection(v);
},
[&](const user_type_impl& utype) {
struct udt_visitor : public extract_collection_visitor<udt_visitor> {
const user_type_impl& _utype;
visit_collection(v);
},
[&] (const user_type_impl& utype) {
struct udt_visitor : public extract_collection_visitor<udt_visitor> {
const user_type_impl& _utype;
udt_visitor(column_id id, std::map<change_key_t, row_update>& updates, const user_type_impl& utype)
: extract_collection_visitor<udt_visitor>(id, updates)
, _utype(utype) {
}
udt_visitor(column_id id, std::map<change_key_t, row_update>& updates, const user_type_impl& utype)
: extract_collection_visitor<udt_visitor>(id, updates), _utype(utype) {}
data_type get_value_type(bytes_view key) {
return _utype.type(deserialize_field_index(key));
}
} v(cdef.id, _updates, utype);
data_type get_value_type(bytes_view key) {
return _utype.type(deserialize_field_index(key));
}
} v(cdef.id, _updates, utype);
visit_collection(v);
},
[&](const abstract_type& o) {
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
}));
visit_collection(v);
},
[&] (const abstract_type& o) {
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
}
));
}
constexpr bool finished() const {
return false;
}
constexpr bool finished() const { return false; }
};
struct extract_changes_visitor {
@@ -302,8 +293,12 @@ struct extract_changes_visitor {
extract_row_visitor v;
visit_row_cells(v);
for (auto& [ts_ttl, row_update] : v._updates) {
_result[ts_ttl.first].static_updates.push_back({ts_ttl.second, std::move(row_update.atomic_entries), std::move(row_update.nonatomic_entries)});
for (auto& [ts_ttl, row_update]: v._updates) {
_result[ts_ttl.first].static_updates.push_back({
ts_ttl.second,
std::move(row_update.atomic_entries),
std::move(row_update.nonatomic_entries)
});
}
}
@@ -324,18 +319,24 @@ struct extract_changes_visitor {
} v;
visit_row_cells(v);
for (auto& [ts_ttl, row_update] : v._updates) {
for (auto& [ts_ttl, row_update]: v._updates) {
// It is important that changes in the resulting `set_of_changes` are listed
// in increasing TTL order. The reason is explained in a comment in cdc/log.cc,
// search for "#6070".
auto [ts, ttl] = ts_ttl;
if (v._marker && ts == v._marker_ts && ttl == v._marker_ttl) {
_result[ts].clustered_inserts.push_back({ttl, ckey, *v._marker, std::move(row_update.atomic_entries), {}});
_result[ts].clustered_inserts.push_back({
ttl,
ckey,
*v._marker,
std::move(row_update.atomic_entries),
{}
});
auto& cr_insert = _result[ts].clustered_inserts.back();
bool clustered_update_exists = false;
for (auto& nonatomic_up : row_update.nonatomic_entries) {
for (auto& nonatomic_up: row_update.nonatomic_entries) {
// Updating a collection column with an INSERT statement implies inserting a tombstone.
//
// For example, suppose that we have:
@@ -361,7 +362,12 @@ struct extract_changes_visitor {
cr_insert.nonatomic_entries.push_back(std::move(nonatomic_up));
} else {
if (!clustered_update_exists) {
_result[ts].clustered_updates.push_back({ttl, ckey, {}, {}});
_result[ts].clustered_updates.push_back({
ttl,
ckey,
{},
{}
});
// Multiple iterations of this `for` loop (for different collection columns)
// might want to put their `nonatomic_up`s into an UPDATE change;
@@ -384,7 +390,12 @@ struct extract_changes_visitor {
}
}
} else {
_result[ts].clustered_updates.push_back({ttl, ckey, std::move(row_update.atomic_entries), std::move(row_update.nonatomic_entries)});
_result[ts].clustered_updates.push_back({
ttl,
ckey,
std::move(row_update.atomic_entries),
std::move(row_update.nonatomic_entries)
});
}
}
}
@@ -401,9 +412,7 @@ struct extract_changes_visitor {
_result[t.timestamp].partition_deletions = partition_deletion{t};
}
constexpr bool finished() const {
return false;
}
constexpr bool finished() const { return false; }
};
set_of_changes extract_changes(const mutation& m) {
@@ -417,23 +426,13 @@ namespace cdc {
struct find_timestamp_visitor {
api::timestamp_type _ts = api::missing_timestamp;
bool finished() const {
return _ts != api::missing_timestamp;
}
bool finished() const { return _ts != api::missing_timestamp; }
void visit(api::timestamp_type ts) {
_ts = ts;
}
void visit(const atomic_cell_view& cell) {
visit(cell.timestamp());
}
void visit(api::timestamp_type ts) { _ts = ts; }
void visit(const atomic_cell_view& cell) { visit(cell.timestamp()); }
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
visit(cell);
}
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
visit(cell);
}
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
void collection_tombstone(const tombstone& t) {
// A collection tombstone with timestamp T can be created with:
// UPDATE ks.t USING TIMESTAMP T + 1 SET X = null WHERE ...
@@ -442,33 +441,15 @@ struct find_timestamp_visitor {
// with cdc$time using timestamp T + 1 instead of T.
visit(t.timestamp + 1);
}
void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
visit(cell);
}
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) {
visit(cell);
}
void collection_column(const column_definition&, auto&& visit_collection) {
visit_collection(*this);
}
void marker(const row_marker& rm) {
visit(rm.timestamp());
}
void static_row_cells(auto&& visit_row_cells) {
visit_row_cells(*this);
}
void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) {
visit_row_cells(*this);
}
void clustered_row_delete(const clustering_key&, const tombstone& t) {
visit(t.timestamp);
}
void range_delete(const range_tombstone& t) {
visit(t.tomb.timestamp);
}
void partition_delete(const tombstone& t) {
visit(t.timestamp);
}
void live_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
void marker(const row_marker& rm) { visit(rm.timestamp()); }
void static_row_cells(auto&& visit_row_cells) { visit_row_cells(*this); }
void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) { visit_row_cells(*this); }
void clustered_row_delete(const clustering_key&, const tombstone& t) { visit(t.timestamp); }
void range_delete(const range_tombstone& t) { visit(t.tomb.timestamp); }
void partition_delete(const tombstone& t) { visit(t.timestamp); }
};
/* Find some timestamp inside the given mutation.
@@ -524,12 +505,8 @@ struct should_split_visitor {
virtual ~should_split_visitor() = default;
inline bool finished() const {
return _result;
}
inline void stop() {
_result = true;
}
inline bool finished() const { return _result; }
inline void stop() { _result = true; }
void visit(api::timestamp_type ts, gc_clock::duration ttl = gc_clock::duration(0)) {
if (_ts != api::missing_timestamp && _ts != ts) {
@@ -540,23 +517,15 @@ struct should_split_visitor {
if (_ttl && *_ttl != ttl) {
return stop();
}
_ttl = {ttl};
_ttl = { ttl };
}
void visit(const atomic_cell_view& cell) {
visit(cell.timestamp(), get_ttl(cell));
}
void visit(const atomic_cell_view& cell) { visit(cell.timestamp(), get_ttl(cell)); }
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
visit(cell);
}
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) {
visit(cell);
}
void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
void collection_tombstone(const tombstone& t) {
visit(t.timestamp + 1);
}
void collection_tombstone(const tombstone& t) { visit(t.timestamp + 1); }
virtual void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
if (_had_row_marker) {
@@ -565,12 +534,8 @@ struct should_split_visitor {
}
visit(cell);
}
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) {
visit(cell);
}
void collection_column(const column_definition&, auto&& visit_collection) {
visit_collection(*this);
}
void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
virtual void marker(const row_marker& rm) {
_had_row_marker = true;
@@ -641,8 +606,8 @@ bool should_split(const mutation& m, const per_request_options& options) {
cdc::inspect_mutation(m, v);
return v._result
// A mutation with no timestamp will be split into 0 mutations:
|| v._ts == api::missing_timestamp;
// A mutation with no timestamp will be split into 0 mutations:
|| v._ts == api::missing_timestamp;
}
// Returns true if the row state and the atomic and nonatomic entries represent
@@ -677,7 +642,7 @@ static bool entries_match_row_state(const schema_ptr& base_schema, const cell_ma
if (current_values.size() != update.cells.size()) {
return false;
}
std::unordered_map<sstring_view, bytes> current_values_map;
for (const auto& entry : current_values) {
const auto attr_name = std::string_view(value_cast<sstring>(entry.first));
@@ -746,8 +711,8 @@ bool should_skip(batch& changes, const mutation& base_mutation, change_processor
return true;
}
void process_changes_with_splitting(
const mutation& base_mutation, change_processor& processor, bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
const auto base_schema = base_mutation.schema();
auto changes = extract_changes(base_mutation);
auto pk = base_mutation.key();
@@ -859,8 +824,8 @@ void process_changes_with_splitting(
}
}
void process_changes_without_splitting(
const mutation& base_mutation, change_processor& processor, bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
if (alternator_strict_compatibility) {
auto changes = extract_changes(base_mutation);
if (should_skip(changes.begin()->second, base_mutation, processor)) {
@@ -877,7 +842,7 @@ void process_changes_without_splitting(
one_kind_column_set columns{base_schema->static_columns_count()};
if (!p.static_row().empty()) {
p.static_row().get().for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
columns.set(id);
});
processor.produce_preimage(nullptr, columns);
@@ -890,7 +855,7 @@ void process_changes_without_splitting(
// Row deleted - include all columns in preimage
columns.set(0, base_schema->regular_columns_count(), true);
} else {
cr.row().cells().for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
columns.set(id);
});
}

View File

@@ -48,7 +48,6 @@
#include "mutation/mutation_fragment_stream_validator.hh"
#include "utils/assert.hh"
#include "utils/error_injection.hh"
#include "utils/chunked_vector.hh"
#include "utils/pretty_printers.hh"
#include "readers/multi_range.hh"
#include "readers/compacting.hh"
@@ -162,7 +161,6 @@ std::string_view to_string(compaction_type type) {
case compaction_type::Reshape: return "Reshape";
case compaction_type::Split: return "Split";
case compaction_type::Major: return "Major";
case compaction_type::RewriteComponent: return "RewriteComponent";
}
on_internal_error_noexcept(clogger, format("Invalid compaction type {}", int(type)));
return "(invalid)";
@@ -612,23 +610,23 @@ private:
}
// Called in a seastar thread
utils::chunked_vector<dht::partition_range>
dht::partition_range_vector
get_ranges_for_invalidation(const std::vector<sstables::shared_sstable>& sstables) {
// If owned ranges is disengaged, it means no cleanup work was done and
// so nothing needs to be invalidated.
if (!_owned_ranges) {
return {};
return dht::partition_range_vector{};
}
auto owned_ranges = dht::to_partition_ranges_chunked(*_owned_ranges).get();
auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
auto non_owned_ranges = sstables
| std::views::transform([] (const sstables::shared_sstable& sst) {
seastar::thread::maybe_yield();
return dht::partition_range::make({sst->get_first_decorated_key(), true},
{sst->get_last_decorated_key(), true});
}) | std::ranges::to<utils::chunked_vector<dht::partition_range>>();
}) | std::ranges::to<dht::partition_range_vector>();
return dht::subtract_ranges(*_schema, std::move(non_owned_ranges), std::move(owned_ranges)).get();
return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
}
protected:
compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
@@ -722,8 +720,8 @@ protected:
compaction_completion_desc
get_compaction_completion_desc(std::vector<sstables::shared_sstable> input_sstables, std::vector<sstables::shared_sstable> output_sstables) {
auto ranges = get_ranges_for_invalidation(input_sstables);
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges)};
auto ranges_for_for_invalidation = get_ranges_for_invalidation(input_sstables);
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
}
// Tombstone expiration is enabled based on the presence of sstable set.
@@ -2052,7 +2050,6 @@ compaction_type compaction_type_options::type() const {
compaction_type::Reshape,
compaction_type::Split,
compaction_type::Major,
compaction_type::RewriteComponent,
};
static_assert(std::variant_size_v<compaction_type_options::options_variant> == std::size(index_to_type));
return index_to_type[_options.index()];
@@ -2089,9 +2086,6 @@ static std::unique_ptr<compaction> make_compaction(compaction_group_view& table_
std::unique_ptr<compaction> operator()(compaction_type_options::split split_options) {
return std::make_unique<split_compaction>(table_s, std::move(descriptor), cdata, std::move(split_options), progress_monitor);
}
std::unique_ptr<compaction> operator()(compaction_type_options::component_rewrite) {
throw std::runtime_error("component_rewrite compaction should be handled separately");
}
} visitor_factory{table_s, std::move(descriptor), cdata, progress_monitor};
return descriptor.options.visit(visitor_factory);
@@ -2109,7 +2103,7 @@ static future<compaction_result> scrub_sstables_validate_mode(compaction_descrip
validation_errors += co_await sst->validate(permit, cdata.abort, [&schema] (sstring what) {
scrub_compaction::report_validation_error(compaction_type::Scrub, *schema, what);
}, monitor_generator(sst), true);
}, monitor_generator(sst));
// Did validation actually finish because aborted?
if (cdata.is_stop_requested()) {
// Compaction manager will catch this exception and re-schedule the compaction.
@@ -2146,34 +2140,6 @@ future<compaction_result> scrub_sstables_validate_mode(compaction_descriptor des
co_return res;
}
// Rewrite a single component of each input sstable in place of a full
// compaction: for every input sstable, produce a new sstable that presumably
// reuses the unchanged components and carries the rewritten one — TODO confirm
// against `link_with_rewritten_component`'s contract.
// The blocking `.get()` calls require a seastar thread context, hence the
// `seastar::async` wrapper. The inputs are swapped for the rewritten sstables
// via the descriptor's replacer before returning.
future<compaction_result> rewrite_sstables_component(compaction_descriptor descriptor, compaction_group_view& table_s) {
    return seastar::async([descriptor = std::move(descriptor), &table_s] () mutable {
        compaction_result result {
            .stats = {
                .started_at = db_clock::now(),
            },
        };
        const auto& options = descriptor.options.as<compaction_type_options::component_rewrite>();
        bool update_id = static_cast<bool>(options.update_id);
        // When rewriting a component, we cannot use the standard descriptor creator
        // because we must preserve the sstable version.
        auto creator = [&table_s] (sstables::shared_sstable sst) {
            return table_s.make_sstable(sst->state(), sst->get_version());
        };
        result.new_sstables.reserve(descriptor.sstables.size());
        for (auto& sst : descriptor.sstables) {
            // Blocking within the seastar::async thread; one rewrite per sstable.
            auto rewritten = sst->link_with_rewritten_component(creator, options.component_to_rewrite, options.modifier, update_id).get();
            result.new_sstables.push_back(rewritten);
        }
        // Replace the input sstables with the rewritten ones in the sstable set.
        descriptor.replacer({std::move(descriptor.sstables), result.new_sstables});
        result.stats.ended_at = db_clock::now();
        return result;
    });
}
future<compaction_result>
compact_sstables(compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor) {
if (descriptor.sstables.empty()) {
@@ -2185,9 +2151,6 @@ compact_sstables(compaction_descriptor descriptor, compaction_data& cdata, compa
// Bypass the usual compaction machinery for dry-mode scrub
return scrub_sstables_validate_mode(std::move(descriptor), cdata, table_s, progress_monitor);
}
if (descriptor.options.type() == compaction_type::RewriteComponent) {
return rewrite_sstables_component(std::move(descriptor), table_s);
}
return compaction::run(make_compaction(table_s, std::move(descriptor), cdata, progress_monitor));
}

View File

@@ -12,12 +12,10 @@
#include <functional>
#include <optional>
#include <variant>
#include "sstables/component_type.hh"
#include "sstables/types_fwd.hh"
#include "sstables/sstable_set.hh"
#include "compaction_fwd.hh"
#include "mutation_writer/token_group_based_splitting_writer.hh"
#include "utils/chunked_vector.hh"
namespace compaction {
@@ -32,7 +30,6 @@ enum class compaction_type {
Reshape = 7,
Split = 8,
Major = 9,
RewriteComponent = 10,
};
struct compaction_completion_desc {
@@ -41,7 +38,7 @@ struct compaction_completion_desc {
// New, fresh SSTables that should be added to SSTable set, replacing the old ones.
std::vector<sstables::shared_sstable> new_sstables;
// Set of compacted partition ranges that should be invalidated in the cache.
utils::chunked_vector<dht::partition_range> ranges_for_cache_invalidation;
dht::partition_range_vector ranges_for_cache_invalidation;
};
// creates a new SSTable for a given shard
@@ -93,15 +90,8 @@ public:
struct split {
mutation_writer::classify_by_token_group classifier;
};
struct component_rewrite {
sstables::component_type component_to_rewrite;
std::function<void(sstables::sstable&)> modifier;
using update_sstable_id = bool_class<class update_sstable_id_tag>;
update_sstable_id update_id = update_sstable_id::yes;
};
private:
using options_variant = std::variant<regular, cleanup, upgrade, scrub, reshard, reshape, split, major, component_rewrite>;
using options_variant = std::variant<regular, cleanup, upgrade, scrub, reshard, reshape, split, major>;
private:
options_variant _options;
@@ -139,10 +129,6 @@ public:
return compaction_type_options(scrub{.operation_mode = mode, .quarantine_sstables = quarantine_sstables, .drop_unfixable = drop_unfixable_sstables});
}
static compaction_type_options make_component_rewrite(component_type component, std::function<void(sstables::sstable&)> modifier, component_rewrite::update_sstable_id update_id = component_rewrite::update_sstable_id::yes) {
return compaction_type_options(component_rewrite{.component_to_rewrite = component, .modifier = std::move(modifier), .update_id = update_id});
}
static compaction_type_options make_split(mutation_writer::classify_by_token_group classifier) {
return compaction_type_options(split{std::move(classifier)});
}

View File

@@ -46,7 +46,6 @@ public:
virtual reader_permit make_compaction_reader_permit() const = 0;
virtual sstables::sstables_manager& get_sstables_manager() noexcept = 0;
virtual sstables::shared_sstable make_sstable(sstables::sstable_state) const = 0;
virtual sstables::shared_sstable make_sstable(sstables::sstable_state, sstables::sstable_version_types) const = 0;
virtual sstables::sstable_writer_config configure_writer(sstring origin) const = 0;
virtual api::timestamp_type min_memtable_timestamp() const = 0;
virtual api::timestamp_type min_memtable_live_timestamp() const = 0;

View File

@@ -946,7 +946,7 @@ sstables::shared_sstable sstables_task_executor::consume_sstable() {
auto sst = _sstables.back();
_sstables.pop_back();
--_cm._stats.pending_tasks; // from this point on, switch_state(pending|active) works the same way as any other task
cmlog.debug("consumed {}", sst->get_filename());
cmlog.debug("{}", format("consumed {}", sst->get_filename()));
return sst;
}
@@ -1208,6 +1208,7 @@ future<> compaction_manager::await_tasks(std::vector<shared_ptr<compaction_task_
std::vector<shared_ptr<compaction_task_executor>>
compaction_manager::do_stop_ongoing_compactions(sstring reason, std::function<bool(const compaction_group_view*)> filter, std::optional<compaction_type> type_opt) noexcept {
auto ongoing_compactions = get_compactions(filter).size();
auto tasks = _tasks
| std::views::filter([&filter, type_opt] (const auto& task) {
return filter(task.compacting_table()) && (!type_opt || task.compaction_type() == *type_opt);
@@ -1216,7 +1217,6 @@ compaction_manager::do_stop_ongoing_compactions(sstring reason, std::function<bo
| std::ranges::to<std::vector<shared_ptr<compaction_task_executor>>>();
logging::log_level level = tasks.empty() ? log_level::debug : log_level::info;
if (cmlog.is_enabled(level)) {
auto ongoing_compactions = get_compactions(filter).size();
std::string scope = "";
if (!tasks.empty()) {
const compaction_group_view* t = tasks.front()->compacting_table();
@@ -1268,15 +1268,9 @@ future<> compaction_manager::start(const db::config& cfg, utils::disk_space_moni
if (dsm && (this_shard_id() == 0)) {
_out_of_space_subscription = dsm->subscribe(cfg.critical_disk_utilization_level, [this] (auto threshold_reached) {
if (threshold_reached) {
return container().invoke_on_all([] (compaction_manager& cm) {
cm._in_critical_disk_utilization_mode = true;
return cm.drain();
});
return container().invoke_on_all([] (compaction_manager& cm) { return cm.drain(); });
}
return container().invoke_on_all([] (compaction_manager& cm) {
cm._in_critical_disk_utilization_mode = false;
cm.enable();
});
return container().invoke_on_all([] (compaction_manager& cm) { cm.enable(); });
});
}
@@ -1426,17 +1420,11 @@ protected:
compaction_strategy cs = t.get_compaction_strategy();
compaction_descriptor descriptor = co_await cs.get_sstables_for_compaction(t, _cm.get_strategy_control());
int weight = calculate_weight(descriptor);
bool debug_enabled = cmlog.is_enabled(log_level::debug);
if (debug_enabled) {
cmlog.debug("Started minor compaction sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
descriptor.sstables, compacting_table()->get_sstables_repaired_at(),
compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
}
cmlog.debug("Started minor compaction sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
descriptor.sstables, compacting_table()->get_sstables_repaired_at(),
compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
sstring old_sstables;
if (debug_enabled) {
old_sstables = ::format("{}", descriptor.sstables);
}
auto old_sstables = ::format("{}", descriptor.sstables);
if (descriptor.sstables.empty() || !can_proceed() || t.is_auto_compaction_disabled_by_user()) {
cmlog.debug("{}: sstables={} can_proceed={} auto_compaction={}", *this, descriptor.sstables.size(), can_proceed(), t.is_auto_compaction_disabled_by_user());
@@ -1466,10 +1454,8 @@ protected:
try {
bool should_update_history = this->should_update_history(descriptor.options.type());
compaction_result res = co_await compact_sstables(std::move(descriptor), _compaction_data, on_replace);
if (debug_enabled) {
cmlog.debug("Finished minor compaction old_sstables={} new_sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
old_sstables, res.new_sstables, compacting_table()->get_sstables_repaired_at(), compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
}
cmlog.debug("Finished minor compaction old_sstables={} new_sstables={} sstables_reapired_at={} range={} uuid={} compaction_uuid={}",
old_sstables, res.new_sstables, compacting_table()->get_sstables_repaired_at(), compacting_table()->token_range(), uuid, _compaction_data.compaction_uuid);
finish_compaction();
if (should_update_history) {
// update_history can take a long time compared to
@@ -1802,41 +1788,6 @@ protected:
}
};
// Task executor that rewrites a single component of each registered sstable,
// one sstable at a time, under the maintenance-operations semaphore.
// Records the mapping old sstable -> rewritten sstable into the caller-owned
// `rewritten_sstables` map (which must outlive this task).
class rewrite_sstables_component_compaction_task_executor final : public rewrite_sstables_compaction_task_executor {
    // Caller-provided output map: old sstable -> its rewritten replacement.
    std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>& _rewritten_sstables;
public:
    rewrite_sstables_component_compaction_task_executor(compaction_manager& mgr,
            throw_if_stopping do_throw_if_stopping,
            compaction_group_view* t,
            tasks::task_id parent_id,
            compaction_type_options options,
            std::vector<sstables::shared_sstable> sstables,
            compacting_sstable_registration compacting,
            std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>& rewritten_sstables)
        // Tombstone purging is disabled: only a component is rewritten, data
        // content must stay untouched.
        : rewrite_sstables_compaction_task_executor(mgr, do_throw_if_stopping, t, parent_id, options, {},
                std::move(sstables), std::move(compacting), compaction_manager::can_purge_tombstones::no, "component_rewrite"),
          _rewritten_sstables(rewritten_sstables)
    {}
protected:
    virtual future<compaction_manager::compaction_stats_opt> do_run() override {
        compaction_stats stats{};
        switch_state(state::pending);
        // Throttle alongside other maintenance operations.
        auto maintenance_permit = co_await acquire_semaphore(_cm._maintenance_ops_sem);
        while (!_sstables.empty()) {
            auto sst = consume_sstable();
            // Insert the key first so the map entry exists even while the
            // rewrite is in flight; filled in once the result is known.
            auto it = _rewritten_sstables.emplace(sst, sstables::shared_sstable{}).first;
            auto res = co_await rewrite_sstable(std::move(sst));
            _cm._validation_errors += res.stats.validation_errors;
            stats += res.stats;
            // A component rewrite produces exactly one replacement sstable.
            it->second = std::move(res.new_sstables.front());
        }
        co_return stats;
    }
};
class split_compaction_task_executor final : public rewrite_sstables_compaction_task_executor {
compaction_type_options::split _opt;
public:
@@ -1950,28 +1901,6 @@ compaction_manager::rewrite_sstables(compaction_group_view& t, compaction_type_o
return perform_task_on_all_files<rewrite_sstables_compaction_task_executor>("rewrite", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_func), throw_if_stopping::no, can_purge, std::move(options_desc));
}
// Rewrite one component of the given sstables of group `t` and wait for
// completion.
//
// Parameters:
//   t                  - compaction group whose sstables are rewritten.
//   sstables           - input sstables; consumed (moved) into the task.
//   options            - must carry component_rewrite options
//                        (see compaction_type_options::make_component_rewrite).
//   rewritten_sstables - out-param mapping old sstable -> rewritten sstable;
//                        must outlive the returned future.
//   info               - task info used to register the executor.
// Returns the accumulated compaction stats, or std::nullopt when the manager
// refuses new compactions or there is nothing to do.
future<compaction_manager::compaction_stats_opt>
compaction_manager::rewrite_sstables_component(compaction_group_view& t,
        std::vector<sstables::shared_sstable>& sstables,
        compaction_type_options options,
        std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>& rewritten_sstables,
        tasks::task_info info) {
    // Hold the compaction gate for the duration; bail out if it can't be entered.
    auto gh = start_compaction(t);
    if (!gh) {
        co_return std::nullopt;
    }
    if (sstables.empty()) {
        co_return std::nullopt;
    }
    // Mark the inputs as compacting so no concurrent compaction picks them up.
    compacting_sstable_registration compacting(*this, get_compaction_state(&t));
    compacting.register_compacting(sstables);
    co_return co_await perform_compaction<rewrite_sstables_component_compaction_task_executor>(throw_if_stopping::no, info, &t, info.id,
        std::move(options), std::move(sstables), std::move(compacting), rewritten_sstables);
}
class validate_sstables_compaction_task_executor : public sstables_task_executor {
compaction_manager::quarantine_invalid_sstables _quarantine_sstables;
public:
@@ -2362,16 +2291,6 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables), throw_if_stopping::no);
}
// Build the exception reported to callers when compaction is disabled for
// group `cg`. Precondition: is_disabled() is true.
std::exception_ptr compaction_manager::make_disabled_exception(compaction::compaction_group_view& cg) {
    // Out-of-space protection gets a distinct, generic error; any other
    // disabled state reports the standard per-table compaction stop.
    if (_in_critical_disk_utilization_mode) {
        return std::make_exception_ptr(std::runtime_error("critical disk utilization"));
    }
    return std::make_exception_ptr(compaction_stopped_exception(cg.schema()->ks_name(), cg.schema()->cf_name(), "compaction disabled"));
}
future<std::vector<sstables::shared_sstable>>
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
@@ -2381,7 +2300,8 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
if (is_disabled()) {
co_return coroutine::exception(make_disabled_exception(t));
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
"reason might be out of space prevention", sst->get_filename()))));
}
std::vector<sstables::shared_sstable> ret;
@@ -2405,18 +2325,6 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
co_return ret;
}
// Public entry point: rewrite `component` of the given sstables of group `t`,
// applying `modifier` to each rewritten sstable, and return the mapping from
// each original sstable to its replacement.
//
// Parameters:
//   t         - compaction group owning the sstables.
//   info      - task info for the underlying compaction task.
//   sstables  - sstables whose component should be rewritten.
//   component - which sstable component to regenerate.
//   modifier  - callback applied to each sstable during the rewrite.
//   update_id - whether the rewritten sstables get new generation ids
//               (defaulted in the declaration; see the header).
future<std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>> compaction_manager::perform_component_rewrite(compaction::compaction_group_view& t,
        tasks::task_info info,
        std::vector<sstables::shared_sstable> sstables,
        sstables::component_type component,
        std::function<void(sstables::sstable&)> modifier,
        compaction_type_options::component_rewrite::update_sstable_id update_id) {
    std::unordered_map<sstables::shared_sstable, sstables::shared_sstable> rewritten_sstables;
    rewritten_sstables.reserve(sstables.size());
    // The map is filled in-place by the rewrite task; the stats result is
    // intentionally discarded here.
    co_await rewrite_sstables_component(t, sstables, compaction_type_options::make_component_rewrite(component, std::move(modifier), update_id), rewritten_sstables, info);
    co_return rewritten_sstables;
}
// Submit a table to be scrubbed and wait for its termination.
future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sstable_scrub(compaction_group_view& t, compaction_type_options::scrub opts, tasks::task_info info) {
auto scrub_mode = opts.operation_mode;

View File

@@ -55,7 +55,6 @@ class custom_compaction_task_executor;
class regular_compaction_task_executor;
class offstrategy_compaction_task_executor;
class rewrite_sstables_compaction_task_executor;
class rewrite_sstables_component_compaction_task_executor;
class split_compaction_task_executor;
class cleanup_sstables_compaction_task_executor;
class validate_sstables_compaction_task_executor;
@@ -115,8 +114,6 @@ private:
uint32_t _disabled_state_count = 0;
bool is_disabled() const { return _state != state::running || _disabled_state_count > 0; }
// precondition: is_disabled() is true.
std::exception_ptr make_disabled_exception(compaction::compaction_group_view& cg);
std::optional<future<>> _stop_future;
@@ -172,7 +169,6 @@ private:
shared_tombstone_gc_state _shared_tombstone_gc_state;
utils::disk_space_monitor::subscription _out_of_space_subscription;
bool _in_critical_disk_utilization_mode = false;
private:
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
@@ -256,12 +252,6 @@ private:
future<compaction_stats_opt> rewrite_sstables(compaction::compaction_group_view& t, compaction_type_options options, owned_ranges_ptr, get_candidates_func, tasks::task_info info,
can_purge_tombstones can_purge = can_purge_tombstones::yes, sstring options_desc = "");
future<compaction_stats_opt> rewrite_sstables_component(compaction_group_view& t,
std::vector<sstables::shared_sstable>& sstables,
compaction_type_options options,
std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>& rewritten_sstables,
tasks::task_info info);
// Stop all fibers, without waiting. Safe to be called multiple times.
void do_stop() noexcept;
future<> really_do_stop() noexcept;
@@ -370,13 +360,6 @@ public:
// Submit a table to be scrubbed and wait for its termination.
future<compaction_stats_opt> perform_sstable_scrub(compaction::compaction_group_view& t, compaction_type_options::scrub opts, tasks::task_info info);
future<std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>> perform_component_rewrite(compaction::compaction_group_view& t,
tasks::task_info info,
std::vector<sstables::shared_sstable> sstables,
sstables::component_type component,
std::function<void(sstables::sstable&)> modifier,
compaction_type_options::component_rewrite::update_sstable_id update_id = compaction_type_options::component_rewrite::update_sstable_id::yes);
// Submit a table for major compaction.
future<> perform_major_compaction(compaction::compaction_group_view& t, tasks::task_info info, bool consider_only_existing_data = false);
@@ -498,7 +481,6 @@ public:
friend class compaction::regular_compaction_task_executor;
friend class compaction::offstrategy_compaction_task_executor;
friend class compaction::rewrite_sstables_compaction_task_executor;
friend class compaction::rewrite_sstables_component_compaction_task_executor;
friend class compaction::cleanup_sstables_compaction_task_executor;
friend class compaction::validate_sstables_compaction_task_executor;
friend compaction_reenabler;

View File

@@ -33,10 +33,8 @@ future<compaction_descriptor> leveled_compaction_strategy::get_sstables_for_comp
auto candidate = manifest.get_compaction_candidates(*state->last_compacted_keys, state->compaction_counter);
if (!candidate.sstables.empty()) {
if (leveled_manifest::logger.is_enabled(logging::log_level::debug)) {
auto main_set = co_await table_s.main_sstable_set();
leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), main_set->size());
}
auto main_set = co_await table_s.main_sstable_set();
leveled_manifest::logger.debug("leveled: Compacting {} out of {} sstables", candidate.sstables.size(), main_set->size());
co_return candidate;
}

View File

@@ -15,7 +15,6 @@
#include "compaction_strategy_state.hh"
#include "utils/error_injection.hh"
#include <seastar/util/lazy.hh>
#include <ranges>
namespace compaction {
@@ -29,12 +28,12 @@ time_window_compaction_strategy_state_ptr time_window_compaction_strategy::get_s
}
const std::unordered_map<sstring, std::chrono::seconds> time_window_compaction_strategy_options::valid_window_units = {
{"MINUTES", 60s}, {"HOURS", 3600s}, {"DAYS", 86400s}};
{ "MINUTES", 60s }, { "HOURS", 3600s }, { "DAYS", 86400s }
};
const std::unordered_map<sstring, time_window_compaction_strategy_options::timestamp_resolutions>
time_window_compaction_strategy_options::valid_timestamp_resolutions = {
{"MICROSECONDS", timestamp_resolutions::microsecond},
{"MILLISECONDS", timestamp_resolutions::millisecond},
const std::unordered_map<sstring, time_window_compaction_strategy_options::timestamp_resolutions> time_window_compaction_strategy_options::valid_timestamp_resolutions = {
{ "MICROSECONDS", timestamp_resolutions::microsecond },
{ "MILLISECONDS", timestamp_resolutions::millisecond },
};
static std::chrono::seconds validate_compaction_window_unit(const std::map<sstring, sstring>& options) {
@@ -44,8 +43,7 @@ static std::chrono::seconds validate_compaction_window_unit(const std::map<sstri
if (tmp_value) {
auto valid_window_units_it = time_window_compaction_strategy_options::valid_window_units.find(tmp_value.value());
if (valid_window_units_it == time_window_compaction_strategy_options::valid_window_units.end()) {
throw exceptions::configuration_exception(
fmt::format("Invalid window unit {} for {}", tmp_value.value(), time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY));
throw exceptions::configuration_exception(fmt::format("Invalid window unit {} for {}", tmp_value.value(), time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY));
}
window_unit = valid_window_units_it->second;
}
@@ -61,12 +59,10 @@ static std::chrono::seconds validate_compaction_window_unit(const std::map<sstri
static int validate_compaction_window_size(const std::map<sstring, sstring>& options) {
auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY);
int window_size = cql3::statements::property_definitions::to_long(time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, tmp_value,
time_window_compaction_strategy_options::DEFAULT_COMPACTION_WINDOW_SIZE);
int window_size = cql3::statements::property_definitions::to_long(time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, tmp_value, time_window_compaction_strategy_options::DEFAULT_COMPACTION_WINDOW_SIZE);
if (window_size <= 0) {
throw exceptions::configuration_exception(
fmt::format("{} value ({}) must be greater than 1", time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, window_size));
throw exceptions::configuration_exception(fmt::format("{} value ({}) must be greater than 1", time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, window_size));
}
return window_size;
@@ -86,30 +82,26 @@ static db_clock::duration validate_expired_sstable_check_frequency_seconds(const
try {
expired_sstable_check_frequency = std::chrono::seconds(std::stol(tmp_value.value()));
} catch (const std::exception& e) {
throw exceptions::syntax_exception(fmt::format(
"Invalid long value {} for {}", tmp_value.value(), time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY));
throw exceptions::syntax_exception(fmt::format("Invalid long value {} for {}", tmp_value.value(), time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY));
}
}
return expired_sstable_check_frequency;
}
static db_clock::duration validate_expired_sstable_check_frequency_seconds(
const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
static db_clock::duration validate_expired_sstable_check_frequency_seconds(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
db_clock::duration expired_sstable_check_frequency = validate_expired_sstable_check_frequency_seconds(options);
unchecked_options.erase(time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY);
return expired_sstable_check_frequency;
}
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(const std::map<sstring, sstring>& options) {
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution =
time_window_compaction_strategy_options::timestamp_resolutions::microsecond;
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution = time_window_compaction_strategy_options::timestamp_resolutions::microsecond;
auto tmp_value = compaction_strategy_impl::get_value(options, time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY);
if (tmp_value) {
if (!time_window_compaction_strategy_options::valid_timestamp_resolutions.contains(tmp_value.value())) {
throw exceptions::configuration_exception(fmt::format(
"Invalid timestamp resolution {} for {}", tmp_value.value(), time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY));
throw exceptions::configuration_exception(fmt::format("Invalid timestamp resolution {} for {}", tmp_value.value(), time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY));
} else {
timestamp_resolution = time_window_compaction_strategy_options::valid_timestamp_resolutions.at(tmp_value.value());
}
@@ -118,8 +110,7 @@ static time_window_compaction_strategy_options::timestamp_resolutions validate_t
return timestamp_resolution;
}
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(
const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
static time_window_compaction_strategy_options::timestamp_resolutions validate_timestamp_resolution(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options) {
time_window_compaction_strategy_options::timestamp_resolutions timestamp_resolution = validate_timestamp_resolution(options);
unchecked_options.erase(time_window_compaction_strategy_options::TIMESTAMP_RESOLUTION_KEY);
return timestamp_resolution;
@@ -154,7 +145,7 @@ void time_window_compaction_strategy_options::validate(const std::map<sstring, s
compaction_strategy_impl::validate_min_max_threshold(options, unchecked_options);
auto it = options.find("enable_optimized_twcs_queries");
if (it != options.end() && it->second != "true" && it->second != "false") {
if (it != options.end() && it->second != "true" && it->second != "false") {
throw exceptions::configuration_exception(fmt::format("enable_optimized_twcs_queries value ({}) must be \"true\" or \"false\"", it->second));
}
unchecked_options.erase("enable_optimized_twcs_queries");
@@ -171,9 +162,7 @@ class classify_by_timestamp {
std::vector<int64_t> _known_windows;
public:
explicit classify_by_timestamp(time_window_compaction_strategy_options options)
: _options(std::move(options)) {
}
explicit classify_by_timestamp(time_window_compaction_strategy_options options) : _options(std::move(options)) { }
int64_t operator()(api::timestamp_type ts) {
const auto window = time_window_compaction_strategy::get_window_for(_options, ts);
if (const auto it = std::ranges::find(_known_windows, window); it != _known_windows.end()) {
@@ -201,7 +190,7 @@ uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutati
auto estimated_window_count = max_data_segregation_window_count;
auto default_ttl = std::chrono::duration_cast<std::chrono::microseconds>(s->default_time_to_live());
bool min_and_max_ts_available = ms_meta.min_timestamp && ms_meta.max_timestamp;
auto estimate_window_count = [this](timestamp_type min_window, timestamp_type max_window) {
auto estimate_window_count = [this] (timestamp_type min_window, timestamp_type max_window) {
const auto window_size = get_window_size(_options);
return (max_window + (window_size - 1) - min_window) / window_size;
};
@@ -221,19 +210,21 @@ uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutati
return partition_estimate / std::max(1UL, uint64_t(estimated_window_count));
}
mutation_reader_consumer time_window_compaction_strategy::make_interposer_consumer(
const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const {
if (ms_meta.min_timestamp && ms_meta.max_timestamp &&
get_window_for(_options, *ms_meta.min_timestamp) == get_window_for(_options, *ms_meta.max_timestamp)) {
mutation_reader_consumer time_window_compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, mutation_reader_consumer end_consumer) const {
if (ms_meta.min_timestamp && ms_meta.max_timestamp
&& get_window_for(_options, *ms_meta.min_timestamp) == get_window_for(_options, *ms_meta.max_timestamp)) {
return end_consumer;
}
return [options = _options, end_consumer = std::move(end_consumer)](mutation_reader rd) mutable -> future<> {
return mutation_writer::segregate_by_timestamp(std::move(rd), classify_by_timestamp(std::move(options)), end_consumer);
return [options = _options, end_consumer = std::move(end_consumer)] (mutation_reader rd) mutable -> future<> {
return mutation_writer::segregate_by_timestamp(
std::move(rd),
classify_by_timestamp(std::move(options)),
end_consumer);
};
}
compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
std::vector<sstables::shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
compaction_descriptor
time_window_compaction_strategy::get_reshaping_job(std::vector<sstables::shared_sstable> input, schema_ptr schema, reshape_config cfg) const {
auto mode = cfg.mode;
std::vector<sstables::shared_sstable> single_window;
std::vector<sstables::shared_sstable> multi_window;
@@ -248,7 +239,7 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
// Sort input sstables by first_key order
// to allow efficient reshaping of disjoint sstables.
std::sort(input.begin(), input.end(), [&schema](const sstables::shared_sstable& a, const sstables::shared_sstable& b) {
std::sort(input.begin(), input.end(), [&schema] (const sstables::shared_sstable& a, const sstables::shared_sstable& b) {
return dht::ring_position(a->get_first_decorated_key()).less_compare(*schema, dht::ring_position(b->get_first_decorated_key()));
});
@@ -262,34 +253,31 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
}
}
auto is_disjoint = [&schema, mode, max_sstables](const std::vector<sstables::shared_sstable>& ssts) {
auto is_disjoint = [&schema, mode, max_sstables] (const std::vector<sstables::shared_sstable>& ssts) {
size_t tolerance = (mode == reshape_mode::relaxed) ? max_sstables : 0;
return sstable_set_overlapping_count(schema, ssts) <= tolerance;
};
clogger.debug("time_window_compaction_strategy::get_reshaping_job: offstrategy_threshold={} max_sstables={} multi_window={} disjoint={} "
"single_window={} disjoint={}",
offstrategy_threshold, max_sstables, multi_window.size(), seastar::value_of([&] {
return !multi_window.empty() && sstable_set_overlapping_count(schema, multi_window) == 0;
}),
single_window.size(), seastar::value_of([&] {
return !single_window.empty() && sstable_set_overlapping_count(schema, single_window) == 0;
}));
clogger.debug("time_window_compaction_strategy::get_reshaping_job: offstrategy_threshold={} max_sstables={} multi_window={} disjoint={} single_window={} disjoint={}",
offstrategy_threshold, max_sstables,
multi_window.size(), !multi_window.empty() && sstable_set_overlapping_count(schema, multi_window) == 0,
single_window.size(), !single_window.empty() && sstable_set_overlapping_count(schema, single_window) == 0);
auto get_job_size = [](const std::vector<sstables::shared_sstable>& ssts) {
auto get_job_size = [] (const std::vector<sstables::shared_sstable>& ssts) {
return std::ranges::fold_left(ssts | std::views::transform(std::mem_fn(&sstables::sstable::bytes_on_disk)), uint64_t(0), std::plus{});
};
// Targets a space overhead of 10%. All disjoint sstables can be compacted together as long as they won't
// cause an overhead above target. Otherwise, the job targets a maximum of #max_threshold sstables.
auto need_trimming = [&](const std::vector<sstables::shared_sstable>& ssts, const uint64_t job_size, bool is_disjoint) {
auto need_trimming = [&] (const std::vector<sstables::shared_sstable>& ssts, const uint64_t job_size, bool is_disjoint) {
const size_t min_sstables = 2;
auto is_above_target_size = job_size > target_job_size;
return (ssts.size() > max_sstables && !is_disjoint) || (ssts.size() > min_sstables && is_above_target_size);
return (ssts.size() > max_sstables && !is_disjoint) ||
(ssts.size() > min_sstables && is_above_target_size);
};
auto maybe_trim_job = [&need_trimming](std::vector<sstables::shared_sstable>& ssts, uint64_t job_size, bool is_disjoint) {
auto maybe_trim_job = [&need_trimming] (std::vector<sstables::shared_sstable>& ssts, uint64_t job_size, bool is_disjoint) {
while (need_trimming(ssts, job_size, is_disjoint)) {
auto sst = ssts.back();
ssts.pop_back();
@@ -306,7 +294,7 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
// For example, if there are N sstables spanning window W, where N <= 32, then we can produce all data for W
// in a single compaction round, removing the need to later compact W to reduce its number of files.
auto sort_size = std::min(max_sstables, multi_window.size());
std::ranges::partial_sort(multi_window, multi_window.begin() + sort_size, std::ranges::less(), [](const sstables::shared_sstable& a) {
std::ranges::partial_sort(multi_window, multi_window.begin() + sort_size, std::ranges::less(), [] (const sstables::shared_sstable &a) {
return a->get_stats_metadata().max_timestamp;
});
maybe_trim_job(multi_window, job_size, disjoint);
@@ -346,7 +334,8 @@ compaction_descriptor time_window_compaction_strategy::get_reshaping_job(
return compaction_descriptor();
}
future<compaction_descriptor> time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
future<compaction_descriptor>
time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
auto state = get_state(table_s);
auto compaction_time = gc_clock::now();
auto candidates = co_await control.candidates(table_s);
@@ -380,8 +369,10 @@ future<compaction_descriptor> time_window_compaction_strategy::get_sstables_for_
co_return compaction_descriptor(std::move(compaction_candidates));
}
time_window_compaction_strategy::bucket_compaction_mode time_window_compaction_strategy::compaction_mode(
const time_window_compaction_strategy_state& state, const bucket_t& bucket, timestamp_type bucket_key, timestamp_type now, size_t min_threshold) const {
time_window_compaction_strategy::bucket_compaction_mode
time_window_compaction_strategy::compaction_mode(const time_window_compaction_strategy_state& state,
const bucket_t& bucket, timestamp_type bucket_key,
timestamp_type now, size_t min_threshold) const {
// STCS will also be performed on older window buckets, to avoid a bad write and
// space amplification when something like read repair cause small updates to
// those past windows.
@@ -394,7 +385,8 @@ time_window_compaction_strategy::bucket_compaction_mode time_window_compaction_s
return bucket_compaction_mode::none;
}
std::vector<sstables::shared_sstable> time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control,
std::vector<sstables::shared_sstable>
time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control,
std::vector<sstables::shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time, time_window_compaction_strategy_state& state) {
auto most_interesting = get_compaction_candidates(table_s, control, non_expiring_sstables, state);
@@ -408,29 +400,31 @@ std::vector<sstables::shared_sstable> time_window_compaction_strategy::get_next_
// if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
// ratio is greater than threshold.
std::erase_if(non_expiring_sstables, [this, compaction_time, &table_s](const sstables::shared_sstable& sst) -> bool {
std::erase_if(non_expiring_sstables, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
return !worth_dropping_tombstones(sst, compaction_time, table_s);
});
if (non_expiring_sstables.empty()) {
return {};
}
auto it = std::ranges::min_element(non_expiring_sstables, [](auto& i, auto& j) {
auto it = std::ranges::min_element(non_expiring_sstables, [] (auto& i, auto& j) {
return i->get_stats_metadata().min_timestamp < j->get_stats_metadata().min_timestamp;
});
return {*it};
return { *it };
}
std::vector<sstables::shared_sstable> time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control,
std::vector<sstables::shared_sstable> candidate_sstables, time_window_compaction_strategy_state& state) {
std::vector<sstables::shared_sstable>
time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control,
std::vector<sstables::shared_sstable> candidate_sstables, time_window_compaction_strategy_state& state) {
auto [buckets, max_timestamp] = get_buckets(std::move(candidate_sstables), _options);
// Update the highest window seen, if necessary
state.highest_window_seen = std::max(state.highest_window_seen, max_timestamp);
return newest_bucket(table_s, control, std::move(buckets), table_s.min_compaction_threshold(), table_s.schema()->max_compaction_threshold(),
state.highest_window_seen, state);
state.highest_window_seen, state);
}
timestamp_type time_window_compaction_strategy::get_window_lower_bound(std::chrono::seconds sstable_window_size, timestamp_type timestamp) {
timestamp_type
time_window_compaction_strategy::get_window_lower_bound(std::chrono::seconds sstable_window_size, timestamp_type timestamp) {
using namespace std::chrono;
// mask out window size from timestamp to get lower bound of its window
auto num_windows = microseconds(timestamp) / sstable_window_size;
@@ -438,8 +432,8 @@ timestamp_type time_window_compaction_strategy::get_window_lower_bound(std::chro
return duration_cast<microseconds>(num_windows * sstable_window_size).count();
}
std::pair<std::map<timestamp_type, std::vector<sstables::shared_sstable>>, timestamp_type> time_window_compaction_strategy::get_buckets(
std::vector<sstables::shared_sstable> files, const time_window_compaction_strategy_options& options) {
std::pair<std::map<timestamp_type, std::vector<sstables::shared_sstable>>, timestamp_type>
time_window_compaction_strategy::get_buckets(std::vector<sstables::shared_sstable> files, const time_window_compaction_strategy_options& options) {
std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets;
timestamp_type max_timestamp = 0;
@@ -456,13 +450,11 @@ std::pair<std::map<timestamp_type, std::vector<sstables::shared_sstable>>, times
return std::make_pair(std::move(buckets), max_timestamp);
}
} // namespace compaction
}
template <>
struct fmt::formatter<std::map<compaction::timestamp_type, std::vector<sstables::shared_sstable>>> {
constexpr auto parse(format_parse_context& ctx) {
return ctx.begin();
}
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
auto format(const std::map<compaction::timestamp_type, std::vector<sstables::shared_sstable>>& buckets, fmt::format_context& ctx) const {
auto out = fmt::format_to(ctx.out(), " buckets = {{\n");
for (auto& [timestamp, sstables] : buckets | std::views::reverse) {
@@ -474,9 +466,9 @@ struct fmt::formatter<std::map<compaction::timestamp_type, std::vector<sstables:
namespace compaction {
std::vector<sstables::shared_sstable> time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, strategy_control& control,
std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets, int min_threshold, int max_threshold, timestamp_type now,
time_window_compaction_strategy_state& state) {
std::vector<sstables::shared_sstable>
time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, strategy_control& control, std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets,
int min_threshold, int max_threshold, timestamp_type now, time_window_compaction_strategy_state& state) {
clogger.debug("time_window_compaction_strategy::newest_bucket:\n now {}\n{}", now, buckets);
for (auto&& [key, bucket] : buckets | std::views::reverse) {
@@ -517,7 +509,8 @@ std::vector<sstables::shared_sstable> time_window_compaction_strategy::newest_bu
return {};
}
std::vector<sstables::shared_sstable> time_window_compaction_strategy::trim_to_threshold(std::vector<sstables::shared_sstable> bucket, int max_threshold) {
std::vector<sstables::shared_sstable>
time_window_compaction_strategy::trim_to_threshold(std::vector<sstables::shared_sstable> bucket, int max_threshold) {
auto n = std::min(bucket.size(), size_t(max_threshold));
// Trim the largest sstables off the end to meet the maxThreshold
std::ranges::partial_sort(bucket, bucket.begin() + n, std::ranges::less(), std::mem_fn(&sstables::sstable::ondisk_data_size));
@@ -549,8 +542,8 @@ future<int64_t> time_window_compaction_strategy::estimated_pending_compactions(c
co_return n;
}
std::vector<compaction_descriptor> time_window_compaction_strategy::get_cleanup_compaction_jobs(
compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) const {
std::vector<compaction_descriptor>
time_window_compaction_strategy::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) const {
std::vector<compaction_descriptor> ret;
for (auto&& [_, sstables] : get_buckets(std::move(candidates), _options).first) {
auto per_window_jobs = size_tiered_compaction_strategy(_stcs_options).get_cleanup_compaction_jobs(table_s, std::move(sstables));
@@ -563,4 +556,4 @@ std::unique_ptr<sstables::sstable_set_impl> time_window_compaction_strategy::mak
return std::make_unique<sstables::time_series_sstable_set>(ts.schema(), _options.enable_optimized_twcs_queries);
}
} // namespace compaction
}

View File

@@ -397,17 +397,6 @@ commitlog_total_space_in_mb: -1
# you can cache more hot rows
# column_index_size_in_kb: 64
# sstable format version for newly written sstables.
# Currently allowed values are `me` and `ms`.
# If not specified in the config, this defaults to `me`.
#
# The difference between `me` and `ms` are the data structures used
# in the primary index.
# In short, `ms` needs more CPU during sstable writes,
# but should behave better during reads,
# although it might behave worse for very long clustering keys.
sstable_format: ms
# Auto-scaling of the promoted index prevents running out of memory
# when the promoted index grows too large (due to partitions with many rows
# vs. too small column_index_size_in_kb). When the serialized representation

View File

@@ -544,6 +544,7 @@ scylla_tests = set([
'test/boost/caching_options_test',
'test/boost/canonical_mutation_test',
'test/boost/cartesian_product_test',
'test/boost/cdc_generation_test',
'test/boost/cell_locker_test',
'test/boost/checksum_utils_test',
'test/boost/chunked_managed_vector_test',
@@ -618,7 +619,6 @@ scylla_tests = set([
'test/boost/reservoir_sampling_test',
'test/boost/result_utils_test',
'test/boost/rest_client_test',
'test/boost/rolling_max_tracker_test',
'test/boost/reusable_buffer_test',
'test/boost/rust_test',
'test/boost/s3_test',
@@ -896,9 +896,6 @@ scylla_core = (['message/messaging_service.cc',
'replica/multishard_query.cc',
'replica/mutation_dump.cc',
'replica/querier.cc',
'replica/logstor/segment_manager.cc',
'replica/logstor/logstor.cc',
'replica/logstor/write_buffer.cc',
'mutation/atomic_cell.cc',
'mutation/canonical_mutation.cc',
'mutation/frozen_mutation.cc',
@@ -1243,6 +1240,7 @@ scylla_core = (['message/messaging_service.cc',
'service/pager/query_pagers.cc',
'service/qos/qos_common.cc',
'service/qos/service_level_controller.cc',
'service/qos/standard_service_level_distributed_data_accessor.cc',
'service/qos/raft_service_level_distributed_data_accessor.cc',
'streaming/stream_task.cc',
'streaming/stream_session.cc',
@@ -1276,6 +1274,7 @@ scylla_core = (['message/messaging_service.cc',
'auth/common.cc',
'auth/default_authorizer.cc',
'auth/resource.cc',
'auth/roles-metadata.cc',
'auth/passwords.cc',
'auth/maintenance_socket_authenticator.cc',
'auth/password_authenticator.cc',
@@ -1470,7 +1469,6 @@ idls = ['idl/gossip_digest.idl.hh',
'idl/query.idl.hh',
'idl/idl_test.idl.hh',
'idl/commitlog.idl.hh',
'idl/logstor.idl.hh',
'idl/tracing.idl.hh',
'idl/consistency_level.idl.hh',
'idl/cache_temperature.idl.hh',
@@ -1478,7 +1476,6 @@ idls = ['idl/gossip_digest.idl.hh',
'idl/messaging_service.idl.hh',
'idl/paxos.idl.hh',
'idl/raft.idl.hh',
'idl/raft_util.idl.hh',
'idl/raft_storage.idl.hh',
'idl/group0.idl.hh',
'idl/hinted_handoff.idl.hh',
@@ -1498,9 +1495,7 @@ idls = ['idl/gossip_digest.idl.hh',
'idl/gossip.idl.hh',
'idl/migration_manager.idl.hh',
"idl/node_ops.idl.hh",
"idl/tasks.idl.hh",
"idl/client_state.idl.hh",
"idl/forward_cql.idl.hh",
"idl/tasks.idl.hh"
]
scylla_tests_generic_dependencies = [
@@ -1593,7 +1588,6 @@ pure_boost_tests = set([
'test/boost/wrapping_interval_test',
'test/boost/range_tombstone_list_test',
'test/boost/reservoir_sampling_test',
'test/boost/rolling_max_tracker_test',
'test/boost/serialization_test',
'test/boost/small_vector_test',
'test/boost/top_k_test',
@@ -1742,7 +1736,6 @@ deps['test/boost/url_parse_test'] = ['utils/http.cc', 'test/boost/url_parse_test
deps['test/boost/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'test/boost/murmur_hash_test.cc']
deps['test/boost/allocation_strategy_test'] = ['test/boost/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc', 'utils/labels.cc']
deps['test/boost/log_heap_test'] = ['test/boost/log_heap_test.cc']
deps['test/boost/rolling_max_tracker_test'] = ['test/boost/rolling_max_tracker_test.cc']
deps['test/boost/estimated_histogram_test'] = ['test/boost/estimated_histogram_test.cc']
deps['test/boost/summary_test'] = ['test/boost/summary_test.cc']
deps['test/boost/anchorless_list_test'] = ['test/boost/anchorless_list_test.cc']

View File

@@ -23,7 +23,7 @@ column_specification::column_specification(std::string_view ks_name_, std::strin
bool column_specification::all_in_same_table(const std::vector<lw_shared_ptr<column_specification>>& names)
{
throwing_assert(!names.empty());
SCYLLA_ASSERT(!names.empty());
auto first = names.front();
return std::all_of(std::next(names.begin()), names.end(), [first] (auto&& spec) {

View File

@@ -49,9 +49,9 @@ static cql3_type::kind get_cql3_kind(const abstract_type& t) {
cql3_type::kind operator()(const uuid_type_impl&) { return cql3_type::kind::UUID; }
cql3_type::kind operator()(const varint_type_impl&) { return cql3_type::kind::VARINT; }
cql3_type::kind operator()(const reversed_type_impl& r) { return get_cql3_kind(*r.underlying_type()); }
cql3_type::kind operator()(const tuple_type_impl&) { throwing_assert(0 && "no kind for this type"); }
cql3_type::kind operator()(const vector_type_impl&) { throwing_assert(0 && "no kind for this type"); }
cql3_type::kind operator()(const collection_type_impl&) { throwing_assert(0 && "no kind for this type"); }
cql3_type::kind operator()(const tuple_type_impl&) { SCYLLA_ASSERT(0 && "no kind for this type"); }
cql3_type::kind operator()(const vector_type_impl&) { SCYLLA_ASSERT(0 && "no kind for this type"); }
cql3_type::kind operator()(const collection_type_impl&) { SCYLLA_ASSERT(0 && "no kind for this type"); }
};
return visit(t, visitor{});
}
@@ -124,7 +124,7 @@ class cql3_type::raw_collection : public raw {
} else if (_kind == abstract_type::kind::map) {
return format("{}map<{}, {}>{}", start, _keys, _values, end);
}
throwing_assert(0 && "invalid raw_collection kind");
abort();
}
public:
raw_collection(const abstract_type::kind kind, shared_ptr<raw> keys, shared_ptr<raw> values)
@@ -150,7 +150,7 @@ public:
}
virtual cql3_type prepare_internal(const sstring& keyspace, const data_dictionary::user_types_metadata& user_types) override {
throwing_assert(_values); // "Got null values type for a collection";
SCYLLA_ASSERT(_values); // "Got null values type for a collection";
if (_values->is_counter()) {
throw exceptions::invalid_request_exception(format("Counters are not allowed inside collections: {}", *this));
@@ -190,7 +190,7 @@ private:
}
return cql3_type(set_type_impl::get_instance(_values->prepare_internal(keyspace, user_types).get_type(), !is_frozen()));
} else if (_kind == abstract_type::kind::map) {
throwing_assert(_keys); // "Got null keys type for a collection";
SCYLLA_ASSERT(_keys); // "Got null keys type for a collection";
if (_keys->is_duration()) {
throw exceptions::invalid_request_exception(format("Durations are not allowed as map keys: {}", *this));
}
@@ -198,7 +198,7 @@ private:
_values->prepare_internal(keyspace, user_types).get_type(),
!is_frozen()));
}
throwing_assert(0 && "do_prepare invalid kind");
abort();
}
};

View File

@@ -1603,7 +1603,7 @@ static cql3::raw_value do_evaluate(const collection_constructor& collection, con
case collection_constructor::style_type::vector:
return evaluate_vector(collection, inputs);
}
throwing_assert(0 && "do_evaluate invalid style");
std::abort();
}
static cql3::raw_value do_evaluate(const usertype_constructor& user_val, const evaluation_inputs& inputs) {

View File

@@ -876,7 +876,7 @@ cast_test_assignment(const cast& c, data_dictionary::database db, const sstring&
return assignment_testable::test_result::NOT_ASSIGNABLE;
}
} catch (exceptions::invalid_request_exception& e) {
throwing_assert(0 && "cast_test_assignment exception");
abort();
}
}

View File

@@ -544,7 +544,7 @@ functions::get_user_aggregates(const sstring& keyspace) const {
std::ranges::subrange<functions::declared_t::const_iterator>
functions::find(const function_name& name) const {
throwing_assert(name.has_keyspace()); // : "function name not fully qualified";
SCYLLA_ASSERT(name.has_keyspace()); // : "function name not fully qualified";
auto pair = _declared.equal_range(name);
return std::ranges::subrange(pair.first, pair.second);
}

View File

@@ -25,7 +25,7 @@ bool keyspace_element_name::has_keyspace() const
const sstring& keyspace_element_name::get_keyspace() const
{
throwing_assert(_ks_name);
SCYLLA_ASSERT(_ks_name);
return *_ks_name;
}

View File

@@ -62,7 +62,7 @@ lists::setter_by_index::fill_prepare_context(prepare_context& ctx) {
void
lists::setter_by_index::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
// we should not get here for frozen lists
throwing_assert(column.type->is_multi_cell()); // "Attempted to set an individual element on a frozen list";
SCYLLA_ASSERT(column.type->is_multi_cell()); // "Attempted to set an individual element on a frozen list";
auto index = expr::evaluate(_idx, params._options);
if (index.is_null()) {
@@ -105,7 +105,7 @@ lists::setter_by_uuid::requires_read() const {
void
lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
// we should not get here for frozen lists
throwing_assert(column.type->is_multi_cell()); // "Attempted to set an individual element on a frozen list";
SCYLLA_ASSERT(column.type->is_multi_cell()); // "Attempted to set an individual element on a frozen list";
auto index = expr::evaluate(_idx, params._options);
auto value = expr::evaluate(*_e, params._options);
@@ -133,7 +133,7 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,
void
lists::appender::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
const cql3::raw_value value = expr::evaluate(*_e, params._options);
throwing_assert(column.type->is_multi_cell()); // "Attempted to append to a frozen list";
SCYLLA_ASSERT(column.type->is_multi_cell()); // "Attempted to append to a frozen list";
do_append(value, m, prefix, column, params);
}
@@ -189,7 +189,7 @@ lists::do_append(const cql3::raw_value& list_value,
void
lists::prepender::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
throwing_assert(column.type->is_multi_cell()); // "Attempted to prepend to a frozen list";
SCYLLA_ASSERT(column.type->is_multi_cell()); // "Attempted to prepend to a frozen list";
cql3::raw_value lvalue = expr::evaluate(*_e, params._options);
if (lvalue.is_null()) {
return;
@@ -244,7 +244,7 @@ lists::discarder::requires_read() const {
void
lists::discarder::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
throwing_assert(column.type->is_multi_cell()); // "Attempted to delete from a frozen list";
SCYLLA_ASSERT(column.type->is_multi_cell()); // "Attempted to delete from a frozen list";
auto&& existing_list = params.get_prefetched_list(m.key(), prefix, column);
// We want to call bind before possibly returning to reject queries where the value provided is not a list.
@@ -300,7 +300,7 @@ lists::discarder_by_index::requires_read() const {
void
lists::discarder_by_index::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
throwing_assert(column.type->is_multi_cell()); // "Attempted to delete an item by index from a frozen list";
SCYLLA_ASSERT(column.type->is_multi_cell()); // "Attempted to delete an item by index from a frozen list";
cql3::raw_value index = expr::evaluate(*_e, params._options);
if (index.is_null()) {
throw exceptions::invalid_request_exception("Invalid null value for list index");

View File

@@ -45,7 +45,7 @@ maps::setter_by_key::fill_prepare_context(prepare_context& ctx) {
void
maps::setter_by_key::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
using exceptions::invalid_request_exception;
throwing_assert(column.type->is_multi_cell()); // "Attempted to set a value for a single key on a frozen map"m
SCYLLA_ASSERT(column.type->is_multi_cell()); // "Attempted to set a value for a single key on a frozen map"m
auto key = expr::evaluate(_k, params._options);
auto value = expr::evaluate(*_e, params._options);
if (key.is_null()) {
@@ -63,7 +63,7 @@ maps::setter_by_key::execute(mutation& m, const clustering_key_prefix& prefix, c
void
maps::putter::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
throwing_assert(column.type->is_multi_cell()); // "Attempted to add items to a frozen map";
SCYLLA_ASSERT(column.type->is_multi_cell()); // "Attempted to add items to a frozen map";
cql3::raw_value value = expr::evaluate(*_e, params._options);
do_put(m, prefix, params, value, column);
}
@@ -96,7 +96,7 @@ maps::do_put(mutation& m, const clustering_key_prefix& prefix, const update_para
void
maps::discarder_by_key::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
throwing_assert(column.type->is_multi_cell()); // "Attempted to delete a single key in a frozen map";
SCYLLA_ASSERT(column.type->is_multi_cell()); // "Attempted to delete a single key in a frozen map";
cql3::raw_value key = expr::evaluate(*_e, params._options);
if (key.is_null()) {
throw exceptions::invalid_request_exception("Invalid null map key");

View File

@@ -67,7 +67,7 @@ operation::set_element::prepare(data_dictionary::database db, const sstring& key
verify_no_aggregate_functions(mval, "SET clause");
return make_shared<maps::setter_by_key>(receiver, std::move(key), std::move(mval));
}
throwing_assert(0 && "prepare set_element collection type");
abort();
}
bool
@@ -166,7 +166,7 @@ operation::addition::prepare(data_dictionary::database db, const sstring& keyspa
} else if (ctype->get_kind() == abstract_type::kind::map) {
return make_shared<maps::putter>(receiver, std::move(v));
} else {
throwing_assert(0 && "prepare addition collection type");
abort();
}
}
@@ -216,7 +216,7 @@ operation::subtraction::prepare(data_dictionary::database db, const sstring& key
verify_no_aggregate_functions(v, "SET clause");
return ::make_shared<sets::discarder>(receiver, std::move(v));
}
throwing_assert(0 && "prepare subtraction collection type");
abort();
}
bool
@@ -267,7 +267,7 @@ operation::set_value::prepare(data_dictionary::database db, const sstring& keysp
} else if (k == abstract_type::kind::map) {
return make_shared<maps::setter>(receiver, std::move(v));
} else {
throwing_assert(0 && "prepare set_value collection type");
abort();
}
}
@@ -385,7 +385,7 @@ operation::element_deletion::prepare(data_dictionary::database db, const sstring
verify_no_aggregate_functions(key, "SET clause");
return make_shared<maps::discarder_by_key>(receiver, std::move(key));
}
throwing_assert(0 && "prepare element_deletion collection type");
abort();
}
expr::expression

View File

@@ -105,7 +105,6 @@ public:
static const std::chrono::minutes entry_expiry;
using key_type = prepared_cache_key_type;
using pinned_value_type = cache_value_ptr;
using value_type = checked_weak_ptr;
using statement_is_too_big = typename cache_type::entry_is_too_big;
@@ -117,14 +116,9 @@ public:
: _cache(size, entry_expiry, logger)
{}
template <typename LoadFunc>
future<pinned_value_type> get_pinned(const key_type& key, LoadFunc&& load) {
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); });
}
template <typename LoadFunc>
future<value_type> get(const key_type& key, LoadFunc&& load) {
return get_pinned(key, std::forward<LoadFunc>(load)).then([] (cache_value_ptr v_ptr) {
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
});
}

View File

@@ -11,7 +11,6 @@
#include "cql3/query_processor.hh"
#include <seastar/core/metrics.hh>
#include <seastar/core/memory.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/coroutine/as_future.hh>
@@ -48,15 +47,13 @@ const sstring query_processor::CQL_VERSION = "3.3.1";
const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono::minutes(60);
struct query_processor::remote {
remote(service::migration_manager& mm, service::mapreduce_service& fwd, service::storage_service& ss, service::raft_group0_client& group0_client,
service::strong_consistency::coordinator& _sc_coordinator)
: mm(mm)
, mapreducer(fwd)
, ss(ss)
, group0_client(group0_client)
, sc_coordinator(_sc_coordinator)
, gate("query_processor::remote") {
}
remote(service::migration_manager& mm, service::mapreduce_service& fwd,
service::storage_service& ss, service::raft_group0_client& group0_client,
service::strong_consistency::coordinator& _sc_coordinator)
: mm(mm), mapreducer(fwd), ss(ss), group0_client(group0_client)
, sc_coordinator(_sc_coordinator)
, gate("query_processor::remote")
{}
service::migration_manager& mm;
service::mapreduce_service& mapreducer;
@@ -79,34 +76,24 @@ static service::query_state query_state_for_internal_call() {
return {service::client_state::for_internal_calls(), empty_service_permit()};
}
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn,
vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg,
lang::manager& langm)
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
, _proxy(proxy)
, _db(db)
, _mnotifier(mn)
, _vector_store_client(vsc)
, _mcfg(mcfg)
, _cql_config(cql_cfg)
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
, _authorized_prepared_cache(std::move(auth_prep_cache_cfg), authorized_prepared_statements_cache_log)
, _auth_prepared_cache_cfg_cb([this](uint32_t) {
(void)_authorized_prepared_cache_config_action.trigger_later();
})
, _authorized_prepared_cache_config_action([this] {
update_authorized_prepared_cache_config();
return make_ready_future<>();
})
, _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
, _authorized_prepared_cache_validity_in_ms_observer(_db.get_config().permissions_validity_in_ms.observe(_auth_prepared_cache_cfg_cb))
, _lang_manager(langm)
, _write_consistency_levels_warned_observer(_db.get_config().write_consistency_levels_warned.observe([this](const auto& v) {
_write_consistency_levels_warned = to_consistency_level_set(v);
}))
, _write_consistency_levels_disallowed_observer(_db.get_config().write_consistency_levels_disallowed.observe([this](const auto& v) {
_write_consistency_levels_disallowed = to_consistency_level_set(v);
})) {
query_processor::query_processor(service::storage_proxy& proxy, data_dictionary::database db, service::migration_notifier& mn, vector_search::vector_store_client& vsc, query_processor::memory_config mcfg, cql_config& cql_cfg, utils::loading_cache_config auth_prep_cache_cfg, lang::manager& langm)
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
, _proxy(proxy)
, _db(db)
, _mnotifier(mn)
, _vector_store_client(vsc)
, _mcfg(mcfg)
, _cql_config(cql_cfg)
, _prepared_cache(prep_cache_log, _mcfg.prepared_statment_cache_size)
, _authorized_prepared_cache(std::move(auth_prep_cache_cfg), authorized_prepared_statements_cache_log)
, _auth_prepared_cache_cfg_cb([this] (uint32_t) { (void) _authorized_prepared_cache_config_action.trigger_later(); })
, _authorized_prepared_cache_config_action([this] { update_authorized_prepared_cache_config(); return make_ready_future<>(); })
, _authorized_prepared_cache_update_interval_in_ms_observer(_db.get_config().permissions_update_interval_in_ms.observe(_auth_prepared_cache_cfg_cb))
, _authorized_prepared_cache_validity_in_ms_observer(_db.get_config().permissions_validity_in_ms.observe(_auth_prepared_cache_cfg_cb))
, _lang_manager(langm)
, _write_consistency_levels_warned_observer(_db.get_config().write_consistency_levels_warned.observe([this](const auto& v) { _write_consistency_levels_warned = to_consistency_level_set(v); }))
, _write_consistency_levels_disallowed_observer(_db.get_config().write_consistency_levels_disallowed.observe([this](const auto& v) { _write_consistency_levels_disallowed = to_consistency_level_set(v); }))
{
_write_consistency_levels_warned = to_consistency_level_set(_db.get_config().write_consistency_levels_warned());
_write_consistency_levels_disallowed = to_consistency_level_set(_db.get_config().write_consistency_levels_disallowed());
namespace sm = seastar::metrics;
@@ -114,7 +101,7 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
using clevel = db::consistency_level;
sm::label cl_label("consistency_level");
sm::label who_label("who"); // Who queried system tables
sm::label who_label("who"); // Who queried system tables
const auto user_who_label_instance = who_label("user");
const auto internal_who_label_instance = who_label("internal");
@@ -122,11 +109,17 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
const auto system_ks_label_instance = ks_label("system");
std::vector<sm::metric_definition> qp_group;
qp_group.push_back(sm::make_counter("statements_prepared", _stats.prepare_invocations, sm::description("Counts the total number of parsed CQL requests.")));
qp_group.push_back(sm::make_counter(
"statements_prepared",
_stats.prepare_invocations,
sm::description("Counts the total number of parsed CQL requests.")));
for (auto cl = size_t(clevel::MIN_VALUE); cl <= size_t(clevel::MAX_VALUE); ++cl) {
qp_group.push_back(sm::make_counter(
"queries", _stats.queries_by_cl[cl], sm::description("Counts queries by consistency level."), {cl_label(clevel(cl)), basic_level})
.set_skip_when_empty());
qp_group.push_back(
sm::make_counter(
"queries",
_stats.queries_by_cl[cl],
sm::description("Counts queries by consistency level."),
{cl_label(clevel(cl)), basic_level}).set_skip_when_empty());
}
_metrics.add_group("query_processor", qp_group);
@@ -517,33 +510,33 @@ query_processor::query_processor(service::storage_proxy& proxy, data_dictionary:
_cql_stats.replication_strategy_fail_list_violations,
sm::description("Counts the number of replication_strategy_fail_list guardrail violations, "
"i.e. attempts to set a forbidden replication strategy in a keyspace via CREATE/ALTER KEYSPACE.")).set_skip_when_empty(),
sm::make_counter(
"forwarded_requests",
_cql_stats.forwarded_requests,
sm::description("Counts the total number of attempts to forward CQL requests to other nodes. One request may be forwarded multiple times, "
"particularly when a write is handled by a non-replica node.")).set_skip_when_empty(),
});
std::vector<sm::metric_definition> cql_cl_group;
for (auto cl = size_t(clevel::MIN_VALUE); cl <= size_t(clevel::MAX_VALUE); ++cl) {
cql_cl_group.push_back(sm::make_counter("writes_per_consistency_level", _cql_stats.writes_per_consistency_level[cl],
sm::description("Counts the number of writes for each consistency level."), {cl_label(clevel(cl)), basic_level})
.set_skip_when_empty());
cql_cl_group.push_back(
sm::make_counter(
"writes_per_consistency_level",
_cql_stats.writes_per_consistency_level[cl],
sm::description("Counts the number of writes for each consistency level."),
{cl_label(clevel(cl)), basic_level}).set_skip_when_empty());
}
_metrics.add_group("cql", cql_cl_group);
_metrics.add_group(
"cql", {
sm::make_counter("write_consistency_levels_disallowed_violations", _cql_stats.write_consistency_levels_disallowed_violations,
sm::description("Counts the number of write_consistency_levels_disallowed guardrail violations, "
"i.e. attempts to write with a forbidden consistency level."),
{basic_level}),
sm::make_counter("write_consistency_levels_warned_violations", _cql_stats.write_consistency_levels_warned_violations,
sm::description("Counts the number of write_consistency_levels_warned guardrail violations, "
"i.e. attempts to write with a discouraged consistency level."),
{basic_level}),
});
_metrics.add_group("cql", {
sm::make_counter(
"write_consistency_levels_disallowed_violations",
_cql_stats.write_consistency_levels_disallowed_violations,
sm::description("Counts the number of write_consistency_levels_disallowed guardrail violations, "
"i.e. attempts to write with a forbidden consistency level."),
{basic_level}),
sm::make_counter(
"write_consistency_levels_warned_violations",
_cql_stats.write_consistency_levels_warned_violations,
sm::description("Counts the number of write_consistency_levels_warned guardrail violations, "
"i.e. attempts to write with a discouraged consistency level."),
{basic_level}),
});
_mnotifier.register_listener(_migration_subscriber.get());
}
@@ -554,13 +547,15 @@ query_processor::~query_processor() {
}
}
std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder> query_processor::acquire_strongly_consistent_coordinator() {
std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder>
query_processor::acquire_strongly_consistent_coordinator() {
auto [remote_, holder] = remote();
return {remote_.get().sc_coordinator, std::move(holder)};
}
void query_processor::start_remote(service::migration_manager& mm, service::mapreduce_service& mapreducer, service::storage_service& ss,
service::raft_group0_client& group0_client, service::strong_consistency::coordinator& sc_coordinator) {
void query_processor::start_remote(service::migration_manager& mm, service::mapreduce_service& mapreducer,
service::storage_service& ss, service::raft_group0_client& group0_client,
service::strong_consistency::coordinator& sc_coordinator) {
_remote = std::make_unique<struct remote>(mm, mapreducer, ss, group0_client, sc_coordinator);
}
@@ -580,24 +575,23 @@ future<> query_processor::stop() {
}
future<::shared_ptr<cql_transport::messages::result_message>> query_processor::execute_with_guard(
std::function<future<::shared_ptr<cql_transport::messages::result_message>>(
service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>)>
fn,
std::function<future<::shared_ptr<cql_transport::messages::result_message>>(service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>)> fn,
::shared_ptr<cql_statement> statement, service::query_state& query_state, const query_options& options) {
// execute all statements that need group0 guard on shard0
if (this_shard_id() != 0) {
co_return bounce_to_shard(0, std::move(const_cast<cql3::query_options&>(options).take_cached_pk_function_calls()), false);
co_return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(0,
std::move(const_cast<cql3::query_options&>(options).take_cached_pk_function_calls()));
}
auto [remote_, holder] = remote();
size_t retries = remote_.get().mm.get_concurrent_ddl_retries();
while (true) {
while (true) {
try {
auto guard = co_await remote_.get().mm.start_group0_operation();
co_return co_await fn(query_state, statement, options, std::move(guard));
} catch (const service::group0_concurrent_modification& ex) {
log.warn("Failed to execute statement \"{}\" due to guard conflict.{}.", statement->raw_cql_statement,
retries ? " Retrying" : " Number of retries exceeded, giving up");
log.warn("Failed to execute statement \"{}\" due to guard conflict.{}.",
statement->raw_cql_statement, retries ? " Retrying" : " Number of retries exceeded, giving up");
if (retries--) {
continue;
}
@@ -606,30 +600,29 @@ future<::shared_ptr<cql_transport::messages::result_message>> query_processor::e
}
}
template <typename... Args>
future<::shared_ptr<result_message>> query_processor::execute_maybe_with_guard(service::query_state& query_state, ::shared_ptr<cql_statement> statement,
const query_options& options,
future<::shared_ptr<result_message>> (query_processor::*fn)(
service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>, Args...),
Args... args) {
template<typename... Args>
future<::shared_ptr<result_message>>
query_processor::execute_maybe_with_guard(service::query_state& query_state, ::shared_ptr<cql_statement> statement, const query_options& options,
future<::shared_ptr<result_message>>(query_processor::*fn)(service::query_state&, ::shared_ptr<cql_statement>, const query_options&, std::optional<service::group0_guard>, Args...), Args... args) {
if (!statement->needs_guard(*this, query_state)) {
return (this->*fn)(query_state, std::move(statement), options, std::nullopt, std::forward<Args>(args)...);
}
static auto exec = [fn](query_processor& qp, Args... args, service::query_state& query_state, ::shared_ptr<cql_statement> statement,
const query_options& options, std::optional<service::group0_guard> guard) {
static auto exec = [fn] (query_processor& qp, Args... args, service::query_state& query_state, ::shared_ptr<cql_statement> statement, const query_options& options, std::optional<service::group0_guard> guard) {
return (qp.*fn)(query_state, std::move(statement), options, std::move(guard), std::forward<Args>(args)...);
};
return execute_with_guard(std::bind_front(exec, std::ref(*this), std::forward<Args>(args)...), std::move(statement), query_state, options);
}
future<::shared_ptr<result_message>> query_processor::execute_direct_without_checking_exception_message(
const std::string_view& query_string, service::query_state& query_state, dialect d, query_options& options) {
future<::shared_ptr<result_message>>
query_processor::execute_direct_without_checking_exception_message(const std::string_view& query_string, service::query_state& query_state, dialect d, query_options& options) {
log.trace("execute_direct: \"{}\"", query_string);
tracing::trace(query_state.get_trace_state(), "Parsing a statement");
auto p = get_statement(query_string, query_state.get_client_state(), d);
auto statement = p->statement;
if (statement->get_bound_terms() != options.get_values_count()) {
const auto msg = format("Invalid amount of bind variables: expected {:d} received {:d}", statement->get_bound_terms(), options.get_values_count());
const auto msg = format("Invalid amount of bind variables: expected {:d} received {:d}",
statement->get_bound_terms(),
options.get_values_count());
throw exceptions::invalid_request_exception(msg);
}
options.prepare(p->bound_names);
@@ -640,13 +633,17 @@ future<::shared_ptr<result_message>> query_processor::execute_direct_without_che
metrics.regularStatementsExecuted.inc();
#endif
auto user = query_state.get_client_state().user();
tracing::trace(query_state.get_trace_state(), "Processing a statement for authenticated user: {}",
user ? (user->name ? *user->name : "anonymous") : "no user authenticated");
tracing::trace(query_state.get_trace_state(), "Processing a statement for authenticated user: {}", user ? (user->name ? *user->name : "anonymous") : "no user authenticated");
return execute_maybe_with_guard(query_state, std::move(statement), options, &query_processor::do_execute_direct, std::move(p->warnings));
}
future<::shared_ptr<result_message>> query_processor::do_execute_direct(service::query_state& query_state, shared_ptr<cql_statement> statement,
const query_options& options, std::optional<service::group0_guard> guard, cql3::cql_warnings_vec warnings) {
future<::shared_ptr<result_message>>
query_processor::do_execute_direct(
service::query_state& query_state,
shared_ptr<cql_statement> statement,
const query_options& options,
std::optional<service::group0_guard> guard,
cql3::cql_warnings_vec warnings) {
auto access_future = co_await coroutine::as_future(statement->check_access(*this, query_state.get_client_state()));
if (access_future.failed()) {
co_await audit::inspect(statement, query_state, options, true);
@@ -671,16 +668,26 @@ future<::shared_ptr<result_message>> query_processor::do_execute_direct(service:
co_return std::move(m);
}
future<::shared_ptr<result_message>> query_processor::execute_prepared_without_checking_exception_message(service::query_state& query_state,
shared_ptr<cql_statement> statement, const query_options& options, statements::prepared_statement::checked_weak_ptr prepared,
cql3::prepared_cache_key_type cache_key, bool needs_authorization) {
return execute_maybe_with_guard(
query_state, std::move(statement), options, &query_processor::do_execute_prepared, std::move(prepared), std::move(cache_key), needs_authorization);
future<::shared_ptr<result_message>>
query_processor::execute_prepared_without_checking_exception_message(
service::query_state& query_state,
shared_ptr<cql_statement> statement,
const query_options& options,
statements::prepared_statement::checked_weak_ptr prepared,
cql3::prepared_cache_key_type cache_key,
bool needs_authorization) {
return execute_maybe_with_guard(query_state, std::move(statement), options, &query_processor::do_execute_prepared, std::move(prepared), std::move(cache_key), needs_authorization);
}
future<::shared_ptr<result_message>> query_processor::do_execute_prepared(service::query_state& query_state, shared_ptr<cql_statement> statement,
const query_options& options, std::optional<service::group0_guard> guard, statements::prepared_statement::checked_weak_ptr prepared,
cql3::prepared_cache_key_type cache_key, bool needs_authorization) {
future<::shared_ptr<result_message>>
query_processor::do_execute_prepared(
service::query_state& query_state,
shared_ptr<cql_statement> statement,
const query_options& options,
std::optional<service::group0_guard> guard,
statements::prepared_statement::checked_weak_ptr prepared,
cql3::prepared_cache_key_type cache_key,
bool needs_authorization) {
if (needs_authorization) {
co_await statement->check_access(*this, query_state.get_client_state());
try {
@@ -694,8 +701,8 @@ future<::shared_ptr<result_message>> query_processor::do_execute_prepared(servic
co_return co_await process_authorized_statement(std::move(statement), query_state, options, std::move(guard));
}
future<::shared_ptr<result_message>> query_processor::process_authorized_statement(const ::shared_ptr<cql_statement> statement,
service::query_state& query_state, const query_options& options, std::optional<service::group0_guard> guard) {
future<::shared_ptr<result_message>>
query_processor::process_authorized_statement(const ::shared_ptr<cql_statement> statement, service::query_state& query_state, const query_options& options, std::optional<service::group0_guard> guard) {
auto& client_state = query_state.get_client_state();
++_stats.queries_by_cl[size_t(options.get_consistency())];
@@ -705,39 +712,43 @@ future<::shared_ptr<result_message>> query_processor::process_authorized_stateme
auto msg = co_await statement->execute_without_checking_exception_message(*this, query_state, options, std::move(guard));
if (msg) {
co_return std::move(msg);
co_return std::move(msg);
}
co_return ::make_shared<result_message::void_message>();
}
future<::shared_ptr<cql_transport::messages::result_message::prepared>> query_processor::prepare(
sstring query_string, service::query_state& query_state, cql3::dialect d) {
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
query_processor::prepare(sstring query_string, service::query_state& query_state, cql3::dialect d) {
auto& client_state = query_state.get_client_state();
return prepare(std::move(query_string), client_state, d);
}
future<::shared_ptr<cql_transport::messages::result_message::prepared>> query_processor::prepare(
sstring query_string, const service::client_state& client_state, cql3::dialect d) {
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
try {
auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
auto prepared = get_statement(query_string, client_state, d);
prepared->calculate_metadata_id();
auto bound_terms = prepared->statement->get_bound_terms();
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
throw exceptions::invalid_request_exception(
format("Too many markers(?). {:d} markers exceed the allowed maximum of {:d}", bound_terms, std::numeric_limits<uint16_t>::max()));
}
throwing_assert(bound_terms == prepared->bound_names.size());
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
});
auto prep_ptr = co_await _prepared_cache.get(key, [this, &query_string, &client_state, d] {
auto prepared = get_statement(query_string, client_state, d);
prepared->calculate_metadata_id();
auto bound_terms = prepared->statement->get_bound_terms();
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
throw exceptions::invalid_request_exception(
format("Too many markers(?). {:d} markers exceed the allowed maximum of {:d}",
bound_terms,
std::numeric_limits<uint16_t>::max()));
}
SCYLLA_ASSERT(bound_terms == prepared->bound_names.size());
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
});
co_await utils::get_local_injector().inject("query_processor_prepare_wait_after_cache_get", utils::wait_for_message(std::chrono::seconds(60)));
auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
co_return std::move(msg);
} catch (typename prepared_statements_cache::statement_is_too_big&) {
const auto& warnings = prep_ptr->warnings;
const auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_ptr),
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
for (const auto& w : warnings) {
msg->add_warning(w);
}
co_return ::shared_ptr<cql_transport::messages::result_message::prepared>(std::move(msg));
} catch(typename prepared_statements_cache::statement_is_too_big&) {
throw prepared_statement_is_too_big(query_string);
}
}
@@ -748,15 +759,15 @@ static std::string hash_target(std::string_view query_string, std::string_view k
return ret;
}
prepared_cache_key_type query_processor::compute_id(std::string_view query_string, std::string_view keyspace, dialect d) {
prepared_cache_key_type query_processor::compute_id(
std::string_view query_string,
std::string_view keyspace,
dialect d) {
return prepared_cache_key_type(md5_hasher::calculate(hash_target(query_string, keyspace)), d);
}
std::unique_ptr<prepared_statement> query_processor::get_statement(const std::string_view& query, const service::client_state& client_state, dialect d) {
// Measuring allocation cost requires that no yield points exist
// between bytes_before and bytes_after. It needs fixing if this
// function is ever futurized.
auto bytes_before = seastar::memory::stats().total_bytes_allocated();
std::unique_ptr<prepared_statement>
query_processor::get_statement(const std::string_view& query, const service::client_state& client_state, dialect d) {
std::unique_ptr<raw::parsed_statement> statement = parse_statement(query, d);
// Set keyspace for statement that require login
@@ -772,12 +783,11 @@ std::unique_ptr<prepared_statement> query_processor::get_statement(const std::st
audit_info->set_query_string(query);
p->statement->sanitize_audit_info();
}
auto bytes_after = seastar::memory::stats().total_bytes_allocated();
_parsing_cost_tracker.add_sample(bytes_after - bytes_before);
return p;
}
std::unique_ptr<raw::parsed_statement> query_processor::parse_statement(const std::string_view& query, dialect d) {
std::unique_ptr<raw::parsed_statement>
query_processor::parse_statement(const std::string_view& query, dialect d) {
try {
{
const char* error_injection_key = "query_processor-parse_statement-test_failure";
@@ -802,7 +812,8 @@ std::unique_ptr<raw::parsed_statement> query_processor::parse_statement(const st
}
}
std::vector<std::unique_ptr<raw::parsed_statement>> query_processor::parse_statements(std::string_view queries, dialect d) {
std::vector<std::unique_ptr<raw::parsed_statement>>
query_processor::parse_statements(std::string_view queries, dialect d) {
try {
auto statements = util::do_with_parser(queries, d, std::mem_fn(&cql3_parser::CqlParser::queries));
if (statements.empty()) {
@@ -831,10 +842,15 @@ std::pair<std::reference_wrapper<struct query_processor::remote>, gate::holder>
on_internal_error(log, "attempted to perform distributed query when `query_processor::remote` is unavailable");
}
query_options query_processor::make_internal_options(const statements::prepared_statement::checked_weak_ptr& p, const std::vector<data_value_or_unset>& values,
db::consistency_level cl, int32_t page_size, service::node_local_only node_local_only) const {
query_options query_processor::make_internal_options(
const statements::prepared_statement::checked_weak_ptr& p,
const std::vector<data_value_or_unset>& values,
db::consistency_level cl,
int32_t page_size,
service::node_local_only node_local_only) const {
if (p->bound_names.size() != values.size()) {
throw std::invalid_argument(format("Invalid number of values. Expecting {:d} but got {:d}", p->bound_names.size(), values.size()));
throw std::invalid_argument(
format("Invalid number of values. Expecting {:d} but got {:d}", p->bound_names.size(), values.size()));
}
auto ni = p->bound_names.begin();
raw_value_vector_with_unset bound_values;
@@ -842,28 +858,32 @@ query_options query_processor::make_internal_options(const statements::prepared_
bound_values.unset.resize(values.size());
for (auto& var : values) {
auto& n = *ni;
std::visit(overloaded_functor{[&](const data_value& v) {
if (v.type() == bytes_type) {
bound_values.values.emplace_back(cql3::raw_value::make_value(value_cast<bytes>(v)));
} else if (v.is_null()) {
bound_values.values.emplace_back(cql3::raw_value::make_null());
} else {
bound_values.values.emplace_back(cql3::raw_value::make_value(n->type->decompose(v)));
}
},
[&](const unset_value&) {
bound_values.values.emplace_back(cql3::raw_value::make_null());
bound_values.unset[std::distance(p->bound_names.begin(), ni)] = true;
}},
var);
std::visit(overloaded_functor {
[&] (const data_value& v) {
if (v.type() == bytes_type) {
bound_values.values.emplace_back(cql3::raw_value::make_value(value_cast<bytes>(v)));
} else if (v.is_null()) {
bound_values.values.emplace_back(cql3::raw_value::make_null());
} else {
bound_values.values.emplace_back(cql3::raw_value::make_value(n->type->decompose(v)));
}
}, [&] (const unset_value&) {
bound_values.values.emplace_back(cql3::raw_value::make_null());
bound_values.unset[std::distance(p->bound_names.begin(), ni)] = true;
}
}, var);
++ni;
}
return query_options(cl, std::move(bound_values),
cql3::query_options::specific_options{.page_size = page_size,
.state = {},
.serial_consistency = db::consistency_level::SERIAL,
.timestamp = api::missing_timestamp,
.node_local_only = node_local_only});
return query_options(
cl,
std::move(bound_values),
cql3::query_options::specific_options {
.page_size = page_size,
.state = {},
.serial_consistency = db::consistency_level::SERIAL,
.timestamp = api::missing_timestamp,
.node_local_only = node_local_only
});
}
statements::prepared_statement::checked_weak_ptr query_processor::prepare_internal(const sstring& query_string) {
@@ -885,7 +905,11 @@ struct internal_query_state {
};
internal_query_state query_processor::create_paged_state(
const sstring& query_string, db::consistency_level cl, const data_value_list& values, int32_t page_size, std::optional<service::query_state> qs) {
const sstring& query_string,
db::consistency_level cl,
const data_value_list& values,
int32_t page_size,
std::optional<service::query_state> qs) {
auto p = prepare_internal(query_string);
auto opts = make_internal_options(p, values, cl, page_size);
if (!qs) {
@@ -899,7 +923,8 @@ bool query_processor::has_more_results(cql3::internal_query_state& state) const
}
future<> query_processor::for_each_cql_result(
cql3::internal_query_state& state, noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)> f) {
cql3::internal_query_state& state,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set::row&)> f) {
do {
auto msg = co_await execute_paged_internal(state);
for (auto& row : *msg) {
@@ -910,18 +935,17 @@ future<> query_processor::for_each_cql_result(
} while (has_more_results(state));
}
future<::shared_ptr<untyped_result_set>> query_processor::execute_paged_internal(internal_query_state& state) {
future<::shared_ptr<untyped_result_set>>
query_processor::execute_paged_internal(internal_query_state& state) {
state.p->statement->validate(*this, service::client_state::for_internal_calls());
::shared_ptr<cql_transport::messages::result_message> msg = co_await state.p->statement->execute(*this, *state.qs, *state.opts, std::nullopt);
::shared_ptr<cql_transport::messages::result_message> msg =
co_await state.p->statement->execute(*this, *state.qs, *state.opts, std::nullopt);
class visitor : public result_message::visitor_base {
internal_query_state& _state;
query_processor& _qp;
public:
visitor(internal_query_state& state, query_processor& qp)
: _state(state)
, _qp(qp) {
visitor(internal_query_state& state, query_processor& qp) : _state(state), _qp(qp) {
}
virtual ~visitor() = default;
void visit(const result_message::rows& rmrs) override {
@@ -950,14 +974,23 @@ future<::shared_ptr<untyped_result_set>> query_processor::execute_paged_internal
co_return ::make_shared<untyped_result_set>(msg);
}
future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
const sstring& query_string, db::consistency_level cl, const data_value_list& values, cache_internal cache) {
future<::shared_ptr<untyped_result_set>>
query_processor::execute_internal(
const sstring& query_string,
db::consistency_level cl,
const data_value_list& values,
cache_internal cache) {
auto qs = query_state_for_internal_call();
co_return co_await execute_internal(query_string, cl, qs, values, cache);
}
future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
const sstring& query_string, db::consistency_level cl, service::query_state& query_state, const data_value_list& values, cache_internal cache) {
future<::shared_ptr<untyped_result_set>>
query_processor::execute_internal(
const sstring& query_string,
db::consistency_level cl,
service::query_state& query_state,
const data_value_list& values,
cache_internal cache) {
if (log.is_enabled(logging::log_level::trace)) {
log.trace("execute_internal: {}\"{}\" ({})", cache ? "(cached) " : "", query_string, fmt::join(values, ", "));
@@ -975,7 +1008,10 @@ future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
}
future<utils::chunked_vector<mutation>> query_processor::get_mutations_internal(
const sstring query_string, service::query_state& query_state, api::timestamp_type timestamp, std::vector<data_value_or_unset> values) {
const sstring query_string,
service::query_state& query_state,
api::timestamp_type timestamp,
std::vector<data_value_or_unset> values) {
log.debug("get_mutations_internal: \"{}\" ({})", query_string, fmt::join(values, ", "));
auto stmt = prepare_internal(query_string);
auto mod_stmt = dynamic_pointer_cast<cql3::statements::modification_statement>(stmt->statement);
@@ -993,8 +1029,12 @@ future<utils::chunked_vector<mutation>> query_processor::get_mutations_internal(
co_return co_await mod_stmt->get_mutations(*this, opts, timeout, true, timestamp, query_state, json_cache, std::move(keys));
}
future<::shared_ptr<untyped_result_set>> query_processor::execute_with_params(
statements::prepared_statement::checked_weak_ptr p, db::consistency_level cl, service::query_state& query_state, const data_value_list& values) {
future<::shared_ptr<untyped_result_set>>
query_processor::execute_with_params(
statements::prepared_statement::checked_weak_ptr p,
db::consistency_level cl,
service::query_state& query_state,
const data_value_list& values) {
auto opts = make_internal_options(p, values, cl);
auto statement = p->statement;
@@ -1002,24 +1042,30 @@ future<::shared_ptr<untyped_result_set>> query_processor::execute_with_params(
co_return ::make_shared<untyped_result_set>(msg);
}
future<::shared_ptr<result_message>> query_processor::do_execute_with_params(
service::query_state& query_state, shared_ptr<cql_statement> statement, const query_options& options, std::optional<service::group0_guard> guard) {
future<::shared_ptr<result_message>>
query_processor::do_execute_with_params(
service::query_state& query_state,
shared_ptr<cql_statement> statement,
const query_options& options, std::optional<service::group0_guard> guard) {
statement->validate(*this, service::client_state::for_internal_calls());
co_return co_await coroutine::try_future(statement->execute(*this, query_state, options, std::move(guard)));
}
future<::shared_ptr<cql_transport::messages::result_message>> query_processor::execute_batch_without_checking_exception_message(
::shared_ptr<statements::batch_statement> batch, service::query_state& query_state, query_options& options,
future<::shared_ptr<cql_transport::messages::result_message>>
query_processor::execute_batch_without_checking_exception_message(
::shared_ptr<statements::batch_statement> batch,
service::query_state& query_state,
query_options& options,
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
auto access_future = co_await coroutine::as_future(batch->check_access(*this, query_state.get_client_state()));
co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state](auto& e) -> future<> {
try {
co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
} catch (...) {
log.error("failed to cache the entry: {}", std::current_exception());
}
});
co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state] (auto& e) -> future<> {
try {
co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
} catch (...) {
log.error("failed to cache the entry: {}", std::current_exception());
}
});
bool failed = access_future.failed();
co_await audit::inspect(batch, query_state, options, failed);
if (access_future.failed()) {
@@ -1028,28 +1074,30 @@ future<::shared_ptr<cql_transport::messages::result_message>> query_processor::e
batch->validate();
batch->validate(*this, query_state.get_client_state());
_stats.queries_by_cl[size_t(options.get_consistency())] += batch->get_statements().size();
if (log.is_enabled(logging::log_level::trace)) {
if (log.is_enabled(logging::log_level::trace)) {
std::ostringstream oss;
for (const auto& s : batch->get_statements()) {
oss << std::endl << s.statement->raw_cql_statement;
for (const auto& s: batch->get_statements()) {
oss << std::endl << s.statement->raw_cql_statement;
}
log.trace("execute_batch({}): {}", batch->get_statements().size(), oss.str());
}
co_return co_await batch->execute(*this, query_state, options, std::nullopt);
}
future<service::broadcast_tables::query_result> query_processor::execute_broadcast_table_query(const service::broadcast_tables::query& query) {
future<service::broadcast_tables::query_result>
query_processor::execute_broadcast_table_query(const service::broadcast_tables::query& query) {
auto [remote_, holder] = remote();
co_return co_await service::broadcast_tables::execute(remote_.get().group0_client, query);
}
future<query::mapreduce_result> query_processor::mapreduce(query::mapreduce_request req, tracing::trace_state_ptr tr_state) {
future<query::mapreduce_result>
query_processor::mapreduce(query::mapreduce_request req, tracing::trace_state_ptr tr_state) {
auto [remote_, holder] = remote();
co_return co_await remote_.get().mapreducer.dispatch(std::move(req), std::move(tr_state));
}
future<::shared_ptr<messages::result_message>> query_processor::execute_schema_statement(
const statements::schema_altering_statement& stmt, service::query_state& state, const query_options& options, service::group0_batch& mc) {
future<::shared_ptr<messages::result_message>>
query_processor::execute_schema_statement(const statements::schema_altering_statement& stmt, service::query_state& state, const query_options& options, service::group0_batch& mc) {
if (this_shard_id() != 0) {
on_internal_error(log, "DDL must be executed on shard 0");
}
@@ -1103,8 +1151,7 @@ future<> query_processor::announce_schema_statement(const statements::schema_alt
co_await remote_.get().mm.announce(std::move(m), std::move(guard), description);
}
query_processor::migration_subscriber::migration_subscriber(query_processor* qp)
: _qp{qp} {
query_processor::migration_subscriber::migration_subscriber(query_processor* qp) : _qp{qp} {
}
void query_processor::migration_subscriber::on_create_keyspace(const sstring& ks_name) {
@@ -1130,7 +1177,10 @@ void query_processor::migration_subscriber::on_create_view(const sstring& ks_nam
void query_processor::migration_subscriber::on_update_keyspace(const sstring& ks_name) {
}
void query_processor::migration_subscriber::on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool columns_changed) {
void query_processor::migration_subscriber::on_update_column_family(
const sstring& ks_name,
const sstring& cf_name,
bool columns_changed) {
// #1255: Ignoring columns_changed deliberately.
log.info("Column definitions for {}.{} changed, invalidating related prepared statements", ks_name, cf_name);
remove_invalid_prepared_statements(ks_name, cf_name);
@@ -1145,7 +1195,9 @@ void query_processor::migration_subscriber::on_update_function(const sstring& ks
void query_processor::migration_subscriber::on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) {
}
void query_processor::migration_subscriber::on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) {
void query_processor::migration_subscriber::on_update_view(
const sstring& ks_name,
const sstring& view_name, bool columns_changed) {
// scylladb/scylladb#16392 - Materialized views are also tables so we need at least handle
// them as such when changed.
on_update_column_family(ks_name, view_name, columns_changed);
@@ -1174,39 +1226,41 @@ void query_processor::migration_subscriber::on_drop_view(const sstring& ks_name,
remove_invalid_prepared_statements(ks_name, view_name);
}
void query_processor::migration_subscriber::remove_invalid_prepared_statements(sstring ks_name, std::optional<sstring> cf_name) {
_qp->_prepared_cache.remove_if([&](::shared_ptr<cql_statement> stmt) {
void query_processor::migration_subscriber::remove_invalid_prepared_statements(
sstring ks_name,
std::optional<sstring> cf_name) {
_qp->_prepared_cache.remove_if([&] (::shared_ptr<cql_statement> stmt) {
return this->should_invalidate(ks_name, cf_name, stmt);
});
}
bool query_processor::migration_subscriber::should_invalidate(sstring ks_name, std::optional<sstring> cf_name, ::shared_ptr<cql_statement> statement) {
bool query_processor::migration_subscriber::should_invalidate(
sstring ks_name,
std::optional<sstring> cf_name,
::shared_ptr<cql_statement> statement) {
return statement->depends_on(ks_name, cf_name);
}
future<> query_processor::query_internal(const sstring& query_string, db::consistency_level cl, const data_value_list& values, int32_t page_size,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f, std::optional<service::query_state> qs) {
future<> query_processor::query_internal(
const sstring& query_string,
db::consistency_level cl,
const data_value_list& values,
int32_t page_size,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f,
std::optional<service::query_state> qs) {
auto query_state = create_paged_state(query_string, cl, values, page_size, std::move(qs));
co_return co_await for_each_cql_result(query_state, std::move(f));
}
future<> query_processor::query_internal(const sstring& query_string, noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
future<> query_processor::query_internal(
const sstring& query_string,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
return query_internal(query_string, db::consistency_level::ONE, {}, 1000, std::move(f));
}
// Builds a "bounce" result message telling the caller to retry this request
// on the given shard of the local node, carrying over already-computed
// function-call values so they are not re-evaluated. When `track` is set,
// the retry is counted in the replica cross-shard operation statistics.
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_shard(
        unsigned shard, cql3::computed_function_values cached_fn_calls, bool track) {
    if (track) {
        ++_proxy.get_stats().replica_cross_shard_ops;
    }
    // Keep the token-metadata pointer alive while we read the topology.
    auto tm = _proxy.get_token_metadata_ptr();
    const auto local_host = tm->get_topology().my_host_id();
    return ::make_shared<cql_transport::messages::result_message::bounce>(
            local_host, shard, std::move(cached_fn_calls));
}
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_node(
locator::tablet_replica replica, cql3::computed_function_values cached_fn_calls, seastar::lowres_clock::time_point timeout, bool is_write) {
get_cql_stats().forwarded_requests++;
return ::make_shared<cql_transport::messages::result_message::bounce>(replica.host, replica.shard, std::move(cached_fn_calls), timeout, is_write);
// Produces a result message instructing the caller to re-execute this
// request on another shard of the local node, forwarding any function-call
// values already computed. Always bumps the cross-shard operation counter.
shared_ptr<cql_transport::messages::result_message> query_processor::bounce_to_shard(unsigned shard, cql3::computed_function_values cached_fn_calls) {
    ++_proxy.get_stats().replica_cross_shard_ops;
    auto msg = ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(shard, std::move(cached_fn_calls));
    return msg;
}
query_processor::consistency_level_set query_processor::to_consistency_level_set(const query_processor::cl_option_list& levels) {
@@ -1221,7 +1275,7 @@ void query_processor::update_authorized_prepared_cache_config() {
utils::loading_cache_config cfg;
cfg.max_size = _mcfg.authorized_prepared_cache_size;
cfg.expiry = std::min(std::chrono::milliseconds(_db.get_config().permissions_validity_in_ms()),
std::chrono::duration_cast<std::chrono::milliseconds>(prepared_statements_cache::entry_expiry));
std::chrono::duration_cast<std::chrono::milliseconds>(prepared_statements_cache::entry_expiry));
cfg.refresh = std::chrono::milliseconds(_db.get_config().permissions_update_interval_in_ms());
if (!_authorized_prepared_cache.update_config(std::move(cfg))) {
@@ -1233,4 +1287,4 @@ void query_processor::reset_cache() {
_authorized_prepared_cache.reset();
}
} // namespace cql3
}

View File

@@ -31,9 +31,9 @@
#include "vector_search/vector_store_client.hh"
#include "utils/assert.hh"
#include "utils/observable.hh"
#include "utils/rolling_max_tracker.hh"
#include "service/raft/raft_group0_client.hh"
#include "types/types.hh"
#include "db/auth_version.hh"
#include "db/consistency_level_type.hh"
#include "db/config.hh"
#include "utils/enum_option.hh"
@@ -135,9 +135,6 @@ private:
prepared_statements_cache _prepared_cache;
authorized_prepared_statements_cache _authorized_prepared_cache;
// Tracks the rolling maximum of gross bytes allocated during CQL parsing
utils::rolling_max_tracker _parsing_cost_tracker{1000};
std::function<void(uint32_t)> _auth_prepared_cache_cfg_cb;
serialized_action _authorized_prepared_cache_config_action;
utils::observer<uint32_t> _authorized_prepared_cache_update_interval_in_ms_observer;
@@ -216,11 +213,6 @@ public:
return _cql_stats;
}
/// Returns the estimated peak memory cost of CQL parsing.
size_t parsing_cost_estimate() const noexcept {
return _parsing_cost_tracker.current_max();
}
lang::manager& lang() { return _lang_manager; }
const vector_search::vector_store_client& vector_store_client() const noexcept {
@@ -231,6 +223,8 @@ public:
return _vector_store_client;
}
db::auth_version_t auth_version;
statements::prepared_statement::checked_weak_ptr get_prepared(const std::optional<auth::authenticated_user>& user, const prepared_cache_key_type& key) {
if (user) {
auto vp = _authorized_prepared_cache.find(*user, key);
@@ -510,12 +504,7 @@ public:
friend class migration_subscriber;
shared_ptr<cql_transport::messages::result_message> bounce_to_shard(unsigned shard, cql3::computed_function_values cached_fn_calls, bool track = true);
shared_ptr<cql_transport::messages::result_message> bounce_to_node(
locator::tablet_replica replica,
cql3::computed_function_values cached_fn_calls,
seastar::lowres_clock::time_point timeout,
bool is_write);
shared_ptr<cql_transport::messages::result_message> bounce_to_shard(unsigned shard, cql3::computed_function_values cached_fn_calls);
void update_authorized_prepared_cache_config();

View File

@@ -89,10 +89,10 @@ public:
*/
void merge(const bounds_slice& other) {
if (has_bound(statements::bound::START)) {
throwing_assert(!other.has_bound(statements::bound::START));
SCYLLA_ASSERT(!other.has_bound(statements::bound::START));
_bounds[get_idx(statements::bound::END)] = other._bounds[get_idx(statements::bound::END)];
} else {
throwing_assert(!other.has_bound(statements::bound::END));
SCYLLA_ASSERT(!other.has_bound(statements::bound::END));
_bounds[get_idx(statements::bound::START)] = other._bounds[get_idx(statements::bound::START)];
}
}

View File

@@ -61,7 +61,7 @@ void metadata::set_paging_state(lw_shared_ptr<const service::pager::paging_state
}
void metadata::maybe_set_paging_state(lw_shared_ptr<const service::pager::paging_state> paging_state) {
throwing_assert(paging_state);
SCYLLA_ASSERT(paging_state);
if (paging_state->get_remaining() > 0) {
set_paging_state(std::move(paging_state));
} else {
@@ -138,7 +138,7 @@ bool result_set::empty() const {
}
void result_set::add_row(std::vector<managed_bytes_opt> row) {
throwing_assert(row.size() == _metadata->value_count());
SCYLLA_ASSERT(row.size() == _metadata->value_count());
_rows.emplace_back(std::move(row));
}

View File

@@ -356,7 +356,7 @@ public:
add_value(*def, static_row_iterator);
break;
default:
throwing_assert(0);
SCYLLA_ASSERT(0);
}
}
_builder.complete_row();

View File

@@ -34,7 +34,7 @@ sets::setter::execute(mutation& m, const clustering_key_prefix& row_key, const u
void
sets::adder::execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) {
const cql3::raw_value value = expr::evaluate(*_e, params._options);
throwing_assert(column.type->is_multi_cell()); // "Attempted to add items to a frozen set";
SCYLLA_ASSERT(column.type->is_multi_cell()); // "Attempted to add items to a frozen set";
do_add(m, row_key, params, value, column);
}
@@ -77,7 +77,7 @@ sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const upd
void
sets::discarder::execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) {
throwing_assert(column.type->is_multi_cell()); // "Attempted to remove items from a frozen set";
SCYLLA_ASSERT(column.type->is_multi_cell()); // "Attempted to remove items from a frozen set";
cql3::raw_value svalue = expr::evaluate(*_e, params._options);
if (svalue.is_null()) {
@@ -98,7 +98,7 @@ sets::discarder::execute(mutation& m, const clustering_key_prefix& row_key, cons
void sets::element_discarder::execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params)
{
throwing_assert(column.type->is_multi_cell() && "Attempted to remove items from a frozen set");
SCYLLA_ASSERT(column.type->is_multi_cell() && "Attempted to remove items from a frozen set");
cql3::raw_value elt = expr::evaluate(*_e, params._options);
if (elt.is_null()) {
throw exceptions::invalid_request_exception("Invalid null set element");

View File

@@ -296,7 +296,7 @@ void alter_table_statement::drop_column(const query_options& options, const sche
std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_schema_update(data_dictionary::database db, const query_options& options) const {
auto s = validation::validate_column_family(db, keyspace(), column_family());
if (s->is_view()) {
throw exceptions::invalid_request_exception("Cannot use ALTER TABLE on Materialized View. (Did you mean ALTER MATERIALIZED VIEW)?");
throw exceptions::invalid_request_exception("Cannot use ALTER TABLE on Materialized View");
}
const bool is_cdc_log_table = cdc::is_log_for_some_table(db.real_database(), s->ks_name(), s->cf_name());
@@ -368,7 +368,7 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
switch (_type) {
case alter_table_statement::type::add:
throwing_assert(_column_changes.size());
SCYLLA_ASSERT(_column_changes.size());
if (s->is_dense()) {
throw exceptions::invalid_request_exception("Cannot add new column to a COMPACT STORAGE table");
}
@@ -376,12 +376,12 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
break;
case alter_table_statement::type::alter:
throwing_assert(_column_changes.size() == 1);
SCYLLA_ASSERT(_column_changes.size() == 1);
invoke_column_change_fn(std::mem_fn(&alter_table_statement::alter_column));
break;
case alter_table_statement::type::drop:
throwing_assert(_column_changes.size());
SCYLLA_ASSERT(_column_changes.size());
if (!s->is_cql3_table()) {
throw exceptions::invalid_request_exception("Cannot drop columns from a non-CQL3 table");
}

View File

@@ -46,7 +46,7 @@ future<> alter_view_statement::check_access(query_processor& qp, const service::
view_ptr alter_view_statement::prepare_view(data_dictionary::database db) const {
schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
if (!schema->is_view()) {
throw exceptions::invalid_request_exception("Cannot use ALTER MATERIALIZED VIEW on Table. (Did you mean ALTER TABLE)?");
throw exceptions::invalid_request_exception("Cannot use ALTER MATERIALIZED VIEW on Table");
}
if (!_properties) {

View File

@@ -25,7 +25,7 @@ attach_service_level_statement::attach_service_level_statement(sstring service_l
}
bool attach_service_level_statement::needs_guard(query_processor& qp, service::query_state& state) const {
return true;
return !auth::legacy_mode(qp) || state.get_service_level_controller().is_v2();
}
std::unique_ptr<cql3::statements::prepared_statement>

View File

@@ -11,6 +11,7 @@
#include "authentication_statement.hh"
#include "transport/messages/result_message.hh"
#include "cql3/query_processor.hh"
#include "auth/common.hh"
uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
return 0;
@@ -25,7 +26,7 @@ future<> cql3::statements::authentication_statement::check_access(query_processo
}
bool cql3::statements::authentication_altering_statement::needs_guard(query_processor& qp, service::query_state&) const {
return true;
return !auth::legacy_mode(qp);
}
audit::statement_category cql3::statements::authentication_statement::category() const {

View File

@@ -14,6 +14,7 @@
#include "cql3/query_processor.hh"
#include "exceptions/exceptions.hh"
#include "db/cql_type_parser.hh"
#include "auth/common.hh"
uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
return 0;
@@ -73,7 +74,7 @@ void cql3::statements::authorization_statement::maybe_correct_resource(auth::res
bool cql3::statements::authorization_altering_statement::needs_guard(
query_processor& qp, service::query_state&) const {
return true;
return !auth::legacy_mode(qp);
};
audit::statement_category cql3::statements::authorization_statement::category() const {

View File

@@ -265,10 +265,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
return make_exception_future<shared_ptr<cql_transport::messages::result_message>>(
exceptions::invalid_request_exception(
format("Write consistency level {} is forbidden by the current configuration "
"setting of write_consistency_levels_disallowed. Please use a different "
"consistency level, or remove {} from write_consistency_levels_disallowed "
"set in the configuration.", cl, cl)));
format("Consistency level {} is not allowed for write operations", cl)));
}
for (size_t i = 0; i < _statements.size(); ++i) {
@@ -280,8 +277,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
_stats.statements_in_cas_batches += _statements.size();
return execute_with_conditions(qp, options, query_state).then([guardrail_state, cl] (auto result) {
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
}
return result;
});
@@ -301,8 +297,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
}
auto result = make_shared<cql_transport::messages::result_message::void_message>();
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
}
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(result));
});

View File

@@ -20,7 +20,6 @@
#include "cql3/attributes.hh"
#include "cql3/expr/expression.hh"
#include "cql3/expr/evaluate.hh"
#include "cql3/query_options.hh"
#include "cql3/query_processor.hh"
#include "cql3/values.hh"
#include "timeout_config.hh"
@@ -66,7 +65,7 @@ evaluate_prepared(
future<::shared_ptr<cql_transport::messages::result_message>>
broadcast_modification_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
if (this_shard_id() != 0) {
co_return qp.bounce_to_shard(0, cql3::computed_function_values{}, false);
co_return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(0, cql3::computed_function_values{});
}
auto result = co_await qp.execute_broadcast_table_query(

View File

@@ -96,7 +96,7 @@ evaluate_prepared(
future<::shared_ptr<cql_transport::messages::result_message>>
broadcast_select_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
if (this_shard_id() != 0) {
co_return qp.bounce_to_shard(0, cql3::computed_function_values{}, false);
co_return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(0, cql3::computed_function_values{});
}
auto result = co_await qp.execute_broadcast_table_query(

View File

@@ -51,7 +51,7 @@ public:
, _key(std::move(key_arg))
, _rows(schema_arg)
{
throwing_assert(_key.size() == 1 && query::is_single_partition(_key.front()));
SCYLLA_ASSERT(_key.size() == 1 && query::is_single_partition(_key.front()));
}
dht::partition_range_vector key() const {

View File

@@ -59,8 +59,6 @@ const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
const sstring cf_prop_defs::KW_TABLETS = "tablets";
const sstring cf_prop_defs::KW_STORAGE_ENGINE = "storage_engine";
schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions& exts) const {
schema::extensions_map er;
for (auto& p : exts.schema_extensions()) {
@@ -108,7 +106,6 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
KW_BF_FP_CHANCE, KW_MEMTABLE_FLUSH_PERIOD, KW_COMPACTION,
KW_COMPRESSION, KW_CRC_CHECK_CHANCE, KW_ID, KW_PAXOSGRACESECONDS,
KW_SYNCHRONOUS_UPDATES, KW_TABLETS,
KW_STORAGE_ENGINE,
});
static std::set<sstring> obsolete_keywords({
sstring("index_interval"),
@@ -199,20 +196,6 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
}
db::tablet_options::validate(*tablet_options_map);
}
if (has_property(KW_STORAGE_ENGINE)) {
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
if (storage_engine == "logstor") {
if (!db.features().logstor) {
throw exceptions::configuration_exception(format("The experimental feature 'logstor' must be enabled in order to use the 'logstor' storage engine."));
}
if (!db.get_config().enable_logstor()) {
throw exceptions::configuration_exception(format("The configuration option 'enable_logstor' must be set to true in the configuration in order to use the 'logstor' storage engine."));
}
} else {
throw exceptions::configuration_exception(format("Illegal value for '{}'", KW_STORAGE_ENGINE));
}
}
}
std::map<sstring, sstring> cf_prop_defs::get_compaction_type_options() const {
@@ -413,13 +396,6 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
if (auto tablet_options_opt = get_map(KW_TABLETS)) {
builder.set_tablet_options(std::move(*tablet_options_opt));
}
if (has_property(KW_STORAGE_ENGINE)) {
auto storage_engine = get_string(KW_STORAGE_ENGINE, "");
if (storage_engine == "logstor") {
builder.set_logstor();
}
}
}
void cf_prop_defs::validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const

View File

@@ -64,8 +64,6 @@ public:
static const sstring KW_TABLETS;
static const sstring KW_STORAGE_ENGINE;
// FIXME: In origin the following consts are in CFMetaData.
static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
static constexpr int32_t DEFAULT_MIN_INDEX_INTERVAL = 128;

View File

@@ -41,13 +41,13 @@ void cf_statement::prepare_keyspace(std::string_view keyspace)
}
bool cf_statement::has_keyspace() const {
throwing_assert(_cf_name.has_value());
SCYLLA_ASSERT(_cf_name.has_value());
return _cf_name->has_keyspace();
}
const sstring& cf_statement::keyspace() const
{
throwing_assert(_cf_name->has_keyspace()); // "The statement hasn't be prepared correctly";
SCYLLA_ASSERT(_cf_name->has_keyspace()); // "The statement hasn't be prepared correctly";
return _cf_name->get_keyspace();
}

View File

@@ -9,7 +9,6 @@
*/
#include "cql3/statements/cf_prop_defs.hh"
#include "utils/assert.hh"
#include <inttypes.h>
#include <boost/regex.hpp>
@@ -140,7 +139,7 @@ void create_table_statement::apply_properties_to(schema_builder& builder, const
void create_table_statement::add_column_metadata_from_aliases(schema_builder& builder, std::vector<bytes> aliases, const std::vector<data_type>& types, column_kind kind) const
{
throwing_assert(aliases.size() == types.size());
SCYLLA_ASSERT(aliases.size() == types.size());
for (size_t i = 0; i < aliases.size(); i++) {
if (!aliases[i].empty()) {
builder.with_column(aliases[i], types[i], kind);
@@ -152,7 +151,7 @@ std::unique_ptr<prepared_statement>
create_table_statement::prepare(data_dictionary::database db, cql_stats& stats) {
// Cannot happen; create_table_statement is never instantiated as a raw statement
// (instead we instantiate create_table_statement::raw_statement)
throwing_assert(0 && "create_table_statement::prepare");
abort();
}
future<> create_table_statement::grant_permissions_to_creator(const service::client_state& cs, service::group0_batch& mc) const {
@@ -240,7 +239,7 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
for (auto&& inner: type->all_types()) {
if (inner->is_multi_cell()) {
// a nested non-frozen UDT should have already been rejected when defining the type
throwing_assert(inner->is_collection());
SCYLLA_ASSERT(inner->is_collection());
throw exceptions::invalid_request_exception("Non-frozen UDTs with nested non-frozen collections are not supported");
}
}
@@ -267,13 +266,6 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
stmt_warning("CREATE TABLE WITH COMPACT STORAGE is deprecated and will eventually be removed in a future version.");
}
if (_properties.properties()->has_property(cf_prop_defs::KW_STORAGE_ENGINE)) {
auto storage_engine = _properties.properties()->get_string(cf_prop_defs::KW_STORAGE_ENGINE, "");
if (storage_engine == "logstor" && !_column_aliases.empty()) {
throw exceptions::configuration_exception("The 'logstor' storage engine cannot be used with tables that have clustering columns");
}
}
auto& key_aliases = _key_aliases[0];
std::vector<data_type> key_types;
for (auto&& alias : key_aliases) {

View File

@@ -63,7 +63,7 @@ future<> create_view_statement::check_access(query_processor& qp, const service:
static const column_definition* get_column_definition(const schema& schema, column_identifier::raw& identifier) {
auto prepared = identifier.prepare(schema);
throwing_assert(dynamic_pointer_cast<column_identifier>(prepared));
SCYLLA_ASSERT(dynamic_pointer_cast<column_identifier>(prepared));
auto id = static_pointer_cast<column_identifier>(prepared);
return schema.get_column_definition(id->name());
}

View File

@@ -103,7 +103,7 @@ delete_statement::delete_statement(cf_name name,
, _deletions(std::move(deletions))
, _where_clause(std::move(where_clause))
{
throwing_assert(!_attrs->time_to_live.has_value());
SCYLLA_ASSERT(!_attrs->time_to_live.has_value());
}
}

View File

@@ -307,7 +307,7 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
auto s = validation::validate_column_family(db, ks, name);
if (s->is_view()) {
throw exceptions::invalid_request_exception("Cannot use DESC TABLE on materialized View. (Did you mean DESC MATERIALIZED VIEW)?");
throw exceptions::invalid_request_exception("Cannot use DESC TABLE on materialized View");
}
auto schema = table->schema();

View File

@@ -23,7 +23,7 @@ detach_service_level_statement::detach_service_level_statement(sstring role_name
}
bool detach_service_level_statement::needs_guard(query_processor& qp, service::query_state&) const {
return true;
return !auth::legacy_mode(qp);
}
std::unique_ptr<cql3::statements::prepared_statement>

View File

@@ -63,7 +63,7 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_w
}
}
{
if (!auth::legacy_mode(qp)) {
const auto& as = *state.get_client_state().get_auth_service();
co_await auth::revoke_all(as, auth::make_data_resource(_keyspace), mc);
co_await auth::revoke_all(as, auth::make_functions_resource(_keyspace), mc);

View File

@@ -58,7 +58,7 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_w
}
}
{
if (!auth::legacy_mode(qp)) {
const auto& as = *state.get_client_state().get_auth_service();
co_await auth::revoke_all(as, auth::make_data_resource(keyspace(), column_family()), mc);
}

View File

@@ -81,7 +81,7 @@ list_effective_service_level_statement::execute(query_processor& qp, service::qu
}
auto& sl_controller = state.get_service_level_controller();
auto slo = sl_controller.find_cached_effective_service_level(_role_name);
auto slo = co_await sl_controller.find_effective_service_level(_role_name);
if (!slo) {
throw exceptions::invalid_request_exception(format("Role {} doesn't have assigned any service level", _role_name));

View File

@@ -16,7 +16,6 @@
#include "auth/common.hh"
#include "cql3/result_set.hh"
#include "cql3/column_identifier.hh"
#include "db/system_keyspace.hh"
#include "transport/messages/result_message.hh"
cql3::statements::list_permissions_statement::list_permissions_statement(
@@ -81,9 +80,9 @@ cql3::statements::list_permissions_statement::execute(
service::query_state& state,
const query_options& options,
std::optional<service::group0_guard> guard) const {
auto make_column = [](sstring name) {
auto make_column = [auth_ks = auth::get_auth_ks_name(qp)](sstring name) {
return make_lw_shared<column_specification>(
db::system_keyspace::NAME,
auth_ks,
"permissions",
::make_shared<column_identifier>(std::move(name), true),
utf8_type);

View File

@@ -14,7 +14,6 @@
#include "cql3/query_options.hh"
#include "cql3/column_identifier.hh"
#include "auth/common.hh"
#include "db/system_keyspace.hh"
#include "transport/messages/result_message.hh"
std::unique_ptr<cql3::statements::prepared_statement> cql3::statements::list_users_statement::prepare(
@@ -31,9 +30,9 @@ future<::shared_ptr<cql_transport::messages::result_message>>
cql3::statements::list_users_statement::execute(query_processor& qp, service::query_state& state, const query_options& options, std::optional<service::group0_guard> guard) const {
static const sstring virtual_table_name("users");
const auto make_column_spec = [](const sstring& name, const ::shared_ptr<const abstract_type>& ty) {
const auto make_column_spec = [auth_ks = auth::get_auth_ks_name(qp)](const sstring& name, const ::shared_ptr<const abstract_type>& ty) {
return make_lw_shared<column_specification>(
db::system_keyspace::NAME,
auth_ks,
virtual_table_name,
::make_shared<column_identifier>(name, true),
ty);

View File

@@ -273,10 +273,7 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
if (guardrail_state == query_processor::write_consistency_guardrail_state::FAIL) {
co_return coroutine::exception(
std::make_exception_ptr(exceptions::invalid_request_exception(
format("Write consistency level {} is forbidden by the current configuration "
"setting of write_consistency_levels_disallowed. Please use a different "
"consistency level, or remove {} from write_consistency_levels_disallowed "
"set in the configuration.", cl, cl))));
format("Consistency level {} is not allowed for write operations", cl))));
}
_restrictions->validate_primary_key(options);
@@ -284,8 +281,7 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
if (has_conditions()) {
auto result = co_await execute_with_condition(qp, qs, options);
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
}
co_return result;
}
@@ -307,8 +303,7 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
auto result = seastar::make_shared<cql_transport::messages::result_message::void_message>();
if (guardrail_state == query_processor::write_consistency_guardrail_state::WARN) {
result->add_warning(format("Using write consistency level {} listed on the "
"write_consistency_levels_warned is not recommended.", cl));
result->add_warning(format("Write with consistency level {} is warned by guardrail configuration", cl));
}
if (keys_size_one) {
auto&& table = s->table();
@@ -376,6 +371,7 @@ process_forced_rebounce(unsigned shard, query_processor& qp, const query_options
// On the last iteration, re-bounce to the correct shard.
if (counter != 0) {
const auto shard_num = smp::count;
assert(shard_num > 0);
const auto local_shard = this_shard_id();
auto target_shard = local_shard + 1;
if (target_shard == shard) {
@@ -509,7 +505,7 @@ modification_statement::process_where_clause(data_dictionary::database db, expr:
* partition to check conditions.
*/
if (_if_exists || _if_not_exists) {
throwing_assert(!_has_static_column_conditions && !_has_regular_column_conditions);
SCYLLA_ASSERT(!_has_static_column_conditions && !_has_regular_column_conditions);
if (s->has_static_columns() && !_restrictions->has_clustering_columns_restriction()) {
_has_static_column_conditions = true;
} else {
@@ -695,13 +691,13 @@ modification_statement::prepare_conditions(data_dictionary::database db, const s
if (_if_not_exists) {
// To have both 'IF NOT EXISTS' and some other conditions doesn't make sense.
// So far this is enforced by the parser, but let's throwing_assert it for sanity if ever the parse changes.
throwing_assert(!_conditions);
throwing_assert(!_if_exists);
// So far this is enforced by the parser, but let's SCYLLA_ASSERT it for sanity if ever the parse changes.
SCYLLA_ASSERT(!_conditions);
SCYLLA_ASSERT(!_if_exists);
stmt.set_if_not_exist_condition();
} else if (_if_exists) {
throwing_assert(!_conditions);
throwing_assert(!_if_not_exists);
SCYLLA_ASSERT(!_conditions);
SCYLLA_ASSERT(!_if_not_exists);
stmt.set_if_exist_condition();
} else {
stmt._condition = column_condition_prepare(*_conditions, db, keyspace(), schema);

View File

@@ -20,7 +20,6 @@
#include "cql3/column_specification.hh"
#include "cql3/column_identifier.hh"
#include "cql3/query_processor.hh"
#include "db/system_keyspace.hh"
#include "cql3/statements/alter_role_statement.hh"
#include "cql3/statements/create_role_statement.hh"
#include "cql3/statements/drop_role_statement.hh"
@@ -379,9 +378,9 @@ future<result_message_ptr>
list_roles_statement::execute(query_processor& qp, service::query_state& state, const query_options&, std::optional<service::group0_guard> guard) const {
static const sstring virtual_table_name("roles");
const auto make_column_spec = [](const sstring& name, const ::shared_ptr<const abstract_type>& ty) {
const auto make_column_spec = [auth_ks = auth::get_auth_ks_name(qp)](const sstring& name, const ::shared_ptr<const abstract_type>& ty) {
return make_lw_shared<column_specification>(
db::system_keyspace::NAME,
auth_ks,
virtual_table_name,
::make_shared<column_identifier>(name, true),
ty);

View File

@@ -889,7 +889,7 @@ select_statement::execute_without_checking_exception_message_non_aggregate_unpag
auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
if (needs_post_query_ordering() && _limit) {
return do_with(std::forward<dht::partition_range_vector>(partition_ranges), [this, &qp, &state, &options, cmd, timeout, cas_shard = std::move(cas_shard)](auto& prs) {
throwing_assert(cmd->partition_limit == query::max_partitions);
SCYLLA_ASSERT(cmd->partition_limit == query::max_partitions);
query::result_merger merger(cmd->get_row_limit() * prs.size(), query::max_partitions);
return utils::result_map_reduce(prs.begin(), prs.end(), [this, &qp, &state, &options, cmd, timeout, cas_shard = std::move(cas_shard)] (auto& pr) {
dht::partition_range_vector prange { pr };
@@ -1085,7 +1085,7 @@ view_indexed_table_select_statement::view_indexed_table_select_statement(schema_
, _used_index_restrictions(std::move(used_index_restrictions))
, _view_schema(view_schema)
{
throwing_assert(_view_schema);
SCYLLA_ASSERT(_view_schema);
if (_index.metadata().local()) {
_get_partition_ranges_for_posting_list = [this] (const query_options& options) { return get_partition_ranges_for_local_index_posting_list(options); };
_get_partition_slice_for_posting_list = [this] (const query_options& options) { return get_partition_slice_for_local_index_posting_list(options); };
@@ -1203,7 +1203,7 @@ view_indexed_table_select_statement::actually_do_execute(query_processor& qp,
? source_selector::INTERNAL : source_selector::USER;
++_stats.query_cnt(src_sel, _ks_sel, cond_selector::NO_CONDITIONS, statement_type::SELECT);
throwing_assert(_restrictions->uses_secondary_indexing());
SCYLLA_ASSERT(_restrictions->uses_secondary_indexing());
_stats.unpaged_select_queries(_ks_sel) += options.get_page_size() <= 0;
@@ -2240,8 +2240,8 @@ future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_tab
namespace raw {
static void validate_attrs(const cql3::attributes::raw& attrs) {
throwing_assert(!attrs.timestamp.has_value());
throwing_assert(!attrs.time_to_live.has_value());
SCYLLA_ASSERT(!attrs.timestamp.has_value());
SCYLLA_ASSERT(!attrs.time_to_live.has_value());
}
audit::statement_category select_statement::category() const {
@@ -2406,7 +2406,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
std::visit([&](auto&& ordering) {
using T = std::decay_t<decltype(ordering)>;
if constexpr (!std::is_same_v<T, select_statement::ann_vector>) {
throwing_assert(!for_view);
SCYLLA_ASSERT(!for_view);
verify_ordering_is_allowed(*_parameters, *restrictions);
prepared_orderings_type prepared_orderings = prepare_orderings(*schema);
verify_ordering_is_valid(prepared_orderings, *schema, *restrictions);
@@ -2676,7 +2676,7 @@ select_statement::prepared_orderings_type select_statement::prepare_orderings(co
// Then specifying ascending order would cause the results of this column to be reverse in comparison to a standard select.
static bool are_column_select_results_reversed(const column_definition& column, select_statement::ordering_type column_ordering) {
auto ordering = std::get_if<select_statement::ordering>(&column_ordering);
throwing_assert(ordering);
SCYLLA_ASSERT(ordering);
if (*ordering == select_statement::ordering::ascending) {
return column.type->is_reversed();

Some files were not shown because too many files have changed in this diff Show More