Compare commits

..

1 Commits

Author SHA1 Message Date
Yaniv Kaul
eecc503ba3 Potential fix for code scanning alert no. 171: Workflow does not contain permissions
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-12-22 12:02:11 +02:00
2666 changed files with 29516 additions and 57464 deletions

View File

@@ -55,26 +55,22 @@ ninja build/<mode>/test/boost/<test_name>
ninja build/<mode>/scylla
# Run all tests in a file
./test.py --mode=<mode> test/<suite>/<test_name>.py
./test.py --mode=<mode> <test_path>
# Run a single test case from a file
./test.py --mode=<mode> test/<suite>/<test_name>.py::<test_function_name>
# Run all tests in a directory
./test.py --mode=<mode> test/<suite>/
./test.py --mode=<mode> <test_path>::<test_function_name>
# Examples
./test.py --mode=dev test/alternator/
./test.py --mode=dev test/cluster/test_raft_voters.py::test_raft_limited_voters_retain_coordinator
./test.py --mode=dev test/cqlpy/test_json.py
./test.py --mode=dev alternator/
./test.py --mode=dev cluster/test_raft_voters::test_raft_limited_voters_retain_coordinator
# Optional flags
./test.py --mode=dev test/cluster/test_raft_no_quorum.py -v # Verbose output
./test.py --mode=dev test/cluster/test_raft_no_quorum.py --repeat 5 # Repeat test 5 times
./test.py --mode=dev cluster/test_raft_no_quorum -v # Verbose output
./test.py --mode=dev cluster/test_raft_no_quorum --repeat 5 # Repeat test 5 times
```
**Important:**
- Use full path with `.py` extension (e.g., `test/cluster/test_raft_no_quorum.py`, not `cluster/test_raft_no_quorum`)
- Use path without `.py` extension (e.g., `cluster/test_raft_no_quorum`, not `cluster/test_raft_no_quorum.py`)
- To run a single test case, append `::<test_function_name>` to the file path
- Add `-v` for verbose output
- Add `--repeat <num>` to repeat a test multiple times
@@ -88,14 +84,3 @@ ninja build/<mode>/scylla
- Strive for simplicity and clarity, add complexity only when clearly justified
- Question requests: don't blindly implement requests - evaluate trade-offs, identify issues, and suggest better alternatives when appropriate
- Consider different approaches, weigh pros and cons, and recommend the best fit for the specific context
## Test Philosophy
- Performance matters. Tests should run as quickly as possible. Sleeps in the code are highly discouraged and should be avoided, to reduce run time and flakiness.
- Stability matters. Tests should be stable. New tests should be executed at least 100 times to ensure they pass 100 out of 100 times (use `--repeat 100 --max-failures 1` when running them).
- Unit tests should ideally test one thing and one thing only.
- Tests for bug fixes should be run both before the fix, to show that they fail, and after the fix, to show that they now pass.
- Tests for bug fixes should state in their comments which bug (GitHub or Jira issue) they cover.
- Tests in debug mode are always slower, so, if needed, reduce the number of iterations, rows, amount of data used, cycles, etc. in debug mode.
- Tests should strive to be repeatable, and not use random input that will make their results unpredictable.
- Tests should consume as little resources as possible. Prefer running tests on a single node if it is sufficient, for example.

View File

@@ -1,6 +1,6 @@
version: 2
updates:
- package-ecosystem: "uv"
- package-ecosystem: "pip"
directory: "/docs"
schedule:
interval: "daily"

View File

@@ -8,9 +8,6 @@ on:
jobs:
check-fixes-prefix:
runs-on: ubuntu-latest
permissions:
contents: read
issues: write
steps:
- name: Check PR body for "Fixes" prefix patterns
uses: actions/github-script@v7
@@ -21,7 +18,7 @@ jobs:
// Regular expression pattern to check for "Fixes" prefix
// Adjusted to dynamically insert the repository full name
const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|(?:https://scylladb\\.atlassian\\.net/browse/)?([A-Z]+-\\d+))`;
const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|([A-Z]+-\\d+))`;
const regex = new RegExp(pattern);
if (!regex.test(body)) {

View File

@@ -1,53 +0,0 @@
name: Backport with Jira Integration
on:
push:
branches:
- master
- next-*.*
- branch-*.*
pull_request_target:
types: [labeled, closed]
branches:
- master
- next
- next-*.*
- branch-*.*
jobs:
backport-on-push:
if: github.event_name == 'push'
uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
with:
event_type: 'push'
base_branch: ${{ github.ref }}
commits: ${{ github.event.before }}..${{ github.sha }}
secrets:
gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
backport-on-label:
if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
with:
event_type: 'labeled'
base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
pull_request_number: ${{ github.event.pull_request.number }}
head_commit: ${{ github.event.pull_request.base.sha }}
label_name: ${{ github.event.label.name }}
pr_state: ${{ github.event.pull_request.state }}
secrets:
gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
backport-chain:
if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
with:
event_type: 'chain'
base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
pull_request_number: ${{ github.event.pull_request.number }}
pr_body: ${{ github.event.pull_request.body }}
secrets:
gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -0,0 +1,12 @@
name: Call Jira Status In Progress
on:
pull_request_target:
types: [opened]
jobs:
call-jira-status-in-progress:
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_progress.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -0,0 +1,12 @@
name: Call Jira Status In Review
on:
pull_request_target:
types: [ready_for_review, review_requested]
jobs:
call-jira-status-in-review:
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_review.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -0,0 +1,12 @@
name: Call Jira Status Ready For Merge
on:
pull_request_target:
types: [labeled]
jobs:
call-jira-status-update:
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_ready_for_merge.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,18 +0,0 @@
name: Sync Jira Based on PR Events
on:
pull_request_target:
types: [opened, edited, ready_for_review, review_requested, labeled, unlabeled, closed]
permissions:
contents: read
pull-requests: write
issues: write
jobs:
jira-sync:
uses: scylladb/github-automation/.github/workflows/main_pr_events_jira_sync.yml@main
with:
caller_action: ${{ github.event.action }}
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,22 +0,0 @@
name: Sync Jira Based on PR Milestone Events
on:
pull_request_target:
types: [milestoned, demilestoned]
permissions:
contents: read
pull-requests: read
jobs:
jira-sync-milestone-set:
if: github.event.action == 'milestoned'
uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_set.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
jira-sync-milestone-removed:
if: github.event.action == 'demilestoned'
uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_removed.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,14 +1,16 @@
name: Call Jira release creation for new milestone
permissions: read-all
on:
milestone:
types: [created, closed]
types: [created]
jobs:
sync-milestone-to-jira:
uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
with:
# Comma-separated list of Jira project keys
jira_project_keys: "SCYLLADB,CUSTOMER,SMI,RELENG,VECTOR"
jira_project_keys: "SCYLLADB,CUSTOMER"
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,13 +0,0 @@
name: validate_pr_author_email
on:
pull_request_target:
types:
- opened
- synchronize
- reopened
jobs:
validate_pr_author_email:
uses: scylladb/github-automation/.github/workflows/validate_pr_author_email.yml@main

View File

@@ -1,62 +0,0 @@
name: Close issues created by Scylla associates
on:
issues:
types: [opened, reopened]
permissions:
issues: write
jobs:
comment-and-close:
runs-on: ubuntu-latest
steps:
- name: Comment and close if author email is scylladb.com
uses: actions/github-script@v7
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const issue = context.payload.issue;
const actor = context.actor;
// Get user data (only public email is available)
const { data: user } = await github.rest.users.getByUsername({
username: actor,
});
const email = user.email || "";
console.log(`Actor: ${actor}, public email: ${email || "<none>"}`);
// Only continue if email exists and ends with @scylladb.com
if (!email || !email.toLowerCase().endsWith("@scylladb.com")) {
console.log("User is not a scylladb.com email (or email not public); skipping.");
return;
}
const owner = context.repo.owner;
const repo = context.repo.repo;
const issue_number = issue.number;
const body = "Issues in this repository are closed automatically. Scylla associates should use Jira to manage issues.\nPlease move this issue to Jira https://scylladb.atlassian.net/jira/software/c/projects/SCYLLADB/list";
// Add the comment
await github.rest.issues.createComment({
owner,
repo,
issue_number,
body,
});
console.log(`Comment added to #${issue_number}`);
// Close the issue
await github.rest.issues.update({
owner,
repo,
issue_number,
state: "closed",
state_reason: "not_planned"
});
console.log(`Issue #${issue_number} closed.`);

View File

@@ -13,5 +13,5 @@ jobs:
- uses: codespell-project/actions-codespell@master
with:
only_warn: 1
ignore_words_list: "ans,datas,fo,ser,ue,crate,nd,reenable,strat,stap,te,raison,iif,tread"
ignore_words_list: "ans,datas,fo,ser,ue,crate,nd,reenable,strat,stap,te,raison"
skip: "./.git,./build,./tools,*.js,*.lock,./test,./licenses,./redis/lolwut.cc,*.svg"

View File

@@ -18,10 +18,6 @@ on:
jobs:
release:
permissions:
pages: write
id-token: write
contents: write
runs-on: ubuntu-latest
steps:
- name: Checkout
@@ -33,9 +29,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install uv
uses: astral-sh/setup-uv@v6
python-version: "3.10"
- name: Set up env
run: make -C docs FLAG="${{ env.FLAG }}" setupenv
- name: Build docs

View File

@@ -2,9 +2,6 @@ name: "Docs / Build PR"
# For more information,
# see https://sphinx-theme.scylladb.com/stable/deployment/production.html#available-workflows
permissions:
contents: read
env:
FLAG: ${{ github.repository == 'scylladb/scylla-enterprise' && 'enterprise' || 'opensource' }}
@@ -29,9 +26,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install uv
uses: astral-sh/setup-uv@v6
python-version: "3.10"
- name: Set up env
run: make -C docs FLAG="${{ env.FLAG }}" setupenv
- name: Build docs

View File

@@ -1,8 +1,5 @@
name: Docs / Validate metrics
permissions:
contents: read
on:
pull_request:
branches:

View File

@@ -14,8 +14,7 @@ env:
CLEANER_DIRS: test/unit exceptions alternator api auth cdc compaction db dht gms index lang message mutation mutation_writer node_ops raft redis replica service
SEASTAR_BAD_INCLUDE_OUTPUT_PATH: build/seastar-bad-include.log
permissions:
contents: read
permissions: {}
# cancel the in-progress run upon a repush
concurrency:
@@ -35,6 +34,8 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
- run: |
sudo dnf -y install clang-tools-extra
- name: Generate compilation database
run: |
cmake \

View File

@@ -10,8 +10,6 @@ on:
jobs:
read-toolchain:
runs-on: ubuntu-latest
permissions:
contents: read
outputs:
image: ${{ steps.read.outputs.image }}
steps:

View File

@@ -1,6 +1,4 @@
name: Trigger Scylla CI Route
permissions:
contents: read
on:
issue_comment:
@@ -11,56 +9,16 @@ on:
jobs:
trigger-jenkins:
if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
runs-on: ubuntu-latest
steps:
- name: Verify Org Membership
id: verify_author
env:
EVENT_NAME: ${{ github.event_name }}
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
PR_ASSOCIATION: ${{ github.event.pull_request.author_association }}
COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
COMMENT_ASSOCIATION: ${{ github.event.comment.author_association }}
shell: bash
run: |
if [[ "$EVENT_NAME" == "pull_request_target" ]]; then
AUTHOR="$PR_AUTHOR"
ASSOCIATION="$PR_ASSOCIATION"
else
AUTHOR="$COMMENT_AUTHOR"
ASSOCIATION="$COMMENT_ASSOCIATION"
fi
if [[ "$ASSOCIATION" == "MEMBER" || "$ASSOCIATION" == "OWNER" ]]; then
echo "member=true" >> $GITHUB_OUTPUT
else
echo "::warning::${AUTHOR} is not a member of scylladb (association: ${ASSOCIATION}); skipping CI trigger."
echo "member=false" >> $GITHUB_OUTPUT
fi
- name: Validate Comment Trigger
if: github.event_name == 'issue_comment'
id: verify_comment
env:
COMMENT_BODY: ${{ github.event.comment.body }}
shell: bash
run: |
CLEAN_BODY=$(echo "$COMMENT_BODY" | grep -v '^[[:space:]]*>')
if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
echo "trigger=true" >> $GITHUB_OUTPUT
else
echo "trigger=false" >> $GITHUB_OUTPUT
fi
- name: Trigger Scylla-CI-Route Jenkins Job
if: steps.verify_author.outputs.member == 'true' && (github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true')
env:
JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
JENKINS_URL: "https://jenkins.scylladb.com"
PR_NUMBER: "${{ github.event.issue.number || github.event.pull_request.number }}"
PR_REPO_NAME: "${{ github.event.repository.full_name }}"
run: |
PR_NUMBER=${{ github.event.issue.number }}
PR_REPO_NAME=${{ github.event.repository.full_name }}
curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
--user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail
--user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v

View File

@@ -1,8 +1,5 @@
name: Trigger next gating
permissions:
contents: read
on:
push:
branches:

View File

@@ -300,6 +300,7 @@ add_subdirectory(locator)
add_subdirectory(message)
add_subdirectory(mutation)
add_subdirectory(mutation_writer)
add_subdirectory(node_ops)
add_subdirectory(readers)
add_subdirectory(replica)
add_subdirectory(raft)

View File

@@ -43,7 +43,7 @@ For further information, please see:
[developer documentation]: HACKING.md
[build documentation]: docs/dev/building.md
[docker image build documentation]: dist/docker/redhat/README.md
[docker image build documentation]: dist/docker/debian/README.md
## Running Scylla

View File

@@ -78,7 +78,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=2026.2.0-dev
VERSION=2026.1.0-dev
if test -f version
then

View File

@@ -18,7 +18,6 @@ target_sources(alternator
consumed_capacity.cc
ttl.cc
parsed_expression_cache.cc
http_compression.cc
${cql_grammar_srcs})
target_include_directories(alternator
PUBLIC

View File

@@ -13,8 +13,7 @@
#include <string_view>
#include "alternator/auth.hh"
#include <fmt/format.h>
#include "db/consistency_level_type.hh"
#include "db/system_keyspace.hh"
#include "auth/password_authenticator.hh"
#include "service/storage_proxy.hh"
#include "alternator/executor.hh"
#include "cql3/selection/selection.hh"
@@ -26,8 +25,8 @@ namespace alternator {
static logging::logger alogger("alternator-auth");
future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::string username) {
schema_ptr schema = proxy.data_dictionary().find_schema(db::system_keyspace::NAME, "roles");
future<std::string> get_key_from_roles(service::storage_proxy& proxy, auth::service& as, std::string username) {
schema_ptr schema = proxy.data_dictionary().find_schema(auth::get_auth_ks_name(as.query_processor()), "roles");
partition_key pk = partition_key::from_single_value(*schema, utf8_type->decompose(username));
dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*schema, pk))};
std::vector<query::clustering_range> bounds{query::clustering_range::make_open_ended_both_sides()};
@@ -40,7 +39,7 @@ future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::strin
auto partition_slice = query::partition_slice(std::move(bounds), {}, query::column_id_vector{salted_hash_col->id, can_login_col->id}, selection->get_query_options());
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice,
proxy.get_max_result_size(partition_slice), query::tombstone_limit(proxy.get_tombstone_limit()));
auto cl = db::consistency_level::LOCAL_ONE;
auto cl = auth::password_authenticator::consistency_for_user(username);
service::client_state client_state{service::client_state::internal_tag()};
service::storage_proxy::coordinator_query_result qr = co_await proxy.query(schema, std::move(command), std::move(partition_ranges), cl,

View File

@@ -20,6 +20,6 @@ namespace alternator {
using key_cache = utils::loading_cache<std::string, std::string, 1>;
future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::string username);
future<std::string> get_key_from_roles(service::storage_proxy& proxy, auth::service& as, std::string username);
}

View File

@@ -618,7 +618,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
// Check if the existing values of the item (previous_item) match the
// conditions given by the Expected and ConditionalOperator parameters
// (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
// This function can throw a ValidationException API error if there
// This function can throw an ValidationException API error if there
// are errors in the format of the condition itself.
bool verify_expected(const rjson::value& req, const rjson::value* previous_item) {
const rjson::value* expected = rjson::find(req, "Expected");

View File

@@ -45,7 +45,7 @@ bool consumed_capacity_counter::should_add_capacity(const rjson::value& request)
}
void consumed_capacity_counter::add_consumed_capacity_to_response_if_needed(rjson::value& response) const noexcept {
if (_should_add_to_response) {
if (_should_add_to_reponse) {
auto consumption = rjson::empty_object();
rjson::add(consumption, "CapacityUnits", get_consumed_capacity_units());
rjson::add(response, "ConsumedCapacity", std::move(consumption));

View File

@@ -28,9 +28,9 @@ namespace alternator {
class consumed_capacity_counter {
public:
consumed_capacity_counter() = default;
consumed_capacity_counter(bool should_add_to_response) : _should_add_to_response(should_add_to_response){}
consumed_capacity_counter(bool should_add_to_reponse) : _should_add_to_reponse(should_add_to_reponse){}
bool operator()() const noexcept {
return _should_add_to_response;
return _should_add_to_reponse;
}
consumed_capacity_counter& operator +=(uint64_t bytes);
@@ -44,7 +44,7 @@ public:
uint64_t _total_bytes = 0;
static bool should_add_capacity(const rjson::value& request);
protected:
bool _should_add_to_response = false;
bool _should_add_to_reponse = false;
};
class rcu_consumed_capacity_counter : public consumed_capacity_counter {

View File

@@ -28,7 +28,6 @@ static logging::logger logger("alternator_controller");
controller::controller(
sharded<gms::gossiper>& gossiper,
sharded<service::storage_proxy>& proxy,
sharded<service::storage_service>& ss,
sharded<service::migration_manager>& mm,
sharded<db::system_distributed_keyspace>& sys_dist_ks,
sharded<cdc::generation_service>& cdc_gen_svc,
@@ -40,7 +39,6 @@ controller::controller(
: protocol_server(sg)
, _gossiper(gossiper)
, _proxy(proxy)
, _ss(ss)
, _mm(mm)
, _sys_dist_ks(sys_dist_ks)
, _cdc_gen_svc(cdc_gen_svc)
@@ -91,7 +89,7 @@ future<> controller::start_server() {
auto get_timeout_in_ms = [] (const db::config& cfg) -> utils::updateable_value<uint32_t> {
return cfg.alternator_timeout_in_ms;
};
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_ss), std::ref(_mm), std::ref(_sys_dist_ks),
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks),
sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value(),
sharded_parameter(get_timeout_in_ms, std::ref(_config))).get();
_server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper), std::ref(_auth_service), std::ref(_sl_controller)).get();
@@ -105,23 +103,11 @@ future<> controller::start_server() {
alternator_port = _config.alternator_port();
_listen_addresses.push_back({addr, *alternator_port});
}
std::optional<uint16_t> alternator_port_proxy_protocol;
if (_config.alternator_port_proxy_protocol()) {
alternator_port_proxy_protocol = _config.alternator_port_proxy_protocol();
_listen_addresses.push_back({addr, *alternator_port_proxy_protocol});
}
std::optional<uint16_t> alternator_https_port;
std::optional<uint16_t> alternator_https_port_proxy_protocol;
std::optional<tls::credentials_builder> creds;
if (_config.alternator_https_port() || _config.alternator_https_port_proxy_protocol()) {
if (_config.alternator_https_port()) {
alternator_https_port = _config.alternator_https_port();
_listen_addresses.push_back({addr, *alternator_https_port});
}
if (_config.alternator_https_port_proxy_protocol()) {
alternator_https_port_proxy_protocol = _config.alternator_https_port_proxy_protocol();
_listen_addresses.push_back({addr, *alternator_https_port_proxy_protocol});
}
if (_config.alternator_https_port()) {
alternator_https_port = _config.alternator_https_port();
_listen_addresses.push_back({addr, *alternator_https_port});
creds.emplace();
auto opts = _config.alternator_encryption_options();
if (opts.empty()) {
@@ -147,29 +133,20 @@ future<> controller::start_server() {
}
}
_server.invoke_on_all(
[this, addr, alternator_port, alternator_https_port, alternator_port_proxy_protocol, alternator_https_port_proxy_protocol, creds = std::move(creds)] (server& server) mutable {
return server.init(addr, alternator_port, alternator_https_port, alternator_port_proxy_protocol, alternator_https_port_proxy_protocol, creds,
[this, addr, alternator_port, alternator_https_port, creds = std::move(creds)] (server& server) mutable {
return server.init(addr, alternator_port, alternator_https_port, creds,
_config.alternator_enforce_authorization,
_config.alternator_warn_authorization,
_config.alternator_max_users_query_size_in_trace_output,
&_memory_limiter.local().get_semaphore(),
_config.max_concurrent_requests_per_shard);
}).handle_exception([this, addr, alternator_port, alternator_https_port, alternator_port_proxy_protocol, alternator_https_port_proxy_protocol] (std::exception_ptr ep) {
logger.error("Failed to set up Alternator HTTP server on {} port {}, TLS port {}, proxy-protocol port {}, TLS proxy-protocol port {}: {}",
addr,
alternator_port ? std::to_string(*alternator_port) : "OFF",
alternator_https_port ? std::to_string(*alternator_https_port) : "OFF",
alternator_port_proxy_protocol ? std::to_string(*alternator_port_proxy_protocol) : "OFF",
alternator_https_port_proxy_protocol ? std::to_string(*alternator_https_port_proxy_protocol) : "OFF",
ep);
}).handle_exception([this, addr, alternator_port, alternator_https_port] (std::exception_ptr ep) {
logger.error("Failed to set up Alternator HTTP server on {} port {}, TLS port {}: {}",
addr, alternator_port ? std::to_string(*alternator_port) : "OFF", alternator_https_port ? std::to_string(*alternator_https_port) : "OFF", ep);
return stop_server().then([ep = std::move(ep)] { return make_exception_future<>(ep); });
}).then([addr, alternator_port, alternator_https_port, alternator_port_proxy_protocol, alternator_https_port_proxy_protocol] {
logger.info("Alternator server listening on {}, HTTP port {}, HTTPS port {}, proxy-protocol port {}, TLS proxy-protocol port {}",
addr,
alternator_port ? std::to_string(*alternator_port) : "OFF",
alternator_https_port ? std::to_string(*alternator_https_port) : "OFF",
alternator_port_proxy_protocol ? std::to_string(*alternator_port_proxy_protocol) : "OFF",
alternator_https_port_proxy_protocol ? std::to_string(*alternator_https_port_proxy_protocol) : "OFF");
}).then([addr, alternator_port, alternator_https_port] {
logger.info("Alternator server listening on {}, HTTP port {}, HTTPS port {}",
addr, alternator_port ? std::to_string(*alternator_port) : "OFF", alternator_https_port ? std::to_string(*alternator_https_port) : "OFF");
}).get();
});
}
@@ -192,7 +169,7 @@ future<> controller::request_stop_server() {
});
}
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> controller::get_client_data() {
future<utils::chunked_vector<client_data>> controller::get_client_data() {
return _server.local().get_client_data();
}

View File

@@ -15,7 +15,6 @@
namespace service {
class storage_proxy;
class storage_service;
class migration_manager;
class memory_limiter;
}
@@ -58,7 +57,6 @@ class server;
class controller : public protocol_server {
sharded<gms::gossiper>& _gossiper;
sharded<service::storage_proxy>& _proxy;
sharded<service::storage_service>& _ss;
sharded<service::migration_manager>& _mm;
sharded<db::system_distributed_keyspace>& _sys_dist_ks;
sharded<cdc::generation_service>& _cdc_gen_svc;
@@ -76,7 +74,6 @@ public:
controller(
sharded<gms::gossiper>& gossiper,
sharded<service::storage_proxy>& proxy,
sharded<service::storage_service>& ss,
sharded<service::migration_manager>& mm,
sharded<db::system_distributed_keyspace>& sys_dist_ks,
sharded<cdc::generation_service>& cdc_gen_svc,
@@ -96,7 +93,7 @@ public:
// This virtual function is called (on each shard separately) when the
// virtual table "system.clients" is read. It is expected to generate a
// list of clients connected to this server (on this shard).
virtual future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data() override;
virtual future<utils::chunked_vector<client_data>> get_client_data() override;
};
}

View File

@@ -17,7 +17,6 @@
#include "auth/service.hh"
#include "db/config.hh"
#include "db/view/view_build_status.hh"
#include "locator/tablets.hh"
#include "mutation/tombstone.hh"
#include "locator/abstract_replication_strategy.hh"
#include "utils/log.hh"
@@ -63,20 +62,11 @@
#include "types/types.hh"
#include "db/system_keyspace.hh"
#include "cql3/statements/ks_prop_defs.hh"
#include "alternator/ttl_tag.hh"
using namespace std::chrono_literals;
logging::logger elogger("alternator-executor");
namespace std {

// Hash functor for a pair of sstrings, so that such a pair can be used as
// the key of an unordered container. The two per-string hashes are scaled
// by different small constants before being added, so that swapping the
// two strings generally produces a different hash value.
template <>
struct hash<std::pair<sstring, sstring>> {
    size_t operator()(const std::pair<sstring, sstring>& p) const {
        const std::hash<sstring> string_hasher{};
        const size_t first_hash = string_hasher(p.first);
        const size_t second_hash = string_hasher(p.second);
        return first_hash * 1009 + second_hash * 3;
    }
};

}
namespace alternator {
// Alternator-specific table properties stored as hidden table tags:
@@ -165,7 +155,7 @@ static map_type attrs_type() {
static const column_definition& attrs_column(const schema& schema) {
const column_definition* cdef = schema.get_column_definition(bytes(executor::ATTRS_COLUMN_NAME));
throwing_assert(cdef);
SCYLLA_ASSERT(cdef);
return *cdef;
}
@@ -238,7 +228,7 @@ static void validate_is_object(const rjson::value& value, const char* caller) {
}
// This function assumes the given value is an object and returns requested member value.
// If it is not possible, an api_error::validation is thrown.
// If it is not possible an api_error::validation is thrown.
static const rjson::value& get_member(const rjson::value& obj, const char* member_name, const char* caller) {
validate_is_object(obj, caller);
const rjson::value* ret = rjson::find(obj, member_name);
@@ -250,7 +240,7 @@ static const rjson::value& get_member(const rjson::value& obj, const char* membe
// This function assumes the given value is an object with a single member, and returns this member.
// In case the requirements are not met, an api_error::validation is thrown.
// In case the requirements are not met an api_error::validation is thrown.
static const rjson::value::Member& get_single_member(const rjson::value& v, const char* caller) {
if (!v.IsObject() || v.MemberCount() != 1) {
throw api_error::validation(format("{}: expected an object with a single member.", caller));
@@ -258,66 +248,14 @@ static const rjson::value::Member& get_single_member(const rjson::value& v, cons
return *(v.MemberBegin());
}
// Cache of per-table information used by an executor, keyed by
// (keyspace name, table name). Currently the only cached item is the
// table's size in bytes, stored together with an expiry time.
//
// The object registers itself as a migration listener on construction so
// that it can forget a table's entry when that table is dropped. stop()
// must be awaited before the object is destroyed; otherwise the destructor
// reports a fatal internal error.
class executor::describe_table_info_manager : public service::migration_listener::empty_listener {
    executor &_executor;
    struct table_info {
        utils::simple_value_with_expiry<std::uint64_t> size_in_bytes;
    };
    std::unordered_map<std::pair<sstring, sstring>, table_info> info_for_tables;
    // True from successful listener registration until stop() completes.
    bool active = false;

public:
    describe_table_info_manager(executor& exec) : _executor(exec) {
        _executor._proxy.data_dictionary().real_database_ptr()->get_notifier().register_listener(this);
        active = true;
    }

    // The registered listener is identified by `this`, so the object must
    // be neither copied nor moved.
    describe_table_info_manager(const describe_table_info_manager &) = delete;
    describe_table_info_manager(describe_table_info_manager&&) = delete;
    describe_table_info_manager &operator = (const describe_table_info_manager &) = delete;
    describe_table_info_manager &operator = (describe_table_info_manager&&) = delete;

    ~describe_table_info_manager() {
        if (active) {
            on_fatal_internal_error(elogger, "describe_table_info_manager was not stopped before destruction");
        }
    }

    // Clock reading used as the base for expiry time points.
    static std::chrono::high_resolution_clock::time_point now() {
        return std::chrono::high_resolution_clock::now();
    }

    // Returns whatever the cached size entry for the given table yields,
    // or std::nullopt when the table has no entry at all.
    std::optional<std::uint64_t> get_cached_size_in_bytes(const sstring &ks_name, const sstring &cf_name) const {
        const auto entry = info_for_tables.find({ks_name, cf_name});
        if (entry == info_for_tables.end()) {
            return std::nullopt;
        }
        return entry->second.size_in_bytes.get();
    }

    // Offers a new size for the given table, creating its entry if needed.
    // NOTE(review): judging by its name, set_if_longer_expiry() presumably
    // keeps the existing value when its expiry is later - confirm.
    void cache_size_in_bytes(sstring ks_name, sstring cf_name, std::uint64_t size_in_bytes, std::chrono::high_resolution_clock::time_point expiry) {
        table_info& info = info_for_tables[{std::move(ks_name), std::move(cf_name)}];
        info.size_in_bytes.set_if_longer_expiry(size_in_bytes, expiry);
    }

    // Unregisters the migration listener; only afterwards is it safe to
    // destroy this object.
    future<> stop() {
        co_await _executor._proxy.data_dictionary().real_database_ptr()->get_notifier().unregister_listener(this);
        active = false;
    }

    // Migration-listener hook: drop the cached entry of a dropped table.
    // Keyspaces without the Alternator keyspace prefix are ignored.
    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
        if (ks_name.starts_with(executor::KEYSPACE_NAME_PREFIX)) {
            info_for_tables.erase({ks_name, cf_name});
        }
    }
};
executor::executor(gms::gossiper& gossiper,
service::storage_proxy& proxy,
service::storage_service& ss,
service::migration_manager& mm,
db::system_distributed_keyspace& sdks,
cdc::metadata& cdc_metadata,
smp_service_group ssg,
utils::updateable_value<uint32_t> default_timeout_in_ms)
: _gossiper(gossiper),
_ss(ss),
_proxy(proxy),
_mm(mm),
_sdks(sdks),
@@ -330,7 +268,6 @@ executor::executor(gms::gossiper& gossiper,
_stats))
{
s_default_timeout_in_ms = std::move(default_timeout_in_ms);
_describe_table_info_manager = std::make_unique<describe_table_info_manager>(*this);
register_metrics(_metrics, _stats);
}
@@ -683,7 +620,7 @@ static std::optional<int> get_int_attribute(const rjson::value& value, std::stri
}
// Sets a KeySchema object inside the given JSON parent describing the key
// attributes of the given schema as being either HASH or RANGE keys.
// attributes of the the given schema as being either HASH or RANGE keys.
// Additionally, adds to a given map mappings between the key attribute
// names and their type (as a DynamoDB type string).
void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>* attribute_types, const std::map<sstring, sstring> *tags) {
@@ -815,44 +752,12 @@ static future<bool> is_view_built(
}
// Stores a freshly computed table size in the describe-table cache of every shard.
// The expiry time point is computed once, here, as now() + ttl, so every shard
// receives the same expiry.
future<> executor::cache_newly_calculated_size_on_all_shards(schema_ptr schema, std::uint64_t size_in_bytes, std::chrono::nanoseconds ttl) {
    const auto expiry = describe_table_info_manager::now() + ttl;
    auto store_on_shard = [schema, size_in_bytes, expiry] (executor& exec) {
        exec._describe_table_info_manager->cache_size_in_bytes(schema->ks_name(), schema->cf_name(), size_in_bytes, expiry);
    };
    return container().invoke_on_all(std::move(store_on_shard));
}
// Adds the "TableSizeBytes" field to `table_description`.
// The value is taken from this shard's cache when available. Otherwise, unless the
// table is being deleted, the size is estimated and pushed to the caches of all
// shards. For a table being deleted with no cached value the reported size is 0.
future<> executor::fill_table_size(rjson::value &table_description, schema_ptr schema, bool deleting) {
    std::uint64_t total_size = 0;
    const auto cached_size = _describe_table_info_manager->get_cached_size_in_bytes(schema->ks_name(), schema->cf_name());
    if (cached_size) {
        total_size = *cached_size;
    } else if (!deleting) {
        // Estimating the size of a table that is being deleted is pointless: other nodes
        // will more often than not finish the deletion before the estimate is ready.
        total_size = co_await _ss.estimate_total_sstable_volume(schema->id(), service::storage_service::ignore_errors::yes);
        const auto ttl = std::chrono::seconds{ _proxy.data_dictionary().get_config().alternator_describe_table_info_cache_validity_in_seconds() };
        // A race is possible: the next DescribeTable may land on a shard that has not
        // received the size yet and will compute it again. That is harmless - the second
        // computation is also broadcast with its own expiry, which is extremely unlikely
        // to equal the first one, and every shard keeps the value whose expiry lies
        // further in the future. Should the expiries be equal, shards may hold different
        // sizes and DescribeTable may answer differently depending on the shard, which is
        // acceptable since the specification gives no precision guarantees.
        co_await cache_newly_calculated_size_on_all_shards(schema, total_size, ttl);
    }
    rjson::add(table_description, "TableSizeBytes", total_size);
}
future<rjson::value> executor::fill_table_description(schema_ptr schema, table_status tbl_status, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit)
static future<rjson::value> fill_table_description(schema_ptr schema, table_status tbl_status, service::storage_proxy& proxy, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit)
{
rjson::value table_description = rjson::empty_object();
auto tags_ptr = db::get_tags_of_table(schema);
rjson::add(table_description, "TableName", rjson::from_string(schema->cf_name()));
co_await fill_table_size(table_description, schema, tbl_status == table_status::deleting);
auto creation_timestamp = get_table_creation_time(*schema);
@@ -896,7 +801,9 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", wcu);
rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);
data_dictionary::table t = _proxy.data_dictionary().find_column_family(schema);
data_dictionary::table t = proxy.data_dictionary().find_column_family(schema);
if (tbl_status != table_status::deleting) {
rjson::add(table_description, "CreationDateTime", rjson::value(creation_timestamp));
@@ -917,7 +824,7 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
sstring index_name = cf_name.substr(delim_it + 1);
rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
// Add index's KeySchema and collect types for AttributeDefinitions:
// Add index's KeySchema and collect types for AttributeDefinitions:
executor::describe_key_schema(view_entry, *vptr, key_attribute_types, db::get_tags_of_table(vptr));
// Add projection type
rjson::value projection = rjson::empty_object();
@@ -933,7 +840,7 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
// (for a built view) or CREATING+Backfilling (if view building
// is in progress).
if (!is_lsi) {
if (co_await is_view_built(vptr, _proxy, client_state, trace_state, permit)) {
if (co_await is_view_built(vptr, proxy, client_state, trace_state, permit)) {
rjson::add(view_entry, "IndexStatus", "ACTIVE");
} else {
rjson::add(view_entry, "IndexStatus", "CREATING");
@@ -961,8 +868,9 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
}
rjson::add(table_description, "AttributeDefinitions", std::move(attribute_definitions));
}
executor::supplement_table_stream_info(table_description, *schema, _proxy);
executor::supplement_table_stream_info(table_description, *schema, proxy);
// FIXME: still missing some response fields (issue #5026)
co_return table_description;
}
@@ -982,7 +890,7 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
get_stats_from_schema(_proxy, *schema)->api_operations.describe_table++;
tracing::add_alternator_table_name(trace_state, schema->cf_name());
rjson::value table_description = co_await fill_table_description(schema, table_status::active, client_state, trace_state, permit);
rjson::value table_description = co_await fill_table_description(schema, table_status::active, _proxy, client_state, trace_state, permit);
rjson::value response = rjson::empty_object();
rjson::add(response, "Table", std::move(table_description));
elogger.trace("returning {}", response);
@@ -1085,7 +993,7 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
auto& p = _proxy.container();
schema_ptr schema = get_table(_proxy, request);
rjson::value table_description = co_await fill_table_description(schema, table_status::deleting, client_state, trace_state, permit);
rjson::value table_description = co_await fill_table_description(schema, table_status::deleting, _proxy, client_state, trace_state, permit);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::DROP, _stats);
co_await _mm.container().invoke_on(0, [&, cs = client_state.move_to_other_shard()] (service::migration_manager& mm) -> future<> {
size_t retries = mm.get_concurrent_ddl_retries();
@@ -1649,8 +1557,9 @@ static future<> mark_view_schemas_as_built(utils::chunked_vector<mutation>& out,
}
}
future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode) {
throwing_assert(this_shard_id() == 0);
static future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request,
service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, bool enforce_authorization, bool warn_authorization, stats& stats, const db::tablets_mode_t::mode tablets_mode) {
SCYLLA_ASSERT(this_shard_id() == 0);
// We begin by parsing and validating the content of the CreateTable
// command. We can't inspect the current database schema at this point
@@ -1836,7 +1745,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
if (stream_specification && stream_specification->IsObject()) {
if (executor::add_stream_options(*stream_specification, builder, _proxy)) {
if (executor::add_stream_options(*stream_specification, builder, sp)) {
validate_cdc_log_name_length(builder.cf_name());
}
}
@@ -1855,7 +1764,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
set_table_creation_time(tags_map, db_clock::now());
builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));
co_await verify_create_permission(enforce_authorization, warn_authorization, client_state, _stats);
co_await verify_create_permission(enforce_authorization, warn_authorization, client_state, stats);
schema_ptr schema = builder.build();
for (auto& view_builder : view_builders) {
@@ -1871,49 +1780,38 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
view_builder.with_view_info(schema, include_all_columns, ""/*where clause*/);
}
size_t retries = _mm.get_concurrent_ddl_retries();
size_t retries = mm.get_concurrent_ddl_retries();
for (;;) {
auto group0_guard = co_await _mm.start_group0_operation();
auto group0_guard = co_await mm.start_group0_operation();
auto ts = group0_guard.write_timestamp();
utils::chunked_vector<mutation> schema_mutations;
auto ksm = create_keyspace_metadata(keyspace_name, _proxy, _gossiper, ts, tags_map, _proxy.features(), tablets_mode);
locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
const auto& topo = _proxy.local_db().get_token_metadata().get_topology();
auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
auto ksm = create_keyspace_metadata(keyspace_name, sp, gossiper, ts, tags_map, sp.features(), tablets_mode);
// Alternator Streams doesn't yet work when the table uses tablets (#23838)
if (stream_specification && stream_specification->IsObject()) {
auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
if (stream_enabled && stream_enabled->IsBool() && stream_enabled->GetBool()) {
locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
const auto& topo = sp.local_db().get_token_metadata().get_topology();
auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
if (rs->uses_tablets()) {
co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
"If you want to use streams, create a table with vnodes by setting the tag 'system:initial_tablets' set to 'none'.");
}
}
}
// Creating an index in tablets mode requires the keyspace to be RF-rack-valid.
// GSI and LSI indexes are based on materialized views which require RF-rack-validity to avoid consistency issues.
if (!view_builders.empty() || _proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
try {
locator::assert_rf_rack_valid_keyspace(keyspace_name, _proxy.local_db().get_token_metadata_ptr(), *rs);
} catch (const std::invalid_argument& ex) {
if (!view_builders.empty()) {
co_return api_error::validation(fmt::format("GlobalSecondaryIndexes and LocalSecondaryIndexes on a table "
"using tablets require the number of racks in the cluster to be either 1 or 3"));
} else {
co_return api_error::validation(fmt::format("Cannot create table '{}' with tablets: the configuration "
"option 'rf_rack_valid_keyspaces' is enabled, which enforces that tables using tablets can only be created in clusters "
"that have either 1 or 3 racks", table_name));
}
}
// Creating an index in tablets mode requires the rf_rack_valid_keyspaces option to be enabled.
// GSI and LSI indexes are based on materialized views which require this option to avoid consistency issues.
if (!view_builders.empty() && ksm->uses_tablets() && !sp.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
co_return api_error::validation("GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
}
try {
schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
schema_mutations = service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
} catch (exceptions::already_exists_exception&) {
if (_proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
if (sp.data_dictionary().has_schema(keyspace_name, table_name)) {
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
}
}
if (_proxy.data_dictionary().try_find_table(schema->id())) {
if (sp.data_dictionary().try_find_table(schema->id())) {
// This should never happen, the ID is supposed to be unique
co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
}
@@ -1922,9 +1820,9 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
for (schema_builder& view_builder : view_builders) {
schemas.push_back(view_builder.build());
}
co_await service::prepare_new_column_families_announcement(schema_mutations, _proxy, *ksm, schemas, ts);
co_await service::prepare_new_column_families_announcement(schema_mutations, sp, *ksm, schemas, ts);
if (ksm->uses_tablets()) {
co_await mark_view_schemas_as_built(schema_mutations, schemas, ts, _proxy);
co_await mark_view_schemas_as_built(schema_mutations, schemas, ts, sp);
}
// If a role is allowed to create a table, we must give it permissions to
@@ -1949,7 +1847,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
}
std::tie(schema_mutations, group0_guard) = co_await std::move(mc).extract();
try {
co_await _mm.announce(std::move(schema_mutations), std::move(group0_guard), fmt::format("alternator-executor: create {} table", table_name));
co_await mm.announce(std::move(schema_mutations), std::move(group0_guard), fmt::format("alternator-executor: create {} table", table_name));
break;
} catch (const service::group0_concurrent_modification& ex) {
elogger.info("Failed to execute CreateTable {} due to concurrent schema modifications. {}.",
@@ -1961,9 +1859,9 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
}
}
co_await _mm.wait_for_schema_agreement(_proxy.local_db(), db::timeout_clock::now() + 10s, nullptr);
co_await mm.wait_for_schema_agreement(sp.local_db(), db::timeout_clock::now() + 10s, nullptr);
rjson::value status = rjson::empty_object();
executor::supplement_table_info(request, *schema, _proxy);
executor::supplement_table_info(request, *schema, sp);
rjson::add(status, "TableDescription", std::move(request));
co_return rjson::print(std::move(status));
}
@@ -1972,11 +1870,10 @@ future<executor::request_return_type> executor::create_table(client_state& clien
_stats.api_operations.create_table++;
elogger.trace("Creating table {}", request);
co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
(service::migration_manager& mm) mutable -> future<executor::request_return_type> {
const db::tablets_mode_t::mode tablets_mode = _proxy.data_dictionary().get_config().tablets_mode_for_new_keyspaces(); // type cast
// `invoke_on` hopped us to shard 0, but `this` points to `executor` is from 'old' shard, we need to hop it too.
co_return co_await e.local().create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), enforce_authorization, warn_authorization, std::move(tablets_mode));
co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local(), enforce_authorization, warn_authorization, e.local()._stats, std::move(tablets_mode));
});
}
@@ -2127,12 +2024,9 @@ future<executor::request_return_type> executor::update_table(client_state& clien
co_return api_error::validation(fmt::format(
"LSI {} already exists in table {}, can't use same name for GSI", index_name, table_name));
}
try {
locator::assert_rf_rack_valid_keyspace(keyspace_name, p.local().local_db().get_token_metadata_ptr(),
p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy());
} catch (const std::invalid_argument& ex) {
co_return api_error::validation(fmt::format("GlobalSecondaryIndexes on a table "
"using tablets require the number of racks in the cluster to be either 1 or 3"));
if (p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy().uses_tablets() &&
!p.local().data_dictionary().get_config().rf_rack_valid_keyspaces()) {
co_return api_error::validation("GlobalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
}
elogger.trace("Adding GSI {}", index_name);
@@ -2436,7 +2330,7 @@ std::unordered_map<bytes, std::string> si_key_attributes(data_dictionary::table
// case, this function simply won't be called for this attribute.)
//
// This function checks if the given attribute update is an update to some
// GSI's key, and if the value is unsuitable, an api_error::validation is
// GSI's key, and if the value is unsuitable, an api_error::validation is
// thrown. The checking here is similar to the checking done in
// get_key_from_typed_value() for the base table's key columns.
//
@@ -2838,12 +2732,14 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
}
} else if (_write_isolation != write_isolation::LWT_ALWAYS) {
std::optional<mutation> m = apply(nullptr, api::new_timestamp(), cdc_opts);
throwing_assert(m); // !needs_read_before_write, so apply() did not check a condition
SCYLLA_ASSERT(m); // !needs_read_before_write, so apply() did not check a condition
return proxy.mutate(utils::chunked_vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes, false, std::move(cdc_opts)).then([this, &wcu_total] () mutable {
return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
});
}
throwing_assert(cas_shard);
if (!cas_shard) {
on_internal_error(elogger, "cas_shard is not set");
}
// If we're still here, we need to do this write using LWT:
global_stats.write_using_lwt++;
per_table_stats.write_using_lwt++;
@@ -3463,11 +3359,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
if (should_add_wcu) {
rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
}
auto duration = std::chrono::steady_clock::now() - start_time;
_stats.api_operations.batch_write_item_latency.mark(duration);
for (const auto& w : per_table_wcu) {
w.first->api_operations.batch_write_item_latency.mark(duration);
}
_stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
co_return rjson::print(std::move(ret));
}
@@ -3551,7 +3443,7 @@ static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>
return true;
}
// Add a path to an attribute_path_map. Throws a validation error if the path
// Add a path to an attribute_path_map. Throws a validation error if the path
// "overlaps" with one already in the filter (one is a sub-path of the other)
// or "conflicts" with it (both a member and index is requested).
template<typename T>
@@ -4978,12 +4870,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
if (!some_succeeded && eptr) {
co_await coroutine::return_exception_ptr(std::move(eptr));
}
auto duration = std::chrono::steady_clock::now() - start_time;
_stats.api_operations.batch_get_item_latency.mark(duration);
for (const table_requests& rs : requests) {
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
per_table_stats->api_operations.batch_get_item_latency.mark(duration);
}
_stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
if (is_big(response)) {
co_return make_streamed(std::move(response));
} else {
@@ -5421,7 +5308,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
}
static dht::token token_for_segment(int segment, int total_segments) {
throwing_assert(total_segments > 1 && segment >= 0 && segment < total_segments);
SCYLLA_ASSERT(total_segments > 1 && segment >= 0 && segment < total_segments);
uint64_t delta = std::numeric_limits<uint64_t>::max() / total_segments;
return dht::token::from_int64(std::numeric_limits<int64_t>::min() + delta * segment);
}
@@ -6009,11 +5896,6 @@ future<executor::request_return_type> executor::list_tables(client_state& client
_stats.api_operations.list_tables++;
elogger.trace("Listing tables {}", request);
co_await utils::get_local_injector().inject("alternator_list_tables", [] (auto& handler) -> future<> {
handler.set("waiting", true);
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
});
rjson::value* exclusive_start_json = rjson::find(request, "ExclusiveStartTableName");
rjson::value* limit_json = rjson::find(request, "Limit");
std::string exclusive_start = exclusive_start_json ? rjson::to_string(*exclusive_start_json) : "";
@@ -6205,10 +6087,9 @@ future<> executor::start() {
}
future<> executor::stop() {
co_await _describe_table_info_manager->stop();
// disconnect from the value source, but keep the value unchanged.
s_default_timeout_in_ms = utils::updateable_value<uint32_t>{s_default_timeout_in_ms()};
co_await _parsed_expression_cache->stop();
return _parsed_expression_cache->stop();
}
} // namespace alternator

View File

@@ -17,13 +17,11 @@
#include "service/client_state.hh"
#include "service_permit.hh"
#include "db/timeout_clock.hh"
#include "db/config.hh"
#include "alternator/error.hh"
#include "stats.hh"
#include "utils/rjson.hh"
#include "utils/updateable_value.hh"
#include "utils/simple_value_with_expiry.hh"
#include "tracing/trace_state.hh"
@@ -43,7 +41,6 @@ namespace cql3::selection {
namespace service {
class storage_proxy;
class cas_shard;
class storage_service;
}
namespace cdc {
@@ -60,7 +57,6 @@ class schema_builder;
namespace alternator {
enum class table_status;
class rmw_operation;
class put_or_delete_item;
@@ -140,7 +136,6 @@ class expression_cache;
class executor : public peering_sharded_service<executor> {
gms::gossiper& _gossiper;
service::storage_service& _ss;
service::storage_proxy& _proxy;
service::migration_manager& _mm;
db::system_distributed_keyspace& _sdks;
@@ -153,11 +148,6 @@ class executor : public peering_sharded_service<executor> {
std::unique_ptr<parsed::expression_cache> _parsed_expression_cache;
struct describe_table_info_manager;
std::unique_ptr<describe_table_info_manager> _describe_table_info_manager;
future<> cache_newly_calculated_size_on_all_shards(schema_ptr schema, std::uint64_t size_in_bytes, std::chrono::nanoseconds ttl);
future<> fill_table_size(rjson::value &table_description, schema_ptr schema, bool deleting);
public:
using client_state = service::client_state;
// request_return_type is the return type of the executor methods, which
@@ -183,7 +173,6 @@ public:
executor(gms::gossiper& gossiper,
service::storage_proxy& proxy,
service::storage_service& ss,
service::migration_manager& mm,
db::system_distributed_keyspace& sdks,
cdc::metadata& cdc_metadata,
@@ -231,8 +220,6 @@ private:
friend class rmw_operation;
static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr, const std::map<sstring, sstring> *tags = nullptr);
future<rjson::value> fill_table_description(schema_ptr schema, table_status tbl_status, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit);
future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode);
future<> do_batch_write(
std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders,

View File

@@ -50,7 +50,7 @@ public:
_operators.emplace_back(i);
check_depth_limit();
}
void add_dot(std::string name) {
void add_dot(std::string(name)) {
_operators.emplace_back(std::move(name));
check_depth_limit();
}
@@ -85,7 +85,7 @@ struct constant {
}
};
// "value" is a value used in the right hand side of an assignment
// "value" is a value used in the right hand side of an assignment
// expression, "SET a = ...". It can be a constant (a reference to a value
// included in the request, e.g., ":val"), a path to an attribute from the
// existing item (e.g., "a.b[3].c"), or a function of other such values.
@@ -205,7 +205,7 @@ public:
// The supported primitive conditions are:
// 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
// v1 and v2 are values - from the item (an attribute path), the query
// (a ":val" reference), or a function of the above (only the size()
// (a ":val" reference), or a function of the above (only the size()
// function is supported).
// 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
// 3. N-ary operator - v1 IN ( v2, v3, ... )

View File

@@ -1,301 +0,0 @@
/*
* Copyright 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "alternator/http_compression.hh"
#include "alternator/server.hh"
#include <seastar/coroutine/maybe_yield.hh>
#include <zlib.h>
static logging::logger slogger("alternator-http-compression");
namespace alternator {
static constexpr size_t compressed_buffer_size = 1024;
// Streaming compressor built on zlib's deflate.
//
// Input is fed through compress(); compressed output is accumulated in an
// internal buffer and handed to the `write_func` supplied at construction
// whenever the buffer is nearly full or the stream ends. The stream is
// finished by a compress() call with is_last_chunk == true (or by close()).
class zlib_compressor {
    z_stream _zs;
    // Output buffer currently being filled by deflate(); empty when a fresh
    // one must be allocated. May stay partially filled between compress() calls.
    temporary_buffer<char> _output_buf;
    noncopyable_function<future<>(temporary_buffer<char>&&)> _write_func;
public:
    // gzip: when true, windowBits gets +16 (gzip framing); otherwise plain MAX_WBITS.
    // compression_level: clamped to [Z_NO_COMPRESSION, Z_BEST_COMPRESSION].
    // write_func: receives every finished output buffer.
    // Throws std::bad_alloc if zlib fails to initialise the stream.
    zlib_compressor(bool gzip, int compression_level, noncopyable_function<future<>(temporary_buffer<char>&&)> write_func)
        : _write_func(std::move(write_func)) {
        memset(&_zs, 0, sizeof(_zs));
        if (deflateInit2(&_zs, std::clamp(compression_level, Z_NO_COMPRESSION, Z_BEST_COMPRESSION), Z_DEFLATED,
                (gzip ? 16 : 0) + MAX_WBITS, 8, Z_DEFAULT_STRATEGY) != Z_OK) {
            // Should only happen if memory allocation fails
            throw std::bad_alloc();
        }
    }
    // The z_stream is released in the destructor, so the object must never be
    // duplicated. Spell out what the non-copyable member already implies.
    zlib_compressor(const zlib_compressor&) = delete;
    zlib_compressor& operator=(const zlib_compressor&) = delete;
    ~zlib_compressor() {
        deflateEnd(&_zs);
    }
    // Finishes the stream without supplying any more input.
    future<> close() {
        return compress(nullptr, 0, true);
    }
    // Compresses `len` bytes starting at `buf`. With is_last_chunk == true the
    // stream is finished (Z_FINISH) and all remaining output is flushed to
    // the write function. Throws api_error::internal if deflate() reports an error.
    future<> compress(const char* buf, size_t len, bool is_last_chunk = false) {
        _zs.next_in = reinterpret_cast<unsigned char*>(const_cast<char*>(buf));
        _zs.avail_in = (uInt) len;
        const int mode = is_last_chunk ? Z_FINISH : Z_NO_FLUSH;
        while (_zs.avail_in > 0 || is_last_chunk) {
            co_await coroutine::maybe_yield();
            if (_output_buf.empty()) {
                size_t capacity = compressed_buffer_size;
                if (is_last_chunk) {
                    // For the tail of the stream a smaller buffer may suffice:
                    // pending output plus the bound for the remaining input.
                    unsigned pending = 0;
                    deflatePending(&_zs, &pending, nullptr);
                    const size_t bound = static_cast<size_t>(pending) + deflateBound(&_zs, _zs.avail_in) + 1;
                    capacity = std::min(compressed_buffer_size, bound);
                }
                _output_buf = temporary_buffer<char>(capacity);
                _zs.next_out = reinterpret_cast<unsigned char*>(_output_buf.get_write());
                // Advertise the real capacity of the buffer, which can be smaller
                // than compressed_buffer_size, so zlib can never write past its end.
                _zs.avail_out = static_cast<uInt>(capacity);
            }
            const int e = deflate(&_zs, mode);
            if (e < Z_OK) {
                throw api_error::internal("Error during compression of response body");
            }
            if (e == Z_STREAM_END || _zs.avail_out < compressed_buffer_size / 4) {
                // Bytes produced so far = capacity of the buffer - space still free.
                _output_buf.trim(_output_buf.size() - _zs.avail_out);
                co_await _write_func(std::move(_output_buf));
                if (e == Z_STREAM_END) {
                    break;
                }
            }
        }
    }
};
// Helper string_view functions for parsing Accept-Encoding header
// Equality predicate for two string views that ignores character case.
// Returns true only when both views have the same length and every pair of
// corresponding characters is equal after lower-casing.
struct case_insensitive_cmp_sv {
    bool operator()(std::string_view s1, std::string_view s2) const {
        return std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(),
            [](char a, char b) {
                // tolower() is only defined for values representable as unsigned char
                // (or EOF); a plain char may be negative, so convert first.
                return ::tolower(static_cast<unsigned char>(a)) == ::tolower(static_cast<unsigned char>(b));
            });
    }
};
// Returns `sv` with all leading whitespace characters (per std::isspace) removed.
static inline std::string_view trim_left(std::string_view sv) {
    size_t skip = 0;
    while (skip < sv.size() && std::isspace(static_cast<unsigned char>(sv[skip]))) {
        ++skip;
    }
    sv.remove_prefix(skip);
    return sv;
}
// Returns `sv` with all trailing whitespace characters (per std::isspace) removed.
static inline std::string_view trim_right(std::string_view sv) {
    size_t keep = sv.size();
    while (keep > 0 && std::isspace(static_cast<unsigned char>(sv[keep - 1]))) {
        --keep;
    }
    return sv.substr(0, keep);
}
// Returns `sv` with whitespace removed from both ends.
static inline std::string_view trim(std::string_view sv) {
    const std::string_view without_tail = trim_right(sv);
    return trim_left(without_tail);
}
// Splits `text` at every occurrence of `separator`.
// An empty `text` produces no tokens. Otherwise the result holds one token per
// separator-delimited field, including empty fields (e.g. "a," gives {"a", ""}).
// The tokens are views into `text`.
inline std::vector<std::string_view> split(std::string_view text, char separator) {
    std::vector<std::string_view> pieces;
    if (text.empty()) {
        return pieces;
    }
    for (auto sep = text.find(separator); sep != std::string_view::npos; sep = text.find(separator)) {
        pieces.push_back(text.substr(0, sep));
        text.remove_prefix(sep + 1);
    }
    // Whatever follows the last separator (possibly nothing) is the final token.
    pieces.push_back(text);
    return pieces;
}
// Maps an encoding name to its compression_type by a case-insensitive search of
// compression_names. Returns compression_type::unknown when no name matches.
constexpr response_compressor::compression_type response_compressor::get_compression_type(std::string_view encoding) {
    const size_t known = static_cast<size_t>(compression_type::count);
    size_t idx = 0;
    while (idx < known && !case_insensitive_cmp_sv{}(encoding, compression_names[idx])) {
        ++idx;
    }
    if (idx == known) {
        return compression_type::unknown;
    }
    return static_cast<compression_type>(idx);
}
// Chooses the encoding for a response from the value of the Accept-Encoding
// header (`accept_encoding`) and the size of the response body.
//
// The header is split on ',' into entries and each entry on ';' into the
// encoding name followed by parameters. For every recognised encoding a
// quality value is recorded in ct_q (taken from a "q=" parameter, clamped to
// [0, 1], defaulting to 1). Entries are skipped when the name is unknown, when
// the encoding already has a non-zero quality recorded, or when
// `response_size` is below that encoding's entry in _threshold.
// Returns the selected compression_type; compression_type::none when nothing
// better was accepted.
response_compressor::compression_type response_compressor::find_compression(std::string_view accept_encoding, size_t response_size) {
// Quality value per compression type; no value means the header did not (validly) mention it.
std::optional<float> ct_q[static_cast<size_t>(compression_type::count)];
// numeric_limits<float>::min() is the smallest positive normalized float, so any
// encoding with a larger quality wins over "none", while q=0 does not.
ct_q[static_cast<size_t>(compression_type::none)] = std::numeric_limits<float>::min(); // enabled, but lowest priority
compression_type selected_ct = compression_type::none;
std::vector<std::string_view> entries = split(accept_encoding, ',');
for (auto& e : entries) {
std::vector<std::string_view> params = split(e, ';');
// split() yields no tokens for an empty entry (e.g. two consecutive commas).
if (params.size() == 0) {
continue;
}
compression_type ct = get_compression_type(trim(params[0]));
if (ct == compression_type::unknown) {
continue; // ignore unknown encoding types
}
if (ct_q[static_cast<size_t>(ct)].has_value() && ct_q[static_cast<size_t>(ct)] != 0.0f) {
continue; // already processed this encoding
}
if (response_size < _threshold[static_cast<size_t>(ct)]) {
continue; // below threshold treat as unknown
}
for (size_t i = 1; i < params.size(); ++i) { // find "q=" parameter
auto pos = params[i].find("q=");
if (pos == std::string_view::npos) {
continue;
}
std::string_view param = params[i].substr(pos + 2);
param = trim(param);
// parse quality value
float q_value = 1.0f;
auto [ptr, ec] = std::from_chars(param.data(), param.data() + param.size(), q_value);
// Reject values that fail to parse or that have trailing characters.
if (ec != std::errc() || ptr != param.data() + param.size()) {
continue;
}
if (q_value < 0.0) {
q_value = 0.0;
} else if (q_value > 1.0) {
q_value = 1.0;
}
ct_q[static_cast<size_t>(ct)] = q_value;
break; // we parsed quality value
}
if (!ct_q[static_cast<size_t>(ct)].has_value()) {
ct_q[static_cast<size_t>(ct)] = 1.0f; // default quality value
}
// keep the highest encoding (in the order, unless 'any')
// While the current selection is `any`, an encoding with an equal quality
// replaces it; otherwise only a strictly higher quality replaces the selection.
if (selected_ct == compression_type::any) {
if (ct_q[static_cast<size_t>(ct)] >= ct_q[static_cast<size_t>(selected_ct)]) {
selected_ct = ct;
}
} else {
if (ct_q[static_cast<size_t>(ct)] > ct_q[static_cast<size_t>(selected_ct)]) {
selected_ct = ct;
}
}
}
if (selected_ct == compression_type::any) {
// select any not mentioned or highest quality
// Scans the first compressions_count types: the first one without a recorded
// quality is returned immediately; otherwise the one with the highest quality
// above that of `none` is chosen.
selected_ct = compression_type::none;
for (size_t i = 0; i < static_cast<size_t>(compression_type::compressions_count); ++i) {
if (!ct_q[i].has_value()) {
return static_cast<compression_type>(i);
}
if (ct_q[i] > ct_q[static_cast<size_t>(selected_ct)]) {
selected_ct = static_cast<compression_type>(i);
}
}
}
return selected_ct;
}
// Compresses the whole of `str` in one go and returns the output as a list of buffers.
// Every type other than compression_type::deflate is compressed with gzip framing.
// The compression level comes from cfg.alternator_response_gzip_compression_level().
static future<chunked_content> compress(response_compressor::compression_type ct, const db::config& cfg, std::string str) {
    chunked_content chunks;
    const bool use_gzip = ct != response_compressor::compression_type::deflate;
    // Each finished output buffer is appended to `chunks`.
    zlib_compressor zc(use_gzip, cfg.alternator_response_gzip_compression_level(),
        [&chunks](temporary_buffer<char>&& piece) -> future<> {
            chunks.push_back(std::move(piece));
            return make_ready_future<>();
        });
    co_await zc.compress(str.data(), str.size(), true);
    co_return chunks;
}
// Concatenates all chunks of `cc`, in order, into one contiguous sstring.
static sstring flatten(chunked_content&& cc) {
    // First pass: find the exact size so the result is allocated only once.
    size_t byte_count = 0;
    for (const auto& piece : cc) {
        byte_count += piece.size();
    }
    auto flat = sstring{ sstring::initialized_later{}, byte_count };
    // Second pass: std::copy returns the position just past what it wrote,
    // which is exactly where the next chunk has to start.
    auto write_pos = flat.begin();
    for (const auto& piece : cc) {
        write_pos = std::copy(piece.begin(), piece.end(), write_pos);
    }
    return flat;
}
// Fills `rep` with a fully materialized response body.
// find_compression() picks the encoding from `accept_encoding` and the body
// size. If it picks `none`, the body is stored as is; otherwise a
// Content-Encoding header is added and the body is compressed before being
// stored. The content type is set in both cases.
future<std::unique_ptr<http::reply>> response_compressor::generate_reply(std::unique_ptr<http::reply> rep, sstring accept_encoding, const char* content_type, std::string&& response_body) {
    const response_compressor::compression_type ct = find_compression(accept_encoding, response_body.size());
    if (ct == response_compressor::compression_type::none) {
        // Note that despite the move, there is a copy here -
        // as response_body is std::string and rep->_content is sstring.
        rep->_content = std::move(response_body);
        rep->set_content_type(content_type);
        return make_ready_future<std::unique_ptr<http::reply>>(std::move(rep));
    }
    rep->add_header("Content-Encoding", get_encoding_name(ct));
    rep->set_content_type(content_type);
    // Compression is asynchronous; the reply is handed back once the
    // compressed chunks have been flattened into its content.
    return compress(ct, cfg, std::move(response_body)).then([rep = std::move(rep)] (chunked_content compressed) mutable {
        rep->_content = flatten(std::move(compressed));
        return make_ready_future<std::unique_ptr<http::reply>>(std::move(rep));
    });
}
// A data_sink_impl that passes everything written to it through a Compressor
// and writes whatever the compressor emits to the wrapped output stream _out.
template<typename Compressor>
class compressed_data_sink_impl : public data_sink_impl {
output_stream<char> _out;
Compressor _compressor;
public:
// Takes ownership of `out`. `args` are forwarded to the Compressor
// constructor, followed by a write callback that sends each buffer to _out.
// The callback captures `this`, so the object has to stay at the same
// address for as long as the compressor may invoke it.
template<typename... Args>
compressed_data_sink_impl(output_stream<char>&& out, Args&&... args)
: _out(std::move(out)), _compressor(std::forward<Args>(args)..., [this](temporary_buffer<char>&& buf) {
return _out.write(std::move(buf));
}) { }
// Hands each buffer of `data` to do_put() via data_sink_impl::fallback_put().
future<> put(std::span<temporary_buffer<char>> data) override {
return data_sink_impl::fallback_put(data, [this] (temporary_buffer<char>&& buf) {
return do_put(std::move(buf));
});
}
private:
// Feeds one buffer to the compressor. This is a coroutine taking `buf` by
// value, so the buffer stays alive until the compress() future resolves.
future<> do_put(temporary_buffer<char> buf) {
co_return co_await _compressor.compress(buf.get(), buf.size());
}
// Closes the compressor first and only then the underlying stream.
// NOTE(review): presumably closing the compressor emits its remaining
// output through the write callback, which is why _out must still be
// open at that point - confirm against the Compressor implementation.
future<> close() override {
return _compressor.close().then([this] {
return _out.close();
});
}
};
// Wraps the body writer `bw` so that everything it writes is compressed.
// The returned writer places a compressing data sink in front of the output
// stream it is given and then runs the original writer against it. Only gzip
// and deflate are valid here; any other compression type is reported through
// on_internal_error(). The compression level is read from `cfg` when this
// function is called, not when the returned writer runs.
executor::body_writer compress(response_compressor::compression_type ct, const db::config& cfg, executor::body_writer&& bw) {
    return [bw = std::move(bw), ct, level = cfg.alternator_response_gzip_compression_level()](output_stream<char>&& out) mutable -> future<> {
        using ctype = response_compressor::compression_type;
        const bool is_gzip = ct == ctype::gzip;
        if (!is_gzip && ct != ctype::deflate) {
            if (ct == ctype::none || ct == ctype::any || ct == ctype::unknown) {
                on_internal_error(slogger,"Compression not selected");
            }
            on_internal_error(slogger, "Unsupported compression type for data sink");
        }
        // gzip and deflate share the zlib-based sink and differ only in the
        // flag passed to the compressor.
        std::unique_ptr<data_sink_impl> sink =
            std::make_unique<compressed_data_sink_impl<zlib_compressor>>(std::move(out), is_gzip, level);
        output_stream_options stream_opts;
        stream_opts.trim_to_size = true;
        return bw(output_stream<char>(data_sink(std::move(sink)), compressed_buffer_size, stream_opts));
    };
}
// Fills `rep` with a streamed response body produced by `body_writer`.
// The size of a streamed body is not known up front, so find_compression()
// is asked about the largest possible size. If it picks `none`, the writer is
// installed unchanged; otherwise a Content-Encoding header is added and the
// writer is wrapped in a compressing one.
future<std::unique_ptr<http::reply>> response_compressor::generate_reply(std::unique_ptr<http::reply> rep, sstring accept_encoding, const char* content_type, executor::body_writer&& body_writer) {
    const response_compressor::compression_type ct = find_compression(accept_encoding, std::numeric_limits<size_t>::max());
    if (ct == response_compressor::compression_type::none) {
        rep->write_body(content_type, std::move(body_writer));
        return make_ready_future<std::unique_ptr<http::reply>>(std::move(rep));
    }
    rep->add_header("Content-Encoding", get_encoding_name(ct));
    rep->write_body(content_type, compress(ct, cfg, std::move(body_writer)));
    return make_ready_future<std::unique_ptr<http::reply>>(std::move(rep));
}
} // namespace alternator

View File

@@ -1,91 +0,0 @@
/*
* Copyright 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include "alternator/executor.hh"
#include <seastar/http/httpd.hh>
#include "db/config.hh"
namespace alternator {
// Chooses a response compression from a request's Accept-Encoding value and
// builds http replies with the body compressed accordingly. Per-type
// thresholds are derived from the configuration and recomputed whenever the
// observed config options change.
class response_compressor {
public:
// Values double as indexes into compression_names and _threshold.
// gzip and deflate are the real compressions; compressions_count is their
// number and `any` shares its value. `unknown` shares the value of `count`,
// which is the size of the _threshold array.
enum class compression_type {
gzip,
deflate,
compressions_count,
any = compressions_count,
none,
count,
unknown = count
};
// Encoding names indexed by compression_type: gzip, deflate, any ("*") and
// none ("identity"). There is no entry for `unknown`.
static constexpr std::string_view compression_names[] = {
"gzip",
"deflate",
"*",
"identity"
};
// Returns the encoding name for `ct`. `ct` must not be `unknown`, as that
// value indexes past the end of compression_names.
static sstring get_encoding_name(compression_type ct) {
return sstring(compression_names[static_cast<size_t>(ct)]);
}
static constexpr compression_type get_compression_type(std::string_view encoding);
// Returns the request's Accept-Encoding header, or an empty string when
// get_threshold() is 0.
// NOTE(review): a threshold of 0 appears to mean "response compression is
// disabled" - confirm against the config option's documentation.
sstring get_accepted_encoding(const http::request& req) {
if (get_threshold() == 0) {
return "";
}
return req.get_header("Accept-Encoding");
}
compression_type find_compression(std::string_view accept_encoding, size_t response_size);
// Stores a reference to `cfg`, registers observers on the gzip compression
// level and compression threshold options so the thresholds are recomputed
// when either changes, and computes the initial thresholds.
response_compressor(const db::config& cfg)
: cfg(cfg)
,_gzip_level_observer(
cfg.alternator_response_gzip_compression_level.observe([this](int v) {
update_threshold();
}))
,_gzip_threshold_observer(
cfg.alternator_response_compression_threshold_in_bytes.observe([this](uint32_t v) {
update_threshold();
}))
{
update_threshold();
}
// Copying builds a fresh object from the same config: the copy registers
// its own observers (capturing its own `this`) instead of copying rhs's.
response_compressor(const response_compressor& rhs) : response_compressor(rhs.cfg) {}
private:
const db::config& cfg;
utils::observable<int>::observer _gzip_level_observer;
utils::observable<uint32_t>::observer _gzip_threshold_observer;
// Threshold per compression_type, filled in by update_threshold().
uint32_t _threshold[static_cast<size_t>(compression_type::count)];
// The `any` slot, i.e. the smallest threshold among the real compressions.
size_t get_threshold() { return _threshold[static_cast<size_t>(compression_type::any)]; }
// Recomputes _threshold from the current configuration:
// - none: always the maximum uint32_t value;
// - gzip and deflate: the configured threshold in bytes, or the maximum
//   uint32_t value when the configured compression level is <= 0;
// - any: the minimum over the real compressions (gzip and deflate).
void update_threshold() {
_threshold[static_cast<size_t>(compression_type::none)] = std::numeric_limits<uint32_t>::max();
_threshold[static_cast<size_t>(compression_type::any)] = std::numeric_limits<uint32_t>::max();
uint32_t gzip = cfg.alternator_response_gzip_compression_level() <= 0 ? std::numeric_limits<uint32_t>::max()
: cfg.alternator_response_compression_threshold_in_bytes();
_threshold[static_cast<size_t>(compression_type::gzip)] = gzip;
_threshold[static_cast<size_t>(compression_type::deflate)] = gzip;
for (size_t i = 0; i < static_cast<size_t>(compression_type::compressions_count); ++i) {
if (_threshold[i] < _threshold[static_cast<size_t>(compression_type::any)]) {
_threshold[static_cast<size_t>(compression_type::any)] = _threshold[i];
}
}
}
public:
// Builds a reply from a fully materialized response body.
future<std::unique_ptr<http::reply>> generate_reply(std::unique_ptr<http::reply> rep,
sstring accept_encoding, const char* content_type, std::string&& response_body);
// Builds a reply whose body is produced by a streaming body writer.
future<std::unique_ptr<http::reply>> generate_reply(std::unique_ptr<http::reply> rep,
sstring accept_encoding, const char* content_type, executor::body_writer&& body_writer);
};
}

View File

@@ -55,7 +55,7 @@ partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
clustering_key ck_from_json(const rjson::value& item, schema_ptr schema);
position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema);
// If v encodes a number (i.e., it is a {"N": [...]}), returns an object representing it. Otherwise,
// If v encodes a number (i.e., it is a {"N": [...]}, returns an object representing it. Otherwise,
// raises ValidationException with diagnostic.
big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic);

View File

@@ -34,7 +34,6 @@
#include "client_data.hh"
#include "utils/updateable_value.hh"
#include <zlib.h>
#include "alternator/http_compression.hh"
static logging::logger slogger("alternator-server");
@@ -112,12 +111,9 @@ class api_handler : public handler_base {
// type applies to all replies, both success and error.
static constexpr const char* REPLY_CONTENT_TYPE = "application/x-amz-json-1.0";
public:
api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle,
const db::config& config) : _response_compressor(config), _f_handle(
api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle) : _f_handle(
[this, _handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
sstring accept_encoding = _response_compressor.get_accepted_encoding(*req);
return seastar::futurize_invoke(_handle, std::move(req)).then_wrapped(
[this, rep = std::move(rep), accept_encoding=std::move(accept_encoding)](future<executor::request_return_type> resf) mutable {
return seastar::futurize_invoke(_handle, std::move(req)).then_wrapped([this, rep = std::move(rep)](future<executor::request_return_type> resf) mutable {
if (resf.failed()) {
// Exceptions of type api_error are wrapped as JSON and
// returned to the client as expected. Other types of
@@ -137,20 +133,22 @@ public:
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
}
auto res = resf.get();
return std::visit(overloaded_functor {
std::visit(overloaded_functor {
[&] (std::string&& str) {
return _response_compressor.generate_reply(std::move(rep), std::move(accept_encoding),
REPLY_CONTENT_TYPE, std::move(str));
// Note that despite the move, there is a copy here -
// as str is std::string and rep->_content is sstring.
rep->_content = std::move(str);
rep->set_content_type(REPLY_CONTENT_TYPE);
},
[&] (executor::body_writer&& body_writer) {
return _response_compressor.generate_reply(std::move(rep), std::move(accept_encoding),
REPLY_CONTENT_TYPE, std::move(body_writer));
rep->write_body(REPLY_CONTENT_TYPE, std::move(body_writer));
},
[&] (const api_error& err) {
generate_error_reply(*rep, err);
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
}
}, std::move(res));
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
});
}) { }
@@ -179,7 +177,6 @@ protected:
slogger.trace("api_handler error case: {}", rep._content);
}
response_compressor _response_compressor;
future_handler_function _f_handle;
};
@@ -374,45 +371,18 @@ future<std::string> server::verify_signature(const request& req, const chunked_c
for (const auto& header : signed_headers) {
signed_headers_map.emplace(header, std::string_view());
}
std::vector<std::string> modified_values;
for (auto& header : req._headers) {
std::string header_str;
header_str.resize(header.first.size());
std::transform(header.first.begin(), header.first.end(), header_str.begin(), ::tolower);
auto it = signed_headers_map.find(header_str);
if (it != signed_headers_map.end()) {
// replace multiple spaces in the header value header.second with
// a single space, as required by AWS SigV4 header canonization.
// If we modify the value, we need to save it in modified_values
// to keep it alive.
std::string value;
value.reserve(header.second.size());
bool prev_space = false;
bool modified = false;
for (char ch : header.second) {
if (ch == ' ') {
if (!prev_space) {
value += ch;
prev_space = true;
} else {
modified = true; // skip a space
}
} else {
value += ch;
prev_space = false;
}
}
if (modified) {
modified_values.emplace_back(std::move(value));
it->second = std::string_view(modified_values.back());
} else {
it->second = std::string_view(header.second);
}
it->second = std::string_view(header.second);
}
}
auto cache_getter = [&proxy = _proxy] (std::string username) {
return get_key_from_roles(proxy, std::move(username));
auto cache_getter = [&proxy = _proxy, &as = _auth_service] (std::string username) {
return get_key_from_roles(proxy, as, std::move(username));
};
return _key_cache.get_ptr(user, cache_getter).then_wrapped([this, &req, &content,
user = std::move(user),
@@ -420,7 +390,6 @@ future<std::string> server::verify_signature(const request& req, const chunked_c
datestamp = std::move(datestamp),
signed_headers_str = std::move(signed_headers_str),
signed_headers_map = std::move(signed_headers_map),
modified_values = std::move(modified_values),
region = std::move(region),
service = std::move(service),
user_signature = std::move(user_signature)] (future<key_cache::value_ptr> key_ptr_fut) {
@@ -591,11 +560,11 @@ read_entire_stream(input_stream<char>& inp, size_t length_limit) {
class safe_gzip_zstream {
z_stream _zs;
public:
// If gzip is true, decode a gzip header (for "Content-Encoding: gzip").
// Otherwise, a zlib header (for "Content-Encoding: deflate").
safe_gzip_zstream(bool gzip = true) {
safe_gzip_zstream() {
memset(&_zs, 0, sizeof(_zs));
if (inflateInit2(&_zs, gzip ? 16 + MAX_WBITS : MAX_WBITS) != Z_OK) {
// The strange 16 + MAX_WBITS tells zlib to expect and decode
// a gzip header, not a zlib header.
if (inflateInit2(&_zs, 16 + MAX_WBITS) != Z_OK) {
// Should only happen if memory allocation fails
throw std::bad_alloc();
}
@@ -614,21 +583,19 @@ public:
}
};
// ungzip() takes a chunked_content of a compressed request body, and returns
// the uncompressed content as a chunked_content. If gzip is true, we expect
// gzip header (for "Content-Encoding: gzip"), if gzip is false, we expect a
// zlib header (for "Content-Encoding: deflate").
// ungzip() takes a chunked_content with a gzip-compressed request body,
// uncompresses it, and returns the uncompressed content as a chunked_content.
// If the uncompressed content exceeds length_limit, an error is thrown.
static future<chunked_content>
ungzip(chunked_content&& compressed_body, size_t length_limit, bool gzip = true) {
ungzip(chunked_content&& compressed_body, size_t length_limit) {
chunked_content ret;
// output_buf can be any size - when uncompressing input_buf, it doesn't
// need to fit in a single output_buf, we'll use multiple output_buf for
// a single input_buf if needed.
constexpr size_t OUTPUT_BUF_SIZE = 4096;
temporary_buffer<char> output_buf;
safe_gzip_zstream strm(gzip);
bool complete_stream = false; // empty input is not a valid gzip/deflate
safe_gzip_zstream strm;
bool complete_stream = false; // empty input is not a valid gzip
size_t total_out_bytes = 0;
for (const temporary_buffer<char>& input_buf : compressed_body) {
if (input_buf.empty()) {
@@ -710,7 +677,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
++_executor._stats.requests_blocked_memory;
}
auto units = co_await std::move(units_fut);
throwing_assert(req->content_stream);
SCYLLA_ASSERT(req->content_stream);
chunked_content content = co_await read_entire_stream(*req->content_stream, request_content_length_limit);
// If the request had no Content-Length, we reserved too many units
// so need to return some
@@ -731,8 +698,6 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
sstring content_encoding = req->get_header("Content-Encoding");
if (content_encoding == "gzip") {
content = co_await ungzip(std::move(content), request_content_length_limit);
} else if (content_encoding == "deflate") {
content = co_await ungzip(std::move(content), request_content_length_limit, false);
} else if (!content_encoding.empty()) {
// DynamoDB returns a 500 error for unsupported Content-Encoding.
// I'm not sure if this is the best error code, but let's do it too.
@@ -743,12 +708,8 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
// As long as the system_clients_entry object is alive, this request will
// be visible in the "system.clients" virtual table. When requested, this
// entry will be formatted by server::ongoing_request::make_client_data().
auto user_agent_header = co_await _connection_options_keys_and_values.get_or_load(req->get_header("User-Agent"), [] (const client_options_cache_key_type&) {
return make_ready_future<options_cache_value_type>(options_cache_value_type{});
});
auto system_clients_entry = _ongoing_requests.emplace(
req->get_client_address(), std::move(user_agent_header),
req->get_client_address(), req->get_header("User-Agent"),
username, current_scheduling_group(),
req->get_protocol_name() == "https");
@@ -771,7 +732,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
if (!username.empty()) {
client_state.set_login(auth::authenticated_user(username));
}
client_state.maybe_update_per_service_level_params();
co_await client_state.maybe_update_per_service_level_params();
tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content, _max_users_query_size_in_trace_output.get());
tracing::trace(trace_state, "{}", op);
@@ -793,7 +754,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
void server::set_routes(routes& r) {
api_handler* req_handler = new api_handler([this] (std::unique_ptr<request> req) mutable {
return handle_api_request(std::move(req));
}, _proxy.data_dictionary().get_config());
});
r.put(operation_type::POST, "/", req_handler);
r.put(operation_type::GET, "/", new health_handler(_pending_requests));
@@ -904,9 +865,7 @@ server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gos
} {
}
future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port,
std::optional<uint16_t> port_proxy_protocol, std::optional<uint16_t> https_port_proxy_protocol,
std::optional<tls::credentials_builder> creds,
future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
utils::updateable_value<bool> enforce_authorization, utils::updateable_value<bool> warn_authorization, utils::updateable_value<uint64_t> max_users_query_size_in_trace_output,
semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests) {
_memory_limiter = memory_limiter;
@@ -914,28 +873,20 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
_warn_authorization = std::move(warn_authorization);
_max_concurrent_requests = std::move(max_concurrent_requests);
_max_users_query_size_in_trace_output = std::move(max_users_query_size_in_trace_output);
if (!port && !https_port && !port_proxy_protocol && !https_port_proxy_protocol) {
if (!port && !https_port) {
return make_exception_future<>(std::runtime_error("Either regular port or TLS port"
" must be specified in order to init an alternator HTTP server instance"));
}
return seastar::async([this, addr, port, https_port, port_proxy_protocol, https_port_proxy_protocol, creds] {
return seastar::async([this, addr, port, https_port, creds] {
_executor.start().get();
if (port || port_proxy_protocol) {
if (port) {
set_routes(_http_server._routes);
_http_server.set_content_streaming(true);
if (port) {
_http_server.listen(socket_address{addr, *port}).get();
}
if (port_proxy_protocol) {
listen_options lo;
lo.reuse_address = true;
lo.proxy_protocol = true;
_http_server.listen(socket_address{addr, *port_proxy_protocol}, lo).get();
}
_http_server.listen(socket_address{addr, *port}).get();
_enabled_servers.push_back(std::ref(_http_server));
}
if (https_port || https_port_proxy_protocol) {
if (https_port) {
set_routes(_https_server._routes);
_https_server.set_content_streaming(true);
@@ -955,15 +906,7 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
} else {
_credentials = creds->build_server_credentials();
}
if (https_port) {
_https_server.listen(socket_address{addr, *https_port}, _credentials).get();
}
if (https_port_proxy_protocol) {
listen_options lo;
lo.reuse_address = true;
lo.proxy_protocol = true;
_https_server.listen(socket_address{addr, *https_port_proxy_protocol}, lo, _credentials).get();
}
_https_server.listen(socket_address{addr, *https_port}, _credentials).get();
_enabled_servers.push_back(std::ref(_https_server));
}
});
@@ -1036,15 +979,16 @@ client_data server::ongoing_request::make_client_data() const {
// and keep "driver_version" unset.
cd.driver_name = _user_agent;
// Leave "protocol_version" unset, it has no meaning in Alternator.
// Leave "hostname", "ssl_protocol" and "ssl_cipher_suite" unset for Alternator.
// Note: CQL sets ssl_protocol and ssl_cipher_suite via generic_server::connection base class.
// Leave "hostname", "ssl_protocol" and "ssl_cipher_suite" unset.
// As reported in issue #9216, we never set these fields in CQL
// either (see cql_server::connection::make_client_data()).
return cd;
}
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> server::get_client_data() {
utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>> ret;
future<utils::chunked_vector<client_data>> server::get_client_data() {
utils::chunked_vector<client_data> ret;
co_await _ongoing_requests.for_each_gently([&ret] (const ongoing_request& r) {
ret.emplace_back(make_foreign(std::make_unique<client_data>(r.make_client_data())));
ret.emplace_back(r.make_client_data());
});
co_return ret;
}

View File

@@ -55,7 +55,6 @@ class server : public peering_sharded_service<server> {
// though it isn't really relevant for Alternator which defines its own
// timeouts separately. We can create this object only once.
updateable_timeout_config _timeout_config;
client_options_cache_type _connection_options_keys_and_values;
alternator_callbacks_map _callbacks;
@@ -89,7 +88,7 @@ class server : public peering_sharded_service<server> {
// is called when reading the "system.clients" virtual table.
struct ongoing_request {
socket_address _client_address;
client_options_cache_entry_type _user_agent;
sstring _user_agent;
sstring _username;
scheduling_group _scheduling_group;
bool _is_https;
@@ -100,9 +99,7 @@ class server : public peering_sharded_service<server> {
public:
server(executor& executor, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& service, qos::service_level_controller& sl_controller);
future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port,
std::optional<uint16_t> port_proxy_protocol, std::optional<uint16_t> https_port_proxy_protocol,
std::optional<tls::credentials_builder> creds,
future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
utils::updateable_value<bool> enforce_authorization, utils::updateable_value<bool> warn_authorization, utils::updateable_value<uint64_t> max_users_query_size_in_trace_output,
semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests);
future<> stop();
@@ -110,7 +107,7 @@ public:
// table "system.clients" is read. It is expected to generate a list of
// clients connected to this server (on this shard). This function is
// called by alternator::controller::get_client_data().
future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data();
future<utils::chunked_vector<client_data>> get_client_data();
private:
void set_routes(seastar::httpd::routes& r);
// If verification succeeds, returns the authenticated user's username

View File

@@ -14,6 +14,20 @@
namespace alternator {
const char* ALTERNATOR_METRICS = "alternator";
// Converts a utils::estimated_histogram into a seastar::metrics::histogram.
// The source stores a separate count per bucket, while each output bucket
// receives the running total of all buckets up to and including itself.
// Sample count and sample sum are copied over unchanged.
static seastar::metrics::histogram estimated_histogram_to_metrics(const utils::estimated_histogram& histogram) {
    seastar::metrics::histogram out;
    out.sample_count = histogram._count;
    out.sample_sum = histogram._sample_sum;
    const size_t bucket_total = histogram.bucket_offsets.size();
    out.buckets.resize(bucket_total);
    uint64_t running_count = 0;
    for (size_t idx = 0; idx < bucket_total; ++idx) {
        running_count += histogram.buckets[idx];
        auto& bucket = out.buckets[idx];
        bucket.upper_bound = histogram.bucket_offsets[idx];
        bucket.count = running_count;
    }
    return out;
}
static seastar::metrics::label column_family_label("cf");
static seastar::metrics::label keyspace_label("ks");
@@ -137,21 +151,21 @@ static void register_metrics_with_optional_table(seastar::metrics::metric_groups
seastar::metrics::make_counter("batch_item_count", seastar::metrics::description("The total number of items processed across all batches"), labels,
stats.api_operations.batch_get_item_batch_total)(op("BatchGetItem")).aggregate(aggregate_labels).set_skip_when_empty(),
seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"), labels,
[&stats]{ return to_metrics_histogram(stats.api_operations.batch_get_item_histogram);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_get_item_histogram);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"), labels,
[&stats]{ return to_metrics_histogram(stats.api_operations.batch_write_item_histogram);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_write_item_histogram);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.get_item_op_size_kb);})(op("GetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.get_item_op_size_kb);})(op("GetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.put_item_op_size_kb);})(op("PutItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.put_item_op_size_kb);})(op("PutItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.delete_item_op_size_kb);})(op("DeleteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.delete_item_op_size_kb);})(op("DeleteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.update_item_op_size_kb);})(op("UpdateItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.update_item_op_size_kb);})(op("UpdateItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.batch_get_item_op_size_kb);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.batch_get_item_op_size_kb);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return to_metrics_histogram(stats.operation_sizes.batch_write_item_op_size_kb);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.batch_write_item_op_size_kb);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
});
seastar::metrics::label expression_label("expression");

View File

@@ -16,8 +16,6 @@
#include "cql3/stats.hh"
namespace alternator {
using batch_histogram = utils::estimated_histogram_with_max<128>;
using op_size_histogram = utils::estimated_histogram_with_max<512>;
// Object holding per-shard statistics related to Alternator.
// While this object is alive, these metrics are also registered to be
@@ -78,34 +76,34 @@ public:
utils::timed_rate_moving_average_summary_and_histogram batch_get_item_latency;
utils::timed_rate_moving_average_summary_and_histogram get_records_latency;
batch_histogram batch_get_item_histogram;
batch_histogram batch_write_item_histogram;
utils::estimated_histogram batch_get_item_histogram{22}; // a histogram that covers the range 1 - 100
utils::estimated_histogram batch_write_item_histogram{22}; // a histogram that covers the range 1 - 100
} api_operations;
// Operation size metrics
struct {
// Item size statistics collected per table and aggregated per node.
// Each histogram covers the range 0 - 512. Resolves #25143.
// Each histogram covers the range 0 - 446. Resolves #25143.
// A size is the retrieved item's size.
op_size_histogram get_item_op_size_kb;
utils::estimated_histogram get_item_op_size_kb{30};
// A size is the maximum of the new item's size and the old item's size.
op_size_histogram put_item_op_size_kb;
utils::estimated_histogram put_item_op_size_kb{30};
// A size is the deleted item's size. If the deleted item's size is
// unknown (i.e. read-before-write wasn't necessary and it wasn't
// forced by a configuration option), it won't be recorded on the
// histogram.
op_size_histogram delete_item_op_size_kb;
utils::estimated_histogram delete_item_op_size_kb{30};
// A size is the maximum of existing item's size and the estimated size
// of the update. This will be changed to the maximum of the existing item's
// size and the new item's size in a subsequent PR.
op_size_histogram update_item_op_size_kb;
utils::estimated_histogram update_item_op_size_kb{30};
// A size is the sum of the sizes of all items per table. This means
// that a single BatchGetItem / BatchWriteItem updates the histogram
// for each table that it has items in.
// The sizes are the retrieved items' sizes grouped per table.
op_size_histogram batch_get_item_op_size_kb;
utils::estimated_histogram batch_get_item_op_size_kb{30};
// The sizes are the written items' sizes grouped per table.
op_size_histogram batch_write_item_op_size_kb;
utils::estimated_histogram batch_write_item_op_size_kb{30};
} operation_sizes;
// Count of authentication and authorization failures, counted if either
// alternator_enforce_authorization or alternator_warn_authorization are
@@ -142,7 +140,7 @@ public:
cql3::cql_stats cql_stats;
// Enumeration of expression types only for stats
// if needed it can be extended e.g. per operation
// if needed it can be extended e.g. per operation
enum expression_types {
UPDATE_EXPRESSION,
CONDITION_EXPRESSION,
@@ -166,7 +164,7 @@ struct table_stats {
void register_metrics(seastar::metrics::metric_groups& metrics, const stats& stats);
// Converts a size in bytes to kilobytes (1 KB = 1024 bytes), rounding up:
// 0 bytes is 0 KB, 1..1024 bytes is 1 KB, 1025 bytes is 2 KB, and so on.
inline uint64_t bytes_to_kb_ceil(uint64_t bytes) {
    // Quotient plus "is there a remainder" instead of (bytes + 1023) / 1024,
    // so the result is also correct for values close to UINT64_MAX, where
    // the addition would wrap around.
    return bytes / 1024 + (bytes % 1024 != 0 ? 1 : 0);
}
}

View File

@@ -33,8 +33,6 @@
#include "data_dictionary/data_dictionary.hh"
#include "utils/rjson.hh"
static logging::logger elogger("alternator-streams");
/**
* Base template type to implement rapidjson::internal::TypeHelper<...>:s
* for types that are ostreamable/string constructible/castable.
@@ -430,25 +428,6 @@ using namespace std::chrono_literals;
// Dynamo docs says no data shall live longer than 24h.
static constexpr auto dynamodb_streams_max_window = 24h;
// Returns the stream of the previous generation that is the parent of `child`:
// the first stream in `prev_streams` whose token is not less than the child's
// token. If there is no such stream, the search wraps around (vnodes) and the
// first stream is returned.
// `prev_streams` must be sorted by token. An empty `prev_streams` is reported
// through on_internal_error(); `prev_timestamp` is used only for that message.
const cdc::stream_id& find_parent_shard_in_previous_generation(db_clock::time_point prev_timestamp, const utils::chunked_vector<cdc::stream_id> &prev_streams, const cdc::stream_id &child) {
    if (prev_streams.empty()) {
        // something is really wrong - streams are empty
        // raise an internal error in the hope that it gets noticed and fixed
        on_internal_error(elogger, fmt::format("streams are empty for cdc generation at {} ({})", prev_timestamp, prev_timestamp.time_since_epoch().count()));
    }
    const auto token_before = [](const cdc::stream_id& id, const dht::token& t) {
        return id.token() < t;
    };
    const auto first = prev_streams.begin();
    const auto last = prev_streams.end();
    const auto parent = std::lower_bound(first, last, child.token(), token_before);
    // wrap around case - nothing at or after the child's token, take the first
    return parent != last ? *parent : *first;
}
future<executor::request_return_type> executor::describe_stream(client_state& client_state, service_permit permit, rjson::value request) {
_stats.api_operations.describe_stream++;
@@ -512,7 +491,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
if (!opts.enabled()) {
rjson::add(ret, "StreamDescription", std::move(stream_desc));
co_return rjson::print(std::move(ret));
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
}
// TODO: label
@@ -523,113 +502,123 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
// filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);
std::map<db_clock::time_point, cdc::streams_version> topologies = co_await _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners });
auto e = topologies.end();
auto prev = e;
auto shards = rjson::empty_array();
return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {
std::optional<shard_id> last;
auto e = topologies.end();
auto prev = e;
auto shards = rjson::empty_array();
auto i = topologies.begin();
// if we're a paged query, skip to the generation where we left off.
if (shard_start) {
i = topologies.find(shard_start->time);
}
std::optional<shard_id> last;
// for parent-child stuff we need id:s to be sorted by token
// (see explanation above) since we want to find closest
// token boundary when determining parent.
// #7346 - we processed and searched children/parents in
// stored order, which is not necessarily token order,
// so the finding of "closest" token boundary (using upper bound)
// could give somewhat weird results.
static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
return id1.token() < id2.token();
};
auto i = topologies.begin();
// if we're a paged query, skip to the generation where we left off.
if (shard_start) {
i = topologies.find(shard_start->time);
}
// #7409 - shards must be returned in lexicographical order,
// normal bytes compare is string_traits<int8_t>::compare.
// thus bytes 0x8000 is less than 0x0000. By doing unsigned
// compare instead we inadvertently will sort in string lexical.
static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
};
// need a prev even if we are skipping stuff
if (i != topologies.begin()) {
prev = std::prev(i);
}
for (; limit > 0 && i != e; prev = i, ++i) {
auto& [ts, sv] = *i;
last = std::nullopt;
auto lo = sv.streams.begin();
auto end = sv.streams.end();
// for parent-child stuff we need id:s to be sorted by token
// (see explanation above) since we want to find closest
// token boundary when determining parent.
// #7346 - we processed and searched children/parents in
// stored order, which is not necessarily token order,
// so the finding of "closest" token boundary (using upper bound)
// could give somewhat weird results.
static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
return id1.token() < id2.token();
};
// #7409 - shards must be returned in lexicographical order,
std::sort(lo, end, id_cmp);
// normal bytes compare is string_traits<int8_t>::compare.
// thus bytes 0x8000 is less than 0x0000. By doing unsigned
// compare instead we inadvertently will sort in string lexical.
static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
};
if (shard_start) {
// find next shard position
lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
shard_start = std::nullopt;
// need a prev even if we are skipping stuff
if (i != topologies.begin()) {
prev = std::prev(i);
}
if (lo != end && prev != e) {
// We want older stuff sorted in token order so we can find matching
// token range when determining parent shard.
std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
}
auto expired = [&]() -> std::optional<db_clock::time_point> {
auto j = std::next(i);
if (j == e) {
return std::nullopt;
}
// add this so we sort of match potential
// sequence numbers in get_records result.
return j->first + confidence_interval(db);
}();
while (lo != end) {
auto& id = *lo++;
auto shard = rjson::empty_object();
if (prev != e) {
auto &pid = find_parent_shard_in_previous_generation(prev->first, prev->second.streams, id);
rjson::add(shard, "ParentShardId", shard_id(prev->first, pid));
}
last.emplace(ts, id);
rjson::add(shard, "ShardId", *last);
auto range = rjson::empty_object();
rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
if (expired) {
rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
}
rjson::add(shard, "SequenceNumberRange", std::move(range));
rjson::push_back(shards, std::move(shard));
if (--limit == 0) {
break;
}
for (; limit > 0 && i != e; prev = i, ++i) {
auto& [ts, sv] = *i;
last = std::nullopt;
auto lo = sv.streams.begin();
auto end = sv.streams.end();
// #7409 - shards must be returned in lexicographical order,
std::sort(lo, end, id_cmp);
if (shard_start) {
// find next shard position
lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
shard_start = std::nullopt;
}
if (lo != end && prev != e) {
// We want older stuff sorted in token order so we can find matching
// token range when determining parent shard.
std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
}
auto expired = [&]() -> std::optional<db_clock::time_point> {
auto j = std::next(i);
if (j == e) {
return std::nullopt;
}
// add this so we sort of match potential
// sequence numbers in get_records result.
return j->first + confidence_interval(db);
}();
while (lo != end) {
auto& id = *lo++;
auto shard = rjson::empty_object();
if (prev != e) {
auto& pids = prev->second.streams;
auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
return t < id.token();
});
if (pid != pids.begin()) {
pid = std::prev(pid);
}
if (pid != pids.end()) {
rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
}
}
last.emplace(ts, id);
rjson::add(shard, "ShardId", *last);
auto range = rjson::empty_object();
rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
if (expired) {
rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
}
rjson::add(shard, "SequenceNumberRange", std::move(range));
rjson::push_back(shards, std::move(shard));
if (--limit == 0) {
break;
}
last = std::nullopt;
}
}
}
if (last) {
rjson::add(stream_desc, "LastEvaluatedShardId", *last);
}
if (last) {
rjson::add(stream_desc, "LastEvaluatedShardId", *last);
}
rjson::add(stream_desc, "Shards", std::move(shards));
rjson::add(ret, "StreamDescription", std::move(stream_desc));
co_return rjson::print(std::move(ret));
rjson::add(stream_desc, "Shards", std::move(shards));
rjson::add(ret, "StreamDescription", std::move(stream_desc));
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
});
}
enum class shard_iterator_type {
@@ -909,169 +898,172 @@ future<executor::request_return_type> executor::get_records(client_state& client
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
query::tombstone_limit(_proxy.get_tombstone_limit()), query::row_limit(limit * mul));
service::storage_proxy::coordinator_query_result qr = co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state));
cql3::selection::result_set_builder builder(*selection, gc_clock::now());
query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
[this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {
cql3::selection::result_set_builder builder(*selection, gc_clock::now());
query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
auto result_set = builder.build();
auto records = rjson::empty_array();
auto result_set = builder.build();
auto records = rjson::empty_array();
auto& metadata = result_set->get_metadata();
auto& metadata = result_set->get_metadata();
auto op_index = std::distance(metadata.get_names().begin(),
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
return cdef->name->name() == op_column_name;
})
);
auto ts_index = std::distance(metadata.get_names().begin(),
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
return cdef->name->name() == timestamp_column_name;
})
);
auto eor_index = std::distance(metadata.get_names().begin(),
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
return cdef->name->name() == eor_column_name;
})
);
auto op_index = std::distance(metadata.get_names().begin(),
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
return cdef->name->name() == op_column_name;
})
);
auto ts_index = std::distance(metadata.get_names().begin(),
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
return cdef->name->name() == timestamp_column_name;
})
);
auto eor_index = std::distance(metadata.get_names().begin(),
std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
return cdef->name->name() == eor_column_name;
})
);
std::optional<utils::UUID> timestamp;
auto dynamodb = rjson::empty_object();
auto record = rjson::empty_object();
const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();
std::optional<utils::UUID> timestamp;
auto dynamodb = rjson::empty_object();
auto record = rjson::empty_object();
const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();
using op_utype = std::underlying_type_t<cdc::operation>;
using op_utype = std::underlying_type_t<cdc::operation>;
auto maybe_add_record = [&] {
if (!dynamodb.ObjectEmpty()) {
rjson::add(record, "dynamodb", std::move(dynamodb));
dynamodb = rjson::empty_object();
}
if (!record.ObjectEmpty()) {
rjson::add(record, "awsRegion", rjson::from_string(dc_name));
rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
rjson::add(record, "eventSource", "scylladb:alternator");
rjson::add(record, "eventVersion", "1.1");
rjson::push_back(records, std::move(record));
record = rjson::empty_object();
--limit;
}
};
auto maybe_add_record = [&] {
if (!dynamodb.ObjectEmpty()) {
rjson::add(record, "dynamodb", std::move(dynamodb));
dynamodb = rjson::empty_object();
}
if (!record.ObjectEmpty()) {
rjson::add(record, "awsRegion", rjson::from_string(dc_name));
rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
rjson::add(record, "eventSource", "scylladb:alternator");
rjson::add(record, "eventVersion", "1.1");
rjson::push_back(records, std::move(record));
record = rjson::empty_object();
--limit;
}
};
for (auto& row : result_set->rows()) {
auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;
for (auto& row : result_set->rows()) {
auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;
if (!dynamodb.HasMember("Keys")) {
auto keys = rjson::empty_object();
describe_single_item(*selection, row, key_names, keys);
rjson::add(dynamodb, "Keys", std::move(keys));
rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
rjson::add(dynamodb, "StreamViewType", type);
// TODO: SizeBytes
}
if (!dynamodb.HasMember("Keys")) {
auto keys = rjson::empty_object();
describe_single_item(*selection, row, key_names, keys);
rjson::add(dynamodb, "Keys", std::move(keys));
rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
rjson::add(dynamodb, "StreamViewType", type);
// TODO: SizeBytes
}
/**
* We merge rows with same timestamp into a single event.
* This is pretty much needed, because a CDC row typically
* encodes ~half the info of an alternator write.
*
* A big, big downside to how alternator records are written
* (i.e. CQL), is that the distinction between INSERT and UPDATE
* is somewhat lost/unmappable to actual eventName.
* A write (currently) always looks like an insert+modify
* regardless whether we wrote existing record or not.
*
* Maybe RMW ops could be done slightly differently so
* we can distinguish them here...
*
* For now, all writes will become MODIFY.
*
* Note: we do not check the current pre/post
* flags on CDC log, instead we use data to
* drive what is returned. This is (afaict)
* consistent with dynamo streams
*/
switch (op) {
case cdc::operation::pre_image:
case cdc::operation::post_image:
{
auto item = rjson::empty_object();
describe_single_item(*selection, row, attr_names, item, nullptr, true);
describe_single_item(*selection, row, key_names, item);
rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
break;
}
case cdc::operation::update:
rjson::add(record, "eventName", "MODIFY");
break;
case cdc::operation::insert:
rjson::add(record, "eventName", "INSERT");
break;
case cdc::operation::service_row_delete:
case cdc::operation::service_partition_delete:
{
auto user_identity = rjson::empty_object();
rjson::add(user_identity, "Type", "Service");
rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
rjson::add(record, "userIdentity", std::move(user_identity));
rjson::add(record, "eventName", "REMOVE");
break;
}
default:
rjson::add(record, "eventName", "REMOVE");
break;
}
if (eor) {
maybe_add_record();
timestamp = ts;
if (limit == 0) {
/**
* We merge rows with same timestamp into a single event.
* This is pretty much needed, because a CDC row typically
* encodes ~half the info of an alternator write.
*
* A big, big downside to how alternator records are written
* (i.e. CQL), is that the distinction between INSERT and UPDATE
* is somewhat lost/unmappable to actual eventName.
* A write (currently) always looks like an insert+modify
* regardless whether we wrote existing record or not.
*
* Maybe RMW ops could be done slightly differently so
* we can distinguish them here...
*
* For now, all writes will become MODIFY.
*
* Note: we do not check the current pre/post
* flags on CDC log, instead we use data to
* drive what is returned. This is (afaict)
* consistent with dynamo streams
*/
switch (op) {
case cdc::operation::pre_image:
case cdc::operation::post_image:
{
auto item = rjson::empty_object();
describe_single_item(*selection, row, attr_names, item, nullptr, true);
describe_single_item(*selection, row, key_names, item);
rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
break;
}
case cdc::operation::update:
rjson::add(record, "eventName", "MODIFY");
break;
case cdc::operation::insert:
rjson::add(record, "eventName", "INSERT");
break;
case cdc::operation::service_row_delete:
case cdc::operation::service_partition_delete:
{
auto user_identity = rjson::empty_object();
rjson::add(user_identity, "Type", "Service");
rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
rjson::add(record, "userIdentity", std::move(user_identity));
rjson::add(record, "eventName", "REMOVE");
break;
}
default:
rjson::add(record, "eventName", "REMOVE");
break;
}
if (eor) {
maybe_add_record();
timestamp = ts;
if (limit == 0) {
break;
}
}
}
}
auto ret = rjson::empty_object();
auto nrecords = records.Size();
rjson::add(ret, "Records", std::move(records));
auto ret = rjson::empty_object();
auto nrecords = records.Size();
rjson::add(ret, "Records", std::move(records));
if (nrecords != 0) {
// #9642. Set next iterators threshold to > last
shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
// Note that here we unconditionally return NextShardIterator,
// without checking if maybe we reached the end-of-shard. If the
// shard did end, then the next read will have nrecords == 0 and
// will notice the end of shard and not return NextShardIterator.
rjson::add(ret, "NextShardIterator", next_iter);
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
co_return rjson::print(std::move(ret));
}
if (nrecords != 0) {
// #9642. Set next iterators threshold to > last
shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
// Note that here we unconditionally return NextShardIterator,
// without checking if maybe we reached the end-of-shard. If the
// shard did end, then the next read will have nrecords == 0 and
// will notice the end of shard and not return NextShardIterator.
rjson::add(ret, "NextShardIterator", next_iter);
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
}
// ugh. figure out if we are at end-of-shard
auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
// ugh. figure out if we are at end-of-shard
auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
db_clock::time_point ts = co_await _sdks.cdc_current_generation_timestamp({ normal_token_owners });
auto& shard = iter.shard;
return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
auto& shard = iter.shard;
if (shard.time < ts && ts < high_ts) {
// The DynamoDB documentation states that when a shard is
// closed, reading it until the end has NextShardIterator
// "set to null". Our test test_streams_closed_read
// confirms that by "null" they meant not set at all.
} else {
// We could have returned the same iterator again, but we did
// a search from it until high_ts and found nothing, so we
// can also start the next search from high_ts.
// TODO: but why? It's simpler just to leave the iterator be.
shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
rjson::add(ret, "NextShardIterator", iter);
}
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
if (is_big(ret)) {
co_return make_streamed(std::move(ret));
}
co_return rjson::print(std::move(ret));
if (shard.time < ts && ts < high_ts) {
// The DynamoDB documentation states that when a shard is
// closed, reading it until the end has NextShardIterator
// "set to null". Our test test_streams_closed_read
// confirms that by "null" they meant not set at all.
} else {
// We could have returned the same iterator again, but we did
// a search from it until high_ts and found nothing, so we
// can also start the next search from high_ts.
// TODO: but why? It's simpler just to leave the iterator be.
shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
rjson::add(ret, "NextShardIterator", iter);
}
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
if (is_big(ret)) {
return make_ready_future<executor::request_return_type>(make_streamed(std::move(ret)));
}
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
});
});
}
bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {

View File

@@ -46,7 +46,6 @@
#include "alternator/executor.hh"
#include "alternator/controller.hh"
#include "alternator/serialization.hh"
#include "alternator/ttl_tag.hh"
#include "dht/sharder.hh"
#include "db/config.hh"
#include "db/tags/utils.hh"
@@ -58,10 +57,19 @@ static logging::logger tlogger("alternator_ttl");
namespace alternator {
// We write the expiration-time attribute enabled on a table in a
// tag TTL_TAG_KEY.
// Currently, the *value* of this tag is simply the name of the attribute,
// and the expiration scanner interprets it as an Alternator attribute name -
// It can refer to a real column or if that doesn't exist, to a member of
// the ":attrs" map column. Although this is designed for Alternator, it may
// be good enough for CQL as well (there, the ":attrs" column won't exist).
extern const sstring TTL_TAG_KEY;
future<executor::request_return_type> executor::update_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
_stats.api_operations.update_time_to_live++;
if (!_proxy.features().alternator_ttl) {
co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Upgrade all nodes to a version that supports it.");
co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Experimental support is available if the 'alternator-ttl' experimental feature is enabled on all nodes.");
}
schema_ptr schema = get_table(_proxy, request);
@@ -133,7 +141,7 @@ future<executor::request_return_type> executor::describe_time_to_live(client_sta
// expiration_service is a sharded service responsible for cleaning up expired
// items in all tables with per-item expiration enabled. Currently, this means
// Alternator tables with TTL configured via an UpdateTimeToLive request.
// Alternator tables with TTL configured via a UpdateTimeToLive request.
//
// Here is a brief overview of how the expiration service works:
//
@@ -316,7 +324,9 @@ static future<std::vector<std::pair<dht::token_range, locator::host_id>>> get_se
const auto& tm = *erm->get_token_metadata_ptr();
const auto& sorted_tokens = tm.sorted_tokens();
std::vector<std::pair<dht::token_range, locator::host_id>> ret;
throwing_assert(!sorted_tokens.empty());
if (sorted_tokens.empty()) {
on_internal_error(tlogger, "Token metadata is empty");
}
auto prev_tok = sorted_tokens.back();
for (const auto& tok : sorted_tokens) {
co_await coroutine::maybe_yield();
@@ -553,7 +563,7 @@ static future<> scan_table_ranges(
expiration_service::stats& expiration_stats)
{
const schema_ptr& s = scan_ctx.s;
throwing_assert(partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
SCYLLA_ASSERT (partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
auto p = service::pager::query_pagers::pager(proxy, s, scan_ctx.selection, *scan_ctx.query_state_ptr,
*scan_ctx.query_options, scan_ctx.command, std::move(partition_ranges), nullptr);
while (!p->is_exhausted()) {
@@ -583,7 +593,7 @@ static future<> scan_table_ranges(
if (retries >= 10) {
// Don't get stuck forever asking the same page, maybe there's
// a bug or a real problem in several replicas. Give up on
// this scan and retry the scan from a random position later,
// this scan an retry the scan from a random position later,
// in the next scan period.
throw runtime_exception("scanner thread failed after too many timeouts for the same page");
}
@@ -630,38 +640,13 @@ static future<> scan_table_ranges(
}
} else {
// For a real column to contain an expiration time, it
// must be a numeric type. We currently support decimal
// (used by Alternator TTL) as well as bigint, int and
// timestamp (used by CQL per-row TTL).
switch (meta[*expiration_column]->type->get_kind()) {
case abstract_type::kind::decimal:
// Used by Alternator TTL for key columns not stored
// in the map. The value is in seconds, fractional
// part is ignored.
expired = is_expired(value_cast<big_decimal>(v), now);
break;
case abstract_type::kind::long_kind:
// Used by CQL per-row TTL. The value is in seconds.
expired = is_expired(gc_clock::time_point(std::chrono::seconds(value_cast<int64_t>(v))), now);
break;
case abstract_type::kind::int32:
// Used by CQL per-row TTL. The value is in seconds.
// Using int type is not recommended because it will
// overflow in 2038, but we support it to allow users
// to use existing int columns for expiration.
expired = is_expired(gc_clock::time_point(std::chrono::seconds(value_cast<int32_t>(v))), now);
break;
case abstract_type::kind::timestamp:
// Used by CQL per-row TTL. The value is in milliseconds
// but we truncate it to gc_clock's precision (whole seconds).
expired = is_expired(gc_clock::time_point(std::chrono::duration_cast<gc_clock::duration>(value_cast<db_clock::time_point>(v).time_since_epoch())), now);
break;
default:
// Should never happen - we verified the column's type
// before starting the scan.
[[unlikely]]
on_internal_error(tlogger, format("expiration scanner value of unsupported type {} in column {}", meta[*expiration_column]->type->cql3_type_name(), scan_ctx.column_name) );
}
// must be a numeric type.
// FIXME: Currently we only support decimal_type (which is
// what Alternator uses), but other numeric types can be
// supported as well to make this feature more useful in CQL.
// Note that kind::decimal is also checked above.
big_decimal n = value_cast<big_decimal>(v);
expired = is_expired(n, now);
}
if (expired) {
expiration_stats.items_deleted++;
@@ -723,12 +708,16 @@ static future<bool> scan_table(
co_return false;
}
// attribute_name may be one of the schema's columns (in Alternator, this
// means a key column, in CQL it's a regular column), or an element in
// Alternator's attrs map encoded in Alternator's JSON encoding (which we
// decode). If attribute_name is a real column, in Alternator it will have
// the type decimal, counting seconds since the UNIX epoch, while in CQL
// it will one of the types bigint or int (counting seconds) or timestamp
// (counting milliseconds).
// means it's a key column), or an element in Alternator's attrs map
// encoded in Alternator's JSON encoding.
// FIXME: To make this less Alternators-specific, we should encode in the
// single key's value three things:
// 1. The name of a column
// 2. Optionally if column is a map, a member in the map
// 3. The deserializer for the value: CQL or Alternator (JSON).
// The deserializer can be guessed: If the given column or map item is
// numeric, it can be used directly. If it is a "bytes" type, it needs to
// be deserialized using Alternator's deserializer.
bytes column_name = to_bytes(*attribute_name);
const column_definition *cd = s->get_column_definition(column_name);
std::optional<std::string> member;
@@ -747,14 +736,11 @@ static future<bool> scan_table(
data_type column_type = cd->type;
// Verify that the column has the right type: If "member" exists
// the column must be a map, and if it doesn't, the column must
// be decimal_type (Alternator), bigint, int or timestamp (CQL).
// If the column has the wrong type nothing can get expired in
// this table, and it's pointless to scan it.
// (currently) be a decimal_type. If the column has the wrong type
// nothing can get expired in this table, and it's pointless to
// scan it.
if ((member && column_type->get_kind() != abstract_type::kind::map) ||
(!member && column_type->get_kind() != abstract_type::kind::decimal &&
column_type->get_kind() != abstract_type::kind::long_kind &&
column_type->get_kind() != abstract_type::kind::int32 &&
column_type->get_kind() != abstract_type::kind::timestamp)) {
(!member && column_type->get_kind() != abstract_type::kind::decimal)) {
tlogger.info("table {} TTL column has unsupported type, not scanning", s->cf_name());
co_return false;
}
@@ -781,7 +767,7 @@ static future<bool> scan_table(
// by tasking another node to take over scanning of the dead node's primary
// ranges. What we do here is that this node will also check expiration
// on its *secondary* ranges - but only those whose primary owner is down.
auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
if (!gossiper.is_alive(tablet_primary_replica.host)) {
co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
@@ -892,10 +878,12 @@ future<> expiration_service::run() {
future<> expiration_service::start() {
// Called by main() on each shard to start the expiration-service
// thread. Just runs run() in the background and allows stop().
if (!shutting_down()) {
_end = run().handle_exception([] (std::exception_ptr ep) {
tlogger.error("expiration_service failed: {}", ep);
});
if (_db.features().alternator_ttl) {
if (!shutting_down()) {
_end = run().handle_exception([] (std::exception_ptr ep) {
tlogger.error("expiration_service failed: {}", ep);
});
}
}
return make_ready_future<>();
}

View File

@@ -30,7 +30,7 @@ namespace alternator {
// expiration_service is a sharded service responsible for cleaning up expired
// items in all tables with per-item expiration enabled. Currently, this means
// Alternator tables with TTL configured via an UpdateTimeToLive request.
// Alternator tables with TTL configured via a UpdateTimeToLeave request.
class expiration_service final : public seastar::peering_sharded_service<expiration_service> {
public:
// Object holding per-shard statistics related to the expiration service.
@@ -52,7 +52,7 @@ private:
data_dictionary::database _db;
service::storage_proxy& _proxy;
gms::gossiper& _gossiper;
// _end is set by start(), and resolves when the background service
// _end is set by start(), and resolves when the the background service
// started by it ends. To ask the background service to end, _abort_source
// should be triggered. stop() below uses both _abort_source and _end.
std::optional<future<>> _end;

View File

@@ -1,26 +0,0 @@
/*
* Copyright 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include "seastarx.hh"
#include <seastar/core/sstring.hh>
namespace alternator {
// We use the table tag TTL_TAG_KEY ("system:ttl_attribute") to remember
// which attribute was chosen as the expiration-time attribute for
// Alternator's TTL and CQL's per-row TTL features.
// Currently, the *value* of this tag is simply the name of the attribute:
// It can refer to a real column or if that doesn't exist, to a member of
// the ":attrs" map column (which Alternator uses).
extern const sstring TTL_TAG_KEY;
} // namespace alternator
// let users use TTL_TAG_KEY without the "alternator::" prefix,
// to make it easier to move it to a different namespace later.
using alternator::TTL_TAG_KEY;

View File

@@ -12,7 +12,7 @@
"operations":[
{
"method":"POST",
"summary":"Resets authorized prepared statements cache",
"summary":"Reset cache",
"type":"void",
"nickname":"authorization_cache_reset",
"produces":[

View File

@@ -243,7 +243,7 @@
"GOSSIP_DIGEST_SYN",
"GOSSIP_DIGEST_ACK2",
"GOSSIP_SHUTDOWN",
"UNUSED__DEFINITIONS_UPDATE",
"DEFINITIONS_UPDATE",
"TRUNCATE",
"UNUSED__REPLICATION_FINISHED",
"MIGRATION_REQUEST",

View File

@@ -1295,45 +1295,6 @@
}
]
},
{
"path":"/storage_service/logstor_compaction",
"operations":[
{
"method":"POST",
"summary":"Trigger compaction of the key-value storage",
"type":"void",
"nickname":"logstor_compaction",
"produces":[
"application/json"
],
"parameters":[
{
"name":"major",
"description":"When true, perform a major compaction",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/logstor_flush",
"operations":[
{
"method":"POST",
"summary":"Trigger flush of logstor storage",
"type":"void",
"nickname":"logstor_flush",
"produces":[
"application/json"
],
"parameters":[]
}
]
},
{
"path":"/storage_service/active_repair/",
"operations":[
@@ -3090,7 +3051,7 @@
},
{
"name":"incremental_mode",
"description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to incremental mode.",
"description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to 'disabled' mode.",
"required":false,
"allowMultiple":false,
"type":"string",
@@ -3124,48 +3085,6 @@
}
]
},
{
"path":"/storage_service/tablets/snapshots",
"operations":[
{
"method":"POST",
"summary":"Takes the snapshot for the given keyspaces/tables. A snapshot name must be specified.",
"type":"void",
"nickname":"take_cluster_snapshot",
"produces":[
"application/json"
],
"parameters":[
{
"name":"tag",
"description":"the tag given to the snapshot",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"keyspace",
"description":"Keyspace(s) to snapshot. Multiple keyspaces can be provided using a comma-separated list. If omitted, snapshot all keyspaces.",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"table",
"description":"Table(s) to snapshot. Multiple tables (in a single keyspace) can be provided using a comma-separated list. If omitted, snapshot all tables in the given keyspace(s).",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/quiesce_topology",
"operations":[
@@ -3268,38 +3187,6 @@
}
]
},
{
"path":"/storage_service/logstor_info",
"operations":[
{
"method":"GET",
"summary":"Logstor segment information for one table",
"type":"table_logstor_info",
"nickname":"logstor_info",
"produces":[
"application/json"
],
"parameters":[
{
"name":"keyspace",
"description":"The keyspace",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"table",
"description":"table name",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
},
{
"path":"/storage_service/retrain_dict",
"operations":[
@@ -3708,47 +3595,6 @@
}
}
},
"logstor_hist_bucket":{
"id":"logstor_hist_bucket",
"properties":{
"bucket":{
"type":"long"
},
"count":{
"type":"long"
},
"min_data_size":{
"type":"long"
},
"max_data_size":{
"type":"long"
}
}
},
"table_logstor_info":{
"id":"table_logstor_info",
"description":"Per-table logstor segment distribution",
"properties":{
"keyspace":{
"type":"string"
},
"table":{
"type":"string"
},
"compaction_groups":{
"type":"long"
},
"segments":{
"type":"long"
},
"data_size_histogram":{
"type":"array",
"items":{
"$ref":"logstor_hist_bucket"
}
}
}
},
"tablet_repair_result":{
"id":"tablet_repair_result",
"description":"Tablet repair result",

View File

@@ -209,21 +209,6 @@
"parameters":[]
}
]
},
{
"path":"/system/chosen_sstable_version",
"operations":[
{
"method":"GET",
"summary":"Get sstable version currently chosen for use in new sstables",
"type":"string",
"nickname":"get_chosen_sstable_version",
"produces":[
"application/json"
],
"parameters":[]
}
]
}
]
}

View File

@@ -122,9 +122,9 @@ future<> unset_thrift_controller(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_thrift_controller(ctx, r); });
}
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
return ctx.http_server.set_routes([&ctx, &ss, &ssc, &group0_client] (routes& r) {
set_storage_service(ctx, r, ss, ssc, group0_client);
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
return ctx.http_server.set_routes([&ctx, &ss, &group0_client] (routes& r) {
set_storage_service(ctx, r, ss, group0_client);
});
}

View File

@@ -23,6 +23,31 @@
namespace api {
// Converts a string-to-string map into a vector of T, one element per map
// entry, in the map's iteration order. T must be default-constructible and
// expose assignable `key` and `value` members.
template<class T>
std::vector<T> map_to_key_value(const std::map<sstring, sstring>& map) {
    std::vector<T> res;
    res.reserve(map.size());
    for (const auto& [key, value] : map) {
        // Value-initialize the element directly inside the vector, then fill it.
        auto& entry = res.emplace_back();
        entry.key = key;
        entry.value = value;
    }
    return res;
}
// Appends one T per entry of `map` to `res` and returns `res`.
//
// Keys and values are rendered to text with fmt::to_string(), so MAP may be
// any range of key/value pairs whose members are formattable. T must be
// default-constructible and expose assignable `key` and `value` members.
//
// Each element is constructed in place at the end of `res` rather than being
// built in a local and then copied in, which avoids duplicating both formatted
// strings for every entry. If formatting throws, the element being filled
// stays in `res` partially populated.
template<class T, class MAP>
std::vector<T>& map_to_key_value(const MAP& map, std::vector<T>& res) {
    // Reserve up front so the loop below does not reallocate.
    res.reserve(res.size() + std::size(map));
    for (const auto& [key, value] : map) {
        auto& entry = res.emplace_back();
        entry.key = fmt::to_string(key);
        entry.value = fmt::to_string(value);
    }
    return res;
}
template <typename T, typename S = T>
T map_sum(T&& dest, const S& src) {
for (const auto& i : src) {

View File

@@ -98,7 +98,7 @@ future<> set_server_config(http_context& ctx, db::config& cfg);
future<> unset_server_config(http_context& ctx);
future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snitch);
future<> unset_server_snitch(http_context& ctx);
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>&, service::raft_group0_client&);
future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
future<> unset_server_storage_service(http_context& ctx);
future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr);
future<> unset_server_client_routes(http_context& ctx);

View File

@@ -100,8 +100,9 @@ rest_set_client_routes(http_context& ctx, sharded<service::client_routes_service
rapidjson::Document root;
auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
root.Parse(content.c_str());
const auto route_entries = parse_set_client_array(root);
co_await cr.local().set_client_routes(parse_set_client_array(root));
co_await cr.local().set_client_routes(route_entries);
co_return seastar::json::json_void();
}
@@ -131,7 +132,8 @@ rest_delete_client_routes(http_context& ctx, sharded<service::client_routes_serv
auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
root.Parse(content.c_str());
co_await cr.local().delete_client_routes(parse_delete_client_array(root));
const auto route_keys = parse_delete_client_array(root);
co_await cr.local().delete_client_routes(route_keys);
co_return seastar::json::json_void();
}

View File

@@ -18,9 +18,7 @@
#include "utils/assert.hh"
#include "utils/estimated_histogram.hh"
#include <algorithm>
#include <sstream>
#include "db/data_listeners.hh"
#include "utils/hash.hh"
#include "storage_service.hh"
#include "compaction/compaction_manager.hh"
#include "unimplemented.hh"
@@ -344,56 +342,6 @@ uint64_t accumulate_on_active_memtables(replica::table& t, noncopyable_function<
return ret;
}
// REST handler for the generic toppartitions query.
//
// Query parameters read from the request:
//   table_filters    - comma-separated items, each passed through
//                      parse_fully_qualified_cf_name()
//   keyspace_filters - comma-separated keyspace names
//   duration         - parsed as std::chrono::milliseconds, default 1000ms
//   capacity         - default 256
//   list_size        - default 10
//
// If a filter parameter was supplied but both filter sets ended up empty, an
// empty result (both cardinalities set to 0) is returned without running a
// query. Otherwise a db::toppartitions_query is built from the parameters and
// handed to run_toppartitions_query().
//
// NOTE(review): the tokenizing loops test the stream state before reading, so
// an input ending in ',' appears to yield one extra empty token - confirm
// whether that is intended.
static
future<json::json_return_type>
rest_toppartitions_generic(sharded<replica::database>& db, std::unique_ptr<http::request> req) {
    bool filters_provided = false;

    std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
    if (auto raw_tables = req->get_query_param("table_filters"); !raw_tables.empty()) {
        filters_provided = true;
        std::stringstream in { raw_tables };
        // raw_tables is known to be non-empty here, so only the stream state
        // has to be re-checked on every round.
        for (std::string item; in.good(); ) {
            std::getline(in, item, ',');
            table_filters.emplace(parse_fully_qualified_cf_name(item));
        }
    }

    std::unordered_set<sstring> keyspace_filters {};
    if (auto raw_keyspaces = req->get_query_param("keyspace_filters"); !raw_keyspaces.empty()) {
        filters_provided = true;
        std::stringstream in { raw_keyspaces };
        for (std::string item; in.good(); ) {
            std::getline(in, item, ',');
            keyspace_filters.emplace(std::move(item));
        }
    }

    // when the query is empty return immediately
    const bool no_filters_collected = table_filters.empty() && keyspace_filters.empty();
    if (filters_provided && no_filters_collected) {
        apilog.debug("toppartitions query: processing results");
        cf::toppartitions_query_results results;
        results.read_cardinality = 0;
        results.write_cardinality = 0;
        return make_ready_future<json::json_return_type>(results);
    }

    api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
    api::req_param<unsigned> capacity(*req, "capacity", 256);
    api::req_param<unsigned> list_size(*req, "list_size", 10);

    // An empty filter set means "no restriction" and is logged as "all".
    const std::string table_filter_count = table_filters.empty() ? "all" : std::to_string(table_filters.size());
    const std::string keyspace_filter_count = keyspace_filters.empty() ? "all" : std::to_string(keyspace_filters.size());
    apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
        table_filter_count, keyspace_filter_count, duration.value, list_size.value, capacity.value);

    // do_with keeps the query object alive until the returned future resolves.
    return seastar::do_with(db::toppartitions_query(db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& query) {
        return run_toppartitions_query(query);
    });
}
void set_column_family(http_context& ctx, routes& r, sharded<replica::database>& db) {
cf::get_column_family_name.set(r, [&db] (const_req req){
std::vector<sstring> res;
@@ -1099,10 +1047,6 @@ void set_column_family(http_context& ctx, routes& r, sharded<replica::database>&
});
});
ss::toppartitions_generic.set(r, [&db] (std::unique_ptr<http::request> req) {
return rest_toppartitions_generic(db, std::move(req));
});
cf::force_major_compaction.set(r, [&ctx, &db](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
if (!req->get_query_param("split_output").empty()) {
fail(unimplemented::cause::API);
@@ -1269,7 +1213,6 @@ void unset_column_family(http_context& ctx, routes& r) {
cf::get_sstable_count_per_level.unset(r);
cf::get_sstables_for_key.unset(r);
cf::toppartitions.unset(r);
ss::toppartitions_generic.unset(r);
cf::force_major_compaction.unset(r);
ss::get_load.unset(r);
ss::get_metrics_load.unset(r);

View File

@@ -17,7 +17,9 @@
#include "gms/feature_service.hh"
#include "schema/schema_builder.hh"
#include "sstables/sstables_manager.hh"
#include "utils/hash.hh"
#include <optional>
#include <sstream>
#include <stdexcept>
#include <time.h>
#include <algorithm>
@@ -513,15 +515,6 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
auto sstables = parsed.GetArray() |
std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
std::ranges::to<std::vector>();
apilog.info("Restore invoked with following parameters: keyspace={}, table={}, endpoint={}, bucket={}, prefix={}, sstables_count={}, scope={}, primary_replica_only={}",
keyspace,
table,
endpoint,
bucket,
prefix,
sstables.size(),
scope,
primary_replica_only);
auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
co_return json::json_return_type(fmt::to_string(task_id));
});
@@ -534,15 +527,13 @@ void unset_sstables_loader(http_context& ctx, routes& r) {
}
void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g) {
ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) {
auto keyspace = validate_keyspace(ctx, req);
auto view = req->get_path_param("view");
co_return json::json_return_type(stream_range_as_array(co_await vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()), [] (const auto& i) {
storage_service_json::mapper res;
res.key = i.first;
res.value = i.second;
return res;
}));
return vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()).then([] (std::unordered_map<sstring, sstring> status) {
std::vector<storage_service_json::mapper> res;
return make_ready_future<json::json_return_type>(map_to_key_value(std::move(status), res));
});
});
cf::get_built_indexes.set(r, [&vb](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
@@ -556,13 +547,17 @@ void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_build
vp.insert(b.second);
}
}
std::vector<sstring> res;
replica::database& db = vb.local().get_db();
auto uuid = validate_table(db, ks, cf_name);
replica::column_family& cf = db.find_column_family(uuid);
co_return cf.get_index_manager().list_indexes()
| std::views::transform([] (const auto& i) { return i.metadata().name(); })
| std::views::filter([&vp] (const auto& n) { return vp.contains(secondary_index::index_table_name(n)); })
| std::ranges::to<std::vector>();
res.reserve(cf.get_index_manager().list_indexes().size());
for (auto&& i : cf.get_index_manager().list_indexes()) {
if (vp.contains(secondary_index::index_table_name(i.metadata().name()))) {
res.emplace_back(i.metadata().name());
}
}
co_return res;
});
}
@@ -580,16 +575,6 @@ static future<json::json_return_type> describe_ring_as_json_for_table(const shar
co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring_for_table(keyspace, table), token_range_endpoints_to_json));
}
namespace {
// Turns one (key, value) pair into a storage_service_json::mapper, rendering
// both members to text with fmt::to_string().
template <typename Key, typename Value>
storage_service_json::mapper map_to_json(const std::pair<Key, Value>& i) {
    const auto& [key, value] = i;
    storage_service_json::mapper entry;
    entry.key = fmt::to_string(key);
    entry.value = fmt::to_string(value);
    return entry;
}
}
static
future<json::json_return_type>
rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -607,7 +592,62 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
throw bad_param_exception("Either provide both keyspace and table (for tablet table) or neither (for vnodes)");
}
co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
co_return json::json_return_type(stream_range_as_array(token_endpoints, [](const auto& i) {
storage_service_json::mapper val;
val.key = fmt::to_string(i.first);
val.value = fmt::to_string(i.second);
return val;
}));
}
// REST handler for the generic toppartitions query, using ctx.db as the
// database to query.
//
// Query parameters read from the request:
//   table_filters    - comma-separated items, each passed through
//                      parse_fully_qualified_cf_name()
//   keyspace_filters - comma-separated keyspace names
//   duration         - parsed as std::chrono::milliseconds, default 1000ms
//   capacity         - default 256
//   list_size        - default 10
//
// If a filter parameter was supplied but both filter sets ended up empty, an
// empty result (both cardinalities set to 0) is returned without running a
// query. Otherwise a db::toppartitions_query is built from the parameters and
// handed to run_toppartitions_query().
//
// NOTE(review): the tokenizing loops test the stream state before reading, so
// an input ending in ',' appears to yield one extra empty token - confirm
// whether that is intended.
static
future<json::json_return_type>
rest_toppartitions_generic(http_context& ctx, std::unique_ptr<http::request> req) {
    bool filters_provided = false;

    std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
    if (auto table_param = req->get_query_param("table_filters"); !table_param.empty()) {
        filters_provided = true;
        std::stringstream tokens { table_param };
        // table_param is known to be non-empty here, so only the stream state
        // has to be re-checked on every round.
        for (std::string token; tokens.good(); ) {
            std::getline(tokens, token, ',');
            table_filters.emplace(parse_fully_qualified_cf_name(token));
        }
    }

    std::unordered_set<sstring> keyspace_filters {};
    if (auto keyspace_param = req->get_query_param("keyspace_filters"); !keyspace_param.empty()) {
        filters_provided = true;
        std::stringstream tokens { keyspace_param };
        for (std::string token; tokens.good(); ) {
            std::getline(tokens, token, ',');
            keyspace_filters.emplace(std::move(token));
        }
    }

    // when the query is empty return immediately
    const bool nothing_selected = table_filters.empty() && keyspace_filters.empty();
    if (filters_provided && nothing_selected) {
        apilog.debug("toppartitions query: processing results");
        httpd::column_family_json::toppartitions_query_results results;
        results.read_cardinality = 0;
        results.write_cardinality = 0;
        return make_ready_future<json::json_return_type>(results);
    }

    api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
    api::req_param<unsigned> capacity(*req, "capacity", 256);
    api::req_param<unsigned> list_size(*req, "list_size", 10);

    // An empty filter set means "no restriction" and is logged as "all".
    const std::string tables_logged = table_filters.empty() ? "all" : std::to_string(table_filters.size());
    const std::string keyspaces_logged = keyspace_filters.empty() ? "all" : std::to_string(keyspace_filters.size());
    apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
        tables_logged, keyspaces_logged, duration.value, list_size.value, capacity.value);

    // do_with keeps the query object alive until the returned future resolves.
    return seastar::do_with(db::toppartitions_query(ctx.db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& query) {
        return run_toppartitions_query(query);
    });
}
static
@@ -641,6 +681,7 @@ rest_get_range_to_endpoint_map(http_context& ctx, sharded<service::storage_servi
table_id = validate_table(ctx.db.local(), keyspace, table);
}
std::vector<ss::maplist_mapper> res;
co_return stream_range_as_array(co_await ss.local().get_range_to_address_map(keyspace, table_id),
[](const std::pair<dht::token_range, inet_address_vector_replica_set>& entry){
ss::maplist_mapper m;
@@ -731,13 +772,17 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::
apilog.info("cleanup_all global={}", global);
if (global) {
co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
co_return co_await ss.do_clusterwide_vnodes_cleanup();
});
auto done = !global ? false : co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
if (!ss.is_topology_coordinator_enabled()) {
co_return false;
}
co_await ss.do_clusterwide_vnodes_cleanup();
co_return true;
});
if (done) {
co_return json::json_return_type(0);
}
// fall back to the local cleanup if local cleanup is requested
// fall back to the local cleanup if topology coordinator is not enabled or local cleanup is requested
auto& db = ctx.db;
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::global_cleanup_compaction_task_impl>({}, db);
@@ -745,7 +790,9 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::
// Mark this node as clean
co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
co_await ss.reset_cleanup_needed();
if (ss.is_topology_coordinator_enabled()) {
co_await ss.reset_cleanup_needed();
}
});
co_return json::json_return_type(0);
@@ -756,6 +803,9 @@ future<json::json_return_type>
rest_reset_cleanup_needed(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
apilog.info("reset_cleanup_needed");
co_await ss.invoke_on(0, [] (service::storage_service& ss) {
if (!ss.is_topology_coordinator_enabled()) {
throw std::runtime_error("mark_node_as_clean is only supported when topology over raft is enabled");
}
return ss.reset_cleanup_needed();
});
co_return json_void();
@@ -783,31 +833,9 @@ rest_force_keyspace_flush(http_context& ctx, std::unique_ptr<http::request> req)
// REST handler that triggers logstor compaction on all shards.
//
// The optional "major" query parameter is parsed with validate_bool(); when it
// is absent or empty, `major` is false. Replies with json_void() once
// replica::database::trigger_logstor_compaction_on_all_shards() completes.
static
future<json::json_return_type>
rest_logstor_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
    auto major_param = req->get_query_param("major");
    // validate_bool() is only consulted when a value was actually supplied.
    const bool major = !major_param.empty() && validate_bool(major_param);
    apilog.info("logstor_compaction: major={}", major);
    co_await replica::database::trigger_logstor_compaction_on_all_shards(ctx.db, major);
    co_return json_void();
}
// REST handler for the logstor flush request: logs the call, awaits
// replica::database::flush_logstor_separator_on_all_shards() on ctx.db and
// then replies with json_void(). The request carries no parameters that are
// read here.
static
future<json::json_return_type>
rest_logstor_flush(http_context& ctx, std::unique_ptr<http::request> req) {
    apilog.info("logstor_flush");
    co_await replica::database::flush_logstor_separator_on_all_shards(ctx.db);
    co_return json_void();
}
static
future<json::json_return_type>
rest_decommission(sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, std::unique_ptr<http::request> req) {
rest_decommission(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
apilog.info("decommission");
return ss.local().decommission(ssc).then([] {
return ss.local().decommission().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
}
@@ -1284,7 +1312,10 @@ rest_get_ownership(http_context& ctx, sharded<service::storage_service>& ss, std
throw httpd::bad_param_exception("storage_service/ownership cannot be used when a keyspace uses tablets");
}
co_return json::json_return_type(stream_range_as_array(co_await ss.local().get_ownership(), &map_to_json<gms::inet_address, float>));
return ss.local().get_ownership().then([] (auto&& ownership) {
std::vector<storage_service_json::mapper> res;
return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
});
}
static
@@ -1301,7 +1332,10 @@ rest_get_effective_ownership(http_context& ctx, sharded<service::storage_service
}
}
co_return json::json_return_type(stream_range_as_array(co_await ss.local().effective_ownership(keyspace_name, table_name), &map_to_json<gms::inet_address, float>));
return ss.local().effective_ownership(keyspace_name, table_name).then([] (auto&& ownership) {
std::vector<storage_service_json::mapper> res;
return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
});
}
static
@@ -1311,7 +1345,7 @@ rest_estimate_compression_ratios(http_context& ctx, sharded<service::storage_ser
apilog.warn("estimate_compression_ratios: called before the cluster feature was enabled");
throw std::runtime_error("estimate_compression_ratios requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
}
auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
auto cf = api::req_param<sstring>(*req, "cf", {}).value;
apilog.debug("estimate_compression_ratios: called with ks={} cf={}", ks, cf);
@@ -1377,7 +1411,7 @@ rest_retrain_dict(http_context& ctx, sharded<service::storage_service>& ss, serv
apilog.warn("retrain_dict: called before the cluster feature was enabled");
throw std::runtime_error("retrain_dict requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
}
auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
auto cf = api::req_param<sstring>(*req, "cf", {}).value;
apilog.debug("retrain_dict: called with ks={} cf={}", ks, cf);
@@ -1523,54 +1557,6 @@ rest_sstable_info(http_context& ctx, std::unique_ptr<http::request> req) {
});
}
// REST handler returning logstor segment statistics for a single table.
//
// Query parameters: "keyspace" and "table" are both required; when "table" is
// empty the "cf" parameter is used in its place. Throws bad_param_exception if
// either name is still empty, or if the table does not use logstor. The
// keyspace and table are additionally checked by validate_keyspace() and
// validate_table().
//
// The per-shard statistics are gathered with map_reduce over ctx.db, summed
// into a single table_segment_stats, and returned as an
// ss::table_logstor_info via stream_object().
static
future<json::json_return_type>
rest_logstor_info(http_context& ctx, std::unique_ptr<http::request> req) {
auto keyspace = api::req_param<sstring>(*req, "keyspace", {}).value;
auto table = api::req_param<sstring>(*req, "table", {}).value;
// Fall back to the "cf" parameter when "table" was not given.
if (table.empty()) {
table = api::req_param<sstring>(*req, "cf", {}).value;
}
if (keyspace.empty()) {
throw bad_param_exception("The query parameter 'keyspace' is required");
}
if (table.empty()) {
throw bad_param_exception("The query parameter 'table' is required");
}
keyspace = validate_keyspace(ctx, keyspace);
auto tid = validate_table(ctx.db.local(), keyspace, table);
auto& cf = ctx.db.local().find_column_family(tid);
if (!cf.uses_logstor()) {
throw bad_param_exception(fmt::format("Table {}.{} does not use logstor", keyspace, table));
}
// do_with owns the accumulator for the duration of the asynchronous chain, so
// the inner lambdas can safely hold a reference to it.
return do_with(replica::logstor::table_segment_stats{}, [keyspace = std::move(keyspace), table = std::move(table), tid, &ctx] (replica::logstor::table_segment_stats& merged_stats) {
// First lambda: folds one shard's stats into the accumulator.
// Second lambda: produces the stats of table `tid` on one shard.
return ctx.db.map_reduce([&merged_stats](replica::logstor::table_segment_stats&& shard_stats) {
merged_stats += shard_stats;
}, [tid](const replica::database& db) {
return db.get_logstor_table_segment_stats(tid);
}).then([&merged_stats, keyspace = std::move(keyspace), table = std::move(table)] {
ss::table_logstor_info result;
result.keyspace = keyspace;
result.table = table;
result.compaction_groups = merged_stats.compaction_group_count;
result.segments = merged_stats.segment_count;
// NOTE(review): only `count` and `max_data_size` are filled in for each
// histogram bucket; the logstor_hist_bucket API model also declares `bucket`
// and `min_data_size` - confirm that leaving them unset is intentional.
for (const auto& bucket : merged_stats.histogram) {
ss::logstor_hist_bucket hist;
hist.count = bucket.count;
hist.max_data_size = bucket.max_data_size;
result.data_size_histogram.push(std::move(hist));
}
return make_ready_future<json::json_return_type>(stream_object(result));
});
});
}
static
future<json::json_return_type>
rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::raft_group0_client& group0_client, std::unique_ptr<http::request> req) {
@@ -1583,14 +1569,26 @@ rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::
static
future<json::json_return_type>
rest_upgrade_to_raft_topology(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
apilog.info("Requested to schedule upgrade to raft topology, but this version does not need it since it uses raft topology by default.");
apilog.info("Requested to schedule upgrade to raft topology");
try {
co_await ss.invoke_on(0, [] (auto& ss) {
return ss.start_upgrade_to_raft_topology();
});
} catch (...) {
auto ex = std::current_exception();
apilog.error("Failed to schedule upgrade to raft topology: {}", ex);
std::rethrow_exception(std::move(ex));
}
co_return json_void();
}
static
future<json::json_return_type>
rest_raft_topology_upgrade_status(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
co_return sstring("done");
const auto ustate = co_await ss.invoke_on(0, [] (auto& ss) {
return ss.get_topology_upgrade_state();
});
co_return sstring(format("{}", ustate));
}
static
@@ -1800,8 +1798,9 @@ rest_bind(FuncType func, BindArgs&... args) {
return std::bind_front(func, std::ref(args)...);
}
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
ss::toppartitions_generic.set(r, rest_bind(rest_toppartitions_generic, ctx));
ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
@@ -1816,9 +1815,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
ss::decommission.set(r, rest_bind(rest_decommission, ss));
ss::move.set(r, rest_bind(rest_move, ss));
ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
@@ -1867,7 +1864,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
@@ -1884,6 +1880,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
void unset_storage_service(http_context& ctx, routes& r) {
ss::get_token_endpoint.unset(r);
ss::toppartitions_generic.unset(r);
ss::get_release_version.unset(r);
ss::get_scylla_release_version.unset(r);
ss::get_schema_version.unset(r);
@@ -1897,8 +1894,6 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::reset_cleanup_needed.unset(r);
ss::force_flush.unset(r);
ss::force_keyspace_flush.unset(r);
ss::logstor_compaction.unset(r);
ss::logstor_flush.unset(r);
ss::decommission.unset(r);
ss::move.unset(r);
ss::remove_node.unset(r);
@@ -1946,7 +1941,6 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::get_ownership.unset(r);
ss::get_effective_ownership.unset(r);
ss::sstable_info.unset(r);
ss::logstor_info.unset(r);
ss::reload_raft_topology_state.unset(r);
ss::upgrade_to_raft_topology.unset(r);
ss::raft_topology_upgrade_status.unset(r);
@@ -2026,16 +2020,12 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
auto tag = req->get_query_param("tag");
auto column_families = split(req->get_query_param("cf"), ",");
auto sfopt = req->get_query_param("sf");
auto tcopt = req->get_query_param("tc");
db::snapshot_options opts = {
.skip_flush = strcasecmp(sfopt.c_str(), "true") == 0,
};
auto sf = db::snapshot_ctl::skip_flush(strcasecmp(sfopt.c_str(), "true") == 0);
std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
try {
if (column_families.empty()) {
co_await snap_ctl.local().take_snapshot(tag, keynames, opts);
co_await snap_ctl.local().take_snapshot(tag, keynames, sf);
} else {
if (keynames.empty()) {
throw httpd::bad_param_exception("The keyspace of column families must be specified");
@@ -2043,7 +2033,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
if (keynames.size() > 1) {
throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
}
co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, opts);
co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
}
co_return json_void();
} catch (...) {
@@ -2052,27 +2042,6 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
}
});
// REST handler for take_cluster_snapshot: logs the query parameters, reads
// "tag", "keyspace", "table" and "skip_flush" from the request and awaits
// snap_ctl.local().take_cluster_column_family_snapshot(). Failures are logged
// and rethrown to the HTTP layer.
ss::take_cluster_snapshot.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
    apilog.info("take_cluster_snapshot: {}", req->get_query_params());

    auto snapshot_tag = req->get_query_param("tag");
    auto tables = split(req->get_query_param("table"), ",");
    std::vector<sstring> keyspaces = split(req->get_query_param("keyspace"), ",");

    // Note: not published/active. Retain as internal option, but...
    auto skip_flush_param = req->get_query_param("skip_flush");
    // Only a case-insensitive "true" enables skip_flush; anything else leaves it off.
    const bool skip_flush = strcasecmp(skip_flush_param.c_str(), "true") == 0;
    db::snapshot_options opts = {.skip_flush = skip_flush};

    try {
        co_await snap_ctl.local().take_cluster_column_family_snapshot(keyspaces, tables, snapshot_tag, opts);
        co_return json_void();
    } catch (...) {
        apilog.error("take_cluster_snapshot failed: {}", std::current_exception());
        throw;
    }
});
ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
apilog.info("del_snapshot: {}", req->get_query_params());
auto tag = req->get_query_param("tag");
@@ -2099,8 +2068,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
auto info = parse_scrub_options(ctx, std::move(req));
if (!info.snapshot_tag.empty()) {
db::snapshot_options opts = {.skip_flush = false};
co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, opts);
co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, db::snapshot_ctl::skip_flush::no);
}
compaction::compaction_stats stats;
@@ -2163,7 +2131,6 @@ void unset_snapshot(http_context& ctx, routes& r) {
ss::start_backup.unset(r);
cf::get_true_snapshots_size.unset(r);
cf::get_all_true_snapshots_size.unset(r);
ss::decommission.unset(r);
}
}

View File

@@ -66,7 +66,7 @@ struct scrub_info {
scrub_info parse_scrub_options(const http_context& ctx, std::unique_ptr<http::request> req);
void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>&, service::raft_group0_client&);
void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, service::raft_group0_client&);
void unset_storage_service(http_context& ctx, httpd::routes& r);
void set_sstables_loader(http_context& ctx, httpd::routes& r, sharded<sstables_loader>& sst_loader);
void unset_sstables_loader(http_context& ctx, httpd::routes& r);

View File

@@ -190,13 +190,6 @@ void set_system(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
});
});
// REST handler for get_chosen_sstable_version: asks the user sstables manager
// for its preferred sstable version and returns it as a string. The lookup is
// submitted to shard 0.
hs::get_chosen_sstable_version.set(r, [&ctx] (std::unique_ptr<request> req) {
    return smp::submit_to(0, [&ctx] {
        auto&& manager = ctx.db.local().get_user_sstables_manager();
        auto preferred = manager.get_preferred_sstable_version();
        return make_ready_future<json::json_return_type>(seastar::to_sstring(preferred));
    });
});
}
}

View File

@@ -9,7 +9,6 @@
#include <seastar/core/chunked_fifo.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/exception.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/http/exception.hh>
#include "task_manager.hh"
@@ -265,7 +264,7 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
if (id) {
module->unregister_task(id);
}
co_await coroutine::maybe_yield();
co_await maybe_yield();
}
});
co_return json_void();

View File

@@ -146,8 +146,7 @@ void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::
auto info = parse_scrub_options(ctx, std::move(req));
if (!info.snapshot_tag.empty()) {
db::snapshot_options opts = {.skip_flush = false};
co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, opts);
co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, db::snapshot_ctl::skip_flush::no);
}
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();

View File

@@ -209,11 +209,15 @@ future<> audit::stop_audit() {
});
}
audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch) {
audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table) {
if (!audit_instance().local_is_initialized()) {
return nullptr;
}
return std::make_unique<audit_info>(cat, keyspace, table, batch);
return std::make_unique<audit_info>(cat, keyspace, table);
}
// Returns an empty audit_info_ptr (a null std::unique_ptr<audit_info>).
audit_info_ptr audit::create_no_audit_info() {
    return nullptr;
}
future<> audit::start(const db::config& cfg) {
@@ -263,21 +267,18 @@ future<> audit::log_login(const sstring& username, socket_address client_ip, boo
}
future<> inspect(shared_ptr<cql3::cql_statement> statement, service::query_state& query_state, const cql3::query_options& options, bool error) {
auto audit_info = statement->get_audit_info();
if (!audit_info) {
return make_ready_future<>();
}
if (audit_info->batch()) {
cql3::statements::batch_statement* batch = static_cast<cql3::statements::batch_statement*>(statement.get());
cql3::statements::batch_statement* batch = dynamic_cast<cql3::statements::batch_statement*>(statement.get());
if (batch != nullptr) {
return do_for_each(batch->statements().begin(), batch->statements().end(), [&query_state, &options, error] (auto&& m) {
return inspect(m.statement, query_state, options, error);
});
} else {
if (audit::local_audit_instance().should_log(audit_info)) {
auto audit_info = statement->get_audit_info();
if (bool(audit_info) && audit::local_audit_instance().should_log(audit_info)) {
return audit::local_audit_instance().log(audit_info, query_state, options, error);
}
return make_ready_future<>();
}
return make_ready_future<>();
}
future<> inspect_login(const sstring& username, socket_address client_ip, bool error) {

View File

@@ -75,13 +75,11 @@ class audit_info final {
sstring _keyspace;
sstring _table;
sstring _query;
bool _batch;
public:
audit_info(statement_category cat, sstring keyspace, sstring table, bool batch)
audit_info(statement_category cat, sstring keyspace, sstring table)
: _category(cat)
, _keyspace(std::move(keyspace))
, _table(std::move(table))
, _batch(batch)
{ }
void set_query_string(const std::string_view& query_string) {
_query = sstring(query_string);
@@ -91,7 +89,6 @@ public:
const sstring& query() const { return _query; }
sstring category_string() const;
statement_category category() const { return _category; }
bool batch() const { return _batch; }
};
using audit_info_ptr = std::unique_ptr<audit_info>;
@@ -129,7 +126,8 @@ public:
}
static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
static future<> stop_audit();
static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table);
static audit_info_ptr create_no_audit_info();
audit(locator::shared_token_metadata& stm,
cql3::query_processor& qp,
service::migration_manager& mm,

View File

@@ -53,10 +53,10 @@ static std::string json_escape(std::string_view str) {
}
future<> audit_syslog_storage_helper::syslog_send_helper(temporary_buffer<char> msg) {
future<> audit_syslog_storage_helper::syslog_send_helper(const sstring& msg) {
try {
auto lock = co_await get_units(_semaphore, 1, std::chrono::hours(1));
co_await _sender.send(_syslog_address, std::span(&msg, 1));
co_await _sender.send(_syslog_address, net::packet{msg.data(), msg.size()});
}
catch (const std::exception& e) {
auto error_msg = seastar::format(
@@ -90,7 +90,7 @@ future<> audit_syslog_storage_helper::start(const db::config& cfg) {
co_return;
}
co_await syslog_send_helper(temporary_buffer<char>::copy_of("Initializing syslog audit backend."));
co_await syslog_send_helper("Initializing syslog audit backend.");
}
future<> audit_syslog_storage_helper::stop() {
@@ -120,7 +120,7 @@ future<> audit_syslog_storage_helper::write(const audit_info* audit_info,
audit_info->table(),
username);
co_await syslog_send_helper(std::move(msg).release());
co_await syslog_send_helper(msg);
}
future<> audit_syslog_storage_helper::write_login(const sstring& username,
@@ -139,7 +139,7 @@ future<> audit_syslog_storage_helper::write_login(const sstring& username,
client_ip,
username);
co_await syslog_send_helper(std::move(msg).release());
co_await syslog_send_helper(msg.c_str());
}
}

View File

@@ -26,7 +26,7 @@ class audit_syslog_storage_helper : public storage_helper {
net::datagram_channel _sender;
seastar::semaphore _semaphore;
future<> syslog_send_helper(seastar::temporary_buffer<char> msg);
future<> syslog_send_helper(const sstring& msg);
public:
explicit audit_syslog_storage_helper(cql3::query_processor&, service::migration_manager&);
virtual ~audit_syslog_storage_helper();

View File

@@ -17,14 +17,15 @@ target_sources(scylla_auth
password_authenticator.cc
passwords.cc
permission.cc
permissions_cache.cc
resource.cc
role_or_anonymous.cc
roles-metadata.cc
sasl_challenge.cc
saslauthd_authenticator.cc
service.cc
standard_role_manager.cc
transitional.cc
maintenance_socket_authenticator.cc
maintenance_socket_role_manager.cc)
target_include_directories(scylla_auth
PUBLIC
@@ -48,4 +49,4 @@ if (Scylla_USE_PRECOMPILED_HEADER_USE)
target_precompile_headers(scylla_auth REUSE_FROM scylla-precompiled-header)
endif()
check_headers(check-headers scylla_auth
GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)

View File

@@ -9,9 +9,19 @@
#include "auth/allow_all_authenticator.hh"
#include "service/migration_manager.hh"
#include "utils/class_registrator.hh"
namespace auth {
constexpr std::string_view allow_all_authenticator_name("org.apache.cassandra.auth.AllowAllAuthenticator");
// To ensure correct initialization order, we unfortunately need to use a string literal.
static const class_registrator<
authenticator,
allow_all_authenticator,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_manager&,
cache&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");
}

View File

@@ -9,9 +9,18 @@
#include "auth/allow_all_authorizer.hh"
#include "auth/common.hh"
#include "utils/class_registrator.hh"
namespace auth {
constexpr std::string_view allow_all_authorizer_name("org.apache.cassandra.auth.AllowAllAuthorizer");
// To ensure correct initialization order, we unfortunately need to use a string literal.
static const class_registrator<
authorizer,
allow_all_authorizer,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_manager&> registration("org.apache.cassandra.auth.AllowAllAuthorizer");
}

View File

@@ -26,7 +26,7 @@ extern const std::string_view allow_all_authorizer_name;
class allow_all_authorizer final : public authorizer {
public:
allow_all_authorizer(cql3::query_processor&) {
allow_all_authorizer(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&) {
}
virtual future<> start() override {

View File

@@ -8,7 +8,6 @@
#include "auth/cache.hh"
#include "auth/common.hh"
#include "auth/role_or_anonymous.hh"
#include "auth/roles-metadata.hh"
#include "cql3/query_processor.hh"
#include "cql3/untyped_result_set.hh"
@@ -16,38 +15,19 @@
#include "db/system_keyspace.hh"
#include "schema/schema.hh"
#include <iterator>
#include <seastar/core/abort_source.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/core/format.hh>
#include <seastar/core/metrics.hh>
#include <seastar/core/do_with.hh>
namespace auth {
logging::logger logger("auth-cache");
cache::cache(cql3::query_processor& qp, abort_source& as) noexcept
cache::cache(cql3::query_processor& qp) noexcept
: _current_version(0)
, _qp(qp)
, _loading_sem(1)
, _as(as)
, _permission_loader(nullptr)
, _permission_loader_sem(8) {
namespace sm = seastar::metrics;
_metrics.add_group("auth_cache", {
sm::make_gauge("roles", [this] { return _roles.size(); },
sm::description("Number of roles currently cached")),
sm::make_gauge("permissions", [this] {
return _cached_permissions_count;
}, sm::description("Total number of permission sets currently cached across all roles"))
});
, _qp(qp) {
}
// Stores the callback in _permission_loader, replacing any previous one.
// The loader is taken by value and moved into the member.
void cache::set_permission_loader(permission_loader_func loader) {
_permission_loader = std::move(loader);
}
lw_shared_ptr<const cache::role_record> cache::get(std::string_view role) const noexcept {
lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
auto it = _roles.find(role);
if (it == _roles.end()) {
return {};
@@ -55,93 +35,6 @@ lw_shared_ptr<const cache::role_record> cache::get(std::string_view role) const
return it->second;
}
// Invokes func(name, record) once for every entry of the _roles map.
// There is no suspension point inside the loop.
void cache::for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const {
    for (auto it = _roles.begin(); it != _roles.end(); ++it) {
        func(it->first, *it->second);
    }
}
// Number of entries currently held in the _roles map.
size_t cache::roles_count() const noexcept {
return _roles.size();
}
// Returns the cached permission set of `role` on resource `r`, loading it via
// load_permissions() on a miss.
//
// Anonymous callers use the dedicated _anonymous_permissions map; named roles
// use the cached_permissions map stored in their role_record. A named role
// that is not present in _roles gets permissions::NONE.
//
// `role` and `r` are captured by reference for the duration of the load.
future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
    lw_shared_ptr<role_record> record;
    std::unordered_map<resource, permission_set>* target = &_anonymous_permissions;

    if (!is_anonymous(role)) {
        auto found = _roles.find(*role.name);
        if (found == _roles.end()) {
            // Role might have been deleted but there are some connections
            // left which reference it. They should no longer have access to anything.
            return make_ready_future<permission_set>(permissions::NONE);
        }
        record = found->second;
        target = &record->cached_permissions;
    }

    auto hit = target->find(r);
    if (hit != target->end()) {
        return make_ready_future<permission_set>(hit->second);
    }

    // Keep the record alive while loading: it owns the map `target` points to
    // (not needed for the anonymous map, where `record` is null).
    return do_with(std::move(record), [this, &role, &r, target] (auto& /*keep_alive*/) {
        return load_permissions(role, r, target);
    });
}
// Loads the permission set of `role` on `r` through _permission_loader and
// records it in `*perms_cache`.
//
// Asserts that a loader has been installed. One unit of _permission_loader_sem
// is held for the whole load; _as is passed to get_units().
// Returns the cached value if one appeared while waiting, otherwise the freshly
// loaded one.
future<permission_set> cache::load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache) {
SCYLLA_ASSERT(_permission_loader);
auto units = co_await get_units(_permission_loader_sem, 1, _as);
// Check again, perhaps we were blocked and other call loaded
// the permissions already. This is a protection against misses storm.
if (auto it = perms_cache->find(r); it != perms_cache->end()) {
co_return it->second;
}
auto perms = co_await _permission_loader(role, r);
// add_permissions() also bumps _cached_permissions_count when the entry is new.
add_permissions(*perms_cache, r, perms);
co_return perms;
}
// Drops the cached permission set for resource `r` from the anonymous map and
// from every role's cached_permissions map.
//
// Holds one unit of _loading_sem for the whole pass and may yield between
// roles.
future<> cache::prune(const resource& r) {
    auto units = co_await get_units(_loading_sem, 1, _as);
    // Erase through remove_permissions() so that _cached_permissions_count is
    // decremented: load_permissions() counts anonymous entries via
    // add_permissions(), so a bare erase() here would leave the counter
    // permanently too high.
    remove_permissions(_anonymous_permissions, r);
    for (auto& it : _roles) {
        // Pruning can run concurrently with other functions but it
        // can only cause cached_permissions extra reload via get_permissions.
        remove_permissions(it.second->cached_permissions, r);
        co_await coroutine::maybe_yield();
    }
}
// Re-runs _permission_loader for every (role, resource) pair that currently has
// a cached permission set, including the anonymous map, and overwrites the
// cached values with the results.
//
// Asserts that a loader has been installed and holds one unit of _loading_sem
// for the whole pass.
future<> cache::reload_all_permissions() noexcept {
SCYLLA_ASSERT(_permission_loader);
auto units = co_await get_units(_loading_sem, 1, _as);
// Snapshot the keys first: the maps are written to inside the loops below,
// across suspension points, so they are not iterated directly.
auto copy_keys = [] (const std::unordered_map<resource, permission_set>& m) {
std::vector<resource> keys;
keys.reserve(m.size());
for (const auto& [res, _] : m) {
keys.push_back(res);
}
return keys;
};
// A default-constructed role_or_anonymous is used for the anonymous map.
const role_or_anonymous anon;
for (const auto& res : copy_keys(_anonymous_permissions)) {
_anonymous_permissions[res] = co_await _permission_loader(anon, res);
}
for (auto& [role, entry] : _roles) {
auto& perms_cache = entry->cached_permissions;
auto r = role_or_anonymous(role);
for (const auto& res : copy_keys(perms_cache)) {
perms_cache[res] = co_await _permission_loader(r, res);
}
}
logger.debug("Reloaded auth cache with {} entries", _roles.size());
}
future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& role) const {
auto rec = make_lw_shared<role_record>();
rec->version = _current_version;
@@ -209,7 +102,7 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
future<> cache::prune_all() noexcept {
for (auto it = _roles.begin(); it != _roles.end(); ) {
if (it->second->version != _current_version) {
remove_role(it++);
_roles.erase(it++);
co_await coroutine::maybe_yield();
} else {
++it;
@@ -219,9 +112,10 @@ future<> cache::prune_all() noexcept {
}
future<> cache::load_all() {
if (legacy_mode(_qp)) {
co_return;
}
SCYLLA_ASSERT(this_shard_id() == 0);
auto units = co_await get_units(_loading_sem, 1, _as);
++_current_version;
logger.info("Loading all roles");
@@ -230,7 +124,7 @@ future<> cache::load_all() {
const auto name = r.get_as<sstring>("role");
auto role = co_await fetch_role(name);
if (role) {
add_role(name, role);
_roles[name] = role;
}
co_return stop_iteration::no;
};
@@ -243,71 +137,36 @@ future<> cache::load_all() {
co_await distribute_role(name, role);
}
co_await container().invoke_on_others([this](cache& c) -> future<> {
auto units = co_await get_units(c._loading_sem, 1, c._as);
c._current_version = _current_version;
co_await c.prune_all();
});
}
future<> cache::gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name) {
if (!role) {
// Role might have been removed or not yet added, either way
// their members will be handled by another top call to this function.
future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
if (legacy_mode(_qp)) {
co_return;
}
for (const auto& member_name : role->members) {
bool is_new = roles.insert(member_name).second;
if (!is_new) {
continue;
}
lw_shared_ptr<cache::role_record> member_role;
auto r = _roles.find(member_name);
if (r != _roles.end()) {
member_role = r->second;
}
co_await gather_inheriting_roles(roles, member_role, member_name);
}
}
future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
SCYLLA_ASSERT(this_shard_id() == 0);
auto units = co_await get_units(_loading_sem, 1, _as);
std::unordered_set<role_name_t> roles_to_clear_perms;
for (const auto& name : roles) {
logger.info("Loading role {}", name);
auto role = co_await fetch_role(name);
if (role) {
add_role(name, role);
co_await gather_inheriting_roles(roles_to_clear_perms, role, name);
_roles[name] = role;
} else {
if (auto it = _roles.find(name); it != _roles.end()) {
auto old_role = it->second;
remove_role(it);
co_await gather_inheriting_roles(roles_to_clear_perms, old_role, name);
}
_roles.erase(name);
}
co_await distribute_role(name, role);
}
co_await container().invoke_on_all([&roles_to_clear_perms] (cache& c) -> future<> {
for (const auto& name : roles_to_clear_perms) {
c.clear_role_permissions(name);
co_await coroutine::maybe_yield();
}
});
}
future<> cache::distribute_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
auto role_ptr = role.get();
co_await container().invoke_on_others([&name, role_ptr](cache& c) -> future<> {
auto units = co_await get_units(c._loading_sem, 1, c._as);
co_await container().invoke_on_others([&name, role_ptr](cache& c) {
if (!role_ptr) {
c.remove_role(name);
co_return;
c._roles.erase(name);
return;
}
auto role_copy = make_lw_shared<role_record>(*role_ptr);
c.add_role(name, std::move(role_copy));
c._roles[name] = std::move(role_copy);
});
}
@@ -318,40 +177,4 @@ bool cache::includes_table(const table_id& id) noexcept {
|| id == db::system_keyspace::role_permissions()->id();
}
// Inserts `role` under `name`, replacing any existing record, and keeps
// _cached_permissions_count in step: the replaced record's cached permissions
// are subtracted and the new record's are added.
void cache::add_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
    const auto existing = _roles.find(name);
    const size_t replaced = existing != _roles.end() ? existing->second->cached_permissions.size() : 0;
    _cached_permissions_count -= replaced;
    _cached_permissions_count += role->cached_permissions.size();
    _roles[name] = std::move(role);
}
// Removes the role called `name` if it is cached; does nothing otherwise.
void cache::remove_role(const role_name_t& name) {
    auto it = _roles.find(name);
    if (it == _roles.end()) {
        return;
    }
    remove_role(it);
}
// Erases the entry `it` points at from _roles.
// The record's cached permissions are subtracted from _cached_permissions_count
// first, while the iterator is still valid.
void cache::remove_role(roles_map::iterator it) {
_cached_permissions_count -= it->second->cached_permissions.size();
_roles.erase(it);
}
// Empties the cached_permissions map of the role called `name` (if cached) and
// subtracts the dropped entries from _cached_permissions_count.
void cache::clear_role_permissions(const role_name_t& name) {
    auto it = _roles.find(name);
    if (it == _roles.end()) {
        return;
    }
    auto& cached = it->second->cached_permissions;
    _cached_permissions_count -= cached.size();
    cached.clear();
}
// Records `perms` for resource `r` in `cache` unless an entry already exists.
// _cached_permissions_count is incremented only when a new entry was inserted.
void cache::add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms) {
    const bool inserted = cache.emplace(r, perms).second;
    if (!inserted) {
        return;
    }
    ++_cached_permissions_count;
}
// Erases the entry for resource `r` from `cache` and subtracts the number of
// erased elements from _cached_permissions_count.
void cache::remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r) {
    const auto erased = cache.erase(r);
    _cached_permissions_count -= erased;
}
} // namespace auth

View File

@@ -8,8 +8,6 @@
#pragma once
#include <seastar/core/abort_source.hh>
#include <string_view>
#include <unordered_set>
#include <unordered_map>
@@ -17,15 +15,11 @@
#include <seastar/core/future.hh>
#include <seastar/core/sharded.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/core/metrics_registration.hh>
#include "absl-flat_hash_map.hh"
#include <absl/container/flat_hash_map.h>
#include "auth/permission.hh"
#include "auth/common.hh"
#include "auth/resource.hh"
#include "auth/role_or_anonymous.hh"
namespace cql3 { class query_processor; }
@@ -35,7 +29,6 @@ class cache : public peering_sharded_service<cache> {
public:
using role_name_t = sstring;
using version_tag_t = char;
using permission_loader_func = std::function<future<permission_set>(const role_or_anonymous&, const resource&)>;
struct role_record {
bool can_login = false;
@@ -43,60 +36,26 @@ public:
std::unordered_set<role_name_t> member_of;
std::unordered_set<role_name_t> members;
sstring salted_hash;
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
private:
friend cache;
// cached permissions include effects of role's inheritance
std::unordered_map<resource, permission_set> cached_permissions;
std::unordered_map<sstring, sstring> attributes;
std::unordered_map<sstring, permission_set> permissions;
version_tag_t version; // used for seamless cache reloads
};
explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
lw_shared_ptr<const role_record> get(std::string_view role) const noexcept;
void set_permission_loader(permission_loader_func loader);
future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
future<> prune(const resource& r);
future<> reload_all_permissions() noexcept;
explicit cache(cql3::query_processor& qp) noexcept;
lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
future<> load_all();
future<> load_roles(std::unordered_set<role_name_t> roles);
static bool includes_table(const table_id&) noexcept;
// Returns the number of roles in the cache.
size_t roles_count() const noexcept;
// The callback doesn't suspend (no co_await) so it observes the state
// of the cache atomically.
void for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const;
private:
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>, sstring_hash, sstring_eq>;
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
roles_map _roles;
// anonymous permissions map exists mainly due to compatibility with
// higher layers which use role_or_anonymous to get permissions.
std::unordered_map<resource, permission_set> _anonymous_permissions;
version_tag_t _current_version;
cql3::query_processor& _qp;
semaphore _loading_sem; // protects iteration of _roles map
abort_source& _as;
permission_loader_func _permission_loader;
semaphore _permission_loader_sem; // protects against reload storms on a single role change
metrics::metric_groups _metrics;
size_t _cached_permissions_count = 0;
future<lw_shared_ptr<role_record>> fetch_role(const role_name_t& role) const;
future<> prune_all() noexcept;
future<> distribute_role(const role_name_t& name, const lw_shared_ptr<role_record> role);
future<> gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name);
void add_role(const role_name_t& name, lw_shared_ptr<role_record> role);
void remove_role(const role_name_t& name);
void remove_role(roles_map::iterator it);
void clear_role_permissions(const role_name_t& name);
void add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms);
void remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r);
future<permission_set> load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache);
};
} // namespace auth

View File

@@ -13,11 +13,14 @@
#include <boost/regex.hpp>
#include <fmt/ranges.h>
#include "utils/class_registrator.hh"
#include "utils/to_string.hh"
#include "data_dictionary/data_dictionary.hh"
#include "cql3/query_processor.hh"
#include "db/config.hh"
static const auto CERT_AUTH_NAME = "com.scylladb.auth.CertificateAuthenticator";
const std::string_view auth::certificate_authenticator_name(CERT_AUTH_NAME);
static logging::logger clogger("certificate_authenticator");
@@ -27,6 +30,13 @@ static const std::string cfg_query_attr = "query";
static const std::string cfg_source_subject = "SUBJECT";
static const std::string cfg_source_altname = "ALTNAME";
static const class_registrator<auth::authenticator
, auth::certificate_authenticator
, cql3::query_processor&
, ::service::raft_group0_client&
, ::service::migration_manager&
, auth::cache&> cert_auth_reg(CERT_AUTH_NAME);
enum class auth::certificate_authenticator::query_source {
subject, altname
};
@@ -89,7 +99,7 @@ future<> auth::certificate_authenticator::stop() {
}
std::string_view auth::certificate_authenticator::qualified_java_name() const {
return "com.scylladb.auth.CertificateAuthenticator";
return certificate_authenticator_name;
}
bool auth::certificate_authenticator::require_authentication() const {

View File

@@ -27,6 +27,8 @@ namespace auth {
class cache;
extern const std::string_view certificate_authenticator_name;
class certificate_authenticator : public authenticator {
enum class query_source;
std::vector<std::pair<query_source, boost::regex>> _queries;

View File

@@ -14,11 +14,18 @@
#include <seastar/core/sharded.hh>
#include "mutation/canonical_mutation.hh"
#include "schema/schema_fwd.hh"
#include "mutation/timestamp.hh"
#include "utils/assert.hh"
#include "utils/exponential_backoff_retry.hh"
#include "cql3/query_processor.hh"
#include "cql3/statements/create_table_statement.hh"
#include "schema/schema_builder.hh"
#include "service/migration_manager.hh"
#include "service/raft/group0_state_machine.hh"
#include "timeout_config.hh"
#include "utils/error_injection.hh"
#include "db/system_keyspace.hh"
namespace auth {
@@ -26,14 +33,22 @@ namespace meta {
namespace legacy {
constinit const std::string_view AUTH_KS("system_auth");
constinit const std::string_view USERS_CF("users");
} // namespace legacy
constinit const std::string_view AUTH_PACKAGE_NAME("org.apache.cassandra.auth.");
} // namespace meta
static logging::logger auth_log("auth");
std::string default_superuser(cql3::query_processor& qp) {
return qp.db().get_config().auth_superuser_name();
bool legacy_mode(cql3::query_processor& qp) {
return qp.auth_version < db::auth_version_t::v2;
}
// Name of the keyspace that holds auth data: db::system_keyspace::NAME, or the
// legacy meta::legacy::AUTH_KS when legacy_mode(qp) is true.
std::string_view get_auth_ks_name(cql3::query_processor& qp) {
    if (!legacy_mode(qp)) {
        return db::system_keyspace::NAME;
    }
    return meta::legacy::AUTH_KS;
}
// Func must support being invoked more than once.
@@ -50,6 +65,47 @@ future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_f
}).discard_result();
}
// Creates a legacy auth metadata table from the CREATE TABLE statement `cql`
// unless a table with the same keyspace and name already exists.
//
// The statement is prepared against meta::legacy::AUTH_KS and the resulting
// schema is rebuilt with the id returned by generate_legacy_id(). The new table
// is announced through `mm` under a group0 guard; an already_exists_exception
// raised by the announcement is swallowed.
//
// `table_name` is not read by this function. Must run on shard 0.
static future<> create_legacy_metadata_table_if_missing_impl(
        std::string_view table_name,
        cql3::query_processor& qp,
        std::string_view cql,
        ::service::migration_manager& mm) {
    SCYLLA_ASSERT(this_shard_id() == 0); // once_among_shards makes sure a function is executed on shard 0 only

    auto db = qp.db();

    auto parsed = cql3::query_processor::parse_statement(cql, cql3::dialect{});
    auto& cf_statement = static_cast<cql3::statements::raw::cf_statement&>(*parsed);
    cf_statement.prepare_keyspace(meta::legacy::AUTH_KS);

    auto create_statement = static_pointer_cast<cql3::statements::create_table_statement>(
            cf_statement.prepare(db, qp.get_cql_stats())->statement);
    const auto base_schema = create_statement->get_cf_meta_data(qp.db());

    // Rebuild the schema with the legacy id for this keyspace/table pair.
    schema_builder builder(base_schema);
    builder.set_uuid(generate_legacy_id(base_schema->ks_name(), base_schema->cf_name()));
    schema_ptr table = builder.build();

    if (db.has_schema(table->ks_name(), table->cf_name())) {
        co_return;
    }

    auto group0_guard = co_await mm.start_group0_operation();
    auto ts = group0_guard.write_timestamp();
    try {
        co_await mm.announce(co_await ::service::prepare_new_column_family_announcement(qp.proxy(), table, ts),
                std::move(group0_guard), format("auth: create {} metadata table", table->cf_name()));
    } catch (const exceptions::already_exists_exception&) {
        // Deliberately ignored: the table already exists.
    }
}
// noexcept entry point for create_legacy_metadata_table_if_missing_impl().
// The call goes through futurize_invoke(), so the result is always delivered
// as a future<>; all arguments are forwarded unchanged.
future<> create_legacy_metadata_table_if_missing(
std::string_view table_name,
cql3::query_processor& qp,
std::string_view cql,
::service::migration_manager& mm) noexcept {
return futurize_invoke(create_legacy_metadata_table_if_missing_impl, table_name, qp, cql, mm);
}
::service::query_state& internal_distributed_query_state() noexcept {
#ifdef DEBUG
// Give the much slower debug tests more headroom for completing auth queries.
@@ -84,6 +140,56 @@ static future<> announce_mutations_with_guard(
return group0_client.add_entry(std::move(group0_cmd), std::move(group0_guard), as, timeout);
}
// Drains the mutation generator produced by `gen` and announces the mutations
// through group0 in batches, so that a single command stays below a size
// threshold derived from group0_client.max_command_size().
//
// start_operation_func: obtains a group0 guard; called once up front and again
//                       before each later batch.
// gen:                  called once with the first guard's write timestamp; the
//                       returned generator is polled until it yields an empty value.
// as, timeout:          forwarded to announce_mutations_with_guard().
//
// NOTE(review): `timestamp` is reassigned whenever a new guard is taken, but it
// is only read by the single gen(timestamp) call, so later batches appear to
// keep using the first guard's timestamp. Confirm this is intended.
future<> announce_mutations_with_batching(
::service::raft_group0_client& group0_client,
start_operation_func_t start_operation_func,
std::function<::service::mutations_generator(api::timestamp_type t)> gen,
seastar::abort_source& as,
std::optional<::service::raft_timeout> timeout) {
// account for command's overhead, it's better to use smaller threshold than constantly bounce off the limit
size_t memory_threshold = group0_client.max_command_size() * 0.75;
// Error injection hook: lowers the threshold to 1000 when the injection is enabled.
utils::get_local_injector().inject("auth_announce_mutations_command_max_size",
[&memory_threshold] {
memory_threshold = 1000;
});
size_t memory_usage = 0;
utils::chunked_vector<canonical_mutation> muts;
// guard has to be taken before we execute code in gen as
// it can do read-before-write and we want announce_mutations
// operation to be linearizable with other such calls,
// for instance if we do select and then delete in gen
// we want both to operate on the same data or fail
// if someone else modified it in the middle
std::optional<::service::group0_guard> group0_guard;
group0_guard = co_await start_operation_func(as);
auto timestamp = group0_guard->write_timestamp();
auto g = gen(timestamp);
while (auto mut = co_await g()) {
muts.push_back(canonical_mutation{*mut});
memory_usage += muts.back().representation().size();
if (memory_usage >= memory_threshold) {
// The guard is consumed by every announcement, so all batches after the
// first need a fresh one.
if (!group0_guard) {
group0_guard = co_await start_operation_func(as);
timestamp = group0_guard->write_timestamp();
}
co_await announce_mutations_with_guard(group0_client, std::move(muts), std::move(*group0_guard), as, timeout);
// Reset the batch state; the moved-from guard is cleared explicitly so the
// checks above and below see it as absent.
group0_guard = std::nullopt;
memory_usage = 0;
muts = {};
}
}
// Flush whatever is left below the threshold.
if (!muts.empty()) {
if (!group0_guard) {
group0_guard = co_await start_operation_func(as);
timestamp = group0_guard->write_timestamp();
}
co_await announce_mutations_with_guard(group0_client, std::move(muts), std::move(*group0_guard), as, timeout);
}
}
future<> announce_mutations(
cql3::query_processor& qp,
::service::raft_group0_client& group0_client,

View File

@@ -21,7 +21,12 @@
using namespace std::chrono_literals;
namespace replica {
class database;
}
namespace service {
class migration_manager;
class query_state;
}
@@ -35,8 +40,10 @@ namespace meta {
namespace legacy {
extern constinit const std::string_view AUTH_KS;
extern constinit const std::string_view USERS_CF;
} // namespace legacy
constexpr std::string_view DEFAULT_SUPERUSER_NAME("cassandra");
extern constinit const std::string_view AUTH_PACKAGE_NAME;
} // namespace meta
@@ -45,7 +52,12 @@ constexpr std::string_view PERMISSIONS_CF = "role_permissions";
constexpr std::string_view ROLE_MEMBERS_CF = "role_members";
constexpr std::string_view ROLE_ATTRIBUTES_CF = "role_attributes";
std::string default_superuser(cql3::query_processor& qp);
// This is a helper to check whether auth-v2 is on.
bool legacy_mode(cql3::query_processor& qp);
// The legacy implementation uses a different keyspace, so the keyspace name
// has to be selected at runtime depending on the enabled feature.
std::string_view get_auth_ks_name(cql3::query_processor& qp);
template <class Task>
future<> once_among_shards(Task&& f) {
@@ -59,6 +71,12 @@ future<> once_among_shards(Task&& f) {
// Func must support being invoked more than once.
future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_function<future<>()> func);
future<> create_legacy_metadata_table_if_missing(
std::string_view table_name,
cql3::query_processor&,
std::string_view cql,
::service::migration_manager&) noexcept;
///
/// Time-outs for internal, non-local CQL queries.
///
@@ -66,6 +84,20 @@ future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_f
::service::raft_timeout get_raft_timeout() noexcept;
// Executes an update via the group0 mechanism; the mutations will be applied on all nodes.
// Use this function when you need to perform a read-before-write under a single guard,
// or when you have more than one mutation and may exceed the single command size limit.
//
// start_operation_func_t: callable that, given an abort_source, yields a future
// resolving to a ::service::group0_guard.
using start_operation_func_t = std::function<future<::service::group0_guard>(abort_source&)>;
// Parameters:
//   group0_client        - group0 client used for the announcement.
//   start_operation_func - caller-supplied function that produces the group0 guard (see below).
//   gen                  - given a timestamp, returns the generator of mutations to announce.
//   as                   - abort source passed to start_operation_func.
//   timeout              - optional raft timeout.
future<> announce_mutations_with_batching(
::service::raft_group0_client& group0_client,
// This function may also run in the topology coordinator context, where stronger
// guarantees are needed than start_operation from group0_client provides, so the
// caller is allowed to inject a custom function here.
start_operation_func_t start_operation_func,
std::function<::service::mutations_generator(api::timestamp_type t)> gen,
seastar::abort_source& as,
std::optional<::service::raft_timeout> timeout);
// Execute update query via group0 mechanism, mutations will be applied on all nodes.
future<> announce_mutations(
cql3::query_processor& qp,

View File

@@ -26,6 +26,7 @@ extern "C" {
#include "cql3/untyped_result_set.hh"
#include "exceptions/exceptions.hh"
#include "utils/log.hh"
#include "utils/class_registrator.hh"
namespace auth {
@@ -39,14 +40,111 @@ static constexpr std::string_view PERMISSIONS_NAME = "permissions";
static logging::logger alogger("default_authorizer");
default_authorizer::default_authorizer(cql3::query_processor& qp)
: _qp(qp) {
// To ensure correct initialization order, we unfortunately need to use a string literal.
static const class_registrator<
authorizer,
default_authorizer,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_manager&> password_auth_reg("org.apache.cassandra.auth.CassandraAuthorizer");
default_authorizer::default_authorizer(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
: _qp(qp)
, _migration_manager(mm) {
}
// Out-of-line destructor; there is no custom teardown logic.
default_authorizer::~default_authorizer() = default;
static const sstring legacy_table_name{"permissions"};
// Reports whether the legacy permissions table is present in the legacy
// auth keyspace, according to the schema known to the query processor's database.
bool default_authorizer::legacy_metadata_exists() const {
    auto&& database = _qp.db();
    const bool table_present = database.has_schema(meta::legacy::AUTH_KS, legacy_table_name);
    return table_present;
}
// Resolves to true when the PERMISSIONS_CF table in the legacy auth keyspace
// contains at least one row, false when it is empty.
future<bool> default_authorizer::legacy_any_granted() const {
    // The statement text never changes, so format it only once.
    static const sstring probe_query = seastar::format("SELECT * FROM {}.{} LIMIT 1", meta::legacy::AUTH_KS, PERMISSIONS_CF);

    // Maps the query result to "is there any row at all?".
    auto has_any_row = [](::shared_ptr<cql3::untyped_result_set> rows) {
        return !rows->empty();
    };

    return _qp.execute_internal(
            probe_query,
            db::consistency_level::LOCAL_ONE,
            {},
            cql3::query_processor::cache_internal::yes).then(has_any_row);
}
// Reads every row of the legacy permissions table and re-applies it by calling
// grant() once per row.
// Logs at info level when the migration starts and when it finishes. On failure
// an error is logged and the original exception is re-thrown, so the returned
// future fails with it.
future<> default_authorizer::migrate_legacy_metadata() {
alogger.info("Starting migration of legacy permissions metadata.");
// Formatted once; both the keyspace and the table name are fixed.
static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);
return _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
cql3::query_processor::cache_internal::no).then([this](::shared_ptr<cql3::untyped_result_set> results) {
// Rows are processed one after another.
return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
// do_with keeps the role name, the parsed resource and the batch object
// alive for the duration of the grant() call below.
return do_with(
row.get_as<sstring>("username"),
parse_resource(row.get_as<sstring>(RESOURCE_NAME)),
// NOTE(review): presumably a placeholder batch that is never announced - confirm.
::service::group0_batch::unused(),
[this, &row](const auto& username, const auto& r, auto& mc) {
const permission_set perms = permissions::from_strings(row.get_set<sstring>(PERMISSIONS_NAME));
return grant(username, perms, r, mc);
});
// The empty continuation only holds a copy of `results`, so the result set
// (whose rows are referenced above) stays referenced until iteration ends.
}).finally([results] {});
}).then([] {
alogger.info("Finished migrating legacy permissions metadata.");
}).handle_exception([](std::exception_ptr ep) {
alogger.error("Encountered an error during migration!");
// Propagate the failure to the caller after logging it.
std::rethrow_exception(ep);
});
}
// Start-up path used when auth runs in legacy mode (see start()).
// Via once_among_shards it creates the permissions table in the legacy auth
// keyspace if it is missing, and then stores in _finished the future of a
// deferred task that migrates old-style permissions metadata when applicable.
future<> default_authorizer::start_legacy() {
// CREATE TABLE statement for the permissions table; formatted once.
static const sstring create_table = fmt::format(
"CREATE TABLE {}.{} ("
"{} text,"
"{} text,"
"{} set<text>,"
"PRIMARY KEY({}, {})"
") WITH gc_grace_seconds={}",
meta::legacy::AUTH_KS,
PERMISSIONS_CF,
ROLE_NAME,
RESOURCE_NAME,
PERMISSIONS_NAME,
ROLE_NAME,
RESOURCE_NAME,
90 * 24 * 60 * 60); // 3 months.
return once_among_shards([this] {
return create_legacy_metadata_table_if_missing(
PERMISSIONS_CF,
_qp,
create_table,
_migration_manager).then([this] {
// The deferred work is not awaited here; its future is kept in _finished.
_finished = do_after_system_ready(_as, [this] {
// Runs inside async() so the blocking-style .get() calls below are allowed.
return async([this] {
_migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get();
if (legacy_metadata_exists()) {
// Migrate only when no permissions have been granted in the new table yet.
if (!legacy_any_granted().get()) {
migrate_legacy_metadata().get();
return;
}
alogger.warn("Ignoring legacy permissions metadata since role permissions exist.");
}
});
});
});
});
}
// Starts the authorizer. Only legacy mode needs any start-up work; otherwise
// an already-resolved future is returned.
future<> default_authorizer::start() {
    return legacy_mode(_qp) ? start_legacy() : make_ready_future<>();
}
@@ -63,7 +161,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? AND {} = ?",
PERMISSIONS_NAME,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
ROLE_NAME,
RESOURCE_NAME);
@@ -87,13 +185,21 @@ default_authorizer::modify(
std::string_view op,
::service::group0_batch& mc) {
const sstring query = seastar::format("UPDATE {}.{} SET {} = {} {} ? WHERE {} = ? AND {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
PERMISSIONS_NAME,
PERMISSIONS_NAME,
op,
ROLE_NAME,
RESOURCE_NAME);
if (legacy_mode(_qp)) {
co_return co_await _qp.execute_internal(
query,
db::consistency_level::ONE,
internal_distributed_query_state(),
{permissions::to_strings(set), sstring(role_name), resource.name()},
cql3::query_processor::cache_internal::no).discard_result();
}
co_await collect_mutations(_qp, mc, query,
{permissions::to_strings(set), sstring(role_name), resource.name()});
}
@@ -112,7 +218,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
ROLE_NAME,
RESOURCE_NAME,
PERMISSIONS_NAME,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF);
const auto results = co_await _qp.execute_internal(
@@ -137,16 +243,74 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
future<> default_authorizer::revoke_all(std::string_view role_name, ::service::group0_batch& mc) {
try {
const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
ROLE_NAME);
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(
query,
db::consistency_level::ONE,
internal_distributed_query_state(),
{sstring(role_name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
}
} catch (const exceptions::request_execution_exception& e) {
alogger.warn("CassandraAuthorizer failed to revoke all permissions of {}: {}", role_name, e);
}
}
// Legacy-mode implementation of "revoke all permissions on a resource".
// Selects every role that has a row for `resource` in the permissions table and
// deletes each (role, resource) row with a direct internal query.
// A request_execution_exception raised by the SELECT or by any DELETE is logged
// as a warning and swallowed; other exception types are not caught here.
future<> default_authorizer::revoke_all_legacy(const resource& resource) {
// Function-local static: formatted on first use only, so the keyspace name
// returned by get_auth_ks_name() at that moment is baked into the statement.
static const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? ALLOW FILTERING",
ROLE_NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
RESOURCE_NAME);
return _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
{resource.name()},
// then_wrapped hands over the future itself, so a failed SELECT is handled below.
cql3::query_processor::cache_internal::no).then_wrapped([this, resource](future<::shared_ptr<cql3::untyped_result_set>> f) {
try {
auto res = f.get();
// The per-row lambda captures `res` by value, holding a reference to the
// result set alongside the row references passed to it.
return parallel_for_each(
res->begin(),
res->end(),
[this, res, resource](const cql3::untyped_result_set::row& r) {
// Same first-use formatting as the SELECT statement above.
static const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
get_auth_ks_name(_qp),
PERMISSIONS_CF,
ROLE_NAME,
RESOURCE_NAME);
return _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
{r.get_as<sstring>(ROLE_NAME), resource.name()},
cql3::query_processor::cache_internal::no).discard_result().handle_exception(
[resource](auto ep) {
// Re-throw so the exception can be matched by type.
try {
std::rethrow_exception(ep);
} catch (const exceptions::request_execution_exception& e) {
alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
}
});
});
} catch (const exceptions::request_execution_exception& e) {
// The SELECT failed: log the failure and report success to the caller.
alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
return make_ready_future();
}
});
}
future<> default_authorizer::revoke_all(const resource& resource, ::service::group0_batch& mc) {
if (legacy_mode(_qp)) {
co_return co_await revoke_all_legacy(resource);
}
if (resource.kind() == resource_kind::data &&
data_resource_view(resource).is_keyspace()) {
revoke_all_keyspace_resources(resource, mc);
@@ -157,7 +321,7 @@ future<> default_authorizer::revoke_all(const resource& resource, ::service::gro
auto gen = [this, name] (api::timestamp_type t) -> ::service::mutations_generator {
const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? ALLOW FILTERING",
ROLE_NAME,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
RESOURCE_NAME);
auto res = co_await _qp.execute_internal(
@@ -167,7 +331,7 @@ future<> default_authorizer::revoke_all(const resource& resource, ::service::gro
cql3::query_processor::cache_internal::no);
for (const auto& r : *res) {
const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
ROLE_NAME,
RESOURCE_NAME);
@@ -192,7 +356,7 @@ void default_authorizer::revoke_all_keyspace_resources(const resource& ks_resour
const sstring query = seastar::format("SELECT {}, {} FROM {}.{}",
ROLE_NAME,
RESOURCE_NAME,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF);
auto res = co_await _qp.execute_internal(
query,
@@ -207,7 +371,7 @@ void default_authorizer::revoke_all_keyspace_resources(const resource& ks_resour
continue;
}
const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
PERMISSIONS_CF,
ROLE_NAME,
RESOURCE_NAME);

View File

@@ -27,12 +27,14 @@ namespace auth {
class default_authorizer : public authorizer {
cql3::query_processor& _qp;
::service::migration_manager& _migration_manager;
abort_source _as{};
future<> _finished{make_ready_future<>()};
public:
default_authorizer(cql3::query_processor&);
default_authorizer(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
~default_authorizer();
@@ -57,6 +59,16 @@ public:
virtual const resource_set& protected_resources() const override;
private:
future<> start_legacy();
bool legacy_metadata_exists() const;
future<> revoke_all_legacy(const resource&);
future<bool> legacy_any_granted() const;
future<> migrate_legacy_metadata();
future<> modify(std::string_view, permission_set, const resource&, std::string_view, ::service::group0_batch&);
void revoke_all_keyspace_resources(const resource& ks_resource, ::service::group0_batch& mc);

View File

@@ -24,6 +24,7 @@
#include "exceptions/exceptions.hh"
#include "seastarx.hh"
#include "service/raft/raft_group0_client.hh"
#include "utils/class_registrator.hh"
#include "db/config.hh"
#include "utils/exponential_backoff_retry.hh"
@@ -71,22 +72,26 @@ std::vector<sstring> get_attr_values(LDAP* ld, LDAPMessage* res, const char* att
return values;
}
const char* ldap_role_manager_full_name = "com.scylladb.auth.LDAPRoleManager";
} // anonymous namespace
namespace auth {
static const class_registrator<
role_manager,
ldap_role_manager,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_manager&,
cache&> registration(ldap_role_manager_full_name);
ldap_role_manager::ldap_role_manager(
std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
uint32_t permissions_update_interval_in_ms,
utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
: _std_mgr(qp, rg0c, mm, cache), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
, _bind_password(bind_password)
, _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
, _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
, _cache(cache)
, _cache_pruner(make_ready_future<>()) {
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this))) {
}
ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
@@ -95,8 +100,6 @@ ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_
qp.db().get_config().ldap_attr_role(),
qp.db().get_config().ldap_bind_dn(),
qp.db().get_config().ldap_bind_passwd(),
qp.db().get_config().permissions_update_interval_in_ms(),
qp.db().get_config().permissions_update_interval_in_ms.observe([this] (const uint32_t& v) { _permissions_update_interval_in_ms = v; }),
qp,
rg0c,
mm,
@@ -104,7 +107,7 @@ ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_
}
std::string_view ldap_role_manager::qualified_java_name() const noexcept {
return "com.scylladb.auth.LDAPRoleManager";
return ldap_role_manager_full_name;
}
const resource_set& ldap_role_manager::protected_resources() const {
@@ -116,22 +119,6 @@ future<> ldap_role_manager::start() {
return make_exception_future(
std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
}
_cache_pruner = futurize_invoke([this] () -> future<> {
while (true) {
try {
co_await seastar::sleep_abortable(std::chrono::milliseconds(_permissions_update_interval_in_ms), _as);
} catch (const seastar::sleep_aborted&) {
co_return; // ignore
}
co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
try {
co_await c.reload_all_permissions();
} catch (...) {
mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
}
});
}
});
return _std_mgr.start();
}
@@ -188,11 +175,7 @@ future<conn_ptr> ldap_role_manager::reconnect() {
future<> ldap_role_manager::stop() {
_as.request_abort();
return std::move(_cache_pruner).then([this] {
return _std_mgr.stop();
}).then([this] {
return _connection_factory.stop();
});
return _std_mgr.stop().then([this] { return _connection_factory.stop(); });
}
future<> ldap_role_manager::create(std::string_view name, const role_config& config, ::service::group0_batch& mc) {

View File

@@ -10,7 +10,6 @@
#pragma once
#include <seastar/core/abort_source.hh>
#include <seastar/core/future.hh>
#include <stdexcept>
#include "ent/ldap/ldap_connection.hh"
@@ -35,29 +34,22 @@ class ldap_role_manager : public role_manager {
seastar::sstring _target_attr; ///< LDAP entry attribute containing the Scylla role name.
seastar::sstring _bind_name; ///< Username for LDAP simple bind.
seastar::sstring _bind_password; ///< Password for LDAP simple bind.
uint32_t _permissions_update_interval_in_ms;
utils::observer<uint32_t> _permissions_update_interval_in_ms_observer;
mutable ldap_reuser _connection_factory; // Potentially modified by query_granted().
seastar::abort_source _as;
cache& _cache;
seastar::future<> _cache_pruner;
public:
ldap_role_manager(
std::string_view query_template, ///< LDAP query template as described in Scylla documentation.
std::string_view target_attr, ///< LDAP entry attribute containing the Scylla role name.
std::string_view bind_name, ///< LDAP bind credentials.
std::string_view bind_password, ///< LDAP bind credentials.
uint32_t permissions_update_interval_in_ms,
utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
cql3::query_processor& qp, ///< Passed to standard_role_manager.
::service::raft_group0_client& rg0c, ///< Passed to standard_role_manager.
::service::migration_manager& mm, ///< Passed to standard_role_manager.
cache& cache ///< Passed to standard_role_manager.
);
/// Retrieves LDAP configuration entries from qp and invokes the other constructor.
/// Retrieves LDAP configuration entries from qp and invokes the other constructor. Required by
/// class_registrator<role_manager>.
ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache);
/// Thrown when query-template parsing fails.

View File

@@ -1,31 +0,0 @@
/*
* Copyright (C) 2026-present ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#include "auth/maintenance_socket_authenticator.hh"
namespace auth {

// Out-of-line destructor; nothing beyond the base class needs tearing down.
maintenance_socket_authenticator::~maintenance_socket_authenticator() = default;

// Start-up is a no-op: an already-resolved future is returned.
future<> maintenance_socket_authenticator::start() {
    return seastar::make_ready_future<>();
}

// Also a no-op: no superuser is created by this authenticator.
future<> maintenance_socket_authenticator::ensure_superuser_is_created() const {
    return seastar::make_ready_future<>();
}

// This authenticator never requires authentication.
bool maintenance_socket_authenticator::require_authentication() const {
    constexpr bool authentication_required = false;
    return authentication_required;
}

} // namespace auth

View File

@@ -1,36 +0,0 @@
/*
* Copyright (C) 2026-present ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#pragma once
#include <seastar/core/shared_future.hh>
#include "password_authenticator.hh"
namespace auth {
// Authenticator for clients that connect through the maintenance socket.
// Such clients are not required to authenticate, yet roles and their
// credentials can still be managed through the inherited
// password_authenticator functionality.
class maintenance_socket_authenticator : public password_authenticator {
public:
    // Inherit all password_authenticator constructors unchanged.
    using password_authenticator::password_authenticator;

    virtual ~maintenance_socket_authenticator();

    future<> start() override;

    future<> ensure_superuser_is_created() const override;

    bool require_authentication() const override;
};
} // namespace auth

View File

@@ -1,37 +0,0 @@
/*
* Copyright (C) 2026-present ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#pragma once
#include "auth/default_authorizer.hh"
#include "auth/permission.hh"
namespace auth {
// Authorizer for clients that connect through the maintenance socket.
// Every permission is granted unconditionally (as AllowAllAuthorizer does),
// while grant/revoke operations remain available because they are inherited
// from the underlying CassandraAuthorizer / default_authorizer.
class maintenance_socket_authorizer : public default_authorizer {
public:
    // Inherit all default_authorizer constructors unchanged.
    using default_authorizer::default_authorizer;

    ~maintenance_socket_authorizer() override = default;

    // Start-up is a no-op: an already-resolved future is returned.
    future<> start() override {
        return make_ready_future<>();
    }

    // Grants the full permission set regardless of role or resource.
    future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
        const auto& everything = permissions::ALL;
        return make_ready_future<permission_set>(everything);
    }
};
} // namespace auth

View File

@@ -13,48 +13,23 @@
#include <string_view>
#include "auth/cache.hh"
#include "cql3/description.hh"
#include "utils/log.hh"
#include "utils/on_internal_error.hh"
#include "utils/class_registrator.hh"
namespace auth {
static logging::logger log("maintenance_socket_role_manager");
constexpr std::string_view maintenance_socket_role_manager_name = "com.scylladb.auth.MaintenanceSocketRoleManager";
future<> maintenance_socket_role_manager::ensure_role_operations_are_enabled() {
if (_is_maintenance_mode) {
on_internal_error(log, "enabling role operations not allowed in maintenance mode");
}
static const class_registrator<
role_manager,
maintenance_socket_role_manager,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_manager&,
cache&> registration(sstring{maintenance_socket_role_manager_name});
if (_std_mgr.has_value()) {
on_internal_error(log, "role operations are already enabled");
}
_std_mgr.emplace(_qp, _group0_client, _migration_manager, _cache);
return _std_mgr->start();
}
// Switches the manager into maintenance mode.
// Doing so after role operations have been enabled (i.e. once the delegate
// manager exists) is reported as an internal error.
void maintenance_socket_role_manager::set_maintenance_mode() {
    const bool role_operations_enabled = _std_mgr.has_value();
    if (role_operations_enabled) {
        on_internal_error(log, "cannot enter maintenance mode after role operations have been enabled");
    }

    _is_maintenance_mode = true;
}
maintenance_socket_role_manager::maintenance_socket_role_manager(
cql3::query_processor& qp,
::service::raft_group0_client& rg0c,
::service::migration_manager& mm,
cache& c)
: _qp(qp)
, _group0_client(rg0c)
, _migration_manager(mm)
, _cache(c)
, _std_mgr(std::nullopt)
, _is_maintenance_mode(false) {
}
std::string_view maintenance_socket_role_manager::qualified_java_name() const noexcept {
return "com.scylladb.auth.MaintenanceSocketRoleManager";
return maintenance_socket_role_manager_name;
}
const resource_set& maintenance_socket_role_manager::protected_resources() const {
@@ -68,161 +43,81 @@ future<> maintenance_socket_role_manager::start() {
}
// Stops the delegate role manager if one has been created; otherwise there is
// nothing to stop and an already-resolved future is returned.
future<> maintenance_socket_role_manager::stop() {
    if (_std_mgr) {
        return _std_mgr->stop();
    }
    return make_ready_future<>();
}
// Forwards to the delegate role manager when it exists; otherwise resolves
// immediately without doing anything.
future<> maintenance_socket_role_manager::ensure_superuser_is_created() {
    if (_std_mgr) {
        return _std_mgr->ensure_superuser_is_created();
    }
    return make_ready_future<>();
}
// Builds a failed future<T> holding a std::runtime_error which states that
// `operation` is unavailable through the maintenance socket in maintenance mode.
template<typename T = void>
future<T> operation_not_available_in_maintenance_mode_exception(std::string_view operation) {
    auto message = fmt::format("role manager: {} operation not available through maintenance socket in maintenance mode", operation);
    return make_exception_future<T>(std::runtime_error(message));
}
// Builds a failed future<T> holding a std::runtime_error which states that
// `operation` is unavailable because role operations are not enabled yet.
template<typename T = void>
future<T> manager_not_ready_exception(std::string_view operation) {
    auto message = fmt::format("role manager: {} operation not available because manager not ready yet (role operations not enabled)", operation);
    return make_exception_future<T>(std::runtime_error(message));
}
// Checks whether the operation called `name` may be executed right now.
// Fails when the manager is in maintenance mode, or when the delegate role
// manager has not been created yet; resolves successfully otherwise.
// The maintenance-mode check takes precedence.
future<> maintenance_socket_role_manager::validate_operation(std::string_view name) const {
    if (_is_maintenance_mode) {
        return operation_not_available_in_maintenance_mode_exception(name);
    }
    return _std_mgr.has_value()
            ? make_ready_future<>()
            : manager_not_ready_exception(name);
}
future<> maintenance_socket_role_manager::create(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
auto f = validate_operation("CREATE");
if (f.failed()) {
return f;
}
return _std_mgr->create(role_name, c, mc);
future<> maintenance_socket_role_manager::ensure_superuser_is_created() {
return make_ready_future<>();
}
// Builds a failed future<T> holding a std::runtime_error which states that
// `operation` is not supported through the maintenance socket.
template<typename T = void>
future<T> operation_not_supported_exception(std::string_view operation) {
    auto message = fmt::format("role manager: {} operation not supported through maintenance socket", operation);
    return make_exception_future<T>(std::runtime_error(message));
}
// Role creation is rejected: always returns a failed future reporting that
// CREATE is not supported through the maintenance socket.
future<> maintenance_socket_role_manager::create(std::string_view role_name, const role_config&, ::service::group0_batch&) {
    constexpr std::string_view operation_name = "CREATE";
    return operation_not_supported_exception(operation_name);
}
future<> maintenance_socket_role_manager::drop(std::string_view role_name, ::service::group0_batch& mc) {
auto f = validate_operation("DROP");
if (f.failed()) {
return f;
}
return _std_mgr->drop(role_name, mc);
return operation_not_supported_exception("DROP");
}
future<> maintenance_socket_role_manager::alter(std::string_view role_name, const role_config_update& u, ::service::group0_batch& mc) {
auto f = validate_operation("ALTER");
if (f.failed()) {
return f;
}
return _std_mgr->alter(role_name, u, mc);
future<> maintenance_socket_role_manager::alter(std::string_view role_name, const role_config_update&, ::service::group0_batch&) {
return operation_not_supported_exception("ALTER");
}
future<> maintenance_socket_role_manager::grant(std::string_view grantee_name, std::string_view role_name, ::service::group0_batch& mc) {
auto f = validate_operation("GRANT");
if (f.failed()) {
return f;
}
return _std_mgr->grant(grantee_name, role_name, mc);
return operation_not_supported_exception("GRANT");
}
future<> maintenance_socket_role_manager::revoke(std::string_view revokee_name, std::string_view role_name, ::service::group0_batch& mc) {
auto f = validate_operation("REVOKE");
if (f.failed()) {
return f;
}
return _std_mgr->revoke(revokee_name, role_name, mc);
return operation_not_supported_exception("REVOKE");
}
future<role_set> maintenance_socket_role_manager::query_granted(std::string_view grantee_name, recursive_role_query m) {
auto f = validate_operation("QUERY GRANTED");
if (f.failed()) {
return make_exception_future<role_set>(f.get_exception());
}
return _std_mgr->query_granted(grantee_name, m);
future<role_set> maintenance_socket_role_manager::query_granted(std::string_view grantee_name, recursive_role_query) {
return operation_not_supported_exception<role_set>("QUERY GRANTED");
}
future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state& qs) {
auto f = validate_operation("QUERY ALL DIRECTLY GRANTED");
if (f.failed()) {
return make_exception_future<role_to_directly_granted_map>(f.get_exception());
}
return _std_mgr->query_all_directly_granted(qs);
future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state&) {
return operation_not_supported_exception<role_to_directly_granted_map>("QUERY ALL DIRECTLY GRANTED");
}
future<role_set> maintenance_socket_role_manager::query_all(::service::query_state& qs) {
auto f = validate_operation("QUERY ALL");
if (f.failed()) {
return make_exception_future<role_set>(f.get_exception());
}
return _std_mgr->query_all(qs);
future<role_set> maintenance_socket_role_manager::query_all(::service::query_state&) {
return operation_not_supported_exception<role_set>("QUERY ALL");
}
future<bool> maintenance_socket_role_manager::exists(std::string_view role_name) {
auto f = validate_operation("EXISTS");
if (f.failed()) {
return make_exception_future<bool>(f.get_exception());
}
return _std_mgr->exists(role_name);
return operation_not_supported_exception<bool>("EXISTS");
}
future<bool> maintenance_socket_role_manager::is_superuser(std::string_view role_name) {
auto f = validate_operation("IS SUPERUSER");
if (f.failed()) {
return make_exception_future<bool>(f.get_exception());
}
return _std_mgr->is_superuser(role_name);
return make_ready_future<bool>(true);
}
future<bool> maintenance_socket_role_manager::can_login(std::string_view role_name) {
auto f = validate_operation("CAN LOGIN");
if (f.failed()) {
return make_exception_future<bool>(f.get_exception());
}
return _std_mgr->can_login(role_name);
return make_ready_future<bool>(true);
}
future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
auto f = validate_operation("GET ATTRIBUTE");
if (f.failed()) {
return make_exception_future<std::optional<sstring>>(f.get_exception());
}
return _std_mgr->get_attribute(role_name, attribute_name, qs);
future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) {
return operation_not_supported_exception<std::optional<sstring>>("GET ATTRIBUTE");
}
future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
auto f = validate_operation("QUERY ATTRIBUTE FOR ALL");
if (f.failed()) {
return make_exception_future<role_manager::attribute_vals>(f.get_exception());
}
return _std_mgr->query_attribute_for_all(attribute_name, qs);
future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) {
return operation_not_supported_exception<role_manager::attribute_vals>("QUERY ATTRIBUTE");
}
future<> maintenance_socket_role_manager::set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) {
auto f = validate_operation("SET ATTRIBUTE");
if (f.failed()) {
return f;
}
return _std_mgr->set_attribute(role_name, attribute_name, attribute_value, mc);
return operation_not_supported_exception("SET ATTRIBUTE");
}
future<> maintenance_socket_role_manager::remove_attribute(std::string_view role_name, std::string_view attribute_name, ::service::group0_batch& mc) {
auto f = validate_operation("REMOVE ATTRIBUTE");
if (f.failed()) {
return f;
}
return _std_mgr->remove_attribute(role_name, attribute_name, mc);
return operation_not_supported_exception("REMOVE ATTRIBUTE");
}
future<std::vector<cql3::description>> maintenance_socket_role_manager::describe_role_grants() {
auto f = validate_operation("DESCRIBE ROLE GRANTS");
if (f.failed()) {
return make_exception_future<std::vector<cql3::description>>(f.get_exception());
}
return _std_mgr->describe_role_grants();
return operation_not_supported_exception<std::vector<cql3::description>>("DESCRIBE SCHEMA WITH INTERNALS");
}
} // namespace auth

View File

@@ -11,7 +11,6 @@
#include "auth/cache.hh"
#include "auth/resource.hh"
#include "auth/role_manager.hh"
#include "auth/standard_role_manager.hh"
#include <seastar/core/future.hh>
namespace cql3 {
@@ -25,26 +24,13 @@ class raft_group0_client;
namespace auth {
// This role manager is used by the maintenance socket. It has disabled all role management operations
// in maintenance mode. In normal mode it delegates all operations to a standard_role_manager,
// which is created on demand when the node joins the cluster.
extern const std::string_view maintenance_socket_role_manager_name;
// This role manager is used by the maintenance socket. All role management operations are disabled so that it does not
// depend on the system_auth keyspace, which may not yet have been created when the maintenance socket starts listening.
class maintenance_socket_role_manager final : public role_manager {
cql3::query_processor& _qp;
::service::raft_group0_client& _group0_client;
::service::migration_manager& _migration_manager;
cache& _cache;
std::optional<standard_role_manager> _std_mgr;
bool _is_maintenance_mode;
public:
void set_maintenance_mode() override;
// Ensures role management operations are enabled.
// It must be called once the node has joined the cluster.
// In the meantime all role management operations will fail.
future<> ensure_role_operations_are_enabled() override;
maintenance_socket_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);
maintenance_socket_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&) {}
virtual std::string_view qualified_java_name() const noexcept override;
@@ -56,21 +42,21 @@ public:
virtual future<> ensure_superuser_is_created() override;
virtual future<> create(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) override;
virtual future<> create(std::string_view role_name, const role_config&, ::service::group0_batch&) override;
virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override;
virtual future<> alter(std::string_view role_name, const role_config_update& u, ::service::group0_batch& mc) override;
virtual future<> alter(std::string_view role_name, const role_config_update&, ::service::group0_batch&) override;
virtual future<> grant(std::string_view grantee_name, std::string_view role_name, ::service::group0_batch& mc) override;
virtual future<> revoke(std::string_view revokee_name, std::string_view role_name, ::service::group0_batch& mc) override;
virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query m) override;
virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;
virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state& qs) override;
virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;
virtual future<role_set> query_all(::service::query_state& qs) override;
virtual future<role_set> query_all(::service::query_state&) override;
virtual future<bool> exists(std::string_view role_name) override;
@@ -78,19 +64,15 @@ public:
virtual future<bool> can_login(std::string_view role_name) override;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) override;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;
virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) override;
virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;
virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;
virtual future<> remove_attribute(std::string_view role_name, std::string_view attribute_name, ::service::group0_batch& mc) override;
virtual future<std::vector<cql3::description>> describe_role_grants() override;
private:
future<> validate_operation(std::string_view name) const;
};
}

View File

@@ -26,9 +26,10 @@
#include "cql3/untyped_result_set.hh"
#include "utils/log.hh"
#include "service/migration_manager.hh"
#include "utils/class_registrator.hh"
#include "replica/database.hh"
#include "cql3/query_processor.hh"
#include "db/config.hh"
#include "db/system_keyspace.hh"
namespace auth {
@@ -36,10 +37,29 @@ constexpr std::string_view password_authenticator_name("org.apache.cassandra.aut
// name of the hash column.
static constexpr std::string_view SALTED_HASH = "salted_hash";
static constexpr std::string_view DEFAULT_USER_NAME = meta::DEFAULT_SUPERUSER_NAME;
static const sstring DEFAULT_USER_PASSWORD = sstring(meta::DEFAULT_SUPERUSER_NAME);
static logging::logger plogger("password_authenticator");
// To ensure correct initialization order, we unfortunately need to use a string literal.
static const class_registrator<
authenticator,
password_authenticator,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_manager&,
cache&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");
static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());
// Picks the effective value of a string setting: `value` when it is non-empty,
// otherwise the fallback `def`. The result is a non-owning view of whichever
// argument was selected.
static std::string_view get_config_value(std::string_view value, std::string_view def) {
    if (value.empty()) {
        return def;
    }
    return value;
}
// Returns the name of the default superuser as an owning string: the configured
// `auth_superuser_name` when it is non-empty, otherwise DEFAULT_USER_NAME.
// The lookup and the copy into std::string are kept in one expression so the
// intermediate string_view never outlives the value it refers to.
std::string password_authenticator::default_superuser(const db::config& cfg) {
return std::string(get_config_value(cfg.auth_superuser_name(), DEFAULT_USER_NAME));
}
password_authenticator::~password_authenticator() {
}
@@ -49,6 +69,7 @@ password_authenticator::password_authenticator(cql3::query_processor& qp, ::serv
, _migration_manager(mm)
, _cache(cache)
, _stopped(make_ready_future<>())
, _superuser(default_superuser(qp.db().get_config()))
{}
static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
@@ -57,18 +78,76 @@ static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
sstring password_authenticator::update_row_query() const {
return seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
SALTED_HASH,
meta::roles_table::role_col_name);
}
static const sstring legacy_table_name{"credentials"};
// Reports whether the database has a schema for the legacy table
// (`legacy_table_name`) in the legacy auth keyspace (meta::legacy::AUTH_KS).
bool password_authenticator::legacy_metadata_exists() const {
return _qp.db().has_schema(meta::legacy::AUTH_KS, legacy_table_name);
}
// Copies salted password hashes from the legacy table (`legacy_table_name`) into the
// roles table of the legacy auth keyspace.
//
// Every row of the legacy table is read at QUORUM; for each row the `username` and
// salted-hash columns are written into the roles table, keyed by the role column,
// at the consistency level chosen by consistency_for_user(username).
//
// The returned future resolves once all rows have been processed. On failure an
// error is logged and the original exception is rethrown to the caller.
future<> password_authenticator::migrate_legacy_metadata() const {
plogger.info("Starting migration of legacy authentication metadata.");
static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);
return _qp.execute_internal(
query,
db::consistency_level::QUORUM,
internal_distributed_query_state(),
cql3::query_processor::cache_internal::no).then([this](::shared_ptr<cql3::untyped_result_set> results) {
// Rows are processed one at a time, in result-set order.
return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
auto username = row.get_as<sstring>("username");
auto salted_hash = row.get_as<sstring>(SALTED_HASH);
// The statement text does not depend on the row, so it is formatted only once.
static const auto query = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
meta::legacy::AUTH_KS,
meta::roles_table::name,
SALTED_HASH,
meta::roles_table::role_col_name);
return _qp.execute_internal(
query,
consistency_for_user(username),
internal_distributed_query_state(),
{std::move(salted_hash), username},
cql3::query_processor::cache_internal::no).discard_result();
// The empty continuation holds a copy of `results`, keeping the result set
// alive until the per-row iteration has finished.
}).finally([results] {});
}).then([] {
plogger.info("Finished migrating legacy authentication metadata.");
}).handle_exception([](std::exception_ptr ep) {
// Log and propagate: the caller still observes the original exception.
plogger.error("Encountered an error during migration!");
std::rethrow_exception(ep);
});
}
// Legacy-mode helper: makes sure the superuser row (`_superuser`) in the legacy
// auth keyspace carries a salted password hash.
//
// Nothing is written when the row already satisfies `has_salted_hash`. Otherwise
// the hash from the `auth_superuser_salted_password` setting is stored, or, when
// that setting is empty, a freshly generated hash of DEFAULT_USER_PASSWORD.
// The write is issued at QUORUM.
future<> password_authenticator::legacy_create_default_if_missing() {
    if (co_await legacy::default_role_row_satisfies(_qp, &has_salted_hash, _superuser)) {
        co_return;
    }

    // Prefer the pre-hashed password from the configuration, if one was supplied.
    std::string hash_to_store(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
    if (hash_to_store.empty()) {
        hash_to_store = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
    }

    const auto update_stmt = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
            meta::legacy::AUTH_KS,
            meta::roles_table::name,
            SALTED_HASH,
            meta::roles_table::role_col_name);

    co_await _qp.execute_internal(
            update_stmt,
            db::consistency_level::QUORUM,
            internal_distributed_query_state(),
            {hash_to_store, _superuser},
            cql3::query_processor::cache_internal::no);

    plogger.info("Created default superuser authentication record.");
}
future<> password_authenticator::maybe_create_default_password() {
auto needs_password = [this] () -> future<bool> {
if (default_superuser(_qp).empty()) {
co_return false;
}
const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", db::system_keyspace::NAME, meta::roles_table::name);
const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
auto results = co_await _qp.execute_internal(query,
db::consistency_level::LOCAL_ONE,
internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
@@ -78,7 +157,7 @@ future<> password_authenticator::maybe_create_default_password() {
bool has_default = false;
bool has_superuser_with_password = false;
for (auto& result : *results) {
if (result.get_as<sstring>(meta::roles_table::role_col_name) == default_superuser(_qp)) {
if (result.get_as<sstring>(meta::roles_table::role_col_name) == _superuser) {
has_default = true;
}
if (has_salted_hash(result)) {
@@ -99,12 +178,12 @@ future<> password_authenticator::maybe_create_default_password() {
co_return;
}
// Set default superuser's password.
std::string salted_pwd(_qp.db().get_config().auth_superuser_salted_password());
std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
if (salted_pwd.empty()) {
co_return;
salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
}
const auto update_query = update_row_query();
co_await collect_mutations(_qp, batch, update_query, {salted_pwd, default_superuser(_qp)});
co_await collect_mutations(_qp, batch, update_query, {salted_pwd, _superuser});
co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
plogger.info("Created default superuser authentication record.");
}
@@ -137,14 +216,58 @@ future<> password_authenticator::start() {
_stopped = do_after_system_ready(_as, [this] {
return async([this] {
if (legacy_mode(_qp)) {
if (!_superuser_created_promise.available()) {
// Counterintuitively, we mark promise as ready before any startup work
// because wait_for_schema_agreement() below will block indefinitely
// without cluster majority. In that case, blocking node startup
// would lead to a cluster deadlock.
_superuser_created_promise.set_value();
}
_migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get();
if (legacy::any_nondefault_role_row_satisfies(_qp, &has_salted_hash, _superuser).get()) {
if (legacy_metadata_exists()) {
plogger.warn("Ignoring legacy authentication metadata since nondefault data already exist.");
}
return;
}
if (legacy_metadata_exists()) {
migrate_legacy_metadata().get();
return;
}
legacy_create_default_if_missing().get();
}
utils::get_local_injector().inject("password_authenticator_start_pause", utils::wait_for_message(5min)).get();
maybe_create_default_password_with_retries().get();
if (!_superuser_created_promise.available()) {
_superuser_created_promise.set_value();
if (!legacy_mode(_qp)) {
maybe_create_default_password_with_retries().get();
if (!_superuser_created_promise.available()) {
_superuser_created_promise.set_value();
}
}
});
});
if (legacy_mode(_qp)) {
static const sstring create_roles_query = fmt::format(
"CREATE TABLE {}.{} ("
" {} text PRIMARY KEY,"
" can_login boolean,"
" is_superuser boolean,"
" member_of set<text>,"
" salted_hash text"
")",
meta::legacy::AUTH_KS,
meta::roles_table::name,
meta::roles_table::role_col_name);
return create_legacy_metadata_table_if_missing(
meta::roles_table::name,
_qp,
create_roles_query,
_migration_manager);
}
return make_ready_future<>();
});
}
@@ -154,6 +277,15 @@ future<> password_authenticator::stop() {
return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
}
// Chooses the consistency level for credential queries concerning `role_name`:
// QUORUM for the hardcoded default user name (DEFAULT_USER_NAME), LOCAL_ONE for
// every other role.
db::consistency_level password_authenticator::consistency_for_user(std::string_view role_name) {
    // TODO: the special case is questionable. Only the hardcoded default name gets
    // QUORUM, while e.g. a user-created superuser is still served at LOCAL_ONE.
    const bool is_hardcoded_default = (role_name == DEFAULT_USER_NAME);
    return is_hardcoded_default
            ? db::consistency_level::QUORUM
            : db::consistency_level::LOCAL_ONE;
}
std::string_view password_authenticator::qualified_java_name() const {
return password_authenticator_name;
}
@@ -183,12 +315,20 @@ future<authenticated_user> password_authenticator::authenticate(
const sstring password = credentials.at(PASSWORD_KEY);
try {
auto role = _cache.get(username);
if (!role || role->salted_hash.empty()) {
throw exceptions::authentication_exception("Username and/or password are incorrect");
std::optional<sstring> salted_hash;
if (legacy_mode(_qp)) {
salted_hash = co_await get_password_hash(username);
if (!salted_hash) {
throw exceptions::authentication_exception("Username and/or password are incorrect");
}
} else {
auto role = _cache.get(username);
if (!role || role->salted_hash.empty()) {
throw exceptions::authentication_exception("Username and/or password are incorrect");
}
salted_hash = role->salted_hash;
}
const auto& salted_hash = role->salted_hash;
const bool password_match = co_await passwords::check(password, salted_hash);
const bool password_match = co_await passwords::check(password, *salted_hash);
if (!password_match) {
throw exceptions::authentication_exception("Username and/or password are incorrect");
}
@@ -227,7 +367,16 @@ future<> password_authenticator::create(std::string_view role_name, const authen
}
const auto query = update_row_query();
co_await collect_mutations(_qp, mc, query, {std::move(*maybe_hash), sstring(role_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(
query,
consistency_for_user(role_name),
internal_distributed_query_state(),
{std::move(*maybe_hash), sstring(role_name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {std::move(*maybe_hash), sstring(role_name)});
}
}
future<> password_authenticator::alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) {
@@ -238,21 +387,38 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
const auto password = std::get<password_option>(*options.credentials).password;
const sstring query = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
SALTED_HASH,
meta::roles_table::role_col_name);
co_await collect_mutations(_qp, mc, query,
{passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(
query,
consistency_for_user(role_name),
internal_distributed_query_state(),
{passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
co_await collect_mutations(_qp, mc, query,
{passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)});
}
}
future<> password_authenticator::drop(std::string_view name, ::service::group0_batch& mc) {
const sstring query = seastar::format("DELETE {} FROM {}.{} WHERE {} = ?",
SALTED_HASH,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
meta::roles_table::role_col_name);
co_await collect_mutations(_qp, mc, query, {sstring(name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(
query, consistency_for_user(name),
internal_distributed_query_state(),
{sstring(name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {sstring(name)});
}
}
future<custom_options> password_authenticator::query_custom_options(std::string_view role_name) const {
@@ -271,13 +437,13 @@ future<std::optional<sstring>> password_authenticator::get_password_hash(std::st
// that a map lookup string->statement is not gonna kill us much.
const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ?",
SALTED_HASH,
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
meta::roles_table::role_col_name);
const auto res = co_await _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
consistency_for_user(role_name),
internal_distributed_query_state(),
{role_name},
cql3::query_processor::cache_internal::yes);

View File

@@ -13,6 +13,7 @@
#include <seastar/core/abort_source.hh>
#include <seastar/core/shared_future.hh>
#include "db/consistency_level_type.hh"
#include "auth/authenticator.hh"
#include "auth/passwords.hh"
#include "auth/cache.hh"
@@ -43,11 +44,15 @@ class password_authenticator : public authenticator {
cache& _cache;
future<> _stopped;
abort_source _as;
std::string _superuser; // default superuser name from the config (may or may not be present in roles table)
shared_promise<> _superuser_created_promise;
// We used to also support bcrypt, SHA-256, and MD5 (ref. scylladb#24524).
constexpr static auth::passwords::scheme _scheme = passwords::scheme::sha_512;
public:
static db::consistency_level consistency_for_user(std::string_view role_name);
static std::string default_superuser(const db::config&);
password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);
~password_authenticator();
@@ -85,6 +90,12 @@ public:
virtual future<> ensure_superuser_is_created() const override;
private:
bool legacy_metadata_exists() const;
future<> migrate_legacy_metadata() const;
future<> legacy_create_default_if_missing();
future<> maybe_create_default_password();
future<> maybe_create_default_password_with_retries();

View File

@@ -76,14 +76,11 @@ sstring generate_salt(RandomNumberEngine& g, scheme scheme) {
///
/// Hash a password combined with an implementation-specific salt string.
/// Deprecated in favor of `hash_with_salt_async`. This function is still used
/// when generating password hashes for storage to ensure that
/// `hash_with_salt` and `hash_with_salt_async` produce identical results,
/// preserving backward compatibility.
/// Deprecated in favor of `hash_with_salt_async`.
///
/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
///
sstring hash_with_salt(const sstring& pass, const sstring& salt);
[[deprecated("Use hash_with_salt_async instead")]] sstring hash_with_salt(const sstring& pass, const sstring& salt);
///
/// Async version of `hash_with_salt` that returns a future.

38
auth/permissions_cache.cc Normal file
View File

@@ -0,0 +1,38 @@
/*
* Copyright (C) 2017-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "auth/permissions_cache.hh"
#include <fmt/ranges.h>
#include "auth/authorizer.hh"
#include "auth/service.hh"
namespace auth {
// Builds the underlying loading cache from configuration `c`, with `log` as its logger.
// The loader handed to the cache emits a debug line naming the key's role and then
// asks `ser` for the uncached permissions of that (role, resource) key.
// NOTE(review): the loader captures `ser` and `log` by reference, so both presumably
// have to outlive this cache — confirm against the owner of this object.
permissions_cache::permissions_cache(const utils::loading_cache_config& c, service& ser, logging::logger& log)
: _cache(c, log, [&ser, &log](const key_type& k) {
log.debug("Refreshing permissions for {}", k.first);
return ser.get_uncached_permissions(k.first, k.second);
}) {
}
// Forwards the new configuration to the underlying loading cache and returns
// whatever that cache's update_config() reports.
bool permissions_cache::update_config(utils::loading_cache_config c) {
return _cache.update_config(std::move(c));
}
// Resets the underlying loading cache by delegating to its reset().
void permissions_cache::reset() {
_cache.reset();
}
// Looks up the permission set for the (role, resource) pair through the
// underlying loading cache. The key is built from copies of the arguments and
// handed to do_with(), so the lookup does not refer to the caller's objects.
future<permission_set> permissions_cache::get(const role_or_anonymous& maybe_role, const resource& r) {
    key_type lookup_key(maybe_role, r);
    return do_with(std::move(lookup_key), [this](const key_type& held_key) {
        return _cache.get(held_key);
    });
}
}

66
auth/permissions_cache.hh Normal file
View File

@@ -0,0 +1,66 @@
/*
* Copyright (C) 2017-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <iostream>
#include <utility>
#include <fmt/core.h>
#include <seastar/core/future.hh>
#include "auth/permission.hh"
#include "auth/resource.hh"
#include "auth/role_or_anonymous.hh"
#include "utils/log.hh"
#include "utils/hash.hh"
#include "utils/loading_cache.hh"
// Stream output for the permissions-cache key type, printed as
// "{role: <role>, resource: <resource>}".
// NOTE(review): this declares an overload inside namespace std, presumably so that
// name lookup finds it for std::pair; the C++ standard restricts what user code may
// add to namespace std — confirm this is intentional before relying on it.
namespace std {
inline std::ostream& operator<<(std::ostream& os, const pair<auth::role_or_anonymous, auth::resource>& p) {
fmt::print(os, "{{role: {}, resource: {}}}", p.first, p.second);
return os;
}
}
namespace db {
class config;
}
namespace auth {
class service;
// Cache of permission sets keyed by (role_or_anonymous, resource), implemented as a
// thin wrapper around utils::loading_cache with reloading enabled.
class permissions_cache final {
// Key: (role, resource) pair hashed with utils::tuple_hash; value: permission_set.
// NOTE(review): the meaning of the literal `1` template argument is defined by
// utils::loading_cache and is not visible here — confirm there.
using cache_type = utils::loading_cache<
std::pair<role_or_anonymous, resource>,
permission_set,
1,
utils::loading_cache_reload_enabled::yes,
utils::simple_entry_size<permission_set>,
utils::tuple_hash>;
using key_type = typename cache_type::key_type;
cache_type _cache;
public:
// Creates the cache from the given configuration; the service and the logger are
// passed on to the constructor defined out of line.
explicit permissions_cache(const utils::loading_cache_config&, service&, logging::logger&);
// Stops the underlying loading cache; the returned future is the one produced by
// the cache's stop().
future <> stop() {
return _cache.stop();
}
// Applies a new loading-cache configuration.
bool update_config(utils::loading_cache_config);
// Resets the underlying cache.
void reset();
// Returns the permission set cached for the given role and resource.
future<permission_set> get(const role_or_anonymous&, const resource&);
};
}

View File

@@ -112,11 +112,6 @@ public:
virtual future<> stop() = 0;
///
/// Notify that the maintenance mode is starting.
///
virtual void set_maintenance_mode() {}
///
/// Ensure that superuser role exists.
///
@@ -124,11 +119,6 @@ public:
///
virtual future<> ensure_superuser_is_created() = 0;
///
/// Ensure role management operations are enabled. Some role managers may defer initialization.
///
virtual future<> ensure_role_operations_are_enabled() { return make_ready_future<>(); }
///
/// \returns an exceptional future with \ref role_already_exists for a role that has previously been created.
///

68
auth/roles-metadata.cc Normal file
View File

@@ -0,0 +1,68 @@
/*
* Copyright (C) 2018-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "auth/roles-metadata.hh"
#include <seastar/core/format.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/sstring.hh>
#include "auth/common.hh"
#include "cql3/query_processor.hh"
#include "cql3/untyped_result_set.hh"
namespace auth {
namespace legacy {
// Evaluates predicate `p` on the roles-table row of the default role in the legacy
// auth keyspace.
//
// The role looked up is `rolename` when provided, otherwise
// meta::DEFAULT_SUPERUSER_NAME. The row is queried first at consistency ONE and
// then at QUORUM; the first non-empty result decides the outcome.
// Resolves to `false` when neither query returns a row.
future<bool> default_role_row_satisfies(
        cql3::query_processor& qp,
        std::function<bool(const cql3::untyped_result_set_row&)> p,
        std::optional<std::string> rolename) {
    const std::string target_role = rolename.value_or(std::string(auth::meta::DEFAULT_SUPERUSER_NAME));
    const sstring select_stmt = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
            auth::meta::legacy::AUTH_KS,
            meta::roles_table::name,
            meta::roles_table::role_col_name);

    for (auto level : { db::consistency_level::ONE, db::consistency_level::QUORUM }) {
        auto rows = co_await qp.execute_internal(
                select_stmt,
                level,
                internal_distributed_query_state(),
                {target_role},
                cql3::query_processor::cache_internal::yes);
        if (rows->empty()) {
            // Nothing found at this consistency level; try the next one.
            continue;
        }
        co_return p(rows->one());
    }

    co_return false;
}
// Checks whether predicate `p` holds for at least one roles-table row in the legacy
// auth keyspace whose role differs from the default role.
//
// The default role is `rolename` when provided, otherwise
// meta::DEFAULT_SUPERUSER_NAME. All rows are read at QUORUM.
// Resolves to `false` when the table is empty or no non-default row satisfies `p`.
future<bool> any_nondefault_role_row_satisfies(
        cql3::query_processor& qp,
        std::function<bool(const cql3::untyped_result_set_row&)> p,
        std::optional<std::string> rolename) {
    const sstring select_all = seastar::format("SELECT * FROM {}.{}", auth::meta::legacy::AUTH_KS, meta::roles_table::name);

    auto rows = co_await qp.execute_internal(
            select_all,
            db::consistency_level::QUORUM,
            internal_distributed_query_state(),
            cql3::query_processor::cache_internal::no);

    if (!rows->empty()) {
        static const sstring col_name = sstring(meta::roles_table::role_col_name);
        // The role to exclude is the same for every row, so resolve it once.
        const std::string_view default_role = rolename ? std::string_view(*rolename) : meta::DEFAULT_SUPERUSER_NAME;

        co_return std::ranges::any_of(*rows, [&](const cql3::untyped_result_set_row& row) {
            if (row.get_as<sstring>(col_name) == default_role) {
                return false;
            }
            return p(row);
        });
    }

    co_return false;
}
} // namespace legacy
} // namespace auth

View File

@@ -8,7 +8,18 @@
#pragma once
#include <optional>
#include <string_view>
#include <functional>
#include <seastar/core/future.hh>
#include "seastarx.hh"
namespace cql3 {
class query_processor;
class untyped_result_set_row;
}
namespace auth {
@@ -24,4 +35,26 @@ constexpr std::string_view role_col_name{"role", 4};
} // namespace meta
namespace legacy {
///
/// Check that the default role satisfies a predicate, or `false` if the default role does not exist.
///
future<bool> default_role_row_satisfies(
cql3::query_processor&,
std::function<bool(const cql3::untyped_result_set_row&)>,
std::optional<std::string> rolename = {}
);
///
/// Check that any nondefault role satisfies a predicate. `false` if no nondefault roles exist.
///
future<bool> any_nondefault_role_row_satisfies(
cql3::query_processor&,
std::function<bool(const cql3::untyped_result_set_row&)>,
std::optional<std::string> rolename = {}
);
} // namespace legacy
} // namespace auth

View File

@@ -22,11 +22,21 @@
#include "db/config.hh"
#include "utils/log.hh"
#include "seastarx.hh"
#include "utils/class_registrator.hh"
namespace auth {
static logging::logger mylog("saslauthd_authenticator");
// To ensure correct initialization order, we unfortunately need to use a string literal.
static const class_registrator<
authenticator,
saslauthd_authenticator,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_manager&,
cache&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");
saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, cache&)
: _socket_path(qp.db().get_config().saslauthd_socket_path())
{}

View File

@@ -16,8 +16,6 @@
#include <algorithm>
#include <chrono>
#include <boost/algorithm/string.hpp>
#include <seastar/core/future-util.hh>
#include <seastar/core/shard_id.hh>
#include <seastar/core/sharded.hh>
@@ -25,18 +23,8 @@
#include "auth/allow_all_authenticator.hh"
#include "auth/allow_all_authorizer.hh"
#include "auth/certificate_authenticator.hh"
#include "auth/common.hh"
#include "auth/default_authorizer.hh"
#include "auth/ldap_role_manager.hh"
#include "auth/maintenance_socket_authenticator.hh"
#include "auth/maintenance_socket_authorizer.hh"
#include "auth/maintenance_socket_role_manager.hh"
#include "auth/password_authenticator.hh"
#include "auth/role_or_anonymous.hh"
#include "auth/saslauthd_authenticator.hh"
#include "auth/standard_role_manager.hh"
#include "auth/transitional.hh"
#include "cql3/functions/functions.hh"
#include "cql3/query_processor.hh"
#include "cql3/description.hh"
@@ -55,6 +43,7 @@
#include "service/raft/raft_group0_client.hh"
#include "mutation/timestamp.hh"
#include "utils/assert.hh"
#include "utils/class_registrator.hh"
#include "locator/abstract_replication_strategy.hh"
#include "data_dictionary/keyspace_metadata.hh"
#include "service/storage_service.hh"
@@ -74,6 +63,91 @@ static const sstring superuser_col_name("super");
static logging::logger log("auth_service");
// Schema-change listener that keeps authorization data in step with dropped schema
// objects when auth runs in legacy mode.
//
// Create and update notifications are ignored. When a keyspace, table, function or
// aggregate is dropped and legacy_mode(_qp) is true, all permissions on the matching
// resource are revoked through the authorizer. The revocation is started without
// being awaited; failures are only logged. Outside legacy mode the drop handlers
// return immediately.
class auth_migration_listener final : public ::service::migration_listener {
authorizer& _authorizer;
cql3::query_processor& _qp;
public:
// Stores references to the authorizer used for revocation and to the query
// processor consulted by legacy_mode().
explicit auth_migration_listener(authorizer& a, cql3::query_processor& qp) : _authorizer(a), _qp(qp) {
}
private:
// Creation and update events need no action from auth.
void on_create_keyspace(const sstring& ks_name) override {}
void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {}
void on_create_user_type(const sstring& ks_name, const sstring& type_name) override {}
void on_create_function(const sstring& ks_name, const sstring& function_name) override {}
void on_create_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
void on_create_view(const sstring& ks_name, const sstring& view_name) override {}
void on_update_keyspace(const sstring& ks_name) override {}
void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool) override {}
void on_update_user_type(const sstring& ks_name, const sstring& type_name) override {}
void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}
// Legacy mode only: revokes all permissions on the dropped keyspace's data
// resource and on its functions resource, as two independent background operations.
// NOTE(review): the lambdas capture `ks_name` by reference; this presumably relies
// on the name being consumed before this callback returns — confirm.
void on_drop_keyspace(const sstring& ks_name) override {
if (!legacy_mode(_qp)) {
// in non legacy path revoke is part of schema change statement execution
return;
}
// Do it in the background.
(void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
return _authorizer.revoke_all(auth::make_data_resource(ks_name), mc);
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on dropped keyspace: {}", e);
});
(void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
return _authorizer.revoke_all(auth::make_functions_resource(ks_name), mc);
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on functions in dropped keyspace: {}", e);
});
}
// Legacy mode only: revokes all permissions on the dropped table's data resource.
void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
if (!legacy_mode(_qp)) {
// in non legacy path revoke is part of schema change statement execution
return;
}
// Do it in the background.
(void)do_with(::service::group0_batch::unused(), [this, &ks_name, &cf_name] (auto& mc) mutable {
return _authorizer.revoke_all(
auth::make_data_resource(ks_name, cf_name), mc);
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on dropped table: {}", e);
});
}
void on_drop_user_type(const sstring& ks_name, const sstring& type_name) override {}
// Legacy mode only: revokes all permissions on the dropped function's resource.
void on_drop_function(const sstring& ks_name, const sstring& function_name) override {
if (!legacy_mode(_qp)) {
// in non legacy path revoke is part of schema change statement execution
return;
}
// Do it in the background.
(void)do_with(::service::group0_batch::unused(), [this, &ks_name, &function_name] (auto& mc) mutable {
return _authorizer.revoke_all(
auth::make_functions_resource(ks_name, function_name), mc);
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on dropped function: {}", e);
});
}
// Legacy mode only: aggregates use the functions resource, so the revocation
// mirrors on_drop_function().
void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {
if (!legacy_mode(_qp)) {
// in non legacy path revoke is part of schema change statement execution
return;
}
(void)do_with(::service::group0_batch::unused(), [this, &ks_name, &aggregate_name] (auto& mc) mutable {
return _authorizer.revoke_all(
auth::make_functions_resource(ks_name, aggregate_name), mc);
}).handle_exception([] (std::exception_ptr e) {
log.error("Unexpected exception while revoking all permissions on dropped aggregate: {}", e);
});
}
void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}
};
static future<> validate_role_exists(const service& ser, std::string_view role_name) {
return ser.underlying_role_manager().exists(role_name).then([role_name](bool exists) {
if (!exists) {
@@ -83,36 +157,50 @@ static future<> validate_role_exists(const service& ser, std::string_view role_n
}
service::service(
utils::loading_cache_config c,
cache& cache,
cql3::query_processor& qp,
::service::raft_group0_client& g0,
::service::migration_notifier& mn,
std::unique_ptr<authorizer> z,
std::unique_ptr<authenticator> a,
std::unique_ptr<role_manager> r,
maintenance_socket_enabled used_by_maintenance_socket)
: _cache(cache)
: _loading_cache_config(std::move(c))
, _permissions_cache(nullptr)
, _cache(cache)
, _qp(qp)
, _group0_client(g0)
, _mnotifier(mn)
, _authorizer(std::move(z))
, _authenticator(std::move(a))
, _role_manager(std::move(r))
, _migration_listener(std::make_unique<auth_migration_listener>(*_authorizer, qp))
, _permissions_cache_cfg_cb([this] (uint32_t) { (void) _permissions_cache_config_action.trigger_later(); })
, _permissions_cache_config_action([this] { update_cache_config(); return make_ready_future<>(); })
, _permissions_cache_max_entries_observer(_qp.db().get_config().permissions_cache_max_entries.observe(_permissions_cache_cfg_cb))
, _permissions_cache_update_interval_in_ms_observer(_qp.db().get_config().permissions_update_interval_in_ms.observe(_permissions_cache_cfg_cb))
, _permissions_cache_validity_in_ms_observer(_qp.db().get_config().permissions_validity_in_ms.observe(_permissions_cache_cfg_cb))
, _used_by_maintenance_socket(used_by_maintenance_socket) {}
service::service(
utils::loading_cache_config c,
cql3::query_processor& qp,
::service::raft_group0_client& g0,
authorizer_factory authorizer_factory,
authenticator_factory authenticator_factory,
role_manager_factory role_manager_factory,
::service::migration_notifier& mn,
::service::migration_manager& mm,
const service_config& sc,
maintenance_socket_enabled used_by_maintenance_socket,
cache& cache)
: service(
std::move(c),
cache,
qp,
g0,
authorizer_factory(),
authenticator_factory(),
role_manager_factory(),
mn,
create_object<authorizer>(sc.authorizer_java_name, qp, g0, mm),
create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, cache),
create_object<role_manager>(sc.role_manager_java_name, qp, g0, mm, cache),
used_by_maintenance_socket) {
}
@@ -145,6 +233,9 @@ future<> service::create_legacy_keyspace_if_missing(::service::migration_manager
}
future<> service::start(::service::migration_manager& mm, db::system_keyspace& sys_ks) {
auto auth_version = co_await sys_ks.get_auth_version();
// version is set in query processor to be easily available in various places we call auth::legacy_mode check.
_qp.auth_version = auth_version;
if (this_shard_id() == 0) {
co_await _cache.load_all();
}
@@ -166,20 +257,25 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
co_await _role_manager->ensure_superuser_is_created();
}
co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
if (!_used_by_maintenance_socket) {
// Maintenance socket mode can't cache permissions because it has
// different authorizer. We can't mix cached permissions, they could be
// different in normal mode.
_cache.set_permission_loader(std::bind(
&service::get_uncached_permissions,
this, std::placeholders::_1, std::placeholders::_2));
}
_permissions_cache = std::make_unique<permissions_cache>(_loading_cache_config, *this, log);
co_await once_among_shards([this] {
_mnotifier.register_listener(_migration_listener.get());
return make_ready_future<>();
});
}
future<> service::stop() {
_as.request_abort();
_cache.set_permission_loader(nullptr);
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
// Only one of the shards has the listener registered, but let's try to
// unregister on each one just to make sure.
return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
if (_permissions_cache) {
return _permissions_cache->stop();
}
return make_ready_future<>();
}).then([this] {
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
});
}
future<> service::ensure_superuser_is_created() {
@@ -187,8 +283,21 @@ future<> service::ensure_superuser_is_created() {
co_await _authenticator->ensure_superuser_is_created();
}
/// Rebuilds the permissions-cache settings from the live database
/// configuration and applies them to the permissions cache.
///
/// Reads `permissions_cache_max_entries`, `permissions_validity_in_ms` and
/// `permissions_update_interval_in_ms` and hands them to
/// `permissions_cache::update_config()`; a rejected configuration is logged
/// as an error and otherwise ignored.
///
/// The cache object starts out as a null pointer and is only created in
/// start(), whereas the configuration observers that trigger this function
/// are registered at construction. The update is therefore skipped while no
/// cache exists, mirroring the null check performed in stop().
void service::update_cache_config() {
    if (!_permissions_cache) {
        // Nothing to reconfigure yet (or any more).
        return;
    }

    auto db = _qp.db();

    utils::loading_cache_config perm_cache_config;
    perm_cache_config.max_size = db.get_config().permissions_cache_max_entries();
    perm_cache_config.expiry = std::chrono::milliseconds(db.get_config().permissions_validity_in_ms());
    perm_cache_config.refresh = std::chrono::milliseconds(db.get_config().permissions_update_interval_in_ms());

    if (!_permissions_cache->update_config(std::move(perm_cache_config))) {
        log.error("Failed to apply permissions cache changes. Please read the documentation of these parameters");
    }
}
/// Drops cached authorization state: resets the permissions cache (when one
/// exists) and then resets the query processor's cache.
///
/// The permissions cache pointer is null until start() creates it, so it is
/// checked before use; the query processor cache is reset unconditionally.
void service::reset_authorization_cache() {
    if (_permissions_cache) {
        _permissions_cache->reset();
    }
    _qp.reset_cache();
}
@@ -213,14 +322,7 @@ service::get_uncached_permissions(const role_or_anonymous& maybe_role, const res
}
future<permission_set> service::get_permissions(const role_or_anonymous& maybe_role, const resource& r) const {
if (_used_by_maintenance_socket) {
return get_uncached_permissions(maybe_role, r);
}
return _cache.get_permissions(maybe_role, r);
}
void service::set_maintenance_mode() {
_role_manager->set_maintenance_mode();
return _permissions_cache->get(maybe_role, r);
}
future<bool> service::has_superuser(std::string_view role_name, const role_set& roles) const {
@@ -258,10 +360,6 @@ static void validate_authentication_options_are_supported(
}
}
// Forwards to the configured role manager's
// ensure_role_operations_are_enabled() and returns its future unchanged.
future<> service::ensure_role_operations_are_enabled() {
return _role_manager->ensure_role_operations_are_enabled();
}
future<> service::create_role(std::string_view name,
const role_config& config,
const authentication_options& options,
@@ -279,6 +377,11 @@ future<> service::create_role(std::string_view name,
ep = std::current_exception();
}
if (ep) {
// Rollback only in legacy mode as normally mutations won't be
// applied in case exception is raised
if (legacy_mode(_qp)) {
co_await underlying_role_manager().drop(name, mc);
}
std::rethrow_exception(std::move(ep));
}
}
@@ -344,11 +447,6 @@ future<bool> service::exists(const resource& r) const {
return make_ready_future<bool>(false);
}
// Revokes every permission granted on resource `r`.
//
// First asks the authorizer to revoke all grants for `r`, passing the group0
// batch `mc` through to it, and only after that completes prunes `r` from the
// auth cache.
future<> service::revoke_all(const resource& r, ::service::group0_batch& mc) const {
co_await _authorizer->revoke_all(r, mc);
// Runs only once the authorizer call above has completed without throwing.
co_await _cache.prune(r);
}
future<std::vector<cql3::description>> service::describe_roles(bool with_hashed_passwords) {
std::vector<cql3::description> result{};
@@ -357,11 +455,11 @@ future<std::vector<cql3::description>> service::describe_roles(bool with_hashed_
const bool authenticator_uses_password_hashes = _authenticator->uses_password_hashes();
const auto default_su = cql3::util::maybe_quote(default_superuser(_qp));
auto produce_create_statement = [&default_su, with_hashed_passwords] (const sstring& formatted_role_name,
auto produce_create_statement = [with_hashed_passwords] (const sstring& formatted_role_name,
const std::optional<sstring>& maybe_hashed_password, bool can_login, bool is_superuser) {
const sstring role_part = formatted_role_name == default_su
// Even after applying formatting to a role, `formatted_role_name` can only equal `meta::DEFAULT_SUPERUSER_NAME`
// if the original identifier was equal to it.
const sstring role_part = formatted_role_name == meta::DEFAULT_SUPERUSER_NAME
? seastar::format("IF NOT EXISTS {}", formatted_role_name)
: formatted_role_name;
@@ -574,10 +672,6 @@ future<std::vector<cql3::description>> service::describe_auth(bool with_hashed_p
// Free functions.
//
// Free-function convenience wrapper: calls service::set_maintenance_mode()
// on `ser`.
void set_maintenance_mode(service& ser) {
ser.set_maintenance_mode();
}
future<bool> has_superuser(const service& ser, const authenticated_user& u) {
if (is_anonymous(u)) {
return make_ready_future<bool>(false);
@@ -586,10 +680,6 @@ future<bool> has_superuser(const service& ser, const authenticated_user& u) {
return ser.has_superuser(*u.name);
}
// Free-function convenience wrapper: asks the role manager underlying `ser`
// to ensure role operations are enabled and returns the resulting future.
// Note that this goes through underlying_role_manager() directly rather than
// through a member function of `service`.
future<> ensure_role_operations_are_enabled(service& ser) {
return ser.underlying_role_manager().ensure_role_operations_are_enabled();
}
future<role_set> get_roles(const service& ser, const authenticated_user& u) {
if (is_anonymous(u)) {
return make_ready_future<role_set>();
@@ -711,7 +801,7 @@ future<> revoke_permissions(
}
future<> revoke_all(const service& ser, const resource& r, ::service::group0_batch& mc) {
return ser.revoke_all(r, mc);
return ser.underlying_authorizer().revoke_all(r, mc);
}
future<std::vector<permission_details>> list_filtered_permissions(
@@ -772,115 +862,78 @@ future<> commit_mutations(service& ser, ::service::group0_batch&& mc) {
return ser.commit_mutations(std::move(mc));
}
namespace {
future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_client& g0, start_operation_func_t start_operation_func, abort_source& as) {
// FIXME: if this function fails it may leave partial data in the new tables
// that should be cleared
auto gen = [&sys_ks] (api::timestamp_type ts) -> ::service::mutations_generator {
auto& qp = sys_ks.query_processor();
for (const auto& cf_name : std::vector<sstring>{
"roles", "role_members", "role_attributes", "role_permissions"}) {
schema_ptr schema;
try {
schema = qp.db().find_schema(meta::legacy::AUTH_KS, cf_name);
} catch (const data_dictionary::no_such_column_family&) {
continue; // some tables might not have been created if they were not used
}
/// Strips any dot-separated qualifier from `name`, returning only the part
/// that follows the last '.'. A name without any '.' is returned unchanged;
/// a name ending in '.' yields an empty view.
///
/// The result is a view into the same character data as `name`.
std::string_view get_short_name(std::string_view name) {
    const auto last_dot = name.rfind('.');
    return last_dot == std::string_view::npos ? name : name.substr(last_dot + 1);
}
// use longer than usual timeout as we scan the whole table
// but not infinite or very long as we want to fail reasonably fast
const auto t = 5min;
const timeout_config tc{t, t, t, t, t, t, t};
::service::client_state cs(::service::client_state::internal_tag{}, tc);
::service::query_state qs(cs, empty_service_permit());
} // anonymous namespace
/// Selects an authorizer implementation by class name and returns a factory
/// that constructs it.
///
/// Only the part of `name` after its last '.' is considered, and the match is
/// case-insensitive. The returned factory keeps a reference to `qp` and calls
/// `qp.local()` each time it is invoked.
///
/// \throws std::invalid_argument if the name matches no known authorizer.
authorizer_factory make_authorizer_factory(
        std::string_view name,
        sharded<cql3::query_processor>& qp) {
    const std::string_view unqualified = get_short_name(name);
    const auto is_named = [unqualified] (std::string_view candidate) {
        return boost::iequals(unqualified, candidate);
    };

    if (is_named("AllowAllAuthorizer")) {
        return [&qp] {
            return std::make_unique<allow_all_authorizer>(qp.local());
        };
    }
    if (is_named("CassandraAuthorizer")) {
        return [&qp] {
            return std::make_unique<default_authorizer>(qp.local());
        };
    }
    if (is_named("TransitionalAuthorizer")) {
        return [&qp] {
            return std::make_unique<transitional_authorizer>(qp.local());
        };
    }

    throw std::invalid_argument(fmt::format("Unknown authorizer: {}", name));
}
/// Selects an authenticator implementation by class name and returns a
/// factory that constructs it.
///
/// Only the part of `name` after its last '.' is considered, and the match is
/// case-insensitive. The returned factory keeps references to all of its
/// arguments and resolves the sharded ones with `.local()` each time it is
/// invoked.
///
/// \throws std::invalid_argument if the name matches no known authenticator.
authenticator_factory make_authenticator_factory(
        std::string_view name,
        sharded<cql3::query_processor>& qp,
        ::service::raft_group0_client& g0,
        sharded<::service::migration_manager>& mm,
        sharded<cache>& auth_cache) {
    const std::string_view unqualified = get_short_name(name);
    const auto is_named = [unqualified] (std::string_view candidate) {
        return boost::iequals(unqualified, candidate);
    };

    if (is_named("AllowAllAuthenticator")) {
        return [&qp, &g0, &mm, &auth_cache] {
            return std::make_unique<allow_all_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
        };
    }
    if (is_named("PasswordAuthenticator")) {
        return [&qp, &g0, &mm, &auth_cache] {
            return std::make_unique<password_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
        };
    }
    if (is_named("CertificateAuthenticator")) {
        return [&qp, &g0, &mm, &auth_cache] {
            return std::make_unique<certificate_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
        };
    }
    if (is_named("SaslauthdAuthenticator")) {
        return [&qp, &g0, &mm, &auth_cache] {
            return std::make_unique<saslauthd_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
        };
    }
    if (is_named("TransitionalAuthenticator")) {
        return [&qp, &g0, &mm, &auth_cache] {
            return std::make_unique<transitional_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
        };
    }

    throw std::invalid_argument(fmt::format("Unknown authenticator: {}", name));
}
/// Selects a role-manager implementation by class name and returns a factory
/// that constructs it.
///
/// Only the part of `name` after its last '.' is considered, and the match is
/// case-insensitive. The returned factory keeps references to all of its
/// arguments and resolves the sharded ones with `.local()` each time it is
/// invoked.
///
/// \throws std::invalid_argument if the name matches no known role manager.
role_manager_factory make_role_manager_factory(
        std::string_view name,
        sharded<cql3::query_processor>& qp,
        ::service::raft_group0_client& g0,
        sharded<::service::migration_manager>& mm,
        sharded<cache>& auth_cache) {
    const std::string_view unqualified = get_short_name(name);
    const auto is_named = [unqualified] (std::string_view candidate) {
        return boost::iequals(unqualified, candidate);
    };

    if (is_named("CassandraRoleManager")) {
        return [&qp, &g0, &mm, &auth_cache] {
            return std::make_unique<standard_role_manager>(qp.local(), g0, mm.local(), auth_cache.local());
        };
    }
    if (is_named("LDAPRoleManager")) {
        return [&qp, &g0, &mm, &auth_cache] {
            return std::make_unique<ldap_role_manager>(qp.local(), g0, mm.local(), auth_cache.local());
        };
    }

    throw std::invalid_argument(fmt::format("Unknown role manager: {}", name));
}
// Returns a factory that builds a maintenance_socket_authenticator.
//
// The factory captures all four arguments by reference, so they must remain
// valid for as long as the factory may be invoked. The sharded arguments are
// resolved with .local() at invocation time, i.e. on whichever shard runs the
// factory.
authenticator_factory make_maintenance_socket_authenticator_factory(
sharded<cql3::query_processor>& qp,
::service::raft_group0_client& g0,
sharded<::service::migration_manager>& mm,
sharded<cache>& auth_cache) {
return [&qp, &g0, &mm, &auth_cache] {
return std::make_unique<maintenance_socket_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
};
}
// Returns a factory that builds a maintenance_socket_authorizer.
//
// The factory captures `qp` by reference, so it must remain valid for as long
// as the factory may be invoked; qp.local() is resolved at invocation time.
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp) {
return [&qp] {
return std::make_unique<maintenance_socket_authorizer>(qp.local());
};
}
role_manager_factory make_maintenance_socket_role_manager_factory(
sharded<cql3::query_processor>& qp,
::service::raft_group0_client& g0,
sharded<::service::migration_manager>& mm,
sharded<cache>& auth_cache) {
return [&qp, &g0, &mm, &auth_cache] {
return std::make_unique<maintenance_socket_role_manager>(qp.local(), g0, mm.local(), auth_cache.local());
auto rows = co_await qp.execute_internal(
seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, cf_name),
db::consistency_level::ALL,
qs,
{},
cql3::query_processor::cache_internal::no);
if (rows->empty()) {
continue;
}
std::vector<sstring> col_names;
for (const auto& col : schema->all_columns()) {
col_names.push_back(col.name_as_cql_string());
}
sstring val_binders_str = "?";
for (size_t i = 1; i < col_names.size(); ++i) {
val_binders_str += ", ?";
}
for (const auto& row : *rows) {
std::vector<data_value_or_unset> values;
for (const auto& col : schema->all_columns()) {
if (row.has(col.name_as_text())) {
values.push_back(
col.type->deserialize(row.get_blob_unfragmented(col.name_as_text())));
} else {
values.push_back(unset_value{});
}
}
auto muts = co_await qp.get_mutations_internal(
seastar::format("INSERT INTO {}.{} ({}) VALUES ({})",
db::system_keyspace::NAME,
cf_name,
fmt::join(col_names, ", "),
val_binders_str),
internal_distributed_query_state(),
ts,
std::move(values));
if (muts.size() != 1) {
on_internal_error(log,
format("expecting single insert mutation, got {}", muts.size()));
}
co_yield std::move(muts[0]);
}
}
co_yield co_await sys_ks.make_auth_version_mutation(ts,
db::system_keyspace::auth_version_t::v2);
};
co_await announce_mutations_with_batching(g0,
start_operation_func,
std::move(gen),
as,
std::nullopt);
}
}

View File

@@ -12,7 +12,6 @@
#include <memory>
#include <optional>
#include <seastar/core/coroutine.hh>
#include <seastar/core/future.hh>
#include <seastar/core/sstring.hh>
#include <seastar/util/bool_class.hh>
@@ -21,6 +20,7 @@
#include "auth/authenticator.hh"
#include "auth/authorizer.hh"
#include "auth/permission.hh"
#include "auth/permissions_cache.hh"
#include "auth/cache.hh"
#include "auth/role_manager.hh"
#include "auth/common.hh"
@@ -37,16 +37,19 @@ class query_processor;
namespace service {
class migration_manager;
class migration_notifier;
class migration_listener;
}
namespace auth {
class role_or_anonymous;
/// Factory function types for creating auth module instances on each shard.
using authorizer_factory = std::function<std::unique_ptr<authorizer>()>;
using authenticator_factory = std::function<std::unique_ptr<authenticator>()>;
using role_manager_factory = std::function<std::unique_ptr<role_manager>()>;
/// Names of the auth module implementations a `service` should instantiate.
///
/// Each field is the class name used to look up and create the corresponding
/// module (authorizer, authenticator, role manager) when the service is
/// constructed from a configuration.
struct service_config final {
// Name identifying the authorizer implementation.
sstring authorizer_java_name;
// Name identifying the authenticator implementation.
sstring authenticator_java_name;
// Name identifying the role-manager implementation.
sstring role_manager_java_name;
};
///
/// Due to poor (in this author's opinion) decisions of Apache Cassandra, certain choices of one role-manager,
@@ -72,27 +75,43 @@ public:
/// peering_sharded_service inheritance is needed to be able to access shard local authentication service
/// given an object from another shard. Used for bouncing lwt requests to correct shard.
class service final : public seastar::peering_sharded_service<service> {
utils::loading_cache_config _loading_cache_config;
std::unique_ptr<permissions_cache> _permissions_cache;
cache& _cache;
cql3::query_processor& _qp;
::service::raft_group0_client& _group0_client;
::service::migration_notifier& _mnotifier;
authorizer::ptr_type _authorizer;
authenticator::ptr_type _authenticator;
role_manager::ptr_type _role_manager;
// Only one of these should be registered, so we end up with some unused instances. Not the end of the world.
std::unique_ptr<::service::migration_listener> _migration_listener;
std::function<void(uint32_t)> _permissions_cache_cfg_cb;
serialized_action _permissions_cache_config_action;
utils::observer<uint32_t> _permissions_cache_max_entries_observer;
utils::observer<uint32_t> _permissions_cache_update_interval_in_ms_observer;
utils::observer<uint32_t> _permissions_cache_validity_in_ms_observer;
maintenance_socket_enabled _used_by_maintenance_socket;
abort_source _as;
public:
service(
utils::loading_cache_config,
cache& cache,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_notifier&,
std::unique_ptr<authorizer>,
std::unique_ptr<authenticator>,
std::unique_ptr<role_manager>,
@@ -100,15 +119,16 @@ public:
///
/// This constructor is intended to be used when the class is sharded via \ref seastar::sharded. In that case, the
/// arguments must be copyable, which is why we delay construction with instance-construction factories instead
/// arguments must be copyable, which is why we delay construction with instance-construction instructions instead
/// of the instances themselves.
///
service(
utils::loading_cache_config,
cql3::query_processor&,
::service::raft_group0_client&,
authorizer_factory,
authenticator_factory,
role_manager_factory,
::service::migration_notifier&,
::service::migration_manager&,
const service_config&,
maintenance_socket_enabled,
cache&);
@@ -118,6 +138,8 @@ public:
future<> ensure_superuser_is_created();
void update_cache_config();
void reset_authorization_cache();
///
@@ -130,11 +152,6 @@ public:
///
future<permission_set> get_uncached_permissions(const role_or_anonymous&, const resource&) const;
///
/// Notify the service that the node is entering maintenance mode.
///
void set_maintenance_mode();
///
/// Query whether the named role has been granted a role that is a superuser.
///
@@ -144,11 +161,6 @@ public:
///
future<bool> has_superuser(std::string_view role_name) const;
///
/// Ensure that the role operations are enabled. Some role managers defer initialization.
///
future<> ensure_role_operations_are_enabled();
///
/// Create a role with optional authentication information.
///
@@ -169,13 +181,6 @@ public:
future<bool> exists(const resource&) const;
///
/// Revoke all permissions granted to any role for a particular resource.
///
/// \throws \ref unsupported_authorization_operation if revoking permissions is not supported.
///
future<> revoke_all(const resource&, ::service::group0_batch&) const;
///
/// Produces descriptions that can be used to restore the state of auth. That encompasses
/// roles, role grants, and permission grants.
@@ -194,9 +199,12 @@ public:
return *_role_manager;
}
cql3::query_processor& query_processor() const noexcept {
return _qp;
}
future<> commit_mutations(::service::group0_batch&& mc) {
co_await std::move(mc).commit(_group0_client, _as, ::service::raft_timeout{});
co_await _group0_client.send_group0_read_barrier_to_live_members();
return std::move(mc).commit(_group0_client, _as, ::service::raft_timeout{});
}
private:
@@ -207,12 +215,8 @@ private:
future<std::vector<cql3::description>> describe_permissions() const;
};
void set_maintenance_mode(service&);
future<bool> has_superuser(const service&, const authenticated_user&);
future<> ensure_role_operations_are_enabled(service&);
future<role_set> get_roles(const service&, const authenticated_user&);
future<permission_set> get_permissions(const service&, const authenticated_user&, const resource&);
@@ -396,55 +400,7 @@ future<std::vector<permission_details>> list_filtered_permissions(
// Finalizes write operations performed in auth by committing mutations via raft group0.
future<> commit_mutations(service& ser, ::service::group0_batch&& mc);
///
/// Factory helper functions for creating auth module instances.
/// These are intended for use with sharded<service>::start() where copyable arguments are required.
/// The returned factories capture the sharded references and call .local() when invoked on each shard.
///
/// Creates an authorizer factory for config-selectable authorizer types.
/// @param name The authorizer class name (e.g., "CassandraAuthorizer", "AllowAllAuthorizer")
authorizer_factory make_authorizer_factory(
std::string_view name,
sharded<cql3::query_processor>& qp);
/// Creates an authenticator factory for config-selectable authenticator types.
/// @param name The authenticator class name (e.g., "PasswordAuthenticator", "AllowAllAuthenticator")
authenticator_factory make_authenticator_factory(
std::string_view name,
sharded<cql3::query_processor>& qp,
::service::raft_group0_client& g0,
sharded<::service::migration_manager>& mm,
sharded<cache>& cache);
/// Creates a role_manager factory for config-selectable role manager types.
/// @param name The role manager class name (e.g., "CassandraRoleManager")
role_manager_factory make_role_manager_factory(
std::string_view name,
sharded<cql3::query_processor>& qp,
::service::raft_group0_client& g0,
sharded<::service::migration_manager>& mm,
sharded<cache>& cache);
/// Creates a factory for the maintenance socket authenticator.
/// This authenticator is not config-selectable and is only used for the maintenance socket.
authenticator_factory make_maintenance_socket_authenticator_factory(
sharded<cql3::query_processor>& qp,
::service::raft_group0_client& g0,
sharded<::service::migration_manager>& mm,
sharded<cache>& cache);
/// Creates a factory for the maintenance socket authorizer.
/// This authorizer is not config-selectable and is only used for the maintenance socket.
/// It grants all permissions unconditionally while delegating grant/revoke to the default authorizer.
authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp);
/// Creates a factory for the maintenance socket role manager.
/// This role manager is not config-selectable and is only used for the maintenance socket.
role_manager_factory make_maintenance_socket_role_manager_factory(
sharded<cql3::query_processor>& qp,
::service::raft_group0_client& g0,
sharded<::service::migration_manager>& mm,
sharded<cache>& cache);
// Migrates data from old keyspace to new one which supports linearizable writes via raft.
future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_client& g0, start_operation_func_t start_operation_func, abort_source& as);
}

View File

@@ -28,14 +28,15 @@
#include "cql3/untyped_result_set.hh"
#include "cql3/util.hh"
#include "db/consistency_level_type.hh"
#include "db/system_keyspace.hh"
#include "exceptions/exceptions.hh"
#include "utils/error_injection.hh"
#include "utils/log.hh"
#include <seastar/core/loop.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include "service/raft/raft_group0_client.hh"
#include "utils/class_registrator.hh"
#include "service/migration_manager.hh"
#include "password_authenticator.hh"
#include "utils/managed_string.hh"
namespace auth {
@@ -43,21 +44,57 @@ namespace auth {
static logging::logger log("standard_role_manager");
future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
auto role = _cache.get(role_name);
if (!role) {
return make_ready_future<std::optional<record>>(std::nullopt);
static const class_registrator<
role_manager,
standard_role_manager,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_manager&,
cache&> registration("org.apache.cassandra.auth.CassandraRoleManager");
struct record final {
sstring name;
bool is_superuser;
bool can_login;
role_set member_of;
};
static db::consistency_level consistency_for_role(std::string_view role_name) noexcept {
if (role_name == meta::DEFAULT_SUPERUSER_NAME) {
return db::consistency_level::QUORUM;
}
return make_ready_future<std::optional<record>>(std::make_optional(record{
.name = sstring(role_name),
.is_superuser = role->is_superuser,
.can_login = role->can_login,
.member_of = role->member_of
}));
return db::consistency_level::LOCAL_ONE;
}
future<standard_role_manager::record> standard_role_manager::require_record(std::string_view role_name) {
return find_record(role_name).then([role_name](std::optional<record> mr) {
/// Reads the roles-table row for `role_name` from the current auth keyspace.
///
/// Resolves to an empty optional when no such row exists. Otherwise resolves
/// to a `record` built from the row; missing `is_superuser` / `can_login`
/// columns are treated as false, and a missing `member_of` column as an empty
/// role set. The read uses the consistency level chosen by
/// consistency_for_role() for this role name.
static future<std::optional<record>> find_record(cql3::query_processor& qp, std::string_view role_name) {
    const sstring select_role = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
            get_auth_ks_name(qp),
            meta::roles_table::name,
            meta::roles_table::role_col_name);

    const auto rows = co_await qp.execute_internal(
            select_role,
            consistency_for_role(role_name),
            internal_distributed_query_state(),
            {sstring(role_name)},
            cql3::query_processor::cache_internal::yes);

    if (!rows->empty()) {
        const cql3::untyped_result_set_row& role_row = rows->one();

        auto name = role_row.get_as<sstring>(sstring(meta::roles_table::role_col_name));
        const bool is_superuser = role_row.get_or<bool>("is_superuser", false);
        const bool can_login = role_row.get_or<bool>("can_login", false);

        role_set member_of;
        if (role_row.has("member_of")) {
            member_of = role_row.get_set<sstring>("member_of");
        }

        co_return std::make_optional(record{
                std::move(name),
                is_superuser,
                can_login,
                std::move(member_of)});
    }

    co_return std::optional<record>();
}
static future<record> require_record(cql3::query_processor& qp, std::string_view role_name) {
return find_record(qp, role_name).then([role_name](std::optional<record> mr) {
if (!mr) {
throw nonexistant_role(role_name);
}
@@ -76,6 +113,7 @@ standard_role_manager::standard_role_manager(cql3::query_processor& qp, ::servic
, _migration_manager(mm)
, _cache(cache)
, _stopped(make_ready_future<>())
, _superuser(password_authenticator::default_superuser(qp.db().get_config()))
{}
std::string_view standard_role_manager::qualified_java_name() const noexcept {
@@ -90,12 +128,79 @@ const resource_set& standard_role_manager::protected_resources() const {
return resources;
}
future<> standard_role_manager::maybe_create_default_role() {
if (default_superuser(_qp).empty()) {
co_return;
// Ensures the three role-metadata tables exist in the legacy auth keyspace:
// the roles table, the role-members table and the role-attributes table.
//
// Each table is handled by create_legacy_metadata_table_if_missing(); the
// three calls are combined with when_all_succeed() and their results are
// discarded, so the returned future resolves once all three have succeeded.
future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const {
// The CREATE TABLE statements are built once and reused on later calls
// (function-local statics).
static const sstring create_roles_query = fmt::format(
"CREATE TABLE {}.{} ("
" {} text PRIMARY KEY,"
" can_login boolean,"
" is_superuser boolean,"
" member_of set<text>,"
" salted_hash text"
")",
meta::legacy::AUTH_KS,
meta::roles_table::name,
meta::roles_table::role_col_name);
static const sstring create_role_members_query = fmt::format(
"CREATE TABLE {}.{} ("
" role text,"
" member text,"
" PRIMARY KEY (role, member)"
")",
meta::legacy::AUTH_KS,
ROLE_MEMBERS_CF);
static const sstring create_role_attributes_query = seastar::format(
"CREATE TABLE {}.{} ("
" role text,"
" name text,"
" value text,"
" PRIMARY KEY(role, name)"
")",
meta::legacy::AUTH_KS,
ROLE_ATTRIBUTES_CF);
// One creation request per table, each paired with its own statement.
return when_all_succeed(
create_legacy_metadata_table_if_missing(
meta::roles_table::name,
_qp,
create_roles_query,
_migration_manager),
create_legacy_metadata_table_if_missing(
ROLE_MEMBERS_CF,
_qp,
create_role_members_query,
_migration_manager),
create_legacy_metadata_table_if_missing(
ROLE_ATTRIBUTES_CF,
_qp,
create_role_attributes_query,
_migration_manager)).discard_result();
}
/// Legacy-mode bootstrap of the default superuser role.
///
/// Does nothing when `legacy::default_role_row_satisfies()` reports that a
/// suitable row for `_superuser` already exists. Otherwise inserts a row for
/// `_superuser` into the legacy roles table with `is_superuser` and
/// `can_login` both set to true, at QUORUM consistency, and logs the creation.
///
/// \throws exceptions::unavailable_exception (after logging a warning) when
///         the check or the insert could not be served; the caller is
///         expected to retry.
future<> standard_role_manager::legacy_create_default_role_if_missing() {
    try {
        const auto exists = co_await legacy::default_role_row_satisfies(_qp, &has_can_login, _superuser);
        if (exists) {
            co_return;
        }

        const sstring query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
                meta::legacy::AUTH_KS,
                meta::roles_table::name,
                meta::roles_table::role_col_name);
        co_await _qp.execute_internal(
                query,
                db::consistency_level::QUORUM,
                internal_distributed_query_state(),
                {_superuser},
                cql3::query_processor::cache_internal::no).discard_result();
        log.info("Created default superuser role '{}'.", _superuser);
    } catch (const exceptions::unavailable_exception&) {
        log.warn("Skipped default role setup: some nodes were not ready; will retry");
        // Rethrow the exception currently being handled. A `throw e;` would
        // instead copy-construct a new object from the static type, slicing
        // any more-derived exception.
        throw;
    }
}
future<> standard_role_manager::maybe_create_default_role() {
auto has_superuser = [this] () -> future<bool> {
const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", db::system_keyspace::NAME, meta::roles_table::name);
const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
auto results = co_await _qp.execute_internal(query, db::consistency_level::LOCAL_ONE,
internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
for (const auto& result : *results) {
@@ -119,12 +224,12 @@ future<> standard_role_manager::maybe_create_default_role() {
// There is no superuser which has can_login field - create default role.
// Note that we don't check if can_login is set to true.
const sstring insert_query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
meta::roles_table::role_col_name);
co_await collect_mutations(_qp, batch, insert_query, {default_superuser(_qp)});
co_await collect_mutations(_qp, batch, insert_query, {_superuser});
co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
log.info("Created default superuser role '{}'.", default_superuser(_qp));
log.info("Created default superuser role '{}'.", _superuser);
}
future<> standard_role_manager::maybe_create_default_role_with_retries() {
@@ -147,12 +252,78 @@ future<> standard_role_manager::maybe_create_default_role_with_retries() {
}
}
static const sstring legacy_table_name{"users"};
// Reports whether the legacy auth keyspace contains the table named by
// `legacy_table_name` ("users"), according to the database schema.
bool standard_role_manager::legacy_metadata_exists() {
return _qp.db().has_schema(meta::legacy::AUTH_KS, legacy_table_name);
}
// Copies every row of the legacy "users" table into the roles table of the
// legacy auth keyspace.
//
// Reads all rows at QUORUM consistency and, for each one, calls
// create_or_replace() with the row's "name", `is_superuser` taken from the
// "super" column (false when absent) and `can_login` forced to true.
// Progress is logged at start and on success; on failure an error is logged
// and the original exception is rethrown to the caller.
future<> standard_role_manager::migrate_legacy_metadata() {
log.info("Starting migration of legacy user metadata.");
static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);
return _qp.execute_internal(
query,
db::consistency_level::QUORUM,
internal_distributed_query_state(),
cql3::query_processor::cache_internal::no).then([this](::shared_ptr<cql3::untyped_result_set> results) {
// Rows are processed one at a time, in result-set order.
return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
role_config config;
config.is_superuser = row.get_or<bool>("super", false);
config.can_login = true;
// do_with() keeps the name, the config and the batch alive until the
// future returned by create_or_replace() resolves.
return do_with(
row.get_as<sstring>("name"),
std::move(config),
::service::group0_batch::unused(),
[this](const auto& name, const auto& config, auto& mc) {
return create_or_replace(meta::legacy::AUTH_KS, name, config, mc);
});
// The empty continuation exists only to hold a copy of `results`, keeping
// the result set alive while do_for_each() iterates over it.
}).finally([results] {});
}).then([] {
log.info("Finished migrating legacy user metadata.");
}).handle_exception([](std::exception_ptr ep) {
log.error("Encountered an error during migration!");
std::rethrow_exception(ep);
});
}
future<> standard_role_manager::start() {
return once_among_shards([this] () -> future<> {
if (legacy_mode(_qp)) {
co_await create_legacy_metadata_tables_if_missing();
}
auto handler = [this] () -> future<> {
co_await maybe_create_default_role_with_retries();
if (!_superuser_created_promise.available()) {
_superuser_created_promise.set_value();
const bool legacy = legacy_mode(_qp);
if (legacy) {
if (!_superuser_created_promise.available()) {
// Counterintuitively, we mark promise as ready before any startup work
// because wait_for_schema_agreement() below will block indefinitely
// without cluster majority. In that case, blocking node startup
// would lead to a cluster deadlock.
_superuser_created_promise.set_value();
}
co_await _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as);
if (co_await legacy::any_nondefault_role_row_satisfies(_qp, &has_can_login)) {
if (legacy_metadata_exists()) {
log.warn("Ignoring legacy user metadata since nondefault roles already exist.");
}
co_return;
}
if (legacy_metadata_exists()) {
co_await migrate_legacy_metadata();
co_return;
}
co_await legacy_create_default_role_if_missing();
}
if (!legacy) {
co_await maybe_create_default_role_with_retries();
if (!_superuser_created_promise.available()) {
_superuser_created_promise.set_value();
}
}
};
@@ -171,12 +342,21 @@ future<> standard_role_manager::ensure_superuser_is_created() {
return _superuser_created_promise.get_shared_future();
}
future<> standard_role_manager::create_or_replace(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
future<> standard_role_manager::create_or_replace(std::string_view auth_ks_name, std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
const sstring query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, ?, ?)",
db::system_keyspace::NAME,
auth_ks_name,
meta::roles_table::name,
meta::roles_table::role_col_name);
co_await collect_mutations(_qp, mc, query, {sstring(role_name), c.is_superuser, c.can_login});
if (auth_ks_name == meta::legacy::AUTH_KS) {
co_await _qp.execute_internal(
query,
consistency_for_role(role_name),
internal_distributed_query_state(),
{sstring(role_name), c.is_superuser, c.can_login},
cql3::query_processor::cache_internal::yes).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {sstring(role_name), c.is_superuser, c.can_login});
}
}
future<>
@@ -186,7 +366,7 @@ standard_role_manager::create(std::string_view role_name, const role_config& c,
throw role_already_exists(role_name);
}
return create_or_replace(role_name, c, mc);
return create_or_replace(get_auth_ks_name(_qp), role_name, c, mc);
});
}
@@ -206,16 +386,25 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
return fmt::to_string(fmt::join(assignments, ", "));
};
return require_record(role_name).then([this, role_name, &u, &mc](record) {
return require_record(_qp, role_name).then([this, role_name, &u, &mc](record) {
if (!u.is_superuser && !u.can_login) {
return make_ready_future<>();
}
const sstring query = seastar::format("UPDATE {}.{} SET {} WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
build_column_assignments(u),
meta::roles_table::role_col_name);
return collect_mutations(_qp, mc, std::move(query), {sstring(role_name)});
if (legacy_mode(_qp)) {
return _qp.execute_internal(
std::move(query),
consistency_for_role(role_name),
internal_distributed_query_state(),
{sstring(role_name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
return collect_mutations(_qp, mc, std::move(query), {sstring(role_name)});
}
});
}
@@ -226,11 +415,11 @@ future<> standard_role_manager::drop(std::string_view role_name, ::service::grou
// First, revoke this role from all roles that are members of it.
const auto revoke_from_members = [this, role_name, &mc] () -> future<> {
const sstring query = seastar::format("SELECT member FROM {}.{} WHERE role = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_MEMBERS_CF);
const auto members = co_await _qp.execute_internal(
query,
db::consistency_level::LOCAL_ONE,
consistency_for_role(role_name),
internal_distributed_query_state(),
{sstring(role_name)},
cql3::query_processor::cache_internal::no);
@@ -258,33 +447,102 @@ future<> standard_role_manager::drop(std::string_view role_name, ::service::grou
// Delete all attributes for that role
const auto remove_attributes_of = [this, role_name, &mc] () -> future<> {
const sstring query = seastar::format("DELETE FROM {}.{} WHERE role = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_ATTRIBUTES_CF);
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(query, {sstring(role_name)},
cql3::query_processor::cache_internal::yes).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
}
};
// Finally, delete the role itself.
const auto delete_role = [this, role_name, &mc] () -> future<> {
const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
meta::roles_table::role_col_name);
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(
query,
consistency_for_role(role_name),
internal_distributed_query_state(),
{sstring(role_name)},
cql3::query_processor::cache_internal::no).discard_result();
} else {
co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
}
};
co_await when_all_succeed(revoke_from_members, revoke_members_of, remove_attributes_of);
co_await delete_role();
}
// Legacy-mode implementation of a role membership change.
//
// Both statements are executed directly through _qp.execute_internal(); unlike
// modify_membership(), this function takes no group0_batch.
//
// grantee_name: the role whose `member_of` set is modified.
// role_name:    the role added to, or removed from, that set.
// ch:           membership_change::add or membership_change::remove.
//
// NOTE(review): the in-class declaration names the first two parameters in the
// opposite order (role_name, grantee_name). The visible caller, modify_membership(),
// passes (grantee_name, role_name, ch), which matches this definition.
future<>
standard_role_manager::legacy_modify_membership(
std::string_view grantee_name,
std::string_view role_name,
membership_change ch) {
// Updates the grantee's row in the roles table: `member_of + ?` for add,
// `member_of - ?` for remove, with a single-element set holding role_name.
const auto modify_roles = [this, role_name, grantee_name, ch] () -> future<> {
const auto query = seastar::format(
"UPDATE {}.{} SET member_of = member_of {} ? WHERE {} = ?",
get_auth_ks_name(_qp),
meta::roles_table::name,
(ch == membership_change::add ? '+' : '-'),
meta::roles_table::role_col_name);
co_await _qp.execute_internal(
query,
consistency_for_role(grantee_name),
internal_distributed_query_state(),
{role_set{sstring(role_name)}, sstring(grantee_name)},
cql3::query_processor::cache_internal::no).discard_result();
};
// Mirrors the change in ROLE_MEMBERS_CF: inserts the (role, member) row on
// add, deletes it on remove.
const auto modify_role_members = [this, role_name, grantee_name, ch] () -> future<> {
switch (ch) {
case membership_change::add: {
const sstring insert_query = seastar::format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
get_auth_ks_name(_qp),
ROLE_MEMBERS_CF);
co_return co_await _qp.execute_internal(
insert_query,
consistency_for_role(role_name),
internal_distributed_query_state(),
{sstring(role_name), sstring(grantee_name)},
cql3::query_processor::cache_internal::no).discard_result();
}
case membership_change::remove: {
const sstring delete_query = seastar::format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
get_auth_ks_name(_qp),
ROLE_MEMBERS_CF);
co_return co_await _qp.execute_internal(
delete_query,
consistency_for_role(role_name),
internal_distributed_query_state(),
{sstring(role_name), sstring(grantee_name)},
cql3::query_processor::cache_internal::no).discard_result();
}
}
};
// Start both table updates and wait for both of them to finish.
co_await when_all_succeed(modify_roles, modify_role_members).discard_result();
}
future<>
standard_role_manager::modify_membership(
std::string_view grantee_name,
std::string_view role_name,
membership_change ch,
::service::group0_batch& mc) {
if (legacy_mode(_qp)) {
co_return co_await legacy_modify_membership(grantee_name, role_name, ch);
}
const auto modify_roles = seastar::format(
"UPDATE {}.{} SET member_of = member_of {} ? WHERE {} = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
meta::roles_table::name,
(ch == membership_change::add ? '+' : '-'),
meta::roles_table::role_col_name);
@@ -295,12 +553,12 @@ standard_role_manager::modify_membership(
switch (ch) {
case membership_change::add:
modify_role_members = seastar::format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_MEMBERS_CF);
break;
case membership_change::remove:
modify_role_members = seastar::format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_MEMBERS_CF);
break;
default:
@@ -362,17 +620,18 @@ standard_role_manager::revoke(std::string_view revokee_name, std::string_view ro
});
}
future<> standard_role_manager::collect_roles(
static future<> collect_roles(
cql3::query_processor& qp,
std::string_view grantee_name,
bool recurse,
role_set& roles) {
return require_record(grantee_name).then([this, &roles, recurse](standard_role_manager::record r) {
return do_with(std::move(r.member_of), [this, &roles, recurse](const role_set& memberships) {
return do_for_each(memberships.begin(), memberships.end(), [this, &roles, recurse](const sstring& role_name) {
return require_record(qp, grantee_name).then([&qp, &roles, recurse](record r) {
return do_with(std::move(r.member_of), [&qp, &roles, recurse](const role_set& memberships) {
return do_for_each(memberships.begin(), memberships.end(), [&qp, &roles, recurse](const sstring& role_name) {
roles.insert(role_name);
if (recurse) {
return collect_roles(role_name, true, roles);
return collect_roles(qp, role_name, true, roles);
}
return make_ready_future<>();
@@ -387,68 +646,115 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
return do_with(
role_set{sstring(grantee_name)},
[this, grantee_name, recurse](role_set& roles) {
return collect_roles(grantee_name, recurse, roles).then([&roles] { return roles; });
return collect_roles(_qp, grantee_name, recurse, roles).then([&roles] { return roles; });
});
}
future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
const sstring query = seastar::format("SELECT * FROM {}.{}",
get_auth_ks_name(_qp),
ROLE_MEMBERS_CF);
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::ONE,
qs,
cql3::query_processor::cache_internal::yes);
role_to_directly_granted_map roles_map;
_cache.for_each_role([&roles_map] (const cache::role_name_t& name, const cache::role_record& record) {
for (const auto& granted_role : record.member_of) {
roles_map.emplace(name, granted_role);
}
});
std::transform(
results->begin(),
results->end(),
std::inserter(roles_map, roles_map.begin()),
[] (const cql3::untyped_result_set_row& row) {
return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
);
co_return roles_map;
}
future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
const sstring query = seastar::format("SELECT {} FROM {}.{}",
meta::roles_table::role_col_name,
get_auth_ks_name(_qp),
meta::roles_table::name);
// To avoid many copies of a view.
static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
if (utils::get_local_injector().enter("standard_role_manager_fail_legacy_query")) {
if (legacy_mode(_qp)) {
throw std::runtime_error("standard_role_manager::query_all: failed due to error injection");
}
}
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::QUORUM,
qs,
cql3::query_processor::cache_internal::yes);
role_set roles;
roles.reserve(_cache.roles_count());
_cache.for_each_role([&roles] (const cache::role_name_t& name, const cache::role_record&) {
roles.insert(name);
});
std::transform(
results->begin(),
results->end(),
std::inserter(roles, roles.begin()),
[] (const cql3::untyped_result_set_row& row) {
return row.get_as<sstring>(role_col_name_string);}
);
co_return roles;
}
future<bool> standard_role_manager::exists(std::string_view role_name) {
return find_record(role_name).then([](std::optional<record> mr) {
return find_record(_qp, role_name).then([](std::optional<record> mr) {
return static_cast<bool>(mr);
});
}
future<bool> standard_role_manager::is_superuser(std::string_view role_name) {
return require_record(role_name).then([](record r) {
return require_record(_qp, role_name).then([](record r) {
return r.is_superuser;
});
}
future<bool> standard_role_manager::can_login(std::string_view role_name) {
return require_record(role_name).then([](record r) {
return r.can_login;
});
if (legacy_mode(_qp)) {
const auto r = co_await require_record(_qp, role_name);
co_return r.can_login;
}
auto role = _cache.get(sstring(role_name));
if (!role) {
throw nonexistant_role(role_name);
}
co_return role->can_login;
}
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
auto role = _cache.get(role_name);
if (!role) {
co_return std::nullopt;
const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
get_auth_ks_name(_qp),
ROLE_ATTRIBUTES_CF);
const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
if (!result_set->empty()) {
const cql3::untyped_result_set_row &row = result_set->one();
co_return std::optional<sstring>(row.get_as<sstring>("value"));
}
auto it = role->attributes.find(attribute_name);
if (it != role->attributes.end()) {
co_return it->second;
}
co_return std::nullopt;
co_return std::optional<sstring>{};
}
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
attribute_vals result;
_cache.for_each_role([&result, attribute_name] (const cache::role_name_t& name, const cache::role_record& record) {
auto it = record.attributes.find(attribute_name);
if (it != record.attributes.end()) {
result.emplace(name, it->second);
}
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
if (att_val) {
role_to_att_val.emplace(std::move(role), std::move(*att_val));
}
});
}).then([&role_to_att_val] () {
return make_ready_future<attribute_vals>(std::move(role_to_att_val));
});
});
});
co_return result;
}
future<> standard_role_manager::set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) {
@@ -456,10 +762,14 @@ future<> standard_role_manager::set_attribute(std::string_view role_name, std::s
throw auth::nonexistant_role(role_name);
}
const sstring query = seastar::format("INSERT INTO {}.{} (role, name, value) VALUES (?, ?, ?)",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_ATTRIBUTES_CF);
co_await collect_mutations(_qp, mc, query,
{sstring(role_name), sstring(attribute_name), sstring(attribute_value)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name), sstring(attribute_value)}, cql3::query_processor::cache_internal::yes).discard_result();
} else {
co_await collect_mutations(_qp, mc, query,
{sstring(role_name), sstring(attribute_name), sstring(attribute_value)});
}
}
future<> standard_role_manager::remove_attribute(std::string_view role_name, std::string_view attribute_name, ::service::group0_batch& mc) {
@@ -467,10 +777,14 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
throw auth::nonexistant_role(role_name);
}
const sstring query = seastar::format("DELETE FROM {}.{} WHERE role = ? AND name = ?",
db::system_keyspace::NAME,
get_auth_ks_name(_qp),
ROLE_ATTRIBUTES_CF);
co_await collect_mutations(_qp, mc, query,
{sstring(role_name), sstring(attribute_name)});
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes).discard_result();
} else {
co_await collect_mutations(_qp, mc, query,
{sstring(role_name), sstring(attribute_name)});
}
}
future<std::vector<cql3::description>> standard_role_manager::describe_role_grants() {

View File

@@ -40,6 +40,7 @@ class standard_role_manager final : public role_manager {
cache& _cache;
future<> _stopped;
abort_source _as;
std::string _superuser;
shared_promise<> _superuser_created_promise;
public:
@@ -89,26 +90,23 @@ public:
private:
enum class membership_change { add, remove };
struct record final {
sstring name;
bool is_superuser;
bool can_login;
role_set member_of;
};
future<> create_legacy_metadata_tables_if_missing() const;
bool legacy_metadata_exists();
future<> migrate_legacy_metadata();
future<> legacy_create_default_role_if_missing();
future<> maybe_create_default_role();
future<> maybe_create_default_role_with_retries();
future<> create_or_replace(std::string_view role_name, const role_config&, ::service::group0_batch&);
future<> create_or_replace(std::string_view auth_ks_name, std::string_view role_name, const role_config&, ::service::group0_batch&);
future<> legacy_modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change);
future<> modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change, ::service::group0_batch& mc);
future<std::optional<record>> find_record(std::string_view role_name);
future<record> require_record(std::string_view role_name);
future<> collect_roles(
std::string_view grantee_name,
bool recurse,
role_set& roles);
};
} // namespace auth

View File

@@ -8,200 +8,244 @@
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#include "auth/transitional.hh"
#include "auth/authenticated_user.hh"
#include "auth/authenticator.hh"
#include "auth/authorizer.hh"
#include "auth/default_authorizer.hh"
#include "auth/password_authenticator.hh"
#include "auth/cache.hh"
#include "auth/permission.hh"
#include "service/raft/raft_group0_client.hh"
#include "utils/class_registrator.hh"
namespace auth {
transitional_authenticator::transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
: transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, cache)) {
static const sstring PACKAGE_NAME("com.scylladb.auth.");
// Returns the fully qualified name of the transitional authenticator:
// PACKAGE_NAME followed by "TransitionalAuthenticator".
static const sstring& transitional_authenticator_name() {
    // Function-local static: built on the first call, the same object is
    // returned on every later call.
    static const sstring qualified_name = PACKAGE_NAME + "TransitionalAuthenticator";
    return qualified_name;
}
transitional_authenticator::transitional_authenticator(std::unique_ptr<authenticator> a)
: _authenticator(std::move(a)) {
// Returns the fully qualified name of the transitional authorizer:
// PACKAGE_NAME followed by "TransitionalAuthorizer".
static const sstring& transitional_authorizer_name() {
    // Function-local static: built on the first call, the same object is
    // returned on every later call.
    static const sstring qualified_name = PACKAGE_NAME + "TransitionalAuthorizer";
    return qualified_name;
}
future<> transitional_authenticator::start() {
return _authenticator->start();
}
class transitional_authenticator : public authenticator {
std::unique_ptr<authenticator> _authenticator;
future<> transitional_authenticator::stop() {
return _authenticator->stop();
}
public:
static const sstring PASSWORD_AUTHENTICATOR_NAME;
std::string_view transitional_authenticator::qualified_java_name() const {
return "com.scylladb.auth.TransitionalAuthenticator";
}
bool transitional_authenticator::require_authentication() const {
return true;
}
authentication_option_set transitional_authenticator::supported_options() const {
return _authenticator->supported_options();
}
authentication_option_set transitional_authenticator::alterable_options() const {
return _authenticator->alterable_options();
}
future<authenticated_user> transitional_authenticator::authenticate(const credentials_map& credentials) const {
auto i = credentials.find(authenticator::USERNAME_KEY);
if ((i == credentials.end() || i->second.empty())
&& (!credentials.contains(PASSWORD_KEY) || credentials.at(PASSWORD_KEY).empty())) {
// return anon user
return make_ready_future<authenticated_user>(anonymous_user());
transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
: transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, cache)) {
}
return make_ready_future().then([this, &credentials] {
return _authenticator->authenticate(credentials);
}).handle_exception([](auto ep) {
try {
std::rethrow_exception(ep);
} catch (const exceptions::authentication_exception&) {
transitional_authenticator(std::unique_ptr<authenticator> a)
: _authenticator(std::move(a)) {
}
virtual future<> start() override {
return _authenticator->start();
}
virtual future<> stop() override {
return _authenticator->stop();
}
virtual std::string_view qualified_java_name() const override {
return transitional_authenticator_name();
}
virtual bool require_authentication() const override {
return true;
}
virtual authentication_option_set supported_options() const override {
return _authenticator->supported_options();
}
virtual authentication_option_set alterable_options() const override {
return _authenticator->alterable_options();
}
virtual future<authenticated_user> authenticate(const credentials_map& credentials) const override {
auto i = credentials.find(authenticator::USERNAME_KEY);
if ((i == credentials.end() || i->second.empty())
&& (!credentials.contains(PASSWORD_KEY) || credentials.at(PASSWORD_KEY).empty())) {
// return anon user
return make_ready_future<authenticated_user>(anonymous_user());
}
});
}
future<> transitional_authenticator::create(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) {
return _authenticator->create(role_name, options, mc);
}
future<> transitional_authenticator::alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) {
return _authenticator->alter(role_name, options, mc);
}
future<> transitional_authenticator::drop(std::string_view role_name, ::service::group0_batch& mc) {
return _authenticator->drop(role_name, mc);
}
future<custom_options> transitional_authenticator::query_custom_options(std::string_view role_name) const {
return _authenticator->query_custom_options(role_name);
}
bool transitional_authenticator::uses_password_hashes() const {
return _authenticator->uses_password_hashes();
}
future<std::optional<sstring>> transitional_authenticator::get_password_hash(std::string_view role_name) const {
return _authenticator->get_password_hash(role_name);
}
const resource_set& transitional_authenticator::protected_resources() const {
return _authenticator->protected_resources();
}
::shared_ptr<sasl_challenge> transitional_authenticator::new_sasl_challenge() const {
class sasl_wrapper : public sasl_challenge {
public:
sasl_wrapper(::shared_ptr<sasl_challenge> sasl)
: _sasl(std::move(sasl)) {
}
virtual bytes evaluate_response(bytes_view client_response) override {
return make_ready_future().then([this, &credentials] {
return _authenticator->authenticate(credentials);
}).handle_exception([](auto ep) {
try {
return _sasl->evaluate_response(client_response);
std::rethrow_exception(ep);
} catch (const exceptions::authentication_exception&) {
_complete = true;
return {};
// return anon user
return make_ready_future<authenticated_user>(anonymous_user());
}
}
});
}
virtual bool is_complete() const override {
return _complete || _sasl->is_complete();
}
virtual future<> create(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override {
return _authenticator->create(role_name, options, mc);
}
virtual future<authenticated_user> get_authenticated_user() const override {
return futurize_invoke([this] {
return _sasl->get_authenticated_user().handle_exception([](auto ep) {
try {
std::rethrow_exception(ep);
} catch (const exceptions::authentication_exception&) {
// return anon user
return make_ready_future<authenticated_user>(anonymous_user());
}
virtual future<> alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override {
return _authenticator->alter(role_name, options, mc);
}
virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override {
return _authenticator->drop(role_name, mc);
}
virtual future<custom_options> query_custom_options(std::string_view role_name) const override {
return _authenticator->query_custom_options(role_name);
}
virtual bool uses_password_hashes() const override {
return _authenticator->uses_password_hashes();
}
virtual future<std::optional<sstring>> get_password_hash(std::string_view role_name) const override {
return _authenticator->get_password_hash(role_name);
}
virtual const resource_set& protected_resources() const override {
return _authenticator->protected_resources();
}
virtual ::shared_ptr<sasl_challenge> new_sasl_challenge() const override {
class sasl_wrapper : public sasl_challenge {
public:
sasl_wrapper(::shared_ptr<sasl_challenge> sasl)
: _sasl(std::move(sasl)) {
}
virtual bytes evaluate_response(bytes_view client_response) override {
try {
return _sasl->evaluate_response(client_response);
} catch (const exceptions::authentication_exception&) {
_complete = true;
return {};
}
}
virtual bool is_complete() const override {
return _complete || _sasl->is_complete();
}
virtual future<authenticated_user> get_authenticated_user() const override {
return futurize_invoke([this] {
return _sasl->get_authenticated_user().handle_exception([](auto ep) {
try {
std::rethrow_exception(ep);
} catch (const exceptions::authentication_exception&) {
// return anon user
return make_ready_future<authenticated_user>(anonymous_user());
}
});
});
});
}
}
const sstring& get_username() const override {
return _sasl->get_username();
}
const sstring& get_username() const override {
return _sasl->get_username();
}
private:
::shared_ptr<sasl_challenge> _sasl;
private:
::shared_ptr<sasl_challenge> _sasl;
bool _complete = false;
};
return ::make_shared<sasl_wrapper>(_authenticator->new_sasl_challenge());
}
bool _complete = false;
};
return ::make_shared<sasl_wrapper>(_authenticator->new_sasl_challenge());
}
future<> transitional_authenticator::ensure_superuser_is_created() const {
return _authenticator->ensure_superuser_is_created();
}
virtual future<> ensure_superuser_is_created() const override {
return _authenticator->ensure_superuser_is_created();
}
};
transitional_authorizer::transitional_authorizer(cql3::query_processor& qp)
: transitional_authorizer(std::make_unique<default_authorizer>(qp)) {
}
class transitional_authorizer : public authorizer {
std::unique_ptr<authorizer> _authorizer;
transitional_authorizer::transitional_authorizer(std::unique_ptr<authorizer> a)
: _authorizer(std::move(a)) {
}
public:
transitional_authorizer(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
: transitional_authorizer(std::make_unique<default_authorizer>(qp, g0, mm)) {
}
transitional_authorizer(std::unique_ptr<authorizer> a)
: _authorizer(std::move(a)) {
}
transitional_authorizer::~transitional_authorizer() {
}
~transitional_authorizer() {
}
future<> transitional_authorizer::start() {
return _authorizer->start();
}
virtual future<> start() override {
return _authorizer->start();
}
future<> transitional_authorizer::stop() {
return _authorizer->stop();
}
virtual future<> stop() override {
return _authorizer->stop();
}
std::string_view transitional_authorizer::qualified_java_name() const {
return "com.scylladb.auth.TransitionalAuthorizer";
}
virtual std::string_view qualified_java_name() const override {
return transitional_authorizer_name();
}
future<permission_set> transitional_authorizer::authorize(const role_or_anonymous&, const resource&) const {
static const permission_set transitional_permissions =
permission_set::of<
permission::CREATE,
permission::ALTER,
permission::DROP,
permission::SELECT,
permission::MODIFY>();
virtual future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
static const permission_set transitional_permissions =
permission_set::of<
permission::CREATE,
permission::ALTER,
permission::DROP,
permission::SELECT,
permission::MODIFY>();
return make_ready_future<permission_set>(transitional_permissions);
}
return make_ready_future<permission_set>(transitional_permissions);
}
future<> transitional_authorizer::grant(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) {
return _authorizer->grant(s, std::move(ps), r, mc);
}
virtual future<> grant(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) override {
return _authorizer->grant(s, std::move(ps), r, mc);
}
future<> transitional_authorizer::revoke(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) {
return _authorizer->revoke(s, std::move(ps), r, mc);
}
virtual future<> revoke(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) override {
return _authorizer->revoke(s, std::move(ps), r, mc);
}
future<std::vector<permission_details>> transitional_authorizer::list_all() const {
return _authorizer->list_all();
}
virtual future<std::vector<permission_details>> list_all() const override {
return _authorizer->list_all();
}
future<> transitional_authorizer::revoke_all(std::string_view s, ::service::group0_batch& mc) {
return _authorizer->revoke_all(s, mc);
}
virtual future<> revoke_all(std::string_view s, ::service::group0_batch& mc) override {
return _authorizer->revoke_all(s, mc);
}
future<> transitional_authorizer::revoke_all(const resource& r, ::service::group0_batch& mc) {
return _authorizer->revoke_all(r, mc);
}
virtual future<> revoke_all(const resource& r, ::service::group0_batch& mc) override {
return _authorizer->revoke_all(r, mc);
}
const resource_set& transitional_authorizer::protected_resources() const {
return _authorizer->protected_resources();
}
virtual const resource_set& protected_resources() const override {
return _authorizer->protected_resources();
}
};
}
//
// Registers the two transitional classes under their fully qualified names.
//
// To ensure correct initialization order, we unfortunately need to spell the
// class-name suffixes out as string literals here (appended to
// auth::PACKAGE_NAME, which is defined earlier in this file).
//
// The trailing template arguments of each registrator match the parameter
// types of the corresponding class's first constructor.
//
static const class_registrator<
auth::authenticator,
auth::transitional_authenticator,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_manager&,
auth::cache&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");
static const class_registrator<
auth::authorizer,
auth::transitional_authorizer,
cql3::query_processor&,
::service::raft_group0_client&,
::service::migration_manager&> transitional_authorizer_reg(auth::PACKAGE_NAME + "TransitionalAuthorizer");

View File

@@ -1,81 +0,0 @@
/*
* Copyright (C) 2026-present ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#pragma once
#include "auth/authenticator.hh"
#include "auth/authorizer.hh"
#include "auth/cache.hh"
namespace cql3 {
class query_processor;
}
namespace service {
class raft_group0_client;
class migration_manager;
}
namespace auth {
///
/// Transitional authenticator that allows anonymous access when credentials are not provided
/// or authentication fails. Used for migration scenarios.
///
/// All other operations are forwarded to the wrapped authenticator.
///
class transitional_authenticator : public authenticator {
// The wrapped authenticator that does the real work.
std::unique_ptr<authenticator> _authenticator;
public:
// Wraps a password_authenticator constructed from the given arguments.
transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache);
// Takes ownership of `a` and wraps it.
transitional_authenticator(std::unique_ptr<authenticator> a);
// start()/stop() forward to the wrapped authenticator.
virtual future<> start() override;
virtual future<> stop() override;
// Returns "com.scylladb.auth.TransitionalAuthenticator".
virtual std::string_view qualified_java_name() const override;
// Always returns true.
virtual bool require_authentication() const override;
// Both option sets are those of the wrapped authenticator.
virtual authentication_option_set supported_options() const override;
virtual authentication_option_set alterable_options() const override;
// Resolves to the anonymous user when both the username and the password are
// missing or empty. Otherwise delegates to the wrapped authenticator and maps
// an exceptions::authentication_exception to the anonymous user.
virtual future<authenticated_user> authenticate(const credentials_map& credentials) const override;
// The role-management and query operations below forward to the wrapped authenticator.
virtual future<> create(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override;
virtual future<> alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override;
virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override;
virtual future<custom_options> query_custom_options(std::string_view role_name) const override;
virtual bool uses_password_hashes() const override;
virtual future<std::optional<sstring>> get_password_hash(std::string_view role_name) const override;
virtual const resource_set& protected_resources() const override;
// Returns a wrapper around the wrapped authenticator's SASL challenge. The
// wrapper treats an authentication_exception from evaluate_response() as
// completion with an empty response, and maps one from
// get_authenticated_user() to the anonymous user.
virtual ::shared_ptr<sasl_challenge> new_sasl_challenge() const override;
// Forwards to the wrapped authenticator.
virtual future<> ensure_superuser_is_created() const override;
};
///
/// Transitional authorizer that grants a fixed set of permissions to all users.
/// Used for migration scenarios.
///
/// Everything except authorize() and qualified_java_name() is forwarded to the
/// wrapped authorizer.
///
class transitional_authorizer : public authorizer {
// The wrapped authorizer that stores and lists the real permissions.
std::unique_ptr<authorizer> _authorizer;
public:
// Wraps a default_authorizer constructed from `qp`.
transitional_authorizer(cql3::query_processor& qp);
// Takes ownership of `a` and wraps it.
transitional_authorizer(std::unique_ptr<authorizer> a);
~transitional_authorizer();
// start()/stop() forward to the wrapped authorizer.
virtual future<> start() override;
virtual future<> stop() override;
// Returns "com.scylladb.auth.TransitionalAuthorizer".
virtual std::string_view qualified_java_name() const override;
// Ignores both arguments and always resolves to the same permission set:
// CREATE, ALTER, DROP, SELECT and MODIFY.
virtual future<permission_set> authorize(const role_or_anonymous&, const resource&) const override;
// The grant/revoke/list operations below forward to the wrapped authorizer.
virtual future<> grant(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) override;
virtual future<> revoke(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) override;
virtual future<std::vector<permission_details>> list_all() const override;
virtual future<> revoke_all(std::string_view s, ::service::group0_batch& mc) override;
virtual future<> revoke_all(const resource& r, ::service::group0_batch& mc) override;
virtual const resource_set& protected_resources() const override;
};
} // namespace auth

View File

@@ -10,15 +10,24 @@
#include <random>
#include <unordered_set>
#include <algorithm>
#include <seastar/core/sleep.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/util/later.hh>
#include "gms/endpoint_state.hh"
#include "gms/versioned_value.hh"
#include "keys/keys.hh"
#include "replica/database.hh"
#include "db/system_keyspace.hh"
#include "db/system_distributed_keyspace.hh"
#include "dht/token-sharding.hh"
#include "locator/token_metadata.hh"
#include "types/set.hh"
#include "gms/application_state.hh"
#include "gms/inet_address.hh"
#include "gms/gossiper.hh"
#include "gms/feature_service.hh"
#include "utils/assert.hh"
#include "utils/error_injection.hh"
#include "utils/UUID_gen.hh"
@@ -32,6 +41,16 @@
extern logging::logger cdc_log;
// Reads the SHARD_COUNT application state that the gossiper holds for
// `endpoint` and parses it as an int. Returns -1 when no such state exists.
static int get_shard_count(const locator::host_id& endpoint, const gms::gossiper& g) {
    auto shard_count_state = g.get_application_state_ptr(endpoint, gms::application_state::SHARD_COUNT);
    if (!shard_count_state) {
        return -1;
    }
    return std::stoi(shard_count_state->value());
}
// Returns the value that `endpoint` advertises through its IGNORE_MSB_BITS
// gossip application state, or 0 when that state is not present.
static unsigned get_sharding_ignore_msb(const locator::host_id& endpoint, const gms::gossiper& g) {
    auto ignore_msb_state = g.get_application_state_ptr(endpoint, gms::application_state::IGNORE_MSB_BITS);
    if (!ignore_msb_state) {
        return 0;
    }
    return std::stoi(ignore_msb_state->value());
}
namespace db {
// Declaration only (`extern`): the definition of this thread-local data type
// object is provided elsewhere.
extern thread_local data_type cdc_streams_set_type;
}
@@ -185,7 +204,7 @@ future<topology_description> topology_description::clone_async() const {
for (const auto& entry : _entries) {
vec.push_back(entry);
co_await coroutine::maybe_yield();
co_await seastar::maybe_yield();
}
co_return topology_description{std::move(vec)};
@@ -206,6 +225,12 @@ static std::vector<stream_id> create_stream_ids(
return result;
}
// Returns true when no endpoint known to the gossiper has a host ID greater
// than `my_host_id`, i.e. the iteration below was never stopped early.
bool should_propose_first_generation(const locator::host_id& my_host_id, const gms::gossiper& g) {
    const auto outcome = g.for_each_endpoint_state_until([&my_host_id] (const gms::endpoint_state& peer) {
        // Stop as soon as some endpoint compares greater than us.
        const bool peer_is_greater = my_host_id < peer.get_host_id();
        return stop_iteration(peer_is_greater);
    });
    return outcome == stop_iteration::no;
}
bool is_cdc_generation_optimal(const cdc::topology_description& gen, const locator::token_metadata& tm) {
if (tm.sorted_tokens().size() != gen.entries().size()) {
// We probably have garbage streams from old generations
@@ -305,6 +330,38 @@ future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
co_return co_await get_common_cdc_generation_mutations(s, pkey, std::move(get_ckey), desc, mutation_size_threshold, ts);
}
// non-static for testing
//
// Upper bound on the number of streams in a single topology description.
size_t limit_of_streams_in_topology_description() {
    // A stream occupies 16 bytes and the description must stay within 4MB,
    // which allows at most 262144 streams (callers still keep at least one
    // stream per vnode).
    constexpr size_t max_description_bytes = 4 * 1024 * 1024;
    constexpr size_t bytes_per_stream = 16;
    return max_description_bytes / bytes_per_stream;
}
// non-static for testing
//
// Returns `desc` unchanged when its total stream count fits within the limit.
// Otherwise every vnode that has more streams than the per-vnode share gets its
// streams regenerated with exactly that share.
topology_description limit_number_of_streams_if_needed(topology_description&& desc) {
    uint64_t total_streams = 0;
    for (auto& vnode : desc.entries()) {
        total_streams += vnode.streams.size();
    }

    // The limit is never lower than the number of vnodes, so each vnode keeps at least one stream.
    const size_t max_streams = std::max(limit_of_streams_in_topology_description(), desc.entries().size());
    if (total_streams <= max_streams) {
        return std::move(desc);
    }

    const size_t per_vnode_cap = max_streams / desc.entries().size();
    auto vnodes = std::move(desc).entries();
    // The range of the first entry starts at the end token of the last entry.
    auto range_start = vnodes.back().token_range_end;
    for (size_t idx = 0; idx < vnodes.size(); ++idx) {
        auto& vnode = vnodes[idx];
        auto range_end = vnode.token_range_end;
        if (vnode.streams.size() > per_vnode_cap) {
            vnode.streams =
                    create_stream_ids(idx, range_start, range_end, per_vnode_cap, vnode.sharding_ignore_msb);
        }
        range_start = range_end;
    }
    return topology_description(std::move(vnodes));
}
// Compute a set of tokens that split the token ring into vnodes.
static auto get_tokens(const std::unordered_set<dht::token>& bootstrap_tokens, const locator::token_metadata_ptr tmptr) {
auto tokens = tmptr->sorted_tokens();
@@ -362,6 +419,364 @@ db_clock::time_point new_generation_timestamp(bool add_delay, std::chrono::milli
return ts;
}
// Creates a new CDC generation description for the current token ring plus
// `bootstrap_tokens`, stores it in the distributed system tables and returns its ID.
//
// When the CDC_GENERATIONS_V2 feature is enabled the generation is inserted under a
// random UUID and a `generation_id_v2` is returned. Otherwise the number of streams
// is capped with `limit_number_of_streams_if_needed()` and the generation is stored
// in the old format, returning a `generation_id_v1`.
//
// `add_delay` is forwarded to `new_generation_timestamp()` together with `_cfg.ring_delay`.
// The sharding lookup throws std::runtime_error when a non-bootstrap token has no endpoint.
future<cdc::generation_id> generation_service::legacy_make_new_generation(const std::unordered_set<dht::token>& bootstrap_tokens, bool add_delay) {
const locator::token_metadata_ptr tmptr = _token_metadata.get();
// Fetch sharding parameters for a node that owns vnode ending with this token
// using gossiped application states.
auto get_sharding_info = [&] (dht::token end) -> std::pair<size_t, uint8_t> {
if (bootstrap_tokens.contains(end)) {
// Bootstrap tokens use this node's own shard count and configuration.
return {smp::count, _cfg.ignore_msb_bits};
} else {
auto endpoint = tmptr->get_endpoint(end);
if (!endpoint) {
throw std::runtime_error(
format("Can't find endpoint for token {}", end));
}
auto sc = get_shard_count(*endpoint, _gossiper);
// get_shard_count() yields -1 when the state is missing; fall back to a single shard.
return {sc > 0 ? sc : 1, get_sharding_ignore_msb(*endpoint, _gossiper)};
}
};
auto uuid = utils::make_random_uuid();
auto gen = make_new_generation_description(bootstrap_tokens, get_sharding_info, tmptr);
// Our caller should ensure that there are normal tokens in the token ring.
auto normal_token_owners = tmptr->count_normal_token_owners();
SCYLLA_ASSERT(normal_token_owners);
if (_feature_service.cdc_generations_v2) {
cdc_log.info("Inserting new generation data at UUID {}", uuid);
// This may take a while.
co_await _sys_dist_ks.local().insert_cdc_generation(uuid, gen, { normal_token_owners });
// Begin the race.
// The timestamp is picked only after the generation data has been inserted.
cdc::generation_id_v2 gen_id{new_generation_timestamp(add_delay, _cfg.ring_delay), uuid};
cdc_log.info("New CDC generation: {}", gen_id);
co_return gen_id;
}
// The CDC_GENERATIONS_V2 feature is not enabled: some nodes may still not understand the V2 format.
// We must create a generation in the old format.
// If the cluster is large we may end up with a generation that contains
// large number of streams. This is problematic because we store the
// generation in a single row (V1 format). For a generation with large number of rows
// this will lead to a row that can be as big as 32MB. This is much more
// than the limit imposed by commitlog_segment_size_in_mb. If the size of
// the row that describes a new generation grows above
// commitlog_segment_size_in_mb, the write will fail and the new node won't
// be able to join. To avoid such problem we make sure that such row is
// always smaller than 4MB. We do that by removing some CDC streams from
// each vnode if the total number of streams is too large.
gen = limit_number_of_streams_if_needed(std::move(gen));
cdc_log.warn(
"Creating a new CDC generation in the old storage format due to a partially upgraded cluster:"
" the CDC_GENERATIONS_V2 feature is known by this node, but not enabled in the cluster."
" The old storage format forces us to create a suboptimal generation."
" It is recommended to finish the upgrade and then create a new generation either by bootstrapping"
" a new node or running the checkAndRepairCdcStreams nodetool command.");
// Begin the race.
// In the old format the timestamp is picked before the description is inserted.
cdc::generation_id_v1 gen_id{new_generation_timestamp(add_delay, _cfg.ring_delay)};
co_await _sys_dist_ks.local().insert_cdc_topology_description(gen_id, std::move(gen), { normal_token_owners });
cdc_log.info("New CDC generation: {}", gen_id);
co_return gen_id;
}
/* Reads the CDC generation ID that an endpoint advertises through gossip
 * (the CDC_GENERATION_ID application state of `eps`).
 *
 * The state may be missing, e.g. during a rolling upgrade when the other node
 * has not been upgraded yet; std::nullopt is returned in that case. Once the
 * cluster supports CDC, every newly joining node proposes a new generation and
 * therefore gossips its ID.
 *
 * `endpoint` is only used for trace logging.
 */
static std::optional<cdc::generation_id> get_generation_id_for(const locator::host_id& endpoint, const gms::endpoint_state& eps) {
    if (const auto* state = eps.get_application_state_ptr(gms::application_state::CDC_GENERATION_ID)) {
        auto serialized_id = state->value();
        cdc_log.trace("endpoint={}, gen_id_string={}", endpoint, serialized_id);
        return gms::versioned_value::cdc_generation_id_from_string(serialized_id);
    }
    return std::nullopt;
}
// Looks up the data of a V2 generation: first in the distributed system keyspace
// and then, for timeuuid ids only, in the local system keyspace.
static future<std::optional<cdc::topology_description>> retrieve_generation_data_v2(
        cdc::generation_id_v2 id,
        db::system_keyspace& sys_ks,
        db::system_distributed_keyspace& sys_dist_ks) {
    auto generation = co_await sys_dist_ks.read_cdc_generation(id.id);
    if (generation) {
        co_return generation;
    }

    // If we entered legacy mode due to recovery, we (or some other node)
    // might gossip about a generation that was previously propagated
    // through raft. If that's the case, it will sit in
    // the system.cdc_generations_v3 table.
    //
    // That table stores generation ids as timeuuids, so an id that is not a
    // timeuuid cannot be found there and the query would fail with
    // a marshaling error; skip the lookup for such ids.
    if (id.id.is_timestamp()) {
        generation = co_await sys_ks.read_cdc_generation_opt(id.id);
    }
    co_return generation;
}
// Dispatches on the format of `gen_id`: V1 ids are read as a topology description
// from the distributed system keyspace (using `ctx`), V2 ids go through
// `retrieve_generation_data_v2()`.
static future<std::optional<cdc::topology_description>> retrieve_generation_data(
        cdc::generation_id gen_id,
        db::system_keyspace& sys_ks,
        db::system_distributed_keyspace& sys_dist_ks,
        db::system_distributed_keyspace::context ctx) {
    auto read_v1 = [&] (const cdc::generation_id_v1& id) {
        return sys_dist_ks.read_cdc_topology_description(id, ctx);
    };
    auto read_v2 = [&] (const cdc::generation_id_v2& id) {
        return retrieve_generation_data_v2(id, sys_ks, sys_dist_ks);
    };
    return std::visit(make_visitor(std::move(read_v1), std::move(read_v2)), gen_id);
}
// Makes sure the streams description table contains an entry for `gen_id`.
// Does nothing if the entry already exists; throws `no_generation_data_exception`
// if the generation data cannot be found.
static future<> do_update_streams_description(
        cdc::generation_id gen_id,
        db::system_keyspace& sys_ks,
        db::system_distributed_keyspace& sys_dist_ks,
        db::system_distributed_keyspace::context ctx) {
    const auto generation_ts = get_ts(gen_id);
    if (co_await sys_dist_ks.cdc_desc_exists(generation_ts, ctx)) {
        cdc_log.info("Generation {}: streams description table already updated.", gen_id);
        co_return;
    }

    // Another node may be inserting the same description concurrently.
    // That is fine: the operation is idempotent.
    auto description = co_await retrieve_generation_data(gen_id, sys_ks, sys_dist_ks, ctx);
    if (!description) {
        throw no_generation_data_exception(gen_id);
    }

    co_await sys_dist_ks.create_cdc_desc(generation_ts, *description, ctx);
    cdc_log.info("CDC description table successfully updated with generation {}.", gen_id);
}
/* Inform CDC users about a generation of streams (identified by the given timestamp)
* by inserting it into the cdc_streams table.
*
* Assumes that the cdc_generation_descriptions table contains this generation.
*
* Returning from this function does not mean that the table update was successful: the function
* might run an asynchronous task in the background.
*
* The first attempt is made inline. If it throws, a detached coroutine retries
* every 60 seconds until it succeeds or `abort_src` aborts the sleep.
* `get_num_token_owners` is invoked anew for every attempt.
*/
static future<> update_streams_description(
cdc::generation_id gen_id,
db::system_keyspace& sys_ks,
shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
noncopyable_function<unsigned()> get_num_token_owners,
abort_source& abort_src) {
try {
co_await do_update_streams_description(gen_id, sys_ks, *sys_dist_ks, { get_num_token_owners() });
} catch (...) {
cdc_log.warn(
"Could not update CDC description table with generation {}: {}. Will retry in the background.",
gen_id, std::current_exception());
// It is safe to discard this future: we keep system distributed keyspace alive.
// All state is passed as coroutine parameters (not lambda captures); the shared
// pointer and the callback are moved in, the other references are passed through.
(void)(([] (cdc::generation_id gen_id,
db::system_keyspace& sys_ks,
shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
noncopyable_function<unsigned()> get_num_token_owners,
abort_source& abort_src) -> future<> {
while (true) {
try {
co_await sleep_abortable(std::chrono::seconds(60), abort_src);
} catch (seastar::sleep_aborted&) {
// Abort requested: stop retrying.
cdc_log.warn( "Aborted update CDC description table with generation {}", gen_id);
co_return;
}
try {
co_await do_update_streams_description(gen_id, sys_ks, *sys_dist_ks, { get_num_token_owners() });
co_return;
} catch (...) {
cdc_log.warn(
"Could not update CDC description table with generation {}: {}. Will try again.",
gen_id, std::current_exception());
}
}
})(gen_id, sys_ks, std::move(sys_dist_ks), std::move(get_num_token_owners), abort_src));
}
}
// Converts the unix timestamp embedded in `uuid` into a db_clock time point.
static db_clock::time_point as_timepoint(const utils::UUID& uuid) {
    const auto unix_ts = utils::UUID_gen::unix_timestamp(uuid);
    return db_clock::time_point(unix_ts);
}
// Reads the generation timestamps of the V1 streams description table,
// retrying every 60 seconds for as long as the read throws.
// The sleep between attempts is abortable through `abort_src`.
static future<std::vector<db_clock::time_point>> get_cdc_desc_v1_timestamps(
        db::system_distributed_keyspace& sys_dist_ks,
        abort_source& abort_src,
        const noncopyable_function<unsigned()>& get_num_token_owners) {
    for (;;) {
        try {
            // The number of token owners is re-evaluated on every attempt.
            co_return co_await sys_dist_ks.get_cdc_desc_v1_timestamps({ get_num_token_owners() });
        } catch (...) {
            cdc_log.warn(
                    "Failed to retrieve generation timestamps for rewriting: {}. Retrying in 60s.",
                    std::current_exception());
        }
        // Awaiting is not possible inside the handler, so the back-off happens here.
        co_await sleep_abortable(std::chrono::seconds(60), abort_src);
    }
}
// Contains a CDC log table's creation time (extracted from its schema's id)
// and its CDC TTL setting.
struct time_and_ttl {
// Creation time of the CDC log table.
db_clock::time_point creation_time;
// CDC TTL in seconds; 0 means that entries never expire.
int ttl;
};
/*
* See `maybe_rewrite_streams_descriptions`.
* This is the long-running-in-the-background part of that function.
* It returns the timestamp of the last rewritten generation (if any).
*
* `times_and_ttls` describes the existing CDC log tables and is used to decide
* which generations may still be referenced by data in those tables.
* Sleeps between retries are abortable through `abort_src`.
*/
static future<std::optional<cdc::generation_id_v1>> rewrite_streams_descriptions(
std::vector<time_and_ttl> times_and_ttls,
db::system_keyspace& sys_ks,
shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
noncopyable_function<unsigned()> get_num_token_owners,
abort_source& abort_src) {
cdc_log.info("Retrieving generation timestamps for rewriting...");
auto tss = co_await get_cdc_desc_v1_timestamps(*sys_dist_ks, abort_src, get_num_token_owners);
cdc_log.info("Generation timestamps retrieved.");
// Find first generation timestamp such that some CDC log table may contain data before this timestamp.
// This predicate is monotonic w.r.t the timestamps.
auto now = db_clock::now();
// partition_point requires a partitioned range, hence the sort.
std::sort(tss.begin(), tss.end());
auto first = std::partition_point(tss.begin(), tss.end(), [&] (db_clock::time_point ts) {
// partition_point finds first element that does *not* satisfy the predicate.
return std::none_of(times_and_ttls.begin(), times_and_ttls.end(),
[&] (const time_and_ttl& tat) {
// In this CDC log table there are no entries older than the table's creation time
// or (now - the table's ttl). We subtract 10s to account for some possible clock drift.
// If ttl is set to 0 then entries in this table never expire. In that case we look
// only at the table's creation time.
auto no_entries_older_than =
(tat.ttl == 0 ? tat.creation_time : std::max(tat.creation_time, now - std::chrono::seconds(tat.ttl)))
- std::chrono::seconds(10);
return no_entries_older_than < ts;
});
});
// Find first generation timestamp such that some CDC log table may contain data in this generation.
// This and all later generations need to be written to the new streams table.
if (first != tss.begin()) {
--first;
}
// After the step back this can only hold when `tss` is empty.
if (first == tss.end()) {
cdc_log.info("No generations to rewrite.");
co_return std::nullopt;
}
cdc_log.info("First generation to rewrite: {}", *first);
bool each_success = true;
// Rewrite up to 10 generations concurrently.
co_await max_concurrent_for_each(first, tss.end(), 10, [&] (db_clock::time_point ts) -> future<> {
while (true) {
try {
co_return co_await do_update_streams_description(cdc::generation_id_v1{ts}, sys_ks, *sys_dist_ks, { get_num_token_owners() });
} catch (const no_generation_data_exception& e) {
// Missing generation data is not retried.
cdc_log.error("Failed to rewrite streams for generation {}: {}. Giving up.", ts, e);
each_success = false;
co_return;
} catch (...) {
cdc_log.warn("Failed to rewrite streams for generation {}: {}. Retrying in 60s.", ts, std::current_exception());
}
co_await sleep_abortable(std::chrono::seconds(60), abort_src);
}
});
if (each_success) {
cdc_log.info("Rewriting stream tables finished successfully.");
} else {
cdc_log.info("Rewriting stream tables finished, but some generations could not be rewritten (check the logs).");
}
// NOTE(review): `first == tss.end()` already returned above, so this condition
// looks always true here and the trailing `co_return std::nullopt` unreachable.
if (first != tss.end()) {
co_return cdc::generation_id_v1{*std::prev(tss.end())};
}
co_return std::nullopt;
}
// Rewrites the old (V1) streams descriptions into the new streams table when needed,
// and records completion through `cdc_set_rewritten()`.
//
// Returns early without rewriting when the V1 description table does not exist,
// when the rewrite was already recorded as done, when `_cfg.dont_rewrite_streams`
// is set, or when no CDC log tables exist.
future<> generation_service::maybe_rewrite_streams_descriptions() {
if (!_db.has_schema(_sys_dist_ks.local().NAME, _sys_dist_ks.local().CDC_DESC_V1)) {
// This cluster never went through a Scylla version which used this table
// or the user deleted the table. Nothing to do.
co_return;
}
if (co_await _sys_ks.local().cdc_is_rewritten()) {
co_return;
}
if (_cfg.dont_rewrite_streams) {
cdc_log.warn("Stream rewriting disabled. Manual administrator intervention may be required...");
co_return;
}
// For each CDC log table get the TTL setting (from CDC options) and the table's creation time
std::vector<time_and_ttl> times_and_ttls;
_db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> t) {
auto& s = *t->schema();
auto base = cdc::get_base_table(_db, s.ks_name(), s.cf_name());
if (!base) {
// Not a CDC log table.
return;
}
auto& cdc_opts = base->cdc_options();
if (!cdc_opts.enabled()) {
// This table is named like a CDC log table but it's not one.
return;
}
// The creation time is derived from the table's id (see `as_timepoint`).
times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id().uuid()), cdc_opts.ttl()});
});
if (times_and_ttls.empty()) {
// There's no point in rewriting old generations' streams (they don't contain any data).
cdc_log.info("No CDC log tables present, not rewriting stream tables.");
co_return co_await _sys_ks.local().cdc_set_rewritten(std::nullopt);
}
// Captures the token metadata pointer obtained at this point by value.
auto get_num_token_owners = [tm = _token_metadata.get()] { return tm->count_normal_token_owners(); };
// This code is racing with node startup. At this point, we're most likely still waiting for gossip to settle
// and some nodes that are UP may still be marked as DOWN by us.
// Let's sleep a bit to increase the chance that the first attempt at rewriting succeeds (it's still ok if
// it doesn't - we'll retry - but it's nice if we succeed without any warnings).
co_await sleep_abortable(std::chrono::seconds(10), _abort_src);
cdc_log.info("Rewriting stream tables in the background...");
auto last_rewritten = co_await rewrite_streams_descriptions(
std::move(times_and_ttls),
_sys_ks.local(),
_sys_dist_ks.local_shared(),
std::move(get_num_token_owners),
_abort_src);
// Record completion together with the last rewritten generation (if any).
co_await _sys_ks.local().cdc_set_rewritten(last_rewritten);
}
// Reports an internal error unless the caller runs on shard 0.
// `where` names the calling function and is included in the error message.
static void assert_shard_zero(const sstring& where) {
    if (this_shard_id() == 0) {
        return;
    }
    on_internal_error(cdc_log, format("`{}`: must be run on shard 0", where));
}
class and_reducer {
private:
bool _result = true;
@@ -388,26 +803,206 @@ public:
}
};
// Marks a failure to handle a CDC generation that is worth retrying.
// Callers in this file catch it separately from other exceptions and retry
// in the background instead of giving up.
class generation_handling_nonfatal_exception : public std::runtime_error {
// Inherit the std::runtime_error constructors.
using std::runtime_error::runtime_error;
};
// Log message template shared by the generation handling paths.
// Placeholders, in order: the generation, the failure reason, the action taken next.
constexpr char could_not_retrieve_msg_template[]
= "Could not retrieve CDC streams with timestamp {} upon gossip event. Reason: \"{}\". Action: {}.";
generation_service::generation_service(
config cfg,
config cfg, gms::gossiper& g, sharded<db::system_distributed_keyspace>& sys_dist_ks,
sharded<db::system_keyspace>& sys_ks,
replica::database& db)
abort_source& abort_src, const locator::shared_token_metadata& stm, gms::feature_service& f,
replica::database& db,
std::function<bool()> raft_topology_change_enabled)
: _cfg(std::move(cfg))
, _gossiper(g)
, _sys_dist_ks(sys_dist_ks)
, _sys_ks(sys_ks)
, _abort_src(abort_src)
, _token_metadata(stm)
, _feature_service(f)
, _db(db)
, _raft_topology_change_enabled(std::move(raft_topology_change_enabled))
{
}
// Stops the service.
//
// Waits for the background streams rewrite started in `after_join()`; a failure
// of that task is logged and does not prevent shutdown. On shard 0, if the
// service has joined, it also unregisters from the gossiper via `leave_ring()`.
// Finally marks the service as stopped, which the destructor asserts on.
future<> generation_service::stop() {
    try {
        co_await std::move(_cdc_streams_rewrite_complete);
    } catch (...) {
        // The format string needs a `{}` placeholder, otherwise the exception is never printed.
        cdc_log.error("CDC stream rewrite failed: {}", std::current_exception());
    }
    if (_joined && (this_shard_id() == 0)) {
        co_await leave_ring();
    }
    // A coroutine completes by reaching the end of its body (or `co_return`);
    // a plain `return` statement is not allowed here.
    _stopped = true;
}
// The service must have been stopped (`_stopped` set by `stop()`) before destruction.
generation_service::~generation_service() {
SCYLLA_ASSERT(_stopped);
}
future<> generation_service::handle_cdc_generation(cdc::generation_id gen_id) {
// Must run on shard 0. Records `startup_gen_id` as the current generation ID,
// registers this service with the gossiper, processes the latest generation
// seen in gossip and starts the streams rewrite in the background.
future<> generation_service::after_join(std::optional<cdc::generation_id>&& startup_gen_id) {
assert_shard_zero(__PRETTY_FUNCTION__);
_gen_id = std::move(startup_gen_id);
_gossiper.register_(shared_from_this());
// `stop()` checks this flag to decide whether `leave_ring()` has to be called.
_joined = true;
// Retrieve the latest CDC generation seen in gossip (if any).
co_await legacy_scan_cdc_generations();
// Ensure that the new CDC stream description table has all required streams.
// See the function's comment for details.
//
// Since this depends on the entire cluster (and therefore we cannot guarantee
// timely completion), run it in the background and wait for it in stop().
_cdc_streams_rewrite_complete = maybe_rewrite_streams_descriptions();
}
// Must run on shard 0. Clears the joined flag and unregisters this service
// from the gossiper.
future<> generation_service::leave_ring() {
assert_shard_zero(__PRETTY_FUNCTION__);
_joined = false;
co_await _gossiper.unregister_(shared_from_this());
}
// A joining endpoint is handled like a change of all of its application states.
future<> generation_service::on_join(gms::inet_address ep, locator::host_id id, gms::endpoint_state_ptr ep_state, gms::permit_id pid) {
    const auto& joined_states = ep_state->get_application_state_map();
    return on_change(ep, id, joined_states, pid);
}
// Gossip notification about changed application states of an endpoint.
// Must run on shard 0. Does nothing when raft-based topology changes are enabled;
// otherwise reacts to a changed CDC_GENERATION_ID state by handling that generation.
future<> generation_service::on_change(gms::inet_address ep, locator::host_id id, const gms::application_state_map& states, gms::permit_id pid) {
    assert_shard_zero(__PRETTY_FUNCTION__);
    if (_raft_topology_change_enabled()) {
        return make_ready_future<>();
    }

    auto handle_new_generation_id = [this] (gms::inet_address peer, locator::host_id, const gms::versioned_value& value, gms::permit_id) {
        auto parsed_id = gms::versioned_value::cdc_generation_id_from_string(value.value());
        cdc_log.debug("Endpoint: {}, CDC generation ID change: {}", peer, parsed_id);
        return legacy_handle_cdc_generation(parsed_id);
    };
    return on_application_state_change(ep, id, states, gms::application_state::CDC_GENERATION_ID, pid,
            std::move(handle_new_generation_id));
}
// Checks whether the latest CDC generation known from gossip is usable and
// creates a new generation when it is not.
//
// A new generation is created when: no generation was observed, the latest one
// uses the V1 id format although the V2 feature is enabled, its data cannot be
// read or found, or it is not optimal for the current token metadata.
//
// Throws std::runtime_error when the node has not joined yet or when some
// endpoint is neither NORMAL nor LEFT. Timeouts and unavailability while reading
// the generation data are rethrown as `request_execution_exception`.
future<> generation_service::check_and_repair_cdc_streams() {
// FIXME: support Raft group 0-based topology changes
if (!_joined) {
throw std::runtime_error("check_and_repair_cdc_streams: node not initialized yet");
}
// Start from our own generation and pick the one with the greatest timestamp seen in gossip.
std::optional<cdc::generation_id> latest = _gen_id;
_gossiper.for_each_endpoint_state([&] (const gms::endpoint_state& state) {
auto addr = state.get_host_id();
if (_gossiper.is_left(addr)) {
cdc_log.info("check_and_repair_cdc_streams ignored node {} because it is in LEFT state", addr);
return;
}
if (!_gossiper.is_normal(addr)) {
throw std::runtime_error(fmt::format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
" ({} is in state {})", addr, _gossiper.get_gossip_status(state)));
}
const auto gen_id = get_generation_id_for(addr, state);
if (!latest || (gen_id && get_ts(*gen_id) > get_ts(*latest))) {
latest = gen_id;
}
});
auto tmptr = _token_metadata.get();
auto sys_dist_ks = get_sys_dist_ks();
bool should_regenerate = false;
if (!latest) {
cdc_log.warn("check_and_repair_cdc_streams: no generation observed in gossip");
should_regenerate = true;
} else if (std::holds_alternative<cdc::generation_id_v1>(*latest)
&& _feature_service.cdc_generations_v2) {
cdc_log.info(
"Cluster still using CDC generation storage format V1 (id: {}), even though it already understands the V2 format."
" Creating a new generation using V2.", *latest);
should_regenerate = true;
} else {
cdc_log.info("check_and_repair_cdc_streams: last generation observed in gossip: {}", *latest);
static const auto timeout_msg = "Timeout while fetching CDC topology description";
static const auto topology_read_error_note = "Note: this is likely caused by"
" node(s) being down or unreachable. It is recommended to check the network and"
" restart/remove the failed node(s), then retry checkAndRepairCdcStreams command";
static const auto exception_translating_msg = "Translating the exception to `request_execution_exception`";
std::optional<topology_description> gen;
try {
gen = co_await retrieve_generation_data(*latest, _sys_ks.local(), *sys_dist_ks, { tmptr->count_normal_token_owners() });
} catch (exceptions::request_timeout_exception& e) {
cdc_log.error("{}: \"{}\". {}.", timeout_msg, e.what(), exception_translating_msg);
throw exceptions::request_execution_exception(exceptions::exception_code::READ_TIMEOUT,
format("{}. {}.", timeout_msg, topology_read_error_note));
} catch (exceptions::unavailable_exception& e) {
static const auto unavailable_msg = "Node(s) unavailable while fetching CDC topology description";
cdc_log.error("{}: \"{}\". {}.", unavailable_msg, e.what(), exception_translating_msg);
throw exceptions::request_execution_exception(exceptions::exception_code::UNAVAILABLE,
format("{}. {}.", unavailable_msg, topology_read_error_note));
} catch (...) {
const auto ep = std::current_exception();
if (is_timeout_exception(ep)) {
cdc_log.error("{}: \"{}\". {}.", timeout_msg, ep, exception_translating_msg);
throw exceptions::request_execution_exception(exceptions::exception_code::READ_TIMEOUT,
format("{}. {}.", timeout_msg, topology_read_error_note));
}
// On exotic errors proceed with regeneration
cdc_log.error("Exception while reading CDC topology description: \"{}\". Regenerating streams anyway.", ep);
should_regenerate = true;
}
// NOTE(review): after an exotic read error `gen` is still empty, so the
// "Could not find" message below is logged in that case as well.
if (!gen) {
cdc_log.error(
"Could not find CDC generation with timestamp {} in distributed system tables (current time: {}),"
" even though some node gossiped about it.",
latest, db_clock::now());
should_regenerate = true;
} else if (!is_cdc_generation_optimal(*gen, *tmptr)) {
should_regenerate = true;
cdc_log.info("CDC generation {} needs repair, regenerating", latest);
}
}
if (!should_regenerate) {
// The latest generation is fine; make sure this node uses it too.
if (latest != _gen_id) {
co_await legacy_do_handle_cdc_generation(*latest);
}
cdc_log.info("CDC generation {} does not need repair", latest);
co_return;
}
// No bootstrap tokens; the new generation's timestamp is created with `add_delay` set.
const auto new_gen_id = co_await legacy_make_new_generation({}, true);
// Need to artificially update our STATUS so other nodes handle the generation ID change
// FIXME: after 0e0282cd nodes do not require a STATUS update to react to CDC generation changes.
// The artificial STATUS update here should eventually be removed (in a few releases).
auto status = _gossiper.get_this_endpoint_state_ptr()->get_application_state_ptr(gms::application_state::STATUS);
if (!status) {
cdc_log.error("Our STATUS is missing");
cdc_log.error("Aborting CDC generation repair due to missing STATUS");
co_return;
}
// Update _gen_id first, so that legacy_do_handle_cdc_generation (which will get called due to the status update)
// won't try to update the gossiper, which would result in a deadlock inside add_local_application_state
_gen_id = new_gen_id;
co_await _gossiper.add_local_application_state(
std::pair(gms::application_state::CDC_GENERATION_ID, gms::versioned_value::cdc_generation_id(new_gen_id)),
std::pair(gms::application_state::STATUS, *status)
);
co_await _sys_ks.local().update_cdc_generation_id(new_gen_id);
}
future<> generation_service::handle_cdc_generation(cdc::generation_id_v2 gen_id) {
auto ts = get_ts(gen_id);
if (co_await container().map_reduce(and_reducer(), [ts] (generation_service& svc) {
return !svc._cdc_metadata.prepare(ts);
@@ -429,8 +1024,171 @@ future<> generation_service::handle_cdc_generation(cdc::generation_id gen_id) {
}
}
// Handles a generation ID learned through gossip. Must run on shard 0.
//
// Does nothing for an empty `gen_id` or when `prepare()` returns false on every
// shard. Retryable failures are retried in the background via
// `legacy_async_handle_cdc_generation()`; other failures are logged and dropped.
// When the generation is inserted on some shard, the streams description
// table is updated as well.
future<> generation_service::legacy_handle_cdc_generation(std::optional<cdc::generation_id> gen_id) {
assert_shard_zero(__PRETTY_FUNCTION__);
if (!gen_id) {
co_return;
}
if (!_sys_dist_ks.local_is_initialized() || !_sys_dist_ks.local().started()) {
on_internal_error(cdc_log, "Legacy handle CDC generation with sys.dist.ks. down");
}
// The service should not be listening for generation changes until after the node
// is bootstrapped and since the node leaves the ring on decommission
// The reducer yields true only if `prepare()` returned false on all shards.
if (co_await container().map_reduce(and_reducer(), [ts = get_ts(*gen_id)] (generation_service& svc) {
return !svc._cdc_metadata.prepare(ts);
})) {
co_return;
}
bool using_this_gen = false;
try {
using_this_gen = co_await legacy_do_handle_cdc_generation_intercept_nonfatal_errors(*gen_id);
} catch (generation_handling_nonfatal_exception& e) {
cdc_log.warn(could_not_retrieve_msg_template, gen_id, e.what(), "retrying in the background");
legacy_async_handle_cdc_generation(*gen_id);
co_return;
} catch (...) {
cdc_log.error(could_not_retrieve_msg_template, gen_id, std::current_exception(), "not retrying");
co_return; // Exotic ("fatal") exception => do not retry
}
if (using_this_gen) {
cdc_log.info("Starting to use generation {}", *gen_id);
// The callback captures the `_token_metadata` member by reference and
// re-reads the number of token owners on every call.
co_await update_streams_description(*gen_id, _sys_ks.local(), get_sys_dist_ks(),
[&tm = _token_metadata] { return tm.get()->count_normal_token_owners(); },
_abort_src);
}
}
// Retries handling of `gen_id` in a detached background coroutine.
// Must run on shard 0.
//
// Every 5 seconds the coroutine tries to handle the generation again. It stops
// when handling succeeds, when a non-retryable exception occurs, when every
// shard reports the generation as known or obsolete, or when the sleep is aborted
// through the service's abort source. The service is kept alive by the shared
// pointer passed to the coroutine.
void generation_service::legacy_async_handle_cdc_generation(cdc::generation_id gen_id) {
    assert_shard_zero(__PRETTY_FUNCTION__);
    (void)(([] (cdc::generation_id gen_id, shared_ptr<generation_service> svc) -> future<> {
        while (true) {
            try {
                co_await sleep_abortable(std::chrono::seconds(5), svc->_abort_src);
            } catch (const seastar::sleep_aborted&) {
                // The returned future is discarded, so an escaping exception would only
                // surface as an ignored exceptional future. Finish cleanly instead.
                cdc_log.warn("Aborted background handling of CDC generation {}", gen_id);
                co_return;
            }
            try {
                bool using_this_gen = co_await svc->legacy_do_handle_cdc_generation_intercept_nonfatal_errors(gen_id);
                if (using_this_gen) {
                    cdc_log.info("Starting to use generation {}", gen_id);
                    co_await update_streams_description(gen_id, svc->_sys_ks.local(), svc->get_sys_dist_ks(),
                            [&tm = svc->_token_metadata] { return tm.get()->count_normal_token_owners(); },
                            svc->_abort_src);
                }
                co_return;
            } catch (generation_handling_nonfatal_exception& e) {
                cdc_log.warn(could_not_retrieve_msg_template, gen_id, e.what(), "continuing to retry in the background");
            } catch (...) {
                cdc_log.error(could_not_retrieve_msg_template, gen_id, std::current_exception(), "not retrying anymore");
                co_return; // Exotic ("fatal") exception => do not retry
            }
            // Stop retrying once all shards consider the generation known or obsolete.
            if (co_await svc->container().map_reduce(and_reducer(), [ts = get_ts(gen_id)] (generation_service& svc) {
                return svc._cdc_metadata.known_or_obsolete(ts);
            })) {
                co_return;
            }
        }
    })(gen_id, shared_from_this()));
}
// Scans all endpoint states known to the gossiper, picks the advertised CDC
// generation with the greatest timestamp and handles it. Must run on shard 0.
future<> generation_service::legacy_scan_cdc_generations() {
    assert_shard_zero(__PRETTY_FUNCTION__);

    std::optional<cdc::generation_id> newest;
    _gossiper.for_each_endpoint_state([&newest] (const gms::endpoint_state& state) {
        auto candidate = get_generation_id_for(state.get_host_id(), state);
        const bool replaces_newest = !newest || (candidate && get_ts(*candidate) > get_ts(*newest));
        if (replaces_newest) {
            newest = candidate;
        }
    });

    if (!newest) {
        cdc_log.info("No generation seen during startup.");
        co_return;
    }
    cdc_log.info("Latest generation seen during startup: {}", *newest);
    co_await legacy_handle_cdc_generation(newest);
}
// Runs `legacy_do_handle_cdc_generation()` and converts retryable failures
// (request timeouts, unavailability, read failures and anything recognized by
// `is_timeout_exception()`) into `generation_handling_nonfatal_exception`.
// All other exceptions propagate unchanged. Must run on shard 0.
future<bool> generation_service::legacy_do_handle_cdc_generation_intercept_nonfatal_errors(cdc::generation_id gen_id) {
    assert_shard_zero(__PRETTY_FUNCTION__);

    auto translate_failure = [] (std::exception_ptr failure) -> future<bool> {
        try {
            std::rethrow_exception(failure);
        } catch (exceptions::request_timeout_exception& e) {
            throw generation_handling_nonfatal_exception(e.what());
        } catch (exceptions::unavailable_exception& e) {
            throw generation_handling_nonfatal_exception(e.what());
        } catch (exceptions::read_failure_exception& e) {
            throw generation_handling_nonfatal_exception(e.what());
        } catch (...) {
            const auto unclassified = std::current_exception();
            if (is_timeout_exception(unclassified)) {
                throw generation_handling_nonfatal_exception(format("{}", unclassified));
            }
            throw;
        }
    };

    // Use futurize_invoke to catch all exceptions from legacy_do_handle_cdc_generation.
    return futurize_invoke([this, gen_id] {
        return legacy_do_handle_cdc_generation(gen_id);
    }).handle_exception(std::move(translate_failure));
}
// Reads the data of generation `gen_id`, adopts it as the node's current
// generation if it is newer than the one we hold, and inserts it into the CDC
// metadata of every shard. Must run on shard 0.
//
// Returns true iff the generation was inserted on at least one shard.
// Throws `generation_handling_nonfatal_exception` when the data cannot be found.
future<bool> generation_service::legacy_do_handle_cdc_generation(cdc::generation_id gen_id) {
assert_shard_zero(__PRETTY_FUNCTION__);
auto sys_dist_ks = get_sys_dist_ks();
auto gen = co_await retrieve_generation_data(gen_id, _sys_ks.local(), *sys_dist_ks, { _token_metadata.get()->count_normal_token_owners() });
if (!gen) {
// This may happen during raft upgrade when a node gossips about a generation that
// was propagated through raft and we didn't apply it yet.
throw generation_handling_nonfatal_exception(fmt::format(
"Could not find CDC generation {} in distributed system tables (current time: {}),"
" even though some node gossiped about it.",
gen_id, db_clock::now()));
}
// We always gossip about the generation with the greatest timestamp. Specific nodes may remember older generations,
// but eventually they forget when their clocks move past the latest generation's timestamp.
// The cluster as a whole is only interested in the last generation so restarting nodes may learn what it is.
// We assume that generation changes don't happen ``too often'' so every node can learn about a generation
// before it is superseded by a newer one which causes nodes to start gossiping about the newer one.
// The assumption follows from the requirement of bootstrapping nodes sequentially.
if (!_gen_id || get_ts(*_gen_id) < get_ts(gen_id)) {
// `_gen_id` is updated before the local tables and the gossiper.
_gen_id = gen_id;
co_await _sys_ks.local().update_cdc_generation_id(gen_id);
co_await _gossiper.add_local_application_state(
gms::application_state::CDC_GENERATION_ID, gms::versioned_value::cdc_generation_id(gen_id));
}
// Return `true` iff the generation was inserted on any of our shards.
co_return co_await container().map_reduce(or_reducer(),
[ts = get_ts(gen_id), &gen] (generation_service& svc) -> future<bool> {
// We need to copy it here before awaiting anything to avoid destruction of the captures.
const auto timestamp = ts;
// Each shard receives its own copy of the generation description.
topology_description gen_copy = co_await gen->clone_async();
co_return svc._cdc_metadata.insert(timestamp, std::move(gen_copy));
});
}
// Returns a shared pointer to this shard's system distributed keyspace instance.
// Must run on shard 0. Throws std::runtime_error when the local instance
// has not been initialized.
shared_ptr<db::system_distributed_keyspace> generation_service::get_sys_dist_ks() {
    assert_shard_zero(__PRETTY_FUNCTION__);
    if (_sys_dist_ks.local_is_initialized()) {
        return _sys_dist_ks.local_shared();
    }
    throw std::runtime_error("system distributed keyspace not initialized");
}
// Returns the timestamp of a generation ID. `generation_id` is a variant of the
// V1 and V2 id types, both of which carry a `ts` member, so the active
// alternative is visited. (A direct `gen_id.ts` access does not exist on the variant.)
db_clock::time_point get_ts(const generation_id& gen_id) {
    return std::visit([] (auto& id) { return id.ts; }, gen_id);
}
future<mutation> create_table_streams_mutation(table_id table, db_clock::time_point stream_ts, const locator::tablet_map& map, api::timestamp_type ts) {

View File

@@ -34,6 +34,16 @@ namespace seastar {
class abort_source;
} // namespace seastar
// Forward declarations of types owned by other subsystems.
// NOTE(review): presumably declared here so this header can name these types
// without including their full definitions — confirm against the includes.
namespace db {
class config;
class system_distributed_keyspace;
} // namespace db
namespace gms {
class inet_address;
class gossiper;
} // namespace gms
namespace locator {
class tablet_map;
} // namespace locator
@@ -143,6 +153,23 @@ struct cdc_stream_diff {
// Ordered map from an api::timestamp_type key to a committed_stream_set.
// NOTE(review): the key is presumably the timestamp at which the stream set takes effect — confirm.
using table_streams = std::map<api::timestamp_type, committed_stream_set>;
class no_generation_data_exception : public std::runtime_error {
public:
no_generation_data_exception(cdc::generation_id generation_ts)
: std::runtime_error(fmt::format("could not find generation data for timestamp {}", generation_ts))
{}
};
/* Should be called when we're restarting and we noticed that we didn't save any streams timestamp in our local tables,
* which means that we're probably upgrading from a non-CDC/old CDC version (another reason could be
* that there's a bug, or the user messed with our local tables).
*
* It checks whether we should be the node to propose the first generation of CDC streams.
* The chosen condition is arbitrary, it only tries to make sure that no two nodes propose a generation of streams
* when upgrading, and nothing bad happens if they for some reason do (it's mostly an optimization).
*/
bool should_propose_first_generation(const locator::host_id& me, const gms::gossiper&);
/*
* Checks if the CDC generation is optimal, which is true if its `topology_description` is consistent
* with `token_metadata`.

View File

@@ -15,22 +15,48 @@
namespace cdc {
struct generation_id_v1 {
db_clock::time_point ts;
bool operator==(const generation_id_v1&) const = default;
};
struct generation_id {
struct generation_id_v2 {
db_clock::time_point ts;
utils::UUID id;
bool operator==(const generation_id&) const = default;
bool operator==(const generation_id_v2&) const = default;
};
using generation_id = std::variant<generation_id_v1, generation_id_v2>;
db_clock::time_point get_ts(const generation_id&);
} // namespace cdc
// Formats a cdc::generation_id_v1 as its timestamp. No format-spec options are parsed.
template <>
struct fmt::formatter<cdc::generation_id_v1> {
    constexpr auto parse(format_parse_context& parse_ctx) {
        return parse_ctx.begin();
    }

    template <typename FormatContext>
    auto format(const cdc::generation_id_v1& gen_id, FormatContext& ctx) const {
        auto out = ctx.out();
        return fmt::format_to(out, "{}", gen_id.ts);
    }
};
// Formats a cdc::generation_id_v2 as "(<timestamp>, <uuid>)". No format-spec options are parsed.
template <>
struct fmt::formatter<cdc::generation_id_v2> {
    constexpr auto parse(format_parse_context& parse_ctx) {
        return parse_ctx.begin();
    }

    template <typename FormatContext>
    auto format(const cdc::generation_id_v2& gen_id, FormatContext& ctx) const {
        auto out = ctx.out();
        return fmt::format_to(out, "({}, {})", gen_id.ts, gen_id.id);
    }
};
// Formats a cdc::generation_id (a std::variant) by delegating to the formatter
// of whichever alternative it currently holds. No format-spec options are parsed.
template <>
struct fmt::formatter<cdc::generation_id> {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

    template <typename FormatContext>
    auto format(const cdc::generation_id& gen_id, FormatContext& ctx) const {
        // The variant itself has no `ts`/`id` members; dispatch on the held alternative.
        return std::visit([&ctx] (const auto& id) {
            return fmt::format_to(ctx.out(), "{}", id);
        }, gen_id);
    }
};

Some files were not shown because too many files have changed in this diff Show More