Compare commits

..

1 Commits

Author SHA1 Message Date
Dani Tweig
42629175b5 Update urgent_issue_reminder.yml - run daily
The action will run daily, alerting about urgent issues not touched in the last 7 days.
2025-08-20 14:49:33 +03:00
1138 changed files with 18134 additions and 57048 deletions

View File

@@ -47,29 +47,13 @@ def create_pull_request(repo, new_branch_name, base_branch_name, pr, backport_pr
draft=is_draft
)
logging.info(f"Pull request created: {backport_pr.html_url}")
labels_to_add = []
priority_labels = {"P0", "P1"}
parent_pr_labels = [label.name for label in pr.labels]
for label in priority_labels:
if label in parent_pr_labels:
labels_to_add.append(label)
labels_to_add.append("force_on_cloud")
logging.info(f"Adding {label} and force_on_cloud labels from parent PR to backport PR")
break # Only apply the highest priority label
if is_collaborator:
backport_pr.add_to_assignees(pr.user)
if is_draft:
labels_to_add.append("conflicts")
backport_pr.add_to_labels("conflicts")
pr_comment = f"@{pr.user.login} - This PR was marked as draft because it has conflicts\n"
pr_comment += "Please resolve them and mark this PR as ready for review"
backport_pr.create_issue_comment(pr_comment)
# Apply all labels at once if we have any
if labels_to_add:
backport_pr.add_to_labels(*labels_to_add)
logging.info(f"Added labels to backport PR: {labels_to_add}")
logging.info(f"Assigned PR to original author: {pr.user}")
return backport_pr
except GithubException as e:

View File

@@ -30,13 +30,8 @@ def copy_labels_from_linked_issues(repo, pr_number):
try:
issue = repo.get_issue(int(issue_number))
for label in issue.labels:
# Copy ALL labels from issues to PR when PR is opened
pr.add_to_labels(label.name)
print(f"Copied label '{label.name}' from issue #{issue_number} to PR #{pr_number}")
if label.name in ['P0', 'P1']:
pr.add_to_labels('force_on_cloud')
print(f"Added force_on_cloud label to PR #{pr_number} due to {label.name} label")
print(f"All labels from issue #{issue_number} copied to PR #{pr_number}")
print(f"Labels from issue #{issue_number} copied to PR #{pr_number}")
except Exception as e:
print(f"Error processing issue #{issue_number}: {e}")
@@ -79,22 +74,9 @@ def sync_labels(repo, number, label, action, is_issue=False):
target = repo.get_issue(int(pr_or_issue_number))
if action == 'labeled':
target.add_to_labels(label)
if label in ['P0', 'P1'] and is_issue:
# Only add force_on_cloud to PRs when P0/P1 is added to an issue
target.add_to_labels('force_on_cloud')
print(f"Added 'force_on_cloud' label to PR #{pr_or_issue_number} due to {label} label")
print(f"Label '{label}' successfully added.")
elif action == 'unlabeled':
target.remove_from_labels(label)
if label in ['P0', 'P1'] and is_issue:
# Check if any other P0/P1 labels remain before removing force_on_cloud
remaining_priority_labels = [l.name for l in target.labels if l.name in ['P0', 'P1']]
if not remaining_priority_labels:
try:
target.remove_from_labels('force_on_cloud')
print(f"Removed 'force_on_cloud' label from PR #{pr_or_issue_number} as no P0/P1 labels remain")
except Exception as e:
print(f"Warning: Could not remove force_on_cloud label: {e}")
print(f"Label '{label}' successfully removed.")
elif action == 'opened':
copy_labels_from_linked_issues(repo, number)

View File

@@ -54,13 +54,10 @@ jobs:
GITHUB_TOKEN: ${{ secrets.AUTO_BACKPORT_TOKEN }}
run: python .github/scripts/auto-backport.py --repo ${{ github.repository }} --base-branch ${{ github.ref }} --commits ${{ github.event.before }}..${{ github.sha }}
- name: Check if a valid backport label exists and no backport_error
env:
LABELS_JSON: ${{ toJson(github.event.pull_request.labels) }}
id: check_label
run: |
labels_json="$LABELS_JSON"
echo "Checking labels:"
echo "$labels_json" | jq -r '.[].name'
labels_json='${{ toJson(github.event.pull_request.labels) }}'
echo "Checking labels: $(echo "$labels_json" | jq -r '.[].name')"
# Check if a valid backport label exists
if echo "$labels_json" | jq -e 'any(.[] | .name; test("backport/[0-9]+\\.[0-9]+$"))' > /dev/null; then

View File

@@ -1,12 +1,11 @@
name: Call Jira Status In Progress
on:
pull_request_target:
pull_request:
types: [opened]
jobs:
call-jira-status-in-progress:
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_progress.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,12 +1,11 @@
name: Call Jira Status In Review
on:
pull_request_target:
pull_request:
types: [ready_for_review, review_requested]
jobs:
call-jira-status-in-review:
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_review.yml@main
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -1,12 +1,13 @@
name: Call Jira Status Ready For Merge
on:
pull_request_target:
pull_request:
types: [labeled]
jobs:
call-jira-status-update:
uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_ready_for_merge.yml@main
with:
label_name: 'status/merge_candidate'
secrets:
caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -37,13 +37,13 @@ jobs:
run: python .github/scripts/sync_labels.py --repo ${{ github.repository }} --number ${{ github.event.number }} --action ${{ github.event.action }}
- name: Pull request labeled or unlabeled event
if: github.event_name == 'pull_request_target' && (startsWith(github.event.label.name, 'backport/') || github.event.label.name == 'P0' || github.event.label.name == 'P1')
if: github.event_name == 'pull_request_target' && startsWith(github.event.label.name, 'backport/')
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: python .github/scripts/sync_labels.py --repo ${{ github.repository }} --number ${{ github.event.number }} --action ${{ github.event.action }} --label ${{ github.event.label.name }}
- name: Issue labeled or unlabeled event
if: github.event_name == 'issues' && (startsWith(github.event.label.name, 'backport/') || github.event.label.name == 'P0' || github.event.label.name == 'P1')
if: github.event_name == 'issues' && startsWith(github.event.label.name, 'backport/')
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: python .github/scripts/sync_labels.py --repo ${{ github.repository }} --number ${{ github.event.issue.number }} --action ${{ github.event.action }} --is_issue --label ${{ github.event.label.name }}

View File

@@ -1,21 +0,0 @@
name: Trigger Scylla CI Route
on:
issue_comment:
types: [created]
jobs:
trigger-jenkins:
if: github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')
runs-on: ubuntu-latest
steps:
- name: Trigger Scylla-CI-Route Jenkins Job
env:
JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
JENKINS_URL: "https://jenkins.scylladb.com"
run: |
PR_NUMBER=${{ github.event.issue.number }}
PR_REPO_NAME=${{ github.event.repository.full_name }}
curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
--user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v

1
.gitignore vendored
View File

@@ -37,4 +37,3 @@ clang_build
.idea/
nuke
rust/target

View File

@@ -49,7 +49,7 @@ include(limit_jobs)
set(CMAKE_CXX_STANDARD "23" CACHE INTERNAL "")
set(CMAKE_CXX_EXTENSIONS ON CACHE INTERNAL "")
set(CMAKE_CXX_SCAN_FOR_MODULES OFF CACHE INTERNAL "")
set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
if(is_multi_config)
find_package(Seastar)
@@ -90,13 +90,13 @@ if(is_multi_config)
add_dependencies(Seastar::seastar_testing Seastar)
else()
set(Seastar_TESTING ON CACHE BOOL "" FORCE)
set(Seastar_API_LEVEL 9 CACHE STRING "" FORCE)
set(Seastar_API_LEVEL 7 CACHE STRING "" FORCE)
set(Seastar_DEPRECATED_OSTREAM_FORMATTERS OFF CACHE BOOL "" FORCE)
set(Seastar_APPS ON CACHE BOOL "" FORCE)
set(Seastar_EXCLUDE_APPS_FROM_ALL ON CACHE BOOL "" FORCE)
set(Seastar_EXCLUDE_TESTS_FROM_ALL ON CACHE BOOL "" FORCE)
set(Seastar_IO_URING ON CACHE BOOL "" FORCE)
set(Seastar_SCHEDULING_GROUPS_COUNT 21 CACHE STRING "" FORCE)
set(Seastar_SCHEDULING_GROUPS_COUNT 19 CACHE STRING "" FORCE)
set(Seastar_UNUSED_RESULT_ERROR ON CACHE BOOL "" FORCE)
add_subdirectory(seastar)
target_compile_definitions (seastar
@@ -170,21 +170,30 @@ target_sources(scylla-main
bytes.cc
client_data.cc
clocks-impl.cc
collection_mutation.cc
converting_mutation_partition_applier.cc
counters.cc
sstable_dict_autotrainer.cc
duration.cc
exceptions/exceptions.cc
frozen_schema.cc
generic_server.cc
debug.cc
init.cc
keys/keys.cc
multishard_mutation_query.cc
mutation_query.cc
node_ops/task_manager_module.cc
partition_slice_builder.cc
query/query.cc
querier.cc
query.cc
query_ranges_to_vnodes.cc
query/query-result-set.cc
query-result-set.cc
tombstone_gc_options.cc
tombstone_gc.cc
reader_concurrency_semaphore.cc
reader_concurrency_semaphore_group.cc
schema_mutations.cc
serializer.cc
service/direct_failure_detector/failure_detector.cc
sstables_loader.cc
@@ -262,6 +271,7 @@ add_subdirectory(mutation)
add_subdirectory(mutation_writer)
add_subdirectory(node_ops)
add_subdirectory(readers)
add_subdirectory(redis)
add_subdirectory(replica)
add_subdirectory(raft)
add_subdirectory(repair)
@@ -276,7 +286,6 @@ add_subdirectory(tracing)
add_subdirectory(transport)
add_subdirectory(types)
add_subdirectory(utils)
add_subdirectory(vector_search)
add_version_library(scylla_version
release.cc)
@@ -306,6 +315,7 @@ set(scylla_libs
mutation_writer
raft
readers
redis
repair
replica
schema
@@ -318,8 +328,7 @@ set(scylla_libs
tracing
transport
types
utils
vector_search)
utils)
target_link_libraries(scylla PRIVATE
${scylla_libs})

View File

@@ -12,7 +12,7 @@ Please use the [issue tracker](https://github.com/scylladb/scylla/issues/) to re
## Contributing code to Scylla
Before you can contribute code to Scylla for the first time, you should sign the [Contributor License Agreement](https://www.scylladb.com/open-source/contributor-agreement/) and send the signed form to cla@scylladb.com. You can then submit your changes as patches to the [scylladb-dev mailing list](https://groups.google.com/forum/#!forum/scylladb-dev) or as a pull request to the [Scylla project on github](https://github.com/scylladb/scylla).
Before you can contribute code to Scylla for the first time, you should sign the [Contributor License Agreement](https://www.scylladb.com/open-source/contributor-agreement/) and send the signed form cla@scylladb.com. You can then submit your changes as patches to the [scylladb-dev mailing list](https://groups.google.com/forum/#!forum/scylladb-dev) or as a pull request to the [Scylla project on github](https://github.com/scylladb/scylla).
If you need help formatting or sending patches, [check out these instructions](https://github.com/scylladb/scylla/wiki/Formatting-and-sending-patches).
The Scylla C++ source code uses the [Seastar coding style](https://github.com/scylladb/seastar/blob/master/coding-style.md) so please adhere to that in your patches. Note that Scylla code is written with `using namespace seastar`, so should not explicitly add the `seastar::` prefix to Seastar symbols. You will usually not need to add `using namespace seastar` to new source files, because most Scylla header files have `#include "seastarx.hh"`, which does this.

View File

@@ -43,7 +43,7 @@ $ ./tools/toolchain/dbuild ninja build/release/scylla
$ ./tools/toolchain/dbuild ./build/release/scylla --developer-mode 1
```
Note: do not mix environments - either perform all your work with dbuild, or natively on the host.
Note: do not mix environemtns - either perform all your work with dbuild, or natively on the host.
Note2: you can get to an interactive shell within dbuild by running it without any parameters:
```bash
$ ./tools/toolchain/dbuild
@@ -91,7 +91,7 @@ You can also specify a single mode. For example
$ ninja-build release
```
Will build everything in release mode. The valid modes are
Will build everytihng in release mode. The valid modes are
* Debug: Enables [AddressSanitizer](https://github.com/google/sanitizers/wiki/AddressSanitizer)
and other sanity checks. It has no optimizations, which allows for debugging with tools like
@@ -361,7 +361,7 @@ avoid that the gold linker can be told to create an index with
More info at https://gcc.gnu.org/wiki/DebugFission.
Both options can be enabled by passing `--split-dwarf` to configure.py.
Both options can be enable by passing `--split-dwarf` to configure.py.
Note that distcc is *not* compatible with it, but icecream
(https://github.com/icecc/icecream) is.
@@ -370,7 +370,7 @@ Note that distcc is *not* compatible with it, but icecream
Sometimes Scylla development is closely tied with a feature being developed in Seastar. It can be useful to compile Scylla with a particular check-out of Seastar.
One way to do this is to create a local remote for the Seastar submodule in the Scylla repository:
One way to do this it to create a local remote for the Seastar submodule in the Scylla repository:
```bash
$ cd $HOME/src/scylla

View File

@@ -18,7 +18,7 @@ Scylla is fairly fussy about its build environment, requiring very recent
versions of the C++23 compiler and of many libraries to build. The document
[HACKING.md](HACKING.md) includes detailed information on building and
developing Scylla, but to get Scylla building quickly on (almost) any build
machine, Scylla offers a [frozen toolchain](tools/toolchain/README.md).
machine, Scylla offers a [frozen toolchain](tools/toolchain/README.md),
This is a pre-configured Docker image which includes recent versions of all
the required compilers, libraries and build tools. Using the frozen toolchain
allows you to avoid changing anything in your build machine to meet Scylla's

View File

@@ -78,7 +78,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=2026.1.0-dev
VERSION=2025.4.0-dev
if test -f version
then

View File

@@ -17,7 +17,6 @@ target_sources(alternator
streams.cc
consumed_capacity.cc
ttl.cc
parsed_expression_cache.cc
${cql_grammar_srcs})
target_include_directories(alternator
PUBLIC

View File

@@ -11,6 +11,7 @@
#include "utils/log.hh"
#include <string>
#include <string_view>
#include "bytes.hh"
#include "alternator/auth.hh"
#include <fmt/format.h>
#include "auth/password_authenticator.hh"

View File

@@ -136,7 +136,6 @@ future<> controller::start_server() {
[this, addr, alternator_port, alternator_https_port, creds = std::move(creds)] (server& server) mutable {
return server.init(addr, alternator_port, alternator_https_port, creds,
_config.alternator_enforce_authorization,
_config.alternator_max_users_query_size_in_trace_output,
&_memory_limiter.local().get_semaphore(),
_config.max_concurrent_requests_per_shard);
}).handle_exception([this, addr, alternator_port, alternator_https_port] (std::exception_ptr ep) {

View File

@@ -11,7 +11,7 @@
#include <seastar/core/sharded.hh>
#include <seastar/core/smp.hh>
#include "transport/protocol_server.hh"
#include "protocol_server.hh"
namespace service {
class storage_proxy;

View File

@@ -94,9 +94,6 @@ public:
static api_error internal(std::string msg) {
return api_error("InternalServerError", std::move(msg), http::reply::status_type::internal_server_error);
}
static api_error payload_too_large(std::string msg) {
return api_error("PayloadTooLarge", std::move(msg), status_type::payload_too_large);
}
// Provide the "std::exception" interface, to make it easier to print this
// exception in log messages. Note that this function is *not* used to

File diff suppressed because it is too large Load Diff

View File

@@ -58,6 +58,10 @@ namespace alternator {
class rmw_operation;
namespace parsed {
class path;
};
schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request);
bool is_alternator_keyspace(const sstring& ks_name);
// Wraps the db::get_tags_of_table and throws if the table is missing the tags extension.
@@ -128,9 +132,6 @@ using attrs_to_get_node = attribute_path_map_node<std::monostate>;
// optional means we should get all attributes, not specific ones.
using attrs_to_get = attribute_path_map<std::monostate>;
namespace parsed {
class expression_cache;
}
class executor : public peering_sharded_service<executor> {
gms::gossiper& _gossiper;
@@ -143,8 +144,6 @@ class executor : public peering_sharded_service<executor> {
// forwarding Alternator request between shards - if necessary for LWT.
smp_service_group _ssg;
std::unique_ptr<parsed::expression_cache> _parsed_expression_cache;
public:
using client_state = service::client_state;
// request_return_type is the return type of the executor methods, which
@@ -175,7 +174,6 @@ public:
cdc::metadata& cdc_metadata,
smp_service_group ssg,
utils::updateable_value<uint32_t> default_timeout_in_ms);
~executor();
future<request_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
future<request_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
@@ -203,7 +201,11 @@ public:
future<request_return_type> describe_continuous_backups(client_state& client_state, service_permit permit, rjson::value request);
future<> start();
future<> stop();
future<> stop() {
// disconnect from the value source, but keep the value unchanged.
s_default_timeout_in_ms = utils::updateable_value<uint32_t>{s_default_timeout_in_ms()};
return make_ready_future<>();
}
static sstring table_name(const schema&);
static db::timeout_clock::time_point default_timeout();
@@ -216,10 +218,10 @@ public:
private:
friend class rmw_operation;
static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr, const std::map<sstring, sstring> *tags = nullptr);
static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr);
public:
static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&, const std::map<sstring, sstring> *tags = nullptr);
static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&);
static std::optional<rjson::value> describe_single_item(schema_ptr,
const query::partition_slice&,
@@ -228,15 +230,12 @@ public:
const std::optional<attrs_to_get>&,
uint64_t* = nullptr);
// Converts a multi-row selection result to JSON compatible with DynamoDB.
// For each row, this method calls item_callback, which takes the size of
// the item as the parameter.
static future<std::vector<rjson::value>> describe_multi_item(schema_ptr schema,
const query::partition_slice&& slice,
shared_ptr<cql3::selection::selection> selection,
foreign_ptr<lw_shared_ptr<query::result>> query_result,
shared_ptr<const std::optional<attrs_to_get>> attrs_to_get,
noncopyable_function<void(uint64_t)> item_callback = {});
uint64_t& rcu_half_units);
static void describe_single_item(const cql3::selection::selection&,
const std::vector<managed_bytes_opt>&,

View File

@@ -91,18 +91,6 @@ options {
throw expressions_syntax_error(format("{} at char {}", err,
ex->get_charPositionInLine()));
}
// ANTLR3 tries to recover missing tokens - it tries to finish parsing
// and create valid objects, as if the missing token was there.
// But it has a bug and leaks these tokens.
// We override offending method and handle abandoned pointers.
std::vector<std::unique_ptr<TokenType>> _missing_tokens;
TokenType* getMissingSymbol(IntStreamType* istream, ExceptionBaseType* e,
ANTLR_UINT32 expectedTokenType, BitsetListType* follow) {
auto token = BaseType::getMissingSymbol(istream, e, expectedTokenType, follow);
_missing_tokens.emplace_back(token);
return token;
}
}
@lexer::context {
void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
@@ -196,13 +184,7 @@ path_component: NAME | NAMEREF;
path returns [parsed::path p]:
root=path_component { $p.set_root($root.text); }
( '.' name=path_component { $p.add_dot($name.text); }
| '[' INTEGER ']' {
try {
$p.add_index(std::stoi($INTEGER.text));
} catch(std::out_of_range&) {
throw expressions_syntax_error("list index out of integer range");
}
}
| '[' INTEGER ']' { $p.add_index(std::stoi($INTEGER.text)); }
)*;
/* See comment above why the "depth" counter was needed here */
@@ -248,7 +230,7 @@ update_expression_clause returns [parsed::update_expression e]:
// Note the "EOF" token at the end of the update expression. We want to the
// parser to match the entire string given to it - not just its beginning!
update_expression returns [parsed::update_expression e]:
(update_expression_clause { e.append($update_expression_clause.e); })+ EOF;
(update_expression_clause { e.append($update_expression_clause.e); })* EOF;
projection_expression returns [std::vector<parsed::path> v]:
p=path { $v.push_back(std::move($p.p)); }
@@ -275,13 +257,6 @@ primitive_condition returns [parsed::primitive_condition c]:
(',' v=value[0] { $c.add_value(std::move($v.v)); })*
')'
)?
{
// Post-parse check to reject non-function single values
if ($c._op == parsed::primitive_condition::type::VALUE &&
!$c._values.front().is_func()) {
throw expressions_syntax_error("Single value must be a function");
}
}
;
// The following rules for parsing boolean expressions are verbose and

View File

@@ -18,8 +18,6 @@
#include "expressions_types.hh"
#include "utils/rjson.hh"
#include "utils/updateable_value.hh"
#include "stats.hh"
namespace alternator {
@@ -28,26 +26,6 @@ public:
using runtime_error::runtime_error;
};
namespace parsed {
class expression_cache_impl;
class expression_cache {
std::unique_ptr<expression_cache_impl> _impl;
public:
struct config {
utils::updateable_value<uint32_t> max_cache_entries;
};
expression_cache(config cfg, stats& stats);
~expression_cache();
// stop background tasks, if any
future<> stop();
update_expression parse_update_expression(std::string_view query);
std::vector<path> parse_projection_expression(std::string_view query);
condition_expression parse_condition_expression(std::string_view query, const char* caller);
};
} // namespace parsed
// Preferably use parsed::expression_cache instance instead of this free functions.
parsed::update_expression parse_update_expression(std::string_view query);
std::vector<parsed::path> parse_projection_expression(std::string_view query);
parsed::condition_expression parse_condition_expression(std::string_view query, const char* caller);

View File

@@ -209,7 +209,9 @@ public:
// function is supported).
// 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
// 3. N-ary operator - v1 IN ( v2, v3, ... )
// 4. A single function call (attribute_exists etc.).
// 4. A single function call (attribute_exists etc.). The parser actually
// accepts a more general "value" here but later stages reject a value
// which is not a function call (because DynamoDB does it too).
class primitive_condition {
public:
enum class type {

View File

@@ -13,7 +13,7 @@
#include "utils/rjson.hh"
#include "serialization.hh"
#include "schema/column_computation.hh"
#include "column_computation.hh"
#include "db/view/regular_column_transformation.hh"
namespace alternator {

View File

@@ -1,109 +0,0 @@
/*
* Copyright 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "expressions.hh"
#include "utils/log.hh"
#include "utils/lru_string_map.hh"
#include <variant>
static logging::logger logger_("parsed-expression-cache");
namespace alternator::parsed {
struct expression_cache_impl {
stats& _stats;
using cached_expressions_types = std::variant<
update_expression,
condition_expression,
std::vector<path>
>;
sized_lru_string_map<cached_expressions_types> _cached_entries;
utils::observable<uint32_t>::observer _max_cache_entries_observer;
expression_cache_impl(expression_cache::config cfg, stats& stats);
// to define the specialized return type of `get_or_create()`
template <typename Func, typename... Args>
using ParseResult = std::invoke_result_t<Func, std::string_view, Args...>;
// Caching layer for parsed expressions
// The expression type is determined by the type of the parsing function passed as a parameter,
// and the return type is exactly the same as the return type of this parsing function.
// StatsType is used only to update appropriate statistics - currently it is aligned with the expression type,
// but it could be extended in the future if needed, e.g. split per operation.
template <stats::expression_types StatsType, typename Func, typename... Args>
ParseResult<Func, Args...> get_or_create(std::string_view query, Func&& parse_func, Args&&... other_args) {
if (_cached_entries.disabled()) {
return parse_func(query, std::forward<Args>(other_args)...);
}
if (!_cached_entries.sanity_check()) {
_stats.expression_cache.requests[StatsType].misses++;
return parse_func(query, std::forward<Args>(other_args)...);
}
auto value = _cached_entries.find(query);
if (value) {
logger_.trace("Cache hit for query: {}", query);
_stats.expression_cache.requests[StatsType].hits++;
try {
return std::get<ParseResult<Func, Args...>>(value->get());
} catch (const std::bad_variant_access&) {
// User can reach this code, by sending the same query string as a different expression type.
// In practice valid queries are different enough to not collide.
// Entries in cache are only valid queries.
// This request will fail at parsing below.
// If, by any chance this is a valid query, it will be updated below with the new value.
logger_.trace("Cache hit for '{}', but type mismatch.", query);
_stats.expression_cache.requests[StatsType].hits--;
}
} else {
logger_.trace("Cache miss for query: {}", query);
}
ParseResult<Func, Args...> expr = parse_func(query, std::forward<Args>(other_args)...);
// Invalid query will throw here ^
_stats.expression_cache.requests[StatsType].misses++;
if (value) [[unlikely]] {
value->get() = cached_expressions_types{expr};
} else {
_cached_entries.insert(query, cached_expressions_types{expr});
}
return expr;
}
};
expression_cache_impl::expression_cache_impl(expression_cache::config cfg, stats& stats) :
_stats(stats), _cached_entries(logger_, _stats.expression_cache.evictions),
_max_cache_entries_observer(cfg.max_cache_entries.observe([this] (uint32_t max_value) {
_cached_entries.set_max_size(max_value);
})) {
_cached_entries.set_max_size(cfg.max_cache_entries());
}
expression_cache::expression_cache(expression_cache::config cfg, stats& stats) :
_impl(std::make_unique<expression_cache_impl>(std::move(cfg), stats)) {
}
expression_cache::~expression_cache() = default;
future<> expression_cache::stop() {
return _impl->_cached_entries.stop();
}
update_expression expression_cache::parse_update_expression(std::string_view query) {
return _impl->get_or_create<stats::expression_types::UPDATE_EXPRESSION>(query, alternator::parse_update_expression);
}
std::vector<path> expression_cache::parse_projection_expression(std::string_view query) {
return _impl->get_or_create<stats::expression_types::PROJECTION_EXPRESSION>(query, alternator::parse_projection_expression);
}
condition_expression expression_cache::parse_condition_expression(std::string_view query, const char* caller) {
return _impl->get_or_create<stats::expression_types::CONDITION_EXPRESSION>(query, alternator::parse_condition_expression, caller);
}
} // namespace alternator::parsed

View File

@@ -8,8 +8,6 @@
#pragma once
#include "cdc/cdc_options.hh"
#include "cdc/log.hh"
#include "seastarx.hh"
#include "service/paxos/cas_request.hh"
#include "service/cas_shard.hh"
@@ -58,7 +56,7 @@ public:
static write_isolation get_write_isolation_for_schema(schema_ptr schema);
static write_isolation default_write_isolation;
public:
static void set_default_write_isolation(std::string_view mode);
protected:
@@ -109,11 +107,10 @@ public:
// violating this). We mark apply() "const" to let the compiler validate
// this for us. The output-only field _return_attributes is marked
// "mutable" above so that apply() can still write to it.
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const = 0;
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
// Convert the above apply() into the signature needed by cas_request:
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) override;
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override;
virtual ~rmw_operation() = default;
const wcu_consumed_capacity_counter& consumed_capacity() const noexcept { return _consumed_capacity; }
schema_ptr schema() const { return _schema; }
const rjson::value& request() const { return _request; }
rjson::value&& move_request() && { return std::move(_request); }
@@ -127,9 +124,6 @@ public:
stats& per_table_stats,
uint64_t& wcu_total);
std::optional<service::cas_shard> shard_for_execute(bool needs_read_before_write);
private:
inline bool should_fill_preimage() const { return _schema->cdc_options().enabled(); }
};
} // namespace alternator

View File

@@ -11,8 +11,8 @@
#include "utils/log.hh"
#include "serialization.hh"
#include "error.hh"
#include "types/concrete_types.hh"
#include "types/json_utils.hh"
#include "concrete_types.hh"
#include "cql3/type_json.hh"
#include "mutation/position_in_partition.hh"
static logging::logger slogger("alternator-serialization");

View File

@@ -100,13 +100,6 @@ static void handle_CORS(const request& req, reply& rep, bool preflight) {
// the user directly. Other exceptions are unexpected, and reported as
// Internal Server Error.
class api_handler : public handler_base {
// Although the the DynamoDB API responses are JSON, additional
// conventions apply to these responses. For this reason, DynamoDB uses
// the content type "application/x-amz-json-1.0" instead of the standard
// "application/json". Some other AWS services use later versions instead
// of "1.0", but DynamoDB currently uses "1.0". Note that this content
// type applies to all replies, both success and error.
static constexpr const char* REPLY_CONTENT_TYPE = "application/x-amz-json-1.0";
public:
api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle) : _f_handle(
[this, _handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
@@ -135,10 +128,13 @@ public:
// Note that despite the move, there is a copy here -
// as str is std::string and rep->_content is sstring.
rep->_content = std::move(str);
rep->set_content_type(REPLY_CONTENT_TYPE);
},
[&] (executor::body_writer&& body_writer) {
rep->write_body(REPLY_CONTENT_TYPE, std::move(body_writer));
// Unfortunately, write_body() forces us to choose
// from a fixed and irrelevant list of "mime-types"
// at this point. But we'll override it with the
// correct one (application/x-amz-json-1.0) below.
rep->write_body("json", std::move(body_writer));
},
[&] (const api_error& err) {
generate_error_reply(*rep, err);
@@ -155,6 +151,7 @@ public:
handle_CORS(*req, *rep, false);
return _f_handle(std::move(req), std::move(rep)).then(
[](std::unique_ptr<reply> rep) {
rep->set_mime_type("application/x-amz-json-1.0");
rep->done();
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
});
@@ -170,7 +167,6 @@ protected:
rjson::add(results, "message", err._msg);
rep._content = rjson::print(std::move(results));
rep._status = err._http_code;
rep.set_content_type(REPLY_CONTENT_TYPE);
slogger.trace("api_handler error case: {}", rep._content);
}
@@ -378,82 +374,35 @@ static tracing::trace_state_ptr create_tracing_session(tracing::tracing& tracing
return tracing_instance.create_session(tracing::trace_type::QUERY, props);
}
// A helper class to represent a potentially truncated view of a chunked_content.
// If the content is short enough and single chunked, it just holds a view into the content.
// Otherwise it will be copied into an internal buffer, possibly truncated (depending on maximum allowed size passed in),
// and the view will point into that buffer.
// `as_view()` method will return the view.
// `take_as_sstring()` will either move out the internal buffer (if any), or create a new sstring from the view.
// You should consider `as_view()` valid as long both the original chunked_content and the truncated_content object are alive.
class truncated_content {
std::string_view _view;
sstring _content_maybe;
void copy_from_content(const chunked_content& content) {
size_t offset = 0;
for(auto &tmp : content) {
size_t to_copy = std::min(tmp.size(), _content_maybe.size() - offset);
std::copy(tmp.get(), tmp.get() + to_copy, _content_maybe.data() + offset);
offset += to_copy;
if (offset >= _content_maybe.size()) {
break;
}
}
// truncated_content_view() prints a potentially long chunked_content for
// debugging purposes. In the common case when the content is not excessively
// long, it just returns a view into the given content, without any copying.
// But when the content is very long, it is truncated after some arbitrary
// max_len (or one chunk, whichever comes first), with "<truncated>" added at
// the end. To do this modification to the string, we need to create a new
// std::string, so the caller must pass us a reference to one, "buf", where
// we can store the content. The returned view is only alive for as long this
// buf is kept alive.
static std::string_view truncated_content_view(const chunked_content& content, std::string& buf) {
constexpr size_t max_len = 1024;
if (content.empty()) {
return std::string_view();
} else if (content.size() == 1 && content.begin()->size() <= max_len) {
return std::string_view(content.begin()->get(), content.begin()->size());
} else {
buf = std::string(content.begin()->get(), std::min(content.begin()->size(), max_len)) + "<truncated>";
return std::string_view(buf);
}
public:
truncated_content(const chunked_content& content, size_t max_len = std::numeric_limits<size_t>::max()) {
if (content.empty()) return;
if (content.size() == 1 && content.begin()->size() <= max_len) {
_view = std::string_view(content.begin()->get(), content.begin()->size());
return;
}
constexpr std::string_view truncated_text = "<truncated>";
size_t content_size = 0;
for(auto &tmp : content) {
content_size += tmp.size();
}
if (content_size <= max_len) {
_content_maybe = sstring{ sstring::initialized_later{}, content_size };
copy_from_content(content);
}
else {
_content_maybe = sstring{ sstring::initialized_later{}, max_len + truncated_text.size() };
copy_from_content(content);
std::copy(truncated_text.begin(), truncated_text.end(), _content_maybe.data() + _content_maybe.size() - truncated_text.size());
}
_view = std::string_view(_content_maybe);
}
std::string_view as_view() const { return _view; }
sstring take_as_sstring() && {
if (_content_maybe.empty() && !_view.empty()) {
return sstring{_view};
}
return std::move(_content_maybe);
}
};
// `truncated_content_view` will produce an object representing a view to a passed content
// possibly truncated at some length. The value returned is used in two ways:
// - to print it in logs (use `as_view()` method for this)
// - to pass it to tracing object, where it will be stored and used later
// (use `take_as_sstring()` method as this produces a copy in form of a sstring)
// `truncated_content` delays constructing `sstring` object until it's actually needed.
// `truncated_content` is valid as long as passed `content` is alive.
// if the content is truncated, `<truncated>` will be appended at the maximum size limit
// and total size will be `max_users_query_size_in_trace_output() + strlen("<truncated>")`.
static truncated_content truncated_content_view(const chunked_content& content, size_t max_size) {
return truncated_content{content, max_size};
}
static tracing::trace_state_ptr maybe_trace_query(service::client_state& client_state, std::string_view username, std::string_view op, const chunked_content& query, size_t max_users_query_size_in_trace_output) {
static tracing::trace_state_ptr maybe_trace_query(service::client_state& client_state, std::string_view username, std::string_view op, const chunked_content& query) {
tracing::trace_state_ptr trace_state;
tracing::tracing& tracing_instance = tracing::tracing::get_local_tracing_instance();
if (tracing_instance.trace_next_query() || tracing_instance.slow_query_tracing_enabled()) {
trace_state = create_tracing_session(tracing_instance);
std::string buf;
tracing::add_session_param(trace_state, "alternator_op", op);
tracing::add_query(trace_state, truncated_content_view(query, max_users_query_size_in_trace_output).take_as_sstring());
tracing::add_query(trace_state, truncated_content_view(query, buf));
tracing::begin(trace_state, seastar::format("Alternator {}", op), client_state.get_client_address());
if (!username.empty()) {
tracing::set_username(trace_state, auth::authenticated_user(username));
@@ -462,81 +411,25 @@ static tracing::trace_state_ptr maybe_trace_query(service::client_state& client_
return trace_state;
}
// This read_entire_stream() is similar to Seastar's read_entire_stream()
// which reads the given content_stream until its end into non-contiguous
// memory. The difference is that this implementation takes an extra length
// limit, and throws an error if we read more than this limit.
// This length-limited variant would not have been needed if Seastar's HTTP
// server's set_content_length_limit() worked in every case, but unfortunately
// it does not - it only works if the request has a Content-Length header (see
// issue #8196). In contrast this function can limit the request's length no
// matter how it's encoded. We need this limit to protect Alternator from
// oversized requests that can deplete memory.
static future<chunked_content>
read_entire_stream(input_stream<char>& inp, size_t length_limit) {
chunked_content ret;
// We try to read length_limit + 1 bytes, so that we can throw an
// exception if we managed to read more than length_limit.
ssize_t remain = length_limit + 1;
do {
temporary_buffer<char> buf = co_await inp.read_up_to(remain);
if (buf.empty()) {
break;
}
remain -= buf.size();
ret.push_back(std::move(buf));
} while (remain > 0);
// If we read the full length_limit + 1 bytes, we went over the limit:
if (remain <= 0) {
// By throwing here an error, we may send a reply (the error message)
// without having read the full request body. Seastar's httpd will
// realize that we have not read the entire content stream, and
// correctly mark the connection unreusable, i.e., close it.
// This means we are currently exposed to issue #12166 caused by
// Seastar issue 1325), where the client may get an RST instead of
// a FIN, and may rarely get a "Connection reset by peer" before
// reading the error we send.
throw api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", length_limit));
}
co_return ret;
}
future<executor::request_return_type> server::handle_api_request(std::unique_ptr<request> req) {
_executor._stats.total_operations++;
sstring target = req->get_header("X-Amz-Target");
// target is DynamoDB API version followed by a dot '.' and operation type (e.g. CreateTable)
auto dot = target.find('.');
std::string_view op = (dot == sstring::npos) ? std::string_view() : std::string_view(target).substr(dot+1);
if (req->content_length > request_content_length_limit) {
// If we have a Content-Length header and know the request will be too
// long, we don't need to wait for read_entire_stream() below to
// discover it. And we definitely mustn't try to get_units() below for
// for such a size.
co_return api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", request_content_length_limit));
}
// JSON parsing can allocate up to roughly 2x the size of the raw
// document, + a couple of bytes for maintenance.
// If the Content-Length of the request is not available, we assume
// the largest possible request (request_content_length_limit, i.e., 16 MB)
// and after reading the request we return_units() the excess.
size_t mem_estimate = (req->content_length ? req->content_length : request_content_length_limit) * 2 + 8000;
// TODO: consider the case where req->content_length is missing. Maybe
// we need to take the content_length_limit and return some of the units
// when we finish read_content_and_verify_signature?
size_t mem_estimate = req->content_length * 2 + 8000;
auto units_fut = get_units(*_memory_limiter, mem_estimate);
if (_memory_limiter->waiters()) {
++_executor._stats.requests_blocked_memory;
}
auto units = co_await std::move(units_fut);
SCYLLA_ASSERT(req->content_stream);
chunked_content content = co_await read_entire_stream(*req->content_stream, request_content_length_limit);
// If the request had no Content-Length, we reserved too many units
// so need to return some
if (req->content_length == 0) {
size_t content_length = 0;
for (const auto& chunk : content) {
content_length += chunk.size();
}
size_t new_mem_estimate = content_length * 2 + 8000;
units.return_units(mem_estimate - new_mem_estimate);
}
chunked_content content = co_await util::read_entire_stream(*req->content_stream);
auto username = co_await verify_signature(*req, content);
// As long as the system_clients_entry object is alive, this request will
// be visible in the "system.clients" virtual table. When requested, this
@@ -547,7 +440,8 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
req->get_protocol_name() == "https");
if (slogger.is_enabled(log_level::trace)) {
slogger.trace("Request: {} {} {}", op, truncated_content_view(content, _max_users_query_size_in_trace_output).as_view(), req->_headers);
std::string buf;
slogger.trace("Request: {} {} {}", op, truncated_content_view(content, buf), req->_headers);
}
auto callback_it = _callbacks.find(op);
if (callback_it == _callbacks.end()) {
@@ -567,7 +461,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
}
co_await client_state.maybe_update_per_service_level_params();
tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content, _max_users_query_size_in_trace_output.get());
tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content);
tracing::trace(trace_state, "{}", op);
auto user = client_state.user();
@@ -619,7 +513,6 @@ server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gos
, _sl_controller(sl_controller)
, _key_cache(1024, 1min, slogger)
, _enforce_authorization(false)
, _max_users_query_size_in_trace_output(1024)
, _enabled_servers{}
, _pending_requests("alternator::server::pending_requests")
, _timeout_config(_proxy.data_dictionary().get_config())
@@ -700,12 +593,10 @@ server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gos
}
future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
utils::updateable_value<bool> enforce_authorization, utils::updateable_value<uint64_t> max_users_query_size_in_trace_output,
semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests) {
utils::updateable_value<bool> enforce_authorization, semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests) {
_memory_limiter = memory_limiter;
_enforce_authorization = std::move(enforce_authorization);
_max_concurrent_requests = std::move(max_concurrent_requests);
_max_users_query_size_in_trace_output = std::move(max_users_query_size_in_trace_output);
if (!port && !https_port) {
return make_exception_future<>(std::runtime_error("Either regular port or TLS port"
" must be specified in order to init an alternator HTTP server instance"));
@@ -715,12 +606,14 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
if (port) {
set_routes(_http_server._routes);
_http_server.set_content_length_limit(server::content_length_limit);
_http_server.set_content_streaming(true);
_http_server.listen(socket_address{addr, *port}).get();
_enabled_servers.push_back(std::ref(_http_server));
}
if (https_port) {
set_routes(_https_server._routes);
_https_server.set_content_length_limit(server::content_length_limit);
_https_server.set_content_streaming(true);
if (this_shard_id() == 0) {

View File

@@ -28,11 +28,7 @@ namespace alternator {
using chunked_content = rjson::chunked_content;
class server : public peering_sharded_service<server> {
// The maximum size of a request body that Alternator will accept,
// in bytes. This is a safety measure to prevent Alternator from
// running out of memory when a client sends a very large request.
// DynamoDB also has the same limit set to 16 MB.
static constexpr size_t request_content_length_limit = 16*MB;
static constexpr size_t content_length_limit = 16*MB;
using alternator_callback = std::function<future<executor::request_return_type>(executor&, executor::client_state&,
tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<http::request>)>;
using alternator_callbacks_map = std::unordered_map<std::string_view, alternator_callback>;
@@ -47,7 +43,6 @@ class server : public peering_sharded_service<server> {
key_cache _key_cache;
utils::updateable_value<bool> _enforce_authorization;
utils::updateable_value<uint64_t> _max_users_query_size_in_trace_output;
utils::small_vector<std::reference_wrapper<seastar::httpd::http_server>, 2> _enabled_servers;
named_gate _pending_requests;
// In some places we will need a CQL updateable_timeout_config object even
@@ -99,8 +94,7 @@ public:
server(executor& executor, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& service, qos::service_level_controller& sl_controller);
future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
utils::updateable_value<bool> enforce_authorization, utils::updateable_value<uint64_t> max_users_query_size_in_trace_output,
semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests);
utils::updateable_value<bool> enforce_authorization, semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests);
future<> stop();
// get_client_data() is called (on each shard separately) when the virtual
// table "system.clients" is read. It is expected to generate a list of

View File

@@ -154,39 +154,6 @@ static void register_metrics_with_optional_table(seastar::metrics::metric_groups
[&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_get_item_histogram);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"), labels,
[&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_write_item_histogram);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.get_item_op_size_kb);})(op("GetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.put_item_op_size_kb);})(op("PutItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.delete_item_op_size_kb);})(op("DeleteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.update_item_op_size_kb);})(op("UpdateItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.batch_get_item_op_size_kb);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
[&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.batch_write_item_op_size_kb);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
});
seastar::metrics::label expression_label("expression");
metrics.add_group(group_name, {
seastar::metrics::make_total_operations("expression_cache_evictions", stats.expression_cache.evictions,
seastar::metrics::description("Counts number of entries evicted from expressions cache"), labels).aggregate(aggregate_labels).set_skip_when_empty(),
seastar::metrics::make_total_operations("expression_cache_hits", stats.expression_cache.requests[stats::expression_types::UPDATE_EXPRESSION].hits,
seastar::metrics::description("Counts number of hits of cached expressions"), labels)(expression_label("UpdateExpression")).aggregate(aggregate_labels).set_skip_when_empty(),
seastar::metrics::make_total_operations("expression_cache_misses", stats.expression_cache.requests[stats::expression_types::UPDATE_EXPRESSION].misses,
seastar::metrics::description("Counts number of misses of cached expressions"), labels)(expression_label("UpdateExpression")).aggregate(aggregate_labels).set_skip_when_empty(),
seastar::metrics::make_total_operations("expression_cache_hits", stats.expression_cache.requests[stats::expression_types::CONDITION_EXPRESSION].hits,
seastar::metrics::description("Counts number of hits of cached expressions"), labels)(expression_label("ConditionExpression")).aggregate(aggregate_labels).set_skip_when_empty(),
seastar::metrics::make_total_operations("expression_cache_misses", stats.expression_cache.requests[stats::expression_types::CONDITION_EXPRESSION].misses,
seastar::metrics::description("Counts number of misses of cached expressions"), labels)(expression_label("ConditionExpression")).aggregate(aggregate_labels).set_skip_when_empty(),
seastar::metrics::make_total_operations("expression_cache_hits", stats.expression_cache.requests[stats::expression_types::PROJECTION_EXPRESSION].hits,
seastar::metrics::description("Counts number of hits of cached expressions"), labels)(expression_label("ProjectionExpression")).aggregate(aggregate_labels).set_skip_when_empty(),
seastar::metrics::make_total_operations("expression_cache_misses", stats.expression_cache.requests[stats::expression_types::PROJECTION_EXPRESSION].misses,
seastar::metrics::description("Counts number of misses of cached expressions"), labels)(expression_label("ProjectionExpression")).aggregate(aggregate_labels).set_skip_when_empty()
});
}

View File

@@ -79,32 +79,6 @@ public:
utils::estimated_histogram batch_get_item_histogram{22}; // a histogram that covers the range 1 - 100
utils::estimated_histogram batch_write_item_histogram{22}; // a histogram that covers the range 1 - 100
} api_operations;
// Operation size metrics
struct {
// Item size statistics collected per table and aggregated per node.
// Each histogram covers the range 0 - 446. Resolves #25143.
// A size is the retrieved item's size.
utils::estimated_histogram get_item_op_size_kb{30};
// A size is the maximum of the new item's size and the old item's size.
utils::estimated_histogram put_item_op_size_kb{30};
// A size is the deleted item's size. If the deleted item's size is
// unknown (i.e. read-before-write wasn't necessary and it wasn't
// forced by a configuration option), it won't be recorded on the
// histogram.
utils::estimated_histogram delete_item_op_size_kb{30};
// A size is the maximum of existing item's size and the estimated size
// of the update. This will be changed to the maximum of the existing item's
// size and the new item's size in a subsequent PR.
utils::estimated_histogram update_item_op_size_kb{30};
// A size is the sum of the sizes of all items per table. This means
// that a single BatchGetItem / BatchWriteItem updates the histogram
// for each table that it has items in.
// The sizes are the retrieved items' sizes grouped per table.
utils::estimated_histogram batch_get_item_op_size_kb{30};
// The sizes are the the written items' sizes grouped per table.
utils::estimated_histogram batch_write_item_op_size_kb{30};
} operation_sizes;
// Miscellaneous event counters
uint64_t total_operations = 0;
uint64_t unsupported_operations = 0;
@@ -127,22 +101,6 @@ public:
uint64_t wcu_total[NUM_TYPES] = {0};
// CQL-derived stats
cql3::cql_stats cql_stats;
// Enumeration of expression types only for stats
// if needed it can be extended e.g. per operation
enum expression_types {
UPDATE_EXPRESSION,
CONDITION_EXPRESSION,
PROJECTION_EXPRESSION,
NUM_EXPRESSION_TYPES
};
struct {
struct {
uint64_t hits = 0;
uint64_t misses = 0;
} requests[NUM_EXPRESSION_TYPES];
uint64_t evictions = 0;
} expression_cache;
};
struct table_stats {
@@ -152,8 +110,4 @@ struct table_stats {
};
void register_metrics(seastar::metrics::metric_groups& metrics, const stats& stats);
inline uint64_t bytes_to_kb_ceil(uint64_t bytes) {
return (bytes + 1023) / 1024;
}
}

View File

@@ -13,6 +13,7 @@
#include <seastar/json/formatter.hh>
#include "auth/permission.hh"
#include "db/config.hh"
#include "cdc/log.hh"
@@ -31,7 +32,6 @@
#include "executor.hh"
#include "data_dictionary/data_dictionary.hh"
#include "utils/rjson.hh"
/**
* Base template type to implement rapidjson::internal::TypeHelper<...>:s
@@ -126,7 +126,7 @@ public:
}
};
} // namespace alternator
}
template<typename ValueType>
struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_arn>
@@ -296,7 +296,7 @@ sequence_number::sequence_number(std::string_view v)
}())
{}
} // namespace alternator
}
template<typename ValueType>
struct rapidjson::internal::TypeHelper<ValueType, alternator::shard_id>
@@ -356,7 +356,7 @@ static stream_view_type cdc_options_to_steam_view_type(const cdc::options& opts)
return type;
}
} // namespace alternator
}
template<typename ValueType>
struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_view_type>
@@ -475,10 +475,10 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
} else {
status = "ENABLED";
}
}
}
auto ttl = std::chrono::seconds(opts.ttl());
rjson::add(stream_desc, "StreamStatus", rjson::from_string(status));
stream_view_type type = cdc_options_to_steam_view_type(opts);
@@ -714,7 +714,7 @@ future<executor::request_return_type> executor::get_shard_iterator(client_state&
auto type = rjson::get<shard_iterator_type>(request, "ShardIteratorType");
auto seq_num = rjson::get_opt<sequence_number>(request, "SequenceNumber");
if (type < shard_iterator_type::TRIM_HORIZON && !seq_num) {
throw api_error::validation("Missing required parameter \"SequenceNumber\"");
}
@@ -724,7 +724,7 @@ future<executor::request_return_type> executor::get_shard_iterator(client_state&
auto stream_arn = rjson::get<alternator::stream_arn>(request, "StreamArn");
auto db = _proxy.data_dictionary();
schema_ptr schema = nullptr;
std::optional<shard_id> sid;
@@ -789,7 +789,7 @@ struct event_id {
return os;
}
};
} // namespace alternator
}
template<typename ValueType>
struct rapidjson::internal::TypeHelper<ValueType, alternator::event_id>
@@ -871,12 +871,10 @@ future<executor::request_return_type> executor::get_records(client_state& client
std::transform(pks.begin(), pks.end(), std::back_inserter(columns), [](auto& c) { return &c; });
std::transform(cks.begin(), cks.end(), std::back_inserter(columns), [](auto& c) { return &c; });
auto regular_column_start_idx = columns.size();
auto regular_column_filter = std::views::filter([](const column_definition& cdef) { return cdef.name() == op_column_name || cdef.name() == eor_column_name || !cdc::is_cdc_metacolumn_name(cdef.name_as_text()); });
std::ranges::transform(schema->regular_columns() | regular_column_filter, std::back_inserter(columns), [](auto& c) { return &c; });
auto regular_columns = std::ranges::subrange(columns.begin() + regular_column_start_idx, columns.end())
| std::views::transform(&column_definition::id)
auto regular_columns = schema->regular_columns()
| std::views::filter([](const column_definition& cdef) { return cdef.name() == op_column_name || cdef.name() == eor_column_name || !cdc::is_cdc_metacolumn_name(cdef.name_as_text()); })
| std::views::transform([&] (const column_definition& cdef) { columns.emplace_back(&cdef); return cdef.id; })
| std::ranges::to<query::column_id_vector>()
;
@@ -927,7 +925,6 @@ future<executor::request_return_type> executor::get_records(client_state& client
std::optional<utils::UUID> timestamp;
auto dynamodb = rjson::empty_object();
auto record = rjson::empty_object();
const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();
using op_utype = std::underlying_type_t<cdc::operation>;
@@ -937,10 +934,9 @@ future<executor::request_return_type> executor::get_records(client_state& client
dynamodb = rjson::empty_object();
}
if (!record.ObjectEmpty()) {
rjson::add(record, "awsRegion", rjson::from_string(dc_name));
// TODO: awsRegion?
rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
rjson::add(record, "eventSource", "scylladb:alternator");
rjson::add(record, "eventVersion", "1.1");
rjson::push_back(records, std::move(record));
record = rjson::empty_object();
--limit;
@@ -959,7 +955,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
rjson::add(dynamodb, "StreamViewType", type);
// TODO: SizeBytes
//TODO: SizeInBytes
}
/**
@@ -999,16 +995,6 @@ future<executor::request_return_type> executor::get_records(client_state& client
case cdc::operation::insert:
rjson::add(record, "eventName", "INSERT");
break;
case cdc::operation::service_row_delete:
case cdc::operation::service_partition_delete:
{
auto user_identity = rjson::empty_object();
rjson::add(user_identity, "Type", "Service");
rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
rjson::add(record, "userIdentity", std::move(user_identity));
rjson::add(record, "eventName", "REMOVE");
break;
}
default:
rjson::add(record, "eventName", "REMOVE");
break;
@@ -1134,4 +1120,4 @@ void executor::supplement_table_stream_info(rjson::value& descr, const schema& s
}
}
} // namespace alternator
}

View File

@@ -17,7 +17,6 @@
#include <seastar/core/lowres_clock.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include "cdc/log.hh"
#include "exceptions/exceptions.hh"
#include "gms/gossiper.hh"
#include "gms/inet_address.hh"
@@ -28,7 +27,7 @@
#include "replica/database.hh"
#include "service/client_state.hh"
#include "service_permit.hh"
#include "mutation/timestamp.hh"
#include "timestamp.hh"
#include "service/storage_proxy.hh"
#include "service/pager/paging_state.hh"
#include "service/pager/query_pagers.hh"
@@ -57,14 +56,14 @@ static logging::logger tlogger("alternator_ttl");
namespace alternator {
// We write the expiration-time attribute enabled on a table in a
// We write the expiration-time attribute enabled on a table using a
// tag TTL_TAG_KEY.
// Currently, the *value* of this tag is simply the name of the attribute,
// and the expiration scanner interprets it as an Alternator attribute name -
// It can refer to a real column or if that doesn't exist, to a member of
// the ":attrs" map column. Although this is designed for Alternator, it may
// be good enough for CQL as well (there, the ":attrs" column won't exist).
extern const sstring TTL_TAG_KEY;
static const sstring TTL_TAG_KEY("system:ttl_attribute");
future<executor::request_return_type> executor::update_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
_stats.api_operations.update_time_to_live++;
@@ -293,12 +292,7 @@ static future<> expire_item(service::storage_proxy& proxy,
db::consistency_level::LOCAL_QUORUM,
executor::default_timeout(), // FIXME - which timeout?
qs.get_trace_state(), qs.get_permit(),
db::allow_per_partition_rate_limit::no,
false,
cdc::per_request_options{
.is_system_originated = true,
}
);
db::allow_per_partition_rate_limit::no);
}
static size_t random_offset(size_t min, size_t max) {

View File

@@ -594,50 +594,6 @@
}
]
},
{
"path": "/storage_service/natural_endpoints/v2/{keyspace}",
"operations": [
{
"method": "GET",
"summary":"This method returns the N endpoints that are responsible for storing the specified key i.e for replication. the endpoint responsible for this key",
"type": "array",
"items": {
"type": "string"
},
"nickname": "get_natural_endpoints_v2",
"produces": [
"application/json"
],
"parameters": [
{
"name": "keyspace",
"description": "The keyspace to query about.",
"required": true,
"allowMultiple": false,
"type": "string",
"paramType": "path"
},
{
"name": "cf",
"description": "Column family name.",
"required": true,
"allowMultiple": false,
"type": "string",
"paramType": "query"
},
{
"name": "key_component",
"description": "Each component of the key for which we need to find the endpoint (e.g. ?key_component=part1&key_component=part2).",
"required": true,
"allowMultiple": true,
"type": "string",
"paramType": "query"
}
]
}
]
},
{
"path":"/storage_service/cdc_streams_check_and_repair",
"operations":[
@@ -1144,14 +1100,6 @@
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name": "drop_unfixable_sstables",
"description": "When set to true, drop unfixable sstables. Applies only to scrub mode SEGREGATE.",
"required":false,
"allowMultiple":false,
"type":"boolean",
"paramType":"query"
}
]
}
@@ -2973,14 +2921,6 @@
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"incremental_mode",
"description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to incremental mode.",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
@@ -3405,11 +3345,11 @@
"properties":{
"start_token":{
"type":"string",
"description":"The range start token (exclusive)"
"description":"The range start token"
},
"end_token":{
"type":"string",
"description":"The range end token (inclusive)"
"description":"The range start token"
},
"endpoints":{
"type":"array",
@@ -3482,7 +3422,7 @@
"version":{
"type":"string",
"enum":[
"ka", "la", "mc", "md", "me", "ms"
"ka", "la", "mc", "md", "me"
],
"description":"SSTable version"
},

View File

@@ -447,7 +447,7 @@
"description":"The number of units completed so far"
},
"children_ids":{
"type":"chunked_array",
"type":"array",
"items":{
"type":"task_identity"
},

View File

@@ -137,6 +137,14 @@ future<> unset_load_meter(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_load_meter(ctx, r); });
}
future<> set_format_selector(http_context& ctx, db::sstables_format_selector& sel) {
return ctx.http_server.set_routes([&ctx, &sel] (routes& r) { set_format_selector(ctx, r, sel); });
}
future<> unset_format_selector(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_format_selector(ctx, r); });
}
future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader) {
return ctx.http_server.set_routes([&ctx, &sst_loader] (routes& r) { set_sstables_loader(ctx, r, sst_loader); });
}
@@ -216,22 +224,15 @@ future<> unset_server_gossip(http_context& ctx) {
});
}
future<> set_server_column_family(http_context& ctx, sharded<replica::database>& db) {
co_await register_api(ctx, "column_family",
"The column family API", [&db] (http_context& ctx, routes& r) {
set_column_family(ctx, r, db);
});
co_await register_api(ctx, "cache_service",
"The cache service API", [&db] (http_context& ctx, routes& r) {
set_cache_service(ctx, db, r);
future<> set_server_column_family(http_context& ctx, sharded<db::system_keyspace>& sys_ks) {
return register_api(ctx, "column_family",
"The column family API", [&sys_ks] (http_context& ctx, routes& r) {
set_column_family(ctx, r, sys_ks);
});
}
future<> unset_server_column_family(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) {
unset_column_family(ctx, r);
unset_cache_service(ctx, r);
});
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_column_family(ctx, r); });
}
future<> set_server_messaging_service(http_context& ctx, sharded<netw::messaging_service>& ms) {
@@ -263,6 +264,15 @@ future<> unset_server_stream_manager(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_stream_manager(ctx, r); });
}
future<> set_server_cache(http_context& ctx) {
return register_api(ctx, "cache_service",
"The cache service API", set_cache_service);
}
future<> unset_server_cache(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_cache_service(ctx, r); });
}
future<> set_hinted_handoff(http_context& ctx, sharded<service::storage_proxy>& proxy, sharded<gms::gossiper>& g) {
return register_api(ctx, "hinted_handoff",
"The hinted handoff API", [&proxy, &g] (http_context& ctx, routes& r) {
@@ -274,7 +284,7 @@ future<> unset_hinted_handoff(http_context& ctx) {
return ctx.http_server.set_routes([&ctx] (routes& r) { unset_hinted_handoff(ctx, r); });
}
future<> set_server_compaction_manager(http_context& ctx, sharded<compaction::compaction_manager>& cm) {
future<> set_server_compaction_manager(http_context& ctx, sharded<compaction_manager>& cm) {
return register_api(ctx, "compaction_manager", "The Compaction manager API", [&cm] (http_context& ctx, routes& r) {
set_compaction_manager(ctx, r, cm);
});

View File

@@ -73,7 +73,7 @@ inline std::vector<sstring> split(const sstring& text, const char* separator) {
*
*/
template<class T, class F, class V>
future<json::json_return_type> sum_stats(sharded<T>& d, V F::*f) {
future<json::json_return_type> sum_stats(distributed<T>& d, V F::*f) {
return d.map_reduce0([f](const T& p) {return p.get_stats().*f;}, 0,
std::plus<V>()).then([](V val) {
return make_ready_future<json::json_return_type>(val);
@@ -106,7 +106,7 @@ httpd::utils_json::rate_moving_average_and_histogram timer_to_json(const utils::
}
template<class T, class F>
future<json::json_return_type> sum_histogram_stats(sharded<T>& d, utils::timed_rate_moving_average_and_histogram F::*f) {
future<json::json_return_type> sum_histogram_stats(distributed<T>& d, utils::timed_rate_moving_average_and_histogram F::*f) {
return d.map_reduce0([f](const T& p) {return (p.get_stats().*f).hist;}, utils::ihistogram(),
std::plus<utils::ihistogram>()).then([](const utils::ihistogram& val) {
@@ -115,7 +115,7 @@ future<json::json_return_type> sum_histogram_stats(sharded<T>& d, utils::timed_
}
template<class T, class F>
future<json::json_return_type> sum_timer_stats(sharded<T>& d, utils::timed_rate_moving_average_and_histogram F::*f) {
future<json::json_return_type> sum_timer_stats(distributed<T>& d, utils::timed_rate_moving_average_and_histogram F::*f) {
return d.map_reduce0([f](const T& p) {return (p.get_stats().*f).rate();}, utils::rate_moving_average_and_histogram(),
std::plus<utils::rate_moving_average_and_histogram>()).then([](const utils::rate_moving_average_and_histogram& val) {
@@ -124,7 +124,7 @@ future<json::json_return_type> sum_timer_stats(sharded<T>& d, utils::timed_rate
}
template<class T, class F>
future<json::json_return_type> sum_timer_stats(sharded<T>& d, utils::timed_rate_moving_average_summary_and_histogram F::*f) {
future<json::json_return_type> sum_timer_stats(distributed<T>& d, utils::timed_rate_moving_average_summary_and_histogram F::*f) {
return d.map_reduce0([f](const T& p) {return (p.get_stats().*f).rate();}, utils::rate_moving_average_and_histogram(),
std::plus<utils::rate_moving_average_and_histogram>()).then([](const utils::rate_moving_average_and_histogram& val) {
return make_ready_future<json::json_return_type>(timer_to_json(val));

View File

@@ -18,9 +18,7 @@
using request = http::request;
using reply = http::reply;
namespace compaction {
class compaction_manager;
}
namespace service {
@@ -58,6 +56,7 @@ class sstables_format_selector;
namespace view {
class view_builder;
}
class system_keyspace;
}
namespace netw { class messaging_service; }
class repair_service;
@@ -84,9 +83,9 @@ struct http_context {
sstring api_dir;
sstring api_doc;
httpd::http_server_control http_server;
sharded<replica::database>& db;
distributed<replica::database>& db;
http_context(sharded<replica::database>& _db)
http_context(distributed<replica::database>& _db)
: db(_db)
{
}
@@ -117,7 +116,7 @@ future<> set_server_token_metadata(http_context& ctx, sharded<locator::shared_to
future<> unset_server_token_metadata(http_context& ctx);
future<> set_server_gossip(http_context& ctx, sharded<gms::gossiper>& g);
future<> unset_server_gossip(http_context& ctx);
future<> set_server_column_family(http_context& ctx, sharded<replica::database>& db);
future<> set_server_column_family(http_context& ctx, sharded<db::system_keyspace>& sys_ks);
future<> unset_server_column_family(http_context& ctx);
future<> set_server_messaging_service(http_context& ctx, sharded<netw::messaging_service>& ms);
future<> unset_server_messaging_service(http_context& ctx);
@@ -127,7 +126,9 @@ future<> set_server_stream_manager(http_context& ctx, sharded<streaming::stream_
future<> unset_server_stream_manager(http_context& ctx);
future<> set_hinted_handoff(http_context& ctx, sharded<service::storage_proxy>& p, sharded<gms::gossiper>& g);
future<> unset_hinted_handoff(http_context& ctx);
future<> set_server_compaction_manager(http_context& ctx, sharded<compaction::compaction_manager>& cm);
future<> set_server_cache(http_context& ctx);
future<> unset_server_cache(http_context& ctx);
future<> set_server_compaction_manager(http_context& ctx, sharded<compaction_manager>& cm);
future<> unset_server_compaction_manager(http_context& ctx);
future<> set_server_done(http_context& ctx);
future<> set_server_task_manager(http_context& ctx, sharded<tasks::task_manager>& tm, lw_shared_ptr<db::config> cfg, sharded<gms::gossiper>& gossiper);
@@ -140,6 +141,8 @@ future<> set_server_raft(http_context&, sharded<service::raft_group_registry>&);
future<> unset_server_raft(http_context&);
future<> set_load_meter(http_context& ctx, service::load_meter& lm);
future<> unset_load_meter(http_context& ctx);
future<> set_format_selector(http_context& ctx, db::sstables_format_selector& sel);
future<> unset_format_selector(http_context& ctx);
future<> set_server_cql_server_test(http_context& ctx, cql_transport::controller& ctl);
future<> unset_server_cql_server_test(http_context& ctx);
future<> set_server_service_levels(http_context& ctx, cql_transport::controller& ctl, sharded<cql3::query_processor>& qp);

View File

@@ -16,7 +16,7 @@ using namespace json;
using namespace seastar::httpd;
namespace cs = httpd::cache_service_json;
void set_cache_service(http_context& ctx, sharded<replica::database>& db, routes& r) {
void set_cache_service(http_context& ctx, routes& r) {
cs::get_row_cache_save_period_in_seconds.set(r, [](std::unique_ptr<http::request> req) {
// We never save the cache
// Origin uses 0 for never
@@ -204,53 +204,53 @@ void set_cache_service(http_context& ctx, sharded<replica::database>& db, routes
});
});
cs::get_row_hits.set(r, [&db] (std::unique_ptr<http::request> req) {
return map_reduce_cf(db, uint64_t(0), [](const replica::column_family& cf) {
cs::get_row_hits.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, uint64_t(0), [](const replica::column_family& cf) {
return cf.get_row_cache().stats().hits.count();
}, std::plus<uint64_t>());
});
cs::get_row_requests.set(r, [&db] (std::unique_ptr<http::request> req) {
return map_reduce_cf(db, uint64_t(0), [](const replica::column_family& cf) {
cs::get_row_requests.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, uint64_t(0), [](const replica::column_family& cf) {
return cf.get_row_cache().stats().hits.count() + cf.get_row_cache().stats().misses.count();
}, std::plus<uint64_t>());
});
cs::get_row_hit_rate.set(r, [&db] (std::unique_ptr<http::request> req) {
return map_reduce_cf(db, ratio_holder(), [](const replica::column_family& cf) {
cs::get_row_hit_rate.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx, ratio_holder(), [](const replica::column_family& cf) {
return ratio_holder(cf.get_row_cache().stats().hits.count() + cf.get_row_cache().stats().misses.count(),
cf.get_row_cache().stats().hits.count());
}, std::plus<ratio_holder>());
});
cs::get_row_hits_moving_avrage.set(r, [&db] (std::unique_ptr<http::request> req) {
return map_reduce_cf_raw(db, utils::rate_moving_average(), [](const replica::column_family& cf) {
cs::get_row_hits_moving_avrage.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf_raw(ctx, utils::rate_moving_average(), [](const replica::column_family& cf) {
return cf.get_row_cache().stats().hits.rate();
}, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
return make_ready_future<json::json_return_type>(meter_to_json(m));
});
});
cs::get_row_requests_moving_avrage.set(r, [&db] (std::unique_ptr<http::request> req) {
return map_reduce_cf_raw(db, utils::rate_moving_average(), [](const replica::column_family& cf) {
cs::get_row_requests_moving_avrage.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf_raw(ctx, utils::rate_moving_average(), [](const replica::column_family& cf) {
return cf.get_row_cache().stats().hits.rate() + cf.get_row_cache().stats().misses.rate();
}, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
return make_ready_future<json::json_return_type>(meter_to_json(m));
});
});
cs::get_row_size.set(r, [&db] (std::unique_ptr<http::request> req) {
cs::get_row_size.set(r, [&ctx] (std::unique_ptr<http::request> req) {
// In origin row size is the weighted size.
// We currently do not support weights, so we use raw size in bytes instead
return db.map_reduce0([](replica::database& db) -> uint64_t {
return ctx.db.map_reduce0([](replica::database& db) -> uint64_t {
return db.row_cache_tracker().region().occupancy().used_space();
}, uint64_t(0), std::plus<uint64_t>()).then([](const int64_t& res) {
return make_ready_future<json::json_return_type>(res);
});
});
cs::get_row_entries.set(r, [&db] (std::unique_ptr<http::request> req) {
return db.map_reduce0([](replica::database& db) -> uint64_t {
cs::get_row_entries.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return ctx.db.map_reduce0([](replica::database& db) -> uint64_t {
return db.row_cache_tracker().partitions();
}, uint64_t(0), std::plus<uint64_t>()).then([](const int64_t& res) {
return make_ready_future<json::json_return_type>(res);

View File

@@ -7,20 +7,15 @@
*/
#pragma once
#include <seastar/core/sharded.hh>
namespace seastar::httpd {
class routes;
}
namespace replica {
class database;
}
namespace api {
struct http_context;
void set_cache_service(http_context& ctx, seastar::sharded<replica::database>& db, seastar::httpd::routes& r);
void set_cache_service(http_context& ctx, seastar::httpd::routes& r);
void unset_cache_service(http_context& ctx, seastar::httpd::routes& r);
}

File diff suppressed because it is too large Load Diff

View File

@@ -13,20 +13,24 @@
#include <any>
#include "api/api_init.hh"
namespace db {
class system_keyspace;
}
namespace api {
void set_column_family(http_context& ctx, httpd::routes& r, sharded<replica::database>& db);
void set_column_family(http_context& ctx, httpd::routes& r, sharded<db::system_keyspace>& sys_ks);
void unset_column_family(http_context& ctx, httpd::routes& r);
table_info parse_table_info(const sstring& name, const replica::database& db);
template<class Mapper, class I, class Reducer>
future<I> map_reduce_cf_raw(sharded<replica::database>& db, const sstring& name, I init,
future<I> map_reduce_cf_raw(http_context& ctx, const sstring& name, I init,
Mapper mapper, Reducer reducer) {
auto uuid = parse_table_info(name, db.local()).id;
auto uuid = parse_table_info(name, ctx.db.local()).id;
using mapper_type = std::function<future<std::unique_ptr<std::any>>(replica::database&)>;
using reducer_type = std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)>;
return db.map_reduce0(mapper_type([mapper, uuid](replica::database& db) {
return ctx.db.map_reduce0(mapper_type([mapper, uuid](replica::database& db) {
return futurize_invoke([mapper, &db, uuid] {
return mapper(db.find_column_family(uuid));
}).then([] (auto result) {
@@ -41,22 +45,24 @@ future<I> map_reduce_cf_raw(sharded<replica::database>& db, const sstring& name,
template<class Mapper, class I, class Reducer>
future<json::json_return_type> map_reduce_cf(sharded<replica::database>& db, const sstring& name, I init,
future<json::json_return_type> map_reduce_cf(http_context& ctx, const sstring& name, I init,
Mapper mapper, Reducer reducer) {
return map_reduce_cf_raw(db, name, init, mapper, reducer).then([](const I& res) {
return map_reduce_cf_raw(ctx, name, init, mapper, reducer).then([](const I& res) {
return make_ready_future<json::json_return_type>(res);
});
}
template<class Mapper, class I, class Reducer, class Result>
future<json::json_return_type> map_reduce_cf(sharded<replica::database>& db, const sstring& name, I init,
future<json::json_return_type> map_reduce_cf(http_context& ctx, const sstring& name, I init,
Mapper mapper, Reducer reducer, Result result) {
return map_reduce_cf_raw(db, name, init, mapper, reducer).then([result](const I& res) mutable {
return map_reduce_cf_raw(ctx, name, init, mapper, reducer).then([result](const I& res) mutable {
result = res;
return make_ready_future<json::json_return_type>(result);
});
}
future<json::json_return_type> map_reduce_cf_time_histogram(http_context& ctx, const sstring& name, std::function<utils::time_estimated_histogram(const replica::column_family&)> f);
struct map_reduce_column_families_locally {
std::any init;
std::function<future<std::unique_ptr<std::any>>(replica::column_family&)> mapper;
@@ -72,7 +78,7 @@ struct map_reduce_column_families_locally {
};
template<class Mapper, class I, class Reducer>
future<I> map_reduce_cf_raw(sharded<replica::database>& db, I init,
future<I> map_reduce_cf_raw(http_context& ctx, I init,
Mapper mapper, Reducer reducer) {
using mapper_type = std::function<future<std::unique_ptr<std::any>>(replica::column_family&)>;
using reducer_type = std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)>;
@@ -86,7 +92,7 @@ future<I> map_reduce_cf_raw(sharded<replica::database>& db, I init,
auto wrapped_reducer = reducer_type([reducer = std::move(reducer)] (std::unique_ptr<std::any> a, std::unique_ptr<std::any> b) mutable {
return std::make_unique<std::any>(I(reducer(std::any_cast<I>(std::move(*a)), std::any_cast<I>(std::move(*b)))));
});
return db.map_reduce0(map_reduce_column_families_locally{init,
return ctx.db.map_reduce0(map_reduce_column_families_locally{init,
std::move(wrapped_mapper), wrapped_reducer}, std::make_unique<std::any>(init), wrapped_reducer).then([] (std::unique_ptr<std::any> res) {
return std::any_cast<I>(std::move(*res));
});
@@ -94,13 +100,20 @@ future<I> map_reduce_cf_raw(sharded<replica::database>& db, I init,
template<class Mapper, class I, class Reducer>
future<json::json_return_type> map_reduce_cf(sharded<replica::database>& db, I init,
future<json::json_return_type> map_reduce_cf(http_context& ctx, I init,
Mapper mapper, Reducer reducer) {
return map_reduce_cf_raw(db, init, mapper, reducer).then([](const I& res) {
return map_reduce_cf_raw(ctx, init, mapper, reducer).then([](const I& res) {
return make_ready_future<json::json_return_type>(res);
});
}
future<json::json_return_type> get_cf_stats(http_context& ctx, const sstring& name,
int64_t replica::column_family_stats::*f);
future<json::json_return_type> get_cf_stats(http_context& ctx,
int64_t replica::column_family_stats::*f);
std::tuple<sstring, sstring> parse_fully_qualified_cf_name(sstring name);
}

View File

@@ -29,9 +29,9 @@ namespace ss = httpd::storage_service_json;
using namespace json;
using namespace seastar::httpd;
static future<json::json_return_type> get_cm_stats(sharded<compaction::compaction_manager>& cm,
int64_t compaction::compaction_manager::stats::*f) {
return cm.map_reduce0([f](compaction::compaction_manager& cm) {
static future<json::json_return_type> get_cm_stats(sharded<compaction_manager>& cm,
int64_t compaction_manager::stats::*f) {
return cm.map_reduce0([f](compaction_manager& cm) {
return cm.get_stats().*f;
}, int64_t(0), std::plus<int64_t>()).then([](const int64_t& res) {
return make_ready_future<json::json_return_type>(res);
@@ -47,9 +47,9 @@ static std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_ha
return std::move(a);
}
void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction::compaction_manager>& cm) {
void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction_manager>& cm) {
cm::get_compactions.set(r, [&cm] (std::unique_ptr<http::request> req) {
return cm.map_reduce0([](compaction::compaction_manager& cm) {
return cm.map_reduce0([](compaction_manager& cm) {
std::vector<cm::summary> summaries;
for (const auto& c : cm.get_compactions()) {
@@ -58,7 +58,7 @@ void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction::co
s.ks = c.ks_name;
s.cf = c.cf_name;
s.unit = "keys";
s.task_type = compaction::compaction_name(c.type);
s.task_type = sstables::compaction_name(c.type);
s.completed = c.total_keys_written;
s.total = c.total_partitions;
summaries.push_back(std::move(s));
@@ -103,20 +103,22 @@ void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction::co
cm::stop_compaction.set(r, [&cm] (std::unique_ptr<http::request> req) {
auto type = req->get_query_param("type");
return cm.invoke_on_all([type] (compaction::compaction_manager& cm) {
return cm.invoke_on_all([type] (compaction_manager& cm) {
return cm.stop_compaction(type);
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
cm::stop_keyspace_compaction.set(r, [&ctx, &cm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
cm::stop_keyspace_compaction.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto [ks_name, tables] = parse_table_infos(ctx, *req, "tables");
auto type = req->get_query_param("type");
co_await cm.invoke_on_all([&] (compaction::compaction_manager& cm) {
co_await ctx.db.invoke_on_all([&] (replica::database& db) {
auto& cm = db.get_compaction_manager();
return parallel_for_each(tables, [&] (const table_info& ti) {
return cm.stop_compaction(type, [id = ti.id] (const compaction::compaction_group_view* x) {
return x->schema()->id() == id;
auto& t = db.find_column_family(ti.id);
return t.parallel_foreach_compaction_group_view([&] (compaction::compaction_group_view& ts) {
return cm.stop_compaction(type, &ts);
});
});
});
@@ -124,13 +126,13 @@ void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction::co
});
cm::get_pending_tasks.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return map_reduce_cf(ctx.db, int64_t(0), [](replica::column_family& cf) {
return map_reduce_cf(ctx, int64_t(0), [](replica::column_family& cf) {
return cf.estimate_pending_compactions();
}, std::plus<int64_t>());
});
cm::get_completed_tasks.set(r, [&cm] (std::unique_ptr<http::request> req) {
return get_cm_stats(cm, &compaction::compaction_manager::stats::completed_tasks);
return get_cm_stats(cm, &compaction_manager::stats::completed_tasks);
});
cm::get_total_compactions_completed.set(r, [] (std::unique_ptr<http::request> req) {
@@ -148,7 +150,7 @@ void set_compaction_manager(http_context& ctx, routes& r, sharded<compaction::co
});
cm::get_compaction_history.set(r, [&cm] (std::unique_ptr<http::request> req) {
noncopyable_function<future<>(output_stream<char>&&)> f = [&cm] (output_stream<char>&& out) -> future<> {
std::function<future<>(output_stream<char>&&)> f = [&cm] (output_stream<char>&& out) -> future<> {
auto s = std::move(out);
bool first = true;
std::exception_ptr ex;

View File

@@ -13,13 +13,11 @@ namespace seastar::httpd {
class routes;
}
namespace compaction {
class compaction_manager;
}
namespace api {
struct http_context;
void set_compaction_manager(http_context& ctx, seastar::httpd::routes& r, seastar::sharded<compaction::compaction_manager>& cm);
void set_compaction_manager(http_context& ctx, seastar::httpd::routes& r, seastar::sharded<compaction_manager>& cm);
void unset_compaction_manager(http_context& ctx, seastar::httpd::routes& r);
}

View File

@@ -21,10 +21,10 @@ namespace hf = httpd::error_injection_json;
void set_error_injection(http_context& ctx, routes& r) {
hf::enable_injection.set(r, [](std::unique_ptr<request> req) -> future<json::json_return_type> {
hf::enable_injection.set(r, [](std::unique_ptr<request> req) {
sstring injection = req->get_path_param("injection");
bool one_shot = req->get_query_param("one_shot") == "True";
auto params = co_await util::read_entire_stream_contiguous(*req->content_stream);
auto params = req->content;
const size_t max_params_size = 1024 * 1024;
if (params.size() > max_params_size) {
@@ -39,11 +39,12 @@ void set_error_injection(http_context& ctx, routes& r) {
: rjson::parse_to_map<utils::error_injection_parameters>(params);
auto& errinj = utils::get_local_injector();
co_await errinj.enable_on_all(injection, one_shot, std::move(parameters));
return errinj.enable_on_all(injection, one_shot, std::move(parameters)).then([] {
return make_ready_future<json::json_return_type>(json::json_void());
});
} catch (const rjson::error& e) {
throw httpd::bad_param_exception(format("Failed to parse injections parameters: {}", e.what()));
}
co_return json::json_void();
});
hf::get_enabled_injections_on_all.set(r, [](std::unique_ptr<request> req) {

View File

@@ -71,7 +71,7 @@ void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_regis
co_return json_void{};
});
r::get_leader_host.set(r, [&raft_gr] (std::unique_ptr<http::request> req) -> future<json_return_type> {
if (req->get_query_param("group_id").empty()) {
if (!req->query_parameters.contains("group_id")) {
const auto leader_id = co_await raft_gr.invoke_on(0, [] (service::raft_group_registry& raft_gr) {
auto& srv = raft_gr.group0();
return srv.current_leader();
@@ -100,7 +100,7 @@ void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_regis
r::read_barrier.set(r, [&raft_gr] (std::unique_ptr<http::request> req) -> future<json_return_type> {
auto timeout = get_request_timeout(*req);
if (req->get_query_param("group_id").empty()) {
if (!req->query_parameters.contains("group_id")) {
// Read barrier on group 0 by default
co_await raft_gr.invoke_on(0, [timeout] (service::raft_group_registry& raft_gr) -> future<> {
co_await raft_gr.group0_with_timeouts().read_barrier(nullptr, timeout);
@@ -131,7 +131,7 @@ void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_regis
const auto stepdown_timeout_ticks = dur / service::raft_tick_interval;
auto timeout_dur = raft::logical_clock::duration(stepdown_timeout_ticks);
if (req->get_query_param("group_id").empty()) {
if (!req->query_parameters.contains("group_id")) {
// Stepdown on group 0 by default
co_await raft_gr.invoke_on(0, [timeout_dur] (service::raft_group_registry& raft_gr) {
apilog.info("Triggering stepdown for group0");

View File

@@ -39,7 +39,7 @@ utils::time_estimated_histogram timed_rate_moving_average_summary_merge(utils::t
* @return A future that resolves to the result of the aggregation.
*/
template<typename V, typename Reducer, typename InnerMapper>
future<V> two_dimensional_map_reduce(sharded<service::storage_proxy>& d,
future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
InnerMapper mapper, Reducer reducer, V initial_value) {
return d.map_reduce0( [mapper, reducer, initial_value] (const service::storage_proxy& sp) {
return map_reduce_scheduling_group_specific<service::storage_proxy_stats::stats>(
@@ -59,7 +59,7 @@ future<V> two_dimensional_map_reduce(sharded<service::storage_proxy>& d,
* @return A future that resolves to the result of the aggregation.
*/
template<typename V, typename Reducer, typename F, typename C>
future<V> two_dimensional_map_reduce(sharded<service::storage_proxy>& d,
future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
C F::*f, Reducer reducer, V initial_value) {
return two_dimensional_map_reduce(d, [f] (F& stats) -> V {
return stats.*f;
@@ -75,20 +75,20 @@ future<V> two_dimensional_map_reduce(sharded<service::storage_proxy>& d,
*
*/
template<typename V, typename F>
future<json::json_return_type> sum_stats_storage_proxy(sharded<proxy>& d, V F::*f) {
future<json::json_return_type> sum_stats_storage_proxy(distributed<proxy>& d, V F::*f) {
return two_dimensional_map_reduce(d, [f] (F& stats) { return stats.*f; }, std::plus<V>(), V(0)).then([] (V val) {
return make_ready_future<json::json_return_type>(val);
});
}
static future<utils::rate_moving_average> sum_timed_rate(sharded<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
static future<utils::rate_moving_average> sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
return (stats.*f).rate();
}, std::plus<utils::rate_moving_average>(), utils::rate_moving_average());
}
static future<json::json_return_type> sum_timed_rate_as_obj(sharded<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
static future<json::json_return_type> sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
httpd::utils_json::rate_moving_average m;
m = val;
@@ -100,7 +100,7 @@ httpd::utils_json::rate_moving_average_and_histogram get_empty_moving_average()
return timer_to_json(utils::rate_moving_average_and_histogram());
}
static future<json::json_return_type> sum_timed_rate_as_long(sharded<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
static future<json::json_return_type> sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
return make_ready_future<json::json_return_type>(val.count);
});
@@ -152,7 +152,7 @@ static future<json::json_return_type> total_latency(sharded<service::storage_pr
*/
template<typename F>
future<json::json_return_type>
sum_histogram_stats_storage_proxy(sharded<proxy>& d,
sum_histogram_stats_storage_proxy(distributed<proxy>& d,
utils::timed_rate_moving_average_summary_and_histogram F::*f) {
return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
return (stats.*f).hist;
@@ -172,7 +172,7 @@ sum_histogram_stats_storage_proxy(sharded<proxy>& d,
*/
template<typename F>
future<json::json_return_type>
sum_timer_stats_storage_proxy(sharded<proxy>& d,
sum_timer_stats_storage_proxy(distributed<proxy>& d,
utils::timed_rate_moving_average_summary_and_histogram F::*f) {
return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {

View File

@@ -36,7 +36,6 @@
#include "gms/gossiper.hh"
#include "db/system_keyspace.hh"
#include <seastar/http/exception.hh>
#include <seastar/http/short_streams.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/coroutine/exception.hh>
@@ -175,7 +174,9 @@ std::vector<table_info> parse_table_infos(const sstring& ks_name, const http_con
std::pair<sstring, std::vector<table_info>> parse_table_infos(const http_context& ctx, const http::request& req, sstring cf_param_name) {
auto keyspace = validate_keyspace(ctx, req);
auto tis = parse_table_infos(keyspace, ctx, req.get_query_param(cf_param_name));
const auto& query_params = req.query_parameters;
auto it = query_params.find(cf_param_name);
auto tis = parse_table_infos(keyspace, ctx, it != query_params.end() ? it->second : "");
return std::make_pair(std::move(keyspace), std::move(tis));
}
@@ -197,7 +198,7 @@ static ss::token_range token_range_endpoints_to_json(const dht::token_range_endp
return r;
}
seastar::future<json::json_return_type> run_toppartitions_query(db::toppartitions_query& q, bool legacy_request) {
seastar::future<json::json_return_type> run_toppartitions_query(db::toppartitions_query& q, http_context &ctx, bool legacy_request) {
return q.scatter().then([&q, legacy_request] {
return sleep(q.duration()).then([&q, legacy_request] {
return q.gather(q.capacity()).then([&q, legacy_request] (auto topk_results) {
@@ -227,36 +228,37 @@ seastar::future<json::json_return_type> run_toppartitions_query(db::toppartition
});
}
scrub_info parse_scrub_options(const http_context& ctx, std::unique_ptr<http::request> req) {
future<scrub_info> parse_scrub_options(const http_context& ctx, sharded<db::snapshot_ctl>& snap_ctl, std::unique_ptr<http::request> req) {
scrub_info info;
auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
info.keyspace = std::move(keyspace);
info.column_families = table_infos | std::views::transform(&table_info::name) | std::ranges::to<std::vector>();
auto scrub_mode_str = req->get_query_param("scrub_mode");
auto scrub_mode = compaction::compaction_type_options::scrub::mode::abort;
auto scrub_mode = sstables::compaction_type_options::scrub::mode::abort;
if (scrub_mode_str.empty()) {
const auto skip_corrupted = validate_bool_x(req->get_query_param("skip_corrupted"), false);
if (skip_corrupted) {
scrub_mode = compaction::compaction_type_options::scrub::mode::skip;
scrub_mode = sstables::compaction_type_options::scrub::mode::skip;
}
} else {
if (scrub_mode_str == "ABORT") {
scrub_mode = compaction::compaction_type_options::scrub::mode::abort;
scrub_mode = sstables::compaction_type_options::scrub::mode::abort;
} else if (scrub_mode_str == "SKIP") {
scrub_mode = compaction::compaction_type_options::scrub::mode::skip;
scrub_mode = sstables::compaction_type_options::scrub::mode::skip;
} else if (scrub_mode_str == "SEGREGATE") {
scrub_mode = compaction::compaction_type_options::scrub::mode::segregate;
scrub_mode = sstables::compaction_type_options::scrub::mode::segregate;
} else if (scrub_mode_str == "VALIDATE") {
scrub_mode = compaction::compaction_type_options::scrub::mode::validate;
scrub_mode = sstables::compaction_type_options::scrub::mode::validate;
} else {
throw httpd::bad_param_exception(fmt::format("Unknown argument for 'scrub_mode' parameter: {}", scrub_mode_str));
}
}
if (!req_param<bool>(*req, "disable_snapshot", false) && !info.column_families.empty()) {
info.snapshot_tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, tag, db::snapshot_ctl::skip_flush::no);
}
info.opts = {
@@ -264,23 +266,16 @@ scrub_info parse_scrub_options(const http_context& ctx, std::unique_ptr<http::re
};
const sstring quarantine_mode_str = req_param<sstring>(*req, "quarantine_mode", "INCLUDE");
if (quarantine_mode_str == "INCLUDE") {
info.opts.quarantine_operation_mode = compaction::compaction_type_options::scrub::quarantine_mode::include;
info.opts.quarantine_operation_mode = sstables::compaction_type_options::scrub::quarantine_mode::include;
} else if (quarantine_mode_str == "EXCLUDE") {
info.opts.quarantine_operation_mode = compaction::compaction_type_options::scrub::quarantine_mode::exclude;
info.opts.quarantine_operation_mode = sstables::compaction_type_options::scrub::quarantine_mode::exclude;
} else if (quarantine_mode_str == "ONLY") {
info.opts.quarantine_operation_mode = compaction::compaction_type_options::scrub::quarantine_mode::only;
info.opts.quarantine_operation_mode = sstables::compaction_type_options::scrub::quarantine_mode::only;
} else {
throw httpd::bad_param_exception(fmt::format("Unknown argument for 'quarantine_mode' parameter: {}", quarantine_mode_str));
}
if(req_param<bool>(*req, "drop_unfixable_sstables", false)) {
if(scrub_mode != compaction::compaction_type_options::scrub::mode::segregate) {
throw httpd::bad_param_exception("The 'drop_unfixable_sstables' parameter is only valid when 'scrub_mode' is 'SEGREGATE'");
}
info.opts.drop_unfixable = compaction::compaction_type_options::scrub::drop_unfixable_sstables::yes;
}
return info;
co_return info;
}
void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl) {
@@ -337,7 +332,7 @@ void set_repair(http_context& ctx, routes& r, sharded<repair_service>& repair, s
// Nodetool still sends those unsupported options. Ignore them to avoid failing nodetool repair.
static std::unordered_set<sstring> legacy_options_to_ignore = {"pullRepair", "ignoreUnreplicatedKeyspaces"};
for (auto& x : req->get_query_params()) {
for (auto& x : req->query_parameters) {
if (legacy_options_to_ignore.contains(x.first)) {
continue;
}
@@ -505,8 +500,9 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
auto prefix = req->get_query_param("prefix");
auto scope = parse_stream_scope(req->get_query_param("scope"));
rjson::chunked_content content = co_await util::read_entire_stream(*req->content_stream);
rjson::value parsed = rjson::parse(std::move(content));
// TODO: the http_server backing the API does not use content streaming
// should use it for better performance
rjson::value parsed = rjson::parse(req->content);
if (!parsed.IsArray()) {
throw httpd::bad_param_exception("malformatted sstables in body");
}
@@ -534,35 +530,10 @@ void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_build
});
});
cf::get_built_indexes.set(r, [&vb](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto [ks, cf_name] = parse_fully_qualified_cf_name(req->get_path_param("name"));
// Use of load_built_views() as filtering table should be in sync with
// built_indexes_virtual_reader filtering with BUILT_VIEWS table
std::vector<db::system_keyspace::view_name> vn = co_await vb.local().get_sys_ks().load_built_views();
std::set<sstring> vp;
for (auto b : vn) {
if (b.first == ks) {
vp.insert(b.second);
}
}
std::vector<sstring> res;
replica::database& db = vb.local().get_db();
auto uuid = validate_table(db, ks, cf_name);
replica::column_family& cf = db.find_column_family(uuid);
res.reserve(cf.get_index_manager().list_indexes().size());
for (auto&& i : cf.get_index_manager().list_indexes()) {
if (vp.contains(secondary_index::index_table_name(i.metadata().name()))) {
res.emplace_back(i.metadata().name());
}
}
co_return res;
});
}
void unset_view_builder(http_context& ctx, routes& r) {
ss::view_build_statuses.unset(r);
cf::get_built_indexes.unset(r);
}
static future<json::json_return_type> describe_ring_as_json(sharded<service::storage_service>& ss, sstring keyspace) {
@@ -584,8 +555,10 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
token_endpoints = ss.local().get_token_to_endpoint_map();
} else if (!keyspace_name.empty() && !table_name.empty()) {
auto& db = ctx.db.local();
auto tid = validate_table(db, keyspace_name, table_name);
token_endpoints = co_await ss.local().get_tablet_to_endpoint_map(tid);
if (!db.has_schema(keyspace_name, table_name)) {
throw bad_param_exception(fmt::format("Failed to find table {}.{}", keyspace_name, table_name));
}
token_endpoints = co_await ss.local().get_tablet_to_endpoint_map(db.find_schema(keyspace_name, table_name)->id());
} else {
throw bad_param_exception("Either provide both keyspace and table (for tablet table) or neither (for vnodes)");
}
@@ -604,8 +577,9 @@ rest_toppartitions_generic(http_context& ctx, std::unique_ptr<http::request> req
bool filters_provided = false;
std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
if (req->query_parameters.contains("table_filters")) {
filters_provided = true;
auto filters = req->get_query_param("table_filters");
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
@@ -615,8 +589,9 @@ rest_toppartitions_generic(http_context& ctx, std::unique_ptr<http::request> req
}
std::unordered_set<sstring> keyspace_filters {};
if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
if (req->query_parameters.contains("keyspace_filters")) {
filters_provided = true;
auto filters = req->get_query_param("keyspace_filters");
std::stringstream ss { filters };
std::string filter;
while (!filters.empty() && ss.good()) {
@@ -643,8 +618,8 @@ rest_toppartitions_generic(http_context& ctx, std::unique_ptr<http::request> req
apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
!table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
return seastar::do_with(db::toppartitions_query(ctx.db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
return run_toppartitions_query(q);
return seastar::do_with(db::toppartitions_query(ctx.db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [&ctx] (db::toppartitions_query& q) {
return run_toppartitions_query(q, ctx);
});
}
@@ -671,16 +646,21 @@ future<json::json_return_type>
rest_get_range_to_endpoint_map(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
auto keyspace = validate_keyspace(ctx, req);
auto table = req->get_query_param("cf");
std::optional<table_id> table_id;
if (table.empty()) {
ensure_tablets_disabled(ctx, keyspace, "storage_service/range_to_endpoint_map");
} else {
table_id = validate_table(ctx.db.local(), keyspace, table);
}
auto erm = std::invoke([&]() -> locator::effective_replication_map_ptr {
auto& ks = ctx.db.local().find_keyspace(keyspace);
if (table.empty()) {
ensure_tablets_disabled(ctx, keyspace, "storage_service/range_to_endpoint_map");
return ks.get_static_effective_replication_map();
} else {
auto table_id = validate_table(ctx.db.local(), keyspace, table);
auto& cf = ctx.db.local().find_column_family(table_id);
return cf.get_effective_replication_map();
}
});
std::vector<ss::maplist_mapper> res;
co_return stream_range_as_array(co_await ss.local().get_range_to_address_map(keyspace, table_id),
co_return stream_range_as_array(co_await ss.local().get_range_to_address_map(erm),
[](const std::pair<dht::token_range, inet_address_vector_replica_set>& entry){
ss::maplist_mapper m;
if (entry.first.start()) {
@@ -725,6 +705,12 @@ rest_describe_ring(http_context& ctx, sharded<service::storage_service>& ss, std
return describe_ring_as_json(ss, validate_keyspace(ctx, req));
}
static
future<json::json_return_type>
rest_get_load(http_context& ctx, std::unique_ptr<http::request> req) {
return get_cf_stats(ctx, &replica::column_family_stats::live_disk_space_used);
}
static
future<json::json_return_type>
rest_get_current_generation_number(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -742,14 +728,6 @@ rest_get_natural_endpoints(http_context& ctx, sharded<service::storage_service>&
return res | std::views::transform([] (auto& ep) { return fmt::to_string(ep); }) | std::ranges::to<std::vector>();
}
static
json::json_return_type
rest_get_natural_endpoints_v2(http_context& ctx, sharded<service::storage_service>& ss, const_req req) {
auto keyspace = validate_keyspace(ctx, req);
auto res = ss.local().get_natural_endpoints(keyspace, req.get_query_param("cf"), req.get_query_param_array("key_component"));
return res | std::views::transform([] (auto& ep) { return fmt::to_string(ep); }) | std::ranges::to<std::vector>();
}
static
future<json::json_return_type>
rest_cdc_streams_check_and_repair(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -760,6 +738,68 @@ rest_cdc_streams_check_and_repair(sharded<service::storage_service>& ss, std::un
});
}
static
future<json::json_return_type>
rest_force_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
auto& db = ctx.db;
auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
apilog.info("force_compaction: flush={} consider_only_existing_data={}", flush, consider_only_existing_data);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<flush_mode> fmopt;
if (!flush && !consider_only_existing_data) {
fmopt = flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<global_major_compaction_task_impl>({}, db, fmopt, consider_only_existing_data);
co_await task->done();
co_return json_void();
}
static
future<json::json_return_type>
rest_force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
auto& db = ctx.db;
auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<flush_mode> fmopt;
if (!flush && !consider_only_existing_data) {
fmopt = flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
co_await task->done();
co_return json_void();
}
static
future<json::json_return_type>
rest_force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
auto& db = ctx.db;
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
if (rs.is_local() || !rs.is_vnode_based()) {
auto reason = rs.is_local() ? "require" : "support";
apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
co_return json::json_return_type(0);
}
apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
auto msg = "Can not perform cleanup operation when topology changes";
apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
co_await coroutine::return_exception(std::runtime_error(msg));
}
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>(
{}, std::move(keyspace), db, table_infos, flush_mode::all_tables, tasks::is_user_task::yes);
co_await task->done();
co_return json::json_return_type(0);
}
static
future<json::json_return_type>
rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -768,7 +808,7 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::
if (!ss.is_topology_coordinator_enabled()) {
co_return false;
}
co_await ss.do_clusterwide_vnodes_cleanup();
co_await ss.do_cluster_cleanup();
co_return true;
});
if (done) {
@@ -777,7 +817,34 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::
// fall back to the local global cleanup if topology coordinator is not enabled
auto& db = ctx.db;
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::global_cleanup_compaction_task_impl>({}, db);
auto task = co_await compaction_module.make_and_start_task<global_cleanup_compaction_task_impl>({}, db);
co_await task->done();
co_return json::json_return_type(0);
}
static
future<json::json_return_type>
rest_perform_keyspace_offstrategy_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, table_infos);
bool res = false;
auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, &res);
co_await task->done();
co_return json::json_return_type(res);
}
static
future<json::json_return_type>
rest_upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req) {
auto& db = ctx.db;
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
co_await task->done();
co_return json::json_return_type(0);
}
@@ -901,9 +968,9 @@ rest_is_starting(sharded<service::storage_service>& ss, std::unique_ptr<http::re
static
future<json::json_return_type>
rest_get_drain_progress(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
return ss.map_reduce(adder<replica::database::drain_progress>(), [] (auto& ss) {
return ss.get_database().get_drain_progress();
rest_get_drain_progress(http_context& ctx, std::unique_ptr<http::request> req) {
return ctx.db.map_reduce(adder<replica::database::drain_progress>(), [] (auto& db) {
return db.get_drain_progress();
}).then([] (auto&& progress) {
auto progress_str = format("Drained {}/{} ColumnFamilies", progress.remaining_cfs, progress.total_cfs);
return make_ready_future<json::json_return_type>(std::move(progress_str));
@@ -919,6 +986,28 @@ rest_drain(sharded<service::storage_service>& ss, std::unique_ptr<http::request>
});
}
static
json::json_return_type
rest_get_keyspaces(http_context& ctx, const_req req) {
auto type = req.get_query_param("type");
auto replication = req.get_query_param("replication");
std::vector<sstring> keyspaces;
if (type == "user") {
keyspaces = ctx.db.local().get_user_keyspaces();
} else if (type == "non_local_strategy") {
keyspaces = ctx.db.local().get_non_local_strategy_keyspaces();
} else {
keyspaces = ctx.db.local().get_all_keyspaces();
}
if (replication.empty() || replication == "all") {
return keyspaces;
}
const auto want_tablets = replication == "tablets";
return keyspaces | std::views::filter([&ctx, want_tablets] (const sstring& ks) {
return ctx.db.local().find_keyspace(ks).get_replication_strategy().uses_tablets() == want_tablets;
}) | std::ranges::to<std::vector>();
}
static
future<json::json_return_type>
rest_stop_gossiping(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -1235,6 +1324,12 @@ rest_set_hinted_handoff_throttle_in_kb(std::unique_ptr<http::request> req) {
return make_ready_future<json::json_return_type>(json_void());
}
static
future<json::json_return_type>
rest_get_metrics_load(http_context& ctx, std::unique_ptr<http::request> req) {
return get_cf_stats(ctx, &replica::column_family_stats::live_disk_space_used);
}
static
json::json_return_type
rest_get_exceptions(sharded<service::storage_service>& ss, const_req req) {
@@ -1635,10 +1730,6 @@ rest_repair_tablet(http_context& ctx, sharded<service::storage_service>& ss, std
if (!await.empty()) {
await_completion = validate_bool(await);
}
// Use regular mode if the incremental_mode option is not provided by user.
auto incremental = req->get_query_param("incremental_mode");
auto incremental_mode = incremental.empty() ? locator::default_tablet_repair_incremental_mode : locator::tablet_repair_incremental_mode_from_string(incremental);
auto table_id = validate_table(ctx.db.local(), ks, table);
std::variant<utils::chunked_vector<dht::token>, service::storage_service::all_tokens_tag> tokens_variant;
if (all_tokens) {
@@ -1661,12 +1752,8 @@ rest_repair_tablet(http_context& ctx, sharded<service::storage_service>& ss, std
}) | std::ranges::to<std::unordered_set>();
}
auto dcs_filter = locator::tablet_task_info::deserialize_repair_dcs_filter(dcs);
try {
auto res = co_await ss.local().add_repair_tablet_request(table_id, tokens_variant, hosts_filter, dcs_filter, await_completion, incremental_mode);
co_return json::json_return_type(res);
} catch (std::invalid_argument& e) {
throw httpd::bad_param_exception(e.what());
}
auto res = co_await ss.local().add_repair_tablet_request(table_id, tokens_variant, hosts_filter, dcs_filter, await_completion);
co_return json::json_return_type(res);
}
static
@@ -1707,7 +1794,8 @@ rest_drop_quarantined_sstables(http_context& ctx, sharded<service::storage_servi
try {
if (!keyspace.empty()) {
keyspace = validate_keyspace(ctx, keyspace);
auto table_infos = parse_table_infos(keyspace, ctx, req->get_query_param("tables"));
auto it = req->query_parameters.find("tables");
auto table_infos = parse_table_infos(keyspace, ctx, it != req->query_parameters.end() ? it->second : "");
co_await ctx.db.invoke_on_all([&table_infos](replica::database& db) -> future<> {
return parallel_for_each(table_infos, [&db](const auto& table) -> future<> {
@@ -1759,11 +1847,16 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::get_range_to_endpoint_map.set(r, rest_bind(rest_get_range_to_endpoint_map, ctx, ss));
ss::get_pending_range_to_endpoint_map.set(r, rest_bind(rest_get_pending_range_to_endpoint_map, ctx));
ss::describe_ring.set(r, rest_bind(rest_describe_ring, ctx, ss));
ss::get_load.set(r, rest_bind(rest_get_load, ctx));
ss::get_current_generation_number.set(r, rest_bind(rest_get_current_generation_number, ss));
ss::get_natural_endpoints.set(r, rest_bind(rest_get_natural_endpoints, ctx, ss));
ss::get_natural_endpoints_v2.set(r, rest_bind(rest_get_natural_endpoints_v2, ctx, ss));
ss::cdc_streams_check_and_repair.set(r, rest_bind(rest_cdc_streams_check_and_repair, ss));
ss::force_compaction.set(r, rest_bind(rest_force_compaction, ctx));
ss::force_keyspace_compaction.set(r, rest_bind(rest_force_keyspace_compaction, ctx));
ss::force_keyspace_cleanup.set(r, rest_bind(rest_force_keyspace_cleanup, ctx, ss));
ss::cleanup_all.set(r, rest_bind(rest_cleanup_all, ctx, ss));
ss::perform_keyspace_offstrategy_compaction.set(r, rest_bind(rest_perform_keyspace_offstrategy_compaction, ctx));
ss::upgrade_sstables.set(r, rest_bind(rest_upgrade_sstables, ctx));
ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
ss::decommission.set(r, rest_bind(rest_decommission, ss));
@@ -1775,8 +1868,9 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::get_logging_levels.set(r, rest_bind(rest_get_logging_levels));
ss::get_operation_mode.set(r, rest_bind(rest_get_operation_mode, ss));
ss::is_starting.set(r, rest_bind(rest_is_starting, ss));
ss::get_drain_progress.set(r, rest_bind(rest_get_drain_progress, ss));
ss::get_drain_progress.set(r, rest_bind(rest_get_drain_progress, ctx));
ss::drain.set(r, rest_bind(rest_drain, ss));
ss::get_keyspaces.set(r, rest_bind(rest_get_keyspaces, ctx));
ss::stop_gossiping.set(r, rest_bind(rest_stop_gossiping, ss));
ss::start_gossiping.set(r, rest_bind(rest_start_gossiping, ss));
ss::is_gossip_running.set(r, rest_bind(rest_is_gossip_running, ss));
@@ -1806,6 +1900,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::get_batch_size_failure_threshold.set(r, rest_bind(rest_get_batch_size_failure_threshold));
ss::set_batch_size_failure_threshold.set(r, rest_bind(rest_set_batch_size_failure_threshold));
ss::set_hinted_handoff_throttle_in_kb.set(r, rest_bind(rest_set_hinted_handoff_throttle_in_kb));
ss::get_metrics_load.set(r, rest_bind(rest_get_metrics_load, ctx));
ss::get_exceptions.set(r, rest_bind(rest_get_exceptions, ss));
ss::get_total_hints_in_progress.set(r, rest_bind(rest_get_total_hints_in_progress));
ss::get_total_hints.set(r, rest_bind(rest_get_total_hints));
@@ -1837,10 +1932,16 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::get_range_to_endpoint_map.unset(r);
ss::get_pending_range_to_endpoint_map.unset(r);
ss::describe_ring.unset(r);
ss::get_load.unset(r);
ss::get_current_generation_number.unset(r);
ss::get_natural_endpoints.unset(r);
ss::cdc_streams_check_and_repair.unset(r);
ss::force_compaction.unset(r);
ss::force_keyspace_compaction.unset(r);
ss::force_keyspace_cleanup.unset(r);
ss::cleanup_all.unset(r);
ss::perform_keyspace_offstrategy_compaction.unset(r);
ss::upgrade_sstables.unset(r);
ss::force_flush.unset(r);
ss::force_keyspace_flush.unset(r);
ss::decommission.unset(r);
@@ -1854,6 +1955,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::is_starting.unset(r);
ss::get_drain_progress.unset(r);
ss::drain.unset(r);
ss::get_keyspaces.unset(r);
ss::stop_gossiping.unset(r);
ss::start_gossiping.unset(r);
ss::is_gossip_running.unset(r);
@@ -1883,6 +1985,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::get_batch_size_failure_threshold.unset(r);
ss::set_batch_size_failure_threshold.unset(r);
ss::set_hinted_handoff_throttle_in_kb.unset(r);
ss::get_metrics_load.unset(r);
ss::get_exceptions.unset(r);
ss::get_total_hints_in_progress.unset(r);
ss::get_total_hints.unset(r);
@@ -1924,7 +2027,7 @@ void unset_load_meter(http_context& ctx, routes& r) {
void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_ctl) {
ss::get_snapshot_details.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto result = co_await snap_ctl.local().get_snapshot_details();
co_return noncopyable_function<future<> (output_stream<char>&&)>([res = std::move(result)] (output_stream<char>&& o) -> future<> {
co_return std::function([res = std::move(result)] (output_stream<char>&& o) -> future<> {
std::exception_ptr ex;
output_stream<char> out = std::move(o);
try {
@@ -1964,7 +2067,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
});
ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
apilog.info("take_snapshot: {}", req->get_query_params());
apilog.info("take_snapshot: {}", req->query_parameters);
auto tag = req->get_query_param("tag");
auto column_families = split(req->get_query_param("cf"), ",");
auto sfopt = req->get_query_param("sf");
@@ -1991,7 +2094,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
});
ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
apilog.info("del_snapshot: {}", req->get_query_params());
apilog.info("del_snapshot: {}", req->query_parameters);
auto tag = req->get_query_param("tag");
auto column_family = req->get_query_param("cf");
@@ -2013,21 +2116,17 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
ss::scrub.set(r, [&ctx, &snap_ctl] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto info = parse_scrub_options(ctx, std::move(req));
auto info = co_await parse_scrub_options(ctx, snap_ctl, std::move(req));
if (!info.snapshot_tag.empty()) {
co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, db::snapshot_ctl::skip_flush::no);
}
compaction::compaction_stats stats;
sstables::compaction_stats stats;
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::scrub_sstables_compaction_task_impl>({}, info.keyspace, db, info.column_families, info.opts, &stats);
auto task = co_await compaction_module.make_and_start_task<scrub_sstables_compaction_task_impl>({}, info.keyspace, db, info.column_families, info.opts, &stats);
try {
co_await task->done();
if (stats.validation_errors) {
co_return json::json_return_type(static_cast<int>(scrub_status::validation_errors));
}
} catch (const compaction::compaction_aborted_exception&) {
} catch (const sstables::compaction_aborted_exception&) {
co_return json::json_return_type(static_cast<int>(scrub_status::aborted));
} catch (...) {
apilog.error("scrub keyspace={} tables={} failed: {}", info.keyspace, info.column_families, std::current_exception());

View File

@@ -58,13 +58,12 @@ std::vector<table_info> parse_table_infos(const sstring& ks_name, const http_con
std::pair<sstring, std::vector<table_info>> parse_table_infos(const http_context& ctx, const http::request& req, sstring cf_param_name = "cf");
struct scrub_info {
compaction::compaction_type_options::scrub opts;
sstables::compaction_type_options::scrub opts;
sstring keyspace;
std::vector<sstring> column_families;
sstring snapshot_tag;
};
scrub_info parse_scrub_options(const http_context& ctx, std::unique_ptr<http::request> req);
future<scrub_info> parse_scrub_options(const http_context& ctx, sharded<db::snapshot_ctl>& snap_ctl, std::unique_ptr<http::request> req);
void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, service::raft_group0_client&);
void unset_storage_service(http_context& ctx, httpd::routes& r);
@@ -82,7 +81,7 @@ void set_snapshot(http_context& ctx, httpd::routes& r, sharded<db::snapshot_ctl>
void unset_snapshot(http_context& ctx, httpd::routes& r);
void set_load_meter(http_context& ctx, httpd::routes& r, service::load_meter& lm);
void unset_load_meter(http_context& ctx, httpd::routes& r);
seastar::future<json::json_return_type> run_toppartitions_query(db::toppartitions_query& q, bool legacy_request = false);
seastar::future<json::json_return_type> run_toppartitions_query(db::toppartitions_query& q, http_context &ctx, bool legacy_request = false);
// converts string value of boolean parameter into bool
// maps (case insensitively)

View File

@@ -10,7 +10,7 @@
#include "api/api-doc/system.json.hh"
#include "api/api-doc/metrics.json.hh"
#include "replica/database.hh"
#include "sstables/sstables_manager.hh"
#include "db/sstables-format-selector.hh"
#include <rapidjson/document.h>
#include <boost/lexical_cast.hpp>
@@ -54,8 +54,7 @@ void set_system(http_context& ctx, routes& r) {
hm::set_metrics_config.set(r, [](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
rapidjson::Document doc;
auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
doc.Parse(content.c_str());
doc.Parse(req->content.c_str());
if (!doc.IsArray()) {
throw bad_param_exception("Expected a json array");
}
@@ -88,19 +87,21 @@ void set_system(http_context& ctx, routes& r) {
relabels[i].expr = element["regex"].GetString();
}
}
bool failed = false;
co_await smp::invoke_on_all([&relabels, &failed] {
return metrics::set_relabel_configs(relabels).then([&failed](const metrics::metric_relabeling_result& result) {
if (result.metrics_relabeled_due_to_collision > 0) {
failed = true;
return do_with(std::move(relabels), false, [](const std::vector<seastar::metrics::relabel_config>& relabels, bool& failed) {
return smp::invoke_on_all([&relabels, &failed] {
return metrics::set_relabel_configs(relabels).then([&failed](const metrics::metric_relabeling_result& result) {
if (result.metrics_relabeled_due_to_collision > 0) {
failed = true;
}
return;
});
}).then([&failed](){
if (failed) {
throw bad_param_exception("conflicts found during relabeling");
}
return;
return make_ready_future<json::json_return_type>(seastar::json::json_void());
});
});
if (failed) {
throw bad_param_exception("conflicts found during relabeling");
}
co_return seastar::json::json_void();
});
hs::get_system_uptime.set(r, [](const_req req) {
@@ -183,13 +184,18 @@ void set_system(http_context& ctx, routes& r) {
apilog.info("Profile dumped to {}", profile_dest);
return make_ready_future<json::json_return_type>(json::json_return_type(json::json_void()));
}) ;
}
hs::get_highest_supported_sstable_version.set(r, [&ctx] (std::unique_ptr<request> req) {
return smp::submit_to(0, [&ctx] {
auto format = ctx.db.local().get_user_sstables_manager().get_highest_supported_format();
return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
void set_format_selector(http_context& ctx, routes& r, db::sstables_format_selector& sel) {
hs::get_highest_supported_sstable_version.set(r, [&sel] (std::unique_ptr<request> req) {
return smp::submit_to(0, [&sel] {
return make_ready_future<json::json_return_type>(seastar::to_sstring(sel.selected_format()));
});
});
}
void unset_format_selector(http_context& ctx, routes& r) {
hs::get_highest_supported_sstable_version.unset(r);
}
}

View File

@@ -12,9 +12,14 @@ namespace seastar::httpd {
class routes;
}
namespace db { class sstables_format_selector; }
namespace api {
struct http_context;
void set_system(http_context& ctx, seastar::httpd::routes& r);
void set_format_selector(http_context& ctx, seastar::httpd::routes& r, db::sstables_format_selector& sel);
void unset_format_selector(http_context& ctx, seastar::httpd::routes& r);
}

View File

@@ -6,7 +6,6 @@
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include <seastar/core/chunked_fifo.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/exception.hh>
#include <seastar/http/exception.hh>
@@ -35,9 +34,8 @@ static ::tm get_time(db_clock::time_point tp) {
}
tm::task_status make_status(tasks::task_status status, sharded<gms::gossiper>& gossiper) {
chunked_fifo<tm::task_identity> tis;
tis.reserve(status.children.size());
for (const auto& child : status.children) {
std::vector<tm::task_identity> tis{status.children.size()};
std::ranges::transform(status.children, tis.begin(), [&gossiper] (const auto& child) {
tm::task_identity ident;
gms::inet_address addr{};
if (gossiper.local_is_initialized()) {
@@ -45,8 +43,8 @@ tm::task_status make_status(tasks::task_status status, sharded<gms::gossiper>& g
}
ident.task_id = child.task_id.to_sstring();
ident.node = fmt::format("{}", addr);
tis.push_back(std::move(ident));
}
return ident;
});
tm::task_status res{};
res.id = status.task_id.to_sstring();
@@ -107,11 +105,11 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
throw bad_param_exception(fmt::format("{}", std::current_exception()));
}
if (auto param = req->get_query_param("keyspace"); !param.empty()) {
keyspace = param;
if (auto it = req->query_parameters.find("keyspace"); it != req->query_parameters.end()) {
keyspace = it->second;
}
if (auto param = req->get_query_param("table"); !param.empty()) {
table = param;
if (auto it = req->query_parameters.find("table"); it != req->query_parameters.end()) {
table = it->second;
}
return module->get_stats(internal, [keyspace = std::move(keyspace), table = std::move(table)] (std::string& ks, std::string& t) {
@@ -119,7 +117,7 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
});
});
noncopyable_function<future<>(output_stream<char>&&)> f = [r = std::move(res)] (output_stream<char>&& os) -> future<> {
std::function<future<>(output_stream<char>&&)> f = [r = std::move(res)] (output_stream<char>&& os) -> future<> {
auto s = std::move(os);
std::exception_ptr ex;
try {
@@ -175,8 +173,8 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
tasks::task_status status;
std::optional<std::chrono::seconds> timeout = std::nullopt;
if (auto param = req->get_query_param("timeout"); !param.empty()) {
timeout = std::chrono::seconds(boost::lexical_cast<uint32_t>(param));
if (auto it = req->query_parameters.find("timeout"); it != req->query_parameters.end()) {
timeout = std::chrono::seconds(boost::lexical_cast<uint32_t>(it->second));
}
try {
auto task = tasks::task_handler{tm.local(), id};
@@ -196,7 +194,7 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
auto task = tasks::task_handler{tm.local(), id};
auto res = co_await task.get_status_recursively(true);
noncopyable_function<future<>(output_stream<char>&&)> f = [r = std::move(res), &gossiper] (output_stream<char>&& os) -> future<> {
std::function<future<>(output_stream<char>&&)> f = [r = std::move(res), &gossiper] (output_stream<char>&& os) -> future<> {
auto s = std::move(os);
auto res = std::move(r);
co_await s.write("[");
@@ -217,7 +215,7 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
tm::get_and_update_ttl.set(r, [&cfg] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
uint32_t ttl = cfg.task_ttl_seconds();
try {
co_await cfg.task_ttl_seconds.set_value_on_all_shards(req->get_query_param("ttl"), utils::config_file::config_source::API);
co_await cfg.task_ttl_seconds.set_value_on_all_shards(req->query_parameters["ttl"], utils::config_file::config_source::API);
} catch (...) {
throw bad_param_exception(fmt::format("{}", std::current_exception()));
}
@@ -232,7 +230,7 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
tm::get_and_update_user_ttl.set(r, [&cfg] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
uint32_t user_ttl = cfg.user_task_ttl_seconds();
try {
co_await cfg.user_task_ttl_seconds.set_value_on_all_shards(req->get_query_param("user_ttl"), utils::config_file::config_source::API);
co_await cfg.user_task_ttl_seconds.set_value_on_all_shards(req->query_parameters["user_ttl"], utils::config_file::config_source::API);
} catch (...) {
throw bad_param_exception(fmt::format("{}", std::current_exception()));
}

View File

@@ -57,16 +57,20 @@ void set_task_manager_test(http_context& ctx, routes& r, sharded<tasks::task_man
tmt::register_test_task.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
sharded<tasks::task_manager>& tms = tm;
const auto id_param = req->get_query_param("task_id");
auto id = !id_param.empty() ? tasks::task_id{utils::UUID{id_param}} : tasks::task_id::create_null_id();
const auto shard_param = req->get_query_param("shard");
unsigned shard = shard_param.empty() ? 0 : boost::lexical_cast<unsigned>(shard_param);
std::string keyspace = req->get_query_param("keyspace");
std::string table = req->get_query_param("table");
std::string entity = req->get_query_param("entity");
auto it = req->query_parameters.find("task_id");
auto id = it != req->query_parameters.end() ? tasks::task_id{utils::UUID{it->second}} : tasks::task_id::create_null_id();
it = req->query_parameters.find("shard");
unsigned shard = it != req->query_parameters.end() ? boost::lexical_cast<unsigned>(it->second) : 0;
it = req->query_parameters.find("keyspace");
std::string keyspace = it != req->query_parameters.end() ? it->second : "";
it = req->query_parameters.find("table");
std::string table = it != req->query_parameters.end() ? it->second : "";
it = req->query_parameters.find("entity");
std::string entity = it != req->query_parameters.end() ? it->second : "";
it = req->query_parameters.find("parent_id");
tasks::task_info data;
if (auto parent_id = req->get_query_param("parent_id"); !parent_id.empty()) {
data.id = tasks::task_id{utils::UUID{parent_id}};
if (it != req->query_parameters.end()) {
data.id = tasks::task_id{utils::UUID{it->second}};
auto parent_ptr = co_await tasks::task_manager::lookup_task_on_all_shards(tm, data.id);
data.shard = parent_ptr->get_status().shard;
}
@@ -84,7 +88,7 @@ void set_task_manager_test(http_context& ctx, routes& r, sharded<tasks::task_man
});
tmt::unregister_test_task.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto id = tasks::task_id{utils::UUID{req->get_query_param("task_id")}};
auto id = tasks::task_id{utils::UUID{req->query_parameters["task_id"]}};
try {
co_await tasks::task_manager::invoke_on_task(tm, id, [] (tasks::task_manager::task_variant task_v, tasks::virtual_task_hint) -> future<> {
return std::visit(overloaded_functor{
@@ -105,8 +109,9 @@ void set_task_manager_test(http_context& ctx, routes& r, sharded<tasks::task_man
tmt::finish_test_task.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
std::string error = req->get_query_param("error");
bool fail = !error.empty();
auto it = req->query_parameters.find("error");
bool fail = it != req->query_parameters.end();
std::string error = fail ? it->second : "";
try {
co_await tasks::task_manager::invoke_on_task(tm, id, [fail, error = std::move(error)] (tasks::task_manager::task_variant task_v, tasks::virtual_task_hint) -> future<> {

View File

@@ -12,7 +12,6 @@
#include "api/api.hh"
#include "api/storage_service.hh"
#include "api/api-doc/tasks.json.hh"
#include "api/api-doc/storage_service.json.hh"
#include "compaction/compaction_manager.hh"
#include "compaction/task_manager_module.hh"
#include "service/storage_service.hh"
@@ -26,7 +25,6 @@ extern logging::logger apilog;
namespace api {
namespace t = httpd::tasks_json;
namespace ss = httpd::storage_service_json;
using namespace json;
using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<http::request>, sstring, std::vector<table_info>)>;
@@ -46,88 +44,39 @@ void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::
apilog.debug("force_keyspace_compaction_async: keyspace={} tables={}, flush={}", keyspace, table_infos, flush);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<compaction::flush_mode> fmopt;
std::optional<flush_mode> fmopt;
if (!flush) {
fmopt = compaction::flush_mode::skip;
fmopt = flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<compaction::major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt);
auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt);
co_return json::json_return_type(task->get_status().id.to_sstring());
});
ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<compaction::flush_mode> fmopt;
if (!flush && !consider_only_existing_data) {
fmopt = compaction::flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<compaction::major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
co_await task->done();
co_return json_void();
});
t::force_keyspace_cleanup_async.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
apilog.info("force_keyspace_cleanup_async: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_vnodes_cleanup_allowed(keyspace)) {
if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
auto msg = "Can not perform cleanup operation when topology changes";
apilog.warn("force_keyspace_cleanup_async: keyspace={} tables={}: {}", keyspace, table_infos, msg);
co_await coroutine::return_exception(std::runtime_error(msg));
}
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, table_infos, compaction::flush_mode::all_tables, tasks::is_user_task::yes);
auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, table_infos, flush_mode::all_tables, tasks::is_user_task::yes);
co_return json::json_return_type(task->get_status().id.to_sstring());
});
ss::force_keyspace_cleanup.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
if (rs.is_local() || !rs.is_vnode_based()) {
auto reason = rs.is_local() ? "require" : "support";
apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
co_return json::json_return_type(0);
}
apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_vnodes_cleanup_allowed(keyspace)) {
auto msg = "Can not perform cleanup operation when topology changes";
apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
co_await coroutine::return_exception(std::runtime_error(msg));
}
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::cleanup_keyspace_compaction_task_impl>(
{}, std::move(keyspace), db, table_infos, compaction::flush_mode::all_tables, tasks::is_user_task::yes);
co_await task->done();
co_return json::json_return_type(0);
});
t::perform_keyspace_offstrategy_compaction_async.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, table_infos);
auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, nullptr);
auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, nullptr);
co_return json::json_return_type(task->get_status().id.to_sstring());
}));
ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, table_infos);
bool res = false;
auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, &res);
co_await task->done();
co_return json::json_return_type(res);
}));
t::upgrade_sstables_async.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
auto& db = ctx.db;
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
@@ -135,65 +84,28 @@ void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
co_return json::json_return_type(task->get_status().id.to_sstring());
}));
ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
auto& db = ctx.db;
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
co_await task->done();
co_return json::json_return_type(0);
}));
t::scrub_async.set(r, [&ctx, &snap_ctl] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto info = parse_scrub_options(ctx, std::move(req));
if (!info.snapshot_tag.empty()) {
co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, db::snapshot_ctl::skip_flush::no);
}
auto info = co_await parse_scrub_options(ctx, snap_ctl, std::move(req));
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<compaction::scrub_sstables_compaction_task_impl>({}, std::move(info.keyspace), db, std::move(info.column_families), info.opts, nullptr);
auto task = co_await compaction_module.make_and_start_task<scrub_sstables_compaction_task_impl>({}, std::move(info.keyspace), db, std::move(info.column_families), info.opts, nullptr);
co_return json::json_return_type(task->get_status().id.to_sstring());
});
ss::force_compaction.set(r, [&ctx] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
apilog.info("force_compaction: flush={} consider_only_existing_data={}", flush, consider_only_existing_data);
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
std::optional<compaction::flush_mode> fmopt;
if (!flush && !consider_only_existing_data) {
fmopt = compaction::flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<compaction::global_major_compaction_task_impl>({}, db, fmopt, consider_only_existing_data);
co_await task->done();
co_return json_void();
});
}
void unset_tasks_compaction_module(http_context& ctx, httpd::routes& r) {
t::force_keyspace_compaction_async.unset(r);
ss::force_keyspace_compaction.unset(r);
t::force_keyspace_cleanup_async.unset(r);
ss::force_keyspace_cleanup.unset(r);
t::perform_keyspace_offstrategy_compaction_async.unset(r);
ss::perform_keyspace_offstrategy_compaction.unset(r);
t::upgrade_sstables_async.unset(r);
ss::upgrade_sstables.unset(r);
t::scrub_async.unset(r);
ss::force_compaction.unset(r);
}
}

View File

@@ -16,7 +16,6 @@
#include "cql3/statements/ks_prop_defs.hh"
#include "service/migration_manager.hh"
#include "service/storage_proxy.hh"
#include "locator/abstract_replication_strategy.hh"
namespace audit {
@@ -65,8 +64,8 @@ future<> audit_cf_storage_helper::migrate_audit_table(service::group0_guard grou
data_dictionary::database db = _qp.db();
cql3::statements::ks_prop_defs old_ks_prop_defs;
auto old_ks_metadata = old_ks_prop_defs.as_ks_metadata_update(
ks->metadata(), *_qp.proxy().get_token_metadata_ptr(), db.features(), db.get_config());
locator::replication_strategy_config_options strategy_opts;
ks->metadata(), *_qp.proxy().get_token_metadata_ptr(), db.features());
std::map<sstring, sstring> strategy_opts;
for (const auto &dc: _qp.proxy().get_token_metadata_ptr()->get_topology().get_datacenters())
strategy_opts[dc] = "3";
@@ -74,7 +73,6 @@ future<> audit_cf_storage_helper::migrate_audit_table(service::group0_guard grou
"org.apache.cassandra.locator.NetworkTopologyStrategy",
strategy_opts,
std::nullopt, // initial_tablets
std::nullopt, // consistency_option
old_ks_metadata->durable_writes(),
old_ks_metadata->get_storage_options(),
old_ks_metadata->tables());

View File

@@ -15,7 +15,7 @@
#include "mutation/canonical_mutation.hh"
#include "schema/schema_fwd.hh"
#include "mutation/timestamp.hh"
#include "timestamp.hh"
#include "utils/assert.hh"
#include "utils/exponential_backoff_retry.hh"
#include "cql3/query_processor.hh"

View File

@@ -233,9 +233,9 @@ future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name,
}
future<role_to_directly_granted_map>
ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
ldap_role_manager::query_all_directly_granted() {
role_to_directly_granted_map result;
auto roles = co_await query_all(qs);
auto roles = co_await query_all();
for (auto& role: roles) {
auto granted_set = co_await query_granted(role, recursive_role_query::no);
for (auto& granted: granted_set) {
@@ -247,8 +247,8 @@ ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
co_return result;
}
future<role_set> ldap_role_manager::query_all(::service::query_state& qs) {
return _std_mgr.query_all(qs);
future<role_set> ldap_role_manager::query_all() {
return _std_mgr.query_all();
}
future<> ldap_role_manager::create_role(std::string_view role_name) {
@@ -311,12 +311,12 @@ future<bool> ldap_role_manager::can_login(std::string_view role_name) {
}
future<std::optional<sstring>> ldap_role_manager::get_attribute(
std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
return _std_mgr.get_attribute(role_name, attribute_name, qs);
std::string_view role_name, std::string_view attribute_name) {
return _std_mgr.get_attribute(role_name, attribute_name);
}
future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
return _std_mgr.query_attribute_for_all(attribute_name, qs);
future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name) {
return _std_mgr.query_attribute_for_all(attribute_name);
}
future<> ldap_role_manager::set_attribute(

View File

@@ -75,9 +75,9 @@ class ldap_role_manager : public role_manager {
future<role_set> query_granted(std::string_view, recursive_role_query) override;
future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;
future<role_to_directly_granted_map> query_all_directly_granted() override;
future<role_set> query_all(::service::query_state&) override;
future<role_set> query_all() override;
future<bool> exists(std::string_view) override;
@@ -85,9 +85,9 @@ class ldap_role_manager : public role_manager {
future<bool> can_login(std::string_view) override;
future<std::optional<sstring>> get_attribute(std::string_view, std::string_view, ::service::query_state&) override;
future<std::optional<sstring>> get_attribute(std::string_view, std::string_view) override;
future<role_manager::attribute_vals> query_attribute_for_all(std::string_view, ::service::query_state&) override;
future<role_manager::attribute_vals> query_attribute_for_all(std::string_view) override;
future<> set_attribute(std::string_view, std::string_view, std::string_view, ::service::group0_batch& mc) override;

View File

@@ -78,11 +78,11 @@ future<role_set> maintenance_socket_role_manager::query_granted(std::string_view
return operation_not_supported_exception<role_set>("QUERY GRANTED");
}
future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state&) {
future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted() {
return operation_not_supported_exception<role_to_directly_granted_map>("QUERY ALL DIRECTLY GRANTED");
}
future<role_set> maintenance_socket_role_manager::query_all(::service::query_state&) {
future<role_set> maintenance_socket_role_manager::query_all() {
return operation_not_supported_exception<role_set>("QUERY ALL");
}
@@ -98,11 +98,11 @@ future<bool> maintenance_socket_role_manager::can_login(std::string_view role_na
return make_ready_future<bool>(true);
}
future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) {
future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
return operation_not_supported_exception<std::optional<sstring>>("GET ATTRIBUTE");
}
future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) {
future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name) {
return operation_not_supported_exception<role_manager::attribute_vals>("QUERY ATTRIBUTE");
}

View File

@@ -53,9 +53,9 @@ public:
virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;
virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;
virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
virtual future<role_set> query_all(::service::query_state&) override;
virtual future<role_set> query_all() override;
virtual future<bool> exists(std::string_view role_name) override;
@@ -63,9 +63,9 @@ public:
virtual future<bool> can_login(std::string_view role_name) override;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;
virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

View File

@@ -102,11 +102,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
auto username = row.get_as<sstring>("username");
auto salted_hash = row.get_as<sstring>(SALTED_HASH);
static const auto query = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
meta::legacy::AUTH_KS,
meta::roles_table::name,
SALTED_HASH,
meta::roles_table::role_col_name);
static const auto query = update_row_query();
return _qp.execute_internal(
query,
consistency_for_user(username),
@@ -123,7 +119,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
}
future<> password_authenticator::legacy_create_default_if_missing() {
const auto exists = co_await legacy::default_role_row_satisfies(_qp, &has_salted_hash, _superuser);
SCYLLA_ASSERT(legacy_mode(_qp));
const auto exists = co_await default_role_row_satisfies(_qp, &has_salted_hash, _superuser);
if (exists) {
co_return;
}
@@ -131,11 +128,7 @@ future<> password_authenticator::legacy_create_default_if_missing() {
if (salted_pwd.empty()) {
salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
}
const auto query = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
meta::legacy::AUTH_KS,
meta::roles_table::name,
SALTED_HASH,
meta::roles_table::role_col_name);
const auto query = update_row_query();
co_await _qp.execute_internal(
query,
db::consistency_level::QUORUM,
@@ -218,15 +211,11 @@ future<> password_authenticator::start() {
return async([this] {
if (legacy_mode(_qp)) {
if (!_superuser_created_promise.available()) {
// Counterintuitively, we mark promise as ready before any startup work
// because wait_for_schema_agreement() below will block indefinitely
// without cluster majority. In that case, blocking node startup
// would lead to a cluster deadlock.
_superuser_created_promise.set_value();
}
_migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get();
if (legacy::any_nondefault_role_row_satisfies(_qp, &has_salted_hash, _superuser).get()) {
if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash, _superuser).get()) {
if (legacy_metadata_exists()) {
plogger.warn("Ignoring legacy authentication metadata since nondefault data already exist.");
}
@@ -251,21 +240,10 @@ future<> password_authenticator::start() {
});
if (legacy_mode(_qp)) {
static const sstring create_roles_query = fmt::format(
"CREATE TABLE {}.{} ("
" {} text PRIMARY KEY,"
" can_login boolean,"
" is_superuser boolean,"
" member_of set<text>,"
" salted_hash text"
")",
meta::legacy::AUTH_KS,
meta::roles_table::name,
meta::roles_table::role_col_name);
return create_legacy_metadata_table_if_missing(
meta::roles_table::name,
_qp,
create_roles_query,
meta::roles_table::creation_query(),
_migration_manager);
}
return make_ready_future<>();

View File

@@ -36,8 +36,7 @@ static const std::unordered_map<sstring, auth::permission> permission_names({
{"MODIFY", auth::permission::MODIFY},
{"AUTHORIZE", auth::permission::AUTHORIZE},
{"DESCRIBE", auth::permission::DESCRIBE},
{"EXECUTE", auth::permission::EXECUTE},
{"VECTOR_SEARCH_INDEXING", auth::permission::VECTOR_SEARCH_INDEXING}});
{"EXECUTE", auth::permission::EXECUTE}});
const sstring& auth::permissions::to_string(permission p) {
for (auto& v : permission_names) {

View File

@@ -33,7 +33,6 @@ enum class permission {
// data access
SELECT, // required for SELECT.
MODIFY, // required for INSERT, UPDATE, DELETE, TRUNCATE.
VECTOR_SEARCH_INDEXING, // required for SELECT from tables with vector indexes if SELECT permission is not granted.
// permission management
AUTHORIZE, // required for GRANT and REVOKE.
@@ -55,8 +54,7 @@ typedef enum_set<
permission::MODIFY,
permission::AUTHORIZE,
permission::DESCRIBE,
permission::EXECUTE,
permission::VECTOR_SEARCH_INDEXING>> permission_set;
permission::EXECUTE>> permission_set;
bool operator<(const permission_set&, const permission_set&);

View File

@@ -41,26 +41,22 @@ static const std::unordered_map<resource_kind, std::size_t> max_parts{
{resource_kind::functions, 2}};
static permission_set applicable_permissions(const data_resource_view& dv) {
// We only support VECTOR_SEARCH_INDEXING permission for ALL KEYSPACES.
auto set = permission_set::of<
if (dv.table()) {
return permission_set::of<
permission::ALTER,
permission::DROP,
permission::SELECT,
permission::MODIFY,
permission::AUTHORIZE>();
if (!dv.table()) {
set.add(permission_set::of<permission::CREATE>());
}
if (!dv.table() && !dv.keyspace()) {
set.add(permission_set::of<permission::VECTOR_SEARCH_INDEXING>());
}
return set;
return permission_set::of<
permission::CREATE,
permission::ALTER,
permission::DROP,
permission::SELECT,
permission::MODIFY,
permission::AUTHORIZE>();
}
static permission_set applicable_permissions(const role_resource_view& rv) {

View File

@@ -17,17 +17,12 @@
#include <seastar/core/format.hh>
#include <seastar/core/sstring.hh>
#include "auth/common.hh"
#include "auth/resource.hh"
#include "cql3/description.hh"
#include "seastarx.hh"
#include "exceptions/exceptions.hh"
#include "service/raft/raft_group0_client.hh"
namespace service {
class query_state;
};
namespace auth {
struct role_config final {
@@ -172,9 +167,9 @@ public:
/// (role2, role3)
/// }
///
virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state& = internal_distributed_query_state()) = 0;
virtual future<role_to_directly_granted_map> query_all_directly_granted() = 0;
virtual future<role_set> query_all(::service::query_state& = internal_distributed_query_state()) = 0;
virtual future<role_set> query_all() = 0;
virtual future<bool> exists(std::string_view role_name) = 0;
@@ -191,12 +186,12 @@ public:
///
/// \returns the value of the named attribute, if one is set.
///
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) = 0;
///
/// \returns a mapping of each role's value for the named attribute, if one is set for the role.
///
virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;
virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name) = 0;
/// Sets `attribute_name` with `attribute_value` for `role_name`.
/// \returns an exceptional future with nonexistant_role if the role does not exist.

View File

@@ -18,21 +18,43 @@
namespace auth {
namespace legacy {
namespace meta {
namespace roles_table {
std::string_view creation_query() {
static const sstring instance = fmt::format(
"CREATE TABLE {}.{} ("
" {} text PRIMARY KEY,"
" can_login boolean,"
" is_superuser boolean,"
" member_of set<text>,"
" salted_hash text"
")",
meta::legacy::AUTH_KS,
name,
role_col_name);
return instance;
}
} // namespace roles_table
} // namespace meta
future<bool> default_role_row_satisfies(
cql3::query_processor& qp,
std::function<bool(const cql3::untyped_result_set_row&)> p,
std::optional<std::string> rolename) {
const sstring query = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
auth::meta::legacy::AUTH_KS,
get_auth_ks_name(qp),
meta::roles_table::name,
meta::roles_table::role_col_name);
for (auto cl : { db::consistency_level::ONE, db::consistency_level::QUORUM }) {
auto results = co_await qp.execute_internal(query, cl
, internal_distributed_query_state()
, {rolename.value_or(std::string(auth::meta::DEFAULT_SUPERUSER_NAME))}
, {rolename.value_or(std::string(meta::DEFAULT_SUPERUSER_NAME))}
, cql3::query_processor::cache_internal::yes
);
if (!results->empty()) {
@@ -46,7 +68,7 @@ future<bool> any_nondefault_role_row_satisfies(
cql3::query_processor& qp,
std::function<bool(const cql3::untyped_result_set_row&)> p,
std::optional<std::string> rolename) {
const sstring query = seastar::format("SELECT * FROM {}.{}", auth::meta::legacy::AUTH_KS, meta::roles_table::name);
const sstring query = seastar::format("SELECT * FROM {}.{}", get_auth_ks_name(qp), meta::roles_table::name);
auto results = co_await qp.execute_internal(query, db::consistency_level::QUORUM
, internal_distributed_query_state(), cql3::query_processor::cache_internal::no
@@ -63,6 +85,4 @@ future<bool> any_nondefault_role_row_satisfies(
});
}
} // namespace legacy
} // namespace auth
}

View File

@@ -27,15 +27,15 @@ namespace meta {
namespace roles_table {
std::string_view creation_query();
constexpr std::string_view name{"roles", 5};
constexpr std::string_view role_col_name{"role", 4};
} // namespace roles_table
}
} // namespace meta
namespace legacy {
}
///
/// Check that the default role satisfies a predicate, or `false` if the default role does not exist.
@@ -55,6 +55,4 @@ future<bool> any_nondefault_role_row_satisfies(
std::optional<std::string> rolename = {}
);
} // namespace legacy
} // namespace auth
}

View File

@@ -182,7 +182,7 @@ future<> saslauthd_authenticator::alter(std::string_view role_name, const authen
}
future<> saslauthd_authenticator::drop(std::string_view name, ::service::group0_batch& mc) {
return make_ready_future<>();
throw exceptions::authentication_exception("Cannot delete passwords with SaslauthdAuthenticator");
}
future<custom_options> saslauthd_authenticator::query_custom_options(std::string_view role_name) const {

View File

@@ -40,7 +40,7 @@
#include <variant>
#include "service/migration_manager.hh"
#include "service/raft/raft_group0_client.hh"
#include "mutation/timestamp.hh"
#include "timestamp.hh"
#include "utils/assert.hh"
#include "utils/class_registrator.hh"
#include "locator/abstract_replication_strategy.hh"
@@ -215,7 +215,6 @@ future<> service::create_legacy_keyspace_if_missing(::service::migration_manager
meta::legacy::AUTH_KS,
"org.apache.cassandra.locator.SimpleStrategy",
opts,
std::nullopt,
std::nullopt);
try {

View File

@@ -231,17 +231,6 @@ struct command_desc {
} type_ = type::OTHER;
};
/// Similar to command_desc, but used in cases where multiple permissions allow the access to the resource.
struct command_desc_with_permission_set {
permission_set permission;
const ::auth::resource& resource;
enum class type {
ALTER_WITH_OPTS,
ALTER_SYSTEM_WITH_ALLOWED_OPTS,
OTHER
} type_ = type::OTHER;
};
///
/// Protected resources cannot be modified even if the performer has permissions to do so.
///

View File

@@ -50,11 +50,22 @@ constexpr std::string_view name{"role_members" , 12};
}
namespace role_attributes_table {
constexpr std::string_view name{"role_attributes", 15};
}
static std::string_view creation_query() noexcept {
static const sstring instance = seastar::format(
"CREATE TABLE {}.{} ("
" role text,"
" name text,"
" value text,"
" PRIMARY KEY(role, name)"
")",
meta::legacy::AUTH_KS,
name);
return instance;
}
}
}
static logging::logger log("standard_role_manager");
@@ -142,17 +153,6 @@ const resource_set& standard_role_manager::protected_resources() const {
}
future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const {
static const sstring create_roles_query = fmt::format(
"CREATE TABLE {}.{} ("
" {} text PRIMARY KEY,"
" can_login boolean,"
" is_superuser boolean,"
" member_of set<text>,"
" salted_hash text"
")",
meta::legacy::AUTH_KS,
meta::roles_table::name,
meta::roles_table::role_col_name);
static const sstring create_role_members_query = fmt::format(
"CREATE TABLE {}.{} ("
" role text,"
@@ -161,20 +161,13 @@ future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const
")",
meta::legacy::AUTH_KS,
meta::role_members_table::name);
static const sstring create_role_attributes_query = seastar::format(
"CREATE TABLE {}.{} ("
" role text,"
" name text,"
" value text,"
" PRIMARY KEY(role, name)"
")",
meta::legacy::AUTH_KS,
meta::role_attributes_table::name);
return when_all_succeed(
create_legacy_metadata_table_if_missing(
meta::roles_table::name,
_qp,
create_roles_query,
meta::roles_table::creation_query(),
_migration_manager),
create_legacy_metadata_table_if_missing(
meta::role_members_table::name,
@@ -184,18 +177,19 @@ future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const
create_legacy_metadata_table_if_missing(
meta::role_attributes_table::name,
_qp,
create_role_attributes_query,
meta::role_attributes_table::creation_query(),
_migration_manager)).discard_result();
}
future<> standard_role_manager::legacy_create_default_role_if_missing() {
SCYLLA_ASSERT(legacy_mode(_qp));
try {
const auto exists = co_await legacy::default_role_row_satisfies(_qp, &has_can_login, _superuser);
const auto exists = co_await default_role_row_satisfies(_qp, &has_can_login, _superuser);
if (exists) {
co_return;
}
const sstring query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
meta::legacy::AUTH_KS,
get_auth_ks_name(_qp),
meta::roles_table::name,
meta::roles_table::role_col_name);
co_await _qp.execute_internal(
@@ -290,7 +284,7 @@ future<> standard_role_manager::migrate_legacy_metadata() {
std::move(config),
::service::group0_batch::unused(),
[this](const auto& name, const auto& config, auto& mc) {
return create_or_replace(meta::legacy::AUTH_KS, name, config, mc);
return create_or_replace(name, config, mc);
});
}).finally([results] {});
}).then([] {
@@ -311,15 +305,11 @@ future<> standard_role_manager::start() {
const bool legacy = legacy_mode(_qp);
if (legacy) {
if (!_superuser_created_promise.available()) {
// Counterintuitively, we mark promise as ready before any startup work
// because wait_for_schema_agreement() below will block indefinitely
// without cluster majority. In that case, blocking node startup
// would lead to a cluster deadlock.
_superuser_created_promise.set_value();
}
co_await _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as);
if (co_await legacy::any_nondefault_role_row_satisfies(_qp, &has_can_login)) {
if (co_await any_nondefault_role_row_satisfies(_qp, &has_can_login)) {
if (legacy_metadata_exists()) {
log.warn("Ignoring legacy user metadata since nondefault roles already exist.");
}
@@ -355,12 +345,12 @@ future<> standard_role_manager::ensure_superuser_is_created() {
return _superuser_created_promise.get_shared_future();
}
future<> standard_role_manager::create_or_replace(std::string_view auth_ks_name, std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
future<> standard_role_manager::create_or_replace(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
const sstring query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, ?, ?)",
auth_ks_name,
get_auth_ks_name(_qp),
meta::roles_table::name,
meta::roles_table::role_col_name);
if (auth_ks_name == meta::legacy::AUTH_KS) {
if (legacy_mode(_qp)) {
co_await _qp.execute_internal(
query,
consistency_for_role(role_name),
@@ -379,7 +369,7 @@ standard_role_manager::create(std::string_view role_name, const role_config& c,
throw role_already_exists(role_name);
}
return create_or_replace(get_auth_ks_name(_qp), role_name, c, mc);
return create_or_replace(role_name, c, mc);
});
}
@@ -663,30 +653,21 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
});
}
future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted() {
const sstring query = seastar::format("SELECT * FROM {}.{}",
get_auth_ks_name(_qp),
meta::role_members_table::name);
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::ONE,
qs,
cql3::query_processor::cache_internal::yes);
role_to_directly_granted_map roles_map;
std::transform(
results->begin(),
results->end(),
std::inserter(roles_map, roles_map.begin()),
[] (const cql3::untyped_result_set_row& row) {
return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
);
co_await _qp.query_internal(query, [&roles_map] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
roles_map.insert({row.get_as<sstring>("member"), row.get_as<sstring>("role")});
co_return stop_iteration::no;
});
co_return roles_map;
}
future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
future<role_set> standard_role_manager::query_all() {
const sstring query = seastar::format("SELECT {} FROM {}.{}",
meta::roles_table::role_col_name,
get_auth_ks_name(_qp),
@@ -704,7 +685,7 @@ future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::QUORUM,
qs,
internal_distributed_query_state(),
cql3::query_processor::cache_internal::yes);
role_set roles;
@@ -736,11 +717,11 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
});
}
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
get_auth_ks_name(_qp),
meta::role_attributes_table::name);
const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
const auto result_set = co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
if (!result_set->empty()) {
const cql3::untyped_result_set_row &row = result_set->one();
co_return std::optional<sstring>(row.get_as<sstring>("value"));
@@ -748,11 +729,11 @@ future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_
co_return std::optional<sstring>{};
}
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name) {
return query_all().then([this, attribute_name] (role_set roles) {
return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles)] (attribute_vals &role_to_att_val) {
return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name] (sstring role) {
return get_attribute(role, attribute_name).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
if (att_val) {
role_to_att_val.emplace(std::move(role), std::move(*att_val));
}
@@ -797,7 +778,7 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
future<std::vector<cql3::description>> standard_role_manager::describe_role_grants() {
std::vector<cql3::description> result{};
const auto grants = co_await query_all_directly_granted(internal_distributed_query_state());
const auto grants = co_await query_all_directly_granted();
result.reserve(grants.size());
for (const auto& [grantee_role, granted_role] : grants) {

View File

@@ -66,9 +66,9 @@ public:
virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;
virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;
virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
virtual future<role_set> query_all(::service::query_state&) override;
virtual future<role_set> query_all() override;
virtual future<bool> exists(std::string_view role_name) override;
@@ -76,9 +76,9 @@ public:
virtual future<bool> can_login(std::string_view role_name) override;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;
virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;
virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;
@@ -100,7 +100,7 @@ private:
future<> maybe_create_default_role();
future<> maybe_create_default_role_with_retries();
future<> create_or_replace(std::string_view auth_ks_name, std::string_view role_name, const role_config&, ::service::group0_batch&);
future<> create_or_replace(std::string_view role_name, const role_config&, ::service::group0_batch&);
future<> legacy_modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change);

View File

@@ -13,7 +13,6 @@
#include <seastar/core/sleep.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/util/later.hh>
#include "gms/endpoint_state.hh"
#include "gms/versioned_value.hh"
@@ -31,7 +30,6 @@
#include "utils/assert.hh"
#include "utils/error_injection.hh"
#include "utils/UUID_gen.hh"
#include "utils/stall_free.hh"
#include "utils/to_string.hh"
#include "cdc/generation.hh"
@@ -68,15 +66,6 @@ api::timestamp_clock::duration get_generation_leeway() {
return generation_leeway;
}
stream_state read_stream_state(int8_t val) {
if (val != std::to_underlying(stream_state::current)
&& val != std::to_underlying(stream_state::closed)
&& val != std::to_underlying(stream_state::opened)) {
throw std::runtime_error(format("invalid value {} for stream state", val));
}
return static_cast<stream_state>(val);
}
static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
i = net::hton(i);
std::copy_n(reinterpret_cast<int8_t*>(&i), sizeof(int64_t), b.begin() + offset);
@@ -169,10 +158,6 @@ const bytes& stream_id::to_bytes() const {
return _value;
}
bytes stream_id::to_bytes() && {
return std::move(_value);
}
partition_key stream_id::to_partition_key(const schema& log_schema) const {
return partition_key::from_single_value(log_schema, _value);
}
@@ -197,19 +182,6 @@ utils::chunked_vector<token_range_description>&& topology_description::entries()
return std::move(_entries);
}
future<topology_description> topology_description::clone_async() const {
utils::chunked_vector<token_range_description> vec{};
co_await utils::reserve_gently(vec, _entries.size());
for (const auto& entry : _entries) {
vec.push_back(entry);
co_await seastar::maybe_yield();
}
co_return topology_description{std::move(vec)};
}
static std::vector<stream_id> create_stream_ids(
size_t index, dht::token start, dht::token end, size_t shard_count, uint8_t ignore_msb) {
std::vector<stream_id> result;
@@ -1012,11 +984,8 @@ future<> generation_service::handle_cdc_generation(cdc::generation_id_v2 gen_id)
auto gen_data = co_await _sys_ks.local().read_cdc_generation(gen_id.id);
bool using_this_gen = co_await container().map_reduce(or_reducer(), [ts, &gen_data] (generation_service& svc) -> future<bool> {
// We need to copy it here before awaiting anything to avoid destruction of the captures.
const auto timestamp = ts;
topology_description gen_copy = co_await gen_data.clone_async();
co_return svc._cdc_metadata.insert(timestamp, std::move(gen_copy));
bool using_this_gen = co_await container().map_reduce(or_reducer(), [ts, &gen_data] (generation_service& svc) {
return svc._cdc_metadata.insert(ts, cdc::topology_description{gen_data});
});
if (using_this_gen) {
@@ -1168,12 +1137,9 @@ future<bool> generation_service::legacy_do_handle_cdc_generation(cdc::generation
}
// Return `true` iff the generation was inserted on any of our shards.
co_return co_await container().map_reduce(or_reducer(),
[ts = get_ts(gen_id), &gen] (generation_service& svc) -> future<bool> {
// We need to copy it here before awaiting anything to avoid destruction of the captures.
const auto timestamp = ts;
topology_description gen_copy = co_await gen->clone_async();
co_return svc._cdc_metadata.insert(timestamp, std::move(gen_copy));
co_return co_await container().map_reduce(or_reducer(), [ts = get_ts(gen_id), &gen] (generation_service& svc) {
auto gen_ = *gen;
return svc._cdc_metadata.insert(ts, std::move(gen_));
});
}
@@ -1191,347 +1157,4 @@ db_clock::time_point get_ts(const generation_id& gen_id) {
return std::visit([] (auto& id) { return id.ts; }, gen_id);
}
future<mutation> create_table_streams_mutation(table_id table, db_clock::time_point stream_ts, const locator::tablet_map& map, api::timestamp_type ts) {
auto s = db::system_keyspace::cdc_streams_state();
mutation m(s, partition_key::from_single_value(*s,
data_value(table.uuid()).serialize_nonnull()
));
m.set_static_cell("timestamp", stream_ts, ts);
for (auto tid : map.tablet_ids()) {
auto sid = cdc::stream_id(map.get_last_token(tid), 0);
auto ck = clustering_key::from_singular(*s, dht::token::to_int64(sid.token()));
m.set_cell(ck, "stream_id", data_value(std::move(sid).to_bytes()), ts);
co_await coroutine::maybe_yield();
}
co_return std::move(m);
}
future<mutation> create_table_streams_mutation(table_id table, db_clock::time_point stream_ts, const std::vector<cdc::stream_id>& stream_ids, api::timestamp_type ts) {
auto s = db::system_keyspace::cdc_streams_state();
mutation m(s, partition_key::from_single_value(*s,
data_value(table.uuid()).serialize_nonnull()
));
m.set_static_cell("timestamp", stream_ts, ts);
for (const auto& sid : stream_ids) {
auto ck = clustering_key::from_singular(*s, dht::token::to_int64(sid.token()));
m.set_cell(ck, "stream_id", data_value(sid.to_bytes()), ts);
co_await coroutine::maybe_yield();
}
co_return std::move(m);
}
utils::chunked_vector<mutation>
make_drop_table_streams_mutations(table_id table, api::timestamp_type ts) {
utils::chunked_vector<mutation> mutations;
mutations.reserve(2);
for (auto s : {db::system_keyspace::cdc_streams_state(),
db::system_keyspace::cdc_streams_history()}) {
mutation m(s, partition_key::from_single_value(*s,
data_value(table.uuid()).serialize_nonnull()
));
m.partition().apply(tombstone(ts, gc_clock::now()));
mutations.emplace_back(std::move(m));
}
return mutations;
}
future<> generation_service::load_cdc_tablet_streams(std::optional<std::unordered_set<table_id>> changed_tables) {
// track which tables we expect to get data for, and which we actually get.
// when a table is dropped we won't get any streams for it. we will use this to
// know which tables are dropped and remove their stream map from the metadata.
std::unordered_set<table_id> tables_to_process;
if (changed_tables) {
tables_to_process = *changed_tables;
} else {
tables_to_process = _cdc_metadata.get_tables_with_cdc_tablet_streams() | std::ranges::to<std::unordered_set<table_id>>();
}
auto read_streams_state = [this] (const std::optional<std::unordered_set<table_id>>& tables, noncopyable_function<future<>(table_id, db_clock::time_point, std::vector<cdc::stream_id>)> f) -> future<> {
if (tables) {
for (auto table : *tables) {
co_await _sys_ks.local().read_cdc_streams_state(table, [&] (table_id table, db_clock::time_point base_ts, std::vector<cdc::stream_id> base_stream_set) -> future<> {
return f(table, base_ts, std::move(base_stream_set));
});
}
} else {
co_await _sys_ks.local().read_cdc_streams_state(std::nullopt, [&] (table_id table, db_clock::time_point base_ts, std::vector<cdc::stream_id> base_stream_set) -> future<> {
return f(table, base_ts, std::move(base_stream_set));
});
}
};
co_await read_streams_state(changed_tables, [this, &tables_to_process] (table_id table, db_clock::time_point base_ts, std::vector<cdc::stream_id> base_stream_set) -> future<> {
table_streams new_table_map;
auto append_stream = [&new_table_map] (db_clock::time_point stream_tp, std::vector<cdc::stream_id> stream_set) {
auto ts = std::chrono::duration_cast<api::timestamp_clock::duration>(stream_tp.time_since_epoch()).count();
new_table_map[ts] = committed_stream_set {stream_tp, std::move(stream_set)};
};
// if we already have a loaded streams map, and the base timestamp is unchanged, then read
// the history entries starting from the latest one we have and append it to the existing map.
// we can do it because we only append new rows with higher timestamps to the history table.
std::optional<std::reference_wrapper<const committed_stream_set>> from_streams;
std::optional<db_clock::time_point> from_ts;
const auto& all_streams = _cdc_metadata.get_all_tablet_streams();
if (auto it = all_streams.find(table); it != all_streams.end()) {
const auto& current_map = *it->second;
if (current_map.cbegin()->second.ts == base_ts) {
const auto& latest_entry = current_map.crbegin()->second;
from_streams = std::cref(latest_entry);
from_ts = latest_entry.ts;
}
}
if (!from_ts) {
append_stream(base_ts, std::move(base_stream_set));
}
co_await _sys_ks.local().read_cdc_streams_history(table, from_ts, [&] (table_id tid, db_clock::time_point ts, cdc_stream_diff diff) -> future<> {
const auto& prev_stream_set = new_table_map.empty() ?
from_streams->get().streams : std::crbegin(new_table_map)->second.streams;
append_stream(ts, co_await cdc::metadata::construct_next_stream_set(
prev_stream_set, std::move(diff.opened_streams), diff.closed_streams));
});
co_await container().invoke_on_all(coroutine::lambda([&] (generation_service& svc) -> future<> {
table_streams new_table_map_copy;
for (const auto& [ts, entry] : new_table_map) {
new_table_map_copy[ts] = entry;
co_await coroutine::maybe_yield();
}
if (!from_ts) {
svc._cdc_metadata.load_tablet_streams_map(table, std::move(new_table_map_copy));
} else {
svc._cdc_metadata.append_tablet_streams_map(table, std::move(new_table_map_copy));
}
}));
tables_to_process.erase(table);
});
// the remaining tables have no streams - remove them from the metadata
co_await container().invoke_on_all([&] (generation_service& svc) {
for (auto table : tables_to_process) {
svc._cdc_metadata.remove_tablet_streams_map(table);
}
});
}
future<> generation_service::query_cdc_timestamps(table_id table, bool ascending, noncopyable_function<future<>(db_clock::time_point)> f) {
const auto& all_tables = _cdc_metadata.get_all_tablet_streams();
auto table_it = all_tables.find(table);
if (table_it == all_tables.end()) {
co_return;
}
const auto table_streams_ptr = table_it->second; // keep alive
const auto& table_streams = *table_streams_ptr;
if (ascending) {
for (auto it = table_streams.cbegin(); it != table_streams.cend(); ++it) {
co_await f(it->second.ts);
}
} else {
for (auto it = table_streams.crbegin(); it != table_streams.crend(); ++it) {
co_await f(it->second.ts);
}
}
}
future<> generation_service::query_cdc_streams(table_id table, noncopyable_function<future<>(db_clock::time_point, const std::vector<cdc::stream_id>& current, cdc::cdc_stream_diff)> f) {
const auto& all_tables = _cdc_metadata.get_all_tablet_streams();
auto table_it = all_tables.find(table);
if (table_it == all_tables.end()) {
co_return;
}
const auto table_streams_ptr = table_it->second; // keep alive
const auto& table_streams = *table_streams_ptr;
auto it_prev = table_streams.end();
auto it = table_streams.begin();
while (it != table_streams.end()) {
const auto& entry = it->second;
if (it_prev != table_streams.end()) {
const auto& prev_entry = it_prev->second;
auto diff = co_await cdc::metadata::generate_stream_diff(prev_entry.streams, entry.streams);
co_await f(entry.ts, entry.streams, std::move(diff));
} else {
co_await f(entry.ts, entry.streams, cdc::cdc_stream_diff{.closed_streams = {}, .opened_streams = entry.streams});
}
it_prev = it;
++it;
}
}
future<mutation> get_switch_streams_mutation(table_id table, db_clock::time_point stream_ts, cdc_stream_diff diff, api::timestamp_type ts) {
auto history_schema = db::system_keyspace::cdc_streams_history();
auto decomposed_ts = timestamp_type->decompose(stream_ts);
auto closed_kind = byte_type->decompose(std::to_underlying(stream_state::closed));
auto opened_kind = byte_type->decompose(std::to_underlying(stream_state::opened));
mutation m(history_schema, partition_key::from_single_value(*history_schema,
data_value(table.uuid()).serialize_nonnull()
));
for (auto&& sid : diff.closed_streams) {
co_await coroutine::maybe_yield();
auto ck = clustering_key::from_exploded(*history_schema, { decomposed_ts, closed_kind, long_type->decompose(dht::token::to_int64(sid.token())) });
m.set_cell(ck, "stream_id", data_value(std::move(sid).to_bytes()), ts);
}
for (auto&& sid : diff.opened_streams) {
co_await coroutine::maybe_yield();
auto ck = clustering_key::from_exploded(*history_schema, { decomposed_ts, opened_kind, long_type->decompose(dht::token::to_int64(sid.token())) });
m.set_cell(ck, "stream_id", data_value(std::move(sid).to_bytes()), ts);
}
co_return std::move(m);
}
future<> generation_service::generate_tablet_resize_update(utils::chunked_vector<canonical_mutation>& muts, table_id table, const locator::tablet_map& new_tablet_map, api::timestamp_type ts) {
if (!_cdc_metadata.get_all_tablet_streams().contains(table)) {
// not a CDC table
co_return;
}
std::vector<cdc::stream_id> new_streams;
new_streams.reserve(new_tablet_map.tablet_count());
for (auto tid : new_tablet_map.tablet_ids()) {
new_streams.emplace_back(new_tablet_map.get_last_token(tid), 0);
co_await coroutine::maybe_yield();
}
const auto& table_streams = *_cdc_metadata.get_all_tablet_streams().at(table);
auto current_streams_it = std::crbegin(table_streams);
if (current_streams_it == std::crend(table_streams)) {
// no streams at all - this should not happen
on_internal_error(cdc_log, format("generate_tablet_resize_update: no streams for table {}", table));
}
const auto& current_streams = current_streams_it->second;
auto new_ts = new_generation_timestamp(true, _cfg.ring_delay);
new_ts = std::max(new_ts, current_streams.ts + std::chrono::milliseconds(1)); // ensure timestamps are increasing
auto diff = co_await _cdc_metadata.generate_stream_diff(current_streams.streams, new_streams);
auto mut = co_await get_switch_streams_mutation(table, new_ts, std::move(diff), ts);
muts.emplace_back(std::move(mut));
}
future<utils::chunked_vector<mutation>> get_cdc_stream_gc_mutations(table_id table, db_clock::time_point base_ts, const std::vector<cdc::stream_id>& base_stream_set, api::timestamp_type ts) {
utils::chunked_vector<mutation> muts;
muts.reserve(2);
auto gc_now = gc_clock::now();
auto tombstone_ts = ts - 1;
{
// write the new base stream set to cdc_streams_state
auto s = db::system_keyspace::cdc_streams_state();
mutation m(s, partition_key::from_single_value(*s,
data_value(table.uuid()).serialize_nonnull()
));
m.partition().apply(tombstone(tombstone_ts, gc_now));
m.set_static_cell("timestamp", data_value(base_ts), ts);
for (const auto& sid : base_stream_set) {
co_await coroutine::maybe_yield();
auto ck = clustering_key::from_singular(*s, dht::token::to_int64(sid.token()));
m.set_cell(ck, "stream_id", data_value(sid.to_bytes()), ts);
}
muts.emplace_back(std::move(m));
}
{
// remove all entries from cdc_streams_history up to the new base
auto s = db::system_keyspace::cdc_streams_history();
mutation m(s, partition_key::from_single_value(*s,
data_value(table.uuid()).serialize_nonnull()
));
auto range = query::clustering_range::make_ending_with({
clustering_key_prefix::from_single_value(*s, timestamp_type->decompose(base_ts)), true});
auto bv = bound_view::from_range(range);
m.partition().apply_delete(*s, range_tombstone{bv.first, bv.second, tombstone{ts, gc_now}});
muts.emplace_back(std::move(m));
}
co_return std::move(muts);
}
table_streams::const_iterator get_new_base_for_gc(const table_streams& streams_map, std::chrono::seconds ttl) {
// find the most recent timestamp that is older than ttl_seconds, which will become the new base.
// all streams with older timestamps can be removed because they are closed for more than ttl_seconds
// (they are all replaced by streams with the newer timestamp).
auto ts_upper_bound = db_clock::now() - ttl;
auto it = streams_map.begin();
while (it != streams_map.end()) {
auto next_it = std::next(it);
if (next_it == streams_map.end()) {
break;
}
auto next_tp = next_it->second.ts;
if (next_tp <= ts_upper_bound) {
// the next timestamp is older than ttl_seconds, so the current one is obsolete
it = next_it;
} else {
break;
}
}
return it;
}
future<utils::chunked_vector<mutation>> generation_service::garbage_collect_cdc_streams_for_table(table_id table, std::optional<std::chrono::seconds> ttl, api::timestamp_type ts) {
const auto& table_streams = *_cdc_metadata.get_all_tablet_streams().at(table);
// if TTL is not provided by the caller then use the table's CDC TTL
auto base_schema = cdc::get_base_table(_db, *_db.find_schema(table));
ttl = ttl.or_else([&] -> std::optional<std::chrono::seconds> {
auto ttl_seconds = base_schema->cdc_options().ttl();
if (ttl_seconds > 0) {
return std::chrono::seconds(ttl_seconds);
} else {
// ttl=0 means no ttl
return std::nullopt;
}
});
if (!ttl) {
co_return utils::chunked_vector<mutation>{};
}
auto new_base_it = get_new_base_for_gc(table_streams, *ttl);
if (new_base_it == table_streams.begin() || new_base_it == table_streams.end()) {
// nothing to gc
co_return utils::chunked_vector<mutation>{};
}
for (auto it = table_streams.begin(); it != new_base_it; ++it) {
cdc_log.info("Garbage collecting CDC stream metadata for table {}: removing generation {} because it is older than the CDC TTL of {} seconds",
table, it->second.ts, *ttl);
}
co_return co_await get_cdc_stream_gc_mutations(table, new_base_it->second.ts, new_base_it->second.streams, ts);
}
// Run CDC stream-metadata garbage collection across every table that
// currently has tablet-based CDC streams, appending the resulting
// mutations to `muts`.
//
// Parameters:
//   muts - output vector the per-table GC mutations are appended to.
//   ts   - write timestamp forwarded to the per-table GC routine.
//
// Each table is collected with its own configured TTL (std::nullopt is
// passed, so garbage_collect_cdc_streams_for_table derives the TTL from
// the table's CDC options).
future<> generation_service::garbage_collect_cdc_streams(utils::chunked_vector<canonical_mutation>& muts, api::timestamp_type ts) {
    for (auto tid : _cdc_metadata.get_tables_with_cdc_tablet_streams()) {
        // Yield between tables to keep the reactor responsive.
        co_await coroutine::maybe_yield();
        for (auto&& gc_mut : co_await garbage_collect_cdc_streams_for_table(tid, std::nullopt, ts)) {
            muts.emplace_back(std::move(gc_mut));
        }
    }
}
} // namespace cdc

View File

@@ -44,10 +44,6 @@ namespace gms {
class gossiper;
} // namespace gms
namespace locator {
class tablet_map;
} // namespace locator
namespace cdc {
api::timestamp_clock::duration get_generation_leeway();
@@ -67,7 +63,6 @@ public:
uint8_t version() const;
size_t index() const;
const bytes& to_bytes() const;
bytes to_bytes() &&;
dht::token token() const;
partition_key to_partition_key(const schema& log_schema) const;
@@ -102,20 +97,10 @@ class topology_description {
utils::chunked_vector<token_range_description> _entries;
public:
topology_description(utils::chunked_vector<token_range_description> entries);
topology_description(const topology_description&) = delete;
topology_description& operator=(const topology_description&) = delete;
topology_description(topology_description&&) = default;
topology_description& operator=(topology_description&&) = default;
bool operator==(const topology_description&) const;
const utils::chunked_vector<token_range_description>& entries() const&;
utils::chunked_vector<token_range_description>&& entries() &&;
/// The object MUST NOT be modified until the returned future resolves.
future<topology_description> clone_async() const;
};
/**
@@ -133,26 +118,6 @@ public:
{}
};
enum class stream_state : int8_t {
current = 0,
closed = 1,
opened = 2,
};
stream_state read_stream_state(int8_t val);
struct committed_stream_set {
db_clock::time_point ts;
std::vector<cdc::stream_id> streams;
};
struct cdc_stream_diff {
std::vector<stream_id> closed_streams;
std::vector<stream_id> opened_streams;
};
using table_streams = std::map<api::timestamp_type, committed_stream_set>;
class no_generation_data_exception : public std::runtime_error {
public:
no_generation_data_exception(cdc::generation_id generation_ts)
@@ -219,12 +184,4 @@ future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
schema_ptr, utils::UUID gen_uuid, const cdc::topology_description&,
size_t mutation_size_threshold, api::timestamp_type mutation_timestamp);
future<mutation> create_table_streams_mutation(table_id, db_clock::time_point, const locator::tablet_map&, api::timestamp_type);
future<mutation> create_table_streams_mutation(table_id, db_clock::time_point, const std::vector<cdc::stream_id>&, api::timestamp_type);
utils::chunked_vector<mutation> make_drop_table_streams_mutations(table_id, api::timestamp_type ts);
future<mutation> get_switch_streams_mutation(table_id table, db_clock::time_point stream_ts, cdc_stream_diff diff, api::timestamp_type ts);
future<utils::chunked_vector<mutation>> get_cdc_stream_gc_mutations(table_id table, db_clock::time_point base_ts, const std::vector<cdc::stream_id>& base_stream_set, api::timestamp_type ts);
table_streams::const_iterator get_new_base_for_gc(const table_streams&, std::chrono::seconds ttl);
} // namespace cdc

View File

@@ -29,7 +29,6 @@ class abort_source;
namespace locator {
class shared_token_metadata;
class tablet_map;
}
namespace cdc {
@@ -146,16 +145,6 @@ public:
*/
future<> handle_cdc_generation(cdc::generation_id_v2);
future<> load_cdc_tablet_streams(std::optional<std::unordered_set<table_id>> changed_tables);
future<> query_cdc_timestamps(table_id table, bool ascending, noncopyable_function<future<>(db_clock::time_point)> f);
future<> query_cdc_streams(table_id table, noncopyable_function<future<>(db_clock::time_point, const std::vector<cdc::stream_id>& current, cdc::cdc_stream_diff)> f);
future<> generate_tablet_resize_update(utils::chunked_vector<canonical_mutation>& muts, table_id table, const locator::tablet_map& new_tablet_map, api::timestamp_type ts);
future<utils::chunked_vector<mutation>> garbage_collect_cdc_streams_for_table(table_id table, std::optional<std::chrono::seconds> ttl, api::timestamp_type ts);
future<> garbage_collect_cdc_streams(utils::chunked_vector<canonical_mutation>& muts, api::timestamp_type ts);
private:
/* Retrieve the CDC generation which starts at the given timestamp (from a distributed table created for this purpose)
* and start using it for CDC log writes if it's not obsolete.

View File

@@ -21,17 +21,12 @@
#include "cdc/metadata.hh"
#include "cdc/cdc_partitioner.hh"
#include "bytes.hh"
#include "index/vector_index.hh"
#include "locator/abstract_replication_strategy.hh"
#include "locator/topology.hh"
#include "replica/database.hh"
#include "db/schema_tables.hh"
#include "gms/feature_service.hh"
#include "schema/schema.hh"
#include "schema/schema_builder.hh"
#include "service/migration_listener.hh"
#include "service/storage_proxy.hh"
#include "tombstone_gc_extension.hh"
#include "types/tuple.hh"
#include "cql3/statements/select_statement.hh"
#include "cql3/untyped_result_set.hh"
@@ -41,7 +36,7 @@
#include "utils/UUID_gen.hh"
#include "utils/managed_bytes.hh"
#include "types/types.hh"
#include "types/concrete_types.hh"
#include "concrete_types.hh"
#include "types/listlike_partial_deserializing_iterator.hh"
#include "tracing/trace_state.hh"
#include "stats.hh"
@@ -61,18 +56,8 @@ using namespace std::chrono_literals;
logging::logger cdc_log("cdc");
namespace {
shared_ptr<locator::abstract_replication_strategy> generate_replication_strategy(const keyspace_metadata& ksm, const locator::topology& topo) {
locator::replication_strategy_params params(ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option());
return locator::abstract_replication_strategy::create_replication_strategy(ksm.strategy_name(), params, topo);
}
} // anonymous namespace
namespace cdc {
static schema_ptr create_log_schema(const schema&, const replica::database&, const keyspace_metadata&,
std::optional<table_id> = {}, schema_ptr = nullptr);
static schema_ptr create_log_schema(const schema&, std::optional<table_id> = {}, schema_ptr = nullptr);
}
static constexpr auto cdc_group_name = "cdc";
@@ -173,52 +158,26 @@ public:
});
}
virtual void on_before_allocate_tablet_map(const locator::tablet_map& map, const schema& s, utils::chunked_vector<mutation>& muts, api::timestamp_type ts) override {
if (!is_log_schema(s)) {
return;
}
auto stream_ts = db_clock::now() - duration_cast<std::chrono::milliseconds>(get_generation_leeway());
auto mut = create_table_streams_mutation(s.id(), stream_ts, map, ts).get();
muts.emplace_back(std::move(mut));
}
void on_pre_create_column_families(const keyspace_metadata& ksm, std::vector<schema_ptr>& cfms) override {
std::vector<schema_ptr> new_cfms;
for (auto sp : cfms) {
const auto& schema = *sp;
if (!schema.cdc_options().enabled()) {
continue;
}
void on_before_create_column_family(const keyspace_metadata& ksm, const schema& schema, utils::chunked_vector<mutation>& mutations, api::timestamp_type timestamp) override {
if (schema.cdc_options().enabled()) {
auto& db = _ctxt._proxy.get_db().local();
auto logname = log_name(schema.cf_name());
check_that_cdc_log_table_does_not_exist(db, schema, logname);
ensure_that_table_has_no_counter_columns(schema);
if (!db.features().cdc_with_tablets) {
ensure_that_table_uses_vnodes(ksm, schema, db.get_token_metadata().get_topology());
}
ensure_that_table_uses_vnodes(ksm, schema);
// in seastar thread
auto log_schema = create_log_schema(schema, db, ksm);
new_cfms.push_back(std::move(log_schema));
}
auto log_schema = create_log_schema(schema);
cfms.insert(cfms.end(), std::make_move_iterator(new_cfms.begin()), std::make_move_iterator(new_cfms.end()));
auto log_mut = db::schema_tables::make_create_table_mutations(log_schema, timestamp);
mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
}
}
void on_before_update_column_family(const schema& new_schema, const schema& old_schema, utils::chunked_vector<mutation>& mutations, api::timestamp_type timestamp) override {
bool has_vector_index = secondary_index::vector_index::has_vector_index(new_schema);
if (has_vector_index) {
// If we have a vector index, we need to ensure that the CDC log is created
// satisfying the minimal requirements of Vector Search.
secondary_index::vector_index::check_cdc_options(new_schema);
}
bool is_cdc = cdc_enabled(new_schema);
bool was_cdc = cdc_enabled(old_schema);
bool is_cdc = new_schema.cdc_options().enabled();
bool was_cdc = old_schema.cdc_options().enabled();
// if we are turning off cdc we can skip this, since even if columns change etc,
// any writer should see cdc -> off together with any actual schema changes to
@@ -236,7 +195,7 @@ public:
// make sure the apparent log table really is a cdc log (not user table)
// we just check the partitioner - since user tables should _not_ be able
// set/use this.
if (!is_log_schema(*log_schema)) {
if (log_schema->get_partitioner().name() != cdc::cdc_partitioner::classname) {
// will throw
check_that_cdc_log_table_does_not_exist(db, old_schema, logname);
}
@@ -244,23 +203,16 @@ public:
check_for_attempt_to_create_nested_cdc_log(db, new_schema);
ensure_that_table_has_no_counter_columns(new_schema);
if (!db.features().cdc_with_tablets) {
ensure_that_table_uses_vnodes(*keyspace.metadata(), new_schema, db.get_token_metadata().get_topology());
}
ensure_that_table_uses_vnodes(*keyspace.metadata(), new_schema);
std::optional<table_id> maybe_id = log_schema ? std::make_optional(log_schema->id()) : std::nullopt;
auto new_log_schema = create_log_schema(new_schema, db, *keyspace.metadata(), std::move(maybe_id), log_schema);
auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);
auto log_mut = log_schema
? db::schema_tables::make_update_table_mutations(_ctxt._proxy, keyspace.metadata(), log_schema, new_log_schema, timestamp)
? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp)
: db::schema_tables::make_create_table_mutations(new_log_schema, timestamp)
;
mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
if (!log_schema) {
db.get_notifier().before_create_column_family(*keyspace.metadata(), *new_log_schema, mutations, timestamp);
}
}
}
@@ -270,41 +222,12 @@ public:
auto has_cdc_log = db.has_schema(schema.ks_name(), logname);
if (has_cdc_log) {
auto log_schema = db.find_schema(schema.ks_name(), logname);
if (is_log_schema(*log_schema)) {
auto& keyspace = db.find_keyspace(schema.ks_name());
auto log_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), log_schema, timestamp);
mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
db.get_notifier().before_drop_column_family(*log_schema, mutations, timestamp);
if (log_schema->get_partitioner().name() != cdc::cdc_partitioner::classname) {
return;
}
}
if (is_log_schema(schema)) {
auto& keyspace = db.find_keyspace(schema.ks_name());
if (keyspace.uses_tablets()) {
// drop cdc streams of this table
auto drop_stream_mut = make_drop_table_streams_mutations(schema.id(), timestamp);
mutations.insert(mutations.end(), std::make_move_iterator(drop_stream_mut.begin()), std::make_move_iterator(drop_stream_mut.end()));
}
}
}
void on_before_drop_keyspace(const sstring& keyspace_name, utils::chunked_vector<mutation>& mutations, api::timestamp_type ts) override {
auto& db = _ctxt._proxy.get_db().local();
auto& ks = db.find_keyspace(keyspace_name);
if (ks.uses_tablets()) {
// drop cdc streams for all CDC tables in this keyspace
for (auto&& [name, s] : ks.metadata()->cf_meta_data()) {
seastar::thread::maybe_yield();
if (!is_log_schema(*s)) {
continue;
}
auto drop_stream_mut = make_drop_table_streams_mutations(s->id(), ts);
mutations.insert(mutations.end(), std::make_move_iterator(drop_stream_mut.begin()), std::make_move_iterator(drop_stream_mut.end()));
}
auto log_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), log_schema, timestamp);
mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
}
}
@@ -312,8 +235,7 @@ public:
lowres_clock::time_point timeout,
utils::chunked_vector<mutation>&& mutations,
tracing::trace_state_ptr tr_state,
db::consistency_level write_cl,
per_request_options options
db::consistency_level write_cl
);
template<typename Iter>
@@ -347,10 +269,11 @@ private:
// Until we support CDC with tablets (issue #16317), we can't allow this
// to be attempted - in particular the log table we try to create will not
// have tablets, and will cause a failure.
static void ensure_that_table_uses_vnodes(const keyspace_metadata& ksm, const schema& schema, const locator::topology& topo) {
auto rs = generate_replication_strategy(ksm, topo);
static void ensure_that_table_uses_vnodes(const keyspace_metadata& ksm, const schema& schema) {
locator::replication_strategy_params params(ksm.strategy_options(), ksm.initial_tablets());
auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm.strategy_name(), params);
if (rs->uses_tablets()) {
throw exceptions::invalid_request_exception(format("Cannot create CDC log for a table {}.{}, because the keyspace uses tablets, and not all nodes support the CDC with tablets feature.",
throw exceptions::invalid_request_exception(format("Cannot create CDC log for a table {}.{}, because keyspace uses tablets. See issue #16317.",
schema.ks_name(), schema.cf_name()));
}
}
@@ -496,18 +419,10 @@ static const sstring cdc_meta_column_prefix = "cdc$";
static const sstring cdc_deleted_column_prefix = cdc_meta_column_prefix + "deleted_";
static const sstring cdc_deleted_elements_column_prefix = cdc_meta_column_prefix + "deleted_elements_";
bool cdc_enabled(const schema& s) {
return s.cdc_options().enabled() || secondary_index::vector_index::has_vector_index(s);
}
bool is_log_name(const std::string_view& table_name) {
return table_name.ends_with(cdc_log_suffix);
}
bool is_log_schema(const schema& s) {
return s.get_partitioner().name() == cdc::cdc_partitioner::classname;
}
bool is_cdc_metacolumn_name(const sstring& name) {
return name.compare(0, cdc_meta_column_prefix.size(), cdc_meta_column_prefix) == 0;
}
@@ -517,7 +432,7 @@ bool is_log_for_some_table(const replica::database& db, const sstring& ks_name,
if (!base_schema) {
return false;
}
return cdc_enabled(*base_schema);
return base_schema->cdc_options().enabled();
}
schema_ptr get_base_table(const replica::database& db, const schema& s) {
@@ -581,12 +496,10 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
}
static schema_ptr create_log_schema(const schema& s, const replica::database& db,
const keyspace_metadata& ksm, std::optional<table_id> uuid, schema_ptr old)
{
static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uuid, schema_ptr old) {
schema_builder b(s.ks_name(), log_name(s.cf_name()));
b.with_partitioner(cdc::cdc_partitioner::classname);
b.set_compaction_strategy(compaction::compaction_strategy_type::time_window);
b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
b.set_comment(fmt::format("CDC log for {}.{}", s.ks_name(), s.cf_name()));
auto ttl_seconds = s.cdc_options().ttl();
if (ttl_seconds > 0) {
@@ -670,10 +583,6 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
b.set_uuid(*uuid);
}
auto rs = generate_replication_strategy(ksm, db.get_token_metadata().get_topology());
auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(*rs, db.get_token_metadata()));
b.add_extension(tombstone_gc_extension::NAME, std::move(tombstone_gc_ext));
/**
* #10473 - if we are redefining the log table, we need to ensure any dropped
* columns are registered in "dropped_columns" table, otherwise clients will not
@@ -1368,13 +1277,6 @@ struct process_row_visitor {
};
struct process_change_visitor {
const per_request_options& _request_options;
// The types of the operations used for row / partition deletes. Introduced
// to differentiate service operations (e.g. operation::service_row_delete
// vs operation::row_delete).
const operation _row_delete_op = operation::row_delete;
const operation _partition_delete_op = operation::partition_delete;
stats::part_type_set& _touched_parts;
log_mutation_builder& _builder;
@@ -1437,7 +1339,7 @@ struct process_change_visitor {
void clustered_row_delete(const clustering_key& ckey, const tombstone&) {
_touched_parts.set<stats::part_type::ROW_DELETE>();
auto log_ck = _builder.allocate_new_log_row(_row_delete_op);
auto log_ck = _builder.allocate_new_log_row(operation::row_delete);
_builder.set_clustering_columns(log_ck, ckey);
if (_enable_updating_state && get_row_state(_clustering_row_states, ckey)) {
@@ -1481,7 +1383,7 @@ struct process_change_visitor {
void partition_delete(const tombstone&) {
_touched_parts.set<stats::part_type::PARTITION_DELETE>();
auto log_ck = _builder.allocate_new_log_row(_partition_delete_op);
auto log_ck = _builder.allocate_new_log_row(operation::partition_delete);
if (_enable_updating_state) {
_clustering_row_states.clear();
}
@@ -1496,7 +1398,6 @@ private:
schema_ptr _schema;
dht::decorated_key _dk;
schema_ptr _log_schema;
const per_request_options& _options;
/**
* #6070, #6084
@@ -1575,8 +1476,6 @@ private:
row_states_map _clustering_row_states;
cell_map _static_row_state;
const bool _uses_tablets;
utils::chunked_vector<mutation> _result_mutations;
std::optional<log_mutation_builder> _builder;
@@ -1586,27 +1485,25 @@ private:
stats::part_type_set _touched_parts;
public:
transformer(db_context ctx, schema_ptr s, dht::decorated_key dk, const per_request_options& options)
transformer(db_context ctx, schema_ptr s, dht::decorated_key dk)
: _ctx(ctx)
, _schema(std::move(s))
, _dk(std::move(dk))
, _log_schema(ctx._proxy.get_db().local().find_schema(_schema->ks_name(), log_name(_schema->cf_name())))
, _options(options)
, _clustering_row_states(0, clustering_key::hashing(*_schema), clustering_key::equality(*_schema))
, _uses_tablets(ctx._proxy.get_db().local().find_keyspace(_schema->ks_name()).uses_tablets())
{
}
// DON'T move the transformer after this
void begin_timestamp(api::timestamp_type ts, bool is_last) override {
const auto stream_id = _uses_tablets ? _ctx._cdc_metadata.get_tablet_stream(_log_schema->id(), ts, _dk.token()) : _ctx._cdc_metadata.get_vnode_stream(ts, _dk.token());
const auto stream_id = _ctx._cdc_metadata.get_stream(ts, _dk.token());
_result_mutations.emplace_back(_log_schema, stream_id.to_partition_key(*_log_schema));
_builder.emplace(_result_mutations.back(), ts, _dk.key(), *_schema);
_enable_updating_state = _schema->cdc_options().postimage() || (!is_last && _schema->cdc_options().preimage());
}
void produce_preimage(const clustering_key* ck, const one_kind_column_set& columns_to_include) override {
// if we want full preimage, just ignore the affected columns and include everything.
// iff we want full preimage, just ignore the affected columns and include everything.
generate_image(operation::pre_image, ck, _schema->cdc_options().full_preimage() ? nullptr : &columns_to_include);
};
@@ -1692,9 +1589,6 @@ public:
void process_change(const mutation& m) override {
SCYLLA_ASSERT(_builder);
process_change_visitor v {
._request_options = _options,
._row_delete_op = _options.is_system_originated ? operation::service_row_delete : operation::row_delete,
._partition_delete_op = _options.is_system_originated ? operation::service_partition_delete : operation::partition_delete,
._touched_parts = _touched_parts,
._builder = *_builder,
._enable_updating_state = _enable_updating_state,
@@ -1726,8 +1620,7 @@ public:
const mutation& m)
{
auto& p = m.partition();
const bool no_ck_schema_partition_deletion = m.schema()->clustering_key_size() == 0 && bool(p.partition_tombstone());
if (p.clustered_rows().empty() && p.static_row().empty() && !no_ck_schema_partition_deletion) {
if (p.clustered_rows().empty() && p.static_row().empty()) {
return make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>();
}
@@ -1776,12 +1669,12 @@ public:
});
}
}
if (!p.clustered_rows().empty() || no_ck_schema_partition_deletion) {
if (!p.clustered_rows().empty()) {
const bool has_row_delete = std::any_of(p.clustered_rows().begin(), p.clustered_rows().end(), [] (const rows_entry& re) {
return re.row().deleted_at();
});
// for postimage we need everything...
if (has_row_delete || _schema->cdc_options().postimage() || _schema->cdc_options().full_preimage() || no_ck_schema_partition_deletion) {
if (has_row_delete || _schema->cdc_options().postimage() || _schema->cdc_options().full_preimage()) {
for (const column_definition& c: _schema->regular_columns()) {
regular_columns.emplace_back(c.id);
columns.emplace_back(&c);
@@ -1896,11 +1789,11 @@ transform_mutations(utils::chunked_vector<mutation>& muts, decltype(muts.size())
} // namespace cdc
future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl, per_request_options options) {
cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
// we do all this because in the case of batches, we can have mixed schemas.
auto e = mutations.end();
auto i = std::find_if(mutations.begin(), e, [](const mutation& m) {
return cdc_enabled(*m.schema());
return m.schema()->cdc_options().enabled();
});
if (i == e) {
@@ -1910,26 +1803,22 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
tracing::trace(tr_state, "CDC: Started generating mutations for log rows");
mutations.reserve(2 * mutations.size());
return do_with(std::move(mutations), service::query_state(service::client_state::for_internal_calls(), empty_service_permit()), operation_details{}, std::move(options),
[this, tr_state = std::move(tr_state), write_cl] (utils::chunked_vector<mutation>& mutations, service::query_state& qs, operation_details& details, per_request_options& options) {
return transform_mutations(mutations, 1, [this, &mutations, &qs, tr_state = tr_state, &details, write_cl, &options] (int idx) mutable {
return do_with(std::move(mutations), service::query_state(service::client_state::for_internal_calls(), empty_service_permit()), operation_details{},
[this, tr_state = std::move(tr_state), write_cl] (utils::chunked_vector<mutation>& mutations, service::query_state& qs, operation_details& details) {
return transform_mutations(mutations, 1, [this, &mutations, &qs, tr_state = tr_state, &details, write_cl] (int idx) mutable {
auto& m = mutations[idx];
auto s = m.schema();
if (!cdc_enabled(*s)) {
if (!s->cdc_options().enabled()) {
return make_ready_future<>();
}
transformer trans(_ctxt, s, m.decorated_key(), options);
transformer trans(_ctxt, s, m.decorated_key());
auto f = make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>(nullptr);
if (options.preimage && !options.preimage->empty()) {
// Preimage has been fetched by upper layers.
tracing::trace(tr_state, "CDC: Using a prefetched preimage");
f = make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>(options.preimage);
} else if (s->cdc_options().preimage() || s->cdc_options().postimage()) {
if (s->cdc_options().preimage() || s->cdc_options().postimage()) {
// Note: further improvement here would be to coalesce the pre-image selects into one
// if a batch contains several modifications to the same table. Otoh, batch is rare(?)
// iff a batch contains several modifications to the same table. Otoh, batch is rare(?)
// so this is premature.
tracing::trace(tr_state, "CDC: Selecting preimage for {}", m.decorated_key());
f = trans.pre_image_select(qs.get_client_state(), write_cl, m).then_wrapped([this] (future<lw_shared_ptr<cql3::untyped_result_set>> f) {
@@ -1985,16 +1874,16 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
bool cdc::cdc_service::needs_cdc_augmentation(const utils::chunked_vector<mutation>& mutations) const {
return std::any_of(mutations.begin(), mutations.end(), [](const mutation& m) {
return cdc_enabled(*m.schema());
return m.schema()->cdc_options().enabled();
});
}
future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl, per_request_options options) {
cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
if (utils::get_local_injector().enter("sleep_before_cdc_augmentation")) {
return seastar::sleep(std::chrono::milliseconds(100)).then([this, timeout, mutations = std::move(mutations), tr_state = std::move(tr_state), write_cl, options = std::move(options)] () mutable {
return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl, std::move(options));
return seastar::sleep(std::chrono::milliseconds(100)).then([this, timeout, mutations = std::move(mutations), tr_state = std::move(tr_state), write_cl] () mutable {
return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
});
}
return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl, std::move(options));
return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
}

View File

@@ -21,8 +21,7 @@
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/sstring.hh>
#include "cql3/untyped_result_set.hh"
#include "mutation/timestamp.hh"
#include "timestamp.hh"
#include "tracing/trace_state.hh"
#include "utils/UUID.hh"
@@ -52,35 +51,11 @@ class database;
namespace cdc {
// cdc log table operation
enum class operation : int8_t {
// note: these values will eventually be read by a third party, probably not privvy to this
// enum decl, so don't change the constant values (or the datatype).
pre_image = 0, update = 1, insert = 2, row_delete = 3, partition_delete = 4,
range_delete_start_inclusive = 5, range_delete_start_exclusive = 6, range_delete_end_inclusive = 7, range_delete_end_exclusive = 8,
post_image = 9,
// Operations initiated internally by Scylla. Currently used only by Alternator
service_row_delete = -3, service_partition_delete = -4,
};
struct per_request_options {
// The value of the base row before current operation, queried by higher
// layers than CDC. We assume that CDC could have seen the row in this
// state, i.e. the value isn't 'stale'/'too recent'.
lw_shared_ptr<cql3::untyped_result_set> preimage;
// Whether this mutation is a result of an internal operation initiated by
// Scylla. Currently, only TTL expiration implementation for Alternator
// uses this.
const bool is_system_originated = false;
};
struct operation_result_tracker;
class db_context;
class metadata;
bool is_log_name(const std::string_view& table_name);
bool is_log_schema(const schema& s);
/// \brief CDC service, responsible for schema listeners
///
@@ -104,9 +79,8 @@ public:
lowres_clock::time_point timeout,
utils::chunked_vector<mutation>&& mutations,
tracing::trace_state_ptr tr_state,
db::consistency_level write_cl,
per_request_options options = {}
);
db::consistency_level write_cl
);
bool needs_cdc_augmentation(const utils::chunked_vector<mutation>&) const;
};
@@ -118,13 +92,20 @@ struct db_context final {
: _proxy(proxy), _migration_notifier(notifier), _cdc_metadata(cdc_meta) {}
};
// cdc log table operation
enum class operation : int8_t {
// note: these values will eventually be read by a third party, probably not privvy to this
// enum decl, so don't change the constant values (or the datatype).
pre_image = 0, update = 1, insert = 2, row_delete = 3, partition_delete = 4,
range_delete_start_inclusive = 5, range_delete_start_exclusive = 6, range_delete_end_inclusive = 7, range_delete_end_exclusive = 8,
post_image = 9,
};
bool is_log_for_some_table(const replica::database& db, const sstring& ks_name, const std::string_view& table_name);
schema_ptr get_base_table(const replica::database&, const schema&);
schema_ptr get_base_table(const replica::database&, std::string_view, std::string_view);
bool cdc_enabled(const schema& s);
seastar::sstring base_name(std::string_view log_name);
seastar::sstring log_name(std::string_view table_name);
seastar::sstring log_data_column_name(std::string_view column_name);

View File

@@ -10,11 +10,8 @@
#include "exceptions/exceptions.hh"
#include "cdc/generation.hh"
#include "utils/stall_free.hh"
#include "cdc/metadata.hh"
#include <seastar/coroutine/maybe_yield.hh>
extern logging::logger cdc_log;
static api::timestamp_type to_ts(db_clock::time_point tp) {
@@ -53,21 +50,6 @@ cdc::stream_id get_stream(
return get_stream(*it, tok);
}
static cdc::stream_id get_stream(
const std::vector<cdc::stream_id>& streams,
dht::token tok) {
if (streams.empty()) {
on_internal_error(cdc_log, "get_stream: streams empty");
}
auto it = std::lower_bound(streams.begin(), streams.end(), tok,
[] (const cdc::stream_id& sid, dht::token t) { return sid.token() < t; });
if (it == streams.end()) {
on_internal_error(cdc_log, fmt::format("get_stream: no stream for token {}.", tok));
}
return *it;
}
cdc::metadata::container_t::const_iterator cdc::metadata::gen_used_at(api::timestamp_type ts) const {
auto it = _gens.upper_bound(ts);
if (it == _gens.begin()) {
@@ -84,7 +66,7 @@ bool cdc::metadata::streams_available() const {
return it != _gens.end();
}
cdc::stream_id cdc::metadata::get_vnode_stream(api::timestamp_type ts, dht::token tok) {
cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
auto now = api::new_timestamp();
if (ts > now + get_generation_leeway().count()) {
throw exceptions::invalid_request_exception(seastar::format(
@@ -159,43 +141,6 @@ cdc::stream_id cdc::metadata::get_vnode_stream(api::timestamp_type ts, dht::toke
return ret;
}
const std::vector<cdc::stream_id>& cdc::metadata::get_tablet_stream_set(table_id tid, api::timestamp_type ts) const {
auto now = api::new_timestamp();
if (ts > now + get_generation_leeway().count()) {
throw exceptions::invalid_request_exception(seastar::format(
"cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
" With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
" know what streams will be used at that time.\n"
"We *do* allow sending writes into the near future, but our ability to do that is limited."
" If you really must use your own timestamps, then make sure your clocks are well-synchronized"
" with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
}
auto table_it = _tablet_streams.find(tid);
if (table_it == _tablet_streams.end()) {
throw std::runtime_error(fmt::format(
"cdc::metadata::get_stream: could not find stream metadata for table {}.", tid));
}
// find the most recent entry of stream sets with timestamp <= write timestamp.
// start from the most recent entry, which is the most likely candidate, and go
// back until we find matching entry.
const auto& table_streams = *table_it->second;
auto it = table_streams.crbegin();
for (; it != table_streams.crend() && it->first > ts; ++it);
if (it == table_streams.crend()) {
throw std::runtime_error(fmt::format(
"cdc::metadata::get_stream: could not find any CDC streams for table {} for timestamp {}.", tid, ts));
}
return it->second.streams;
}
cdc::stream_id cdc::metadata::get_tablet_stream(table_id tid, api::timestamp_type ts, dht::token tok) {
return ::get_stream(get_tablet_stream_set(tid, ts), tok);
}
bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
auto ts = to_ts(tp);
auto it = _gens.lower_bound(ts);
@@ -258,102 +203,3 @@ bool cdc::metadata::prepare(db_clock::time_point tp) {
return !it->second;
}
// Build the stream set that follows `prev_stream_set` after a streams change:
// streams listed in `closed` are dropped and streams in `opened` are added.
// All three inputs are assumed sorted by token (and `closed` a subsequence of
// `prev_stream_set`); the returned set is sorted by token as well.
// Runs as a coroutine and yields periodically so large sets don't stall the
// reactor.
future<std::vector<cdc::stream_id>> cdc::metadata::construct_next_stream_set(
const std::vector<cdc::stream_id>& prev_stream_set,
std::vector<cdc::stream_id> opened,
const std::vector<cdc::stream_id>& closed) {
if (closed.size() == prev_stream_set.size()) {
// all previous streams are closed, so the next stream set is just the opened streams.
co_return std::move(opened);
}
// construct the next stream set from the previous one by adding the opened
// streams and removing the closed streams. we assume each stream set is
// sorted by token, and the result is sorted as well.
std::vector<cdc::stream_id> next_stream_set;
next_stream_set.reserve(prev_stream_set.size() + opened.size() - closed.size());
auto next_prev = prev_stream_set.begin();
auto next_closed = closed.begin();
auto next_opened = opened.begin();
while (next_prev != prev_stream_set.end() || next_opened != opened.end()) {
co_await coroutine::maybe_yield();
// Emit an opened stream when the previous set is exhausted, or the next
// opened stream sorts strictly before the next previous stream.
if (next_prev == prev_stream_set.end() || (next_opened != opened.end() && next_opened->token() < next_prev->token())) {
next_stream_set.push_back(std::move(*next_opened));
next_opened++;
continue;
}
// Skip previous streams that are being closed.
if (next_closed != closed.end() && *next_prev == *next_closed) {
next_prev++;
next_closed++;
continue;
}
// Otherwise the previous stream survives into the next set.
next_stream_set.push_back(*next_prev);
next_prev++;
}
co_return std::move(next_stream_set);
}
// Install `new_table_map` as the stream map of table `tid`, replacing any
// previously loaded map. The map is held through a shared pointer because it
// is shared with the virtual-tables reader.
void cdc::metadata::load_tablet_streams_map(table_id tid, table_streams new_table_map) {
_tablet_streams[tid] = make_lw_shared(std::move(new_table_map));
}
// Merge the entries of `new_table_map` into the existing stream map of table
// `tid`. If no map was loaded for `tid` yet, the given map becomes the table's
// initial stream map (previously this case dereferenced a default-constructed
// null lw_shared_ptr created by operator[], which is undefined behavior).
// Entries are moved, not copied, into the destination map.
void cdc::metadata::append_tablet_streams_map(table_id tid, table_streams new_table_map) {
    auto it = _tablet_streams.find(tid);
    if (it == _tablet_streams.end() || !it->second) {
        // No live entry for this table: behave like an initial load.
        _tablet_streams[tid] = make_lw_shared(std::move(new_table_map));
        return;
    }
    it->second->insert(std::make_move_iterator(new_table_map.begin()), std::make_move_iterator(new_table_map.end()));
}
// Drop the stream map of table `tid`; subsequent tablet-stream lookups for
// this table will fail. No-op if the table has no loaded map.
void cdc::metadata::remove_tablet_streams_map(table_id tid) {
_tablet_streams.erase(tid);
}
std::vector<table_id> cdc::metadata::get_tables_with_cdc_tablet_streams() const {
return _tablet_streams | std::views::keys | std::ranges::to<std::vector<table_id>>();
}
// Compute the difference between two token-sorted stream sets: streams present
// in `before` but not in `after` are reported as closed, streams present in
// `after` but not in `before` as opened. Two streams at the same token but
// with different ids count as one closed plus one opened stream. Runs as a
// coroutine and yields periodically while walking the two sets in lockstep.
future<cdc::cdc_stream_diff> cdc::metadata::generate_stream_diff(const std::vector<stream_id>& before, const std::vector<stream_id>& after) {
std::vector<stream_id> closed, opened;
auto before_it = before.begin();
auto after_it = after.begin();
while (before_it != before.end()) {
co_await coroutine::maybe_yield();
if (after_it == after.end()) {
// `after` is exhausted: every remaining `before` stream was closed.
while (before_it != before.end()) {
co_await coroutine::maybe_yield();
closed.push_back(*before_it++);
}
break;
}
if (after_it->token() < before_it->token()) {
// Stream exists only in `after`: it was opened.
opened.push_back(*after_it++);
} else if (after_it->token() > before_it->token()) {
// Stream exists only in `before`: it was closed.
closed.push_back(*before_it++);
} else if (*after_it != *before_it) {
// Same token, different stream id: old one closed, new one opened.
opened.push_back(*after_it++);
closed.push_back(*before_it++);
} else {
// Identical stream in both sets: no change.
after_it++;
before_it++;
}
}
// `before` is exhausted: every remaining `after` stream was opened.
while (after_it != after.end()) {
co_await coroutine::maybe_yield();
opened.push_back(*after_it++);
}
co_return cdc_stream_diff {
.closed_streams = std::move(closed),
.opened_streams = std::move(opened)
};
}

View File

@@ -11,7 +11,7 @@
#include <map>
#include "db_clock.hh"
#include "mutation/timestamp.hh"
#include "timestamp.hh"
#include "cdc/generation.hh"
namespace dht {
@@ -37,20 +37,10 @@ class metadata final {
using container_t = std::map<api::timestamp_type, std::optional<topology_description>>;
container_t _gens;
// per-table streams map for tables in tablets-based keyspaces.
// the streams map is shared with the virtual tables reader, hence we can only insert new entries to it, not erase.
using table_streams_ptr = lw_shared_ptr<table_streams>;
using tablet_streams_map = std::unordered_map<table_id, table_streams_ptr>;
tablet_streams_map _tablet_streams;
/* The timestamp used in the last successful `get_vnode_stream` call. */
/* The timestamp used in the last successful `get_stream` call. */
api::timestamp_type _last_stream_timestamp = api::missing_timestamp;
container_t::const_iterator gen_used_at(api::timestamp_type ts) const;
const std::vector<stream_id>& get_tablet_stream_set(table_id tid, api::timestamp_type ts) const;
public:
/* Is a generation with the given timestamp already known or obsolete? It is obsolete if and only if
* it is older than the generation operating at `now - get_generation_leeway()`.
@@ -61,65 +51,37 @@ public:
* CDC logs will fail fast.
*/
bool streams_available() const;
/* Return the stream for a vnode-based keyspace for the base partition whose token is `tok` to which a corresponding
* log write should go according to the generation used at time `ts` (i.e, the latest generation whose timestamp is
* less or equal to `ts`).
/* Return the stream for the base partition whose token is `tok` to which a corresponding log write should go
* according to the generation used at time `ts` (i.e, the latest generation whose timestamp is less or equal to `ts`).
*
* If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
* we reject the get_vnode_stream query. This is because the resulting stream might belong to a generation which we don't
* we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
* yet know about. Similarly, we reject queries to the previous generations if the timestamp is too far away "into
* the past". The amount of leeway (how much "into the future" or "into the past" we allow `ts` to be) is defined by
* `get_generation_leeway()`.
*/
stream_id get_vnode_stream(api::timestamp_type ts, dht::token tok);
stream_id get_stream(api::timestamp_type ts, dht::token tok);
/* Similar to get_vnode_stream but for tablet-based keyspaces.
* In addition to the base partition token and timestamp, the stream also depends on the table id because each table
* has its own set of streams.
*/
stream_id get_tablet_stream(table_id tid, api::timestamp_type ts, dht::token tok);
/* Insert the generation given by `gen` with timestamp `ts` to be used by the `get_vnode_stream` function,
/* Insert the generation given by `gen` with timestamp `ts` to be used by the `get_stream` function,
* if the generation is not already known or older than the currently known ones.
*
* Returns true if the generation was inserted,
* meaning that `get_vnode_stream` might return a stream from this generation (at some time points).
* meaning that `get_stream` might return a stream from this generation (at some time points).
*/
bool insert(db_clock::time_point ts, topology_description&& gen);
/* Prepare for inserting a new generation whose timestamp is `ts`.
* This method is not required to be called before `insert`, but it's here
* to increase safety of `get_vnode_stream` calls in some situations. Use it if you:
* to increase safety of `get_stream` calls in some situations. Use it if you:
* 1. know that there is a new generation, but
* 2. you didn't yet retrieve the generation's topology_description.
*
* After preparing a generation, if `get_vnode_stream` is supposed to return a stream from this generation
* After preparing a generation, if `get_stream` is supposed to return a stream from this generation
* but we don't yet have the generation's data, it will reject the query to maintain consistency of streams.
*
* Returns true iff this generation is not obsolete and wasn't previously prepared nor inserted.
*/
bool prepare(db_clock::time_point ts);
void load_tablet_streams_map(table_id tid, table_streams new_table_map);
void append_tablet_streams_map(table_id tid, table_streams new_table_map);
void remove_tablet_streams_map(table_id tid);
const tablet_streams_map& get_all_tablet_streams() const {
return _tablet_streams;
}
std::vector<table_id> get_tables_with_cdc_tablet_streams() const;
static future<std::vector<stream_id>> construct_next_stream_set(
const std::vector<cdc::stream_id>& prev_stream_set,
std::vector<cdc::stream_id> opened,
const std::vector<cdc::stream_id>& closed);
static future<cdc_stream_diff> generate_stream_diff(
const std::vector<stream_id>& before,
const std::vector<stream_id>& after);
};
} // namespace cdc

View File

@@ -9,7 +9,7 @@
#include "mutation/mutation.hh"
#include "schema/schema.hh"
#include "types/concrete_types.hh"
#include "concrete_types.hh"
#include "types/user.hh"
#include "split.hh"
@@ -111,15 +111,6 @@ struct batch {
ret.insert(std::make_pair(change.key, all_columns));
}
}
// While deleting a full partition avoids row-by-row logging for performance
// reasons, we must explicitly log single-row deletions for tables without a
// clustering key. This ensures consistent behavior with deletions of single
// rows from tables with a clustering key. See issue #26382.
if (partition_deletions && s.clustering_key_size() == 0) {
cdc::one_kind_column_set all_columns{s.regular_columns_count()};
all_columns.set(0, s.regular_columns_count(), true);
ret.emplace(clustering_key::make_empty(), all_columns);
}
auto process_change_type = [&] (const auto& changes) {
for (const auto& change : changes) {

View File

@@ -10,7 +10,7 @@
#include <boost/dynamic_bitset.hpp> // IWYU pragma: keep
#include "replica/database_fwd.hh"
#include "mutation/timestamp.hh"
#include "timestamp.hh"
class mutation;

View File

@@ -8,7 +8,7 @@
#include <fmt/chrono.h>
#include "mutation/timestamp.hh"
#include "timestamp.hh"
#include "clocks-impl.hh"

View File

@@ -30,6 +30,5 @@ if(_slp_vectorize_supported)
endif()
add_link_options($<$<CONFIG:RelWithDebInfo>:LINKER:--gc-sections>)
add_link_options($<$<CONFIG:RelWithDebInfo>:LINKER:--no-lto-pgo-warn-mismatch>)
maybe_limit_stack_usage_in_KB(13 RelWithDebInfo)

View File

@@ -117,9 +117,6 @@ add_compile_options("-ffile-prefix-map=${CMAKE_BINARY_DIR}=.")
cmake_path(GET CMAKE_BINARY_DIR FILENAME build_dir_name)
add_compile_options("-ffile-prefix-map=${CMAKE_BINARY_DIR}/=${build_dir_name}")
# https://github.com/llvm/llvm-project/issues/163007
add_compile_options("-fextend-variable-liveness=none")
default_target_arch(target_arch)
if(target_arch)
add_compile_options("-march=${target_arch}")
@@ -138,8 +135,6 @@ function(maybe_limit_stack_usage_in_KB stack_usage_threshold_in_KB config)
endif()
endfunction()
option(Scylla_WITH_DEBUG_INFO "Enable debug info" OFF)
macro(update_build_flags config)
cmake_parse_arguments (
parsed_args
@@ -155,7 +150,7 @@ macro(update_build_flags config)
set(linker_flags "CMAKE_EXE_LINKER_FLAGS_${CONFIG}")
string(APPEND ${cxx_flags}
" -O${parsed_args_OPTIMIZATION_LEVEL}")
if(parsed_args_WITH_DEBUG_INFO OR ${Scylla_WITH_DEBUG_INFO})
if(parsed_args_WITH_DEBUG_INFO)
string(APPEND ${cxx_flags} " -g -gz")
else()
# If Scylla is compiled without debug info, strip the debug symbols from

View File

@@ -9,7 +9,7 @@
#include "utils/assert.hh"
#include "types/collection.hh"
#include "types/user.hh"
#include "types/concrete_types.hh"
#include "concrete_types.hh"
#include "mutation/mutation_partition.hh"
#include "compaction/compaction_garbage_collector.hh"
#include "combine.hh"

View File

@@ -28,7 +28,6 @@
#include <seastar/coroutine/maybe_yield.hh>
#include "compaction/compaction_garbage_collector.hh"
#include "compaction/exceptions.hh"
#include "dht/i_partitioner.hh"
#include "sstables/exceptions.hh"
#include "sstables/sstables.hh"
@@ -53,7 +52,7 @@
#include "readers/compacting.hh"
#include "tombstone_gc.hh"
#include "replica/database.hh"
#include "mutation/timestamp.hh"
#include "timestamp.hh"
can_gc_fn always_gc = [] (tombstone, is_shadowable) { return true; };
@@ -111,9 +110,9 @@ auto fmt::formatter<max_purgeable>::format(max_purgeable mp, fmt::format_context
return format_to(ctx.out(), "max_purgeable{{timestamp={}, expiry_treshold={}, source={}}}", mp.timestamp(), expiry_str, mp.source());
}
namespace compaction {
namespace sstables {
bool is_eligible_for_compaction(const sstables::shared_sstable& sst) noexcept {
bool is_eligible_for_compaction(const shared_sstable& sst) noexcept {
return !sst->requires_view_building() && !sst->is_quarantined();
}
@@ -129,7 +128,6 @@ static const std::unordered_map<compaction_type, sstring> compaction_types = {
{ compaction_type::Upgrade, "UPGRADE" },
{ compaction_type::Reshape, "RESHAPE" },
{ compaction_type::Split, "SPLIT" },
{ compaction_type::Major, "MAJOR" },
};
sstring compaction_name(compaction_type type) {
@@ -160,7 +158,6 @@ std::string_view to_string(compaction_type type) {
case compaction_type::Upgrade: return "Upgrade";
case compaction_type::Reshape: return "Reshape";
case compaction_type::Split: return "Split";
case compaction_type::Major: return "Major";
}
on_internal_error_noexcept(clogger, format("Invalid compaction type {}", int(type)));
return "(invalid)";
@@ -194,8 +191,8 @@ std::string_view to_string(compaction_type_options::scrub::quarantine_mode quara
return "(invalid)";
}
static max_purgeable get_max_purgeable_timestamp(const compaction_group_view& table_s, sstables::sstable_set::incremental_selector& selector,
const std::unordered_set<sstables::shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks,
static max_purgeable get_max_purgeable_timestamp(const compaction_group_view& table_s, sstable_set::incremental_selector& selector,
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks,
const api::timestamp_type compacting_max_timestamp, const bool gc_check_only_compacting_sstables, const is_shadowable is_shadowable) {
if (!table_s.tombstone_gc_enabled()) [[unlikely]] {
clogger.trace("get_max_purgeable_timestamp {}.{}: tombstone_gc_enabled=false, returning min_timestamp",
@@ -289,23 +286,23 @@ static max_purgeable get_max_purgeable_timestamp(const compaction_group_view& ta
return max_purgeable(timestamp, source);
}
static std::vector<sstables::shared_sstable> get_uncompacting_sstables(const compaction_group_view& table_s, std::vector<sstables::shared_sstable> sstables) {
static std::vector<shared_sstable> get_uncompacting_sstables(const compaction_group_view& table_s, std::vector<shared_sstable> sstables) {
auto sstable_set = table_s.sstable_set_for_tombstone_gc();
auto all_sstables = *sstable_set->all() | std::ranges::to<std::vector>();
auto& compacted_undeleted = table_s.compacted_undeleted_sstables();
all_sstables.insert(all_sstables.end(), compacted_undeleted.begin(), compacted_undeleted.end());
std::ranges::sort(all_sstables, std::ranges::less(), std::mem_fn(&sstables::sstable::generation));
std::ranges::sort(sstables, std::ranges::less(), std::mem_fn(&sstables::sstable::generation));
std::vector<sstables::shared_sstable> not_compacted_sstables;
std::ranges::sort(all_sstables, std::ranges::less(), std::mem_fn(&sstable::generation));
std::ranges::sort(sstables, std::ranges::less(), std::mem_fn(&sstable::generation));
std::vector<shared_sstable> not_compacted_sstables;
std::ranges::set_difference(all_sstables, sstables, std::back_inserter(not_compacted_sstables),
std::ranges::less(), std::mem_fn(&sstables::sstable::generation), std::mem_fn(&sstables::sstable::generation));
std::ranges::less(), std::mem_fn(&sstable::generation), std::mem_fn(&sstable::generation));
return not_compacted_sstables;
}
static std::vector<sstables::basic_info> extract_basic_info_from_sstables(const std::vector<sstables::shared_sstable>& sstables) {
static std::vector<basic_info> extract_basic_info_from_sstables(const std::vector<shared_sstable>& sstables) {
return sstables | std::views::transform([] (auto&& sst) {
return sstables::basic_info{.generation = sst->generation(), .origin = sst->get_origin(), .size = sst->bytes_on_disk()};
}) | std::ranges::to<std::vector<sstables::basic_info>>();
}) | std::ranges::to<std::vector<basic_info>>();
}
class compaction;
@@ -360,19 +357,19 @@ public:
};
struct compaction_writer {
sstables::shared_sstable sst;
shared_sstable sst;
// We use a ptr for pointer stability and so that it can be null
// when using a noop monitor.
sstables::sstable_writer writer;
sstable_writer writer;
// The order in here is important. A monitor must be destroyed before the writer it is monitoring since it has a
// periodic timer that checks the writer.
// The writer must be destroyed before the shared_sstable since the it may depend on the sstable
// (as in the mx::writer over compressed_file_data_sink_impl case that depends on sstables::compression).
std::unique_ptr<compaction_write_monitor> monitor;
compaction_writer(std::unique_ptr<compaction_write_monitor> monitor, sstables::sstable_writer writer, sstables::shared_sstable sst)
compaction_writer(std::unique_ptr<compaction_write_monitor> monitor, sstable_writer writer, shared_sstable sst)
: sst(std::move(sst)), writer(std::move(writer)), monitor(std::move(monitor)) {}
compaction_writer(sstables::sstable_writer writer, sstables::shared_sstable sst)
compaction_writer(sstable_writer writer, shared_sstable sst)
: compaction_writer(nullptr, std::move(writer), std::move(sst)) {}
};
@@ -451,7 +448,7 @@ public:
using use_backlog_tracker = bool_class<class use_backlog_tracker_tag>;
struct compaction_read_monitor_generator final : public sstables::read_monitor_generator {
struct compaction_read_monitor_generator final : public read_monitor_generator {
class compaction_read_monitor final : public sstables::read_monitor, public backlog_read_progress_manager {
sstables::shared_sstable _sst;
compaction_group_view& _table_s;
@@ -523,13 +520,13 @@ struct compaction_read_monitor_generator final : public sstables::read_monitor_g
}
private:
compaction_group_view& _table_s;
std::unordered_map<sstables::generation_type, compaction_read_monitor> _generated_monitors;
std::unordered_map<generation_type, compaction_read_monitor> _generated_monitors;
use_backlog_tracker _use_backlog_tracker;
friend class compaction_progress_monitor;
};
void compaction_progress_monitor::set_generator(std::unique_ptr<sstables::read_monitor_generator> generator) {
void compaction_progress_monitor::set_generator(std::unique_ptr<read_monitor_generator> generator) {
_generator = std::move(generator);
}
@@ -554,16 +551,16 @@ protected:
const compaction_sstable_creator_fn _sstable_creator;
const schema_ptr _schema;
const reader_permit _permit;
std::vector<sstables::shared_sstable> _sstables;
std::vector<sstables::generation_type> _input_sstable_generations;
std::vector<sstables::basic_info> _input_sstables_basic_info;
std::vector<shared_sstable> _sstables;
std::vector<generation_type> _input_sstable_generations;
std::vector<basic_info> _input_sstables_basic_info;
// Unused sstables are tracked because if compaction is interrupted we can only delete them.
// Deleting used sstables could potentially result in data loss.
std::unordered_set<sstables::shared_sstable> _new_partial_sstables;
std::vector<sstables::shared_sstable> _new_unused_sstables;
std::vector<sstables::shared_sstable> _all_new_sstables;
lw_shared_ptr<sstables::sstable_set> _compacting;
const compaction_type _type;
std::unordered_set<shared_sstable> _new_partial_sstables;
std::vector<shared_sstable> _new_unused_sstables;
std::vector<shared_sstable> _all_new_sstables;
lw_shared_ptr<sstable_set> _compacting;
const sstables::compaction_type _type;
const uint64_t _max_sstable_size;
const uint32_t _sstable_level;
uint64_t _start_size = 0;
@@ -582,21 +579,21 @@ protected:
bool _contains_multi_fragment_runs = false;
mutation_source_metadata _ms_metadata = {};
const compaction_sstable_replacer_fn _replacer;
const sstables::run_id _run_identifier;
const run_id _run_identifier;
// optional clone of sstable set to be used for expiration purposes, so it will be set if expiration is enabled.
std::optional<sstables::sstable_set> _sstable_set;
std::optional<sstable_set> _sstable_set;
// used to incrementally calculate max purgeable timestamp, as we iterate through decorated keys.
std::optional<sstables::sstable_set::incremental_selector> _selector;
std::unordered_set<sstables::shared_sstable> _compacting_for_max_purgeable_func;
std::optional<sstable_set::incremental_selector> _selector;
std::unordered_set<shared_sstable> _compacting_for_max_purgeable_func;
// optional owned_ranges vector for cleanup;
const owned_ranges_ptr _owned_ranges = {};
// required for reshard compaction.
const dht::sharder* _sharder = nullptr;
const std::optional<dht::incremental_owned_ranges_checker> _owned_ranges_checker;
// Garbage collected sstables that are sealed but were not added to SSTable set yet.
std::vector<sstables::shared_sstable> _unused_garbage_collected_sstables;
std::vector<shared_sstable> _unused_garbage_collected_sstables;
// Garbage collected sstables that were added to SSTable set and should be eventually removed from it.
std::vector<sstables::shared_sstable> _used_garbage_collected_sstables;
std::vector<shared_sstable> _used_garbage_collected_sstables;
utils::observable<> _stop_request_observable;
// optional tombstone_gc_state that is used when gc has to check only the compacting sstables to collect tombstones.
std::optional<tombstone_gc_state> _tombstone_gc_state_with_commitlog_check_disabled;
@@ -612,7 +609,7 @@ private:
// Called in a seastar thread
dht::partition_range_vector
get_ranges_for_invalidation(const std::vector<sstables::shared_sstable>& sstables) {
get_ranges_for_invalidation(const std::vector<shared_sstable>& sstables) {
// If owned ranges is disengaged, it means no cleanup work was done and
// so nothing needs to be invalidated.
if (!_owned_ranges) {
@@ -621,7 +618,7 @@ private:
auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
auto non_owned_ranges = sstables
| std::views::transform([] (const sstables::shared_sstable& sst) {
| std::views::transform([] (const shared_sstable& sst) {
seastar::thread::maybe_yield();
return dht::partition_range::make({sst->get_first_decorated_key(), true},
{sst->get_last_decorated_key(), true});
@@ -644,26 +641,26 @@ protected:
, _replacer(std::move(descriptor.replacer))
, _run_identifier(descriptor.run_identifier)
, _sstable_set(std::move(descriptor.all_sstables_snapshot))
, _selector(_sstable_set ? _sstable_set->make_incremental_selector() : std::optional<sstables::sstable_set::incremental_selector>{})
, _compacting_for_max_purgeable_func(std::unordered_set<sstables::shared_sstable>(_sstables.begin(), _sstables.end()))
, _selector(_sstable_set ? _sstable_set->make_incremental_selector() : std::optional<sstable_set::incremental_selector>{})
, _compacting_for_max_purgeable_func(std::unordered_set<shared_sstable>(_sstables.begin(), _sstables.end()))
, _owned_ranges(std::move(descriptor.owned_ranges))
, _sharder(descriptor.sharder)
, _owned_ranges_checker(_owned_ranges ? std::optional<dht::incremental_owned_ranges_checker>(*_owned_ranges) : std::nullopt)
, _tombstone_gc_state_with_commitlog_check_disabled(descriptor.gc_check_only_compacting_sstables ? std::make_optional(_table_s.get_tombstone_gc_state().with_commitlog_check_disabled()) : std::nullopt)
, _progress_monitor(progress_monitor)
{
std::unordered_set<sstables::run_id> ssts_run_ids;
_contains_multi_fragment_runs = std::any_of(_sstables.begin(), _sstables.end(), [&ssts_run_ids] (sstables::shared_sstable& sst) {
std::unordered_set<run_id> ssts_run_ids;
_contains_multi_fragment_runs = std::any_of(_sstables.begin(), _sstables.end(), [&ssts_run_ids] (shared_sstable& sst) {
return !ssts_run_ids.insert(sst->run_identifier()).second;
});
_progress_monitor.set_generator(std::make_unique<compaction_read_monitor_generator>(_table_s, use_backlog_tracker));
}
sstables::read_monitor_generator& unwrap_monitor_generator() const {
read_monitor_generator& unwrap_monitor_generator() const {
if (_progress_monitor._generator) {
return *_progress_monitor._generator;
}
return sstables::default_read_monitor_generator();
return default_read_monitor_generator();
}
virtual uint64_t partitions_per_sstable() const {
@@ -674,7 +671,7 @@ protected:
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions, _schema));
}
void setup_new_sstable(sstables::shared_sstable& sst) {
void setup_new_sstable(shared_sstable& sst) {
_all_new_sstables.push_back(sst);
_new_partial_sstables.insert(sst);
for (auto ancestor : _input_sstable_generations) {
@@ -691,14 +688,14 @@ protected:
_new_partial_sstables.erase(writer->sst);
}
sstables::sstable_writer_config make_sstable_writer_config(compaction_type type) {
sstable_writer_config make_sstable_writer_config(compaction_type type) {
auto s = compaction_name(type);
std::transform(s.begin(), s.end(), s.begin(), [] (char c) {
return std::tolower(c);
});
sstables::sstable_writer_config cfg = _table_s.configure_writer(std::move(s));
sstable_writer_config cfg = _table_s.configure_writer(std::move(s));
cfg.max_sstable_size = _max_sstable_size;
cfg.monitor = &sstables::default_write_monitor();
cfg.monitor = &default_write_monitor();
cfg.run_identifier = _run_identifier;
cfg.replay_position = _rp;
cfg.sstable_level = _sstable_level;
@@ -706,7 +703,7 @@ protected:
}
api::timestamp_type maximum_timestamp() const {
auto m = std::max_element(_sstables.begin(), _sstables.end(), [] (const sstables::shared_sstable& sst1, const sstables::shared_sstable& sst2) {
auto m = std::max_element(_sstables.begin(), _sstables.end(), [] (const shared_sstable& sst1, const shared_sstable& sst2) {
return sst1->get_stats_metadata().max_timestamp < sst2->get_stats_metadata().max_timestamp;
});
return (*m)->get_stats_metadata().max_timestamp;
@@ -717,7 +714,7 @@ protected:
}
compaction_completion_desc
get_compaction_completion_desc(std::vector<sstables::shared_sstable> input_sstables, std::vector<sstables::shared_sstable> output_sstables) {
get_compaction_completion_desc(std::vector<shared_sstable> input_sstables, std::vector<shared_sstable> output_sstables) {
auto ranges_for_for_invalidation = get_ranges_for_invalidation(input_sstables);
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
}
@@ -728,11 +725,11 @@ protected:
return bool(_sstable_set) && _table_s.tombstone_gc_enabled();
}
compaction_writer create_gc_compaction_writer(sstables::run_id gc_run) const {
compaction_writer create_gc_compaction_writer(run_id gc_run) const {
auto sst = _sstable_creator(this_shard_id());
auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
sstables::sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
cfg.run_identifier = gc_run;
cfg.monitor = monitor.get();
uint64_t estimated_partitions = std::max(1UL, uint64_t(ceil(partitions_per_sstable() * _estimated_droppable_tombstone_ratio)));
@@ -760,7 +757,7 @@ protected:
// created here as:
// 1. it can be shared across all sstables created by this writer
// 2. it is optional, as gc writer is not always used
auto gc_run = sstables::run_id::create_random_id();
auto gc_run = run_id::create_random_id();
return compacted_fragments_writer(*this,
[this, gc_run] (const dht::decorated_key&) { return create_gc_compaction_writer(gc_run); },
[this] (compaction_writer* cw) { stop_gc_compaction_writer(cw); },
@@ -769,13 +766,13 @@ protected:
// Retrieves all unused garbage collected sstables that will be subsequently added
// to the SSTable set, and mark them as used.
std::vector<sstables::shared_sstable> consume_unused_garbage_collected_sstables() {
std::vector<shared_sstable> consume_unused_garbage_collected_sstables() {
auto unused = std::exchange(_unused_garbage_collected_sstables, {});
_used_garbage_collected_sstables.insert(_used_garbage_collected_sstables.end(), unused.begin(), unused.end());
return unused;
}
const std::vector<sstables::shared_sstable>& used_garbage_collected_sstables() const {
const std::vector<shared_sstable>& used_garbage_collected_sstables() const {
return _used_garbage_collected_sstables;
}
@@ -863,7 +860,6 @@ private:
_input_sstables_basic_info.reserve(_sstables.size());
int64_t repaired_at = 0;
std::vector<int64_t> repaired_at_for_compacted_sstables;
uint64_t compaction_size = 0;
for (auto& sst : _sstables) {
co_await coroutine::maybe_yield();
auto& sst_stats = sst->get_stats_metadata();
@@ -887,7 +883,7 @@ private:
}
_stats_collector.update(sst->get_encoding_stats_for_compaction());
compaction_size += sst->data_size();
_cdata.compaction_size += sst->data_size();
// We also capture the sstable, so we keep it alive while the read isn't done
ssts->insert(sst);
// FIXME: If the sstables have cardinality estimation bitmaps, use that
@@ -901,7 +897,6 @@ private:
_rp = std::max(_rp, sst_stats.position);
}
}
_cdata.compaction_size += compaction_size;
log_debug("{} [{}]", report_start_desc(), fmt::join(_sstables | std::views::transform([] (auto sst) { return to_string(sst, true); }), ","));
if (repaired_at) {
_output_repaired_at = repaired_at;
@@ -1273,9 +1268,9 @@ public:
sm_fwd,
mr_fwd,
unwrap_monitor_generator(),
sstables::default_sstable_predicate(),
default_sstable_predicate(),
&_reader_statistics,
sstables::integrity_check::yes);
integrity_check::yes);
}
std::string_view report_start_desc() const override {
@@ -1291,7 +1286,7 @@ public:
setup_new_sstable(sst);
auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
sstables::sstable_writer_config cfg = make_sstable_writer_config(_type);
sstable_writer_config cfg = make_sstable_writer_config(_type);
cfg.monitor = monitor.get();
return compaction_writer{std::move(monitor), sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats()), sst};
}
@@ -1311,7 +1306,7 @@ public:
replace_remaining_exhausted_sstables();
}
private:
void maybe_replace_exhausted_sstables_by_sst(sstables::shared_sstable sst) {
void maybe_replace_exhausted_sstables_by_sst(shared_sstable sst) {
// Skip earlier replacement of exhausted sstables if compaction works with only single-fragment runs,
// meaning incremental compaction is disabled for this compaction.
if (!enable_garbage_collected_sstable_writer()) {
@@ -1319,7 +1314,7 @@ private:
}
auto permit = seastar::get_units(_replacer_lock, 1).get();
// Replace exhausted sstable(s), if any, by new one(s) in the column family.
auto not_exhausted = [s = _schema, &dk = sst->get_last_decorated_key()] (sstables::shared_sstable& sst) {
auto not_exhausted = [s = _schema, &dk = sst->get_last_decorated_key()] (shared_sstable& sst) {
return sst->get_last_decorated_key().tri_compare(*s, dk) > 0;
};
auto exhausted = std::partition(_sstables.begin(), _sstables.end(), not_exhausted);
@@ -1327,7 +1322,7 @@ private:
if (exhausted != _sstables.end()) {
// The goal is that exhausted sstables will be deleted as soon as possible,
// so we need to release reference to them.
std::for_each(exhausted, _sstables.end(), [this] (sstables::shared_sstable& sst) {
std::for_each(exhausted, _sstables.end(), [this] (shared_sstable& sst) {
_compacting_for_max_purgeable_func.erase(sst);
// Fully expired sstable is not actually compacted, therefore it's not present in the compacting set.
_compacting->erase(sst);
@@ -1342,7 +1337,7 @@ private:
auto unused_gc_sstables = consume_unused_garbage_collected_sstables();
_new_unused_sstables.insert(_new_unused_sstables.end(), unused_gc_sstables.begin(), unused_gc_sstables.end());
auto exhausted_ssts = std::vector<sstables::shared_sstable>(exhausted, _sstables.end());
auto exhausted_ssts = std::vector<shared_sstable>(exhausted, _sstables.end());
log_debug("Replacing earlier exhausted sstable(s) [{}] by new sstable(s) [{}]",
fmt::join(exhausted_ssts | std::views::transform([] (auto sst) { return to_string(sst, false); }), ","),
fmt::join(_new_unused_sstables | std::views::transform([] (auto sst) { return to_string(sst, true); }), ","));
@@ -1354,7 +1349,7 @@ private:
void replace_remaining_exhausted_sstables() {
if (!_sstables.empty() || !used_garbage_collected_sstables().empty()) {
std::vector<sstables::shared_sstable> old_sstables;
std::vector<shared_sstable> old_sstables;
std::move(_sstables.begin(), _sstables.end(), std::back_inserter(old_sstables));
// Remove Garbage Collected SSTables from the SSTable set if any was previously added.
@@ -1423,9 +1418,9 @@ public:
sm_fwd,
mr_fwd,
unwrap_monitor_generator(),
sstables::default_sstable_predicate(),
default_sstable_predicate(),
nullptr,
sstables::integrity_check::yes);
integrity_check::yes);
}
std::string_view report_start_desc() const override {
@@ -1440,7 +1435,7 @@ public:
auto sst = _sstable_creator(this_shard_id());
setup_new_sstable(sst);
sstables::sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats()), sst};
}
@@ -1513,10 +1508,10 @@ public:
// Deduce the estimated keys based on the token range that will end up in this new sstable after the split.
auto token_range_of_new_sst = _table_s.get_token_range_after_split(dk.token());
const auto estimated_keys = _sstables[0]->estimated_keys_for_range(token_range_of_new_sst).get();
const auto estimated_keys = _sstables[0]->estimated_keys_for_range(token_range_of_new_sst);
auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
sstables::sstable_writer_config cfg = make_sstable_writer_config(_type);
sstable_writer_config cfg = make_sstable_writer_config(_type);
cfg.monitor = monitor.get();
return compaction_writer{std::move(monitor), sst->get_writer(*_schema, estimated_keys, cfg, get_encoding_stats()), sst};
@@ -1539,8 +1534,6 @@ private:
mutation_fragment_stream_validator _validator;
bool _skip_to_next_partition = false;
uint64_t& _validation_errors;
bool& _failed_to_fix_sstable;
compaction_type_options::scrub::drop_unfixable_sstables _drop_unfixable_sstables;
private:
void maybe_abort_scrub(std::function<void()> report_error) {
@@ -1551,7 +1544,7 @@ private:
++_validation_errors;
}
skip on_unexpected_partition_start(const mutation_fragment_v2& ps, sstring error) {
void on_unexpected_partition_start(const mutation_fragment_v2& ps, sstring error) {
auto report_fn = [this, error] (std::string_view action = "") {
report_validation_error(compaction_type::Scrub, *_schema, error, action);
};
@@ -1560,11 +1553,6 @@ private:
auto pe = mutation_fragment_v2(*_schema, _permit, partition_end{});
if (!_validator(pe)) {
if (_drop_unfixable_sstables) {
_failed_to_fix_sstable = true;
end_stream();
return skip::yes;
}
throw compaction_aborted_exception(
_schema->ks_name(),
_schema->cf_name(),
@@ -1573,17 +1561,11 @@ private:
push_mutation_fragment(std::move(pe));
if (!_validator(ps)) {
if (_drop_unfixable_sstables) {
_failed_to_fix_sstable = true;
end_stream();
return skip::yes;
}
throw compaction_aborted_exception(
_schema->ks_name(),
_schema->cf_name(),
"scrub compaction failed to rectify unexpected partition-start, validator rejects it even after the injected partition-end");
}
return skip::no;
}
skip on_invalid_partition(const dht::decorated_key& new_key, sstring error) {
@@ -1611,11 +1593,6 @@ private:
const auto& key = _validator.previous_partition_key();
if (_validator.current_tombstone()) {
if (_drop_unfixable_sstables) {
_failed_to_fix_sstable = true;
end_stream();
return skip::yes;
}
throw compaction_aborted_exception(
_schema->ks_name(),
_schema->cf_name(),
@@ -1654,46 +1631,9 @@ private:
report_fn("Rectifying by adding missing partition-end to the end of the stream");
}
void on_malformed_sstable_exception(std::exception_ptr e) {
bool should_abort = _scrub_mode == compaction_type_options::scrub::mode::abort ||
(_scrub_mode == compaction_type_options::scrub::mode::segregate && !_drop_unfixable_sstables);
if (should_abort) {
throw compaction_aborted_exception(
_schema->ks_name(),
_schema->cf_name(),
format("scrub compaction failed due to unrecoverable error: {}", e));
}
if (_drop_unfixable_sstables) {
_failed_to_fix_sstable = true;
}
end_stream();
}
void end_stream() {
// Closes the active range tombstone if needed, before emitting partition end.
if (auto current_tombstone = _validator.current_tombstone(); current_tombstone) {
const auto& last_pos = _validator.previous_position();
auto after_last_pos = position_in_partition::after_key(*_schema, last_pos.key());
auto rtc = range_tombstone_change(std::move(after_last_pos), tombstone{});
push_mutation_fragment(mutation_fragment_v2(*_schema, _permit, std::move(rtc)));
}
// Emit partition end if needed.
if (_validator.previous_mutation_fragment_kind() != mutation_fragment_v2::kind::partition_end) {
push_mutation_fragment(mutation_fragment_v2(*_schema, _permit, partition_end{}));
}
// Report the end of stream.
_end_of_stream = true;
}
void fill_buffer_from_underlying() {
utils::get_local_injector().inject("rest_api_keyspace_scrub_abort", [] { throw compaction_aborted_exception("", "", "scrub compaction found invalid data"); });
while (!_reader.is_buffer_empty() && !is_buffer_full()) {
if (_end_of_stream && _failed_to_fix_sstable) {
return;
}
auto mf = _reader.pop_mutation_fragment();
if (mf.is_partition_start()) {
// First check that fragment kind monotonicity stands.
@@ -1704,9 +1644,7 @@ private:
// will confuse it.
if (!_skip_to_next_partition) {
if (auto res = _validator(mf); !res) {
if (on_unexpected_partition_start(mf, res.what()) == skip::yes) {
continue;
}
on_unexpected_partition_start(mf, res.what());
}
// Continue processing this partition start.
}
@@ -1730,10 +1668,6 @@ private:
push_mutation_fragment(std::move(mf));
}
if (_end_of_stream && _failed_to_fix_sstable) {
return;
}
_end_of_stream = _reader.is_end_of_stream() && _reader.is_buffer_empty();
if (_end_of_stream) {
@@ -1744,15 +1678,12 @@ private:
}
public:
reader(mutation_reader underlying, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors,
bool& failed_to_fix_sstable, compaction_type_options::scrub::drop_unfixable_sstables drop_unfixable_sstables)
reader(mutation_reader underlying, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors)
: impl(underlying.schema(), underlying.permit())
, _scrub_mode(scrub_mode)
, _reader(std::move(underlying))
, _validator(*_schema)
, _validation_errors(validation_errors)
, _failed_to_fix_sstable(failed_to_fix_sstable)
, _drop_unfixable_sstables(drop_unfixable_sstables)
{ }
virtual future<> fill_buffer() override {
if (_end_of_stream) {
@@ -1772,8 +1703,6 @@ private:
} catch (const storage_io_error&) {
// Propagate these unchanged.
throw;
} catch (const sstables::malformed_sstable_exception& e) {
on_malformed_sstable_exception(std::current_exception());
} catch (...) {
// We don't want failed scrubs to be retried.
throw compaction_aborted_exception(
@@ -1803,7 +1732,6 @@ private:
mutable std::string _scrub_finish_description;
uint64_t _bucket_count = 0;
uint64_t _validation_errors = 0;
bool _failed_to_fix_sstable = false;
public:
scrub_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::scrub options, compaction_progress_monitor& progress_monitor)
@@ -1834,8 +1762,8 @@ public:
if (!range.is_full()) {
on_internal_error(clogger, fmt::format("Scrub compaction in mode {} expected full partition range, but got {} instead", _options.operation_mode, range));
}
auto full_scan_reader = _compacting->make_full_scan_reader(std::move(s), std::move(permit), nullptr, unwrap_monitor_generator(), sstables::integrity_check::yes);
return make_mutation_reader<reader>(std::move(full_scan_reader), _options.operation_mode, _validation_errors, _failed_to_fix_sstable, _options.drop_unfixable);
auto full_scan_reader = _compacting->make_full_scan_reader(std::move(s), std::move(permit), nullptr, unwrap_monitor_generator(), integrity_check::yes);
return make_mutation_reader<reader>(std::move(full_scan_reader), _options.operation_mode, _validation_errors);
}
uint64_t partitions_per_sstable() const override {
@@ -1872,45 +1800,11 @@ public:
return ret;
}
void drop_unfixable_sstables() {
if (!_sstables.empty() || !used_garbage_collected_sstables().empty()) {
std::vector<sstables::shared_sstable> old_sstables;
std::move(_sstables.begin(), _sstables.end(), std::back_inserter(old_sstables));
// Remove Garbage Collected SSTables from the SSTable set if any was previously added.
auto& used_gc_sstables = used_garbage_collected_sstables();
old_sstables.insert(old_sstables.end(), used_gc_sstables.begin(), used_gc_sstables.end());
_replacer(get_compaction_completion_desc(std::move(old_sstables), {}));
}
// Mark new sstables for deletion as well
for (auto& sst : boost::range::join(_new_partial_sstables, _new_unused_sstables)) {
sst->mark_for_deletion();
}
}
virtual void on_end_of_compaction() override {
if (_options.drop_unfixable && _failed_to_fix_sstable) {
drop_unfixable_sstables();
} else {
regular_compaction::on_end_of_compaction();
}
}
virtual void stop_sstable_writer(compaction_writer* writer) override {
if (_options.drop_unfixable && _failed_to_fix_sstable && writer) {
finish_new_sstable(writer);
} else {
regular_compaction::stop_sstable_writer(writer);
}
}
friend mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors, bool& failed_to_fix_sstable, compaction_type_options::scrub::drop_unfixable_sstables drop_unfixable_sstables);
friend mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors);
};
mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors, bool& failed_to_fix_sstable, compaction_type_options::scrub::drop_unfixable_sstables drop_unfixable_sstables) {
return make_mutation_reader<scrub_compaction::reader>(std::move(rd), scrub_mode, validation_errors, failed_to_fix_sstable, drop_unfixable_sstables);
mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors) {
return make_mutation_reader<scrub_compaction::reader>(std::move(rd), scrub_mode, validation_errors);
}
class resharding_compaction final : public compaction {
@@ -1928,7 +1822,7 @@ class resharding_compaction final : public compaction {
uint64_t estimated_partitions = 0;
};
std::vector<estimated_values> _estimation_per_shard;
std::vector<sstables::run_id> _run_identifiers;
std::vector<run_id> _run_identifiers;
private:
// return estimated partitions per sstable for a given shard
uint64_t partitions_per_sstable(shard_id s) const {
@@ -1937,7 +1831,7 @@ private:
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions, _schema));
}
public:
resharding_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
resharding_compaction(compaction_group_view& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor)
: compaction(table_s, std::move(descriptor), cdata, progress_monitor, use_backlog_tracker::no)
, _estimation_per_shard(smp::count)
, _run_identifiers(smp::count)
@@ -1952,7 +1846,7 @@ public:
}
}
for (auto i : std::views::iota(0u, smp::count)) {
_run_identifiers[i] = sstables::run_id::create_random_id();
_run_identifiers[i] = run_id::create_random_id();
}
}
@@ -1974,7 +1868,7 @@ public:
sm_fwd,
mr_fwd,
unwrap_monitor_generator(),
sstables::integrity_check::yes);
integrity_check::yes);
}
@@ -2047,16 +1941,15 @@ compaction_type compaction_type_options::type() const {
compaction_type::Reshard,
compaction_type::Reshape,
compaction_type::Split,
compaction_type::Major,
};
static_assert(std::variant_size_v<compaction_type_options::options_variant> == std::size(index_to_type));
return index_to_type[_options.index()];
}
static std::unique_ptr<compaction> make_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor) {
static std::unique_ptr<compaction> make_compaction(compaction_group_view& table_s, sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor) {
struct {
compaction_group_view& table_s;
compaction_descriptor&& descriptor;
sstables::compaction_descriptor&& descriptor;
compaction_data& cdata;
compaction_progress_monitor& progress_monitor;
@@ -2069,9 +1962,6 @@ static std::unique_ptr<compaction> make_compaction(compaction_group_view& table_
std::unique_ptr<compaction> operator()(compaction_type_options::regular) {
return std::make_unique<regular_compaction>(table_s, std::move(descriptor), cdata, progress_monitor);
}
std::unique_ptr<compaction> operator()(compaction_type_options::major) {
return std::make_unique<regular_compaction>(table_s, std::move(descriptor), cdata, progress_monitor);
}
std::unique_ptr<compaction> operator()(compaction_type_options::cleanup) {
return std::make_unique<cleanup_compaction>(table_s, std::move(descriptor), cdata, progress_monitor);
}
@@ -2089,7 +1979,7 @@ static std::unique_ptr<compaction> make_compaction(compaction_group_view& table_
return descriptor.options.visit(visitor_factory);
}
static future<compaction_result> scrub_sstables_validate_mode(compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, sstables::read_monitor_generator& monitor_generator) {
static future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, read_monitor_generator& monitor_generator) {
auto schema = table_s.schema();
auto permit = table_s.make_compaction_reader_permit();
@@ -2111,14 +2001,10 @@ static future<compaction_result> scrub_sstables_validate_mode(compaction_descrip
clogger.info("Finished scrubbing in validate mode {} - sstable is {}", sst->get_filename(), validation_errors == 0 ? "valid" : "invalid");
}
using scrub = compaction_type_options::scrub;
using scrub = sstables::compaction_type_options::scrub;
if (validation_errors != 0 && descriptor.options.as<scrub>().quarantine_sstables == scrub::quarantine_invalid_sstables::yes) {
for (auto& sst : descriptor.sstables) {
try {
co_await sst->change_state(sstables::sstable_state::quarantine);
} catch (...) {
clogger.error("Moving {} to quarantine failed due to {}, continuing.", sst->get_filename(), std::current_exception());
}
co_await sst->change_state(sstables::sstable_state::quarantine);
}
}
@@ -2131,7 +2017,7 @@ static future<compaction_result> scrub_sstables_validate_mode(compaction_descrip
};
}
future<compaction_result> scrub_sstables_validate_mode(compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor) {
future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor) {
progress_monitor.set_generator(std::make_unique<compaction_read_monitor_generator>(table_s, use_backlog_tracker::no));
auto d = defer([&] { progress_monitor.reset_generator(); });
auto res = co_await scrub_sstables_validate_mode(descriptor, cdata, table_s, *progress_monitor._generator);
@@ -2139,7 +2025,7 @@ future<compaction_result> scrub_sstables_validate_mode(compaction_descriptor des
}
future<compaction_result>
compact_sstables(compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor) {
compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor) {
if (descriptor.sstables.empty()) {
return make_exception_future<compaction_result>(std::runtime_error(format("Called {} compaction with empty set on behalf of {}.{}",
compaction_name(descriptor.options.type()), table_s.schema()->ks_name(), table_s.schema()->cf_name())));
@@ -2182,7 +2068,7 @@ get_fully_expired_sstables(const compaction_group_view& table_s, const std::vect
// Get ancestors from sstable which is empty after restart. It works for this purpose because
// we only need to check that a sstable compacted *in this instance* hasn't an ancestor undeleted.
// Not getting it from sstable metadata because mc format hasn't it available.
return std::ranges::any_of(candidate->compaction_ancestors(), [&compacted_undeleted_gens] (const sstables::generation_type& gen) {
return std::ranges::any_of(candidate->compaction_ancestors(), [&compacted_undeleted_gens] (const generation_type& gen) {
return compacted_undeleted_gens.contains(gen);
});
};
@@ -2230,17 +2116,17 @@ uint64_t compaction_descriptor::sstables_size() const {
}
auto fmt::formatter<::compaction::compaction_type>::format(::compaction::compaction_type type, fmt::format_context& ctx) const
auto fmt::formatter<sstables::compaction_type>::format(sstables::compaction_type type, fmt::format_context& ctx) const
-> decltype(ctx.out()) {
return fmt::format_to(ctx.out(), "{}", to_string(type));
}
auto fmt::formatter<::compaction::compaction_type_options::scrub::mode>::format(::compaction::compaction_type_options::scrub::mode mode, fmt::format_context& ctx) const
auto fmt::formatter<sstables::compaction_type_options::scrub::mode>::format(sstables::compaction_type_options::scrub::mode mode, fmt::format_context& ctx) const
-> decltype(ctx.out()) {
return fmt::format_to(ctx.out(), "{}", to_string(mode));
}
auto fmt::formatter<::compaction::compaction_type_options::scrub::quarantine_mode>::format(::compaction::compaction_type_options::scrub::quarantine_mode mode, fmt::format_context& ctx) const
auto fmt::formatter<sstables::compaction_type_options::scrub::quarantine_mode>::format(sstables::compaction_type_options::scrub::quarantine_mode mode, fmt::format_context& ctx) const
-> decltype(ctx.out()) {
return fmt::format_to(ctx.out(), "{}", to_string(mode));
}

View File

@@ -20,7 +20,9 @@
#include <seastar/core/abort_source.hh>
#include "sstables/basic_info.hh"
namespace compaction {
using namespace compaction;
namespace sstables {
bool is_eligible_for_compaction(const sstables::shared_sstable& sst) noexcept;
@@ -55,8 +57,8 @@ struct compaction_data {
utils::UUID compaction_uuid;
unsigned compaction_fan_in = 0;
struct replacement {
const std::vector<sstables::shared_sstable> removed;
const std::vector<sstables::shared_sstable> added;
const std::vector<shared_sstable> removed;
const std::vector<shared_sstable> added;
};
std::vector<replacement> pending_replacements;
@@ -109,17 +111,19 @@ struct compaction_result {
compaction_stats stats;
};
class read_monitor_generator;
class compaction_progress_monitor {
std::unique_ptr<sstables::read_monitor_generator> _generator = nullptr;
std::unique_ptr<read_monitor_generator> _generator = nullptr;
uint64_t _progress = 0;
public:
void set_generator(std::unique_ptr<sstables::read_monitor_generator> generator);
void set_generator(std::unique_ptr<read_monitor_generator> generator);
void reset_generator();
// Returns number of bytes processed with _generator.
uint64_t get_progress() const;
friend class compaction;
friend future<compaction_result> scrub_sstables_validate_mode(compaction_descriptor, compaction_data&, compaction_group_view&, compaction_progress_monitor&);
friend future<compaction_result> scrub_sstables_validate_mode(sstables::compaction_descriptor, compaction_data&, compaction_group_view&, compaction_progress_monitor&);
};
// Compact a list of N sstables into M sstables.
@@ -127,7 +131,7 @@ public:
//
// compaction_descriptor is responsible for specifying the type of compaction, and influencing
// compaction behavior through its available member fields.
future<compaction_result> compact_sstables(compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor);
future<compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor);
// Return list of expired sstables for column family cf.
// A sstable is fully expired *iff* its max_local_deletion_time precedes gc_before and its
@@ -138,6 +142,6 @@ std::unordered_set<sstables::shared_sstable>
get_fully_expired_sstables(const compaction_group_view& table_s, const std::vector<sstables::shared_sstable>& compacting, gc_clock::time_point gc_before);
// For tests, can drop after we virtualize sstables.
mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors, bool& failed_to_fix_sstable, compaction_type_options::scrub::drop_unfixable_sstables drop_unfixable_sstables);
mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors);
}

View File

@@ -11,13 +11,10 @@
#include <unordered_set>
#include <memory>
#include "sstables/shared_sstable.hh"
#include "mutation/timestamp.hh"
class compaction_controller;
namespace compaction {
#include "timestamp.hh"
class compaction_backlog_manager;
class compaction_controller;
// Read and write progress are provided by structures present in progress_manager.hh
// However, we don't want to be tied to their lifetimes and for that reason we will not
@@ -122,5 +119,3 @@ public:
double backlog() const;
void register_backlog_tracker(compaction_backlog_tracker& tracker);
};
}

View File

@@ -17,10 +17,10 @@
#include "compaction_fwd.hh"
#include "mutation_writer/token_group_based_splitting_writer.hh"
namespace compaction {
namespace sstables {
enum class compaction_type {
Compaction = 0, // Used only for regular compactions
Compaction = 0,
Cleanup = 1,
Validation = 2, // Origin uses this for a compaction that is used exclusively for repair
Scrub = 3,
@@ -29,20 +29,19 @@ enum class compaction_type {
Upgrade = 6,
Reshape = 7,
Split = 8,
Major = 9,
};
struct compaction_completion_desc {
// Old, existing SSTables that should be deleted and removed from the SSTable set.
std::vector<sstables::shared_sstable> old_sstables;
std::vector<shared_sstable> old_sstables;
// New, fresh SSTables that should be added to SSTable set, replacing the old ones.
std::vector<sstables::shared_sstable> new_sstables;
std::vector<shared_sstable> new_sstables;
// Set of compacted partition ranges that should be invalidated in the cache.
dht::partition_range_vector ranges_for_cache_invalidation;
};
// creates a new SSTable for a given shard
using compaction_sstable_creator_fn = std::function<sstables::shared_sstable(shard_id shard)>;
using compaction_sstable_creator_fn = std::function<shared_sstable(shard_id shard)>;
// Replaces old sstable(s) by new one(s) which contain all non-expired data.
using compaction_sstable_replacer_fn = std::function<void(compaction_completion_desc)>;
@@ -50,8 +49,6 @@ class compaction_type_options {
public:
struct regular {
};
struct major {
};
struct cleanup {
};
struct upgrade {
@@ -77,11 +74,6 @@ public:
// Should invalid sstables be moved into quarantine.
// Only applies to validate-mode.
quarantine_invalid_sstables quarantine_sstables = quarantine_invalid_sstables::yes;
using drop_unfixable_sstables = bool_class<class drop_unfixable_sstables_tag>;
// Drop sstables that cannot be fixed.
// Only applies to segregate-mode.
drop_unfixable_sstables drop_unfixable = drop_unfixable_sstables::no;
};
struct reshard {
};
@@ -91,7 +83,7 @@ public:
mutation_writer::classify_by_token_group classifier;
};
private:
using options_variant = std::variant<regular, cleanup, upgrade, scrub, reshard, reshape, split, major>;
using options_variant = std::variant<regular, cleanup, upgrade, scrub, reshard, reshape, split>;
private:
options_variant _options;
@@ -113,10 +105,6 @@ public:
return compaction_type_options(regular{});
}
static compaction_type_options make_major() {
return compaction_type_options(major{});
}
static compaction_type_options make_cleanup() {
return compaction_type_options(cleanup{});
}
@@ -125,8 +113,8 @@ public:
return compaction_type_options(upgrade{});
}
static compaction_type_options make_scrub(scrub::mode mode, scrub::quarantine_invalid_sstables quarantine_sstables = scrub::quarantine_invalid_sstables::yes, scrub::drop_unfixable_sstables drop_unfixable_sstables = scrub::drop_unfixable_sstables::no) {
return compaction_type_options(scrub{.operation_mode = mode, .quarantine_sstables = quarantine_sstables, .drop_unfixable = drop_unfixable_sstables});
static compaction_type_options make_scrub(scrub::mode mode, scrub::quarantine_invalid_sstables quarantine_sstables = scrub::quarantine_invalid_sstables::yes) {
return compaction_type_options(scrub{.operation_mode = mode, .quarantine_sstables = quarantine_sstables});
}
static compaction_type_options make_split(mutation_writer::classify_by_token_group classifier) {
@@ -181,7 +169,7 @@ struct compaction_descriptor {
compaction_sstable_replacer_fn replacer;
// Denotes if this compaction task is comprised solely of completely expired SSTables
has_only_fully_expired has_only_fully_expired = has_only_fully_expired::no;
sstables::has_only_fully_expired has_only_fully_expired = has_only_fully_expired::no;
// If set to true, gc will check only the compacting sstables to collect tombstones.
// If set to false, gc will check the memtables, commit log and other uncompacting
@@ -201,7 +189,7 @@ struct compaction_descriptor {
explicit compaction_descriptor(std::vector<sstables::shared_sstable> sstables,
int level = default_level,
uint64_t max_sstable_bytes = default_max_sstable_bytes,
sstables::run_id run_identifier = sstables::run_id::create_random_id(),
run_id run_identifier = run_id::create_random_id(),
compaction_type_options options = compaction_type_options::make_regular(),
compaction::owned_ranges_ptr owned_ranges_ = {})
: sstables(std::move(sstables))
@@ -212,12 +200,12 @@ struct compaction_descriptor {
, owned_ranges(std::move(owned_ranges_))
{}
explicit compaction_descriptor(::compaction::has_only_fully_expired has_only_fully_expired,
explicit compaction_descriptor(sstables::has_only_fully_expired has_only_fully_expired,
std::vector<sstables::shared_sstable> sstables)
: sstables(std::move(sstables))
, level(default_level)
, max_sstable_bytes(default_max_sstable_bytes)
, run_identifier(sstables::run_id::create_random_id())
, run_identifier(run_id::create_random_id())
, options(compaction_type_options::make_regular())
, has_only_fully_expired(has_only_fully_expired)
{}
@@ -233,14 +221,14 @@ struct compaction_descriptor {
}
template <>
struct fmt::formatter<compaction::compaction_type> : fmt::formatter<string_view> {
auto format(compaction::compaction_type, fmt::format_context& ctx) const -> decltype(ctx.out());
struct fmt::formatter<sstables::compaction_type> : fmt::formatter<string_view> {
auto format(sstables::compaction_type, fmt::format_context& ctx) const -> decltype(ctx.out());
};
template <>
struct fmt::formatter<compaction::compaction_type_options::scrub::mode> : fmt::formatter<string_view> {
auto format(compaction::compaction_type_options::scrub::mode, fmt::format_context& ctx) const -> decltype(ctx.out());
struct fmt::formatter<sstables::compaction_type_options::scrub::mode> : fmt::formatter<string_view> {
auto format(sstables::compaction_type_options::scrub::mode, fmt::format_context& ctx) const -> decltype(ctx.out());
};
template <>
struct fmt::formatter<compaction::compaction_type_options::scrub::quarantine_mode> : fmt::formatter<string_view> {
auto format(compaction::compaction_type_options::scrub::quarantine_mode, fmt::format_context& ctx) const -> decltype(ctx.out());
struct fmt::formatter<sstables::compaction_type_options::scrub::quarantine_mode> : fmt::formatter<string_view> {
auto format(sstables::compaction_type_options::scrub::quarantine_mode, fmt::format_context& ctx) const -> decltype(ctx.out());
};

View File

@@ -15,17 +15,20 @@
#include "compaction_descriptor.hh"
class reader_permit;
class compaction_backlog_tracker;
namespace sstables {
class sstable_set;
class compaction_strategy;
class sstables_manager;
struct sstable_writer_config;
}
namespace compaction {
class compaction_strategy;
class compaction_strategy_state;
class compaction_backlog_tracker;
}
namespace compaction {
class compaction_group_view {
public:
@@ -40,7 +43,7 @@ public:
virtual lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc() const = 0;
virtual std::unordered_set<sstables::shared_sstable> fully_expired_sstables(const std::vector<sstables::shared_sstable>& sstables, gc_clock::time_point compaction_time) const = 0;
virtual const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept = 0;
virtual compaction_strategy& get_compaction_strategy() const noexcept = 0;
virtual sstables::compaction_strategy& get_compaction_strategy() const noexcept = 0;
virtual compaction_strategy_state& get_compaction_strategy_state() noexcept = 0;
virtual reader_permit make_compaction_reader_permit() const = 0;
virtual sstables::sstables_manager& get_sstables_manager() noexcept = 0;
@@ -50,7 +53,7 @@ public:
virtual api::timestamp_type min_memtable_live_timestamp() const = 0;
virtual api::timestamp_type min_memtable_live_row_marker_timestamp() const = 0;
virtual bool memtable_has_key(const dht::decorated_key& key) const = 0;
virtual future<> on_compaction_completion(compaction_completion_desc desc, sstables::offstrategy offstrategy) = 0;
virtual future<> on_compaction_completion(sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) = 0;
virtual bool is_auto_compaction_disabled_by_user() const noexcept = 0;
virtual bool tombstone_gc_enabled() const noexcept = 0;
virtual const tombstone_gc_state& get_tombstone_gc_state() const noexcept = 0;

File diff suppressed because it is too large Load Diff

View File

@@ -31,16 +31,14 @@
#include "strategy_control.hh"
#include "backlog_controller.hh"
#include "seastarx.hh"
#include "compaction/exceptions.hh"
#include "sstables/exceptions.hh"
#include "tombstone_gc.hh"
#include "utils/pluggable.hh"
#include "compaction/compaction_reenabler.hh"
#include "utils/disk_space_monitor.hh"
namespace db {
class compaction_history_entry;
class system_keyspace;
class config;
}
namespace sstables { class test_env_compaction_manager; }
@@ -63,11 +61,12 @@ inline owned_ranges_ptr make_owned_ranges_ptr(dht::token_range_vector&& ranges)
return make_lw_shared<const dht::token_range_vector>(std::move(ranges));
}
}
// Compaction manager provides facilities to submit and track compaction jobs on
// behalf of existing tables.
class compaction_manager: public peering_sharded_service<compaction_manager> {
class compaction_manager {
public:
using compaction_stats_opt = std::optional<compaction_stats>;
using compaction_stats_opt = std::optional<sstables::compaction_stats>;
struct stats {
int64_t pending_tasks = 0;
int64_t completed_tasks = 0;
@@ -102,17 +101,16 @@ private:
//
// none: started, but not yet enabled. Once the compaction manager moves out of "none", it can
// never legally move back
// stopped: stop() was called. The compaction_manager will never be running again
// stopped: stop() was called. The compaction_manager will never be enabled or disabled again
// and can no longer be used (although it is possible to still grab metrics, stats,
// etc)
// running: running, started and enabled at least once. Whether new compactions are accepted or not is determined by the counter
enum class state { none, stopped, running };
// enabled: accepting compactions
// disabled: not accepting compactions
//
// Moving the compaction manager to and from enabled and disable states is legal, as many times
// as necessary.
enum class state { none, stopped, disabled, enabled };
state _state = state::none;
// The compaction manager is initiated in the none state. It is moved to the running state when start() is invoked
// and the service is immediately enabled.
uint32_t _disabled_state_count = 0;
bool is_disabled() const { return _state != state::running || _disabled_state_count > 0; }
std::optional<future<>> _stop_future;
@@ -169,8 +167,6 @@ private:
// still uses it with reference semantics (inconsistently though).
// Drop this member, once the code is converted into using value semantics.
tombstone_gc_state _tombstone_gc_state;
utils::disk_space_monitor::subscription _out_of_space_subscription;
private:
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
@@ -236,7 +232,7 @@ private:
// similar-sized compaction.
void postpone_compaction_for_table(compaction::compaction_group_view* t);
using quarantine_invalid_sstables = compaction_type_options::scrub::quarantine_invalid_sstables;
using quarantine_invalid_sstables = sstables::compaction_type_options::scrub::quarantine_invalid_sstables;
future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::compaction_group_view& t, tasks::task_info info, quarantine_invalid_sstables quarantine_sstables);
future<> update_static_shares(float shares);
@@ -248,10 +244,9 @@ private:
requires std::derived_from<TaskType, compaction_task_executor> &&
std::derived_from<TaskType, compaction_task_impl>
future<compaction_manager::compaction_stats_opt> perform_task_on_all_files(sstring reason, tasks::task_info info, compaction_group_view& t, compaction_type_options options, owned_ranges_ptr owned_ranges_ptr,
get_candidates_func get_func, throw_if_stopping do_throw_if_stopping, Args... args);
future<compaction_manager::compaction_stats_opt> perform_task_on_all_files(sstring reason, tasks::task_info info, compaction_group_view& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args);
future<compaction_stats_opt> rewrite_sstables(compaction::compaction_group_view& t, compaction_type_options options, owned_ranges_ptr, get_candidates_func, tasks::task_info info,
future<compaction_stats_opt> rewrite_sstables(compaction::compaction_group_view& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, tasks::task_info info,
can_purge_tombstones can_purge = can_purge_tombstones::yes, sstring options_desc = "");
// Stop all fibers, without waiting. Safe to be called multiple times.
@@ -307,18 +302,12 @@ public:
// Stop all fibers. Ongoing compactions will be waited. Should only be called
// once, from main teardown path.
future<> stop();
future<> start(const db::config& cfg, utils::disk_space_monitor* dsm);
// cancels all running compactions and moves the compaction manager into disabled state.
// The compaction manager is still alive after drain but it will not accept new compactions
// unless it is moved back to enabled state.
future<> drain();
// Check if compaction manager is running, i.e. it was enabled or drained
bool is_running() const noexcept {
return _state == state::running;
}
using compaction_history_consumer = noncopyable_function<future<>(const db::compaction_history_entry&)>;
future<> get_compaction_history(compaction_history_consumer&& f);
@@ -350,27 +339,27 @@ private:
// Add sst to or remove it from the respective compaction_state.sstables_requiring_cleanup set.
bool update_sstable_cleanup_state(compaction_group_view& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges);
future<> on_compaction_completion(compaction_group_view& t, compaction_completion_desc desc, sstables::offstrategy offstrategy);
future<> on_compaction_completion(compaction_group_view& t, sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy);
public:
// Submit a table to be upgraded and wait for its termination.
future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::compaction_group_view& t, bool exclude_current_version, tasks::task_info info);
// Submit a table to be scrubbed and wait for its termination.
future<compaction_stats_opt> perform_sstable_scrub(compaction::compaction_group_view& t, compaction_type_options::scrub opts, tasks::task_info info);
future<compaction_stats_opt> perform_sstable_scrub(compaction::compaction_group_view& t, sstables::compaction_type_options::scrub opts, tasks::task_info info);
// Submit a table for major compaction.
future<> perform_major_compaction(compaction::compaction_group_view& t, tasks::task_info info, bool consider_only_existing_data = false);
// Splits a compaction group by segregating all its sstable according to the classifier[1].
// [1]: See compaction_type_options::splitting::classifier.
// [1]: See sstables::compaction_type_options::splitting::classifier.
// Returns when all sstables in the main sstable set are split. The only exception is shutdown
// or user aborted splitting using stop API.
future<compaction_stats_opt> perform_split_compaction(compaction::compaction_group_view& t, compaction_type_options::split opt, tasks::task_info info);
future<compaction_stats_opt> perform_split_compaction(compaction::compaction_group_view& t, sstables::compaction_type_options::split opt, tasks::task_info info);
// Splits a single SSTable by segregating all its data according to the classifier.
// If SSTable doesn't need split, the same input SSTable is returned as output.
// If SSTable needs split, then output SSTables are returned and the input SSTable is deleted.
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt);
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, sstables::compaction_type_options::split opt);
// Run a custom job for a given table, defined by a function
// it completes when future returned by job is ready or returns immediately
@@ -379,7 +368,7 @@ public:
// parameter type is the compaction type the operation can most closely be
// associated with, use compaction_type::Compaction, if none apply.
// parameter job is a function that will carry the operation
future<> run_custom_job(compaction::compaction_group_view& s, compaction_type type, const char *desc, noncopyable_function<future<>(compaction_data&, compaction_progress_monitor&)> job, tasks::task_info info, throw_if_stopping do_throw_if_stopping);
future<> run_custom_job(compaction::compaction_group_view& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, tasks::task_info info, throw_if_stopping do_throw_if_stopping);
// Disable compaction temporarily for a table t.
// Caller should call the compaction_reenabler::reenable
@@ -412,7 +401,7 @@ public:
return _stats;
}
const std::vector<compaction_info> get_compactions(std::function<bool(const compaction_group_view*)> filter = [] (auto) { return true; }) const;
const std::vector<sstables::compaction_info> get_compactions(compaction::compaction_group_view* t = nullptr) const;
// Returns true if table has an ongoing compaction, running on its behalf
bool has_table_ongoing_compaction(const compaction::compaction_group_view& t) const;
@@ -420,16 +409,15 @@ public:
bool compaction_disabled(compaction::compaction_group_view& t) const;
// Stops ongoing compaction of a given type.
future<> stop_compaction(sstring type, std::function<bool(const compaction_group_view*)> filter = [] (auto) { return true; });
future<> stop_compaction(sstring type, compaction::compaction_group_view* table = nullptr);
private:
std::vector<shared_ptr<compaction_task_executor>>
do_stop_ongoing_compactions(sstring reason, std::function<bool(const compaction_group_view*)> filter, std::optional<compaction_type> type_opt) noexcept;
future<> stop_ongoing_compactions(sstring reason, std::function<bool(const compaction_group_view*)> filter, std::optional<compaction_type> type_opt = {}) noexcept;
do_stop_ongoing_compactions(sstring reason, compaction_group_view* t, std::optional<sstables::compaction_type> type_opt) noexcept;
public:
// Stops ongoing compaction of a given table and/or compaction_type.
future<> stop_ongoing_compactions(sstring reason, compaction::compaction_group_view* t = nullptr, std::optional<compaction_type> type_opt = {}) noexcept;
future<> stop_ongoing_compactions(sstring reason, compaction::compaction_group_view* t = nullptr, std::optional<sstables::compaction_type> type_opt = {}) noexcept;
future<> await_ongoing_compactions(compaction_group_view* t);
@@ -445,7 +433,7 @@ public:
compaction_backlog_tracker& get_backlog_tracker(compaction::compaction_group_view& t);
static compaction_data create_compaction_data();
static sstables::compaction_data create_compaction_data();
compaction::strategy_control& get_strategy_control() const noexcept;
@@ -473,7 +461,6 @@ public:
friend class compaction_weight_registration;
friend class sstables::test_env_compaction_manager;
friend class compaction::compaction_task_impl;
friend class compaction::compaction_task_executor;
friend class compaction::sstables_task_executor;
friend class compaction::major_compaction_task_executor;
@@ -487,6 +474,8 @@ public:
friend compaction_reenabler;
};
namespace compaction {
class compaction_task_executor
: public enable_shared_from_this<compaction_task_executor>
, public boost::intrusive::list_base_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>> {
@@ -509,21 +498,21 @@ protected:
compaction_manager& _cm;
::compaction::compaction_group_view* _compacting_table = nullptr;
compaction::compaction_state& _compaction_state;
compaction_data _compaction_data;
sstables::compaction_data _compaction_data;
state _state = state::none;
throw_if_stopping _do_throw_if_stopping;
compaction_progress_monitor _progress_monitor;
sstables::compaction_progress_monitor _progress_monitor;
private:
shared_future<compaction_manager::compaction_stats_opt> _compaction_done = make_ready_future<compaction_manager::compaction_stats_opt>();
exponential_backoff_retry _compaction_retry = exponential_backoff_retry(std::chrono::seconds(5), std::chrono::seconds(300));
compaction_type _type;
sstables::compaction_type _type;
sstables::run_id _output_run_identifier;
sstring _description;
compaction_manager::compaction_stats_opt _stats = std::nullopt;
public:
explicit compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, ::compaction::compaction_group_view* t, compaction_type type, sstring desc);
explicit compaction_task_executor(compaction_manager& mgr, throw_if_stopping do_throw_if_stopping, ::compaction::compaction_group_view* t, sstables::compaction_type type, sstring desc);
compaction_task_executor(compaction_task_executor&&) = delete;
compaction_task_executor(const compaction_task_executor&) = delete;
@@ -562,14 +551,14 @@ protected:
// otherwise, returns stop_iteration::no after sleep for exponential retry.
future<stop_iteration> maybe_retry(std::exception_ptr err, bool throw_on_abort = false);
future<compaction_result> compact_sstables_and_update_history(compaction_descriptor descriptor, compaction_data& cdata, on_replacement&,
future<sstables::compaction_result> compact_sstables_and_update_history(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes);
future<compaction_result> compact_sstables(compaction_descriptor descriptor, compaction_data& cdata, on_replacement&,
future<sstables::compaction_result> compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement&,
compaction_manager::can_purge_tombstones can_purge = compaction_manager::can_purge_tombstones::yes,
sstables::offstrategy offstrategy = sstables::offstrategy::no);
future<> update_history(::compaction::compaction_group_view& t, compaction_result&& res, const compaction_data& cdata);
bool should_update_history(compaction_type ct) {
return ct == compaction_type::Compaction || ct == compaction_type::Major;
future<> update_history(::compaction::compaction_group_view& t, sstables::compaction_result&& res, const sstables::compaction_data& cdata);
bool should_update_history(sstables::compaction_type ct) {
return ct == sstables::compaction_type::Compaction;
}
public:
compaction_manager::compaction_stats_opt get_stats() const noexcept {
@@ -582,7 +571,7 @@ public:
return _compacting_table;
}
compaction_type compaction_type() const noexcept {
sstables::compaction_type compaction_type() const noexcept {
return _type;
}
@@ -590,11 +579,11 @@ public:
return _state == state::active;
}
const ::compaction::compaction_data& compaction_data() const noexcept {
const sstables::compaction_data& compaction_data() const noexcept {
return _compaction_data;
}
::compaction::compaction_data& compaction_data() noexcept {
sstables::compaction_data& compaction_data() noexcept {
return _compaction_data;
}
@@ -624,7 +613,7 @@ public:
void stop_compaction(sstring reason) noexcept;
compaction_stopped_exception make_compaction_stopped_exception() const;
sstables::compaction_stopped_exception make_compaction_stopped_exception() const;
template<typename TaskExecutor, typename... Args>
requires std::is_base_of_v<compaction_task_executor, TaskExecutor> &&
@@ -654,11 +643,7 @@ struct fmt::formatter<compaction::compaction_task_executor> {
auto format(const compaction::compaction_task_executor& ex, fmt::format_context& ctx) const -> decltype(ctx.out());
};
namespace compaction {
bool needs_cleanup(const sstables::shared_sstable& sst, const dht::token_range_vector& owned_ranges);
// Return all sstables but those that are off-strategy like the ones in maintenance set and staging dir.
future<std::vector<sstables::shared_sstable>> in_strategy_sstables(compaction::compaction_group_view& table_s);
}

View File

@@ -10,10 +10,12 @@
#include <seastar/core/gate.hh>
class compaction_manager;
namespace compaction {
class compaction_manager;
class compaction_group_view;
class compaction_state;
}
class compaction_reenabler {
compaction_manager& _cm;
@@ -36,4 +38,3 @@ public:
}
};
}

View File

@@ -62,4 +62,4 @@ struct compaction_state {
}
};
} // namespace compaction
} // namespace compaction_manager

Some files were not shown because too many files have changed in this diff Show More