Compare commits

..

1 Commits

Author SHA1 Message Date
Amnon Heiman
607ca719d7 Enable prometheus_allow_protobuf by default
Change the prometheus_allow_protobuf configuration to true by default.
This allows ScyllaDB server to serve Prometheus protobuf format (enables
native histogram support) if asked so by the monitoring server.

Update config help text/docs to reflect protobuf support (drop
“experimental” wording).

Add cluster tests to validate the default is enabled, can be overridden,
and /metrics returns protobuf when requested via Accept header (and
falls back to text when disabled).

Fixes #27817
co-Author: mykaul <mykaul@scylladb.com>

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2026-01-19 09:40:49 +02:00
369 changed files with 3427 additions and 11803 deletions

View File

@@ -18,7 +18,7 @@ jobs:
// Regular expression pattern to check for "Fixes" prefix
// Adjusted to dynamically insert the repository full name
const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|(?:https://scylladb\\.atlassian\\.net/browse/)?([A-Z]+-\\d+))`;
const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|([A-Z]+-\\d+))`;
const regex = new RegExp(pattern);
if (!regex.test(body)) {

View File

@@ -1,53 +0,0 @@
name: Backport with Jira Integration
on:
push:
branches:
- master
- next-*.*
- branch-*.*
pull_request_target:
types: [labeled, closed]
branches:
- master
- next
- next-*.*
- branch-*.*
jobs:
backport-on-push:
if: github.event_name == 'push'
uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
with:
event_type: 'push'
base_branch: ${{ github.ref }}
commits: ${{ github.event.before }}..${{ github.sha }}
secrets:
gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
backport-on-label:
if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
with:
event_type: 'labeled'
base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
pull_request_number: ${{ github.event.pull_request.number }}
head_commit: ${{ github.event.pull_request.base.sha }}
label_name: ${{ github.event.label.name }}
pr_state: ${{ github.event.pull_request.state }}
secrets:
gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
backport-chain:
if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
with:
event_type: 'chain'
base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
pull_request_number: ${{ github.event.pull_request.number }}
pr_body: ${{ github.event.pull_request.body }}
secrets:
gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}

View File

@@ -9,53 +9,16 @@ on:
jobs:
trigger-jenkins:
if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
runs-on: ubuntu-latest
steps:
- name: Verify Org Membership
id: verify_author
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
shell: bash
run: |
if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then
AUTHOR="${{ github.event.pull_request.user.login }}"
else
AUTHOR="${{ github.event.comment.user.login }}"
fi
ORG="scylladb"
if gh api "/orgs/${ORG}/members/${AUTHOR}" --silent 2>/dev/null; then
echo "member=true" >> $GITHUB_OUTPUT
else
echo "::warning::${AUTHOR} is not a member of ${ORG}; skipping CI trigger."
echo "member=false" >> $GITHUB_OUTPUT
fi
- name: Validate Comment Trigger
if: github.event_name == 'issue_comment'
id: verify_comment
shell: bash
run: |
BODY=$(cat << 'EOF'
${{ github.event.comment.body }}
EOF
)
CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
echo "trigger=true" >> $GITHUB_OUTPUT
else
echo "trigger=false" >> $GITHUB_OUTPUT
fi
- name: Trigger Scylla-CI-Route Jenkins Job
if: steps.verify_author.outputs.member == 'true' && (github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true')
env:
JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
JENKINS_URL: "https://jenkins.scylladb.com"
PR_NUMBER: "${{ github.event.issue.number || github.event.pull_request.number }}"
PR_REPO_NAME: "${{ github.event.repository.full_name }}"
run: |
PR_NUMBER=${{ github.event.issue.number }}
PR_REPO_NAME=${{ github.event.repository.full_name }}
curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
--user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail
--user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v

View File

@@ -78,7 +78,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=2026.1.0
VERSION=2026.1.0-dev
if test -f version
then

View File

@@ -17,7 +17,6 @@
#include "auth/service.hh"
#include "db/config.hh"
#include "db/view/view_build_status.hh"
#include "locator/tablets.hh"
#include "mutation/tombstone.hh"
#include "locator/abstract_replication_strategy.hh"
#include "utils/log.hh"
@@ -1876,34 +1875,23 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
auto ts = group0_guard.write_timestamp();
utils::chunked_vector<mutation> schema_mutations;
auto ksm = create_keyspace_metadata(keyspace_name, _proxy, _gossiper, ts, tags_map, _proxy.features(), tablets_mode);
locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
const auto& topo = _proxy.local_db().get_token_metadata().get_topology();
auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
// Alternator Streams doesn't yet work when the table uses tablets (#23838)
if (stream_specification && stream_specification->IsObject()) {
auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
if (stream_enabled && stream_enabled->IsBool() && stream_enabled->GetBool()) {
locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
const auto& topo = _proxy.local_db().get_token_metadata().get_topology();
auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
if (rs->uses_tablets()) {
co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
"If you want to use streams, create a table with vnodes by setting the tag 'system:initial_tablets' set to 'none'.");
}
}
}
// Creating an index in tablets mode requires the keyspace to be RF-rack-valid.
// GSI and LSI indexes are based on materialized views which require RF-rack-validity to avoid consistency issues.
if (!view_builders.empty() || _proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
try {
locator::assert_rf_rack_valid_keyspace(keyspace_name, _proxy.local_db().get_token_metadata_ptr(), *rs);
} catch (const std::invalid_argument& ex) {
if (!view_builders.empty()) {
co_return api_error::validation(fmt::format("GlobalSecondaryIndexes and LocalSecondaryIndexes on a table "
"using tablets require the number of racks in the cluster to be either 1 or 3"));
} else {
co_return api_error::validation(fmt::format("Cannot create table '{}' with tablets: the configuration "
"option 'rf_rack_valid_keyspaces' is enabled, which enforces that tables using tablets can only be created in clusters "
"that have either 1 or 3 racks", table_name));
}
}
// Creating an index in tablets mode requires the rf_rack_valid_keyspaces option to be enabled.
// GSI and LSI indexes are based on materialized views which require this option to avoid consistency issues.
if (!view_builders.empty() && ksm->uses_tablets() && !_proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
co_return api_error::validation("GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
}
try {
schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
@@ -2126,12 +2114,9 @@ future<executor::request_return_type> executor::update_table(client_state& clien
co_return api_error::validation(fmt::format(
"LSI {} already exists in table {}, can't use same name for GSI", index_name, table_name));
}
try {
locator::assert_rf_rack_valid_keyspace(keyspace_name, p.local().local_db().get_token_metadata_ptr(),
p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy());
} catch (const std::invalid_argument& ex) {
co_return api_error::validation(fmt::format("GlobalSecondaryIndexes on a table "
"using tablets require the number of racks in the cluster to be either 1 or 3"));
if (p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy().uses_tablets() &&
!p.local().data_dictionary().get_config().rf_rack_valid_keyspaces()) {
co_return api_error::validation("GlobalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
}
elogger.trace("Adding GSI {}", index_name);

View File

@@ -767,7 +767,7 @@ static future<bool> scan_table(
// by tasking another node to take over scanning of the dead node's primary
// ranges. What we do here is that this node will also check expiration
// on its *secondary* ranges - but only those whose primary owner is down.
auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
if (!gossiper.is_alive(tablet_primary_replica.host)) {
co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);

View File

@@ -3051,7 +3051,7 @@
},
{
"name":"incremental_mode",
"description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to incremental mode.",
"description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to 'disabled' mode.",
"required":false,
"allowMultiple":false,
"type":"string",

View File

@@ -515,15 +515,6 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
auto sstables = parsed.GetArray() |
std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
std::ranges::to<std::vector>();
apilog.info("Restore invoked with following parameters: keyspace={}, table={}, endpoint={}, bucket={}, prefix={}, sstables_count={}, scope={}, primary_replica_only={}",
keyspace,
table,
endpoint,
bucket,
prefix,
sstables.size(),
scope,
primary_replica_only);
auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
co_return json::json_return_type(fmt::to_string(task_id));
});
@@ -2025,14 +2016,12 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
auto tag = req->get_query_param("tag");
auto column_families = split(req->get_query_param("cf"), ",");
auto sfopt = req->get_query_param("sf");
db::snapshot_options opts = {
.skip_flush = strcasecmp(sfopt.c_str(), "true") == 0,
};
auto sf = db::snapshot_ctl::skip_flush(strcasecmp(sfopt.c_str(), "true") == 0);
std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
try {
if (column_families.empty()) {
co_await snap_ctl.local().take_snapshot(tag, keynames, opts);
co_await snap_ctl.local().take_snapshot(tag, keynames, sf);
} else {
if (keynames.empty()) {
throw httpd::bad_param_exception("The keyspace of column families must be specified");
@@ -2040,7 +2029,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
if (keynames.size() > 1) {
throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
}
co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, opts);
co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
}
co_return json_void();
} catch (...) {
@@ -2075,8 +2064,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
auto info = parse_scrub_options(ctx, std::move(req));
if (!info.snapshot_tag.empty()) {
db::snapshot_options opts = {.skip_flush = false};
co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, opts);
co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, db::snapshot_ctl::skip_flush::no);
}
compaction::compaction_stats stats;

View File

@@ -146,8 +146,7 @@ void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::
auto info = parse_scrub_options(ctx, std::move(req));
if (!info.snapshot_tag.empty()) {
db::snapshot_options opts = {.skip_flush = false};
co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, opts);
co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, db::snapshot_ctl::skip_flush::no);
}
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();

View File

@@ -53,10 +53,10 @@ static std::string json_escape(std::string_view str) {
}
future<> audit_syslog_storage_helper::syslog_send_helper(temporary_buffer<char> msg) {
future<> audit_syslog_storage_helper::syslog_send_helper(const sstring& msg) {
try {
auto lock = co_await get_units(_semaphore, 1, std::chrono::hours(1));
co_await _sender.send(_syslog_address, std::span(&msg, 1));
co_await _sender.send(_syslog_address, net::packet{msg.data(), msg.size()});
}
catch (const std::exception& e) {
auto error_msg = seastar::format(
@@ -90,7 +90,7 @@ future<> audit_syslog_storage_helper::start(const db::config& cfg) {
co_return;
}
co_await syslog_send_helper(temporary_buffer<char>::copy_of("Initializing syslog audit backend."));
co_await syslog_send_helper("Initializing syslog audit backend.");
}
future<> audit_syslog_storage_helper::stop() {
@@ -120,7 +120,7 @@ future<> audit_syslog_storage_helper::write(const audit_info* audit_info,
audit_info->table(),
username);
co_await syslog_send_helper(std::move(msg).release());
co_await syslog_send_helper(msg);
}
future<> audit_syslog_storage_helper::write_login(const sstring& username,
@@ -139,7 +139,7 @@ future<> audit_syslog_storage_helper::write_login(const sstring& username,
client_ip,
username);
co_await syslog_send_helper(std::move(msg).release());
co_await syslog_send_helper(msg.c_str());
}
}

View File

@@ -26,7 +26,7 @@ class audit_syslog_storage_helper : public storage_helper {
net::datagram_channel _sender;
seastar::semaphore _semaphore;
future<> syslog_send_helper(seastar::temporary_buffer<char> msg);
future<> syslog_send_helper(const sstring& msg);
public:
explicit audit_syslog_storage_helper(cql3::query_processor&, service::migration_manager&);
virtual ~audit_syslog_storage_helper();

View File

@@ -876,6 +876,22 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
continue; // some tables might not have been created if they were not used
}
// use longer than usual timeout as we scan the whole table
// but not infinite or very long as we want to fail reasonably fast
const auto t = 5min;
const timeout_config tc{t, t, t, t, t, t, t};
::service::client_state cs(::service::client_state::internal_tag{}, tc);
::service::query_state qs(cs, empty_service_permit());
auto rows = co_await qp.execute_internal(
seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, cf_name),
db::consistency_level::ALL,
qs,
{},
cql3::query_processor::cache_internal::no);
if (rows->empty()) {
continue;
}
std::vector<sstring> col_names;
for (const auto& col : schema->all_columns()) {
col_names.push_back(col.name_as_cql_string());
@@ -884,51 +900,30 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
for (size_t i = 1; i < col_names.size(); ++i) {
val_binders_str += ", ?";
}
std::vector<mutation> collected;
// use longer than usual timeout as we scan the whole table
// but not infinite or very long as we want to fail reasonably fast
const auto t = 5min;
const timeout_config tc{t, t, t, t, t, t, t};
::service::client_state cs(::service::client_state::internal_tag{}, tc);
::service::query_state qs(cs, empty_service_permit());
co_await qp.query_internal(
seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, cf_name),
db::consistency_level::ALL,
{},
1000,
[&qp, &cf_name, &col_names, &val_binders_str, &schema, ts, &collected] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
std::vector<data_value_or_unset> values;
for (const auto& col : schema->all_columns()) {
if (row.has(col.name_as_text())) {
values.push_back(
col.type->deserialize(row.get_blob_unfragmented(col.name_as_text())));
} else {
values.push_back(unset_value{});
}
for (const auto& row : *rows) {
std::vector<data_value_or_unset> values;
for (const auto& col : schema->all_columns()) {
if (row.has(col.name_as_text())) {
values.push_back(
col.type->deserialize(row.get_blob_unfragmented(col.name_as_text())));
} else {
values.push_back(unset_value{});
}
auto muts = co_await qp.get_mutations_internal(
seastar::format("INSERT INTO {}.{} ({}) VALUES ({})",
db::system_keyspace::NAME,
cf_name,
fmt::join(col_names, ", "),
val_binders_str),
internal_distributed_query_state(),
ts,
std::move(values));
if (muts.size() != 1) {
on_internal_error(log,
format("expecting single insert mutation, got {}", muts.size()));
}
collected.push_back(std::move(muts[0]));
co_return stop_iteration::no;
},
std::move(qs));
for (auto& m : collected) {
co_yield std::move(m);
}
auto muts = co_await qp.get_mutations_internal(
seastar::format("INSERT INTO {}.{} ({}) VALUES ({})",
db::system_keyspace::NAME,
cf_name,
fmt::join(col_names, ", "),
val_binders_str),
internal_distributed_query_state(),
ts,
std::move(values));
if (muts.size() != 1) {
on_internal_error(log,
format("expecting single insert mutation, got {}", muts.size()));
}
co_yield std::move(muts[0]);
}
}
co_yield co_await sys_ks.make_auth_version_mutation(ts,

View File

@@ -778,7 +778,6 @@ compaction_manager::get_incremental_repair_read_lock(compaction::compaction_grou
cmlog.debug("Get get_incremental_repair_read_lock for {} started", reason);
}
compaction::compaction_state& cs = get_compaction_state(&t);
auto gh = cs.gate.hold();
auto ret = co_await cs.incremental_repair_lock.hold_read_lock();
if (!reason.empty()) {
cmlog.debug("Get get_incremental_repair_read_lock for {} done", reason);
@@ -792,7 +791,6 @@ compaction_manager::get_incremental_repair_write_lock(compaction::compaction_gro
cmlog.debug("Get get_incremental_repair_write_lock for {} started", reason);
}
compaction::compaction_state& cs = get_compaction_state(&t);
auto gh = cs.gate.hold();
auto ret = co_await cs.incremental_repair_lock.hold_write_lock();
if (!reason.empty()) {
cmlog.debug("Get get_incremental_repair_write_lock for {} done", reason);
@@ -1521,9 +1519,7 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
| std::views::transform(std::mem_fn(&sstables::sstable::run_identifier))
| std::ranges::to<std::unordered_set>());
};
const auto threshold = utils::get_local_injector().inject_parameter<size_t>("set_sstable_count_reduction_threshold")
.value_or(size_t(std::max(schema->max_compaction_threshold(), 32)));
const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
auto count = co_await num_runs_for_compaction();
if (count <= threshold) {
cmlog.trace("No need to wait for sstable count reduction in {}: {} <= {}",
@@ -1538,7 +1534,9 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
auto& cstate = get_compaction_state(&t);
try {
while (can_perform_regular_compaction(t) && co_await num_runs_for_compaction() > threshold) {
co_await cstate.compaction_done.wait();
co_await cstate.compaction_done.wait([this, &t] {
return !can_perform_regular_compaction(t);
});
}
} catch (const broken_condition_variable&) {
co_return;
@@ -2389,8 +2387,6 @@ future<> compaction_manager::remove(compaction_group_view& t, sstring reason) no
if (!c_state.gate.is_closed()) {
auto close_gate = c_state.gate.close();
co_await stop_ongoing_compactions(reason, &t);
// Wait for users of incremental repair lock (can be either repair itself or maintenance compactions).
co_await c_state.incremental_repair_lock.write_lock();
co_await std::move(close_gate);
}

View File

@@ -725,9 +725,29 @@ raft_tests = set([
vector_search_tests = set([
'test/vector_search/vector_store_client_test',
'test/vector_search/load_balancer_test',
'test/vector_search/client_test',
'test/vector_search/filter_test',
'test/vector_search/rescoring_test'
'test/vector_search/client_test'
])
vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
vector_search_validator_deps = set([
'test/vector_search_validator/build-validator',
'test/vector_search_validator/Cargo.toml',
'test/vector_search_validator/crates/validator/Cargo.toml',
'test/vector_search_validator/crates/validator/src/main.rs',
'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
])
vector_store_bin = 'vector-search-validator/bin/vector-store'
vector_store_deps = set([
'test/vector_search_validator/build-env',
'test/vector_search_validator/build-vector-store',
])
vector_search_validator_bins = set([
vector_search_validator_bin,
vector_store_bin,
])
wasms = set([
@@ -763,7 +783,7 @@ other = set([
'iotune',
])
all_artifacts = apps | cpp_apps | tests | other | wasms
all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins
arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -1014,9 +1034,6 @@ scylla_core = (['message/messaging_service.cc',
'cql3/functions/aggregate_fcts.cc',
'cql3/functions/castas_fcts.cc',
'cql3/functions/error_injection_fcts.cc',
'cql3/statements/strong_consistency/modification_statement.cc',
'cql3/statements/strong_consistency/select_statement.cc',
'cql3/statements/strong_consistency/statement_helpers.cc',
'cql3/functions/vector_similarity_fcts.cc',
'cql3/statements/cf_prop_defs.cc',
'cql3/statements/cf_statement.cc',
@@ -1042,8 +1059,8 @@ scylla_core = (['message/messaging_service.cc',
'cql3/statements/raw/parsed_statement.cc',
'cql3/statements/property_definitions.cc',
'cql3/statements/update_statement.cc',
'cql3/statements/broadcast_modification_statement.cc',
'cql3/statements/broadcast_select_statement.cc',
'cql3/statements/strongly_consistent_modification_statement.cc',
'cql3/statements/strongly_consistent_select_statement.cc',
'cql3/statements/delete_statement.cc',
'cql3/statements/prune_materialized_view_statement.cc',
'cql3/statements/batch_statement.cc',
@@ -1334,9 +1351,6 @@ scylla_core = (['message/messaging_service.cc',
'lang/wasm.cc',
'lang/wasm_alien_thread_runner.cc',
'lang/wasm_instance_cache.cc',
'service/strong_consistency/groups_manager.cc',
'service/strong_consistency/coordinator.cc',
'service/strong_consistency/state_machine.cc',
'service/raft/group0_state_id_handler.cc',
'service/raft/group0_state_machine.cc',
'service/raft/group0_state_machine_merger.cc',
@@ -1366,7 +1380,6 @@ scylla_core = (['message/messaging_service.cc',
'vector_search/dns.cc',
'vector_search/client.cc',
'vector_search/clients.cc',
'vector_search/filter.cc',
'vector_search/truststore.cc'
] + [Antlr3Grammar('cql3/Cql.g')] \
+ scylla_raft_core
@@ -1476,7 +1489,6 @@ idls = ['idl/gossip_digest.idl.hh',
'idl/hinted_handoff.idl.hh',
'idl/storage_proxy.idl.hh',
'idl/sstables.idl.hh',
'idl/strong_consistency/state_machine.idl.hh',
'idl/group0_state_machine.idl.hh',
'idl/mapreduce_request.idl.hh',
'idl/replica_exception.idl.hh',
@@ -1772,8 +1784,6 @@ deps['test/raft/discovery_test'] = ['test/raft/discovery_test.cc',
deps['test/vector_search/vector_store_client_test'] = ['test/vector_search/vector_store_client_test.cc'] + scylla_tests_dependencies
deps['test/vector_search/load_balancer_test'] = ['test/vector_search/load_balancer_test.cc'] + scylla_tests_dependencies
deps['test/vector_search/client_test'] = ['test/vector_search/client_test.cc'] + scylla_tests_dependencies
deps['test/vector_search/filter_test'] = ['test/vector_search/filter_test.cc'] + scylla_tests_dependencies
deps['test/vector_search/rescoring_test'] = ['test/vector_search/rescoring_test.cc'] + scylla_tests_dependencies
boost_tests_prefixes = ["test/boost/", "test/vector_search/", "test/raft/", "test/manual/", "test/ldap/"]
@@ -2560,10 +2570,11 @@ def write_build_file(f,
description = RUST_LIB $out
''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
f.write(
'build {mode}-build: phony {artifacts} {wasms}\n'.format(
'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
mode=mode,
artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
)
)
if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2593,7 +2604,7 @@ def write_build_file(f,
continue
profile_dep = modes[mode].get('profile_target', "")
if binary in other or binary in wasms:
if binary in other or binary in wasms or binary in vector_search_validator_bins:
continue
srcs = deps[binary]
# 'scylla'
@@ -2704,10 +2715,11 @@ def write_build_file(f,
)
f.write(
'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
mode=mode,
test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
)
)
f.write(
@@ -2875,6 +2887,19 @@ def write_build_file(f,
'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
)
f.write(textwrap.dedent(f'''\
rule build-vector-search-validator
command = test/vector_search_validator/build-validator $builddir
rule build-vector-store
command = test/vector_search_validator/build-vector-store $builddir
'''))
f.write(
'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
)
f.write(
'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
)
f.write(textwrap.dedent(f'''\
build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
build dist-unified: phony dist-unified-tar

View File

@@ -47,9 +47,6 @@ target_sources(cql3
functions/aggregate_fcts.cc
functions/castas_fcts.cc
functions/error_injection_fcts.cc
statements/strong_consistency/select_statement.cc
statements/strong_consistency/modification_statement.cc
statements/strong_consistency/statement_helpers.cc
functions/vector_similarity_fcts.cc
statements/cf_prop_defs.cc
statements/cf_statement.cc
@@ -75,8 +72,8 @@ target_sources(cql3
statements/raw/parsed_statement.cc
statements/property_definitions.cc
statements/update_statement.cc
statements/broadcast_modification_statement.cc
statements/broadcast_select_statement.cc
statements/strongly_consistent_modification_statement.cc
statements/strongly_consistent_select_statement.cc
statements/delete_statement.cc
statements/prune_materialized_view_statement.cc
statements/batch_statement.cc

View File

@@ -10,41 +10,9 @@
#include "types/types.hh"
#include "types/vector.hh"
#include "exceptions/exceptions.hh"
#include <span>
#include <bit>
namespace cql3 {
namespace functions {
namespace detail {
std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension) {
if (!param) {
throw exceptions::invalid_request_exception("Cannot extract float vector from null parameter");
}
const size_t expected_size = dimension * sizeof(float);
if (param->size() != expected_size) {
throw exceptions::invalid_request_exception(
fmt::format("Invalid vector size: expected {} bytes for {} floats, got {} bytes",
expected_size, dimension, param->size()));
}
std::vector<float> result;
result.reserve(dimension);
bytes_view view(*param);
for (size_t i = 0; i < dimension; ++i) {
// read_simple handles network byte order (big-endian) conversion
uint32_t raw = read_simple<uint32_t>(view);
result.push_back(std::bit_cast<float>(raw));
}
return result;
}
} // namespace detail
namespace {
// The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
@@ -54,14 +22,14 @@ namespace {
// You should only use this function if you need to preserve the original vectors and cannot normalize
// them in advance.
float compute_cosine_similarity(std::span<const float> v1, std::span<const float> v2) {
float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
double dot_product = 0.0;
double squared_norm_a = 0.0;
double squared_norm_b = 0.0;
for (size_t i = 0; i < v1.size(); ++i) {
double a = v1[i];
double b = v2[i];
double a = value_cast<float>(v1[i]);
double b = value_cast<float>(v2[i]);
dot_product += a * b;
squared_norm_a += a * a;
@@ -69,7 +37,7 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
}
if (squared_norm_a == 0 || squared_norm_b == 0) {
return std::numeric_limits<float>::quiet_NaN();
throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
}
// The cosine similarity is in the range [-1, 1].
@@ -78,12 +46,12 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
}
float compute_euclidean_similarity(std::span<const float> v1, std::span<const float> v2) {
float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
double sum = 0.0;
for (size_t i = 0; i < v1.size(); ++i) {
double a = v1[i];
double b = v2[i];
double a = value_cast<float>(v1[i]);
double b = value_cast<float>(v2[i]);
double diff = a - b;
sum += diff * diff;
@@ -97,12 +65,12 @@ float compute_euclidean_similarity(std::span<const float> v1, std::span<const fl
// Assumes that both vectors are L2-normalized.
// This similarity is intended as an optimized way to perform cosine similarity calculation.
float compute_dot_product_similarity(std::span<const float> v1, std::span<const float> v2) {
float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
double dot_product = 0.0;
for (size_t i = 0; i < v1.size(); ++i) {
double a = v1[i];
double b = v2[i];
double a = value_cast<float>(v1[i]);
double b = value_cast<float>(v2[i]);
dot_product += a * b;
}
@@ -168,15 +136,13 @@ bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters)
return std::nullopt;
}
// Extract dimension from the vector type
const auto& type = static_cast<const vector_type_impl&>(*arg_types()[0]);
size_t dimension = type.get_dimension();
const auto& type = arg_types()[0];
data_value v1 = type->deserialize(*parameters[0]);
data_value v2 = type->deserialize(*parameters[1]);
const auto& v1_elements = value_cast<std::vector<data_value>>(v1);
const auto& v2_elements = value_cast<std::vector<data_value>>(v2);
// Optimized path: extract floats directly from bytes, bypassing data_value overhead
std::vector<float> v1 = detail::extract_float_vector(parameters[0], dimension);
std::vector<float> v2 = detail::extract_float_vector(parameters[1], dimension);
float result = SIMILARITY_FUNCTIONS.at(_name)(v1, v2);
float result = SIMILARITY_FUNCTIONS.at(_name)(v1_elements, v2_elements);
return float_type->decompose(result);
}

View File

@@ -11,7 +11,6 @@
#include "native_scalar_function.hh"
#include "cql3/assignment_testable.hh"
#include "cql3/functions/function_name.hh"
#include <span>
namespace cql3 {
namespace functions {
@@ -20,7 +19,7 @@ static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::nati
static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");
using similarity_function_t = float (*)(std::span<const float>, std::span<const float>);
using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;
std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
@@ -34,14 +33,5 @@ public:
virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
};
namespace detail {
// Extract float vector directly from serialized bytes, bypassing data_value overhead.
// This is an internal API exposed for testing purposes.
// Vector<float, N> wire format: N floats as big-endian uint32_t values, 4 bytes each.
std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension);
} // namespace detail
} // namespace functions
} // namespace cql3

View File

@@ -48,10 +48,8 @@ const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono
struct query_processor::remote {
remote(service::migration_manager& mm, service::mapreduce_service& fwd,
service::storage_service& ss, service::raft_group0_client& group0_client,
service::strong_consistency::coordinator& _sc_coordinator)
service::storage_service& ss, service::raft_group0_client& group0_client)
: mm(mm), mapreducer(fwd), ss(ss), group0_client(group0_client)
, sc_coordinator(_sc_coordinator)
, gate("query_processor::remote")
{}
@@ -59,7 +57,6 @@ struct query_processor::remote {
service::mapreduce_service& mapreducer;
service::storage_service& ss;
service::raft_group0_client& group0_client;
service::strong_consistency::coordinator& sc_coordinator;
seastar::named_gate gate;
};
@@ -517,16 +514,9 @@ query_processor::~query_processor() {
}
}
std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder>
query_processor::acquire_strongly_consistent_coordinator() {
auto [remote_, holder] = remote();
return {remote_.get().sc_coordinator, std::move(holder)};
}
void query_processor::start_remote(service::migration_manager& mm, service::mapreduce_service& mapreducer,
service::storage_service& ss, service::raft_group0_client& group0_client,
service::strong_consistency::coordinator& sc_coordinator) {
_remote = std::make_unique<struct remote>(mm, mapreducer, ss, group0_client, sc_coordinator);
service::storage_service& ss, service::raft_group0_client& group0_client) {
_remote = std::make_unique<struct remote>(mm, mapreducer, ss, group0_client);
}
future<> query_processor::stop_remote() {
@@ -870,7 +860,6 @@ struct internal_query_state {
sstring query_string;
std::unique_ptr<query_options> opts;
statements::prepared_statement::checked_weak_ptr p;
std::optional<service::query_state> qs;
bool more_results = true;
};
@@ -878,14 +867,10 @@ internal_query_state query_processor::create_paged_state(
const sstring& query_string,
db::consistency_level cl,
const data_value_list& values,
int32_t page_size,
std::optional<service::query_state> qs) {
int32_t page_size) {
auto p = prepare_internal(query_string);
auto opts = make_internal_options(p, values, cl, page_size);
if (!qs) {
qs.emplace(query_state_for_internal_call());
}
return internal_query_state{query_string, std::make_unique<cql3::query_options>(std::move(opts)), std::move(p), std::move(qs), true};
return internal_query_state{query_string, std::make_unique<cql3::query_options>(std::move(opts)), std::move(p), true};
}
bool query_processor::has_more_results(cql3::internal_query_state& state) const {
@@ -908,8 +893,9 @@ future<> query_processor::for_each_cql_result(
future<::shared_ptr<untyped_result_set>>
query_processor::execute_paged_internal(internal_query_state& state) {
state.p->statement->validate(*this, service::client_state::for_internal_calls());
auto qs = query_state_for_internal_call();
::shared_ptr<cql_transport::messages::result_message> msg =
co_await state.p->statement->execute(*this, *state.qs, *state.opts, std::nullopt);
co_await state.p->statement->execute(*this, qs, *state.opts, std::nullopt);
class visitor : public result_message::visitor_base {
internal_query_state& _state;
@@ -1216,9 +1202,8 @@ future<> query_processor::query_internal(
db::consistency_level cl,
const data_value_list& values,
int32_t page_size,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f,
std::optional<service::query_state> qs) {
auto query_state = create_paged_state(query_string, cl, values, page_size, std::move(qs));
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
auto query_state = create_paged_state(query_string, cl, values, page_size);
co_return co_await for_each_cql_result(query_state, std::move(f));
}

View File

@@ -44,10 +44,6 @@ class query_state;
class mapreduce_service;
class raft_group0_client;
namespace strong_consistency {
class coordinator;
}
namespace broadcast_tables {
struct query;
}
@@ -159,8 +155,7 @@ public:
~query_processor();
void start_remote(service::migration_manager&, service::mapreduce_service&,
service::storage_service& ss, service::raft_group0_client&,
service::strong_consistency::coordinator&);
service::storage_service& ss, service::raft_group0_client&);
future<> stop_remote();
data_dictionary::database db() {
@@ -179,9 +174,6 @@ public:
return _proxy;
}
std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder>
acquire_strongly_consistent_coordinator();
cql_stats& get_cql_stats() {
return _cql_stats;
}
@@ -330,7 +322,6 @@ public:
* page_size - maximum page size
* f - a function to be run on each row of the query result,
* if the function returns stop_iteration::yes the iteration will stop
* qs - optional query state (default: std::nullopt)
*
* \note This function is optimized for convenience, not performance.
*/
@@ -339,8 +330,7 @@ public:
db::consistency_level cl,
const data_value_list& values,
int32_t page_size,
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f,
std::optional<service::query_state> qs = std::nullopt);
noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f);
/*
* \brief iterate over all cql results using paging
@@ -509,8 +499,7 @@ private:
const sstring& query_string,
db::consistency_level,
const data_value_list& values,
int32_t page_size,
std::optional<service::query_state> qs = std::nullopt);
int32_t page_size);
/*!
* \brief run a query using paging

View File

@@ -1,20 +0,0 @@
/*
* Copyright 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <ostream>
namespace cql3 {
class result;
void print_query_results_text(std::ostream& os, const result& result);
void print_query_results_json(std::ostream& os, const result& result);
} // namespace cql3

View File

@@ -9,10 +9,8 @@
*/
#include <cstdint>
#include "types/json_utils.hh"
#include "utils/assert.hh"
#include "utils/hashers.hh"
#include "utils/rjson.hh"
#include "cql3/result_set.hh"
namespace cql3 {
@@ -48,13 +46,6 @@ void metadata::add_non_serialized_column(lw_shared_ptr<column_specification> nam
_column_info->_names.emplace_back(std::move(name));
}
void metadata::hide_last_column() {
if (_column_info->_column_count == 0) {
utils::on_internal_error("Trying to hide a column when there are no columns visible.");
}
_column_info->_column_count--;
}
void metadata::set_paging_state(lw_shared_ptr<const service::pager::paging_state> paging_state) {
_flags.set<flag::HAS_MORE_PAGES>();
_paging_state = std::move(paging_state);
@@ -197,85 +188,4 @@ make_empty_metadata() {
return empty_metadata_cache;
}
void print_query_results_text(std::ostream& os, const cql3::result& result) {
const auto& metadata = result.get_metadata();
const auto& column_metadata = metadata.get_names();
struct column_values {
size_t max_size{0};
sstring header_format;
sstring row_format;
std::vector<sstring> values;
void add(sstring value) {
max_size = std::max(max_size, value.size());
values.push_back(std::move(value));
}
};
std::vector<column_values> columns;
columns.resize(column_metadata.size());
for (size_t i = 0; i < column_metadata.size(); ++i) {
columns[i].add(column_metadata[i]->name->text());
}
for (const auto& row : result.result_set().rows()) {
for (size_t i = 0; i < row.size(); ++i) {
if (row[i]) {
columns[i].add(column_metadata[i]->type->to_string(linearized(managed_bytes_view(*row[i]))));
} else {
columns[i].add("");
}
}
}
std::vector<sstring> separators(columns.size(), sstring());
for (size_t i = 0; i < columns.size(); ++i) {
auto& col_values = columns[i];
col_values.header_format = seastar::format(" {{:<{}}} ", col_values.max_size);
col_values.row_format = seastar::format(" {{:>{}}} ", col_values.max_size);
for (size_t c = 0; c < col_values.max_size; ++c) {
separators[i] += "-";
}
}
for (size_t r = 0; r < result.result_set().rows().size() + 1; ++r) {
std::vector<sstring> row;
row.reserve(columns.size());
for (size_t i = 0; i < columns.size(); ++i) {
const auto& format = r == 0 ? columns[i].header_format : columns[i].row_format;
row.push_back(fmt::format(fmt::runtime(std::string_view(format)), columns[i].values[r]));
}
fmt::print(os, "{}\n", fmt::join(row, "|"));
if (!r) {
fmt::print(os, "-{}-\n", fmt::join(separators, "-+-"));
}
}
}
void print_query_results_json(std::ostream& os, const cql3::result& result) {
const auto& metadata = result.get_metadata();
const auto& column_metadata = metadata.get_names();
rjson::streaming_writer writer(os);
writer.StartArray();
for (const auto& row : result.result_set().rows()) {
writer.StartObject();
for (size_t i = 0; i < row.size(); ++i) {
writer.Key(column_metadata[i]->name->text());
if (!row[i] || row[i]->empty()) {
writer.Null();
continue;
}
const auto value = to_json_string(*column_metadata[i]->type, *row[i]);
const auto type = to_json_type(*column_metadata[i]->type, *row[i]);
writer.RawValue(value, type);
}
writer.EndObject();
}
writer.EndArray();
}
}

View File

@@ -73,7 +73,6 @@ public:
uint32_t value_count() const;
void add_non_serialized_column(lw_shared_ptr<column_specification> name);
void hide_last_column();
public:
void set_paging_state(lw_shared_ptr<const service::pager::paging_state> paging_state);

View File

@@ -225,9 +225,10 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
// The second hyphen is not really true because currently topological changes can
// disturb it (see scylladb/scylladb#23345), but we ignore that.
locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
} catch (const std::invalid_argument& e) {
} catch (const std::exception& e) {
if (replica::database::enforce_rf_rack_validity_for_keyspace(qp.db().get_config(), *ks_md)) {
// wrap the exception manually here in a type that can be passed to the user.
// There's no guarantee what the type of the exception will be, so we need to
// wrap it manually here in a type that can be passed to the user.
throw exceptions::invalid_request_exception(e.what());
} else {
// Even when RF-rack-validity is not enforced for the keyspace, we'd

View File

@@ -123,9 +123,10 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, utils::chun
// We hold a group0_guard, so it's correct to check this here.
// The topology or schema cannot change while we're performing this query.
locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
} catch (const std::invalid_argument& e) {
} catch (const std::exception& e) {
if (replica::database::enforce_rf_rack_validity_for_keyspace(cfg, *ksm)) {
// wrap the exception in a type that can be passed to the user.
// There's no guarantee what the type of the exception will be, so we need to
// wrap it manually here in a type that can be passed to the user.
throw exceptions::invalid_request_exception(e.what());
} else {
// Even when RF-rack-validity is not enforced for the keyspace, we'd

View File

@@ -31,6 +31,8 @@
#include "db/config.hh"
#include "compaction/time_window_compaction_strategy.hh"
bool is_internal_keyspace(std::string_view name);
namespace cql3 {
namespace statements {
@@ -122,6 +124,10 @@ void create_table_statement::apply_properties_to(schema_builder& builder, const
addColumnMetadataFromAliases(cfmd, Collections.singletonList(valueAlias), defaultValidator, ColumnDefinition.Kind.COMPACT_VALUE);
#endif
if (!_properties->get_compression_options() && !is_internal_keyspace(keyspace())) {
builder.set_compressor_params(db.get_config().sstable_compression_user_table_options());
}
_properties->apply_to_builder(builder, _properties->make_schema_extensions(db.extensions()), db, keyspace(), true);
}

View File

@@ -23,7 +23,6 @@
#include "index/vector_index.hh"
#include "schema/schema.hh"
#include "service/client_state.hh"
#include "service/paxos/paxos_state.hh"
#include "types/types.hh"
#include "cql3/query_processor.hh"
#include "cql3/cql_statement.hh"
@@ -330,19 +329,6 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
"*/",
*table_desc.create_statement);
table_desc.create_statement = std::move(os).to_managed_string();
} else if (service::paxos::paxos_store::try_get_base_table(name)) {
// Paxos state table is internally managed by Scylla and it shouldn't be exposed to the user.
// The table is allowed to be described as a comment to ease administrative work but it's hidden from all listings.
fragmented_ostringstream os{};
fmt::format_to(os.to_iter(),
"/* Do NOT execute this statement! It's only for informational purposes.\n"
" A paxos state table is created automatically when enabling LWT on a base table.\n"
"\n{}\n"
"*/",
*table_desc.create_statement);
table_desc.create_statement = std::move(os).to_managed_string();
}
result.push_back(std::move(table_desc));
@@ -378,7 +364,7 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
future<std::vector<description>> tables(const data_dictionary::database& db, const lw_shared_ptr<keyspace_metadata>& ks, std::optional<bool> with_internals = std::nullopt) {
auto& replica_db = db.real_database();
auto tables = ks->tables() | std::views::filter([&replica_db] (const schema_ptr& s) {
return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name()) && !service::paxos::paxos_store::try_get_base_table(s->cf_name());
return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name());
}) | std::ranges::to<std::vector<schema_ptr>>();
std::ranges::sort(tables, std::ranges::less(), std::mem_fn(&schema::cf_name));

View File

@@ -98,7 +98,6 @@ static locator::replication_strategy_config_options prepare_options(
const sstring& strategy_class,
const locator::token_metadata& tm,
bool rf_rack_valid_keyspaces,
bool enforce_rack_list,
locator::replication_strategy_config_options options,
const locator::replication_strategy_config_options& old_options,
bool rack_list_enabled,
@@ -108,7 +107,7 @@ static locator::replication_strategy_config_options prepare_options(
auto is_nts = locator::abstract_replication_strategy::to_qualified_class_name(strategy_class) == "org.apache.cassandra.locator.NetworkTopologyStrategy";
auto is_alter = !old_options.empty();
const auto& all_dcs = tm.get_datacenter_racks_token_owners();
auto auto_expand_racks = uses_tablets && rack_list_enabled && (rf_rack_valid_keyspaces || enforce_rack_list);
auto auto_expand_racks = uses_tablets && rf_rack_valid_keyspaces && rack_list_enabled;
logger.debug("prepare_options: {}: is_nts={} auto_expand_racks={} rack_list_enabled={} old_options={} new_options={} all_dcs={}",
strategy_class, is_nts, auto_expand_racks, rack_list_enabled, old_options, options, all_dcs);
@@ -418,7 +417,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(s
auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
bool uses_tablets = initial_tablets.has_value();
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), cfg.enforce_rack_list(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
return data_dictionary::keyspace_metadata::new_keyspace(ks_name, sc,
std::move(options), initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
}
@@ -435,7 +434,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
auto sc = get_replication_strategy_class();
bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
if (sc) {
options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), cfg.enforce_rack_list(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
} else {
sc = old->strategy_name();
options = old_options;

View File

@@ -11,7 +11,7 @@
#include "utils/assert.hh"
#include "cql3/cql_statement.hh"
#include "cql3/statements/modification_statement.hh"
#include "cql3/statements/broadcast_modification_statement.hh"
#include "cql3/statements/strongly_consistent_modification_statement.hh"
#include "cql3/statements/raw/modification_statement.hh"
#include "cql3/statements/prepared_statement.hh"
#include "cql3/expr/expr-utils.hh"
@@ -29,8 +29,6 @@
#include "cql3/query_processor.hh"
#include "service/storage_proxy.hh"
#include "service/broadcast_tables/experimental/lang.hh"
#include "cql3/statements/strong_consistency/modification_statement.hh"
#include "cql3/statements/strong_consistency/statement_helpers.hh"
#include <boost/lexical_cast.hpp>
@@ -548,7 +546,7 @@ modification_statement::process_where_clause(data_dictionary::database db, expr:
}
}
::shared_ptr<broadcast_modification_statement>
::shared_ptr<strongly_consistent_modification_statement>
modification_statement::prepare_for_broadcast_tables() const {
// FIXME: implement for every type of `modification_statement`.
throw service::broadcast_tables::unsupported_operation_error{};
@@ -556,27 +554,24 @@ modification_statement::prepare_for_broadcast_tables() const {
namespace raw {
::shared_ptr<cql_statement_opt_metadata>
modification_statement::prepare_statement(data_dictionary::database db, prepare_context& ctx, cql_stats& stats) {
::shared_ptr<cql3::statements::modification_statement> statement = prepare(db, ctx, stats);
if (service::broadcast_tables::is_broadcast_table_statement(keyspace(), column_family())) {
return statement->prepare_for_broadcast_tables();
} else {
return statement;
}
}
std::unique_ptr<prepared_statement>
modification_statement::prepare(data_dictionary::database db, cql_stats& stats) {
schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
auto meta = get_prepare_context();
auto statement = std::invoke([&] -> shared_ptr<cql_statement> {
auto result = prepare(db, meta, stats);
if (strong_consistency::is_strongly_consistent(db, schema->ks_name())) {
return ::make_shared<strong_consistency::modification_statement>(std::move(result));
}
if (service::broadcast_tables::is_broadcast_table_statement(keyspace(), column_family())) {
return result->prepare_for_broadcast_tables();
}
return result;
});
auto statement = prepare_statement(db, meta, stats);
auto partition_key_bind_indices = meta.get_partition_key_bind_indexes(*schema);
return std::make_unique<prepared_statement>(audit_info(), std::move(statement), meta,
std::move(partition_key_bind_indices));
return std::make_unique<prepared_statement>(audit_info(), std::move(statement), meta, std::move(partition_key_bind_indices));
}
::shared_ptr<cql3::statements::modification_statement>

View File

@@ -30,7 +30,7 @@ class operation;
namespace statements {
class broadcast_modification_statement;
class strongly_consistent_modification_statement;
namespace raw { class modification_statement; }
@@ -113,15 +113,15 @@ public:
virtual void add_update_for_key(mutation& m, const query::clustering_range& range, const update_parameters& params, const json_cache_opt& json_cache) const = 0;
uint32_t get_bound_terms() const override;
virtual uint32_t get_bound_terms() const override;
const sstring& keyspace() const;
virtual const sstring& keyspace() const;
const sstring& column_family() const;
virtual const sstring& column_family() const;
bool is_counter() const;
virtual bool is_counter() const;
bool is_view() const;
virtual bool is_view() const;
int64_t get_timestamp(int64_t now, const query_options& options) const;
@@ -129,12 +129,12 @@ public:
std::optional<gc_clock::duration> get_time_to_live(const query_options& options) const;
future<> check_access(query_processor& qp, const service::client_state& state) const override;
virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
// Validate before execute, using client state and current schema
void validate(query_processor&, const service::client_state& state) const override;
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
void add_operation(::shared_ptr<operation> op);
@@ -256,9 +256,7 @@ public:
virtual json_cache_opt maybe_prepare_json_cache(const query_options& options) const;
virtual ::shared_ptr<broadcast_modification_statement> prepare_for_broadcast_tables() const;
db::timeout_clock::duration get_timeout(const service::client_state& state, const query_options& options) const;
virtual ::shared_ptr<strongly_consistent_modification_statement> prepare_for_broadcast_tables() const;
protected:
/**
@@ -266,7 +264,9 @@ protected:
* processed to check that they are compatible.
* @throws InvalidRequestException
*/
void validate_where_clause_for_conditions() const;
virtual void validate_where_clause_for_conditions() const;
db::timeout_clock::duration get_timeout(const service::client_state& state, const query_options& options) const;
friend class raw::modification_statement;
};

View File

@@ -40,6 +40,7 @@ protected:
public:
virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
::shared_ptr<cql_statement_opt_metadata> prepare_statement(data_dictionary::database db, prepare_context& ctx, cql_stats& stats);
::shared_ptr<cql3::statements::modification_statement> prepare(data_dictionary::database db, prepare_context& ctx, cql_stats& stats) const;
void add_raw(sstring&& raw) { _raw_cql = std::move(raw); }
const sstring& get_raw_cql() const { return _raw_cql; }

View File

@@ -131,6 +131,8 @@ private:
void verify_ordering_is_valid(const prepared_orderings_type&, const schema&, const restrictions::statement_restrictions& restrictions) const;
prepared_ann_ordering_type prepare_ann_ordering(const schema& schema, prepare_context& ctx, data_dictionary::database db) const;
// Checks whether this ordering reverses all results.
// We only allow leaving select results unchanged or reversing them.
bool is_ordering_reversed(const prepared_orderings_type&) const;

View File

@@ -8,8 +8,6 @@
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#include "cql3/statements/strong_consistency/select_statement.hh"
#include "cql3/statements/strong_consistency/statement_helpers.hh"
#include "cql3/statements/select_statement.hh"
#include "cql3/expr/expression.hh"
#include "cql3/expr/evaluate.hh"
@@ -18,7 +16,7 @@
#include "cql3/statements/raw/select_statement.hh"
#include "cql3/query_processor.hh"
#include "cql3/statements/prune_materialized_view_statement.hh"
#include "cql3/statements/broadcast_select_statement.hh"
#include "cql3/statements/strongly_consistent_select_statement.hh"
#include "exceptions/exceptions.hh"
#include <seastar/core/future.hh>
@@ -27,14 +25,12 @@
#include "service/broadcast_tables/experimental/lang.hh"
#include "service/qos/qos_common.hh"
#include "transport/messages/result_message.hh"
#include "cql3/functions/functions.hh"
#include "cql3/functions/as_json_function.hh"
#include "cql3/selection/selection.hh"
#include "cql3/util.hh"
#include "cql3/restrictions/statement_restrictions.hh"
#include "index/secondary_index.hh"
#include "types/vector.hh"
#include "vector_search/filter.hh"
#include "validation.hh"
#include "exceptions/unrecognized_entity_exception.hh"
#include <optional>
@@ -372,9 +368,8 @@ uint64_t select_statement::get_inner_loop_limit(uint64_t limit, bool is_aggregat
}
bool select_statement::needs_post_query_ordering() const {
// We need post-query ordering for queries with IN on the partition key and an ORDER BY
// and ANN index queries with rescoring.
return static_cast<bool>(_ordering_comparator);
// We need post-query ordering only for queries with IN on the partition key and an ORDER BY.
return _restrictions->key_is_in_relation() && !_parameters->orderings().empty();
}
struct select_statement_executor {
@@ -1963,46 +1958,14 @@ mutation_fragments_select_statement::do_execute(query_processor& qp, service::qu
}));
}
struct ann_ordering_info {
secondary_index::index _index;
raw::select_statement::prepared_ann_ordering_type _prepared_ann_ordering;
bool is_rescoring_enabled;
};
static std::optional<ann_ordering_info> get_ann_ordering_info(
data_dictionary::database db,
schema_ptr schema,
lw_shared_ptr<const raw::select_statement::parameters> parameters,
prepare_context& ctx) {
if (parameters->orderings().empty()) {
return std::nullopt;
}
auto [column_id, ordering] = parameters->orderings().front();
const auto& ann_vector = std::get_if<raw::select_statement::ann_vector>(&ordering);
if (!ann_vector) {
return std::nullopt;
}
::shared_ptr<column_identifier> column = column_id->prepare_column_identifier(*schema);
const column_definition* def = schema->get_column_definition(column->name());
if (!def) {
throw exceptions::invalid_request_exception(
fmt::format("Undefined column name {}", column->text()));
}
if (!def->type->is_vector() || static_cast<const vector_type_impl*>(def->type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
throw exceptions::invalid_request_exception("ANN ordering is only supported on float vector indexes");
}
auto e = expr::prepare_expression(*ann_vector, db, schema->ks_name(), nullptr, def->column_specification);
expr::fill_prepare_context(e, ctx);
raw::select_statement::prepared_ann_ordering_type prepared_ann_ordering = std::make_pair(std::move(def), std::move(e));
::shared_ptr<cql3::statements::select_statement> vector_indexed_table_select_statement::prepare(data_dictionary::database db, schema_ptr schema,
uint32_t bound_terms, lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
std::optional<expr::expression> per_partition_limit, cql_stats& stats, std::unique_ptr<attributes> attrs) {
auto cf = db.find_column_family(schema);
auto& sim = cf.get_index_manager();
auto [index_opt, _] = restrictions->find_idx(sim);
auto indexes = sim.list_indexes();
auto it = std::find_if(indexes.begin(), indexes.end(), [&prepared_ann_ordering](const auto& ind) {
@@ -2014,90 +1977,27 @@ static std::optional<ann_ordering_info> get_ann_ordering_info(
if (it == indexes.end()) {
throw exceptions::invalid_request_exception("ANN ordering by vector requires the column to be indexed using 'vector_index'");
}
index_opt = *it;
return ann_ordering_info{
*it,
std::move(prepared_ann_ordering),
secondary_index::vector_index::is_rescoring_enabled(it->metadata().options())
};
}
static uint32_t add_similarity_function_to_selectors(
std::vector<selection::prepared_selector>& prepared_selectors,
const ann_ordering_info& ann_ordering_info,
data_dictionary::database db,
schema_ptr schema) {
auto similarity_function_name = secondary_index::vector_index::get_cql_similarity_function_name(ann_ordering_info._index.metadata().options());
// Create the function name
auto func_name = functions::function_name::native_function(sstring(similarity_function_name));
// Create the function arguments
std::vector<expr::expression> args;
args.push_back(expr::column_value(ann_ordering_info._prepared_ann_ordering.first));
args.push_back(ann_ordering_info._prepared_ann_ordering.second);
// Get the function object
std::vector<shared_ptr<assignment_testable>> provided_args;
provided_args.push_back(expr::as_assignment_testable(args[0], expr::type_of(args[0])));
provided_args.push_back(expr::as_assignment_testable(args[1], expr::type_of(args[1])));
auto func = cql3::functions::instance().get(db, schema->ks_name(), func_name, provided_args, schema->ks_name(), schema->cf_name(), nullptr);
// Create the function call expression
expr::function_call similarity_func_call{
.func = func,
.args = std::move(args),
};
// Add the similarity function as a prepared selector (last)
prepared_selectors.push_back(selection::prepared_selector{
.expr = std::move(similarity_func_call),
.alias = nullptr,
});
return prepared_selectors.size() - 1;
}
static select_statement::ordering_comparator_type get_similarity_ordering_comparator(std::vector<selection::prepared_selector>& prepared_selectors, uint32_t similarity_column_index) {
auto type = expr::type_of(prepared_selectors[similarity_column_index].expr);
if (type->get_kind() != abstract_type::kind::float_kind) {
seastar::on_internal_error(logger, "Similarity function must return float type.");
if (!index_opt) {
throw std::runtime_error("No index found.");
}
return [similarity_column_index, type] (const raw::select_statement::result_row_type& r1, const raw::select_statement::result_row_type& r2) {
auto& c1 = r1[similarity_column_index];
auto& c2 = r2[similarity_column_index];
auto f1 = c1 ? value_cast<float>(type->deserialize(*c1)) : std::numeric_limits<float>::quiet_NaN();
auto f2 = c2 ? value_cast<float>(type->deserialize(*c2)) : std::numeric_limits<float>::quiet_NaN();
if (std::isfinite(f1) && std::isfinite(f2)) {
return f1 > f2;
}
return std::isfinite(f1);
};
}
::shared_ptr<cql3::statements::select_statement> vector_indexed_table_select_statement::prepare(data_dictionary::database db, schema_ptr schema,
uint32_t bound_terms, lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<attributes> attrs) {
auto prepared_filter = vector_search::prepare_filter(*restrictions, parameters->allow_filtering());
return ::make_shared<cql3::statements::vector_indexed_table_select_statement>(schema, bound_terms, parameters, std::move(selection), std::move(restrictions),
std::move(group_by_cell_indices), is_reversed, std::move(ordering_comparator), std::move(prepared_ann_ordering), std::move(limit),
std::move(per_partition_limit), stats, index, std::move(prepared_filter), std::move(attrs));
std::move(per_partition_limit), stats, *index_opt, std::move(attrs));
}
vector_indexed_table_select_statement::vector_indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms, lw_shared_ptr<const parameters> parameters,
::shared_ptr<selection::selection> selection, ::shared_ptr<const restrictions::statement_restrictions> restrictions,
::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed, ordering_comparator_type ordering_comparator,
prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index,
vector_search::prepared_filter prepared_filter, std::unique_ptr<attributes> attrs)
std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<attributes> attrs)
: select_statement{schema, bound_terms, parameters, selection, restrictions, group_by_cell_indices, is_reversed, ordering_comparator, limit,
per_partition_limit, stats, std::move(attrs)}
, _index{index}
, _prepared_ann_ordering(std::move(prepared_ann_ordering))
, _prepared_filter(std::move(prepared_filter)) {
, _prepared_ann_ordering(std::move(prepared_ann_ordering)) {
if (!limit.has_value()) {
throw exceptions::invalid_request_exception("Vector ANN queries must have a limit specified");
@@ -2132,19 +2032,13 @@ future<shared_ptr<cql_transport::messages::result_message>> vector_indexed_table
auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
auto aoe = abort_on_expiry(timeout);
auto filter_json = _prepared_filter.to_json(options);
uint64_t fetch = static_cast<uint64_t>(std::ceil(limit * secondary_index::vector_index::get_oversampling(_index.metadata().options())));
auto pkeys = co_await qp.vector_store_client().ann(
_schema->ks_name(), _index.metadata().name(), _schema, get_ann_ordering_vector(options), fetch, filter_json, aoe.abort_source());
_schema->ks_name(), _index.metadata().name(), _schema, get_ann_ordering_vector(options), limit, aoe.abort_source());
if (!pkeys.has_value()) {
co_await coroutine::return_exception(
exceptions::invalid_request_exception(std::visit(vector_search::vector_store_client::ann_error_visitor{}, pkeys.error())));
}
if (pkeys->size() > limit && !secondary_index::vector_index::is_rescoring_enabled(_index.metadata().options())) {
pkeys->erase(pkeys->begin() + limit, pkeys->end());
}
co_return co_await query_base_table(qp, state, options, pkeys.value(), timeout);
});
@@ -2161,11 +2055,11 @@ void vector_indexed_table_select_statement::update_stats() const {
}
lw_shared_ptr<query::read_command> vector_indexed_table_select_statement::prepare_command_for_base_query(
query_processor& qp, service::query_state& state, const query_options& options, uint64_t fetch_limit) const {
query_processor& qp, service::query_state& state, const query_options& options) const {
auto slice = make_partition_slice(options);
return ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), std::move(slice), qp.proxy().get_max_result_size(slice),
query::tombstone_limit(qp.proxy().get_tombstone_limit()),
query::row_limit(get_inner_loop_limit(fetch_limit, _selection->is_aggregate())), query::partition_limit(query::max_partitions),
query::row_limit(get_inner_loop_limit(get_limit(options, _limit), _selection->is_aggregate())), query::partition_limit(query::max_partitions),
_query_start_time_point, tracing::make_trace_info(state.get_trace_state()), query_id::create_null_id(), query::is_first_page::no,
options.get_timestamp(state));
}
@@ -2183,7 +2077,7 @@ std::vector<float> vector_indexed_table_select_statement::get_ann_ordering_vecto
future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::query_base_table(query_processor& qp,
service::query_state& state, const query_options& options, const std::vector<vector_search::primary_key>& pkeys,
lowres_clock::time_point timeout) const {
auto command = prepare_command_for_base_query(qp, state, options, pkeys.size());
auto command = prepare_command_for_base_query(qp, state, options);
// For tables without clustering columns, we can optimize by querying
// partition ranges instead of individual primary keys, since the
@@ -2222,7 +2116,6 @@ future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_tab
query::result_merger{command->get_row_limit(), query::max_partitions});
co_return co_await wrap_result_to_error_message([this, &command, &options](auto result) {
command->set_row_limit(get_limit(options, _limit));
return process_results(std::move(result), command, options, _query_start_time_point);
})(std::move(result));
}
@@ -2236,7 +2129,6 @@ future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_tab
{timeout, state.get_permit(), state.get_client_state(), state.get_trace_state(), {}, {}, options.get_specific_options().node_local_only},
std::nullopt)
.then(wrap_result_to_error_message([this, &options, command](service::storage_proxy::coordinator_query_result qr) {
command->set_row_limit(get_limit(options, _limit));
return this->process_results(std::move(qr.query_result), command, options, _query_start_time_point);
}));
}
@@ -2331,41 +2223,32 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
prepared_selectors = maybe_jsonize_select_clause(std::move(prepared_selectors), db, schema);
std::optional<ann_ordering_info> ann_ordering_info_opt = get_ann_ordering_info(db, schema, _parameters, ctx);
bool is_ann_query = ann_ordering_info_opt.has_value();
auto aggregation_depth = 0u;
if (prepared_selectors.empty() && (!_group_by_columns.empty() || (is_ann_query && ann_ordering_info_opt->is_rescoring_enabled))) {
// We have a "SELECT * GROUP BY" or "SELECT * ORDER BY ANN" with rescoring enabled. If we leave prepared_selectors
// empty, below we choose selection::wildcard() for SELECT *, and either:
// - forget to do the "levellize" trick needed for the GROUP BY. See #16531.
// - forget to add the similarity function needed for ORDER BY ANN with rescoring. See below.
// So we need to set prepared_selectors.
auto all_columns = selection::selection::wildcard_columns(schema);
std::vector<::shared_ptr<selection::raw_selector>> select_all;
select_all.reserve(all_columns.size());
for (const column_definition *cdef : all_columns) {
auto name = ::make_shared<cql3::column_identifier::raw>(cdef->name_as_text(), true);
select_all.push_back(::make_shared<selection::raw_selector>(
expr::unresolved_identifier(std::move(name)), nullptr));
// Force aggregation if GROUP BY is used. This will wrap every column x as first(x).
if (!_group_by_columns.empty()) {
aggregation_depth = std::max(aggregation_depth, 1u);
if (prepared_selectors.empty()) {
// We have a "SELECT * GROUP BY". If we leave prepared_selectors
// empty, below we choose selection::wildcard() for SELECT *, and
// forget to do the "levellize" trick needed for the GROUP BY.
// So we need to set prepared_selectors. See #16531.
auto all_columns = selection::selection::wildcard_columns(schema);
std::vector<::shared_ptr<selection::raw_selector>> select_all;
select_all.reserve(all_columns.size());
for (const column_definition *cdef : all_columns) {
auto name = ::make_shared<cql3::column_identifier::raw>(cdef->name_as_text(), true);
select_all.push_back(::make_shared<selection::raw_selector>(
expr::unresolved_identifier(std::move(name)), nullptr));
}
prepared_selectors = selection::raw_selector::to_prepared_selectors(select_all, *schema, db, keyspace());
}
prepared_selectors = selection::raw_selector::to_prepared_selectors(select_all, *schema, db, keyspace());
}
for (auto& ps : prepared_selectors) {
expr::fill_prepare_context(ps.expr, ctx);
}
// Force aggregation if GROUP BY is used. This will wrap every column x as first(x).
auto aggregation_depth = _group_by_columns.empty() ? 0u : 1u;
select_statement::ordering_comparator_type ordering_comparator;
bool hide_last_column = false;
if (is_ann_query && ann_ordering_info_opt->is_rescoring_enabled) {
uint32_t similarity_column_index = add_similarity_function_to_selectors(prepared_selectors, *ann_ordering_info_opt, db, schema);
hide_last_column = true;
ordering_comparator = get_similarity_ordering_comparator(prepared_selectors, similarity_column_index);
}
for (auto& ps : prepared_selectors) {
aggregation_depth = std::max(aggregation_depth, expr::aggregation_depth(ps.expr));
}
@@ -2383,11 +2266,6 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
? selection::selection::wildcard(schema)
: selection::selection::from_selectors(db, schema, keyspace(), levellized_prepared_selectors);
if (is_ann_query && hide_last_column) {
// Hide the similarity selector from the client by reducing column_count
selection->get_result_metadata()->hide_last_column();
}
// Cassandra 5.0.2 disallows PER PARTITION LIMIT with aggregate queries
// but only if GROUP BY is not used.
// See #9879 for more details.
@@ -2395,6 +2273,8 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
throw exceptions::invalid_request_exception("PER PARTITION LIMIT is not allowed with aggregate queries.");
}
bool is_ann_query = !_parameters->orderings().empty() && std::holds_alternative<select_statement::ann_vector>(_parameters->orderings().front().second);
auto restrictions = prepare_restrictions(db, schema, ctx, selection, for_view, _parameters->allow_filtering() || is_ann_query,
restrictions::check_indexes(!_parameters->is_mutation_fragments()));
@@ -2402,14 +2282,19 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
validate_distinct_selection(*schema, *selection, *restrictions);
}
select_statement::ordering_comparator_type ordering_comparator;
bool is_reversed_ = false;
std::optional<prepared_ann_ordering_type> prepared_ann_ordering;
auto orderings = _parameters->orderings();
if (!orderings.empty() && !is_ann_query) {
if (!orderings.empty()) {
std::visit([&](auto&& ordering) {
using T = std::decay_t<decltype(ordering)>;
if constexpr (!std::is_same_v<T, select_statement::ann_vector>) {
if constexpr (std::is_same_v<T, select_statement::ann_vector>) {
prepared_ann_ordering = prepare_ann_ordering(*schema, ctx, db);
} else {
SCYLLA_ASSERT(!for_view);
verify_ordering_is_allowed(*_parameters, *restrictions);
prepared_orderings_type prepared_orderings = prepare_orderings(*schema);
@@ -2422,7 +2307,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
}
std::vector<sstring> warnings;
if (!is_ann_query) {
if (!prepared_ann_ordering.has_value()) {
check_needs_filtering(*restrictions, db.get_config().strict_allow_filtering(), warnings);
ensure_filtering_columns_retrieval(db, *selection, *restrictions);
}
@@ -2476,21 +2361,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
&& restrictions->partition_key_restrictions_size() == schema->partition_key_size());
};
if (strong_consistency::is_strongly_consistent(db, schema->ks_name())) {
stmt = ::make_shared<strong_consistency::select_statement>(
schema,
ctx.bound_variables_size(),
_parameters,
std::move(selection),
std::move(restrictions),
std::move(group_by_cell_indices),
is_reversed_,
std::move(ordering_comparator),
prepare_limit(db, ctx, _limit),
prepare_limit(db, ctx, _per_partition_limit),
stats,
std::move(prepared_attrs));
} else if (_parameters->is_prune_materialized_view()) {
if (_parameters->is_prune_materialized_view()) {
stmt = ::make_shared<cql3::statements::prune_materialized_view_statement>(
schema,
ctx.bound_variables_size(),
@@ -2519,10 +2390,10 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
prepare_limit(db, ctx, _per_partition_limit),
stats,
std::move(prepared_attrs));
} else if (is_ann_query) {
} else if (prepared_ann_ordering) {
stmt = vector_indexed_table_select_statement::prepare(db, schema, ctx.bound_variables_size(), _parameters, std::move(selection), std::move(restrictions),
std::move(group_by_cell_indices), is_reversed_, std::move(ordering_comparator), std::move(ann_ordering_info_opt->_prepared_ann_ordering),
prepare_limit(db, ctx, _limit), prepare_limit(db, ctx, _per_partition_limit), stats, ann_ordering_info_opt->_index, std::move(prepared_attrs));
std::move(group_by_cell_indices), is_reversed_, std::move(ordering_comparator), std::move(*prepared_ann_ordering),
prepare_limit(db, ctx, _limit), prepare_limit(db, ctx, _per_partition_limit), stats, std::move(prepared_attrs));
} else if (restrictions->uses_secondary_indexing()) {
stmt = view_indexed_table_select_statement::prepare(
db,
@@ -2554,7 +2425,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
std::move(prepared_attrs)
);
} else if (service::broadcast_tables::is_broadcast_table_statement(keyspace(), column_family())) {
stmt = ::make_shared<cql3::statements::broadcast_select_statement>(
stmt = ::make_shared<cql3::statements::strongly_consistent_select_statement>(
schema,
ctx.bound_variables_size(),
_parameters,
@@ -2744,6 +2615,28 @@ void select_statement::verify_ordering_is_valid(const prepared_orderings_type& o
}
}
select_statement::prepared_ann_ordering_type select_statement::prepare_ann_ordering(const schema& schema, prepare_context& ctx, data_dictionary::database db) const {
auto [column_id, ordering] = _parameters->orderings().front();
const auto& ann_vector = std::get_if<select_statement::ann_vector>(&ordering);
SCYLLA_ASSERT(ann_vector);
::shared_ptr<column_identifier> column = column_id->prepare_column_identifier(schema);
const column_definition* def = schema.get_column_definition(column->name());
if (!def) {
throw exceptions::invalid_request_exception(
fmt::format("Undefined column name {}", column->text()));
}
if (!def->type->is_vector() || static_cast<const vector_type_impl*>(def->type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
throw exceptions::invalid_request_exception("ANN ordering is only supported on float vector indexes");
}
auto e = expr::prepare_expression(*ann_vector, db, keyspace(), nullptr, def->column_specification);
expr::fill_prepare_context(e, ctx);
return std::make_pair(std::move(def), std::move(e));
}
select_statement::ordering_comparator_type select_statement::get_ordering_comparator(const prepared_orderings_type& orderings,
selection::selection& selection,
const restrictions::statement_restrictions& restrictions) {

View File

@@ -22,7 +22,6 @@
#include "locator/host_id.hh"
#include "service/cas_shard.hh"
#include "vector_search/vector_store_client.hh"
#include "vector_search/filter.hh"
namespace service {
class client_state;
@@ -363,7 +362,6 @@ private:
class vector_indexed_table_select_statement : public select_statement {
secondary_index::index _index;
prepared_ann_ordering_type _prepared_ann_ordering;
vector_search::prepared_filter _prepared_filter;
mutable gc_clock::time_point _query_start_time_point;
public:
@@ -373,13 +371,13 @@ public:
lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<cql3::attributes> attrs);
std::optional<expr::expression> per_partition_limit, cql_stats& stats, std::unique_ptr<cql3::attributes> attrs);
vector_indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms, lw_shared_ptr<const parameters> parameters,
::shared_ptr<selection::selection> selection, ::shared_ptr<const restrictions::statement_restrictions> restrictions,
::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed, ordering_comparator_type ordering_comparator,
prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit, std::optional<expr::expression> per_partition_limit,
cql_stats& stats, const secondary_index::index& index, vector_search::prepared_filter prepared_filter, std::unique_ptr<cql3::attributes> attrs);
cql_stats& stats, const secondary_index::index& index, std::unique_ptr<cql3::attributes> attrs);
private:
future<::shared_ptr<cql_transport::messages::result_message>> do_execute(
@@ -387,7 +385,7 @@ private:
void update_stats() const;
lw_shared_ptr<query::read_command> prepare_command_for_base_query(query_processor& qp, service::query_state& state, const query_options& options, uint64_t fetch_limit) const;
lw_shared_ptr<query::read_command> prepare_command_for_base_query(query_processor& qp, service::query_state& state, const query_options& options) const;
std::vector<float> get_ann_ordering_vector(const query_options& options) const;

View File

@@ -1,82 +0,0 @@
/*
* Copyright (C) 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "modification_statement.hh"
#include "transport/messages/result_message.hh"
#include "cql3/query_processor.hh"
#include "service/strong_consistency/coordinator.hh"
#include "cql3/statements/strong_consistency/statement_helpers.hh"
namespace cql3::statements::strong_consistency {
static logging::logger logger("sc_modification_statement");
modification_statement::modification_statement(shared_ptr<base_statement> statement)
: cql_statement_opt_metadata(&timeout_config::write_timeout)
, _statement(std::move(statement))
{
}
using result_message = cql_transport::messages::result_message;
future<shared_ptr<result_message>> modification_statement::execute(query_processor& qp, service::query_state& qs,
const query_options& options, std::optional<service::group0_guard> guard) const
{
return execute_without_checking_exception_message(qp, qs, options, std::move(guard))
.then(cql_transport::messages::propagate_exception_as_future<shared_ptr<result_message>>);
}
future<shared_ptr<result_message>> modification_statement::execute_without_checking_exception_message(
query_processor& qp, service::query_state& qs, const query_options& options,
std::optional<service::group0_guard> guard) const
{
auto json_cache = base_statement::json_cache_opt{};
const auto keys = _statement->build_partition_keys(options, json_cache);
if (keys.size() != 1 || !query::is_single_partition(keys[0])) {
throw exceptions::invalid_request_exception("Strongly consistent queries can only target a single partition");
}
if (_statement->requires_read()) {
throw exceptions::invalid_request_exception("Strongly consistent updates don't support data prefetch");
}
auto [coordinator, holder] = qp.acquire_strongly_consistent_coordinator();
const auto mutate_result = co_await coordinator.get().mutate(_statement->s,
keys[0].start()->value().token(),
[&](api::timestamp_type ts) {
const auto prefetch_data = update_parameters::prefetch_data(_statement->s);
const auto ttl = _statement->get_time_to_live(options);
const auto params = update_parameters(_statement->s, options, ts, ttl, prefetch_data);
const auto ranges = _statement->create_clustering_ranges(options, json_cache);
auto muts = _statement->apply_updates(keys, ranges, params, json_cache);
if (muts.size() != 1) {
on_internal_error(logger, ::format("statement '{}' has unexpected number of mutations {}",
raw_cql_statement, muts.size()));
}
return std::move(*muts.begin());
});
using namespace service::strong_consistency;
if (const auto* redirect = get_if<need_redirect>(&mutate_result)) {
co_return co_await redirect_statement(qp, options, redirect->target);
}
co_return seastar::make_shared<result_message::void_message>();
}
future<> modification_statement::check_access(query_processor& qp, const service::client_state& state) const {
return _statement->check_access(qp, state);
}
uint32_t modification_statement::get_bound_terms() const {
return _statement->get_bound_terms();
}
bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return _statement->depends_on(ks_name, cf_name);
}
}

View File

@@ -1,39 +0,0 @@
/*
* Copyright (C) 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include "cql3/cql_statement.hh"
#include "cql3/expr/expression.hh"
#include "cql3/statements/modification_statement.hh"
namespace cql3::statements::strong_consistency {
class modification_statement : public cql_statement_opt_metadata {
using result_message = cql_transport::messages::result_message;
using base_statement = cql3::statements::modification_statement;
shared_ptr<base_statement> _statement;
public:
modification_statement(shared_ptr<base_statement> statement);
future<shared_ptr<result_message>> execute(query_processor& qp, service::query_state& state,
const query_options& options, std::optional<service::group0_guard> guard) const override;
future<shared_ptr<result_message>> execute_without_checking_exception_message(query_processor& qp,
service::query_state& qs, const query_options& options,
std::optional<service::group0_guard> guard) const override;
future<> check_access(query_processor& qp, const service::client_state& state) const override;
uint32_t get_bound_terms() const override;
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
};
}

View File

@@ -1,56 +0,0 @@
/*
* Copyright (C) 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "select_statement.hh"
#include "query/query-request.hh"
#include "cql3/query_processor.hh"
#include "service/strong_consistency/coordinator.hh"
#include "cql3/statements/strong_consistency/statement_helpers.hh"
namespace cql3::statements::strong_consistency {
using result_message = cql_transport::messages::result_message;
future<::shared_ptr<result_message>> select_statement::do_execute(query_processor& qp,
service::query_state& state,
const query_options& options) const
{
const auto key_ranges = _restrictions->get_partition_key_ranges(options);
if (key_ranges.size() != 1 || !query::is_single_partition(key_ranges[0])) {
throw exceptions::invalid_request_exception("Strongly consistent queries can only target a single partition");
}
const auto now = gc_clock::now();
auto read_command = make_lw_shared<query::read_command>(
_query_schema->id(),
_query_schema->version(),
make_partition_slice(options),
query::max_result_size(query::result_memory_limiter::maximum_result_size),
query::tombstone_limit(query::tombstone_limit::max),
query::row_limit(get_inner_loop_limit(get_limit(options, _limit), _selection->is_aggregate())),
query::partition_limit(query::max_partitions),
now,
tracing::make_trace_info(state.get_trace_state()),
query_id::create_null_id(),
query::is_first_page::no,
options.get_timestamp(state));
const auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
auto [coordinator, holder] = qp.acquire_strongly_consistent_coordinator();
auto query_result = co_await coordinator.get().query(_query_schema, *read_command,
key_ranges, state.get_trace_state(), timeout);
using namespace service::strong_consistency;
if (const auto* redirect = get_if<need_redirect>(&query_result)) {
co_return co_await redirect_statement(qp, options, redirect->target);
}
co_return co_await process_results(get<lw_shared_ptr<query::result>>(std::move(query_result)),
read_command, options, now);
}
}

View File

@@ -1,26 +0,0 @@
/*
* Copyright (C) 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include "cql3/cql_statement.hh"
#include "cql3/statements/select_statement.hh"
namespace cql3::statements::strong_consistency {
class select_statement : public cql3::statements::select_statement {
using result_message = cql_transport::messages::result_message;
public:
using cql3::statements::select_statement::select_statement;
future<::shared_ptr<cql_transport::messages::result_message>> do_execute(query_processor& qp,
service::query_state& state, const query_options& options) const override;
};
}

View File

@@ -1,37 +0,0 @@
/*
* Copyright (C) 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "statement_helpers.hh"
#include "transport/messages/result_message_base.hh"
#include "cql3/query_processor.hh"
#include "replica/database.hh"
#include "locator/tablet_replication_strategy.hh"
namespace cql3::statements::strong_consistency {
future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement(query_processor& qp,
const query_options& options,
const locator::tablet_replica& target)
{
const auto my_host_id = qp.db().real_database().get_token_metadata().get_topology().my_host_id();
if (target.host != my_host_id) {
throw exceptions::invalid_request_exception(format(
"Strongly consistent writes can be executed only on the leader node, "
"leader id {}, current host id {}",
target.host, my_host_id));
}
auto&& func_values_cache = const_cast<cql3::query_options&>(options).take_cached_pk_function_calls();
co_return qp.bounce_to_shard(target.shard, std::move(func_values_cache));
}
bool is_strongly_consistent(data_dictionary::database db, std::string_view ks_name) {
const auto* tablet_aware_rs = db.find_keyspace(ks_name).get_replication_strategy().maybe_as_tablet_aware();
return tablet_aware_rs && tablet_aware_rs->get_consistency() != data_dictionary::consistency_config_option::eventual;
}
}

View File

@@ -1,23 +0,0 @@
/*
* Copyright (C) 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include "cql3/cql_statement.hh"
#include "locator/tablets.hh"
namespace cql3::statements::strong_consistency {
future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement(
query_processor& qp,
const query_options& options,
const locator::tablet_replica& target);
bool is_strongly_consistent(data_dictionary::database db, std::string_view ks_name);
}

View File

@@ -9,7 +9,7 @@
*/
#include "cql3/statements/broadcast_modification_statement.hh"
#include "cql3/statements/strongly_consistent_modification_statement.hh"
#include <optional>
@@ -28,11 +28,11 @@
namespace cql3 {
static logging::logger logger("broadcast_modification_statement");
static logging::logger logger("strongly_consistent_modification_statement");
namespace statements {
broadcast_modification_statement::broadcast_modification_statement(
strongly_consistent_modification_statement::strongly_consistent_modification_statement(
uint32_t bound_terms,
schema_ptr schema,
broadcast_tables::prepared_update query)
@@ -43,7 +43,7 @@ broadcast_modification_statement::broadcast_modification_statement(
{ }
future<::shared_ptr<cql_transport::messages::result_message>>
broadcast_modification_statement::execute(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
strongly_consistent_modification_statement::execute(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
return execute_without_checking_exception_message(qp, qs, options, std::move(guard))
.then(cql_transport::messages::propagate_exception_as_future<shared_ptr<cql_transport::messages::result_message>>);
}
@@ -63,7 +63,7 @@ evaluate_prepared(
}
future<::shared_ptr<cql_transport::messages::result_message>>
broadcast_modification_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
strongly_consistent_modification_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
if (this_shard_id() != 0) {
co_return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(0, cql3::computed_function_values{});
}
@@ -103,11 +103,11 @@ broadcast_modification_statement::execute_without_checking_exception_message(que
), result);
}
uint32_t broadcast_modification_statement::get_bound_terms() const {
uint32_t strongly_consistent_modification_statement::get_bound_terms() const {
return _bound_terms;
}
future<> broadcast_modification_statement::check_access(query_processor& qp, const service::client_state& state) const {
future<> strongly_consistent_modification_statement::check_access(query_processor& qp, const service::client_state& state) const {
auto f = state.has_column_family_access(_schema->ks_name(), _schema->cf_name(), auth::permission::MODIFY);
if (_query.value_condition.has_value()) {
f = f.then([this, &state] {
@@ -117,7 +117,7 @@ future<> broadcast_modification_statement::check_access(query_processor& qp, con
return f;
}
bool broadcast_modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
bool strongly_consistent_modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return _schema->ks_name() == ks_name && (!cf_name || _schema->cf_name() == *cf_name);
}

View File

@@ -27,13 +27,13 @@ struct prepared_update {
}
class broadcast_modification_statement : public cql_statement_opt_metadata {
class strongly_consistent_modification_statement : public cql_statement_opt_metadata {
const uint32_t _bound_terms;
const schema_ptr _schema;
const broadcast_tables::prepared_update _query;
public:
broadcast_modification_statement(uint32_t bound_terms, schema_ptr schema, broadcast_tables::prepared_update query);
strongly_consistent_modification_statement(uint32_t bound_terms, schema_ptr schema, broadcast_tables::prepared_update query);
virtual future<::shared_ptr<cql_transport::messages::result_message>>
execute(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const override;

View File

@@ -9,7 +9,7 @@
*/
#include "cql3/statements/broadcast_select_statement.hh"
#include "cql3/statements/strongly_consistent_select_statement.hh"
#include <seastar/core/future.hh>
#include <seastar/core/on_internal_error.hh>
@@ -24,7 +24,7 @@ namespace cql3 {
namespace statements {
static logging::logger logger("broadcast_select_statement");
static logging::logger logger("strongly_consistent_select_statement");
static
expr::expression get_key(const cql3::expr::expression& partition_key_restrictions) {
@@ -58,7 +58,7 @@ bool is_selecting_only_value(const cql3::selection::selection& selection) {
selection.get_columns()[0]->name() == "value";
}
broadcast_select_statement::broadcast_select_statement(schema_ptr schema, uint32_t bound_terms,
strongly_consistent_select_statement::strongly_consistent_select_statement(schema_ptr schema, uint32_t bound_terms,
lw_shared_ptr<const parameters> parameters,
::shared_ptr<selection::selection> selection,
::shared_ptr<const restrictions::statement_restrictions> restrictions,
@@ -73,7 +73,7 @@ broadcast_select_statement::broadcast_select_statement(schema_ptr schema, uint32
_query{prepare_query()}
{ }
broadcast_tables::prepared_select broadcast_select_statement::prepare_query() const {
broadcast_tables::prepared_select strongly_consistent_select_statement::prepare_query() const {
if (!is_selecting_only_value(*_selection)) {
throw service::broadcast_tables::unsupported_operation_error("only 'value' selector is allowed");
}
@@ -94,7 +94,7 @@ evaluate_prepared(
}
future<::shared_ptr<cql_transport::messages::result_message>>
broadcast_select_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
strongly_consistent_select_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
if (this_shard_id() != 0) {
co_return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(0, cql3::computed_function_values{});
}

View File

@@ -25,12 +25,12 @@ struct prepared_select {
}
class broadcast_select_statement : public select_statement {
class strongly_consistent_select_statement : public select_statement {
const broadcast_tables::prepared_select _query;
broadcast_tables::prepared_select prepare_query() const;
public:
broadcast_select_statement(schema_ptr schema,
strongly_consistent_select_statement(schema_ptr schema,
uint32_t bound_terms,
lw_shared_ptr<const parameters> parameters,
::shared_ptr<selection::selection> selection,

View File

@@ -13,7 +13,7 @@
#include "cql3/expr/expression.hh"
#include "cql3/expr/evaluate.hh"
#include "cql3/expr/expr-utils.hh"
#include "cql3/statements/broadcast_modification_statement.hh"
#include "cql3/statements/strongly_consistent_modification_statement.hh"
#include "service/broadcast_tables/experimental/lang.hh"
#include "raw/update_statement.hh"
@@ -333,7 +333,7 @@ std::optional<expr::expression> get_value_condition(const expr::expression& the_
return binop->rhs;
}
::shared_ptr<broadcast_modification_statement>
::shared_ptr<strongly_consistent_modification_statement>
update_statement::prepare_for_broadcast_tables() const {
if (attrs) {
if (attrs->is_time_to_live_set()) {
@@ -359,7 +359,7 @@ update_statement::prepare_for_broadcast_tables() const {
.value_condition = get_value_condition(_condition),
};
return ::make_shared<broadcast_modification_statement>(
return ::make_shared<strongly_consistent_modification_statement>(
get_bound_terms(),
s,
query

View File

@@ -45,7 +45,7 @@ private:
virtual void execute_operations_for_key(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const json_cache_opt& json_cache) const;
public:
virtual ::shared_ptr<broadcast_modification_statement> prepare_for_broadcast_tables() const override;
virtual ::shared_ptr<strongly_consistent_modification_statement> prepare_for_broadcast_tables() const override;
};
/*

View File

@@ -55,21 +55,8 @@ int32_t batchlog_shard_of(db_clock::time_point written_at) {
return hash & ((1ULL << batchlog_shard_bits) - 1);
}
bool is_batchlog_v1(const schema& schema) {
return schema.cf_name() == system_keyspace::BATCHLOG;
}
std::pair<partition_key, clustering_key>
get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, int32_t batchlog_shard, db_clock::time_point written_at, std::optional<utils::UUID> id) {
if (is_batchlog_v1(schema)) {
if (!id) {
on_internal_error(blogger, "get_batchlog_key(): key for batchlog v1 requires batchlog id");
}
auto pkey = partition_key::from_single_value(schema, {serialized(*id)});
auto ckey = clustering_key::make_empty();
return std::pair(std::move(pkey), std::move(ckey));
}
auto pkey = partition_key::from_exploded(schema, {serialized(version), serialized(int8_t(stage)), serialized(batchlog_shard)});
std::vector<bytes> ckey_components;
@@ -98,14 +85,6 @@ mutation get_batchlog_mutation_for(schema_ptr schema, managed_bytes data, int32_
auto cdef_data = schema->get_column_definition(to_bytes("data"));
m.set_cell(ckey, *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));
if (is_batchlog_v1(*schema)) {
auto cdef_version = schema->get_column_definition(to_bytes("version"));
m.set_cell(ckey, *cdef_version, atomic_cell::make_live(*cdef_version->type, timestamp, serialized(version)));
auto cdef_written_at = schema->get_column_definition(to_bytes("written_at"));
m.set_cell(ckey, *cdef_written_at, atomic_cell::make_live(*cdef_written_at->type, timestamp, serialized(now)));
}
return m;
}
@@ -143,10 +122,9 @@ mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clo
const std::chrono::seconds db::batchlog_manager::replay_interval;
const uint32_t db::batchlog_manager::page_size;
db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config)
db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
: _qp(qp)
, _sys_ks(sys_ks)
, _fs(fs)
, _replay_timeout(config.replay_timeout)
, _replay_rate(config.replay_rate)
, _delay(config.delay)
@@ -322,206 +300,149 @@ future<> db::batchlog_manager::maybe_migrate_v1_to_v2() {
});
}
namespace {
using clock_type = db_clock::rep;
struct replay_stats {
std::optional<db_clock::time_point> min_too_fresh;
bool need_cleanup = false;
};
} // anonymous namespace
static future<db::all_batches_replayed> process_batch(
cql3::query_processor& qp,
db::batchlog_manager::stats& stats,
db::batchlog_manager::post_replay_cleanup cleanup,
utils::rate_limiter& limiter,
schema_ptr schema,
std::unordered_map<int32_t, replay_stats>& replay_stats_per_shard,
const db_clock::time_point now,
db_clock::duration replay_timeout,
std::chrono::seconds write_timeout,
const cql3::untyped_result_set::row& row) {
const bool is_v1 = db::is_batchlog_v1(*schema);
const auto stage = is_v1 ? db::batchlog_stage::initial : static_cast<db::batchlog_stage>(row.get_as<int8_t>("stage"));
const auto batch_shard = is_v1 ? 0 : row.get_as<int32_t>("shard");
auto written_at = row.get_as<db_clock::time_point>("written_at");
auto id = row.get_as<utils::UUID>("id");
// enough time for the actual write + batchlog entry mutation delivery (two separate requests).
auto timeout = replay_timeout;
if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
blogger.debug("Skipping batch replay due to skip_batch_replay injection");
co_return db::all_batches_replayed::no;
}
auto data = row.get_blob_unfragmented("data");
blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
utils::chunked_vector<mutation> mutations;
bool send_failed = false;
auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
try {
utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
auto in = ser::as_input_stream(data);
while (in.size()) {
auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
const auto tbl = qp.db().try_find_table(fm.column_family_id());
if (!tbl) {
continue;
}
if (written_at <= tbl->get_truncation_time()) {
continue;
}
schema_ptr s = tbl->schema();
if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
}
fms.emplace_back(std::move(fm), std::move(s));
}
if (now < written_at + timeout) {
blogger.debug("Skipping replay of {}, too fresh", id);
shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
co_return db::all_batches_replayed::no;
}
auto size = data.size();
for (const auto& [fm, s] : fms) {
mutations.emplace_back(fm.to_mutation(s));
co_await coroutine::maybe_yield();
}
if (!mutations.empty()) {
const auto ttl = [written_at]() -> clock_type {
/*
* Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
* This ensures that deletes aren't "undone" by an old batch replay.
*/
auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
warn(unimplemented::cause::HINT);
#if 0
for (auto& m : *mutations) {
unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
}
#endif
return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
}();
if (ttl > 0) {
// Origin does the send manually, however I can't see a super great reason to do so.
// Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
// in both cases.
// FIXME: verify that the above is reasonably true.
co_await limiter.reserve(size);
stats.write_attempts += mutations.size();
auto timeout = db::timeout_clock::now() + write_timeout;
if (cleanup) {
co_await qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
} else {
co_await qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
}
}
}
} catch (data_dictionary::no_such_keyspace& ex) {
// should probably ignore and drop the batch
} catch (const data_dictionary::no_such_column_family&) {
// As above -- we should drop the batch if the table doesn't exist anymore.
} catch (...) {
blogger.warn("Replay failed (will retry): {}", std::current_exception());
// timeout, overload etc.
// Do _not_ remove the batch, assuning we got a node write error.
// Since we don't have hints (which origin is satisfied with),
// we have to resort to keeping this batch to next lap.
if (is_v1 || !cleanup || stage == db::batchlog_stage::failed_replay) {
co_return db::all_batches_replayed::no;
}
send_failed = true;
}
auto& sp = qp.proxy();
if (send_failed) {
blogger.debug("Moving batch {} to stage failed_replay", id);
auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, db::batchlog_stage::failed_replay, written_at, id);
co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
}
// delete batch
auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
co_await qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
shard_written_at.need_cleanup = true;
co_return db::all_batches_replayed(!send_failed);
}
future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v1(post_replay_cleanup) {
db::all_batches_replayed all_replayed = all_batches_replayed::yes;
// rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
// max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
utils::rate_limiter limiter(throttle);
auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
// Use a stable `now` across all batches, so skip/replay decisions are the
// same across a while prefix of written_at (across all ids).
const auto now = db_clock::now();
auto batch = [this, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
all_replayed = all_replayed && co_await process_batch(_qp, _stats, post_replay_cleanup::no, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
co_return stop_iteration::no;
};
co_await with_gate(_gate, [this, &all_replayed, batch = std::move(batch)] () mutable -> future<> {
blogger.debug("Started replayAllFailedBatches");
co_await utils::get_local_injector().inject("add_delay_to_batch_replay", std::chrono::milliseconds(1000));
auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
co_await _qp.query_internal(
format("SELECT * FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
db::consistency_level::ONE,
{},
page_size,
batch);
blogger.debug("Finished replayAllFailedBatches with all_replayed: {}", all_replayed);
});
co_return all_replayed;
}
future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v2(post_replay_cleanup cleanup) {
future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
co_await maybe_migrate_v1_to_v2();
typedef db_clock::rep clock_type;
db::all_batches_replayed all_replayed = all_batches_replayed::yes;
// rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
// max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
utils::rate_limiter limiter(throttle);
auto limiter = make_lw_shared<utils::rate_limiter>(throttle);
auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
struct replay_stats {
std::optional<db_clock::time_point> min_too_fresh;
bool need_cleanup = false;
};
std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
// Use a stable `now` across all batches, so skip/replay decisions are the
// same across a while prefix of written_at (across all ids).
const auto now = db_clock::now();
auto batch = [this, cleanup, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
all_replayed = all_replayed && co_await process_batch(_qp, _stats, cleanup, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
const auto stage = static_cast<batchlog_stage>(row.get_as<int8_t>("stage"));
const auto batch_shard = row.get_as<int32_t>("shard");
auto written_at = row.get_as<db_clock::time_point>("written_at");
auto id = row.get_as<utils::UUID>("id");
// enough time for the actual write + batchlog entry mutation delivery (two separate requests).
auto timeout = _replay_timeout;
if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
blogger.debug("Skipping batch replay due to skip_batch_replay injection");
all_replayed = all_batches_replayed::no;
co_return stop_iteration::no;
}
auto data = row.get_blob_unfragmented("data");
blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
utils::chunked_vector<mutation> mutations;
bool send_failed = false;
auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
try {
utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
auto in = ser::as_input_stream(data);
while (in.size()) {
auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
const auto tbl = _qp.db().try_find_table(fm.column_family_id());
if (!tbl) {
continue;
}
if (written_at <= tbl->get_truncation_time()) {
continue;
}
schema_ptr s = tbl->schema();
if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
}
fms.emplace_back(std::move(fm), std::move(s));
}
if (now < written_at + timeout) {
blogger.debug("Skipping replay of {}, too fresh", id);
shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
co_return stop_iteration::no;
}
auto size = data.size();
for (const auto& [fm, s] : fms) {
mutations.emplace_back(fm.to_mutation(s));
co_await coroutine::maybe_yield();
}
if (!mutations.empty()) {
const auto ttl = [written_at]() -> clock_type {
/*
* Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
* This ensures that deletes aren't "undone" by an old batch replay.
*/
auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
warn(unimplemented::cause::HINT);
#if 0
for (auto& m : *mutations) {
unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
}
#endif
return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
}();
if (ttl > 0) {
// Origin does the send manually, however I can't see a super great reason to do so.
// Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
// in both cases.
// FIXME: verify that the above is reasonably true.
co_await limiter->reserve(size);
_stats.write_attempts += mutations.size();
auto timeout = db::timeout_clock::now() + write_timeout;
if (cleanup) {
co_await _qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
} else {
co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
}
}
}
} catch (data_dictionary::no_such_keyspace& ex) {
// should probably ignore and drop the batch
} catch (const data_dictionary::no_such_column_family&) {
// As above -- we should drop the batch if the table doesn't exist anymore.
} catch (...) {
blogger.warn("Replay failed (will retry): {}", std::current_exception());
all_replayed = all_batches_replayed::no;
// timeout, overload etc.
// Do _not_ remove the batch, assuning we got a node write error.
// Since we don't have hints (which origin is satisfied with),
// we have to resort to keeping this batch to next lap.
if (!cleanup || stage == batchlog_stage::failed_replay) {
co_return stop_iteration::no;
}
send_failed = true;
}
auto& sp = _qp.proxy();
if (send_failed) {
blogger.debug("Moving batch {} to stage failed_replay", id);
auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, batchlog_stage::failed_replay, written_at, id);
co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
}
// delete batch
auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
shard_written_at.need_cleanup = true;
co_return stop_iteration::no;
};
@@ -580,10 +501,3 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
co_return all_replayed;
}
future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
if (_fs.batchlog_v2) {
return replay_all_failed_batches_v2(cleanup);
}
return replay_all_failed_batches_v1(cleanup);
}

View File

@@ -27,12 +27,6 @@ class query_processor;
} // namespace cql3
namespace gms {
class feature_service;
} // namespace gms
namespace db {
class system_keyspace;
@@ -55,11 +49,6 @@ class batchlog_manager : public peering_sharded_service<batchlog_manager> {
public:
using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;
struct stats {
uint64_t write_attempts = 0;
};
private:
static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
@@ -67,13 +56,14 @@ private:
using clock_type = lowres_clock;
stats _stats;
struct stats {
uint64_t write_attempts = 0;
} _stats;
seastar::metrics::metric_groups _metrics;
cql3::query_processor& _qp;
db::system_keyspace& _sys_ks;
gms::feature_service& _fs;
db_clock::duration _replay_timeout;
uint64_t _replay_rate;
std::chrono::milliseconds _delay;
@@ -94,14 +84,12 @@ private:
future<> maybe_migrate_v1_to_v2();
future<all_batches_replayed> replay_all_failed_batches_v1(post_replay_cleanup cleanup);
future<all_batches_replayed> replay_all_failed_batches_v2(post_replay_cleanup cleanup);
future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
public:
// Takes a QP, not a distributes. Because this object is supposed
// to be per shard and does no dispatching beyond delegating the the
// shard qp (which is what you feed here).
batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config);
batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, batchlog_manager_config config);
// abort the replay loop and return its future.
future<> drain();
@@ -114,7 +102,7 @@ public:
return _last_replay;
}
const stats& get_stats() const {
const stats& stats() const {
return _stats;
}
private:

View File

@@ -323,9 +323,6 @@ void cache_mutation_reader::touch_partition() {
inline
future<> cache_mutation_reader::fill_buffer() {
if (const auto& ex = get_abort_exception(); ex) {
return make_exception_future<>(ex);
}
if (_state == state::before_static_row) {
touch_partition();
auto after_static_row = [this] {

View File

@@ -1986,13 +1986,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
}
continue;
} catch (shutdown_marker&) {
_reserve_segments.abort(std::current_exception());
break;
} catch (...) {
clogger.warn("Exception in segment reservation: {}", std::current_exception());
}
co_await sleep(100ms);
}
_reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
}
future<std::vector<db::commitlog::descriptor>>

View File

@@ -1291,7 +1291,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, ignore_dead_nodes_for_replace(this, "ignore_dead_nodes_for_replace", value_status::Used, "", "List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
, override_decommission(this, "override_decommission", value_status::Deprecated, false, "Set true to force a decommissioned node to join the cluster (cannot be set if consistent-cluster-management is enabled).")
, enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based.")
, allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
, allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild,bootstrap,decommission", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
, enable_compacting_data_for_streaming_and_repair(this, "enable_compacting_data_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, true, "Enable the compacting reader, which compacts the data for streaming and repair (load'n'stream included) before sending it to, or synchronizing it with peers. Can reduce the amount of data to be processed by removing dead data, but adds CPU overhead.")
, enable_tombstone_gc_for_streaming_and_repair(this, "enable_tombstone_gc_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, false,
"If the compacting reader is enabled for streaming and repair (see enable_compacting_data_for_streaming_and_repair), allow it to garbage-collect tombstones."
@@ -1318,7 +1318,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, prometheus_port(this, "prometheus_port", value_status::Used, 9180, "Prometheus port, set to zero to disable.")
, prometheus_address(this, "prometheus_address", value_status::Used, {/* listen_address */}, "Prometheus listening address, defaulting to listen_address if not explicitly set.")
, prometheus_prefix(this, "prometheus_prefix", value_status::Used, "scylla", "Set the prefix of the exported Prometheus metrics. Changing this will break Scylla's dashboard compatibility, do not change unless you know what you are doing.")
, prometheus_allow_protobuf(this, "prometheus_allow_protobuf", value_status::Used, false, "If set allows the experimental Prometheus protobuf with native histogram")
, prometheus_allow_protobuf(this, "prometheus_allow_protobuf", value_status::Used, true, "Enable Prometheus protobuf with native histogram. Set to false to force text exposition format.")
, abort_on_lsa_bad_alloc(this, "abort_on_lsa_bad_alloc", value_status::Used, false, "Abort when allocation in LSA region fails.")
, murmur3_partitioner_ignore_msb_bits(this, "murmur3_partitioner_ignore_msb_bits", value_status::Used, default_murmur3_partitioner_ignore_msb_bits, "Number of most significant token bits to ignore in murmur3 partitioner; increase for very large clusters.")
, unspooled_dirty_soft_limit(this, "unspooled_dirty_soft_limit", value_status::Used, 0.6, "Soft limit of unspooled dirty memory expressed as a portion of the hard limit.")
@@ -1341,7 +1341,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, sstable_compression_user_table_options(this, "sstable_compression_user_table_options", value_status::Used, compression_parameters{compression_parameters::algorithm::lz4_with_dicts},
"Server-global user table compression options. If enabled, all user tables"
"will be compressed using the provided options, unless overridden"
"by compression options in the table schema. User tables are all tables in non-system keyspaces. The available options are:\n"
"by compression options in the table schema. The available options are:\n"
"* sstable_compression: The compression algorithm to use. Supported values: LZ4Compressor, LZ4WithDictsCompressor (default), SnappyCompressor, DeflateCompressor, ZstdCompressor, ZstdWithDictsCompressor, '' (empty string; disables compression).\n"
"* chunk_length_in_kb: (Default: 4) The size of chunks to compress in kilobytes. Allowed values are powers of two between 1 and 128.\n"
"* crc_check_chance: (Default: 1.0) Not implemented (option value is ignored).\n"
@@ -1584,14 +1584,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, enable_create_table_with_compact_storage(this, "enable_create_table_with_compact_storage", liveness::LiveUpdate, value_status::Used, false, "Enable the deprecated feature of CREATE TABLE WITH COMPACT STORAGE. This feature will eventually be removed in a future version.")
, rf_rack_valid_keyspaces(this, "rf_rack_valid_keyspaces", liveness::MustRestart, value_status::Used, false,
"Enforce RF-rack-valid keyspaces. Additionally, if there are existing RF-rack-invalid "
"keyspaces, attempting to start a node with this option ON will fail. "
"DEPRECATED. Use enforce_rack_list instead.")
, enforce_rack_list(this, "enforce_rack_list", liveness::MustRestart, value_status::Used, false,
"Enforce rack list for tablet keyspaces. "
"When the option is on, CREATE STATEMENT expands numeric rfs to rack lists "
"and ALTER STATEMENT is allowed only when rack lists are used in all DCs."
"Additionally, if there are existing tablet keyspaces with numeric rf in any DC "
"attempting to start a node with this option ON will fail.")
"keyspaces, attempting to start a node with this option ON will fail.")
// FIXME: make frequency per table in order to reduce work in each iteration.
// Bigger tables will take longer to be resized. similar-sized tables can be batched into same iteration.
, tablet_load_stats_refresh_interval_in_seconds(this, "tablet_load_stats_refresh_interval_in_seconds", liveness::LiveUpdate, value_status::Used, 60,
@@ -1792,21 +1785,6 @@ const db::extensions& db::config::extensions() const {
return *_extensions;
}
compression_parameters db::config::get_sstable_compression_user_table_options(bool dicts_feature_enabled) const {
if (sstable_compression_user_table_options.is_set()
|| dicts_feature_enabled
|| !sstable_compression_user_table_options().uses_dictionary_compressor()) {
return sstable_compression_user_table_options();
} else {
// Fall back to non-dict if dictionary compression is not enabled cluster-wide.
auto options = sstable_compression_user_table_options();
auto params = options.get_options();
auto algo = compression_parameters::non_dict_equivalent(options.get_algorithm());
params[compression_parameters::SSTABLE_COMPRESSION] = sstring(compression_parameters::algorithm_to_name(algo));
return compression_parameters{params};
}
}
std::map<sstring, db::experimental_features_t::feature> db::experimental_features_t::map() {
// We decided against using the construct-on-first-use idiom here:
// https://github.com/scylladb/scylla/pull/5369#discussion_r353614807

View File

@@ -419,13 +419,7 @@ public:
named_value<bool> enable_sstables_mc_format;
named_value<bool> enable_sstables_md_format;
named_value<sstring> sstable_format;
// NOTE: Do not use this option directly.
// Use get_sstable_compression_user_table_options() instead.
named_value<compression_parameters> sstable_compression_user_table_options;
compression_parameters get_sstable_compression_user_table_options(bool dicts_feature_enabled) const;
named_value<bool> sstable_compression_dictionaries_allow_in_ddl;
named_value<bool> sstable_compression_dictionaries_enable_writing;
named_value<float> sstable_compression_dictionaries_memory_budget_fraction;
@@ -605,7 +599,6 @@ public:
named_value<bool> enable_create_table_with_compact_storage;
named_value<bool> rf_rack_valid_keyspaces;
named_value<bool> enforce_rack_list;
named_value<uint32_t> tablet_load_stats_refresh_interval_in_seconds;
named_value<bool> force_capacity_based_balancing;

View File

@@ -31,23 +31,19 @@ size_t quorum_for(const locator::effective_replication_map& erm) {
return replication_factor ? (replication_factor / 2) + 1 : 0;
}
static size_t get_replication_factor_for_dc(const locator::effective_replication_map& erm, const sstring& dc) {
size_t local_quorum_for(const locator::effective_replication_map& erm, const sstring& dc) {
using namespace locator;
const auto& rs = erm.get_replication_strategy();
if (rs.get_type() == replication_strategy_type::network_topology) {
const network_topology_strategy* nts =
const network_topology_strategy* nrs =
static_cast<const network_topology_strategy*>(&rs);
return nts->get_replication_factor(dc);
size_t replication_factor = nrs->get_replication_factor(dc);
return replication_factor ? (replication_factor / 2) + 1 : 0;
}
return erm.get_replication_factor();
}
size_t local_quorum_for(const locator::effective_replication_map& erm, const sstring& dc) {
auto rf = get_replication_factor_for_dc(erm, dc);
return rf ? (rf / 2) + 1 : 0;
return quorum_for(erm);
}
size_t block_for_local_serial(const locator::effective_replication_map& erm) {
@@ -192,30 +188,18 @@ void assure_sufficient_live_nodes(
return pending <= live ? live - pending : 0;
};
auto make_rf_zero_error_msg = [cl] (const sstring& local_dc) {
return format("Cannot achieve consistency level {} in datacenter '{}' with replication factor 0. "
"Ensure the keyspace is replicated to this datacenter or use a non-local consistency level.", cl, local_dc);
};
const auto& topo = erm.get_topology();
const sstring& local_dc = topo.get_datacenter();
switch (cl) {
case consistency_level::ANY:
// local hint is acceptable, and local node is always live
break;
case consistency_level::LOCAL_ONE:
if (size_t local_rf = get_replication_factor_for_dc(erm, local_dc); local_rf == 0) {
throw exceptions::unavailable_exception(make_rf_zero_error_msg(local_dc), cl, 1, 0);
}
if (topo.count_local_endpoints(live_endpoints) < topo.count_local_endpoints(pending_endpoints) + 1) {
throw exceptions::unavailable_exception(cl, 1, 0);
}
break;
case consistency_level::LOCAL_QUORUM: {
if (size_t local_rf = get_replication_factor_for_dc(erm, local_dc); local_rf == 0) {
throw exceptions::unavailable_exception(make_rf_zero_error_msg(local_dc), cl, need, 0);
}
size_t local_live = topo.count_local_endpoints(live_endpoints);
size_t pending = topo.count_local_endpoints(pending_endpoints);
if (local_live < need + pending) {

View File

@@ -850,7 +850,7 @@ mutation_reader row_cache::make_nonpopulating_reader(schema_ptr schema, reader_p
std::move(permit),
e.key(),
query::clustering_key_filter_ranges(slice.row_ranges(*schema, e.key().key())),
e.partition().read(_tracker.region(), _tracker.memtable_cleaner(), &_tracker, phase_of(pos)),
e.partition().read(_tracker.region(), _tracker.memtable_cleaner(), nullptr, phase_of(pos)),
false,
_tracker.region(),
_read_section,

View File

@@ -1139,17 +1139,14 @@ future<> schema_applier::finalize_tables_and_views() {
// was already dropped (see https://github.com/scylladb/scylla/issues/5614)
for (auto& dropped_view : diff.tables_and_views.local().views.dropped) {
auto s = dropped_view.get();
co_await _ss.local().on_cleanup_for_drop_table(s->id());
co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
}
for (auto& dropped_table : diff.tables_and_views.local().tables.dropped) {
auto s = dropped_table.get();
co_await _ss.local().on_cleanup_for_drop_table(s->id());
co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
}
for (auto& dropped_cdc : diff.tables_and_views.local().cdc.dropped) {
auto s = dropped_cdc.get();
co_await _ss.local().on_cleanup_for_drop_table(s->id());
co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
}

View File

@@ -96,16 +96,16 @@ static logging::logger diff_logger("schema_diff");
/** system.schema_* tables used to store keyspace/table/type attributes prior to C* 3.0 */
namespace db {
namespace {
const auto set_use_schema_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) {
if (builder.ks_name() == schema_tables::NAME) {
builder.enable_schema_commitlog();
const auto set_use_schema_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
if (ks_name == schema_tables::NAME) {
props.enable_schema_commitlog();
}
});
const auto set_group0_table_options =
schema_builder::register_schema_initializer([](schema_builder& builder) {
if (builder.ks_name() == schema_tables::NAME) {
schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
if (ks_name == schema_tables::NAME) {
// all schema tables are group0 tables
builder.set_is_group0_table(true);
props.is_group0_table = true;
}
});
}

View File

@@ -65,7 +65,7 @@ future<> snapshot_ctl::run_snapshot_modify_operation(noncopyable_function<future
});
}
future<> snapshot_ctl::take_snapshot(sstring tag, std::vector<sstring> keyspace_names, snapshot_options opts) {
future<> snapshot_ctl::take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf) {
if (tag.empty()) {
throw std::runtime_error("You must supply a snapshot name.");
}
@@ -74,21 +74,21 @@ future<> snapshot_ctl::take_snapshot(sstring tag, std::vector<sstring> keyspace_
std::ranges::copy(_db.local().get_keyspaces() | std::views::keys, std::back_inserter(keyspace_names));
};
return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), opts, this] () mutable {
return do_take_snapshot(std::move(tag), std::move(keyspace_names), opts);
return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] () mutable {
return do_take_snapshot(std::move(tag), std::move(keyspace_names), sf);
});
}
future<> snapshot_ctl::do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, snapshot_options opts) {
future<> snapshot_ctl::do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf) {
co_await coroutine::parallel_for_each(keyspace_names, [tag, this] (const auto& ks_name) {
return check_snapshot_not_exist(ks_name, tag);
});
co_await coroutine::parallel_for_each(keyspace_names, [this, tag = std::move(tag), opts] (const auto& ks_name) {
return replica::database::snapshot_keyspace_on_all_shards(_db, ks_name, tag, opts);
co_await coroutine::parallel_for_each(keyspace_names, [this, tag = std::move(tag), sf] (const auto& ks_name) {
return replica::database::snapshot_keyspace_on_all_shards(_db, ks_name, tag, bool(sf));
});
}
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts) {
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
if (ks_name.empty()) {
throw std::runtime_error("You must supply a keyspace name");
}
@@ -99,14 +99,14 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
throw std::runtime_error("You must supply a snapshot name.");
}
return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), opts] () mutable {
return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), opts);
return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf] () mutable {
return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf);
});
}
future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts) {
future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
co_await check_snapshot_not_exist(ks_name, tag, tables);
co_await replica::database::snapshot_tables_on_all_shards(_db, ks_name, std::move(tables), std::move(tag), opts);
co_await replica::database::snapshot_tables_on_all_shards(_db, ks_name, std::move(tables), std::move(tag), bool(sf));
}
future<> snapshot_ctl::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names, sstring cf_name) {

View File

@@ -38,14 +38,10 @@ class backup_task_impl;
} // snapshot namespace
struct snapshot_options {
bool skip_flush = false;
gc_clock::time_point created_at = gc_clock::now();
std::optional<gc_clock::time_point> expires_at;
};
class snapshot_ctl : public peering_sharded_service<snapshot_ctl> {
public:
using skip_flush = bool_class<class skip_flush_tag>;
struct table_snapshot_details {
int64_t total;
int64_t live;
@@ -74,8 +70,8 @@ public:
*
* @param tag the tag given to the snapshot; may not be null or empty
*/
future<> take_snapshot(sstring tag, snapshot_options opts = {}) {
return take_snapshot(tag, {}, opts);
future<> take_snapshot(sstring tag, skip_flush sf = skip_flush::no) {
return take_snapshot(tag, {}, sf);
}
/**
@@ -84,7 +80,7 @@ public:
* @param tag the tag given to the snapshot; may not be null or empty
* @param keyspace_names the names of the keyspaces to snapshot; empty means "all"
*/
future<> take_snapshot(sstring tag, std::vector<sstring> keyspace_names, snapshot_options opts = {});
future<> take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf = skip_flush::no);
/**
* Takes the snapshot of multiple tables. A snapshot name must be specified.
@@ -93,7 +89,7 @@ public:
* @param tables a vector of tables names to snapshot
* @param tag the tag given to the snapshot; may not be null or empty
*/
future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts = {});
future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
/**
* Remove the snapshot with the given name from the given keyspaces.
@@ -131,8 +127,8 @@ private:
friend class snapshot::backup_task_impl;
future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, snapshot_options opts = {} );
future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts = {});
future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf = skip_flush::no);
future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
};
}

View File

@@ -42,11 +42,11 @@ extern logging::logger cdc_log;
namespace db {
namespace {
const auto set_wait_for_sync_to_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) {
if ((builder.ks_name() == system_distributed_keyspace::NAME_EVERYWHERE && builder.cf_name() == system_distributed_keyspace::CDC_GENERATIONS_V2) ||
(builder.ks_name() == system_distributed_keyspace::NAME && builder.cf_name() == system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION))
const auto set_wait_for_sync_to_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
if ((ks_name == system_distributed_keyspace::NAME_EVERYWHERE && cf_name == system_distributed_keyspace::CDC_GENERATIONS_V2) ||
(ks_name == system_distributed_keyspace::NAME && cf_name == system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION))
{
builder.set_wait_for_sync_to_commitlog(true);
props.wait_for_sync_to_commitlog = true;
}
});
}

View File

@@ -66,24 +66,24 @@ static thread_local auto sstableinfo_type = user_type_impl::get_instance(
namespace db {
namespace {
const auto set_null_sharder = schema_builder::register_schema_initializer([](schema_builder& builder) {
const auto set_null_sharder = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
// tables in the "system" keyspace which need to use null sharder
static const std::unordered_set<sstring> tables = {
// empty
};
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
builder.set_use_null_sharder(true);
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
props.use_null_sharder = true;
}
});
const auto set_wait_for_sync_to_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) {
const auto set_wait_for_sync_to_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
static const std::unordered_set<sstring> tables = {
system_keyspace::PAXOS,
};
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
builder.set_wait_for_sync_to_commitlog(true);
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
props.wait_for_sync_to_commitlog = true;
}
});
const auto set_use_schema_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) {
const auto set_use_schema_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
static const std::unordered_set<sstring> tables = {
schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
system_keyspace::BROADCAST_KV_STORE,
@@ -108,18 +108,18 @@ namespace {
system_keyspace::ROLE_MEMBERS,
system_keyspace::ROLE_ATTRIBUTES,
system_keyspace::ROLE_PERMISSIONS,
system_keyspace::CDC_LOCAL,
system_keyspace::v3::CDC_LOCAL,
system_keyspace::DICTS,
system_keyspace::VIEW_BUILDING_TASKS,
system_keyspace::CLIENT_ROUTES,
};
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
builder.enable_schema_commitlog();
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
props.enable_schema_commitlog();
}
});
const auto set_group0_table_options =
schema_builder::register_schema_initializer([](schema_builder& builder) {
schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
static const std::unordered_set<sstring> tables = {
// scylla_local may store a replicated tombstone related to schema
// (see `make_group0_schema_version_mutation`), so we include it in the group0 tables list.
@@ -142,8 +142,8 @@ namespace {
system_keyspace::CLIENT_ROUTES,
system_keyspace::REPAIR_TASKS,
};
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
builder.set_is_group0_table(true);
if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
props.is_group0_table = true;
}
});
}
@@ -918,7 +918,7 @@ schema_ptr system_keyspace::corrupt_data() {
return scylla_local;
}
schema_ptr system_keyspace::batches() {
schema_ptr system_keyspace::v3::batches() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, BATCHES), NAME, BATCHES,
// partition key
@@ -946,7 +946,53 @@ schema_ptr system_keyspace::batches() {
return schema;
}
schema_ptr system_keyspace::truncated() {
schema_ptr system_keyspace::v3::built_indexes() {
// identical to ours, but ours otoh is a mix-in of the 3.x series cassandra one
return db::system_keyspace::built_indexes();
}
schema_ptr system_keyspace::v3::local() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, LOCAL), NAME, LOCAL,
// partition key
{{"key", utf8_type}},
// clustering key
{},
// regular columns
{
{"bootstrapped", utf8_type},
{"broadcast_address", inet_addr_type},
{"cluster_name", utf8_type},
{"cql_version", utf8_type},
{"data_center", utf8_type},
{"gossip_generation", int32_type},
{"host_id", uuid_type},
{"listen_address", inet_addr_type},
{"native_protocol_version", utf8_type},
{"partitioner", utf8_type},
{"rack", utf8_type},
{"release_version", utf8_type},
{"rpc_address", inet_addr_type},
{"schema_version", uuid_type},
{"thrift_version", utf8_type},
{"tokens", set_type_impl::get_instance(utf8_type, true)},
{"truncated_at", map_type_impl::get_instance(uuid_type, bytes_type, true)},
},
// static columns
{},
// regular column name type
utf8_type,
// comment
"information about the local node"
);
builder.set_gc_grace_seconds(0);
builder.with_hash_version();
return builder.build(schema_builder::compact_storage::no);
}();
return schema;
}
schema_ptr system_keyspace::v3::truncated() {
static thread_local auto local = [] {
schema_builder builder(generate_legacy_id(NAME, TRUNCATED), NAME, TRUNCATED,
// partition key
@@ -976,7 +1022,7 @@ schema_ptr system_keyspace::truncated() {
thread_local data_type replay_position_type = tuple_type_impl::get_instance({long_type, int32_type});
schema_ptr system_keyspace::commitlog_cleanups() {
schema_ptr system_keyspace::v3::commitlog_cleanups() {
static thread_local auto local = [] {
schema_builder builder(generate_legacy_id(NAME, COMMITLOG_CLEANUPS), NAME, COMMITLOG_CLEANUPS,
// partition key
@@ -1003,7 +1049,47 @@ schema_ptr system_keyspace::commitlog_cleanups() {
return local;
}
schema_ptr system_keyspace::available_ranges() {
schema_ptr system_keyspace::v3::peers() {
// identical
return db::system_keyspace::peers();
}
schema_ptr system_keyspace::v3::peer_events() {
// identical
return db::system_keyspace::peer_events();
}
schema_ptr system_keyspace::v3::range_xfers() {
// identical
return db::system_keyspace::range_xfers();
}
schema_ptr system_keyspace::v3::compaction_history() {
// identical
return db::system_keyspace::compaction_history();
}
schema_ptr system_keyspace::v3::sstable_activity() {
// identical
return db::system_keyspace::sstable_activity();
}
schema_ptr system_keyspace::v3::size_estimates() {
// identical
return db::system_keyspace::size_estimates();
}
schema_ptr system_keyspace::v3::large_partitions() {
// identical
return db::system_keyspace::large_partitions();
}
schema_ptr system_keyspace::v3::scylla_local() {
// identical
return db::system_keyspace::scylla_local();
}
schema_ptr system_keyspace::v3::available_ranges() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, AVAILABLE_RANGES), NAME, AVAILABLE_RANGES,
// partition key
@@ -1026,7 +1112,7 @@ schema_ptr system_keyspace::available_ranges() {
return schema;
}
schema_ptr system_keyspace::views_builds_in_progress() {
schema_ptr system_keyspace::v3::views_builds_in_progress() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, VIEWS_BUILDS_IN_PROGRESS), NAME, VIEWS_BUILDS_IN_PROGRESS,
// partition key
@@ -1049,7 +1135,7 @@ schema_ptr system_keyspace::views_builds_in_progress() {
return schema;
}
schema_ptr system_keyspace::built_views() {
schema_ptr system_keyspace::v3::built_views() {
static thread_local auto schema = [] {
schema_builder builder(generate_legacy_id(NAME, BUILT_VIEWS), NAME, BUILT_VIEWS,
// partition key
@@ -1072,7 +1158,7 @@ schema_ptr system_keyspace::built_views() {
return schema;
}
schema_ptr system_keyspace::scylla_views_builds_in_progress() {
schema_ptr system_keyspace::v3::scylla_views_builds_in_progress() {
static thread_local auto schema = [] {
auto id = generate_legacy_id(NAME, SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
return schema_builder(NAME, SCYLLA_VIEWS_BUILDS_IN_PROGRESS, std::make_optional(id))
@@ -1088,7 +1174,7 @@ schema_ptr system_keyspace::scylla_views_builds_in_progress() {
return schema;
}
/*static*/ schema_ptr system_keyspace::cdc_local() {
/*static*/ schema_ptr system_keyspace::v3::cdc_local() {
static thread_local auto cdc_local = [] {
schema_builder builder(generate_legacy_id(NAME, CDC_LOCAL), NAME, CDC_LOCAL,
// partition key
@@ -1714,9 +1800,7 @@ std::unordered_set<dht::token> decode_tokens(const set_type_impl::native_type& t
std::unordered_set<dht::token> tset;
for (auto& t: tokens) {
auto str = value_cast<sstring>(t);
if (str != dht::token::from_sstring(str).to_sstring()) {
on_internal_error(slogger, format("decode_tokens: invalid token string '{}'", str));
}
SCYLLA_ASSERT(str == dht::token::from_sstring(str).to_sstring());
tset.insert(dht::token::from_sstring(str));
}
return tset;
@@ -2096,21 +2180,21 @@ future<> system_keyspace::update_cdc_generation_id(cdc::generation_id gen_id) {
co_await std::visit(make_visitor(
[this] (cdc::generation_id_v1 id) -> future<> {
co_await execute_cql(
format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", CDC_LOCAL),
sstring(CDC_LOCAL), id.ts);
format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", v3::CDC_LOCAL),
sstring(v3::CDC_LOCAL), id.ts);
},
[this] (cdc::generation_id_v2 id) -> future<> {
co_await execute_cql(
format("INSERT INTO system.{} (key, streams_timestamp, uuid) VALUES (?, ?, ?)", CDC_LOCAL),
sstring(CDC_LOCAL), id.ts, id.id);
format("INSERT INTO system.{} (key, streams_timestamp, uuid) VALUES (?, ?, ?)", v3::CDC_LOCAL),
sstring(v3::CDC_LOCAL), id.ts, id.id);
}
), gen_id);
}
future<std::optional<cdc::generation_id>> system_keyspace::get_cdc_generation_id() {
auto msg = co_await execute_cql(
format("SELECT streams_timestamp, uuid FROM system.{} WHERE key = ?", CDC_LOCAL),
sstring(CDC_LOCAL));
format("SELECT streams_timestamp, uuid FROM system.{} WHERE key = ?", v3::CDC_LOCAL),
sstring(v3::CDC_LOCAL));
if (msg->empty()) {
co_return std::nullopt;
@@ -2136,19 +2220,19 @@ static const sstring CDC_REWRITTEN_KEY = "rewritten";
future<> system_keyspace::cdc_set_rewritten(std::optional<cdc::generation_id_v1> gen_id) {
if (gen_id) {
return execute_cql(
format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", CDC_LOCAL),
format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", v3::CDC_LOCAL),
CDC_REWRITTEN_KEY, gen_id->ts).discard_result();
} else {
// Insert just the row marker.
return execute_cql(
format("INSERT INTO system.{} (key) VALUES (?)", CDC_LOCAL),
format("INSERT INTO system.{} (key) VALUES (?)", v3::CDC_LOCAL),
CDC_REWRITTEN_KEY).discard_result();
}
}
future<bool> system_keyspace::cdc_is_rewritten() {
// We don't care about the actual timestamp; it's additional information for debugging purposes.
return execute_cql(format("SELECT key FROM system.{} WHERE key = ?", CDC_LOCAL), CDC_REWRITTEN_KEY)
return execute_cql(format("SELECT key FROM system.{} WHERE key = ?", v3::CDC_LOCAL), CDC_REWRITTEN_KEY)
.then([] (::shared_ptr<cql3::untyped_result_set> msg) {
return !msg->empty();
});
@@ -2292,11 +2376,11 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
scylla_local(), db::schema_tables::scylla_table_schema_history(),
repair_history(),
repair_tasks(),
views_builds_in_progress(), built_views(),
scylla_views_builds_in_progress(),
truncated(),
commitlog_cleanups(),
cdc_local(),
v3::views_builds_in_progress(), v3::built_views(),
v3::scylla_views_builds_in_progress(),
v3::truncated(),
v3::commitlog_cleanups(),
v3::cdc_local(),
raft(), raft_snapshots(), raft_snapshot_config(), group0_history(), discovery(),
topology(), cdc_generations_v3(), topology_requests(), service_levels_v2(), view_build_status_v2(),
dicts(), view_building_tasks(), client_routes(), cdc_streams_state(), cdc_streams_history()
@@ -2319,7 +2403,7 @@ static bool maybe_write_in_user_memory(schema_ptr s) {
return (s.get() == system_keyspace::batchlog().get())
|| (s.get() == system_keyspace::batchlog_v2().get())
|| (s.get() == system_keyspace::paxos().get())
|| s == system_keyspace::scylla_views_builds_in_progress();
|| s == system_keyspace::v3::scylla_views_builds_in_progress();
}
future<> system_keyspace::make(
@@ -2605,7 +2689,7 @@ mutation system_keyspace::make_size_estimates_mutation(const sstring& ks, std::v
future<> system_keyspace::register_view_for_building(sstring ks_name, sstring view_name, const dht::token& token) {
sstring req = format("INSERT INTO system.{} (keyspace_name, view_name, generation_number, cpu_id, first_token) VALUES (?, ?, ?, ?, ?)",
SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
return execute_cql(
std::move(req),
std::move(ks_name),
@@ -2621,7 +2705,7 @@ future<> system_keyspace::register_view_for_building_for_all_shards(sstring ks_n
// before all shards are registered.
// if another shard has already registered, this won't overwrite its status. if it hasn't registered, we insert
// a status with first_token=null and next_token=null, indicating it hasn't made progress.
auto&& schema = db::system_keyspace::scylla_views_builds_in_progress();
auto&& schema = db::system_keyspace::v3::scylla_views_builds_in_progress();
auto timestamp = api::new_timestamp();
mutation m{schema, partition_key::from_single_value(*schema, utf8_type->decompose(ks_name))};
@@ -2639,7 +2723,7 @@ future<> system_keyspace::register_view_for_building_for_all_shards(sstring ks_n
future<> system_keyspace::update_view_build_progress(sstring ks_name, sstring view_name, const dht::token& token) {
sstring req = format("INSERT INTO system.{} (keyspace_name, view_name, next_token, cpu_id) VALUES (?, ?, ?, ?)",
SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
return execute_cql(
std::move(req),
std::move(ks_name),
@@ -2650,14 +2734,14 @@ future<> system_keyspace::update_view_build_progress(sstring ks_name, sstring vi
future<> system_keyspace::remove_view_build_progress_across_all_shards(sstring ks_name, sstring view_name) {
return execute_cql(
format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ?", SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ?", v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
std::move(ks_name),
std::move(view_name)).discard_result();
}
future<> system_keyspace::remove_view_build_progress(sstring ks_name, sstring view_name) {
return execute_cql(
format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ? AND cpu_id = ?", SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ? AND cpu_id = ?", v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
std::move(ks_name),
std::move(view_name),
int32_t(this_shard_id())).discard_result();
@@ -2665,20 +2749,20 @@ future<> system_keyspace::remove_view_build_progress(sstring ks_name, sstring vi
future<> system_keyspace::mark_view_as_built(sstring ks_name, sstring view_name) {
return execute_cql(
format("INSERT INTO system.{} (keyspace_name, view_name) VALUES (?, ?)", BUILT_VIEWS),
format("INSERT INTO system.{} (keyspace_name, view_name) VALUES (?, ?)", v3::BUILT_VIEWS),
std::move(ks_name),
std::move(view_name)).discard_result();
}
future<> system_keyspace::remove_built_view(sstring ks_name, sstring view_name) {
return execute_cql(
format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ?", BUILT_VIEWS),
format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ?", v3::BUILT_VIEWS),
std::move(ks_name),
std::move(view_name)).discard_result();
}
future<std::vector<system_keyspace::view_name>> system_keyspace::load_built_views() {
return execute_cql(format("SELECT * FROM system.{}", BUILT_VIEWS)).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
return execute_cql(format("SELECT * FROM system.{}", v3::BUILT_VIEWS)).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
return *cql_result
| std::views::transform([] (const cql3::untyped_result_set::row& row) {
auto ks_name = row.get_as<sstring>("keyspace_name");
@@ -2690,7 +2774,7 @@ future<std::vector<system_keyspace::view_name>> system_keyspace::load_built_view
future<std::vector<system_keyspace::view_build_progress>> system_keyspace::load_view_build_progress() {
return execute_cql(format("SELECT keyspace_name, view_name, first_token, next_token, cpu_id FROM system.{}",
SCYLLA_VIEWS_BUILDS_IN_PROGRESS)).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS)).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
std::vector<view_build_progress> progress;
for (auto& row : *cql_result) {
auto ks_name = row.get_as<sstring>("keyspace_name");
@@ -3143,8 +3227,6 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
co_return ret;
}
const bool strongly_consistent_tables = _db.features().strongly_consistent_tables;
for (auto& row : *rs) {
if (!row.has("host_id")) {
// There are no clustering rows, only the static row.
@@ -3193,7 +3275,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
};
}
} else if (must_have_tokens(nstate)) {
on_internal_error(slogger, format(
on_fatal_internal_error(slogger, format(
"load_topology_state: node {} in {} state but missing ring slice", host_id, nstate));
}
}
@@ -3275,7 +3357,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
// Currently, at most one node at a time can be in transitioning state.
if (!map->empty()) {
const auto& [other_id, other_rs] = *map->begin();
on_internal_error(slogger, format(
on_fatal_internal_error(slogger, format(
"load_topology_state: found two nodes in transitioning state: {} in {} state and {} in {} state",
other_id, other_rs.state, host_id, nstate));
}
@@ -3333,7 +3415,8 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
format("SELECT count(range_end) as cnt FROM {}.{} WHERE key = '{}' AND id = ?",
NAME, CDC_GENERATIONS_V3, cdc::CDC_GENERATIONS_V3_KEY),
gen_id.id);
if (!gen_rows || gen_rows->empty()) {
SCYLLA_ASSERT(gen_rows);
if (gen_rows->empty()) {
on_internal_error(slogger, format(
"load_topology_state: last committed CDC generation time UUID ({}) present, but data missing", gen_id.id));
}
@@ -3380,9 +3463,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
ret.session = service::session_id(some_row.get_as<utils::UUID>("session"));
}
if (strongly_consistent_tables) {
ret.tablet_balancing_enabled = false;
} else if (some_row.has("tablet_balancing_enabled")) {
if (some_row.has("tablet_balancing_enabled")) {
ret.tablet_balancing_enabled = some_row.get_as<bool>("tablet_balancing_enabled");
} else {
ret.tablet_balancing_enabled = true;

View File

@@ -127,8 +127,6 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>
static schema_ptr raft_snapshot_config();
static schema_ptr local();
static schema_ptr truncated();
static schema_ptr commitlog_cleanups();
static schema_ptr peers();
static schema_ptr peer_events();
static schema_ptr range_xfers();
@@ -139,10 +137,7 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>
static schema_ptr large_rows();
static schema_ptr large_cells();
static schema_ptr corrupt_data();
static schema_ptr batches();
static schema_ptr available_ranges();
static schema_ptr built_views();
static schema_ptr cdc_local();
static schema_ptr scylla_local();
future<> force_blocking_flush(sstring cfname);
// This function is called when the system.peers table is read,
// and it fixes some types of inconsistencies that can occur
@@ -209,14 +204,6 @@ public:
static constexpr auto VIEW_BUILDING_TASKS = "view_building_tasks";
static constexpr auto CLIENT_ROUTES = "client_routes";
static constexpr auto VERSIONS = "versions";
static constexpr auto BATCHES = "batches";
static constexpr auto AVAILABLE_RANGES = "available_ranges";
static constexpr auto VIEWS_BUILDS_IN_PROGRESS = "views_builds_in_progress";
static constexpr auto BUILT_VIEWS = "built_views";
static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
static constexpr auto CDC_LOCAL = "cdc_local";
static constexpr auto CDC_TIMESTAMPS = "cdc_timestamps";
static constexpr auto CDC_STREAMS = "cdc_streams";
// auth
static constexpr auto ROLES = "roles";
@@ -224,6 +211,42 @@ public:
static constexpr auto ROLE_ATTRIBUTES = "role_attributes";
static constexpr auto ROLE_PERMISSIONS = "role_permissions";
struct v3 {
static constexpr auto BATCHES = "batches";
static constexpr auto PAXOS = "paxos";
static constexpr auto BUILT_INDEXES = "IndexInfo";
static constexpr auto LOCAL = "local";
static constexpr auto PEERS = "peers";
static constexpr auto PEER_EVENTS = "peer_events";
static constexpr auto RANGE_XFERS = "range_xfers";
static constexpr auto COMPACTION_HISTORY = "compaction_history";
static constexpr auto SSTABLE_ACTIVITY = "sstable_activity";
static constexpr auto SIZE_ESTIMATES = "size_estimates";
static constexpr auto AVAILABLE_RANGES = "available_ranges";
static constexpr auto VIEWS_BUILDS_IN_PROGRESS = "views_builds_in_progress";
static constexpr auto BUILT_VIEWS = "built_views";
static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
static constexpr auto CDC_LOCAL = "cdc_local";
static schema_ptr batches();
static schema_ptr built_indexes();
static schema_ptr local();
static schema_ptr truncated();
static schema_ptr commitlog_cleanups();
static schema_ptr peers();
static schema_ptr peer_events();
static schema_ptr range_xfers();
static schema_ptr compaction_history();
static schema_ptr sstable_activity();
static schema_ptr size_estimates();
static schema_ptr large_partitions();
static schema_ptr scylla_local();
static schema_ptr available_ranges();
static schema_ptr views_builds_in_progress();
static schema_ptr built_views();
static schema_ptr scylla_views_builds_in_progress();
static schema_ptr cdc_local();
};
// Partition estimates for a given range of tokens.
struct range_estimates {
schema_ptr schema;
@@ -241,7 +264,6 @@ public:
static schema_ptr batchlog_v2();
static schema_ptr paxos();
static schema_ptr built_indexes(); // TODO (from Cassandra): make private
static schema_ptr scylla_local();
static schema_ptr raft();
static schema_ptr raft_snapshots();
static schema_ptr repair_history();
@@ -261,8 +283,6 @@ public:
static schema_ptr dicts();
static schema_ptr view_building_tasks();
static schema_ptr client_routes();
static schema_ptr views_builds_in_progress();
static schema_ptr scylla_views_builds_in_progress();
// auth
static schema_ptr roles();

View File

@@ -195,7 +195,7 @@ public:
return mutation_reader(std::make_unique<build_progress_reader>(
s,
std::move(permit),
_db.find_column_family(s->ks_name(), system_keyspace::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
_db.find_column_family(s->ks_name(), system_keyspace::v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
range,
slice,
std::move(trace_state),

View File

@@ -3707,7 +3707,7 @@ void validate_view_keyspace(const data_dictionary::database& db, std::string_vie
try {
locator::assert_rf_rack_valid_keyspace(keyspace_name, tmptr, rs);
} catch (const std::invalid_argument& e) {
} catch (const std::exception& e) {
throw std::logic_error(fmt::format(
"Materialized views and secondary indexes are not supported on the keyspace '{}': {}",
keyspace_name, e.what()));

View File

@@ -68,8 +68,6 @@ public:
.with_column("peer", inet_addr_type, column_kind::partition_key)
.with_column("dc", utf8_type)
.with_column("up", boolean_type)
.with_column("draining", boolean_type)
.with_column("excluded", boolean_type)
.with_column("status", utf8_type)
.with_column("load", utf8_type)
.with_column("tokens", int32_type)
@@ -109,11 +107,8 @@ public:
if (tm.get_topology().has_node(hostid)) {
// Not all entries in gossiper are present in the topology
auto& node = tm.get_topology().get_node(hostid);
sstring dc = node.dc_rack().dc;
sstring dc = tm.get_topology().get_location(hostid).dc;
set_cell(cr, "dc", dc);
set_cell(cr, "draining", node.is_draining());
set_cell(cr, "excluded", node.is_excluded());
}
if (ownership.contains(eps.get_ip())) {
@@ -1139,8 +1134,6 @@ public:
set_cell(r.cells(), "dc", node.dc());
set_cell(r.cells(), "rack", node.rack());
set_cell(r.cells(), "up", _gossiper.local().is_alive(host));
set_cell(r.cells(), "draining", node.is_draining());
set_cell(r.cells(), "excluded", node.is_excluded());
if (auto ip = _gossiper.local().get_address_map().find(host)) {
set_cell(r.cells(), "ip", data_value(inet_address(*ip)));
}
@@ -1151,9 +1144,6 @@ public:
if (stats && stats->capacity.contains(host)) {
auto capacity = stats->capacity.at(host);
set_cell(r.cells(), "storage_capacity", data_value(int64_t(capacity)));
if (auto ts_iter = stats->tablet_stats.find(host); ts_iter != stats->tablet_stats.end()) {
set_cell(r.cells(), "effective_capacity", data_value(int64_t(ts_iter->second.effective_capacity)));
}
if (auto utilization = load.get_allocated_utilization(host)) {
set_cell(r.cells(), "storage_allocated_utilization", data_value(double(*utilization)));
@@ -1178,12 +1168,9 @@ private:
.with_column("rack", utf8_type)
.with_column("ip", inet_addr_type)
.with_column("up", boolean_type)
.with_column("draining", boolean_type)
.with_column("excluded", boolean_type)
.with_column("tablets_allocated", long_type)
.with_column("tablets_allocated_per_shard", double_type)
.with_column("storage_capacity", long_type)
.with_column("effective_capacity", long_type)
.with_column("storage_allocated_load", long_type)
.with_column("storage_allocated_utilization", double_type)
.with_column("storage_load", long_type)
@@ -1345,8 +1332,8 @@ public:
private:
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS);
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS, std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_timestamps");
return schema_builder(system_keyspace::NAME, "cdc_timestamps", std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::partition_key)
.with_column("timestamp", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
@@ -1428,8 +1415,8 @@ public:
}
private:
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_STREAMS);
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_STREAMS, std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_streams");
return schema_builder(system_keyspace::NAME, "cdc_streams", std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::partition_key)
.with_column("timestamp", timestamp_type, column_kind::clustering_key)
@@ -1497,7 +1484,7 @@ future<> initialize_virtual_tables(
co_await add_table(std::make_unique<cdc_streams_table>(db, ss));
db.find_column_family(system_keyspace::size_estimates()).set_virtual_reader(mutation_source(db::size_estimates::virtual_reader(db, sys_ks.local())));
db.find_column_family(system_keyspace::views_builds_in_progress()).set_virtual_reader(mutation_source(db::view::build_progress_virtual_reader(db)));
db.find_column_family(system_keyspace::v3::views_builds_in_progress()).set_virtual_reader(mutation_source(db::view::build_progress_virtual_reader(db)));
db.find_column_family(system_keyspace::built_indexes()).set_virtual_reader(mutation_source(db::index::built_indexes_virtual_reader(db)));
}

View File

@@ -114,10 +114,6 @@ WantedBy=local-fs.target scylla-server.service
'''[1:-1]
with open('/etc/systemd/system/var-lib-systemd-coredump.mount', 'w') as f:
f.write(dot_mount)
# in case we have old mounts in deleted state hanging around from older installation
# systemd doesn't seem to be able to deal with those properly, and assume they are still active
# and doesn't do anything about them
run('umount /var/lib/systemd/coredump', shell=True, check=False)
os.makedirs('/var/lib/scylla/coredump', exist_ok=True)
systemd_unit.reload()
systemd_unit('var-lib-systemd-coredump.mount').enable()

View File

@@ -97,9 +97,7 @@ bcp LICENSE-ScyllaDB-Source-Available.md /licenses/
run microdnf clean all
run microdnf --setopt=tsflags=nodocs -y update
run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip cpio
# Extract only systemctl binary from systemd package to avoid installing the whole systemd in the container.
run bash -rc "microdnf download systemd && rpm2cpio systemd-*.rpm | cpio -idmv ./usr/bin/systemctl && rm -rf systemd-*.rpm"
run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip
run curl -L --output /etc/yum.repos.d/scylla.repo ${repo_file_url}
run pip3 install --no-cache-dir --prefix /usr supervisor
run bash -ec "echo LANG=C.UTF-8 > /etc/locale.conf"
@@ -108,8 +106,6 @@ run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
run mkdir -p /var/log/scylla
run chown -R scylla:scylla /var/lib/scylla
run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --network-stack posix"/' /etc/sysconfig/scylla-server
# Cleanup packages not needed in the final image and clean package manager cache to reduce image size.
run bash -rc "microdnf remove -y cpio && microdnf clean all"
run mkdir -p /opt/scylladb/supervisor
run touch /opt/scylladb/SCYLLA-CONTAINER-FILE

View File

@@ -1,14 +1,6 @@
### a dictionary of redirections
#old path: new path
# Remove an outdated KB
/stable/kb/perftune-modes-sync.html: /stable/kb/index.html
# Remove the troubleshooting page relevant for Open Source only
/stable/troubleshooting/missing-dotmount-files.html: /troubleshooting/index.html
# Move the diver information to another project
/stable/using-scylla/drivers/index.html: https://docs.scylladb.com/stable/drivers/index.html

View File

@@ -142,6 +142,10 @@ want modify a non-top-level attribute directly (e.g., a.b[3].c) need RMW:
Alternator implements such requests by reading the entire top-level
attribute a, modifying only a.b[3].c, and then writing back a.
Currently, Alternator doesn't use Tablets. That's because Alternator relies
on LWT (lightweight transactions), and LWT is not supported in keyspaces
with Tablets enabled.
```{eval-rst}
.. toctree::
:maxdepth: 2

View File

@@ -187,23 +187,6 @@ You can create a keyspace with tablets enabled with the ``tablets = {'enabled':
the keyspace schema with ``tablets = { 'enabled': false }`` or
``tablets = { 'enabled': true }``.
.. _keyspace-rf-rack-valid-to-enforce-rack-list:
Enforcing Rack-List Replication for Tablet Keyspaces
------------------------------------------------------------------
The ``rf_rack_valid_keyspaces`` is a legacy option that ensures that all keyspaces with tablets enabled are
:term:`RF-rack-valid <RF-rack-valid keyspace>`.
Requiring every tablet keyspace to use the rack list replication factor exclusively is enough to guarantee the keyspace is
:term:`RF-rack-valid <RF-rack-valid keyspace>`. It reduces restrictions and provides stronger guarantees compared
to ``rf_rack_valid_keyspaces`` option.
To enforce rack list in tablet keyspaces, use ``enforce_rack_list`` option. It can be set only if all tablet keyspaces use
rack list. To ensure that, follow a procedure of :ref:`conversion to rack list replication factor <conversion-to-rack-list-rf>`.
After that restart all nodes in the cluster, with ``enforce_rack_list`` enabled and ``rf_rack_valid_keyspaces`` disabled. Make
sure to avoid setting or updating replication factor (with CREATE KEYSPACE or ALTER KEYSPACE) while nodes are being restarted.
.. _tablets-limitations:
Limitations and Unsupported Features

View File

@@ -190,7 +190,7 @@ then every rack in every datacenter receives a replica, except for racks compris
of only :doc:`zero-token nodes </architecture/zero-token-nodes>`. Racks added after
the keyspace creation do not receive replicas.
When ``enforce_rack_list`` (or (deprecated) ``rf_rack_valid_keyspaces``) is enabled in the config and the keyspace is tablet-based,
When ``rf_rack_valid_keyspaces``` is enabled in the config and the keyspace is tablet-based,
the numeric replication factor is automatically expanded into a rack list when the statement is
executed, which can be observed in the DESCRIBE output afterwards. If the numeric RF is smaller than
the number of racks in a DC, a subset of racks is chosen arbitrarily.
@@ -200,6 +200,8 @@ for two cases. One is setting replication factor to 0, in which case the number
The other is when the numeric replication factor is equal to the current number of replicas
for a given datacanter, in which case the current rack list is preserved.
Altering from a numeric replication factor to a rack list is not supported yet.
Note that when ``ALTER`` ing keyspaces and supplying ``replication_factor``,
auto-expansion will only *add* new datacenters for safety, it will not alter
existing datacenters or remove any even if they are no longer in the cluster.
@@ -422,21 +424,6 @@ Altering from a rack list to a numeric replication factor is not supported.
Keyspaces which use rack lists are :term:`RF-rack-valid <RF-rack-valid keyspace>` if each rack in the rack list contains at least one node (excluding :doc:`zero-token nodes </architecture/zero-token-nodes>`).
.. _conversion-to-rack-list-rf:
Conversion to rack-list replication factor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To migrate a keyspace from a numeric replication factor to a rack-list replication factor, provide the rack-list replication factor explicitly in ALTER KEYSPACE statement. The number of racks in the list must be equal to the numeric replication factor. The replication factor can be converted in any number of DCs at once. In a statement that converts replication factor, no replication factor updates (increase or decrease) are allowed in any DC.
.. code-block:: cql
CREATE KEYSPACE Excelsior
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 1} AND tablets = { 'enabled': true };
ALTER KEYSPACE Excelsior
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : ['RAC1', 'RAC2', 'RAC3'], 'dc2' : ['RAC4']} AND tablets = { 'enabled': true };
.. _drop-keyspace-statement:
DROP KEYSPACE

View File

@@ -241,8 +241,8 @@ Currently, the possible orderings are limited by the :ref:`clustering order <clu
.. _vector-queries:
Vector queries :label-note:`ScyllaDB Cloud`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Vector queries
~~~~~~~~~~~~~~
The ``ORDER BY`` clause can also be used with vector columns to perform the approximate nearest neighbor (ANN) search.
When using vector columns, the syntax is as follows:
@@ -280,26 +280,11 @@ For example::
FROM ImageEmbeddings
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
.. warning::
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
or columns provided in a definition of the index.
Currently, vector queries do not support filtering with ``WHERE`` clause,
grouping with ``GROUP BY`` and paging. This will be added in the future releases.
For example::
SELECT image_id FROM ImageEmbeddings
WHERE user_id = 'user123'
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
The supported operations are equal relations (``=`` and ``IN``) with restrictions as in regular ``WHERE`` clauses. See :ref:`WHERE <where-clause>`.
Other filtering scenarios are currently not supported.
.. note::
Vector indexes are supported in ScyllaDB Cloud only in clusters that have the Vector Search feature enabled.
Vector indexes do not support all ScyllaDB features (e.g., tracing, TTL, paging, and grouping). More information
about Vector Search is available in the
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/>`_.
.. _limit-clause:

View File

@@ -129,94 +129,30 @@ More on :doc:`Local Secondary Indexes </features/local-secondary-indexes>`
.. _create-vector-index-statement:
Vector Index :label-note:`ScyllaDB Cloud`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Vector Index :label-caution:`Experimental` :label-note:`ScyllaDB Cloud`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. note::
Vector indexes are supported in ScyllaDB Cloud only in clusters that have the Vector Search feature enabled.
Vector indexes do not support all ScyllaDB features (e.g., tracing, TTL, paging, and grouping). More information
about Vector Search is available in the
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/>`_.
Vector indexes are supported in ScyllaDB Cloud only in the clusters that have the vector search feature enabled.
Moreover, vector indexes are an experimental feature that:
* is not suitable for production use,
* does not guarantee backward compatibility between ScyllaDB versions,
* does not support all the features of ScyllaDB (e.g., tracing, filtering, TTL).
ScyllaDB supports creating vector indexes on tables, allowing queries on the table to use those indexes for efficient
similarity search on vector data. Vector indexes can be a global index for indexing vectors per table or a local
index for indexing vectors per partition.
similarity search on vector data.
The vector index is the only custom type index supported in ScyllaDB. It is created using
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. It is also possible to
add additional columns to the index for filtering the search results. The partition column
specified in the global vector index definition must be the vector column, and any subsequent
columns are treated as filtering columns. The local vector index requires that the partition key
of the base table is also the partition key of the index and the vector column is the first one
from the following columns.
Example of a simple index:
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. Example:
.. code-block:: cql
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
The vector column (``embedding``) is indexed to enable similarity search using
a global vector index. Additional filtering can be performed on the primary key
columns of the base table.
Example of a global vector index with additional filtering:
.. code-block:: cql
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding, category, info)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
The vector column (``embedding``) is indexed to enable similarity search using
a global index. Additional columns are added for filtering the search results.
The filtering is possible on ``category``, ``info`` and all primary key columns
of the base table.
Example of a local vector index:
.. code-block:: cql
CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings ((id, created_at), embedding, category, info)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
The vector column (``embedding``) is indexed for similarity search (a local
index) and additional columns are added for filtering the search results. The
filtering is possible on ``category``, ``info`` and all primary key columns of
the base table. The columns ``id`` and ``created_at`` must be the partition key
of the base table.
Vector indexes support additional filtering columns of native data types
(excluding counter and duration). The indexed column itself must be a vector
column, while the extra columns can be used to filter search results.
The supported types are:
* ``ascii``
* ``bigint``
* ``blob``
* ``boolean``
* ``date``
* ``decimal``
* ``double``
* ``float``
* ``inet``
* ``int``
* ``smallint``
* ``text``
* ``varchar``
* ``time``
* ``timestamp``
* ``timeuuid``
* ``tinyint``
* ``uuid``
* ``varint``
The following options are supported for vector indexes. All of them are optional.
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
@@ -241,26 +177,6 @@ The following options are supported for vector indexes. All of them are optional
| | as ``efSearch``. Higher values lead to better recall (i.e., more relevant results are found) | |
| | but increase query latency. Supported values are integers between 1 and 4096. | |
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
| ``quantization`` | The quantization method to use for compressing vectors in Vector Index. Vectors in base table | ``f32`` |
| | are never compressed. Supported values (case-insensitive) are: | |
| | | |
| | * ``f32``: 32-bit single-precision IEEE 754 floating-point. | |
| | * ``f16``: 16-bit standard half-precision floating-point (IEEE 754). | |
| | * ``bf16``: 16-bit "Brain" floating-point (optimized for ML workloads). | |
| | * ``i8``: 8-bit signed integer. | |
| | * ``b1``: 1-bit binary value (packed 8 per byte). | |
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
| ``oversampling`` | A multiplier for the candidate set size during the search phase. For example, if a query asks for 10 | ``1.0`` |
| | similar vectors (``LIMIT 10``) and ``oversampling`` is 2.0, the search will initially retrieve 20 | |
| | candidates. This can improve accuracy at the cost of latency. Supported values are | |
| | floating-point numbers between 1.0 (no oversampling) and 100.0. | |
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
| ``rescoring`` | Flag enabling recalculation of similarity scores with full precision and re-ranking of the candidate set.| ``false`` |
| | Valid only for quantization below ``f32``. Supported values are: | |
| | | |
| | * ``true``: Enable rescoring. | |
| | * ``false``: Disable rescoring. | |
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
.. _drop-index-statement:

View File

@@ -665,14 +665,6 @@ it is not possible to update only some elements of a vector (without updating th
Types stored in a vector are not implicitly frozen, so if you want to store a frozen collection or
frozen UDT in a vector, you need to explicitly wrap them using `frozen` keyword.
.. note::
The main application of vectors is to support vector search capabilities, which
are supported in ScyllaDB Cloud only in clusters that have the Vector Search feature enabled.
Note that Vector Search clusters do not support all ScyllaDB features (e.g., tracing, TTL, paging, and grouping). More information
about Vector Search is available in the
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/>`_.
.. .. _custom-types:
.. Custom Types

View File

@@ -221,87 +221,6 @@ scylla-bucket/prefix/
```
See the API [documentation](#copying-sstables-on-s3-backup) for more details about the actual backup request.
### The snapshot manifest
Each table snapshot directory contains a manifest.json file that lists the contents of the snapshot and some metadata.
The json structure is as follows:
```
{
"manifest": {
"version": "1.0",
"scope": "node"
},
"node": {
"host_id": "<UUID>",
"datacenter": "mydc",
"rack": "myrack"
},
"snapshot": {
"name": "snapshot name",
"created_at": seconds_since_epoch,
"expires_at": seconds_since_epoch | null,
},
"table": {
"keyspace_name": "my_keyspace",
"table_name": "my_table",
"table_id": "<UUID>",
"tablets_type": "none|powof2",
"tablet_count": N
},
"sstables": [
{
"id": "67e35000-d8c6-11f0-9599-060de9f3bd1b",
"toc_name": "me-3gw7_0ndy_3wlq829wcsddgwha1n-big-TOC.txt",
"data_size": 75,
"index_size": 8,
"first_token": -8629266958227979430,
"last_token": 9168982884335614769,
},
{
"id": "67e35000-d8c6-11f0-85dc-0625e9f3bd1b",
"toc_name": "me-3gw7_0ndy_3wlq821a6cqlbmxrtn-big-TOC.txt",
"data_size": 73,
"index_size": 8,
"first_token": 221146791717891383,
"last_token": 7354559975791427036,
},
...
],
"files": [ ... ]
}
The `manifest` member contains the following attributes:
- `version` - respresenting the version of the manifest itself. It is incremented when members are added or removed from the manifest.
- `scope` - the scope of metadata stored in this manifest file. The following scopes are supported:
- `node` - the manifest describes all SSTables owned by this node in this snapshot.
The `node` member contains metadata about this node that enables datacenter- or rack-aware restore.
- `host_id` - is the node's unique host_id (a UUID).
- `datacenter` - is the node's datacenter.
- `rack` - is the node's rack.
The `snapshot` member contains metadata about the snapshot.
- `name` - is the snapshot name (a.k.a. `tag`)
- `created_at` - is the time when the snapshot was created.
- `expires_at` - is an optional time when the snapshot expires and can be dropped, if a TTL was set for the snapshot. If there is no TTL, `expires_at` may be omitted, set to null, or set to 0.
The `table` member contains metadata about the table being snapshot.
- `keyspace_name` and `table_name` - are self-explanatory.
- `table_id` - a universally unique identifier (UUID) of the table set when the table is created.
- `tablets_type`:
- `none` - if the keyspace uses vnodes replication
- `powof2` - if the keyspace uses tables replication, and the tablet token ranges are based on powers of 2.
- `tablet_count` - Optional. If `tablets_type` is not `none`, contains the number of tablets allcated in the table. If `tablets_type` is `powof2`, tablet_count would be a power of 2.
The `sstables` member is a list containing metadata about the SSTables in the snapshot.
- `id` - is the STable's unique id (a UUID). It is carried over with the SSTable when it's streamed as part of tablet migration, even if it gets a new generation.
- `toc_name` - is the name of the SSTable Table Of Contents (TOC) component.
- `data_size` and `index_size` - are the sizes of the SSTable's data and index components, respectively. They can be used to estimate how much disk space is needed for restore.
- `first_token` and `last_token` - are the first and last tokens in the SSTable, respectively. They can be used to determine if a SSTable is fully contained in a (tablet) token range to enable efficient file-based streaming of the SSTable.
The optional `files` member may contain a list of non-SSTable files included in the snapshot directory, not including the manifest.json file and schema.cql.
```
3. `CREATE KEYSPACE` with S3/GS storage
When creating a keyspace with S3/GS storage, the data is stored under the bucket passed as argument to the `CREATE KEYSPACE` statement.

View File

@@ -6,10 +6,10 @@ same amount of disk space. This means that the number of tablets located on a no
proportional to the gross disk capacity of that node. Because the used disk space of
different tablets can vary greatly, this could create imbalance in disk utilization.
Size based load balancing aims to achieve better disk utilization across nodes in a
Size based load balancing aims to achieve better disk utilization accross nodes in a
cluster. The load balancer will continuously gather information about available disk
space and tablet sizes from all the nodes. It then incrementally computes tablet
migration plans which equalize disk utilization across the cluster.
migration plans which equalize disk utilization accross the cluster.
# Basic operation
@@ -75,7 +75,7 @@ migrations), and will wait for correct tablet sizes to arrive after the next ``l
refresh by the topology coordinator.
One exception to this are nodes which have been excluded from the cluster. These nodes
are down and therefore are not able to send fresh ``load_stats``. But they have to be drained
are down and therefor are not able to send fresh ``load_stats``. But they have to be drained
of their tablets (via tablet rebuild), and the balancer must do this even with incomplete
tablet data. So, only excluded nodes are allowed to have missing tablet sizes.

View File

@@ -357,7 +357,6 @@ Schema:
CREATE TABLE system.load_per_node (
node uuid PRIMARY KEY,
dc text,
effective_capacity bigint,
rack text,
storage_allocated_load bigint,
storage_allocated_utilization double,
@@ -373,7 +372,6 @@ Columns:
* `storage_allocated_load` - Disk space allocated for tablets, assuming each tablet has a fixed size (target_tablet_size).
* `storage_allocated_utilization` - Fraction of node's disk capacity taken for `storage_allocated_load`, where 1.0 means full utilization.
* `storage_capacity` - Total disk capacity in bytes. Used to compute `storage_allocated_utilization`. By default equal to file system's capacity.
* `effective_capacity` - Sum of available disk space and tablet sizes on a node. Used to compute load on a node for size based balancing.
* `storage_load` - Disk space allocated for tablets, computed with actual tablet sizes. Can be null if some of the tablet sizes are not known.
* `storage_utilization` - Fraction of node's disk capacity taken for `storage_load` (with actual tablet sizes), where 1.0 means full utilization. Can be null if some of the tablet sizes are not known.
* `tablets_allocated` - Number of tablet replicas on the node. Migrating tablets are accounted as if migration already finished.

View File

@@ -46,11 +46,14 @@ stateDiagram-v2
state replacing {
rp_join_group0 : join_group0
rp_left_token_ring : left_token_ring
rp_tablet_draining : tablet_draining
rp_write_both_read_old : write_both_read_old
rp_write_both_read_new : write_both_read_new
[*] --> rp_join_group0
rp_join_group0 --> rp_left_token_ring: rollback
rp_join_group0 --> rp_write_both_read_old
rp_join_group0 --> rp_tablet_draining
rp_tablet_draining --> rp_write_both_read_old
rp_tablet_draining --> rp_left_token_ring: rollback
rp_join_group0 --> [*]: rejected
rp_write_both_read_old --> rp_write_both_read_new: streaming completed
rp_write_both_read_old --> rp_left_token_ring: rollback
@@ -66,34 +69,34 @@ stateDiagram-v2
normal --> decommissioning: leave
normal --> removing: remove
state decommissioning {
de_tablet_migration0 : tablet_migration (draining)
de_tablet_migration1 : tablet_migration
[*] --> de_tablet_migration0
de_tablet_migration0 --> [*]: aborted
de_left_token_ring : left_token_ring
de_tablet_draining : tablet_draining
de_tablet_migration : tablet_migration
de_write_both_read_old: write_both_read_old
de_write_both_read_new : write_both_read_new
de_rollback_to_normal : rollback_to_normal
de_rollback_to_normal --> de_tablet_migration1
de_tablet_migration0 --> de_write_both_read_old
de_tablet_migration1 --> [*]
[*] --> de_tablet_draining
de_tablet_draining --> de_rollback_to_normal: rollback
de_rollback_to_normal --> de_tablet_migration
de_tablet_draining --> de_write_both_read_old
de_tablet_migration --> [*]
de_write_both_read_old --> de_write_both_read_new: streaming completed
de_write_both_read_old --> de_rollback_to_normal: rollback
de_write_both_read_new --> de_left_token_ring
de_left_token_ring --> [*]
}
state removing {
re_tablet_migration0 : tablet_migration (draining)
re_tablet_migration1 : tablet_migration
[*] --> re_tablet_migration0
re_tablet_migration0 --> [*]: aborted
re_left_token_ring : left_token_ring
re_tablet_draining : tablet_draining
re_tablet_migration : tablet_migration
re_write_both_read_old : write_both_read_old
re_write_both_read_new : write_both_read_new
re_rollback_to_normal : rollback_to_normal
re_rollback_to_normal --> re_tablet_migration1
re_tablet_migration1 --> [*]
re_tablet_migration0 --> re_write_both_read_old
[*] --> re_tablet_draining
re_tablet_draining --> re_rollback_to_normal: rollback
re_rollback_to_normal --> re_tablet_migration
re_tablet_migration --> [*]
re_tablet_draining --> re_write_both_read_old
re_write_both_read_old --> re_write_both_read_new: streaming completed
re_write_both_read_old --> re_rollback_to_normal: rollback
re_write_both_read_new --> re_left_token_ring
@@ -178,27 +181,6 @@ are the currently supported global topology operations:
contain replicas of the table being truncated. It uses [sessions](#Topology guards)
to make sure that no stale RPCs are executed outside of the scope of the request.
## Tablet draining
Presence of node requests of type `leave` (decommission) and `remove` will cause the load
balancer to start migrating tablets away from those nodes. Until they are drained
of tablets, the requests are in paused state and are not picked by topology
coordinator for execution.
Paused requests are also pending, the "topology_request" field is engaged.
The node's state is still `normal` when request is paused.
Canceling the requests will stop the draining process, because absence of a request
lifts the draining state on the node.
When tablet scheduler is done with migrating all tablets away from a draining node,
the associated request will be unpaused, and can be picked by topology coordinator
for execution. This time it will enter the `write_both_read_old` transition and proceed
with the vnode part.
This allows multiple requests to drain tablets in parallel, and other requests to not be blocked
by long `leave` and `remove` requests.
## Zero-token nodes
Zero-token nodes (the nodes started with `join_ring=false`) never own tokens or become
@@ -239,6 +221,12 @@ that there are no tablet transitions in the system.
Tablets are migrated in parallel and independently.
There is a variant of tablet migration track called tablet draining track, which is invoked
as a step of certain topology operations (e.g. decommission, removenode). Its goal is to readjust tablet replicas
so that a given topology change can proceed. For example, when decommissioning a node, we
need to migrate tablet replicas away from the node being decommissioned.
Tablet draining happens before making changes to vnode-based replication.
## Node replace with tablets
Tablet replicas on the replaced node are rebuilt after the replacing node is already in the normal state and

View File

@@ -1,23 +0,0 @@
.. _automatic-repair:
Automatic Repair
================
Traditionally, launching :doc:`repairs </operating-scylla/procedures/maintenance/repair>` in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.
Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the :doc:`tablet table </architecture/tablets>` automatically.
Repairs are spread over time and among nodes and shards, to avoid load spikes or any adverse effects on user workloads.
To enable automatic repair, add this to the configuration (``scylla.yaml``):
.. code-block:: yaml
auto_repair_enabled_default: true
auto_repair_threshold_default_in_seconds: 86400
This will enable automatic repair for all tables with a repair period of 1 day. This configuration has to be set on each node, to an identical value.
More featureful configuration methods will be implemented in the future.
To disable, set ``auto_repair_enabled_default: false``.
Automatic repair relies on :doc:`Incremental Repair </features/incremental-repair>` and as such it only works with :doc:`tablet </architecture/tablets>` tables.

View File

@@ -3,7 +3,7 @@
Incremental Repair
==================
ScyllaDB's standard :doc:`repair </operating-scylla/procedures/maintenance/repair>` process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
ScyllaDB's standard repair process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
The core idea of incremental repair is to repair only the data that has been written or changed since the last repair was run. It intelligently skips data that has already been verified, dramatically reducing the time, I/O, and CPU resources required for the repair operation.
@@ -28,7 +28,8 @@ Incremental Repair is only supported for tables that use the tablets architectur
Incremental Repair Modes
------------------------
While incremental repair is the default and recommended mode, you can control its behavior for a given repair operation using the ``incremental_mode`` parameter. This is useful for situations where you might need to force a full data validation.
Incremental is currently disabled by default. You can control its behavior for a given repair operation using the ``incremental_mode`` parameter.
This is useful for enabling incremental repair, or in situations where you might need to force a full data validation.
The available modes are:
@@ -37,12 +38,7 @@ The available modes are:
* ``disabled``: Completely disables the incremental repair logic for the current operation. The repair behaves like a classic, non-incremental repair, and it does not read or update any incremental repair status markers.
The incremental_mode parameter can be specified using nodetool cluster repair, e.g., nodetool cluster repair --incremental-mode incremental.
It can also be specified with the REST API, e.g.:
.. code::
curl -X POST "http://127.0.0.1:10000/storage_service/tablets/repair?ks=ks1&table=tb1&tokens=all&incremental_mode=incremental"
The incremental_mode parameter can be specified using nodetool cluster repair, e.g., nodetool cluster repair --incremental-mode incremental. It can also be specified with the REST API, e.g., curl -X POST "http://127.0.0.1:10000/storage_service/tablets/repair?ks=ks1&table=tb1&tokens=all&incremental_mode=incremental"
Benefits of Incremental Repair
------------------------------
@@ -51,8 +47,6 @@ Benefits of Incremental Repair
* **Reduced Resource Usage:** Consumes significantly less CPU, I/O, and network bandwidth compared to a full repair.
* **More Frequent Repairs:** The efficiency of incremental repair allows you to run it more frequently, ensuring a higher level of data consistency across your cluster at all times.
Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with :doc:`Automatic Repair </features/automatic-repair>`.
Notes
-----

View File

@@ -17,7 +17,6 @@ This document highlights ScyllaDB's key data modeling features.
Workload Prioritization </features/workload-prioritization>
Backup and Restore </features/backup-and-restore>
Incremental Repair </features/incremental-repair/>
Automatic Repair </features/automatic-repair/>
Vector Search </features/vector-search/>
.. panel-box::
@@ -45,7 +44,5 @@ This document highlights ScyllaDB's key data modeling features.
* :doc:`Incremental Repair </features/incremental-repair/>` provides a much more
efficient and lightweight approach to maintaining data consistency by
repairing only the data that has changed since the last repair.
* :doc:`Automatic Repair </features/automatic-repair/>` schedules and runs repairs
directly in ScyllaDB, without external schedulers.
* :doc:`Vector Search in ScyllaDB </features/vector-search/>` enables
similarity-based queries on vector embeddings.

View File

@@ -86,7 +86,7 @@ Compaction Strategies with Materialized Views
Materialized views, just like regular tables, use one of the available :doc:`compaction strategies </architecture/compaction/compaction-strategies>`.
When a materialized view is created, it does not inherit its base table compaction strategy settings, because the data model
of a view does not necessarily have the same characteristics as the one from its base table.
Instead, the default compaction strategy (IncrementalCompactionStrategy) is used.
Instead, the default compaction strategy (SizeTieredCompactionStrategy) is used.
A compaction strategy for a new materialized view can be explicitly set during its creation, using the following command:

View File

@@ -10,6 +10,7 @@ Install ScyllaDB |CURRENT_VERSION|
/getting-started/install-scylla/launch-on-azure
/getting-started/installation-common/scylla-web-installer
/getting-started/install-scylla/install-on-linux
/getting-started/installation-common/install-jmx
/getting-started/install-scylla/run-in-docker
/getting-started/installation-common/unified-installer
/getting-started/installation-common/air-gapped-install
@@ -35,6 +36,7 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa
* :doc:`Install ScyllaDB with Web Installer (recommended) </getting-started/installation-common/scylla-web-installer>`
* :doc:`Install ScyllaDB Linux Packages </getting-started/install-scylla/install-on-linux>`
* :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`
* :doc:`Install ScyllaDB Without root Privileges </getting-started/installation-common/unified-installer>`
* :doc:`Air-gapped Server Installation </getting-started/installation-common/air-gapped-install>`
* :doc:`ScyllaDB Developer Mode </getting-started/installation-common/dev-mod>`

View File

@@ -4,9 +4,9 @@
.. |RHEL_EPEL_8| replace:: https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
.. |RHEL_EPEL_9| replace:: https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
========================================================
Install ScyllaDB |CURRENT_VERSION| Linux Packages
========================================================
======================================
Install ScyllaDB Linux Packages
======================================
We recommend installing ScyllaDB using :doc:`ScyllaDB Web Installer for Linux </getting-started/installation-common/scylla-web-installer/>`,
a platform-agnostic installation script, to install ScyllaDB on any supported Linux platform.
@@ -46,8 +46,8 @@ Install ScyllaDB
.. code-block:: console
sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys c503c686b007f39e
sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --export --armor c503c686b007f39e | gpg --dearmor > /etc/apt/keyrings/scylladb.gpg
sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys a43e06657bac99e3
sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --export --armor a43e06657bac99e3 | gpg --dearmor > /etc/apt/keyrings/scylladb.gpg
.. code-block:: console
:substitutions:
@@ -94,6 +94,16 @@ Install ScyllaDB
apt-get install scylla{,-server,-kernel-conf,-node-exporter,-conf,-python3,-cqlsh}=2025.3.1-0.20250907.2bbf3cf669bb-1
#. (Ubuntu only) Set Java 11.
.. code-block:: console
sudo apt-get update
sudo apt-get install -y openjdk-11-jre-headless
sudo update-java-alternatives --jre-headless -s java-1.11.0-openjdk-amd64
.. group-tab:: Centos/RHEL
#. Install the EPEL repository.
@@ -147,6 +157,14 @@ Install ScyllaDB
sudo yum install scylla-5.2.3
(Optional) Install scylla-jmx
-------------------------------
scylla-jmx is an optional package and is not installed by default.
If you need JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.
.. include:: /getting-started/_common/setup-after-install.rst
Next Steps

View File

@@ -1,6 +1,6 @@
===============================================
Launch ScyllaDB |CURRENT_VERSION| on AWS
===============================================
==========================
Launch ScyllaDB on AWS
==========================
This article will guide you through self-managed ScyllaDB deployment on AWS. For a fully-managed deployment of ScyllaDB
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.

View File

@@ -1,6 +1,6 @@
===============================================
Launch ScyllaDB |CURRENT_VERSION| on Azure
===============================================
==========================
Launch ScyllaDB on Azure
==========================
This article will guide you through self-managed ScyllaDB deployment on Azure. For a fully-managed deployment of ScyllaDB
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.

View File

@@ -1,6 +1,6 @@
=============================================
Launch ScyllaDB |CURRENT_VERSION| on GCP
=============================================
==========================
Launch ScyllaDB on GCP
==========================
This article will guide you through self-managed ScyllaDB deployment on GCP. For a fully-managed deployment of ScyllaDB
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.

View File

@@ -0,0 +1,78 @@
======================================
Install scylla-jmx Package
======================================
scylla-jmx is an optional package and is not installed by default.
If you need JMX server, you can still install it from scylla-jmx GitHub page.
.. tabs::
.. group-tab:: Debian/Ubuntu
#. Download .deb package from scylla-jmx page.
Access to https://github.com/scylladb/scylla-jmx, select latest
release from "releases", download a file end with ".deb".
#. (Optional) Transfer the downloaded package to the install node.
If the pc from which you downloaded the package is different from
the node where you install scylladb, you will need to transfer
the files to the node.
#. Install scylla-jmx package.
.. code-block:: console
sudo apt install -y ./scylla-jmx_<version>_all.deb
.. group-tab:: Centos/RHEL
#. Download .rpm package from scylla-jmx page.
Access to https://github.com/scylladb/scylla-jmx, select latest
release from "releases", download a file end with ".rpm".
#. (Optional) Transfer the downloaded package to the install node.
If the pc from which you downloaded the package is different from
the node where you install scylladb, you will need to transfer
the files to the node.
#. Install scylla-jmx package.
.. code-block:: console
sudo yum install -y ./scylla-jmx-<version>.noarch.rpm
.. group-tab:: Install without root privileges
#. Download .tar.gz package from scylla-jmx page.
Access to https://github.com/scylladb/scylla-jmx, select latest
release from "releases", download a file end with ".tar.gz".
#. (Optional) Transfer the downloaded package to the install node.
If the pc from which you downloaded the package is different from
the node where you install scylladb, you will need to transfer
the files to the node.
#. Install scylla-jmx package.
.. code:: console
tar xpf scylla-jmx-<version>.noarch.tar.gz
cd scylla-jmx
./install.sh --nonroot
Next Steps
-----------
* :doc:`Configure ScyllaDB </getting-started/system-configuration>`
* Manage your clusters with `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_
* Monitor your cluster and data with `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_
* Get familiar with ScyllaDBs :doc:`command line reference guide </operating-scylla/nodetool>`.
* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_

View File

@@ -14,35 +14,44 @@ Prerequisites
Ensure your platform is supported by the ScyllaDB version you want to install.
See :doc:`OS Support </getting-started/os-support>` for information about supported Linux distributions and versions.
Note that if you're on CentOS 7, only root offline installation is supported.
Download and Install
-----------------------
#. Download the latest tar.gz file for ScyllaDB version (x86 or ARM) from ``https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-<version>/``.
**Example** for version 2025.1:
- Go to https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-2025.1/
- Download the ``scylla-unified`` file for the patch version you want to
install. For example, to install 2025.1.9 (x86), download
``scylla-unified-2025.1.9-0.20251010.6c539463bbda.x86_64.tar.gz``.
Example for version 6.1: https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-6.1/
#. Uncompress the downloaded package.
**Example** for version 2025.1.9 (x86) (downloaded in the previous step):
The following example shows the package for ScyllaDB 6.1.1 (x86):
.. code::
.. code:: console
tar xvfz scylla-unified-2025.1.9-0.20251010.6c539463bbda.x86_64.tar.gz
tar xvfz scylla-unified-6.1.1-0.20240814.8d90b817660a.x86_64.tar.gz
#. (Root offline installation only) For root offline installation on Debian-like
systems, two additional packages, ``xfsprogs`` and ``mdadm``, should be
installed to be used in RAID setup.
#. Install OpenJDK 8 or 11.
The following example shows Java installation on a CentOS-like system:
.. code:: console
sudo yum install -y java-11-openjdk-headless
For root offline installation on Debian-like systems, two additional packages, ``xfsprogs``
and ``mdadm``, should be installed to be used in RAID setup.
#. Install ScyllaDB as a user with non-root privileges:
.. code:: console
./install.sh --nonroot
./install.sh --nonroot --python3 ~/scylladb/python3/bin/python3
#. (Optional) Install scylla-jmx
scylla-jmx is an optional package and is not installed by default.
If you need JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.
Configure and Run ScyllaDB
----------------------------
@@ -72,14 +81,19 @@ Run nodetool:
.. code:: console
~/scylladb/bin/nodetool nodetool status
~/scylladb/share/cassandra/bin/nodetool status
Run cqlsh:
.. code:: console
~/scylladb/bin/cqlsh
~/scylladb/share/cassandra/bin/cqlsh
Run cassandra-stress:
.. code:: console
~/scylladb/share/cassandra/bin/cassandra-stress write
.. note::
@@ -110,7 +124,7 @@ Nonroot install
./install.sh --upgrade --nonroot
.. note:: The installation script does not upgrade scylla-tools. You will have to upgrade them separately.
.. note:: The installation script does not upgrade scylla-jmx and scylla-tools. You will have to upgrade them separately.
Uninstall
===========
@@ -140,4 +154,4 @@ Next Steps
* Manage your clusters with `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_
* Monitor your cluster and data with `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_
* Get familiar with ScyllaDBs :doc:`command line reference guide </operating-scylla/nodetool>`.
* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_

View File

@@ -57,6 +57,7 @@ Knowledge Base
* :doc:`Customizing CPUSET </kb/customizing-cpuset>`
* :doc:`Recreate RAID devices </kb/raid-device>` - How to recreate your RAID devices without running scylla-setup
* :doc:`Configure ScyllaDB Networking with Multiple NIC/IP Combinations </kb/yaml-address>` - examples for setting the different IP addresses in scylla.yaml
* :doc:`Updating the Mode in perftune.yaml After a ScyllaDB Upgrade </kb/perftune-modes-sync>`
* :doc:`Kafka Sink Connector Quickstart </using-scylla/integrations/kafka-connector>`
* :doc:`Kafka Sink Connector Configuration </using-scylla/integrations/sink-config>`

View File

@@ -0,0 +1,48 @@
==============================================================
Updating the Mode in perftune.yaml After a ScyllaDB Upgrade
==============================================================
We improved ScyllaDB's performance by `removing the rx_queues_count from the mode
condition <https://github.com/scylladb/seastar/pull/949>`_. As a result, ScyllaDB operates in
the ``sq_split`` mode instead of the ``mq`` mode (see :doc:`Seastar Perftune </operating-scylla/admin-tools/perftune>` for information about the modes).
If you upgrade from an earlier version of ScyllaDB, your cluster's existing nodes may use the ``mq`` mode,
while new nodes will use the ``sq_split`` mode. As using different modes across one cluster is not recommended,
you should change the configuration to ensure that the ``sq_split`` mode is used on all nodes.
This section describes how to update the `perftune.yaml` file to configure the ``sq_split`` mode on all nodes.
Procedure
------------
The examples below assume that you are using the default locations for storing data and the `scylla.yaml` file,
and that your NIC is ``eth5``.
#. Backup your old configuration.
.. code-block:: console
sudo mv /etc/scylla.d/cpuset.conf /etc/scylla.d/cpuset.conf.old
sudo mv /etc/scylla.d/perftune.yaml /etc/scylla.d/perftune.yaml.old
#. Create a new configuration.
.. code-block:: console
sudo scylla_sysconfig_setup --nic eth5 --homedir /var/lib/scylla --confdir /etc/scylla
A new ``/etc/scylla.d/cpuset.conf`` will be generated on the output.
#. Compare the contents of the newly generated ``/etc/scylla.d/cpuset.conf`` with ``/etc/scylla.d/cpuset.conf.old`` you created in step 1.
- If they are exactly the same, rename ``/etc/scylla.d/perftune.yaml.old`` you created in step 1 back to ``/etc/scylla.d/perftune.yaml`` and continue to the next node.
- If they are different, move on to the next steps.
#. Restart the ``scylla-server`` service.
.. code-block:: console
nodetool drain
sudo systemctl restart scylla-server
#. Wait for the service to become up and running (similarly to how it is done during a :doc:`rolling restart </operating-scylla/procedures/config-change/rolling-restart>`). It may take a considerable amount of time before the node is in the UN state due to resharding.
#. Continue to the next node.

View File

@@ -25,8 +25,4 @@ Port Description Protocol
19142 Native shard-aware transport port (ssl) TCP
====== ============================================ ========
If you're using ScyllaDB Alternator, ensure that the ports configured
for Alternator with the ``alternator_port`` or ``alternator_https_port`` parameter
are open. See :doc:`ScyllaDB Alternator </alternator/alternator>` for details.
.. note:: For ScyllaDB Manager ports, see the `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_ documentation.

View File

@@ -93,25 +93,6 @@ API calls
Cluster tasks are not unregistered from task manager with API calls.
Node operations module
----------------------
There is a module named ``node_ops``, which allows tracking node operations: decommission, removenode, bootstrap, replace, rebuild.
The ``type`` field designates the operation, and is one of:
- ``decommission``
- ``remove node``
- ``bootstrap``
- ``replace``
- ``rebuild``
The ``scope`` and ``kind`` fields are set to ``cluster``.
The ``entity`` field holds the host id of the node which is being operated on, as long as the request
is not finished. In case of the ``replace`` operation, it will hold the host id of the replacing node.
``decommission`` and ``remove node`` tasks are abortable, but only before they finish tablet migration.
Tasks API
---------

View File

@@ -55,7 +55,7 @@ ScyllaDB nodetool cluster repair command supports the following options:
nodetool cluster repair --tablet-tokens 1,10474535988
- ``--incremental-mode`` specifies the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to incremental.
- ``--incremental-mode`` specifies the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to 'disabled'.
For example:

View File

@@ -29,13 +29,5 @@ Before you run ``nodetool decommission``:
request may fail.
In such a case, ALTER the keyspace to reduce the RF before running ``nodetool decommission``.
It's allowed to invoke ``nodetool decommission`` on multiple nodes in parallel. This will be faster than doing
it sequentially if there is significant amount of data in tablet-based keyspaces, because
tablets are migrated from nodes in parallel. Decommission process first migrates tablets away, and this
part is done in parallel for all nodes being decommissioned. Then it does the vnode-based decommission, and
this part is serialized with other vnode-based operations, including those from other decommission operations.
Decommission which is still in the tablet draining phase can be canceled using Task Manager API.
See :doc:`Task manager </operating-scylla/admin-tools/task-manager>`.
.. include:: nodetool-index.rst

View File

@@ -25,8 +25,4 @@ For Example:
nodetool rebuild <source-dc-name>
``nodetool rebuild`` command works only for vnode keyspaces. For tablet keyspaces, use ``nodetool cluster repair`` instead.
See :doc:`Data Distribution with Tablets </architecture/tablets/>`.
.. include:: nodetool-index.rst

View File

@@ -56,17 +56,6 @@ To only mark the node as permanently down without doing actual removal, use :doc
.. _removenode-ignore-dead-nodes:
It's allowed to invoke ``nodetool removenode`` on multiple nodes in parallel. This will be faster than doing
it sequentially if there is significant amount of data in tablet-based keyspaces, because
tablets which belong to removed nodes will be rebuilt in parallel. Node removal first migrates tablets to new
replicas, in parallel for all nodes being removed. Then does the part which executes
removal for the vnode-based keyspaces, and this is serialized with other vnode-based operations, including
those from other removenode operations.
Removenode which is still in the tablet rebuild phase can be canceled using Task Manager API.
Tablets which are already rebuilt will remain on their new replicas.
See :doc:`Task manager </operating-scylla/admin-tools/task-manager>`.
Ignoring Dead Nodes
---------------------

View File

@@ -177,8 +177,8 @@ To display the log classes (output changes with each version so your display may
storage_proxy
storage_service
stream_session
broadcast_modification_statement
broadcast_select_statement
strongly_consistent_modification_statement
strongly_consistent_select_statement
system_distributed_keyspace
system_keyspace
table

View File

@@ -155,6 +155,7 @@ Add New DC
UN 54.235.9.159 109.75 KB 256 ? 39798227-9f6f-4868-8193-08570856c09a RACK1
UN 54.146.228.25 128.33 KB 256 ? 7a4957a1-9590-4434-9746-9c8a6f796a0c RACK1
.. TODO possibly provide additional information WRT how ALTER works with tablets
#. When all nodes are up and running ``ALTER`` the following Keyspaces in the new nodes:
@@ -170,68 +171,26 @@ Add New DC
DESCRIBE KEYSPACE mykeyspace;
CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3};
CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3};
ALTER Command
.. code-block:: cql
ALTER KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
ALTER KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
ALTER KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
ALTER KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
ALTER KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
ALTER KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
After
.. code-block:: cql
DESCRIBE KEYSPACE mykeyspace;
CREATE KEYSPACE mykeyspace WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
CREATE KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
CREATE KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
CREATE KEYSPACE mykeyspace WITH REPLICATION = {'class: 'NetworkTopologyStrategy', <exiting_dc>:3, <new_dc>: 3};
CREATE KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
CREATE KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
For tablet keyspaces, update the replication factor one by one:
.. code-block:: cql
DESCRIBE KEYSPACE mykeyspace2;
CREATE KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3} AND tablets = { 'enabled': true };
.. code-block:: cql
ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 1} AND tablets = { 'enabled': true };
ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 2} AND tablets = { 'enabled': true };
ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3} AND tablets = { 'enabled': true };
.. note::
If ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use rack list replication factor, so that a new DC (rack) can be added. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to add a datacenter:
Before
.. code-block:: cql
DESCRIBE KEYSPACE mykeyspace3;
CREATE KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>']} AND tablets = { 'enabled': true };
Add all the nodes to the new datacenter and then alter the keyspace one by one:
.. code-block:: cql
ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>']} AND tablets = { 'enabled': true };
ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>']} AND tablets = { 'enabled': true };
ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
After
.. code-block:: cql
DESCRIBE KEYSPACE mykeyspace3;
CREATE KEYSPACE mykeyspace3 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
#. If any vnode keyspace was altered, run ``nodetool rebuild`` on each node in the new datacenter, specifying the existing datacenter name in the rebuild command.
#. Run ``nodetool rebuild`` on each node in the new datacenter, specify the existing datacenter name in the rebuild command.
For example:
@@ -239,7 +198,7 @@ Add New DC
The rebuild ensures that the new nodes that were just added to the cluster will recognize the existing datacenters in the cluster.
#. If any vnode keyspace was altered, run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_
#. Run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_
#. If you are using ScyllaDB Monitoring, update the `monitoring stack <https://monitoring.docs.scylladb.com/stable/install/monitoring_stack.html#configure-scylla-nodes-from-files>`_ to monitor it. If you are using ScyllaDB Manager, make sure you install the `Manager Agent <https://manager.docs.scylladb.com/stable/install-scylla-manager-agent.html>`_ and Manager can access the new DC.

View File

@@ -40,14 +40,12 @@ Prerequisites
Procedure
---------
#. If there are vnode keyspaces in this DC, run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.
#. Run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.
For example:
If the ASIA-DC cluster is to be removed, then, run the ``nodetool repair -pr`` command on all the nodes in the ASIA-DC
#. If there are tablet keyspaces in this DC, run the ``nodetool cluster repair`` on an arbitrary node. The reason for running repair is to ensure that any updates stored only on the about-to-be-decommissioned replicas are propagated to the other replicas, before the replicas on the decommissioned datacenter are dropped.
#. ALTER every cluster KEYSPACE, so that the keyspaces will no longer replicate data to the decommissioned data-center.
For example:
@@ -75,33 +73,6 @@ Procedure
cqlsh> ALTER KEYSPACE nba WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3};
For tablet keyspaces, update the replication factor one by one:
.. code-block:: shell
cqlsh> DESCRIBE nba2
cqlsh> CREATE KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 2, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };
.. code-block:: shell
cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 1, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };
cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };
.. note::
If ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use rack list replication factor, so that the DC can be removed. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to remove a datacenter:
.. code-block:: shell
cqlsh> DESCRIBE nba3
cqlsh> CREATE KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4', 'RAC5'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
.. code-block:: shell
cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
#. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.

View File

@@ -58,12 +58,4 @@ See also
* `Blog: ScyllaDB Open Source 3.1: Efficiently Maintaining Consistency with Row-Level Repair <https://www.scylladb.com/2019/08/13/scylla-open-source-3-1-efficiently-maintaining-consistency-with-row-level-repair/>`_
Incremental Repair
------------------
Built on top of :ref:`Row-level Repair <row-level-repair>` and :doc:`Tablets </architecture/tablets>`, Incremental Repair enables frequent and quick repairs. For more details, see :doc:`Incremental Repair </features/incremental-repair>`.
Automatic Repair
----------------
Built on top of :doc:`Incremental Repair </features/incremental-repair>`, :doc:`Automatic Repair </features/automatic-repair>` offers repair scheduling and execution directly in ScyllaDB, without external processes.

View File

@@ -17,7 +17,7 @@ Glossary
The CAP Theorem is the notion that **C** (Consistency), **A** (Availability) and **P** (Partition Tolerance) of data are mutually dependent in a distributed system. Increasing any 2 of these factors will reduce the third. ScyllaDB chooses availability and partition tolerance over consistency. See :doc:`Fault Tolerance </architecture/architecture-fault-tolerance>`.
Cluster
One or multiple ScyllaDB nodes, acting in concert, which own a single contiguous token range. State is communicated between nodes in the cluster via :doc:`Raft </architecture/raft>`.
One or multiple ScyllaDB nodes, acting in concert, which own a single contiguous token range. State is communicated between nodes in the cluster via the Gossip protocol. See :doc:`Ring Architecture </architecture/ringarchitecture/index>`.
Clustering Key
A single or multi-column clustering key determines a rows uniqueness and sort order on disk within a partition. See :doc:`Ring Architecture </architecture/ringarchitecture/index>`.
@@ -157,18 +157,14 @@ Glossary
Table
A collection of columns fetched by row. Columns are ordered by Clustering Key. See :doc:`Ring Architecture </architecture/ringarchitecture/index>`.
Tablet
The name of a :term:`Token Range` in the `Tablet Model </architecture/tablets>` of assigning ownership of data in a cluster.
Time-window compaction strategy
TWCS is designed for time series data. See :doc:`Compaction Strategies</architecture/compaction/compaction-strategies/>`.
Token
A hash value calculated over the partition key of a table. All tokens calculated over any partition key of any table are part the same token domain. This allows unified assignment of partitions to nodes in a ScyllaDB cluster via using :term:`Token Range` as the basis of ownership assignment.
A value in a range, used to identify both nodes and partitions. Each node in a ScyllaDB cluster is given an (initial) token, which defines the end of the range a node handles. See :doc:`Ring Architecture </architecture/ringarchitecture/index>`.
Token Range
A contiguous range of :term:`Token` values, the basis of ownership assignment in a ScyllaDB cluster. A token range is defined by a start and end token. Each node in a ScyllaDB cluster owns a number of token ranges.
There are two models of distributing token ranges in the cluster, the :doc:`VNode Model </architecture/ringarchitecture/index>` and the `Tablet Model </architecture/tablets>`.
The total range of potential unique identifiers supported by the partitioner. By default, each ScyllaDB node in the cluster handles 256 token ranges. Each token range corresponds to a Vnode. Each range of hashes in turn is a segment of the total range of a given hash function. See :doc:`Ring Architecture </architecture/ringarchitecture/index>`.
Tombstone
A marker that indicates that data has been deleted. A large number of tombstones may impact read performance and disk usage, so an efficient tombstone garbage collection strategy should be employed. See :ref:`Tombstones GC options <ddl-tombstones-gc>`.

Some files were not shown because too many files have changed in this diff Show More