Compare commits: debug_form ... scylla-202 (146 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 9190d42863 |  |
|  | 09b0e1ba8b |  |
|  | 8a7f5f1428 |  |
|  | 185288f16e |  |
|  | 7dfcb53197 |  |
|  | d552244812 |  |
|  | 62b344cb55 |  |
|  | 173bfd627c |  |
|  | bf369326d6 |  |
|  | 864774fb00 |  |
|  | 49ed97cec8 |  |
|  | 81685b0d06 |  |
|  | 06013b2377 |  |
|  | 4cc5c2605f |  |
|  | 021851c5c5 |  |
|  | c4aa14c1a7 |  |
|  | 0fdb0961a2 |  |
|  | 2100ae2d0a |  |
|  | 51fc498314 |  |
|  | f4b938df09 |  |
|  | 0dfefc3f12 |  |
|  | 883e3e014a |  |
|  | 4ccb795beb |  |
|  | 9e02b0f45f |  |
|  | eb9b8dbf62 |  |
|  | 995df5dec6 |  |
|  | beb781b829 |  |
|  | 502b7f296d |  |
|  | b251ee02a4 |  |
|  | f26d08dde2 |  |
|  | 9cd1038c7a |  |
|  | fdae3e4f3a |  |
|  | d47e4898ea |  |
|  | 7bc87de838 |  |
|  | 2141b9b824 |  |
|  | aa50edbf17 |  |
|  | 7f836aa3ec |  |
|  | bd26803c1a |  |
|  | 2feed49285 |  |
|  | 3007cb6f37 |  |
|  | 1e2d1c7e85 |  |
|  | 55ad575c8f |  |
|  | 8982140cd9 |  |
|  | e90449f770 |  |
|  | 98fd5c5e45 |  |
|  | cca6a1c3dd |  |
|  | 9edd0ae3fb |  |
|  | dc3133b031 |  |
|  | 86554e6192 |  |
|  | 637618560b |  |
|  | 8c3c5777da |  |
|  | bb9a5261ec |  |
|  | d5d81cc066 |  |
|  | 6a438543c2 |  |
|  | 99a67484bf |  |
|  | cabf2845d9 |  |
|  | ff4a0fc87e |  |
|  | 0a89dbb4d4 |  |
|  | 19cbaa1be2 |  |
|  | 9cf0f0998d |  |
|  | f56e1760d7 |  |
|  | db4e3a664d |  |
|  | c292892d5f |  |
|  | d87467f77b |  |
|  | 6e92ee1bb2 |  |
|  | 4ecc402b79 |  |
|  | b27adefc16 |  |
|  | cad92d5100 |  |
|  | 44cc5ae30b |  |
|  | 05a5bd542a |  |
|  | 8a626bb458 |  |
|  | 3a56a0cf99 |  |
|  | 0cdac69aab |  |
|  | f04a3acf33 |  |
|  | ad716f9341 |  |
|  | 2edd87f2e1 |  |
|  | ba10e74523 |  |
|  | 5abc2fea9f |  |
|  | 2ab81f768b |  |
|  | cfebb52db0 |  |
|  | 37ef37e8ab |  |
|  | fdad814aa3 |  |
|  | 0257f7cc89 |  |
|  | 07bfd920e7 |  |
|  | 698ba5bd0b |  |
|  | d8c7303d14 |  |
|  | 9365adb2fb |  |
|  | 6a55396e90 |  |
|  | f4b79c1b1d |  |
|  | f633f57163 |  |
|  | 09ed4178a6 |  |
|  | 2bf7a0f65e |  |
|  | 5b15c52f1e |  |
|  | 26e17202f6 |  |
|  | b62e1b405b |  |
|  | f3d2a16e66 |  |
|  | eee99ebb3d |  |
|  | c248744c5a |  |
|  | 4ba3c08d45 |  |
|  | c8c21cc29c |  |
|  | e95689c96b |  |
|  | 6094f4b7b2 |  |
|  | ad64dc7c01 |  |
|  | bafd185087 |  |
|  | 07d1f8f48a |  |
|  | 523d529d27 |  |
|  | c8dbd43ed5 |  |
|  | 0cf9f41649 |  |
|  | dc89e2ea37 |  |
|  | 797f56cb45 |  |
|  | be1d418bc0 |  |
|  | 46923f7358 |  |
|  | 4032e95715 |  |
|  | eab10c00b1 |  |
|  | 091c3b4e22 |  |
|  | 19eadafdef |  |
|  | 358fc15893 |  |
|  | 32124d209e |  |
|  | c7f4bda459 |  |
|  | 568af3cd8d |  |
|  | bd694dd1a1 |  |
|  | 9672e0171f |  |
|  | 8cec41acf2 |  |
|  | d207de0d76 |  |
|  | edde4e878e |  |
|  | be1c674f1a |  |
|  | a7cff37024 |  |
|  | 9431bc5628 |  |
|  | 14db8375ac |  |
|  | 614020b5d5 |  |
|  | e091afb400 |  |
|  | edc46fe6a1 |  |
|  | f8b9b767c2 |  |
|  | 23d038b385 |  |
|  | 3e2d1384bf |  |
|  | bd7481e30c |  |
|  | 16d7b65754 |  |
|  | e30c01eae6 |  |
|  | d0f3725887 |  |
|  | c12168b7ef |  |
|  | 76c0162060 |  |
|  | c9620d9573 |  |
|  | 91cf77d016 |  |
|  | 2c2f0693ab |  |
|  | 2c73d0e6b5 |  |
|  | f94296e0ae |  |
@@ -18,7 +18,7 @@ jobs:
 
             // Regular expression pattern to check for "Fixes" prefix
             // Adjusted to dynamically insert the repository full name
-            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|([A-Z]+-\\d+))`;
+            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|(?:https://scylladb\\.atlassian\\.net/browse/)?([A-Z]+-\\d+))`;
             const regex = new RegExp(pattern);
 
             if (!regex.test(body)) {
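In practice, the only change here is that a Jira issue may now be referenced either by its bare key or by its full https://scylladb.atlassian.net/browse/ URL. A minimal sketch of the extended alternation, with `${repo}` expanded to a hypothetical `scylladb/scylladb` (the workflow substitutes the real repository name at runtime):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // Same alternation as the workflow's template literal, with ${repo}
    // hardcoded to a hypothetical "scylladb/scylladb" for illustration.
    const std::regex fixes(
        R"(Fixes:? ((?:#|scylladb/scylladb#|https://github\.com/scylladb/scylladb/issues/)(\d+))"
        R"(|(?:https://scylladb\.atlassian\.net/browse/)?([A-Z]+-\d+)))");

    std::cout << std::boolalpha;
    // The GitHub-issue form and both Jira forms now match.
    for (const std::string body : {
             "Fixes: #12345",
             "Fixes: SCYLLA-202",
             "Fixes: https://scylladb.atlassian.net/browse/SCYLLA-202"}) {
        std::cout << body << " -> " << std::regex_search(body, fixes) << '\n';
    }
}
```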
53  .github/workflows/call_backport_with_jira.yaml  vendored  Normal file
@@ -0,0 +1,53 @@
+name: Backport with Jira Integration
+
+on:
+  push:
+    branches:
+      - master
+      - next-*.*
+      - branch-*.*
+  pull_request_target:
+    types: [labeled, closed]
+    branches:
+      - master
+      - next
+      - next-*.*
+      - branch-*.*
+
+jobs:
+  backport-on-push:
+    if: github.event_name == 'push'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'push'
+      base_branch: ${{ github.ref }}
+      commits: ${{ github.event.before }}..${{ github.sha }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-on-label:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'labeled'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      head_commit: ${{ github.event.pull_request.base.sha }}
+      label_name: ${{ github.event.label.name }}
+      pr_state: ${{ github.event.pull_request.state }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-chain:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'chain'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      pr_body: ${{ github.event.pull_request.body }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
45  .github/workflows/trigger-scylla-ci.yaml  vendored
@@ -9,16 +9,53 @@ on:
 
 jobs:
   trigger-jenkins:
-    if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
+    if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
     runs-on: ubuntu-latest
     steps:
+      - name: Verify Org Membership
+        id: verify_author
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        shell: bash
+        run: |
+          if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then
+            AUTHOR="${{ github.event.pull_request.user.login }}"
+          else
+            AUTHOR="${{ github.event.comment.user.login }}"
+          fi
+          ORG="scylladb"
+          if gh api "/orgs/${ORG}/members/${AUTHOR}" --silent 2>/dev/null; then
+            echo "member=true" >> $GITHUB_OUTPUT
+          else
+            echo "::warning::${AUTHOR} is not a member of ${ORG}; skipping CI trigger."
+            echo "member=false" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Validate Comment Trigger
+        if: github.event_name == 'issue_comment'
+        id: verify_comment
+        shell: bash
+        run: |
+          BODY=$(cat << 'EOF'
+          ${{ github.event.comment.body }}
+          EOF
+          )
+          CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
+
+          if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
+            echo "trigger=true" >> $GITHUB_OUTPUT
+          else
+            echo "trigger=false" >> $GITHUB_OUTPUT
+          fi
+
       - name: Trigger Scylla-CI-Route Jenkins Job
+        if: steps.verify_author.outputs.member == 'true' && (github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true')
         env:
           JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
           JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
           JENKINS_URL: "https://jenkins.scylladb.com"
+          PR_NUMBER: "${{ github.event.issue.number || github.event.pull_request.number }}"
+          PR_REPO_NAME: "${{ github.event.repository.full_name }}"
         run: |
-          PR_NUMBER=${{ github.event.issue.number }}
-          PR_REPO_NAME=${{ github.event.repository.full_name }}
           curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
-            --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v
+            --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail
@@ -78,7 +78,7 @@ fi
 
 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2026.1.0-dev
+VERSION=2026.1.0
 
 if test -f version
 then
@@ -767,7 +767,7 @@ static future<bool> scan_table(
     // by tasking another node to take over scanning of the dead node's primary
     // ranges. What we do here is that this node will also check expiration
     // on its *secondary* ranges - but only those whose primary owner is down.
-    auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
+    auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
     if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
         if (!gossiper.is_alive(tablet_primary_replica.host)) {
             co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
@@ -515,6 +515,15 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
         auto sstables = parsed.GetArray() |
                 std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
                 std::ranges::to<std::vector>();
+        apilog.info("Restore invoked with following parameters: keyspace={}, table={}, endpoint={}, bucket={}, prefix={}, sstables_count={}, scope={}, primary_replica_only={}",
+                keyspace,
+                table,
+                endpoint,
+                bucket,
+                prefix,
+                sstables.size(),
+                scope,
+                primary_replica_only);
         auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
         co_return json::json_return_type(fmt::to_string(task_id));
     });
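With hypothetical parameter values, the added log statement would emit a single line along these lines (values illustrative only):

```
Restore invoked with following parameters: keyspace=ks1, table=t1, endpoint=s3.us-east-1.amazonaws.com, bucket=backups, prefix=snapshot-2025, sstables_count=42, scope=all, primary_replica_only=false
```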
@@ -778,6 +778,7 @@ compaction_manager::get_incremental_repair_read_lock(compaction::compaction_grou
         cmlog.debug("Get get_incremental_repair_read_lock for {} started", reason);
     }
     compaction::compaction_state& cs = get_compaction_state(&t);
+    auto gh = cs.gate.hold();
     auto ret = co_await cs.incremental_repair_lock.hold_read_lock();
     if (!reason.empty()) {
         cmlog.debug("Get get_incremental_repair_read_lock for {} done", reason);
@@ -791,6 +792,7 @@ compaction_manager::get_incremental_repair_write_lock(compaction::compaction_gro
         cmlog.debug("Get get_incremental_repair_write_lock for {} started", reason);
     }
     compaction::compaction_state& cs = get_compaction_state(&t);
+    auto gh = cs.gate.hold();
     auto ret = co_await cs.incremental_repair_lock.hold_write_lock();
     if (!reason.empty()) {
         cmlog.debug("Get get_incremental_repair_write_lock for {} done", reason);
@@ -1519,7 +1521,9 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
                 | std::views::transform(std::mem_fn(&sstables::sstable::run_identifier))
                 | std::ranges::to<std::unordered_set>());
     };
-    const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
+    const auto threshold = utils::get_local_injector().inject_parameter<size_t>("set_sstable_count_reduction_threshold")
+            .value_or(size_t(std::max(schema->max_compaction_threshold(), 32)));
 
     auto count = co_await num_runs_for_compaction();
     if (count <= threshold) {
         cmlog.trace("No need to wait for sstable count reduction in {}: {} <= {}",
@@ -1534,9 +1538,7 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
     auto& cstate = get_compaction_state(&t);
     try {
         while (can_perform_regular_compaction(t) && co_await num_runs_for_compaction() > threshold) {
-            co_await cstate.compaction_done.wait([this, &t] {
-                return !can_perform_regular_compaction(t);
-            });
+            co_await cstate.compaction_done.wait();
         }
     } catch (const broken_condition_variable&) {
         co_return;
@@ -2387,6 +2389,8 @@ future<> compaction_manager::remove(compaction_group_view& t, sstring reason) no
     if (!c_state.gate.is_closed()) {
         auto close_gate = c_state.gate.close();
         co_await stop_ongoing_compactions(reason, &t);
+        // Wait for users of incremental repair lock (can be either repair itself or maintenance compactions).
+        co_await c_state.incremental_repair_lock.write_lock();
         co_await std::move(close_gate);
     }
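The ordering here matters: the gate holder is taken before waiting on the lock, so the compaction state cannot be torn down while a waiter sits in the lock's queue, and `remove()` closes the gate first and only then drains lock holders. A minimal sketch of the pattern under assumed Seastar gate/rwlock semantics (type and function names are stand-ins, not the Scylla API):

```cpp
#include <seastar/core/coroutine.hh>
#include <seastar/core/gate.hh>
#include <seastar/core/rwlock.hh>

struct repair_lock_state {               // stand-in for compaction_state
    seastar::gate gate;
    seastar::rwlock incremental_repair_lock;
};

// Reader side: enter the gate first, then wait for the lock. If teardown has
// already begun, gate.hold() throws gate_closed_exception and we never queue up.
seastar::future<> with_read_section(repair_lock_state& cs) {
    auto gh = cs.gate.hold();
    auto lk = co_await cs.incremental_repair_lock.hold_read_lock();
    // ... read-side critical section ...
    // lk, then gh, are released on scope exit.
}

// Teardown side: close the gate (new entrants fail fast), then take the write
// lock to drain existing readers, mirroring compaction_manager::remove().
seastar::future<> teardown(repair_lock_state& cs) {
    auto closed = cs.gate.close();
    co_await cs.incremental_repair_lock.write_lock();
    cs.incremental_repair_lock.write_unlock();
    co_await std::move(closed);
}
```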
47  configure.py
@@ -730,28 +730,6 @@ vector_search_tests = set([
     'test/vector_search/rescoring_test'
 ])
 
-vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
-vector_search_validator_deps = set([
-    'test/vector_search_validator/build-validator',
-    'test/vector_search_validator/Cargo.toml',
-    'test/vector_search_validator/crates/validator/Cargo.toml',
-    'test/vector_search_validator/crates/validator/src/main.rs',
-    'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
-    'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
-    'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
-])
-
-vector_store_bin = 'vector-search-validator/bin/vector-store'
-vector_store_deps = set([
-    'test/vector_search_validator/build-env',
-    'test/vector_search_validator/build-vector-store',
-])
-
-vector_search_validator_bins = set([
-    vector_search_validator_bin,
-    vector_store_bin,
-])
-
 wasms = set([
     'wasm/return_input.wat',
     'wasm/test_complex_null_values.wat',
@@ -785,7 +763,7 @@ other = set([
     'iotune',
 ])
 
-all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins
+all_artifacts = apps | cpp_apps | tests | other | wasms
 
 arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -2582,11 +2560,10 @@ def write_build_file(f,
         description = RUST_LIB $out
     ''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
     f.write(
-        'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
+        'build {mode}-build: phony {artifacts} {wasms}\n'.format(
             mode=mode,
-            artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
+            artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
             wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
-            vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
         )
     )
     if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2616,7 +2593,7 @@ def write_build_file(f,
             continue
         profile_dep = modes[mode].get('profile_target', "")
 
-        if binary in other or binary in wasms or binary in vector_search_validator_bins:
+        if binary in other or binary in wasms:
             continue
         srcs = deps[binary]
         # 'scylla'
@@ -2727,11 +2704,10 @@ def write_build_file(f,
         )
 
     f.write(
-        'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
+        'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
             mode=mode,
             test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
             wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
-            vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
         )
     )
     f.write(
@@ -2899,19 +2875,6 @@ def write_build_file(f,
         'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
     )
 
-    f.write(textwrap.dedent(f'''\
-        rule build-vector-search-validator
-            command = test/vector_search_validator/build-validator $builddir
-        rule build-vector-store
-            command = test/vector_search_validator/build-vector-store $builddir
-    '''))
-    f.write(
-        'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
-    )
-    f.write(
-        'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
-    )
-
     f.write(textwrap.dedent(f'''\
         build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
         build dist-unified: phony dist-unified-tar
@@ -10,9 +10,41 @@
 #include "types/types.hh"
 #include "types/vector.hh"
 #include "exceptions/exceptions.hh"
+#include <span>
+#include <bit>
 
 namespace cql3 {
 namespace functions {
 
+namespace detail {
+
+std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension) {
+    if (!param) {
+        throw exceptions::invalid_request_exception("Cannot extract float vector from null parameter");
+    }
+
+    const size_t expected_size = dimension * sizeof(float);
+    if (param->size() != expected_size) {
+        throw exceptions::invalid_request_exception(
+            fmt::format("Invalid vector size: expected {} bytes for {} floats, got {} bytes",
+                expected_size, dimension, param->size()));
+    }
+
+    std::vector<float> result;
+    result.reserve(dimension);
+
+    bytes_view view(*param);
+    for (size_t i = 0; i < dimension; ++i) {
+        // read_simple handles network byte order (big-endian) conversion
+        uint32_t raw = read_simple<uint32_t>(view);
+        result.push_back(std::bit_cast<float>(raw));
+    }
+
+    return result;
+}
+
+} // namespace detail
+
 namespace {
 
 // The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
@@ -22,14 +54,14 @@ namespace {
 
 // You should only use this function if you need to preserve the original vectors and cannot normalize
 // them in advance.
-float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
+float compute_cosine_similarity(std::span<const float> v1, std::span<const float> v2) {
     double dot_product = 0.0;
     double squared_norm_a = 0.0;
     double squared_norm_b = 0.0;
 
     for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        double a = v1[i];
+        double b = v2[i];
 
         dot_product += a * b;
         squared_norm_a += a * a;
@@ -37,7 +69,7 @@ float compute_cosine_similarity(const std::vector<data_value>& v1, const std::ve
     }
 
     if (squared_norm_a == 0 || squared_norm_b == 0) {
-        throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
+        return std::numeric_limits<float>::quiet_NaN();
     }
 
     // The cosine similarity is in the range [-1, 1].
@@ -46,12 +78,12 @@ float compute_cosine_similarity(const std::vector<data_value>& v1, const std::ve
     return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
 }
 
-float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
+float compute_euclidean_similarity(std::span<const float> v1, std::span<const float> v2) {
     double sum = 0.0;
 
     for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        double a = v1[i];
+        double b = v2[i];
 
         double diff = a - b;
         sum += diff * diff;
@@ -65,12 +97,12 @@ float compute_euclidean_similarity(const std::vector<data_value>& v1, const std:
 
 // Assumes that both vectors are L2-normalized.
 // This similarity is intended as an optimized way to perform cosine similarity calculation.
-float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
+float compute_dot_product_similarity(std::span<const float> v1, std::span<const float> v2) {
     double dot_product = 0.0;
 
     for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        double a = v1[i];
+        double b = v2[i];
         dot_product += a * b;
     }
 
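For reference, the scores computed above can be written compactly. The cosine variant maps the raw cosine from $[-1,1]$ into $[0,1]$ (matching the `return` expression shown), the euclidean variant accumulates the squared distance (its final mapping lies outside this hunk), and under the stated assumption $\lVert a\rVert = \lVert b\rVert = 1$ the dot-product variant computes the same quantity as the cosine one:

```latex
\mathrm{sim}_{\cos}(a,b) = \frac{1}{2}\left(1 + \frac{a \cdot b}{\lVert a\rVert\,\lVert b\rVert}\right),
\qquad
d^2(a,b) = \sum_{i}(a_i - b_i)^2,
\qquad
\mathrm{sim}_{\mathrm{dot}}(a,b) = \frac{1 + a \cdot b}{2}.
```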
@@ -136,13 +168,15 @@ bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters)
         return std::nullopt;
     }
 
-    const auto& type = arg_types()[0];
-    data_value v1 = type->deserialize(*parameters[0]);
-    data_value v2 = type->deserialize(*parameters[1]);
-    const auto& v1_elements = value_cast<std::vector<data_value>>(v1);
-    const auto& v2_elements = value_cast<std::vector<data_value>>(v2);
+    // Extract dimension from the vector type
+    const auto& type = static_cast<const vector_type_impl&>(*arg_types()[0]);
+    size_t dimension = type.get_dimension();
 
-    float result = SIMILARITY_FUNCTIONS.at(_name)(v1_elements, v2_elements);
+    // Optimized path: extract floats directly from bytes, bypassing data_value overhead
+    std::vector<float> v1 = detail::extract_float_vector(parameters[0], dimension);
+    std::vector<float> v2 = detail::extract_float_vector(parameters[1], dimension);
+
+    float result = SIMILARITY_FUNCTIONS.at(_name)(v1, v2);
     return float_type->decompose(result);
 }
@@ -11,6 +11,7 @@
 #include "native_scalar_function.hh"
 #include "cql3/assignment_testable.hh"
 #include "cql3/functions/function_name.hh"
+#include <span>
 
 namespace cql3 {
 namespace functions {
@@ -19,7 +20,7 @@ static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::nati
 static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
 static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");
 
-using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
+using similarity_function_t = float (*)(std::span<const float>, std::span<const float>);
 extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;
 
 std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
@@ -33,5 +34,14 @@ public:
     virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
 };
 
+namespace detail {
+
+// Extract float vector directly from serialized bytes, bypassing data_value overhead.
+// This is an internal API exposed for testing purposes.
+// Vector<float, N> wire format: N floats as big-endian uint32_t values, 4 bytes each.
+std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension);
+
+} // namespace detail
+
 } // namespace functions
 } // namespace cql3
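A self-contained sketch of the decode step the new helper performs (a standalone re-implementation for illustration, not the Scylla API): each element arrives as a big-endian 32-bit word that is byte-swapped to host order and reinterpreted as a float.

```cpp
#include <bit>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <span>
#include <stdexcept>
#include <vector>

// Standalone analogue of detail::extract_float_vector, for illustration only.
std::vector<float> decode_be_floats(std::span<const uint8_t> wire, size_t dimension) {
    if (wire.size() != dimension * sizeof(float)) {
        throw std::invalid_argument("unexpected vector size");
    }
    std::vector<float> out;
    out.reserve(dimension);
    for (size_t i = 0; i < dimension; ++i) {
        uint32_t be;
        std::memcpy(&be, wire.data() + i * 4, 4);
        if constexpr (std::endian::native == std::endian::little) {
            be = std::byteswap(be);          // wire format is big-endian (C++23)
        }
        out.push_back(std::bit_cast<float>(be));
    }
    return out;
}

int main() {
    // 1.0f is 0x3f800000 and 2.0f is 0x40000000 in big-endian wire order.
    const uint8_t wire[] = {0x3f, 0x80, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00};
    for (float f : decode_be_floats(wire, 2)) {
        std::cout << f << '\n';   // prints 1 then 2
    }
}
```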
20  cql3/query_result_printer.hh  Normal file
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <ostream>
+
+namespace cql3 {
+
+class result;
+
+void print_query_results_text(std::ostream& os, const result& result);
+void print_query_results_json(std::ostream& os, const result& result);
+
+} // namespace cql3
@@ -9,8 +9,10 @@
 */
 
 #include <cstdint>
+#include "types/json_utils.hh"
 #include "utils/assert.hh"
 #include "utils/hashers.hh"
+#include "utils/rjson.hh"
 #include "cql3/result_set.hh"
 
 namespace cql3 {
@@ -195,4 +197,85 @@ make_empty_metadata() {
     return empty_metadata_cache;
 }
 
+void print_query_results_text(std::ostream& os, const cql3::result& result) {
+    const auto& metadata = result.get_metadata();
+    const auto& column_metadata = metadata.get_names();
+
+    struct column_values {
+        size_t max_size{0};
+        sstring header_format;
+        sstring row_format;
+        std::vector<sstring> values;
+
+        void add(sstring value) {
+            max_size = std::max(max_size, value.size());
+            values.push_back(std::move(value));
+        }
+    };
+
+    std::vector<column_values> columns;
+    columns.resize(column_metadata.size());
+
+    for (size_t i = 0; i < column_metadata.size(); ++i) {
+        columns[i].add(column_metadata[i]->name->text());
+    }
+
+    for (const auto& row : result.result_set().rows()) {
+        for (size_t i = 0; i < row.size(); ++i) {
+            if (row[i]) {
+                columns[i].add(column_metadata[i]->type->to_string(linearized(managed_bytes_view(*row[i]))));
+            } else {
+                columns[i].add("");
+            }
+        }
+    }
+
+    std::vector<sstring> separators(columns.size(), sstring());
+    for (size_t i = 0; i < columns.size(); ++i) {
+        auto& col_values = columns[i];
+        col_values.header_format = seastar::format(" {{:<{}}} ", col_values.max_size);
+        col_values.row_format = seastar::format(" {{:>{}}} ", col_values.max_size);
+        for (size_t c = 0; c < col_values.max_size; ++c) {
+            separators[i] += "-";
+        }
+    }
+
+    for (size_t r = 0; r < result.result_set().rows().size() + 1; ++r) {
+        std::vector<sstring> row;
+        row.reserve(columns.size());
+        for (size_t i = 0; i < columns.size(); ++i) {
+            const auto& format = r == 0 ? columns[i].header_format : columns[i].row_format;
+            row.push_back(fmt::format(fmt::runtime(std::string_view(format)), columns[i].values[r]));
+        }
+        fmt::print(os, "{}\n", fmt::join(row, "|"));
+        if (!r) {
+            fmt::print(os, "-{}-\n", fmt::join(separators, "-+-"));
+        }
+    }
+}
+
+void print_query_results_json(std::ostream& os, const cql3::result& result) {
+    const auto& metadata = result.get_metadata();
+    const auto& column_metadata = metadata.get_names();
+
+    rjson::streaming_writer writer(os);
+
+    writer.StartArray();
+    for (const auto& row : result.result_set().rows()) {
+        writer.StartObject();
+        for (size_t i = 0; i < row.size(); ++i) {
+            writer.Key(column_metadata[i]->name->text());
+            if (!row[i] || row[i]->empty()) {
+                writer.Null();
+                continue;
+            }
+            const auto value = to_json_string(*column_metadata[i]->type, *row[i]);
+            const auto type = to_json_type(*column_metadata[i]->type, *row[i]);
+            writer.RawValue(value, type);
+        }
+        writer.EndObject();
+    }
+    writer.EndArray();
+}
+
 }
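To make the two formats concrete: for a hypothetical result with columns `name` and `value` and rows (`foo`, 10) and (`bar`, null), the text printer pads headers left-aligned and values right-aligned, while the JSON printer emits one object per row with `null` for empty cells. Sketched by hand from the code above, the output would look roughly like:

```
 name | value 
------+-------
  foo |    10 
  bar |       

[{"name": "foo", "value": 10}, {"name": "bar", "value": null}]
```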
@@ -23,6 +23,7 @@
 #include "index/vector_index.hh"
 #include "schema/schema.hh"
 #include "service/client_state.hh"
+#include "service/paxos/paxos_state.hh"
 #include "types/types.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/cql_statement.hh"
@@ -329,6 +330,19 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
             "*/",
             *table_desc.create_statement);
 
         table_desc.create_statement = std::move(os).to_managed_string();
+    } else if (service::paxos::paxos_store::try_get_base_table(name)) {
+        // Paxos state table is internally managed by Scylla and it shouldn't be exposed to the user.
+        // The table is allowed to be described as a comment to ease administrative work but it's hidden from all listings.
+        fragmented_ostringstream os{};
+
+        fmt::format_to(os.to_iter(),
+            "/* Do NOT execute this statement! It's only for informational purposes.\n"
+            "   A paxos state table is created automatically when enabling LWT on a base table.\n"
+            "\n{}\n"
+            "*/",
+            *table_desc.create_statement);
+
+        table_desc.create_statement = std::move(os).to_managed_string();
     }
     result.push_back(std::move(table_desc));
@@ -364,7 +378,7 @@
 future<std::vector<description>> tables(const data_dictionary::database& db, const lw_shared_ptr<keyspace_metadata>& ks, std::optional<bool> with_internals = std::nullopt) {
     auto& replica_db = db.real_database();
     auto tables = ks->tables() | std::views::filter([&replica_db] (const schema_ptr& s) {
-        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name());
+        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name()) && !service::paxos::paxos_store::try_get_base_table(s->cf_name());
     }) | std::ranges::to<std::vector<schema_ptr>>();
     std::ranges::sort(tables, std::ranges::less(), std::mem_fn(&schema::cf_name));
@@ -55,8 +55,21 @@ int32_t batchlog_shard_of(db_clock::time_point written_at) {
     return hash & ((1ULL << batchlog_shard_bits) - 1);
 }
 
+bool is_batchlog_v1(const schema& schema) {
+    return schema.cf_name() == system_keyspace::BATCHLOG;
+}
+
 std::pair<partition_key, clustering_key>
 get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, int32_t batchlog_shard, db_clock::time_point written_at, std::optional<utils::UUID> id) {
+    if (is_batchlog_v1(schema)) {
+        if (!id) {
+            on_internal_error(blogger, "get_batchlog_key(): key for batchlog v1 requires batchlog id");
+        }
+        auto pkey = partition_key::from_single_value(schema, {serialized(*id)});
+        auto ckey = clustering_key::make_empty();
+        return std::pair(std::move(pkey), std::move(ckey));
+    }
+
     auto pkey = partition_key::from_exploded(schema, {serialized(version), serialized(int8_t(stage)), serialized(batchlog_shard)});
 
     std::vector<bytes> ckey_components;
@@ -85,6 +98,14 @@ mutation get_batchlog_mutation_for(schema_ptr schema, managed_bytes data, int32_
     auto cdef_data = schema->get_column_definition(to_bytes("data"));
     m.set_cell(ckey, *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));
 
+    if (is_batchlog_v1(*schema)) {
+        auto cdef_version = schema->get_column_definition(to_bytes("version"));
+        m.set_cell(ckey, *cdef_version, atomic_cell::make_live(*cdef_version->type, timestamp, serialized(version)));
+
+        auto cdef_written_at = schema->get_column_definition(to_bytes("written_at"));
+        m.set_cell(ckey, *cdef_written_at, atomic_cell::make_live(*cdef_written_at->type, timestamp, serialized(now)));
+    }
+
     return m;
 }
@@ -122,9 +143,10 @@ mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clo
 const std::chrono::seconds db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;
 
-db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
+db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config)
     : _qp(qp)
     , _sys_ks(sys_ks)
+    , _fs(fs)
     , _replay_timeout(config.replay_timeout)
     , _replay_rate(config.replay_rate)
     , _delay(config.delay)
@@ -300,23 +322,156 @@ future<> db::batchlog_manager::maybe_migrate_v1_to_v2() {
     });
 }
 
-future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
-    co_await maybe_migrate_v1_to_v2();
+namespace {
 
-typedef db_clock::rep clock_type;
+using clock_type = db_clock::rep;
 
+struct replay_stats {
+    std::optional<db_clock::time_point> min_too_fresh;
+    bool need_cleanup = false;
+};
+
+} // anonymous namespace
+
+static future<db::all_batches_replayed> process_batch(
+        cql3::query_processor& qp,
+        db::batchlog_manager::stats& stats,
+        db::batchlog_manager::post_replay_cleanup cleanup,
+        utils::rate_limiter& limiter,
+        schema_ptr schema,
+        std::unordered_map<int32_t, replay_stats>& replay_stats_per_shard,
+        const db_clock::time_point now,
+        db_clock::duration replay_timeout,
+        std::chrono::seconds write_timeout,
+        const cql3::untyped_result_set::row& row) {
+    const bool is_v1 = db::is_batchlog_v1(*schema);
+    const auto stage = is_v1 ? db::batchlog_stage::initial : static_cast<db::batchlog_stage>(row.get_as<int8_t>("stage"));
+    const auto batch_shard = is_v1 ? 0 : row.get_as<int32_t>("shard");
+    auto written_at = row.get_as<db_clock::time_point>("written_at");
+    auto id = row.get_as<utils::UUID>("id");
+    // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
+    auto timeout = replay_timeout;
+
+    if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
+        blogger.debug("Skipping batch replay due to skip_batch_replay injection");
+        co_return db::all_batches_replayed::no;
+    }
+
+    auto data = row.get_blob_unfragmented("data");
+
+    blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
+
+    utils::chunked_vector<mutation> mutations;
+    bool send_failed = false;
+
+    auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
+
+    try {
+        utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
+        auto in = ser::as_input_stream(data);
+        while (in.size()) {
+            auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
+            const auto tbl = qp.db().try_find_table(fm.column_family_id());
+            if (!tbl) {
+                continue;
+            }
+            if (written_at <= tbl->get_truncation_time()) {
+                continue;
+            }
+            schema_ptr s = tbl->schema();
+            if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
+                timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
+            }
+            fms.emplace_back(std::move(fm), std::move(s));
+        }
+
+        if (now < written_at + timeout) {
+            blogger.debug("Skipping replay of {}, too fresh", id);
+
+            shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
+
+            co_return db::all_batches_replayed::no;
+        }
+
+        auto size = data.size();
+
+        for (const auto& [fm, s] : fms) {
+            mutations.emplace_back(fm.to_mutation(s));
+            co_await coroutine::maybe_yield();
+        }
+
+        if (!mutations.empty()) {
+            const auto ttl = [written_at]() -> clock_type {
+                /*
+                 * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
+                 * This ensures that deletes aren't "undone" by an old batch replay.
+                 */
+                auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
+                warn(unimplemented::cause::HINT);
+#if 0
+                for (auto& m : *mutations) {
+                    unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
+                }
+#endif
+                return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
+            }();
+
+            if (ttl > 0) {
+                // Origin does the send manually, however I can't see a super great reason to do so.
+                // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
+                // in both cases.
+                // FIXME: verify that the above is reasonably true.
+                co_await limiter.reserve(size);
+                stats.write_attempts += mutations.size();
+                auto timeout = db::timeout_clock::now() + write_timeout;
+                if (cleanup) {
+                    co_await qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
+                } else {
+                    co_await qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
+                }
+            }
+        }
+    } catch (data_dictionary::no_such_keyspace& ex) {
+        // should probably ignore and drop the batch
+    } catch (const data_dictionary::no_such_column_family&) {
+        // As above -- we should drop the batch if the table doesn't exist anymore.
+    } catch (...) {
+        blogger.warn("Replay failed (will retry): {}", std::current_exception());
+        // timeout, overload etc.
+        // Do _not_ remove the batch, assuming we got a node write error.
+        // Since we don't have hints (which origin is satisfied with),
+        // we have to resort to keeping this batch to next lap.
+        if (is_v1 || !cleanup || stage == db::batchlog_stage::failed_replay) {
+            co_return db::all_batches_replayed::no;
+        }
+        send_failed = true;
+    }
+
+    auto& sp = qp.proxy();
+
+    if (send_failed) {
+        blogger.debug("Moving batch {} to stage failed_replay", id);
+        auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, db::batchlog_stage::failed_replay, written_at, id);
+        co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+    }
+
+    // delete batch
+    auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
+    co_await qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+
+    shard_written_at.need_cleanup = true;
+
+    co_return db::all_batches_replayed(!send_failed);
+}
+
+future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v1(post_replay_cleanup) {
     db::all_batches_replayed all_replayed = all_batches_replayed::yes;
     // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
     // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
     auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
-    auto limiter = make_lw_shared<utils::rate_limiter>(throttle);
+    utils::rate_limiter limiter(throttle);
 
-    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
-
-    struct replay_stats {
-        std::optional<db_clock::time_point> min_too_fresh;
-        bool need_cleanup = false;
-    };
+    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
 
     std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
 
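The TTL adjustment inside `process_batch` can be stated compactly: with $t_w$ the batch's `written_at` timestamp and $t$ the current time, the hint TTL is reduced by the time the batch already spent in the batchlog, so replaying an old batch cannot resurrect data whose deletions have since expired:

```latex
\mathrm{ttl}_{\mathrm{adjusted}} = \mathrm{ttl}_{\mathrm{unadjusted}} - (t - t_w)
```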
@@ -324,125 +479,49 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
     // same across a whole prefix of written_at (across all ids).
     const auto now = db_clock::now();
 
-    auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
-        const auto stage = static_cast<batchlog_stage>(row.get_as<int8_t>("stage"));
-        const auto batch_shard = row.get_as<int32_t>("shard");
-        auto written_at = row.get_as<db_clock::time_point>("written_at");
-        auto id = row.get_as<utils::UUID>("id");
-        // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
-        auto timeout = _replay_timeout;
+    auto batch = [this, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
+        all_replayed = all_replayed && co_await process_batch(_qp, _stats, post_replay_cleanup::no, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
+        co_return stop_iteration::no;
+    };
 
-        if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
-            blogger.debug("Skipping batch replay due to skip_batch_replay injection");
-            all_replayed = all_batches_replayed::no;
-            co_return stop_iteration::no;
-        }
+    co_await with_gate(_gate, [this, &all_replayed, batch = std::move(batch)] () mutable -> future<> {
+        blogger.debug("Started replayAllFailedBatches");
+        co_await utils::get_local_injector().inject("add_delay_to_batch_replay", std::chrono::milliseconds(1000));
 
-        auto data = row.get_blob_unfragmented("data");
+        auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
 
-        blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
+        co_await _qp.query_internal(
+            format("SELECT * FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
+            db::consistency_level::ONE,
+            {},
+            page_size,
+            batch);
 
-        utils::chunked_vector<mutation> mutations;
-        bool send_failed = false;
+        blogger.debug("Finished replayAllFailedBatches with all_replayed: {}", all_replayed);
+    });
 
-        auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
+    co_return all_replayed;
+}
 
-        try {
-            utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
-            auto in = ser::as_input_stream(data);
-            while (in.size()) {
-                auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
-                const auto tbl = _qp.db().try_find_table(fm.column_family_id());
-                if (!tbl) {
-                    continue;
-                }
-                if (written_at <= tbl->get_truncation_time()) {
-                    continue;
-                }
-                schema_ptr s = tbl->schema();
-                if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
-                    timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
-                }
-                fms.emplace_back(std::move(fm), std::move(s));
-            }
+future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v2(post_replay_cleanup cleanup) {
+    co_await maybe_migrate_v1_to_v2();
 
-            if (now < written_at + timeout) {
-                blogger.debug("Skipping replay of {}, too fresh", id);
+    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
+    // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
+    // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
+    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
+    utils::rate_limiter limiter(throttle);
 
-                shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
+    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
 
-                co_return stop_iteration::no;
-            }
+    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
 
-            auto size = data.size();
-
-            for (const auto& [fm, s] : fms) {
-                mutations.emplace_back(fm.to_mutation(s));
-                co_await coroutine::maybe_yield();
-            }
-
-            if (!mutations.empty()) {
-                const auto ttl = [written_at]() -> clock_type {
-                    /*
-                     * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
-                     * This ensures that deletes aren't "undone" by an old batch replay.
-                     */
-                    auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
-                    warn(unimplemented::cause::HINT);
-#if 0
-                    for (auto& m : *mutations) {
-                        unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
-                    }
-#endif
-                    return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
-                }();
-
-                if (ttl > 0) {
-                    // Origin does the send manually, however I can't see a super great reason to do so.
-                    // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
-                    // in both cases.
-                    // FIXME: verify that the above is reasonably true.
-                    co_await limiter->reserve(size);
-                    _stats.write_attempts += mutations.size();
-                    auto timeout = db::timeout_clock::now() + write_timeout;
-                    if (cleanup) {
-                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
-                    } else {
-                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
-                    }
-                }
-            }
-        } catch (data_dictionary::no_such_keyspace& ex) {
-            // should probably ignore and drop the batch
-        } catch (const data_dictionary::no_such_column_family&) {
-            // As above -- we should drop the batch if the table doesn't exist anymore.
-        } catch (...) {
-            blogger.warn("Replay failed (will retry): {}", std::current_exception());
-            all_replayed = all_batches_replayed::no;
-            // timeout, overload etc.
-            // Do _not_ remove the batch, assuming we got a node write error.
-            // Since we don't have hints (which origin is satisfied with),
-            // we have to resort to keeping this batch to next lap.
-            if (!cleanup || stage == batchlog_stage::failed_replay) {
-                co_return stop_iteration::no;
-            }
-            send_failed = true;
-        }
-
-        auto& sp = _qp.proxy();
-
-        if (send_failed) {
-            blogger.debug("Moving batch {} to stage failed_replay", id);
-            auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, batchlog_stage::failed_replay, written_at, id);
-            co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-        }
-
-        // delete batch
-        auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
-        co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-
-        shard_written_at.need_cleanup = true;
+    // Use a stable `now` across all batches, so skip/replay decisions are the
+    // same across a whole prefix of written_at (across all ids).
+    const auto now = db_clock::now();
+
+    auto batch = [this, cleanup, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
+        all_replayed = all_replayed && co_await process_batch(_qp, _stats, cleanup, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
+        co_return stop_iteration::no;
+    };
 
@@ -501,3 +580,10 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
 
     co_return all_replayed;
 }
+
+future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
+    if (_fs.batchlog_v2) {
+        return replay_all_failed_batches_v2(cleanup);
+    }
+    return replay_all_failed_batches_v1(cleanup);
+}
@@ -27,6 +27,12 @@ class query_processor;
 
 } // namespace cql3
 
+namespace gms {
+
+class feature_service;
+
+} // namespace gms
+
 namespace db {
 
 class system_keyspace;
@@ -49,6 +55,11 @@ class batchlog_manager : public peering_sharded_service<batchlog_manager> {
 public:
     using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;
 
+    struct stats {
+        uint64_t write_attempts = 0;
+    };
+
 private:
     static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
     static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
@@ -56,14 +67,13 @@ private:
 
     using clock_type = lowres_clock;
 
-    struct stats {
-        uint64_t write_attempts = 0;
-    } _stats;
+    stats _stats;
 
     seastar::metrics::metric_groups _metrics;
 
     cql3::query_processor& _qp;
     db::system_keyspace& _sys_ks;
+    gms::feature_service& _fs;
     db_clock::duration _replay_timeout;
     uint64_t _replay_rate;
     std::chrono::milliseconds _delay;
@@ -84,12 +94,14 @@ private:
 
     future<> maybe_migrate_v1_to_v2();
 
+    future<all_batches_replayed> replay_all_failed_batches_v1(post_replay_cleanup cleanup);
+    future<all_batches_replayed> replay_all_failed_batches_v2(post_replay_cleanup cleanup);
     future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
 public:
     // Takes a QP, not a distributed one, because this object is supposed
     // to be per shard and does no dispatching beyond delegating to the
     // shard qp (which is what you feed here).
-    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, batchlog_manager_config config);
+    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config);
 
     // abort the replay loop and return its future.
     future<> drain();
@@ -102,7 +114,7 @@ public:
         return _last_replay;
     }
 
-    const stats& stats() const {
+    const stats& get_stats() const {
         return _stats;
     }
 private:
@@ -1986,13 +1986,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
         }
         continue;
     } catch (shutdown_marker&) {
         _reserve_segments.abort(std::current_exception());
         break;
     } catch (...) {
         clogger.warn("Exception in segment reservation: {}", std::current_exception());
     }
     co_await sleep(100ms);
 }
 _reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
 }
 
 future<std::vector<db::commitlog::descriptor>>
@@ -1291,7 +1291,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
     , ignore_dead_nodes_for_replace(this, "ignore_dead_nodes_for_replace", value_status::Used, "", "List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
     , override_decommission(this, "override_decommission", value_status::Deprecated, false, "Set true to force a decommissioned node to join the cluster (cannot be set if consistent-cluster-management is enabled).")
     , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based.")
-    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild,bootstrap,decommission", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
+    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
     , enable_compacting_data_for_streaming_and_repair(this, "enable_compacting_data_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, true, "Enable the compacting reader, which compacts the data for streaming and repair (load'n'stream included) before sending it to, or synchronizing it with peers. Can reduce the amount of data to be processed by removing dead data, but adds CPU overhead.")
     , enable_tombstone_gc_for_streaming_and_repair(this, "enable_tombstone_gc_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, false,
         "If the compacting reader is enabled for streaming and repair (see enable_compacting_data_for_streaming_and_repair), allow it to garbage-collect tombstones."
@@ -1139,14 +1139,17 @@ future<> schema_applier::finalize_tables_and_views() {
     // was already dropped (see https://github.com/scylladb/scylla/issues/5614)
     for (auto& dropped_view : diff.tables_and_views.local().views.dropped) {
         auto s = dropped_view.get();
+        co_await _ss.local().on_cleanup_for_drop_table(s->id());
         co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
     }
     for (auto& dropped_table : diff.tables_and_views.local().tables.dropped) {
         auto s = dropped_table.get();
+        co_await _ss.local().on_cleanup_for_drop_table(s->id());
         co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
     }
     for (auto& dropped_cdc : diff.tables_and_views.local().cdc.dropped) {
         auto s = dropped_cdc.get();
+        co_await _ss.local().on_cleanup_for_drop_table(s->id());
         co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
     }
@@ -1714,7 +1714,9 @@ std::unordered_set<dht::token> decode_tokens(const set_type_impl::native_type& t
|
||||
std::unordered_set<dht::token> tset;
|
||||
for (auto& t: tokens) {
|
||||
auto str = value_cast<sstring>(t);
|
||||
SCYLLA_ASSERT(str == dht::token::from_sstring(str).to_sstring());
|
||||
if (str != dht::token::from_sstring(str).to_sstring()) {
|
||||
on_internal_error(slogger, format("decode_tokens: invalid token string '{}'", str));
|
||||
}
|
||||
tset.insert(dht::token::from_sstring(str));
|
||||
}
|
||||
return tset;
|
||||
@@ -3191,7 +3193,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
};
}
} else if (must_have_tokens(nstate)) {
on_fatal_internal_error(slogger, format(
on_internal_error(slogger, format(
"load_topology_state: node {} in {} state but missing ring slice", host_id, nstate));
}
}
@@ -3273,7 +3275,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
// Currently, at most one node at a time can be in transitioning state.
if (!map->empty()) {
const auto& [other_id, other_rs] = *map->begin();
on_fatal_internal_error(slogger, format(
on_internal_error(slogger, format(
"load_topology_state: found two nodes in transitioning state: {} in {} state and {} in {} state",
other_id, other_rs.state, host_id, nstate));
}
@@ -3331,8 +3333,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
format("SELECT count(range_end) as cnt FROM {}.{} WHERE key = '{}' AND id = ?",
NAME, CDC_GENERATIONS_V3, cdc::CDC_GENERATIONS_V3_KEY),
gen_id.id);
SCYLLA_ASSERT(gen_rows);
if (gen_rows->empty()) {
if (!gen_rows || gen_rows->empty()) {
on_internal_error(slogger, format(
"load_topology_state: last committed CDC generation time UUID ({}) present, but data missing", gen_id.id));
}
@@ -215,6 +215,8 @@ public:
static constexpr auto BUILT_VIEWS = "built_views";
static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
static constexpr auto CDC_LOCAL = "cdc_local";
static constexpr auto CDC_TIMESTAMPS = "cdc_timestamps";
static constexpr auto CDC_STREAMS = "cdc_streams";

// auth
static constexpr auto ROLES = "roles";
@@ -1345,8 +1345,8 @@ public:

private:
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_timestamps");
return schema_builder(system_keyspace::NAME, "cdc_timestamps", std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS);
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS, std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::partition_key)
.with_column("timestamp", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
@@ -1428,8 +1428,8 @@ public:
}
private:
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_streams");
return schema_builder(system_keyspace::NAME, "cdc_streams", std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_STREAMS);
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_STREAMS, std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::partition_key)
.with_column("timestamp", timestamp_type, column_kind::clustering_key)
6 dist/docker/redhat/build_docker.sh vendored
@@ -97,7 +97,9 @@ bcp LICENSE-ScyllaDB-Source-Available.md /licenses/

run microdnf clean all
run microdnf --setopt=tsflags=nodocs -y update
run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip
run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip cpio
# Extract only systemctl binary from systemd package to avoid installing the whole systemd in the container.
run bash -rc "microdnf download systemd && rpm2cpio systemd-*.rpm | cpio -idmv ./usr/bin/systemctl && rm -rf systemd-*.rpm"
run curl -L --output /etc/yum.repos.d/scylla.repo ${repo_file_url}
run pip3 install --no-cache-dir --prefix /usr supervisor
run bash -ec "echo LANG=C.UTF-8 > /etc/locale.conf"
@@ -106,6 +108,8 @@ run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
run mkdir -p /var/log/scylla
run chown -R scylla:scylla /var/lib/scylla
run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --network-stack posix"/' /etc/sysconfig/scylla-server
# Cleanup packages not needed in the final image and clean package manager cache to reduce image size.
run bash -rc "microdnf remove -y cpio && microdnf clean all"

run mkdir -p /opt/scylladb/supervisor
run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
@@ -5,6 +5,10 @@

/stable/kb/perftune-modes-sync.html: /stable/kb/index.html

# Remove the troubleshooting page relevant for Open Source only

/stable/troubleshooting/missing-dotmount-files.html: /troubleshooting/index.html

# Move the driver information to another project

/stable/using-scylla/drivers/index.html: https://docs.scylladb.com/stable/drivers/index.html
@@ -142,10 +142,6 @@ want modify a non-top-level attribute directly (e.g., a.b[3].c) need RMW:
Alternator implements such requests by reading the entire top-level
attribute a, modifying only a.b[3].c, and then writing back a.

Currently, Alternator doesn't use Tablets. That's because Alternator relies
on LWT (lightweight transactions), and LWT is not supported in keyspaces
with Tablets enabled.

```{eval-rst}
.. toctree::
:maxdepth: 2
@@ -187,6 +187,23 @@ You can create a keyspace with tablets enabled with the ``tablets = {'enabled':
the keyspace schema with ``tablets = { 'enabled': false }`` or
``tablets = { 'enabled': true }``.

.. _keyspace-rf-rack-valid-to-enforce-rack-list:

Enforcing Rack-List Replication for Tablet Keyspaces
------------------------------------------------------------------

``rf_rack_valid_keyspaces`` is a legacy option that ensures that all keyspaces with tablets enabled are
:term:`RF-rack-valid <RF-rack-valid keyspace>`.

Requiring every tablet keyspace to use the rack list replication factor exclusively is enough to guarantee the keyspace is
:term:`RF-rack-valid <RF-rack-valid keyspace>`. It reduces restrictions and provides stronger guarantees compared
to the ``rf_rack_valid_keyspaces`` option.

To enforce rack lists in tablet keyspaces, use the ``enforce_rack_list`` option. It can be set only if all tablet keyspaces use
rack lists. To ensure that, follow the :ref:`conversion to rack list replication factor <conversion-to-rack-list-rf>` procedure.
After that, restart all nodes in the cluster with ``enforce_rack_list`` enabled and ``rf_rack_valid_keyspaces`` disabled. Make
sure to avoid setting or updating the replication factor (with CREATE KEYSPACE or ALTER KEYSPACE) while nodes are being restarted.
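A minimal ``scylla.yaml`` sketch for the restart step above, assuming only the two option names introduced in this section (values follow the procedure; nothing else is implied):

.. code-block:: yaml

   # Require every tablet keyspace to use a rack-list replication factor.
   enforce_rack_list: true
   # Disable the legacy check that enforce_rack_list supersedes.
   rf_rack_valid_keyspaces: false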
.. _tablets-limitations:

Limitations and Unsupported Features
@@ -200,8 +200,6 @@ for two cases. One is setting replication factor to 0, in which case the number
The other is when the numeric replication factor is equal to the current number of replicas
for a given datacenter, in which case the current rack list is preserved.

Altering from a numeric replication factor to a rack list is not supported yet.

Note that when ``ALTER`` ing keyspaces and supplying ``replication_factor``,
auto-expansion will only *add* new datacenters for safety, it will not alter
existing datacenters or remove any even if they are no longer in the cluster.
@@ -424,6 +422,21 @@ Altering from a rack list to a numeric replication factor is not supported.

Keyspaces which use rack lists are :term:`RF-rack-valid <RF-rack-valid keyspace>` if each rack in the rack list contains at least one node (excluding :doc:`zero-token nodes </architecture/zero-token-nodes>`).

.. _conversion-to-rack-list-rf:

Conversion to rack-list replication factor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To migrate a keyspace from a numeric replication factor to a rack-list replication factor, provide the rack-list replication factor explicitly in the ALTER KEYSPACE statement. The number of racks in the list must be equal to the numeric replication factor. The replication factor can be converted in any number of DCs at once. In a statement that converts the replication factor, no replication factor updates (increase or decrease) are allowed in any DC.

.. code-block:: cql

CREATE KEYSPACE Excelsior
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 1} AND tablets = { 'enabled': true };

ALTER KEYSPACE Excelsior
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : ['RAC1', 'RAC2', 'RAC3'], 'dc2' : ['RAC4']} AND tablets = { 'enabled': true };

.. _drop-keyspace-statement:

DROP KEYSPACE
@@ -281,7 +281,8 @@ For example::
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;

Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
or columns provided in the definition of the index.

For example::
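The example itself falls outside this hunk; a sketch of such a filtered vector query, reusing the ``ImageEmbeddings`` table and the ``category`` filtering column from the neighboring vector index examples (the filter value is made up):

.. code-block:: cql

   SELECT * FROM ImageEmbeddings
   WHERE category = 'landscape'
   ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4]
   LIMIT 5;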
@@ -140,17 +140,83 @@ Vector Index :label-note:`ScyllaDB Cloud`
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/>`_.

ScyllaDB supports creating vector indexes on tables, allowing queries on the table to use those indexes for efficient
similarity search on vector data.
similarity search on vector data. Vector indexes can be a global index for indexing vectors per table or a local
index for indexing vectors per partition.

The vector index is the only custom type index supported in ScyllaDB. It is created using
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. Example:
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. It is also possible to
add additional columns to the index for filtering the search results. The partition column
specified in the global vector index definition must be the vector column, and any subsequent
columns are treated as filtering columns. The local vector index requires that the partition key
of the base table is also the partition key of the index and that the vector column is the first
of the remaining columns.

Example of a simple index:

.. code-block:: cql

CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};

The vector column (``embedding``) is indexed to enable similarity search using
a global vector index. Additional filtering can be performed on the primary key
columns of the base table.

Example of a global vector index with additional filtering:

.. code-block:: cql

CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding, category, info)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};

The vector column (``embedding``) is indexed to enable similarity search using
a global index. Additional columns are added for filtering the search results.
The filtering is possible on ``category``, ``info`` and all primary key columns
of the base table.

Example of a local vector index:

.. code-block:: cql

CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings ((id, created_at), embedding, category, info)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};

The vector column (``embedding``) is indexed for similarity search (a local
index) and additional columns are added for filtering the search results. The
filtering is possible on ``category``, ``info`` and all primary key columns of
the base table. The columns ``id`` and ``created_at`` must be the partition key
of the base table.
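Since a local vector index is scoped to a partition, a query against it would additionally pin the base table's partition key. A hedged sketch using the columns above (the key values and their types are made up for illustration):

.. code-block:: cql

   SELECT * FROM ImageEmbeddings
   WHERE id = 42 AND created_at = '2025-01-01'  -- partition key of the base table
     AND category = 'landscape'                 -- filtering column from the index
   ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4]
   LIMIT 5;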
Vector indexes support additional filtering columns of native data types
(excluding counter and duration). The indexed column itself must be a vector
column, while the extra columns can be used to filter search results.

The supported types are:

* ``ascii``
* ``bigint``
* ``blob``
* ``boolean``
* ``date``
* ``decimal``
* ``double``
* ``float``
* ``inet``
* ``int``
* ``smallint``
* ``text``
* ``varchar``
* ``time``
* ``timestamp``
* ``timeuuid``
* ``tinyint``
* ``uuid``
* ``varint``

The following options are supported for vector indexes. All of them are optional.

+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
23 docs/features/automatic-repair.rst Normal file
@@ -0,0 +1,23 @@
.. _automatic-repair:

Automatic Repair
================

Traditionally, launching :doc:`repairs </operating-scylla/procedures/maintenance/repair>` in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.

Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the :doc:`tablet table </architecture/tablets>` automatically.
Repairs are spread over time and among nodes and shards, to avoid load spikes or any adverse effects on user workloads.

To enable automatic repair, add this to the configuration (``scylla.yaml``):

.. code-block:: yaml

auto_repair_enabled_default: true
auto_repair_threshold_default_in_seconds: 86400

This will enable automatic repair for all tables with a repair period of 1 day. This configuration has to be set on each node, to an identical value.
More featureful configuration methods will be implemented in the future.

To disable, set ``auto_repair_enabled_default: false``.

Automatic repair relies on :doc:`Incremental Repair </features/incremental-repair>` and as such it only works with :doc:`tablet </architecture/tablets>` tables.
@@ -3,7 +3,7 @@

Incremental Repair
==================

ScyllaDB's standard repair process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
ScyllaDB's standard :doc:`repair </operating-scylla/procedures/maintenance/repair>` process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.

The core idea of incremental repair is to repair only the data that has been written or changed since the last repair was run. It intelligently skips data that has already been verified, dramatically reducing the time, I/O, and CPU resources required for the repair operation.
@@ -37,7 +37,12 @@ The available modes are:
* ``disabled``: Completely disables the incremental repair logic for the current operation. The repair behaves like a classic, non-incremental repair, and it does not read or update any incremental repair status markers.

The incremental_mode parameter can be specified using nodetool cluster repair, e.g., nodetool cluster repair --incremental-mode incremental. It can also be specified with the REST API, e.g., curl -X POST "http://127.0.0.1:10000/storage_service/tablets/repair?ks=ks1&table=tb1&tokens=all&incremental_mode=incremental"
The incremental_mode parameter can be specified using nodetool cluster repair, e.g., nodetool cluster repair --incremental-mode incremental.
It can also be specified with the REST API, e.g.:

.. code::

curl -X POST "http://127.0.0.1:10000/storage_service/tablets/repair?ks=ks1&table=tb1&tokens=all&incremental_mode=incremental"
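For easier copy-pasting, the nodetool form quoted in the paragraph above, as its own block (no flags beyond the one named there are assumed):

.. code::

   nodetool cluster repair --incremental-mode incremental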
Benefits of Incremental Repair
------------------------------
@@ -46,6 +51,8 @@ Benefits of Incremental Repair
* **Reduced Resource Usage:** Consumes significantly less CPU, I/O, and network bandwidth compared to a full repair.
* **More Frequent Repairs:** The efficiency of incremental repair allows you to run it more frequently, ensuring a higher level of data consistency across your cluster at all times.

Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with :doc:`Automatic Repair </features/automatic-repair>`.

Notes
-----
@@ -17,6 +17,7 @@ This document highlights ScyllaDB's key data modeling features.
Workload Prioritization </features/workload-prioritization>
Backup and Restore </features/backup-and-restore>
Incremental Repair </features/incremental-repair/>
Automatic Repair </features/automatic-repair/>
Vector Search </features/vector-search/>

.. panel-box::
@@ -44,5 +45,7 @@ This document highlights ScyllaDB's key data modeling features.
* :doc:`Incremental Repair </features/incremental-repair/>` provides a much more
efficient and lightweight approach to maintaining data consistency by
repairing only the data that has changed since the last repair.
* :doc:`Automatic Repair </features/automatic-repair/>` schedules and runs repairs
directly in ScyllaDB, without external schedulers.
* :doc:`Vector Search in ScyllaDB </features/vector-search/>` enables
similarity-based queries on vector embeddings.
@@ -10,7 +10,6 @@ Install ScyllaDB |CURRENT_VERSION|
/getting-started/install-scylla/launch-on-azure
/getting-started/installation-common/scylla-web-installer
/getting-started/install-scylla/install-on-linux
/getting-started/installation-common/install-jmx
/getting-started/install-scylla/run-in-docker
/getting-started/installation-common/unified-installer
/getting-started/installation-common/air-gapped-install
@@ -36,7 +35,6 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa

* :doc:`Install ScyllaDB with Web Installer (recommended) </getting-started/installation-common/scylla-web-installer>`
* :doc:`Install ScyllaDB Linux Packages </getting-started/install-scylla/install-on-linux>`
* :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`
* :doc:`Install ScyllaDB Without root Privileges </getting-started/installation-common/unified-installer>`
* :doc:`Air-gapped Server Installation </getting-started/installation-common/air-gapped-install>`
* :doc:`ScyllaDB Developer Mode </getting-started/installation-common/dev-mod>`
@@ -4,9 +4,9 @@
.. |RHEL_EPEL_8| replace:: https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
.. |RHEL_EPEL_9| replace:: https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm

======================================
Install ScyllaDB Linux Packages
======================================
========================================================
Install ScyllaDB |CURRENT_VERSION| Linux Packages
========================================================

We recommend installing ScyllaDB using :doc:`ScyllaDB Web Installer for Linux </getting-started/installation-common/scylla-web-installer/>`,
a platform-agnostic installation script, to install ScyllaDB on any supported Linux platform.
@@ -46,8 +46,8 @@ Install ScyllaDB

.. code-block:: console

sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys a43e06657bac99e3
sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --export --armor a43e06657bac99e3 | gpg --dearmor > /etc/apt/keyrings/scylladb.gpg
sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys c503c686b007f39e
sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --export --armor c503c686b007f39e | gpg --dearmor > /etc/apt/keyrings/scylladb.gpg

.. code-block:: console
:substitutions:
@@ -94,16 +94,6 @@ Install ScyllaDB

apt-get install scylla{,-server,-kernel-conf,-node-exporter,-conf,-python3,-cqlsh}=2025.3.1-0.20250907.2bbf3cf669bb-1

#. (Ubuntu only) Set Java 11.

.. code-block:: console

sudo apt-get update
sudo apt-get install -y openjdk-11-jre-headless
sudo update-java-alternatives --jre-headless -s java-1.11.0-openjdk-amd64

.. group-tab:: Centos/RHEL

#. Install the EPEL repository.
@@ -157,14 +147,6 @@ Install ScyllaDB

sudo yum install scylla-5.2.3

(Optional) Install scylla-jmx
-------------------------------

scylla-jmx is an optional package and is not installed by default.
If you need JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.

.. include:: /getting-started/_common/setup-after-install.rst

Next Steps
@@ -1,6 +1,6 @@
==========================
Launch ScyllaDB on AWS
==========================
===============================================
Launch ScyllaDB |CURRENT_VERSION| on AWS
===============================================

This article will guide you through self-managed ScyllaDB deployment on AWS. For a fully-managed deployment of ScyllaDB
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.
@@ -1,6 +1,6 @@
==========================
Launch ScyllaDB on Azure
==========================
===============================================
Launch ScyllaDB |CURRENT_VERSION| on Azure
===============================================

This article will guide you through self-managed ScyllaDB deployment on Azure. For a fully-managed deployment of ScyllaDB
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.
@@ -1,6 +1,6 @@
==========================
Launch ScyllaDB on GCP
==========================
=============================================
Launch ScyllaDB |CURRENT_VERSION| on GCP
=============================================

This article will guide you through self-managed ScyllaDB deployment on GCP. For a fully-managed deployment of ScyllaDB
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.
@@ -1,78 +0,0 @@

======================================
Install scylla-jmx Package
======================================

scylla-jmx is an optional package and is not installed by default.
If you need JMX server, you can still install it from scylla-jmx GitHub page.

.. tabs::

.. group-tab:: Debian/Ubuntu
#. Download .deb package from scylla-jmx page.

Access to https://github.com/scylladb/scylla-jmx, select latest
release from "releases", download a file end with ".deb".

#. (Optional) Transfer the downloaded package to the install node.

If the pc from which you downloaded the package is different from
the node where you install scylladb, you will need to transfer
the files to the node.

#. Install scylla-jmx package.

.. code-block:: console

sudo apt install -y ./scylla-jmx_<version>_all.deb

.. group-tab:: Centos/RHEL

#. Download .rpm package from scylla-jmx page.

Access to https://github.com/scylladb/scylla-jmx, select latest
release from "releases", download a file end with ".rpm".

#. (Optional) Transfer the downloaded package to the install node.

If the pc from which you downloaded the package is different from
the node where you install scylladb, you will need to transfer
the files to the node.

#. Install scylla-jmx package.

.. code-block:: console

sudo yum install -y ./scylla-jmx-<version>.noarch.rpm

.. group-tab:: Install without root privileges

#. Download .tar.gz package from scylla-jmx page.

Access to https://github.com/scylladb/scylla-jmx, select latest
release from "releases", download a file end with ".tar.gz".

#. (Optional) Transfer the downloaded package to the install node.

If the pc from which you downloaded the package is different from
the node where you install scylladb, you will need to transfer
the files to the node.

#. Install scylla-jmx package.

.. code:: console

tar xpf scylla-jmx-<version>.noarch.tar.gz
cd scylla-jmx
./install.sh --nonroot

Next Steps
-----------

* :doc:`Configure ScyllaDB </getting-started/system-configuration>`
* Manage your clusters with `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_
* Monitor your cluster and data with `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_
* Get familiar with ScyllaDB’s :doc:`command line reference guide </operating-scylla/nodetool>`.
* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
@@ -14,44 +14,35 @@ Prerequisites
Ensure your platform is supported by the ScyllaDB version you want to install.
See :doc:`OS Support </getting-started/os-support>` for information about supported Linux distributions and versions.

Note that if you're on CentOS 7, only root offline installation is supported.

Download and Install
-----------------------

#. Download the latest tar.gz file for ScyllaDB version (x86 or ARM) from ``https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-<version>/``.

Example for version 6.1: https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-6.1/
**Example** for version 2025.1:

- Go to https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-2025.1/
- Download the ``scylla-unified`` file for the patch version you want to
install. For example, to install 2025.1.9 (x86), download
``scylla-unified-2025.1.9-0.20251010.6c539463bbda.x86_64.tar.gz``.

#. Uncompress the downloaded package.

The following example shows the package for ScyllaDB 6.1.1 (x86):
**Example** for version 2025.1.9 (x86) (downloaded in the previous step):

.. code:: console
.. code::

tar xvfz scylla-unified-6.1.1-0.20240814.8d90b817660a.x86_64.tar.gz
tar xvfz scylla-unified-2025.1.9-0.20251010.6c539463bbda.x86_64.tar.gz

#. Install OpenJDK 8 or 11.

The following example shows Java installation on a CentOS-like system:

.. code:: console

sudo yum install -y java-11-openjdk-headless

For root offline installation on Debian-like systems, two additional packages, ``xfsprogs``
and ``mdadm``, should be installed to be used in RAID setup.
#. (Root offline installation only) For root offline installation on Debian-like
systems, two additional packages, ``xfsprogs`` and ``mdadm``, should be
installed to be used in RAID setup.

#. Install ScyllaDB as a user with non-root privileges:

.. code:: console

./install.sh --nonroot --python3 ~/scylladb/python3/bin/python3

#. (Optional) Install scylla-jmx

scylla-jmx is an optional package and is not installed by default.
If you need JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.
./install.sh --nonroot

Configure and Run ScyllaDB
----------------------------
@@ -81,19 +72,14 @@ Run nodetool:

.. code:: console

~/scylladb/share/cassandra/bin/nodetool status
~/scylladb/bin/nodetool status

Run cqlsh:

.. code:: console

~/scylladb/share/cassandra/bin/cqlsh
~/scylladb/bin/cqlsh

Run cassandra-stress:

.. code:: console

~/scylladb/share/cassandra/bin/cassandra-stress write

.. note::

@@ -124,7 +110,7 @@ Nonroot install

./install.sh --upgrade --nonroot

.. note:: The installation script does not upgrade scylla-jmx and scylla-tools. You will have to upgrade them separately.
.. note:: The installation script does not upgrade scylla-tools. You will have to upgrade it separately.

Uninstall
===========
@@ -154,4 +140,4 @@ Next Steps
* Manage your clusters with `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_
* Monitor your cluster and data with `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_
* Get familiar with ScyllaDB’s :doc:`command line reference guide </operating-scylla/nodetool>`.
* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
@@ -25,4 +25,8 @@ For Example:

nodetool rebuild <source-dc-name>

The ``nodetool rebuild`` command works only for vnode keyspaces. For tablet keyspaces, use ``nodetool cluster repair`` instead.
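In its simplest form (no extra flags assumed beyond what this page names), the tablet-keyspace equivalent is:

.. code::

   nodetool cluster repair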
See :doc:`Data Distribution with Tablets </architecture/tablets/>`.

.. include:: nodetool-index.rst
@@ -155,7 +155,6 @@ Add New DC
UN 54.235.9.159 109.75 KB 256 ? 39798227-9f6f-4868-8193-08570856c09a RACK1
UN 54.146.228.25 128.33 KB 256 ? 7a4957a1-9590-4434-9746-9c8a6f796a0c RACK1

.. TODO possibly provide additional information WRT how ALTER works with tablets

#. When all nodes are up and running, ``ALTER`` the following keyspaces on the new nodes:
@@ -171,26 +170,68 @@ Add New DC

DESCRIBE KEYSPACE mykeyspace;

CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3};
CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3};

ALTER Command

.. code-block:: cql

ALTER KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
ALTER KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
ALTER KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
ALTER KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
ALTER KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
ALTER KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};

After

.. code-block:: cql

DESCRIBE KEYSPACE mykeyspace;
CREATE KEYSPACE mykeyspace WITH REPLICATION = {'class’: 'NetworkTopologyStrategy', <exiting_dc>:3, <new_dc>: 3};
CREATE KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
CREATE KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
CREATE KEYSPACE mykeyspace WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
CREATE KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
CREATE KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};

#. Run ``nodetool rebuild`` on each node in the new datacenter, specify the existing datacenter name in the rebuild command.
For tablet keyspaces, update the replication factor one by one:

.. code-block:: cql

DESCRIBE KEYSPACE mykeyspace2;

CREATE KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3} AND tablets = { 'enabled': true };

.. code-block:: cql

ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 1} AND tablets = { 'enabled': true };
ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 2} AND tablets = { 'enabled': true };
ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3} AND tablets = { 'enabled': true };

.. note::
If the ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use a rack-list replication factor, so that a new DC (rack) can be added. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to add a datacenter:

Before

.. code-block:: cql

DESCRIBE KEYSPACE mykeyspace3;

CREATE KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>']} AND tablets = { 'enabled': true };

Add all the nodes to the new datacenter and then alter the keyspace one by one:

.. code-block:: cql

ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>']} AND tablets = { 'enabled': true };
ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>']} AND tablets = { 'enabled': true };
ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };

After

.. code-block:: cql

DESCRIBE KEYSPACE mykeyspace3;
CREATE KEYSPACE mykeyspace3 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };

Consider :ref:`upgrading the rf_rack_valid_keyspaces option to the enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.

#. If any vnode keyspace was altered, run ``nodetool rebuild`` on each node in the new datacenter, specifying the existing datacenter name in the rebuild command.

For example:
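The example body falls outside this hunk. Judging from the ``nodetool rebuild <source-dc-name>`` usage shown earlier on this page, it presumably reads:

.. code::

   nodetool rebuild <existing_dc>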
@@ -198,7 +239,7 @@ Add New DC

The rebuild ensures that the new nodes that were just added to the cluster will recognize the existing datacenters in the cluster.

#. Run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_
#. If any vnode keyspace was altered, run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_

#. If you are using ScyllaDB Monitoring, update the `monitoring stack <https://monitoring.docs.scylladb.com/stable/install/monitoring_stack.html#configure-scylla-nodes-from-files>`_ to monitor it. If you are using ScyllaDB Manager, make sure you install the `Manager Agent <https://manager.docs.scylladb.com/stable/install-scylla-manager-agent.html>`_ and Manager can access the new DC.
@@ -40,12 +40,14 @@ Prerequisites
Procedure
---------

#. Run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.
#. If there are vnode keyspaces in this DC, run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.

For example:

If the ASIA-DC cluster is to be removed, then run the ``nodetool repair -pr`` command on all the nodes in the ASIA-DC

#. If there are tablet keyspaces in this DC, run ``nodetool cluster repair`` on an arbitrary node. The reason for running repair is to ensure that any updates stored only on the about-to-be-decommissioned replicas are propagated to the other replicas, before the replicas on the decommissioned datacenter are dropped.

#. ALTER every cluster KEYSPACE, so that the keyspaces will no longer replicate data to the decommissioned data-center.

For example:
@@ -73,6 +75,33 @@ Procedure

cqlsh> ALTER KEYSPACE nba WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3};

For tablet keyspaces, update the replication factor one by one:

.. code-block:: shell

cqlsh> DESCRIBE nba2
cqlsh> CREATE KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 2, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };

.. code-block:: shell

cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 1, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };
cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };

.. note::
If the ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use a rack-list replication factor, so that the DC can be removed. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to remove a datacenter:

.. code-block:: shell

cqlsh> DESCRIBE nba3
cqlsh> CREATE KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4', 'RAC5'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };

.. code-block:: shell

cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };

Consider :ref:`upgrading the rf_rack_valid_keyspaces option to the enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.

#. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.
@@ -58,4 +58,12 @@ See also

* `Blog: ScyllaDB Open Source 3.1: Efficiently Maintaining Consistency with Row-Level Repair <https://www.scylladb.com/2019/08/13/scylla-open-source-3-1-efficiently-maintaining-consistency-with-row-level-repair/>`_

Incremental Repair
------------------

Built on top of :ref:`Row-level Repair <row-level-repair>` and :doc:`Tablets </architecture/tablets>`, Incremental Repair enables frequent and quick repairs. For more details, see :doc:`Incremental Repair </features/incremental-repair>`.

Automatic Repair
----------------

Built on top of :doc:`Incremental Repair </features/incremental-repair>`, :doc:`Automatic Repair </features/automatic-repair>` offers repair scheduling and execution directly in ScyllaDB, without external processes.
@@ -8,7 +8,6 @@ Troubleshooting ScyllaDB

support/index
startup/index
upgrade/index
cluster/index
modeling/index
storage/index
@@ -29,7 +28,6 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa
* :doc:`Errors and ScyllaDB Customer Support <support/index>`
* :doc:`ScyllaDB Startup <startup/index>`
* :doc:`ScyllaDB Cluster and Node <cluster/index>`
* :doc:`ScyllaDB Upgrade <upgrade/index>`
* :doc:`Data Modeling <modeling/index>`
* :doc:`Data Storage and SSTables <storage/index>`
* :doc:`CQL errors <CQL/index>`
@@ -1,79 +0,0 @@
Inaccessible "/var/lib/scylla" and "/var/lib/systemd/coredump" after ScyllaDB upgrade
======================================================================================

Problem
^^^^^^^
When you reboot the machine after a ScyllaDB upgrade, you cannot access data directories under ``/var/lib/scylla``, and
coredump saves to ``rootfs``.

The problem may occur when you upgrade ScyllaDB Open Source 4.6 or later to a version of ScyllaDB Enterprise if
the ``/etc/systemd/system/var-lib-scylla.mount`` and ``/etc/systemd/system/var-lib-systemd-coredump.mount`` are
deleted by RPM.

To avoid losing the files, the upgrade procedure includes a step to back up the .mount files. The following
example shows the command to back up the files before the upgrade from version 5.0:

.. code-block:: console

for conf in $( rpm -qc $(rpm -qa | grep scylla) | grep -v contains ) /etc/systemd/system/{var-lib-scylla,var-lib-systemd-coredump}.mount; do sudo cp -v $conf $conf.backup-5.0; done

If you don't back up the .mount files before the upgrade, the files may be lost.

Solution
^^^^^^^^

If you didn't back up the .mount files before the upgrade and the files were deleted during the upgrade,
you need to restore them manually.

To restore ``/etc/systemd/system/var-lib-systemd-coredump.mount``, run the following:

.. code-block:: console

$ cat << EOS | sudo tee /etc/systemd/system/var-lib-systemd-coredump.mount
[Unit]
Description=Save coredump to scylla data directory
Conflicts=umount.target
Before=scylla-server.service
After=local-fs.target
DefaultDependencies=no
[Mount]
What=/var/lib/scylla/coredump
Where=/var/lib/systemd/coredump
Type=none
Options=bind
[Install]
WantedBy=multi-user.target
EOS

To restore ``/etc/systemd/system/var-lib-scylla.mount``, run the following (specifying your data disk):

.. code-block:: console

$ UUID=`blkid -s UUID -o value <specify your data disk, eg: /dev/md0>`
$ cat << EOS | sudo tee /etc/systemd/system/var-lib-scylla.mount
[Unit]
Description=ScyllaDB data directory
Before=scylla-server.service
After=local-fs.target
DefaultDependencies=no
[Mount]
What=/dev/disk/by-uuid/$UUID
Where=/var/lib/scylla
Type=xfs
Options=noatime
[Install]
WantedBy=multi-user.target
EOS

After restoring the .mount files, you need to enable them:

.. code-block:: console

$ sudo systemctl daemon-reload
$ sudo systemctl enable --now var-lib-scylla.mount
$ sudo systemctl enable --now var-lib-systemd-coredump.mount

.. include:: /troubleshooting/_common/ts-return.rst
@@ -1,16 +0,0 @@
Upgrade
=================

.. toctree::
:hidden:
:maxdepth: 2

Inaccessible configuration files after ScyllaDB upgrade </troubleshooting/missing-dotmount-files>

.. panel-box::
:title: Upgrade Issues
:id: "getting-started"
:class: my-panel

* :doc:`Inaccessible "/var/lib/scylla" and "/var/lib/systemd/coredump" after ScyllaDB upgrade </troubleshooting/missing-dotmount-files>`
@@ -11,9 +11,13 @@ ScyllaDB. This means that:

* You should follow the upgrade policy:

* Starting with version **2025.4**, upgrades can skip minor versions as long
as they remain within the same major version (for example, upgrading directly
from 2025.1 → 2025.4 is supported).
* Starting with version **2025.4**, upgrades can **skip minor versions** if:

* They remain within the same major version (for example, upgrading
directly from *2025.1 → 2025.4* is supported).
* You upgrade to the next major version (for example, upgrading
directly from *2025.3 → 2026.1* is supported).

* For versions **prior to 2025.4**, upgrades must be performed consecutively—
each successive X.Y version must be installed in order, **without skipping
any major or minor version** (for example, upgrading directly from 2025.1 → 2025.3
@@ -4,8 +4,7 @@ Upgrade ScyllaDB

.. toctree::

ScyllaDB 2025.x to ScyllaDB 2025.4 <upgrade-guide-from-2025.x-to-2025.4/index>
ScyllaDB 2025.4 Patch Upgrades <upgrade-guide-from-2025.4.x-to-2025.4.y>
ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
ScyllaDB Image <ami-upgrade>
@@ -1,266 +0,0 @@
.. |SCYLLA_NAME| replace:: ScyllaDB

.. |SRC_VERSION| replace:: 2025.4.x
.. |NEW_VERSION| replace:: 2025.4.y

==========================================================================
Upgrade - |SCYLLA_NAME| |SRC_VERSION| to |NEW_VERSION| (Patch Upgrades)
==========================================================================

This document describes a step-by-step procedure for upgrading from
|SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION| (where "y" is
the latest available version), and rolling back to version |SRC_VERSION|
if necessary.

This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL),
CentOS, Debian, and Ubuntu.
See :doc:`OS Support by Platform and Version </getting-started/os-support>`
for information about supported versions.

It also applies to the ScyllaDB official image on EC2, GCP, or Azure.
Upgrade Procedure
=================

.. note::
Apply the following procedure **serially** on each node. Do not move to the next
node before validating that the node is up and running the new version.

A ScyllaDB upgrade is a rolling procedure that does **not** require a full cluster
shutdown. For each of the nodes in the cluster, you will:

#. Drain the node and back up the data.
#. Back up the configuration file.
#. Stop ScyllaDB.
#. Download and install new ScyllaDB packages.
#. Start ScyllaDB.
#. Validate that the upgrade was successful.

**Before** upgrading, check which version you are running now using
``scylla --version``. Note the current version in case you want to roll back
the upgrade.

**During** the rolling upgrade it is highly recommended:

* Not to use new |NEW_VERSION| features.
* Not to run administration functions, like repairs, refresh, rebuild or add
or remove nodes. See
`sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending
ScyllaDB Manager's scheduled or running repairs.
* Not to apply schema changes.
Upgrade Steps
=============

Back up the data
------------------------------

Back up all the data to an external device. We recommend using
`ScyllaDB Manager <https://manager.docs.scylladb.com/stable/backup/index.html>`_
to create backups.

Alternatively, you can use the ``nodetool snapshot`` command.
For **each** node in the cluster, run the following:

.. code:: sh

nodetool drain
nodetool snapshot

Take note of the directory name that nodetool gives you, and copy all
the directories with this name under ``/var/lib/scylla`` to a backup device.

When the upgrade is completed on all nodes, remove the snapshot with the
``nodetool clearsnapshot -t <snapshot>`` command to prevent running out of
space.

Back up the configuration file
------------------------------

Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
in case you need to roll back the upgrade.

.. tabs::

.. group-tab:: Debian/Ubuntu

.. code:: sh

sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup

.. group-tab:: RHEL/CentOS

.. code:: sh

sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup
Gracefully stop the node
------------------------

.. code:: sh

sudo service scylla-server stop
Download and install the new release
------------------------------------

You don’t need to update the ScyllaDB DEB or RPM repo when you upgrade to
a patch release.

.. tabs::

.. group-tab:: Debian/Ubuntu

To install a patch version on Debian or Ubuntu, run:

.. code:: sh

sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla

Answer ‘y’ to the first two questions.

.. group-tab:: RHEL/CentOS

To install a patch version on RHEL or CentOS, run:

.. code:: sh

sudo yum clean all
sudo yum update scylla\* -y

.. group-tab:: EC2/GCP/Azure Ubuntu Image

If you're using the ScyllaDB official image (recommended), see
the **Debian/Ubuntu** tab for upgrade instructions.

If you're using your own image and have installed ScyllaDB packages for
Ubuntu or Debian, you need to apply an extended upgrade procedure:

#. Install the new ScyllaDB version with the additional
``scylla-machine-image`` package:

.. code-block:: console

sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla
sudo apt-get dist-upgrade scylla-machine-image
#. Run ``scylla_setup`` without running ``io_setup``.
#. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.
Start the node
--------------

.. code:: sh

sudo service scylla-server start
Validate
--------
#. Check cluster status with ``nodetool status`` and make sure **all** nodes,
including the one you just upgraded, are in UN status.
#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"``
to check the ScyllaDB version.
#. Use ``journalctl _COMM=scylla`` to check there are no new errors in the log.
#. Check again after 2 minutes to validate that no new issues are introduced.

Once you are sure the node upgrade is successful, move to the next node in
the cluster.
Rollback Procedure
==================

The following procedure describes a rollback from ScyllaDB release
|NEW_VERSION| to |SRC_VERSION|. Apply this procedure if an upgrade from
|SRC_VERSION| to |NEW_VERSION| failed before completing on all nodes.

* Use this procedure only on nodes you upgraded to |NEW_VERSION|.
* Execute the following commands one node at a time, moving to the next node only
after the rollback procedure is completed successfully.

ScyllaDB rollback is a rolling procedure that does **not** require a full
cluster shutdown. For each of the nodes to roll back to |SRC_VERSION|, you will:

#. Drain the node and stop ScyllaDB.
#. Downgrade to the previous release.
#. Restore the configuration file.
#. Restart ScyllaDB.
#. Validate the rollback success.

Rollback Steps
==============
Gracefully shutdown ScyllaDB
-----------------------------

.. code:: sh

nodetool drain
sudo service scylla-server stop
Downgrade to the previous release
----------------------------------

.. tabs::

.. group-tab:: Debian/Ubuntu

To downgrade to |SRC_VERSION| on Debian or Ubuntu, run:

.. code-block:: console
:substitutions:

sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*

Answer ‘y’ to the first two questions.

.. group-tab:: RHEL/CentOS

To downgrade to |SRC_VERSION| on RHEL or CentOS, run:

.. code-block:: console
:substitutions:

sudo yum downgrade scylla\*-|SRC_VERSION|-\* -y

.. group-tab:: EC2/GCP/Azure Ubuntu Image

If you’re using the ScyllaDB official image (recommended), see
the **Debian/Ubuntu** tab for upgrade instructions.

If you’re using your own image and have installed ScyllaDB packages for
Ubuntu or Debian, you need to additionally downgrade
the ``scylla-machine-image`` package.

.. code-block:: console
:substitutions:

sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
sudo apt-get install scylla-machine-image=|SRC_VERSION|\*

Answer ‘y’ to the first two questions.
Restore the configuration file
------------------------------

.. code:: sh

   sudo rm -rf /etc/scylla/scylla.yaml
   sudo cp -a /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml

Start the node
--------------

.. code:: sh

   sudo service scylla-server start

Validate
--------

Follow the same validation steps as in the upgrade procedure above. Once you are
sure the node rollback is successful, move to the next node in the cluster.

@@ -1,13 +0,0 @@
==========================================================
Upgrade - ScyllaDB 2025.x to ScyllaDB 2025.4
==========================================================

.. toctree::
   :maxdepth: 2
   :hidden:

   Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2025.4>
   Metrics Update <metric-update-2025.x-to-2025.4>

* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2025.4 <upgrade-guide-from-2025.x-to-2025.4>`
* :doc:`Metrics Update Between 2025.x and 2025.4 <metric-update-2025.x-to-2025.4>`
@@ -1,68 +0,0 @@
.. |SRC_VERSION| replace:: 2025.x
.. |NEW_VERSION| replace:: 2025.4
.. |PRECEDING_VERSION| replace:: 2025.3

================================================================
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
================================================================

.. toctree::
   :maxdepth: 2
   :hidden:

ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.

New Metrics in |NEW_VERSION|
--------------------------------------

The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.

.. list-table::
   :widths: 25 150
   :header-rows: 1

   * - Metric
     - Description
   * - scylla_database_total_view_updates_due_to_replica_count_mismatch
     - The total number of view updates for which there were more view replicas
       than base replicas, so an extra view update had to be generated because
       the additional view replica wouldn't get paired with any base replica.
       It should only increase during a Replication Factor (RF) change and
       should stop increasing shortly after the RF change completes.
   * - scylla_database_total_writes_rejected_due_to_out_of_space_prevention
     - Counts write operations that were rejected because writes to user
       tables were disabled.
   * - scylla_index_query_latencies
     - Index query latencies.
   * - scylla_reactor_aio_retries
     - The total number of IOCBs re-submitted via the thread pool.
   * - scylla_reactor_io_threaded_fallbacks
     - The total number of io-threaded-fallbacks operations.
   * - scylla_repair_inc_sst_read_bytes
     - The total number of bytes read from SSTables for incremental repair
       on this shard.
   * - scylla_repair_inc_sst_skipped_bytes
     - The total number of bytes skipped from SSTables for incremental repair
       on this shard.
   * - scylla_repair_tablet_time_ms
     - The time spent on tablet repair on this shard (in milliseconds).
   * - scylla_s3_downloads_blocked_on_memory
     - Counts the number of times S3 client downloads were delayed due to
       insufficient memory availability.
   * - scylla_s3_memory_usage
     - The total number of bytes consumed by the S3 client.
   * - scylla_s3_total_read_prefetch_bytes
     - The total number of bytes requested from object storage.
   * - scylla_storage_proxy_replica_fenced_out_requests
     - The number of requests that resulted in a stale_topology_exception.
   * - scylla_vector_store_dns_refreshes
     - The number of DNS refreshes.

New and Updated Metrics in Previous 2025.x Releases
-------------------------------------------------------

* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_

@@ -0,0 +1,13 @@
==========================================================
Upgrade - ScyllaDB 2025.x to ScyllaDB 2026.1
==========================================================

.. toctree::
   :maxdepth: 2
   :hidden:

   Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2026.1>
   Metrics Update <metric-update-2025.x-to-2026.1>

* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1>`
* :doc:`Metrics Update Between 2025.x and 2026.1 <metric-update-2025.x-to-2026.1>`
@@ -0,0 +1,82 @@
.. |SRC_VERSION| replace:: 2025.x
.. |NEW_VERSION| replace:: 2026.1
.. |PRECEDING_VERSION| replace:: 2025.4

================================================================
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
================================================================

.. toctree::
   :maxdepth: 2
   :hidden:

ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.

New Metrics in |NEW_VERSION|
--------------------------------------

The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.

.. list-table::
   :widths: 25 150
   :header-rows: 1

   * - Metric
     - Description
   * - scylla_alternator_operation_size_kb
     - Histogram of item sizes involved in a request.
   * - scylla_column_family_total_disk_space_before_compression
     - Hypothetical total disk space used if data files weren't compressed.
   * - scylla_group_name_auto_repair_enabled_nr
     - Number of tablets with auto repair enabled.
   * - scylla_group_name_auto_repair_needs_repair_nr
     - Number of tablets with auto repair enabled that currently need repair.
   * - scylla_lsa_compact_time_ms
     - Total time spent on segment compaction that was not accounted under ``reclaim_time_ms``.
   * - scylla_lsa_evict_time_ms
     - Total time spent on evicting objects that was not accounted under ``reclaim_time_ms``.
   * - scylla_lsa_reclaim_time_ms
     - Total time spent in reclaiming LSA memory back to the std allocator.
   * - scylla_object_storage_memory_usage
     - Total number of bytes consumed by the object storage client.
   * - scylla_tablet_ops_failed
     - Number of failed tablet auto repair attempts.
   * - scylla_tablet_ops_succeeded
     - Number of successful tablet auto repair attempts.

Renamed Metrics in |NEW_VERSION|
--------------------------------------

The following metrics are renamed in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.

.. list-table::
   :widths: 25 150
   :header-rows: 1

   * - Metric Name in |PRECEDING_VERSION|
     - Metric Name in |NEW_VERSION|
   * - scylla_s3_memory_usage
     - scylla_object_storage_memory_usage

Removed Metrics in |NEW_VERSION|
--------------------------------------

The following metrics are removed in ScyllaDB |NEW_VERSION|.

* scylla_redis_current_connections
* scylla_redis_op_latency
* scylla_redis_operation
* scylla_redis_requests_latency
* scylla_redis_requests_served
* scylla_redis_requests_serving

New and Updated Metrics in Previous Releases
-------------------------------------------------------

* `Metrics Update Between 2025.3 and 2025.4 <https://docs.scylladb.com/manual/branch-2025.4/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.html>`_
* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_

@@ -1,13 +1,13 @@
.. |SCYLLA_NAME| replace:: ScyllaDB

.. |SRC_VERSION| replace:: 2025.x
.. |NEW_VERSION| replace:: 2025.4
.. |NEW_VERSION| replace:: 2026.1

.. |ROLLBACK| replace:: rollback
.. _ROLLBACK: ./#rollback-procedure

.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2025.4
.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2025.4
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2026.1
.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2026.1

=======================================================================================
Upgrade from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|
@@ -17,10 +17,12 @@ This document describes a step-by-step procedure for upgrading from |SCYLLA_NAME
to |SCYLLA_NAME| |NEW_VERSION| and rollback to version |SRC_VERSION| if necessary.

This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL), CentOS, Debian,
and Ubuntu. See :doc:`OS Support by Platform and Version </getting-started/os-support>`
for information about supported versions.
and Ubuntu.
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
for information about supported versions. It also applies when using
the ScyllaDB official image on EC2, GCP, or Azure.

It also applies when using the ScyllaDB official image on EC2, GCP, or Azure.
See :doc:`About Upgrade </upgrade/about-upgrade/>` for the ScyllaDB upgrade policy.

Before You Upgrade ScyllaDB
==============================
@@ -149,8 +151,9 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
#. Update the ScyllaDB deb repo to |NEW_VERSION|.

   .. code-block:: console
      :substitutions:

      sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/scylla-2025.4.list
      sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|

#. Install the new ScyllaDB version:

@@ -167,8 +170,9 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
#. Update the ScyllaDB rpm repo to |NEW_VERSION|.

   .. code-block:: console
      :substitutions:

      sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/scylla-2025.4.repo
      sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|

#. Install the new ScyllaDB version:

@@ -198,11 +202,6 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
#. Run ``scylla_setup`` without running ``io_setup``.
#. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.


If you need the JMX server, see
:doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`
and get the new version.

Start the node
--------------

@@ -284,6 +284,7 @@ future<rjson::value> encryption::gcp_host::impl::gcp_auth_post_with_retry(std::s
        }
        [[fallthrough]];
    case httpclient::reply_status::request_timeout:
    case httpclient::reply_status::too_many_requests:
        if (retry < max_retries) {
            // service unavailable etc -> backoff + retry
            do_backoff = true;

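The hunk above adds HTTP 408 and 429 to the set of transient statuses that share the service-unavailable fall-through. A minimal standalone sketch of the same classification with a simple exponential backoff; the status values and the backoff policy here are illustrative assumptions, not the GCP client's actual code:

.. code-block:: cpp

   #include <chrono>
   #include <thread>

   // Stand-in for httpclient::reply_status; only the cases we classify.
   enum class reply_status {
       request_timeout = 408,
       too_many_requests = 429,
       service_unavailable = 503,
   };

   // Transient statuses share one backoff-and-retry path via fallthrough.
   bool should_retry(reply_status status, int retry, int max_retries) {
       switch (status) {
       case reply_status::service_unavailable:
           [[fallthrough]];
       case reply_status::request_timeout:
       case reply_status::too_many_requests:
           return retry < max_retries;   // transient -> backoff + retry
       default:
           return false;                 // anything else is terminal
       }
   }

   // Illustrative backoff: 100ms, 200ms, 400ms, ...
   void backoff(int retry) {
       std::this_thread::sleep_for(std::chrono::milliseconds(100) * (1 << retry));
   }
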
@@ -182,6 +182,7 @@ public:
    gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
    gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
    gms::feature topology_noop_request { *this, "TOPOLOGY_NOOP_REQUEST"sv };
    gms::feature batchlog_v2 { *this, "BATCHLOG_V2"sv };
public:

    const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;

@@ -17,11 +17,11 @@
#include "index/secondary_index.hh"
#include "index/secondary_index_manager.hh"
#include "types/concrete_types.hh"
#include "types/types.hh"
#include "utils/managed_string.hh"
#include <seastar/core/sstring.hh>
#include <boost/algorithm/string.hpp>


namespace secondary_index {

static void validate_positive_option(int max, const sstring& value_name, const sstring& value) {
@@ -147,17 +147,88 @@ std::optional<cql3::description> vector_index::describe(const index_metadata& im
}

void vector_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {
    if (targets.size() != 1) {
        throw exceptions::invalid_request_exception("Vector index can only be created on a single column");
    }
    auto target = targets[0];
    auto c_def = schema.get_column_definition(to_bytes(target->column_name()));
    if (!c_def) {
        throw exceptions::invalid_request_exception(format("Column {} not found in schema", target->column_name()));
    }
    auto type = c_def->type;
    if (!type->is_vector() || static_cast<const vector_type_impl*>(type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
        throw exceptions::invalid_request_exception(format("Vector indexes are only supported on columns of vectors of floats", target->column_name()));

    struct validate_visitor {
        const class schema& schema;
        bool& is_vector;

        /// Vector indexes support filtering on native types that can be used as primary key columns.
        /// There is no counter (it cannot be used with vector columns)
        /// and no duration (it cannot be used as a primary key or in secondary indexes).
        static bool is_supported_filtering_column(abstract_type const& kind_type) {
            switch (kind_type.get_kind()) {
            case abstract_type::kind::ascii:
            case abstract_type::kind::boolean:
            case abstract_type::kind::byte:
            case abstract_type::kind::bytes:
            case abstract_type::kind::date:
            case abstract_type::kind::decimal:
            case abstract_type::kind::double_kind:
            case abstract_type::kind::float_kind:
            case abstract_type::kind::inet:
            case abstract_type::kind::int32:
            case abstract_type::kind::long_kind:
            case abstract_type::kind::short_kind:
            case abstract_type::kind::simple_date:
            case abstract_type::kind::time:
            case abstract_type::kind::timestamp:
            case abstract_type::kind::timeuuid:
            case abstract_type::kind::utf8:
            case abstract_type::kind::uuid:
            case abstract_type::kind::varint:
                return true;
            default:
                break;
            }
            return false;
        }

        void validate(cql3::column_identifier const& column, bool is_vector) const {
            auto const& c_name = column.to_string();
            auto const* c_def = schema.get_column_definition(column.name());
            if (c_def == nullptr) {
                throw exceptions::invalid_request_exception(format("Column {} not found in schema", c_name));
            }

            auto type = c_def->type;

            if (is_vector) {
                auto const* vector_type = dynamic_cast<const vector_type_impl*>(type.get());
                if (vector_type == nullptr) {
                    throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
                }

                auto elements_type = vector_type->get_elements_type();
                if (elements_type->get_kind() != abstract_type::kind::float_kind) {
                    throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
                }
                return;
            }

            if (!is_supported_filtering_column(*type)) {
                throw exceptions::invalid_request_exception(format("Unsupported vector index filtering column {} type", c_name));
            }
        }

        void operator()(const std::vector<::shared_ptr<cql3::column_identifier>>& columns) const {
            for (const auto& column : columns) {
                // CQL restricts the secondary local index to have multiple columns with partition key only.
                // Vectors shouldn't be partition key columns and they aren't supported as a filtering column,
                // so we can assume here that these are non-vectors filtering columns.
                validate(*column, false);
            }
        }

        void operator()(const ::shared_ptr<cql3::column_identifier>& column) {
            validate(*column, is_vector);
            // The first column is the vector column, the rest mustn't be vectors.
            is_vector = false;
        }
    };

    bool is_vector = true;
    for (const auto& target : targets) {
        std::visit(validate_visitor{.schema = schema, .is_vector = is_vector}, target->value);
    }
}


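The new check_target() walks each index target with a visitor over the target-value variant, treating the first single-column target as the vector column and everything after it as a filtering column. A self-contained sketch of the same std::visit pattern, with simplified stand-in types (the column names and the "embedding" rule are hypothetical, not the real cql3 types or checks):

.. code-block:: cpp

   #include <stdexcept>
   #include <string>
   #include <variant>
   #include <vector>

   using column = std::string;
   // An index target is either a single column or a list of columns.
   using target_value = std::variant<column, std::vector<column>>;

   struct validate_visitor {
       bool& is_vector; // only the first single-column target may be the vector

       void validate(const column& c, bool expect_vector) const {
           if (expect_vector && c != "embedding") { // "embedding" is a made-up vector column
               throw std::invalid_argument("first target must be the vector column");
           }
       }
       void operator()(const std::vector<column>& columns) const {
           for (const auto& c : columns) {
               validate(c, false); // multi-column targets are never the vector column
           }
       }
       void operator()(const column& c) {
           validate(c, is_vector);
           is_vector = false; // subsequent targets are plain filtering columns
       }
   };

   void check_targets(const std::vector<target_value>& targets) {
       bool is_vector = true;
       for (const auto& t : targets) {
           std::visit(validate_visitor{is_vector}, t);
       }
   }
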
@@ -347,8 +347,8 @@ install -d -m755 "$retc"/scylla.d
scylla_yaml_dir=$(mktemp -d)
scylla_yaml=$scylla_yaml_dir/scylla.yaml
grep -v api_ui_dir conf/scylla.yaml | grep -v api_doc_dir > $scylla_yaml
echo "api_ui_dir: /opt/scylladb/swagger-ui/dist/" >> $scylla_yaml
echo "api_doc_dir: /opt/scylladb/api/api-doc/" >> $scylla_yaml
echo "api_ui_dir: $prefix/swagger-ui/dist/" >> $scylla_yaml
echo "api_doc_dir: $prefix/api/api-doc/" >> $scylla_yaml
installconfig 644 $scylla_yaml "$retc"/scylla
rm -rf $scylla_yaml_dir


@@ -612,12 +612,16 @@ tablet_replica tablet_map::get_primary_replica(tablet_id id, const locator::topo
    return maybe_get_primary_replica(id, replicas, topo, [&] (const auto& _) { return true; }).value();
}

tablet_replica tablet_map::get_secondary_replica(tablet_id id) const {
    if (get_tablet_info(id).replicas.size() < 2) {
tablet_replica tablet_map::get_secondary_replica(tablet_id id, const locator::topology& topo) const {
    const auto& orig_replicas = get_tablet_info(id).replicas;
    if (orig_replicas.size() < 2) {
        throw std::runtime_error(format("No secondary replica for tablet id {}", id));
    }
    const auto& replicas = get_tablet_info(id).replicas;
    return replicas.at((size_t(id)+1) % replicas.size());
    tablet_replica_set replicas = orig_replicas;
    std::ranges::sort(replicas, tablet_replica_comparator(topo));
    // This formula must match the one in get_primary_replica(),
    // just with + 1.
    return replicas.at((size_t(id) + size_t(id) / replicas.size() + 1) % replicas.size());
}

std::optional<tablet_replica> tablet_map::maybe_get_selected_replica(tablet_id id, const topology& topo, const tablet_task_info& tablet_task_info) const {

@@ -647,9 +647,10 @@ public:
    /// Returns the primary replica for the tablet
    tablet_replica get_primary_replica(tablet_id id, const locator::topology& topo) const;

    /// Returns the secondary replica for the tablet, which is assumed to be directly following the primary replica in the replicas vector
    /// Returns the secondary replica for the tablet: the replica that immediately follows the primary
    /// replica in the topology-sorted replica list.
    /// \throws std::runtime_error if the tablet has less than 2 replicas.
    tablet_replica get_secondary_replica(tablet_id id) const;
    tablet_replica get_secondary_replica(tablet_id id, const locator::topology& topo) const;

    // Returns the replica that matches hosts and dcs filters for tablet_task_info.
    std::optional<tablet_replica> maybe_get_selected_replica(tablet_id id, const topology& topo, const tablet_task_info& tablet_task_info) const;

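The selection arithmetic is easy to sanity-check in isolation. Per the comment in get_secondary_replica(), the secondary uses the primary's formula "just with + 1"; the primary formula below is inferred from that comment rather than quoted from the source, so treat it as an assumption:

.. code-block:: cpp

   #include <cstddef>
   #include <cstdio>

   // Assumed primary formula, reconstructed from the "+ 1" comment above.
   std::size_t primary_index(std::size_t id, std::size_t n) {
       return (id + id / n) % n;
   }

   // Formula from the diff: the primary's index plus one, modulo n.
   std::size_t secondary_index(std::size_t id, std::size_t n) {
       return (id + id / n + 1) % n;
   }

   int main() {
       const std::size_t n = 3; // size of the topology-sorted replica list
       for (std::size_t id = 0; id < 9; ++id) {
           std::printf("tablet %zu: primary=%zu secondary=%zu\n",
                       id, primary_index(id, n), secondary_index(id, n));
       }
       // The id/n term rotates the starting replica every n tablets, so
       // primaries don't always land on replicas[id % n] in the same order.
   }
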
main.cc
@@ -2417,7 +2417,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
    bm_cfg.delay = std::chrono::milliseconds(cfg->ring_delay_ms());
    bm_cfg.replay_cleanup_after_replays = cfg->batchlog_replay_cleanup_after_replays();

    bm.start(std::ref(qp), std::ref(sys_ks), bm_cfg).get();
    bm.start(std::ref(qp), std::ref(sys_ks), std::ref(feature_service), bm_cfg).get();
    auto stop_batchlog_manager = defer_verbose_shutdown("batchlog manager", [&bm] {
        bm.stop().get();
    });

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4710f1f0b0bb329721c21d133618e811e820f2e70553b0aca28fb278bff89c9
size 6492280
oid sha256:52c9772c9ac334650d8b179b591c47769ee38d34fad784b61c682e11c03f2506
size 6530196

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2433f7a1fc5cda0dd990ab59587eb6046dca0fe1ae48d599953d1936fe014ed9
size 6492176
oid sha256:d1a869ebfe4e90d9681499246eb86bb032ae402c350357e19d97b989037a5bd3
size 6528308

@@ -1211,6 +1211,7 @@ private:
        }

        co_await utils::get_local_injector().inject("incremental_repair_prepare_wait", utils::wait_for_message(60s));
        rlogger.debug("Disabling compaction for range={} for incremental repair", _range);
        auto reenablers_and_holders = co_await table.get_compaction_reenablers_and_lock_holders_for_repair(_db.local(), _frozen_topology_guard, _range);
        for (auto& lock_holder : reenablers_and_holders.lock_holders) {
            _rs._repair_compaction_locks[gid].push_back(std::move(lock_holder));
@@ -1240,6 +1241,8 @@ private:
        // compaction.
        reenablers_and_holders.cres.clear();
        rlogger.info("Re-enabled compaction for range={} for incremental repair", _range);

        co_await utils::get_local_injector().inject("wait_after_prepare_sstables_for_incremental_repair", utils::wait_for_message(5min));
    }

    // Read rows from sstable until the size of rows exceeds _max_row_buf_size - current_size
@@ -2633,7 +2636,7 @@ future<repair_flush_hints_batchlog_response> repair_service::repair_flush_hints_
            all_replayed = co_await _bm.local().do_batch_log_replay(db::batchlog_manager::post_replay_cleanup::no);
            utils::get_local_injector().set_parameter("repair_flush_hints_batchlog_handler", "issue_flush", fmt::to_string(flush_time));
        }
        rlogger.info("repair[{}]: Finished to flush batchlog for repair_flush_hints_batchlog_request from node={}, flushed={}", req.repair_uuid, from, issue_flush);
        rlogger.info("repair[{}]: Finished to flush batchlog for repair_flush_hints_batchlog_request from node={}, flushed={} all_replayed={}", req.repair_uuid, from, issue_flush, all_replayed);
    }
    );
    if (!all_replayed) {
@@ -3953,3 +3956,19 @@ future<std::optional<repair_task_progress>> repair_service::get_tablet_repair_ta
        task_uuid, tid, requested, finished, progress.progress(), finished_nomerge);
    co_return progress;
}

void repair_service::on_cleanup_for_drop_table(const table_id& id) {
    // Prevent repair lock from being leaked in repair_service when table is dropped midway.
    // The RPC verb that removes the lock on success path will not be called by coordinator after table was dropped.
    // We also cannot move the lock from repair_service to repair_meta, since the lock must outlive the latter.
    // Since tablet metadata has been erased at this point, we can simply erase all instances for the dropped table.
    rlogger.debug("Cleaning up state for dropped table {}", id);
    for (auto it = _repair_compaction_locks.begin(); it != _repair_compaction_locks.end();) {
        auto& [global_tid, _] = *it;
        if (global_tid.table == id) {
            it = _repair_compaction_locks.erase(it);
        } else {
            it++;
        }
    }
}

@@ -318,6 +318,8 @@ public:

    future<uint32_t> get_next_repair_meta_id();

    void on_cleanup_for_drop_table(const table_id& id);

    friend class repair::user_requested_repair_task_impl;
    friend class repair::data_sync_repair_task_impl;
    friend class repair::tablet_repair_task_impl;

@@ -448,6 +448,7 @@ public:
    virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
    virtual future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) = 0;
    virtual dht::token_range get_token_range_after_split(const dht::token&) const noexcept = 0;
    virtual future<> wait_for_background_tablet_resize_work() = 0;

    virtual lw_shared_ptr<sstables::sstable_set> make_sstable_set() const = 0;
};

@@ -1368,8 +1368,6 @@ public:
    future<compaction_reenablers_and_lock_holders> get_compaction_reenablers_and_lock_holders_for_repair(replica::database& db,
            const service::frozen_topology_guard& guard, dht::token_range range);
    future<uint64_t> estimated_partitions_in_range(dht::token_range tr) const;
private:
    future<std::vector<compaction::compaction_group_view*>> get_compaction_group_views_for_repair(dht::token_range range);
};

lw_shared_ptr<sstables::sstable_set> make_tablet_sstable_set(schema_ptr, const storage_group_manager& sgm, const locator::tablet_map&);

@@ -750,6 +750,7 @@ public:
        return make_ready_future<std::vector<sstables::shared_sstable>>(std::vector<sstables::shared_sstable>{sst});
    }
    dht::token_range get_token_range_after_split(const dht::token&) const noexcept override { return dht::token_range(); }
    future<> wait_for_background_tablet_resize_work() override { return make_ready_future<>(); }

    lw_shared_ptr<sstables::sstable_set> make_sstable_set() const override {
        return get_compaction_group().make_sstable_set();
@@ -768,6 +769,13 @@ class tablet_storage_group_manager final : public storage_group_manager {
    locator::resize_decision::seq_number_t _split_ready_seq_number = std::numeric_limits<locator::resize_decision::seq_number_t>::min();
    future<> _merge_completion_fiber;
    condition_variable _merge_completion_event;
    // Ensures that processes such as incremental repair will wait for pending work from
    // merge fiber before proceeding. This guarantees stability on the compaction groups.
    // NOTE: it's important that we don't await on the barrier with any compaction group
    // gate held, since merge fiber will stop groups that in turn await on gate,
    // potentially causing an ABBA deadlock.
    utils::phased_barrier _merge_fiber_barrier;
    std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
    // Holds compaction reenabler which disables compaction temporarily during tablet merge
    std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
private:
@@ -856,6 +864,7 @@ public:
        , _my_host_id(erm.get_token_metadata().get_my_id())
        , _tablet_map(&erm.get_token_metadata().tablets().get_tablet_map(schema()->id()))
        , _merge_completion_fiber(merge_completion_fiber())
        , _merge_fiber_barrier(format("[table {}.{}] merge_fiber_barrier", _t.schema()->ks_name(), _t.schema()->cf_name()))
    {
        storage_group_map ret;

@@ -908,6 +917,10 @@ public:
    dht::token_range get_token_range_after_split(const dht::token& token) const noexcept override {
        return tablet_map().get_token_range_after_split(token);
    }
    future<> wait_for_background_tablet_resize_work() override {
        co_await _merge_fiber_barrier.advance_and_await();
        co_return;
    }

    lw_shared_ptr<sstables::sstable_set> make_sstable_set() const override {
        // FIXME: avoid recreation of compound_set for groups which had no change. usually, only one group will be changed at a time.
@@ -2120,33 +2133,31 @@ compaction_group::update_repaired_at_for_merge() {
    });
}

future<std::vector<compaction::compaction_group_view*>> table::get_compaction_group_views_for_repair(dht::token_range range) {
    std::vector<compaction::compaction_group_view*> ret;
    auto sgs = storage_groups_for_token_range(range);
    for (auto& sg : sgs) {
        co_await coroutine::maybe_yield();
        sg->for_each_compaction_group([&ret] (const compaction_group_ptr& cg) {
            ret.push_back(&cg->view_for_unrepaired_data());
        });
    }
    co_return ret;
}

future<compaction_reenablers_and_lock_holders> table::get_compaction_reenablers_and_lock_holders_for_repair(replica::database& db,
        const service::frozen_topology_guard& guard, dht::token_range range) {
    auto ret = compaction_reenablers_and_lock_holders();
    auto views = co_await get_compaction_group_views_for_repair(range);
    for (auto view : views) {
        auto cre = co_await db.get_compaction_manager().await_and_disable_compaction(*view);
    // Waits for background tablet resize work like merge that might destroy compaction groups,
    // providing stability. Essentially, serializes tablet merge completion handling with
    // the start of incremental repair, from the replica side.
    co_await _sg_manager->wait_for_background_tablet_resize_work();

    for (auto sg : storage_groups_for_token_range(range)) {
        // FIXME: indentation
        auto cgs = sg->compaction_groups_immediate();
        for (auto& cg : cgs) {
        auto gate_holder = cg->async_gate().hold();
        auto& view = cg->view_for_unrepaired_data();
        auto cre = co_await db.get_compaction_manager().await_and_disable_compaction(view);
        tlogger.info("Disabled compaction for range={} session_id={} for incremental repair", range, guard);
        ret.cres.push_back(std::make_unique<compaction::compaction_reenabler>(std::move(cre)));

        // This lock prevents the unrepaired compaction started by major compaction to run in parallel with repair.
        // The unrepaired compaction started by minor compaction does not need to take the lock since it ignores
        // sstables being repaired, so it can run in parallel with repair.
        auto lock_holder = co_await db.get_compaction_manager().get_incremental_repair_write_lock(*view, "row_level_repair");
        auto lock_holder = co_await db.get_compaction_manager().get_incremental_repair_write_lock(view, "row_level_repair");
        tlogger.info("Got unrepaired compaction and repair lock for range={} session_id={} for incremental repair", range, guard);
        ret.lock_holders.push_back(std::move(lock_holder));
        }
    }
    co_return ret;
}
@@ -3018,7 +3029,7 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {

    while (!_t.async_gate().is_closed()) {
        try {
            co_await utils::get_local_injector().inject("merge_completion_fiber", utils::wait_for_message(60s));
            co_await utils::get_local_injector().inject("merge_completion_fiber", utils::wait_for_message(5min));
            auto ks_name = schema()->ks_name();
            auto cf_name = schema()->cf_name();
            // Enable compaction after merge is done.
@@ -3052,6 +3063,7 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
            utils::get_local_injector().inject("replica_merge_completion_wait", [] () {
                tlogger.info("Merge completion fiber finished, about to sleep");
            });
            _pending_merge_fiber_work.reset();
            co_await _merge_completion_event.wait();
            tlogger.debug("Merge completion fiber woke up for {}.{}", schema()->ks_name(), schema()->cf_name());
        }
@@ -3110,6 +3122,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
        new_storage_groups[new_tid] = std::move(new_sg);
    }
    _storage_groups = std::move(new_storage_groups);
    _pending_merge_fiber_work = _merge_fiber_barrier.start();
    _merge_completion_event.signal();
}

@@ -3126,6 +3139,9 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
    } else if (new_tablet_count < old_tablet_count) {
        tlogger.info0("Detected tablet merge for table {}.{}, decreasing from {} to {} tablets",
                schema()->ks_name(), schema()->cf_name(), old_tablet_count, new_tablet_count);
        if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
            utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
        }
        handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
    }


@@ -227,6 +227,8 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
    static const std::unordered_set<auth::resource> vector_search_system_resources = {
        auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
        auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
        auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_STREAMS),
        auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_TIMESTAMPS),
    };

    if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||

@@ -72,7 +72,7 @@ void group0_state_id_handler::refresh() {
    const auto min_state_id = std::ranges::min(group0_members_state_ids, [](auto a, auto b) {
        if (!a || !b) {
            // This should never happen, but if it does, it's a bug.
            on_fatal_internal_error(slogger, "unexpected empty state_id");
            on_internal_error(slogger, "unexpected empty state_id");
        }
        return utils::timeuuid_tri_compare(a, b) < 0;
    });

@@ -149,19 +149,31 @@ public:
        const auto& node = nodes_info.at(voter_id);

        if (node.is_alive) {
            SCYLLA_ASSERT(_alive_nodes_remaining > 0);
            if (_alive_nodes_remaining == 0) {
                on_internal_error(rvlogger,
                        format("rack_info: no alive nodes remaining, but node {} is alive", voter_id));
            }
            --_alive_nodes_remaining;
            if (node.is_leader) {
                SCYLLA_ASSERT(_owns_alive_leader);
                if (!_owns_alive_leader) {
                    on_internal_error(rvlogger,
                            format("rack_info: rack doesn't own a live leader, but leader {} is alive", voter_id));
                }
                _owns_alive_leader = false;
            }
        }
        if (node.is_voter) {
            if (node.is_alive) {
                SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
                if (_existing_alive_voters_remaining == 0) {
                    on_internal_error(rvlogger,
                            format("rack_info: no live voters remaining, but voter {} is alive", voter_id));
                }
                --_existing_alive_voters_remaining;
            } else {
                SCYLLA_ASSERT(_existing_dead_voters_remaining > 0);
                if (_existing_dead_voters_remaining == 0) {
                    on_internal_error(rvlogger,
                            format("rack_info: no dead voters remaining, but voter {} is dead", voter_id));
                }
                --_existing_dead_voters_remaining;
            }
        }
@@ -279,16 +291,25 @@ public:

        if (node.is_alive) {
            if (node.is_voter) {
                SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
                if (_existing_alive_voters_remaining == 0) {
                    on_internal_error(rvlogger,
                            format("datacenter_info: no live voters remaining, but voter {} is alive", *voter_id));
                }
                --_existing_alive_voters_remaining;
            }
            if (node.is_leader) {
                SCYLLA_ASSERT(_owns_alive_leader);
                if (!_owns_alive_leader) {
                    on_internal_error(rvlogger,
                            format("datacenter_info: DC doesn't own a live leader, but leader {} is alive", *voter_id));
                }
                _owns_alive_leader = false;
            }
        }

        SCYLLA_ASSERT(_nodes_remaining > 0);
        if (_nodes_remaining == 0) {
            on_internal_error(rvlogger,
                    format("datacenter_info: no nodes remaining, but voter {} belongs to this DC", *voter_id));
        }

        --_nodes_remaining;
        ++_assigned_voters_count;

@@ -123,12 +123,7 @@ utils::small_vector<locator::host_id, N> addr_vector_to_id(const gms::gossiper&
// Check the effective replication map consistency:
// we have an inconsistent effective replication map in case the number of
// read replicas is higher than the replication factor.
void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
    // Skip for non-debug builds.
    if constexpr (!tools::build_info::is_debug_build()) {
        return;
    }

[[maybe_unused]] void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
    const sstring error = erm.get_replication_strategy().sanity_check_read_replicas(erm, read_replicas);
    if (!error.empty()) {
        on_internal_error(slogger, error);
@@ -4291,7 +4286,7 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
public:
    context(storage_proxy & p, utils::chunked_vector<mutation>&& mutations, lw_shared_ptr<cdc::operation_result_tracker>&& cdc_tracker, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, coordinator_mutate_options options)
            : _p(p)
            , _schema(_p.local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2))
            , _schema(_p.local_db().find_schema(db::system_keyspace::NAME, _p.features().batchlog_v2 ? db::system_keyspace::BATCHLOG_V2 : db::system_keyspace::BATCHLOG))
            , _ermp(_p.local_db().find_column_family(_schema->id()).get_effective_replication_map())
            , _mutations(std::move(mutations))
            , _cdc_tracker(std::move(cdc_tracker))
@@ -6972,7 +6967,12 @@ host_id_vector_replica_set storage_proxy::get_endpoints_for_reading(const schema
        return host_id_vector_replica_set{my_host_id(erm)};
    }
    auto endpoints = erm.get_replicas_for_reading(token);
    validate_read_replicas(erm, endpoints);
    // Skip for non-debug builds and maintenance mode.
    if constexpr (tools::build_info::is_debug_build()) {
        if (!_db.local().get_config().maintenance_mode()) {
            validate_read_replicas(erm, endpoints);
        }
    }
    auto it = std::ranges::remove_if(endpoints, std::not_fn(std::bind_front(&storage_proxy::is_alive, this, std::cref(erm)))).begin();
    endpoints.erase(it, endpoints.end());
    sort_endpoints_by_proximity(erm, endpoints);

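The validation call is now gated with if constexpr, so release builds discard the whole branch at compile time instead of branching at runtime inside the helper. A minimal standalone sketch of the pattern; kIsDebugBuild stands in for tools::build_info::is_debug_build(), and the replica check is a simplified assumption:

.. code-block:: cpp

   #include <cstdio>
   #include <cstdlib>
   #include <vector>

   // Stand-in for tools::build_info::is_debug_build().
   constexpr bool kIsDebugBuild =
   #ifdef NDEBUG
       false;
   #else
       true;
   #endif

   void validate_replicas(const std::vector<int>& replicas, int rf) {
       if (static_cast<int>(replicas.size()) > rf) {
           std::fprintf(stderr, "more read replicas (%zu) than RF (%d)\n",
                        replicas.size(), rf);
           std::abort();
       }
   }

   std::vector<int> endpoints_for_reading(std::vector<int> replicas, int rf) {
       if constexpr (kIsDebugBuild) { // whole branch compiled out in release
           validate_replicas(replicas, rf);
       }
       return replicas;
   }
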
@@ -532,9 +532,16 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet
    co_await when_all_succeed(sys_ks_futures.begin(), sys_ks_futures.end()).discard_result();
}

static std::unordered_set<locator::host_id> get_released_nodes(const service::topology& topology, const locator::token_metadata& tm) {
    return boost::join(topology.left_nodes, topology.ignored_nodes)
            | std::views::transform([] (const auto& raft_id) { return locator::host_id(raft_id.uuid()); })
            | std::views::filter([&] (const auto& h) { return !tm.get_topology().has_node(h); })
            | std::ranges::to<std::unordered_set<locator::host_id>>();
}

// Synchronizes the local node state (token_metadata, system.peers/system.local tables,
// gossiper) to align it with the other raft topology nodes.
future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal) {
future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal, std::optional<std::unordered_set<locator::host_id>> prev_released) {
    nodes_to_notify_after_sync nodes_to_notify;

    rtlogger.trace("Start sync_raft_topology_nodes");
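get_released_nodes() above is a single range pipeline: concatenate two id sets, map raft ids to host ids, drop hosts still present in the topology, and collect into a set. A standalone C++23 sketch of the same shape; strings stand in for the id types, and an up-front concatenation replaces boost::join:

.. code-block:: cpp

   #include <ranges>
   #include <string>
   #include <unordered_set>
   #include <vector>

   int main() {
       std::vector<int> left_nodes = {1, 2};
       std::vector<int> ignored_nodes = {3, 4};
       std::unordered_set<std::string> topology = {"host-2"};

       // Stand-in for boost::join(left_nodes, ignored_nodes).
       std::vector<int> joined = left_nodes;
       joined.insert(joined.end(), ignored_nodes.begin(), ignored_nodes.end());

       auto released = joined
           | std::views::transform([](int raft_id) {          // raft id -> host id
                 return "host-" + std::to_string(raft_id);
             })
           | std::views::filter([&](const std::string& h) {   // keep unknown hosts only
                 return !topology.contains(h);
             })
           | std::ranges::to<std::unordered_set<std::string>>();

       return released.size() == 3 ? 0 : 1;  // host-1, host-3, host-4
   }
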
@@ -625,7 +632,9 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
            co_await update_topology_change_info(tmptr, ::format("{} {}/{}", rs.state, id, ip));
            break;
        case node_state::replacing: {
            SCYLLA_ASSERT(_topology_state_machine._topology.req_param.contains(id));
            if (!_topology_state_machine._topology.req_param.contains(id)) {
                on_internal_error(rtlogger, format("No request parameters for replacing node {}", id));
            }
            auto replaced_id = std::get<replace_param>(_topology_state_machine._topology.req_param[id]).replaced_id;
            auto existing_ip = _address_map.find(locator::host_id{replaced_id.uuid()});
            const auto replaced_host_id = locator::host_id(replaced_id.uuid());
@@ -642,7 +651,7 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
            co_await process_normal_node(id, host_id, ip, rs);
            break;
        default:
            on_fatal_internal_error(rtlogger, ::format("Unexpected state {} for node {}", rs.state, id));
            on_internal_error(rtlogger, ::format("Unexpected state {} for node {}", rs.state, id));
        }
    };
@@ -688,13 +697,10 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
        }
    }

    auto nodes_to_release = t.left_nodes;
    nodes_to_release.insert(t.ignored_nodes.begin(), t.ignored_nodes.end());
    for (const auto& id: nodes_to_release) {
        auto host_id = locator::host_id(id.uuid());
        if (!tmptr->get_topology().find_node(host_id)) {
            nodes_to_notify.released.push_back(host_id);
        }
    if (prev_released) {
        auto nodes_to_release = get_released_nodes(t, *tmptr);
        std::erase_if(nodes_to_release, [&] (const auto& host_id) { return prev_released->contains(host_id); });
        std::copy(nodes_to_release.begin(), nodes_to_release.end(), std::back_inserter(nodes_to_notify.released));
    }

    co_await when_all_succeed(sys_ks_futures.begin(), sys_ks_futures.end()).discard_result();
@@ -732,6 +738,10 @@ future<> storage_service::topology_state_load(state_change_hint hint) {

    rtlogger.debug("reload raft topology state");
    std::unordered_set<raft::server_id> prev_normal = _topology_state_machine._topology.normal_nodes | std::views::keys | std::ranges::to<std::unordered_set>();
    std::optional<std::unordered_set<locator::host_id>> prev_released;
    if (!_topology_state_machine._topology.is_empty()) {
        prev_released = get_released_nodes(_topology_state_machine._topology, get_token_metadata());
    }

    std::unordered_set<locator::host_id> tablet_hosts = co_await replica::read_required_hosts(_qp);

@@ -832,7 +842,7 @@ future<> storage_service::topology_state_load(state_change_hint hint) {
    }, topology.tstate);
    tmptr->set_read_new(read_new);

    auto nodes_to_notify = co_await sync_raft_topology_nodes(tmptr, std::move(prev_normal));
    auto nodes_to_notify = co_await sync_raft_topology_nodes(tmptr, std::move(prev_normal), std::move(prev_released));

    std::optional<locator::tablet_metadata> tablets;
    if (hint.tablets_hint) {
@@ -6276,7 +6286,11 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
            break;
        case raft_topology_cmd::command::stream_ranges: {
            co_await with_scheduling_group(_db.local().get_streaming_scheduling_group(), coroutine::lambda([&] () -> future<> {
                const auto rs = _topology_state_machine._topology.find(id)->second;
                const auto* server_rs = _topology_state_machine._topology.find(id);
                if (!server_rs) {
                    on_internal_error(rtlogger, format("Got {} request for node {} not found in topology", cmd.cmd, id));
                }
                const auto rs = server_rs->second;
                auto tstate = _topology_state_machine._topology.tstate;
                auto session = _topology_state_machine._topology.session;
                if (!rs.ring || rs.ring->tokens.empty()) {
@@ -7328,11 +7342,15 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(

    const locator::host_id this_host = _db.local().get_token_metadata().get_my_id();

    uint64_t sum_tablet_sizes = 0;
    // Align to 64 bytes to avoid cache line ping-pong when updating size in map_reduce0() below
    struct alignas(64) aligned_tablet_size {
        uint64_t size = 0;
    };
    std::vector<aligned_tablet_size> tablet_sizes_per_shard(smp::count);

    // Each node combines a per-table load map from all of its shards and returns it to the coordinator.
    // So if there are 1k nodes, there will be 1k RPCs in total.
    auto load_stats = co_await _db.map_reduce0([&table_ids, &this_host, &sum_tablet_sizes] (replica::database& db) -> future<locator::load_stats> {
    auto load_stats = co_await _db.map_reduce0([&table_ids, &this_host, &tablet_sizes_per_shard] (replica::database& db) -> future<locator::load_stats> {
        locator::load_stats load_stats{};
        auto& tables_metadata = db.get_tables_metadata();

@@ -7370,7 +7388,7 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(

            locator::combined_load_stats combined_ls { table->table_load_stats(tablet_filter) };
            load_stats.tables.emplace(id, std::move(combined_ls.table_ls));
            sum_tablet_sizes += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);
            tablet_sizes_per_shard[this_shard_id()].size += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);

            co_await coroutine::maybe_yield();
        }
@@ -7389,6 +7407,10 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
    if (config_capacity != 0) {
        tls.effective_capacity = config_capacity;
    } else {
        uint64_t sum_tablet_sizes = 0;
        for (const auto& ts : tablet_sizes_per_shard) {
            sum_tablet_sizes += ts.size;
        }
        tls.effective_capacity = si.available + sum_tablet_sizes;
    }

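The per-shard accumulator replaces a single shared counter: each shard writes only its own 64-byte-aligned slot, and the reduction happens once at the end, which avoids the cache-line ping-pong the comment describes. A standalone sketch of the false-sharing fix, with plain threads standing in for shards:

.. code-block:: cpp

   #include <cstdint>
   #include <thread>
   #include <vector>

   // alignas(64) keeps each slot on its own cache line, so concurrent
   // writers never contend on the same line.
   struct alignas(64) aligned_counter {
       uint64_t value = 0;
   };

   int main() {
       unsigned nshards = std::thread::hardware_concurrency();
       std::vector<aligned_counter> per_shard(nshards);
       std::vector<std::thread> workers;
       for (unsigned s = 0; s < nshards; ++s) {
           workers.emplace_back([&per_shard, s] {
               for (int i = 0; i < 1000000; ++i) {
                   per_shard[s].value += 1;  // each thread touches only its slot
               }
           });
       }
       for (auto& w : workers) {
           w.join();
       }
       uint64_t total = 0;
       for (const auto& c : per_shard) {
           total += c.value;  // single-threaded reduction at the end
       }
       return total == uint64_t(nshards) * 1000000 ? 0 : 1;
   }
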
@@ -8431,6 +8453,7 @@ future<> storage_service::start_maintenance_mode() {
    set_mode(mode::MAINTENANCE);

    return mutate_token_metadata([this] (mutable_token_metadata_ptr token_metadata) -> future<> {
        token_metadata->update_topology(my_host_id(), _snitch.local()->get_location(), locator::node::state::normal, smp::count);
        return token_metadata->update_normal_tokens({ dht::token{} }, my_host_id());
    }, acquire_merge_lock::yes);
}
@@ -8603,4 +8626,13 @@ future<> storage_service::query_cdc_streams(table_id table, noncopyable_function
    return _cdc_gens.local().query_cdc_streams(table, std::move(f));
}

future<> storage_service::on_cleanup_for_drop_table(const table_id& id) {
    co_await container().invoke_on_all([id] (storage_service& ss) {
        if (ss._repair.local_is_initialized()) {
            ss._repair.local().on_cleanup_for_drop_table(id);
        }
    });
    co_return;
}

} // namespace service

@@ -617,6 +617,8 @@ public:
    virtual void on_drop_function(const sstring& ks_name, const sstring& function_name) override {}
    virtual void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
    virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}

    future<> on_cleanup_for_drop_table(const table_id& id);
private:
    std::optional<db::system_keyspace::peer_info> get_peer_info_for_update(locator::host_id endpoint);
    // return an engaged value iff app_state_map has changes to the peer info
@@ -1115,7 +1117,7 @@ private:
    // gossiper) to align it with the other raft topology nodes.
    // Optional target_node can be provided to restrict the synchronization to the specified node.
    // Returns a structure that describes which notifications to trigger after token metadata is updated.
    future<nodes_to_notify_after_sync> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal);
    future<nodes_to_notify_after_sync> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal, std::optional<std::unordered_set<locator::host_id>> prev_released);
    // Triggers notifications (on_joined, on_left) based on the recent changes to token metadata, as described by the passed in structure.
    // This function should be called on the result of `sync_raft_topology_nodes`, after the global token metadata is updated.
    future<> notify_nodes_after_sync(nodes_to_notify_after_sync&& nodes_to_notify);

@@ -90,14 +90,14 @@ load_balancer_stats_manager::load_balancer_stats_manager(sstring group_name):
    setup_metrics(_cluster_stats);
}

load_balancer_dc_stats& load_balancer_stats_manager::for_dc(const dc_name& dc) {
const lw_shared_ptr<load_balancer_dc_stats>& load_balancer_stats_manager::for_dc(const dc_name& dc) {
    auto it = _dc_stats.find(dc);
    if (it == _dc_stats.end()) {
        auto stats = std::make_unique<load_balancer_dc_stats>();
        auto stats = make_lw_shared<load_balancer_dc_stats>();
        setup_metrics(dc, *stats);
        it = _dc_stats.emplace(dc, std::move(stats)).first;
    }
    return *it->second;
    return it->second;
}

load_balancer_node_stats& load_balancer_stats_manager::for_node(const dc_name& dc, host_id node) {
@@ -149,22 +149,22 @@ db::tablet_options combine_tablet_options(R&& opts) {

static std::unordered_set<locator::tablet_id> split_string_to_tablet_id(std::string_view s, char delimiter) {
    auto tokens_view = s | std::views::split(delimiter)
        | std::views::transform([](auto&& range) {
            return std::string_view(&*range.begin(), std::ranges::distance(range));
        })
        | std::views::transform([](std::string_view sv) {
            return locator::tablet_id(std::stoul(std::string(sv)));
        });
                       | std::views::transform([](auto&& range) {
                             return std::string_view(&*range.begin(), std::ranges::distance(range));
                         })
                       | std::views::transform([](std::string_view sv) {
                             return locator::tablet_id(std::stoul(std::string(sv)));
                         });
    return std::unordered_set<locator::tablet_id>{tokens_view.begin(), tokens_view.end()};
}

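The change to the pipeline above is whitespace-only, but its shape is worth a standalone check: split on the delimiter, materialize each piece as a string_view over the contiguous source, and parse. A minimal version with plain unsigned longs in place of locator::tablet_id:

.. code-block:: cpp

   #include <ranges>
   #include <string>
   #include <string_view>
   #include <unordered_set>

   std::unordered_set<unsigned long> parse_ids(std::string_view s, char delimiter) {
       auto ids = s | std::views::split(delimiter)
                    | std::views::transform([](auto&& range) {
                          // Safe only because the source string_view is contiguous.
                          return std::string_view(&*range.begin(), std::ranges::distance(range));
                      })
                    | std::views::transform([](std::string_view sv) {
                          return std::stoul(std::string(sv));
                      });
       return {ids.begin(), ids.end()};
   }

   // parse_ids("7,11,42", ',') yields {7, 11, 42}.
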
struct repair_plan {
        locator::global_tablet_id gid;
        locator::tablet_info tinfo;
        dht::token_range range;
        dht::token last_token;
        db_clock::duration repair_time_diff;
        bool is_user_reuqest;
    locator::global_tablet_id gid;
    locator::tablet_info tinfo;
    dht::token_range range;
    dht::token last_token;
    db_clock::duration repair_time_diff;
    bool is_user_reuqest;
};

// Used to compare different migration choices in regard to impact on load imbalance.
@@ -291,6 +291,12 @@ struct rack_list_colocation_state {
    }
};

/// Formattable wrapper for migration_plan, whose formatter prints a short summary of the plan.
struct plan_summary {
    migration_plan& plan;
    explicit plan_summary(migration_plan& plan) : plan(plan) {}
};

future<rack_list_colocation_state> find_required_rack_list_colocations(
        replica::database& db,
        token_metadata_ptr tmptr,
@@ -452,7 +458,36 @@ struct fmt::formatter<service::repair_plan> : fmt::formatter<std::string_view> {
    template <typename FormatContext>
    auto format(const service::repair_plan& p, FormatContext& ctx) const {
        auto diff_seconds = std::chrono::duration<float>(p.repair_time_diff).count();
        fmt::format_to(ctx.out(), "{{tablet={} last_token={} is_user_req={} diff_seconds={}}}", p.gid, p.last_token, p.is_user_reuqest, diff_seconds);
        fmt::format_to(ctx.out(), "{{tablet={} last_token={} is_user_req={} diff_seconds={}}}", p.gid, p.last_token, p.is_user_reuqest, diff_seconds);
        return ctx.out();
    }
};

template<>
struct fmt::formatter<service::plan_summary> : fmt::formatter<std::string_view> {
    template <typename FormatContext>
    auto format(const service::plan_summary& p, FormatContext& ctx) const {
        auto& plan = p.plan;
        std::string_view delim = "";
        auto get_delim = [&] { return std::exchange(delim, ", "); };
        if (plan.migrations().size()) {
            fmt::format_to(ctx.out(), "{}migrations: {}", get_delim(), plan.migrations().size());
        }
        if (plan.repair_plan().repairs().size()) {
            fmt::format_to(ctx.out(), "{}repairs: {}", get_delim(), plan.repair_plan().repairs().size());
        }
        if (plan.resize_plan().resize.size()) {
            fmt::format_to(ctx.out(), "{}resize: {}", get_delim(), plan.resize_plan().resize.size());
        }
        if (plan.resize_plan().finalize_resize.size()) {
            fmt::format_to(ctx.out(), "{}resize-ready: {}", get_delim(), plan.resize_plan().finalize_resize.size());
        }
        if (plan.rack_list_colocation_plan().size()) {
            fmt::format_to(ctx.out(), "{}rack-list colocation ready: {}", get_delim(), plan.rack_list_colocation_plan().request_to_resume());
        }
        if (delim.empty()) {
            fmt::format_to(ctx.out(), "empty");
        }
        return ctx.out();
    }
};

@@ -868,9 +903,12 @@ class load_balancer {
    absl::flat_hash_map<table_id, uint64_t> _disk_used_per_table;
    dc_name _dc;
    std::optional<sstring> _rack; // Set when plan making is limited to a single rack.
    sstring _location; // Name of the current scope of plan making. DC or DC+rack.
    lw_shared_ptr<load_balancer_dc_stats> _current_stats; // Stats for current scope of plan making.
    size_t _total_capacity_shards; // Total number of non-drained shards in the balanced node set.
    size_t _total_capacity_nodes; // Total number of non-drained nodes in the balanced node set.
    uint64_t _total_capacity_storage; // Total storage of non-drained nodes in the balanced node set.
    size_t _migrating_candidates; // Number of candidate replicas skipped because tablet is migrating.
    locator::load_stats_ptr _table_load_stats;
    load_balancer_stats_manager& _stats;
    std::unordered_set<host_id> _skiplist;
@@ -995,22 +1033,21 @@ public:
        migration_plan plan;

        auto rack_list_colocation = ongoing_rack_list_colocation();
        if (!utils::get_local_injector().enter("tablet_migration_bypass")) {
            // Prepare plans for each DC separately and combine them to be executed in parallel.
            for (auto&& dc : topo.get_datacenters()) {
                if (_db.get_config().rf_rack_valid_keyspaces() || _db.get_config().enforce_rack_list() || rack_list_colocation) {
                    for (auto rack : topo.get_datacenter_racks().at(dc) | std::views::keys) {
                        auto rack_plan = co_await make_plan(dc, rack);
                        auto level = rack_plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
                        lblogger.log(level, "Prepared {} migrations in rack {} in DC {}", rack_plan.size(), rack, dc);
                        plan.merge(std::move(rack_plan));
                    }
                } else {
                    auto dc_plan = co_await make_plan(dc);
                    auto level = dc_plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
                    lblogger.log(level, "Prepared {} migrations in DC {}", dc_plan.size(), dc);
                    plan.merge(std::move(dc_plan));

        // Prepare plans for each DC separately and combine them to be executed in parallel.
        for (auto&& dc : topo.get_datacenters()) {
            if (_db.get_config().rf_rack_valid_keyspaces() || _db.get_config().enforce_rack_list() || rack_list_colocation) {
                for (auto rack : topo.get_datacenter_racks().at(dc) | std::views::keys) {
                    auto rack_plan = co_await make_plan(dc, rack);
                    auto level = rack_plan.empty() ? seastar::log_level::debug : seastar::log_level::info;
                    lblogger.log(level, "Plan for {}/{}: {}", dc, rack, plan_summary(rack_plan));
                    plan.merge(std::move(rack_plan));
                }
            } else {
                auto dc_plan = co_await make_plan(dc);
                auto level = dc_plan.empty() ? seastar::log_level::debug : seastar::log_level::info;
                lblogger.log(level, "Plan for {}: {}", dc, plan_summary(dc_plan));
                plan.merge(std::move(dc_plan));
            }
        }

@@ -1027,9 +1064,8 @@ public:
|
||||
plan.set_repair_plan(co_await make_repair_plan(plan));
|
||||
}
|
||||
|
||||
auto level = plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
lblogger.log(level, "Prepared {} migration plans, out of which there were {} tablet migration(s) and {} resize decision(s) and {} tablet repair(s) and {} rack-list colocation(s)",
|
||||
plan.size(), plan.tablet_migration_count(), plan.resize_decision_count(), plan.tablet_repair_count(), plan.tablet_rack_list_colocation_count());
|
||||
auto level = plan.empty() ? seastar::log_level::debug : seastar::log_level::info;
|
||||
lblogger.log(level, "Prepared plan: {}", plan_summary(plan));
|
||||
co_return std::move(plan);
|
||||
}
|
||||
|
||||
@@ -1071,6 +1107,11 @@ public:
|
||||
if (!is_auto_repair_enabled(config)) {
|
||||
co_return false;
|
||||
}
|
||||
auto size = info.replicas.size();
|
||||
if (size <= 1) {
|
||||
lblogger.debug("Skipped auto repair for tablet={} replicas={}", gid, size);
|
||||
co_return false;
|
||||
}
|
||||
auto threshold = _db.get_config().auto_repair_threshold_default_in_seconds();
|
||||
auto repair_time_threshold = std::chrono::seconds(threshold);
|
||||
auto& last_repair_time = info.repair_time;
|
||||
@@ -1408,7 +1449,7 @@ public:
|
||||
co_return all_colocated;
|
||||
}
|
||||
|
||||
future<migration_plan> make_merge_colocation_plan(const dc_name& dc, node_load_map& nodes) {
|
||||
future<migration_plan> make_merge_colocation_plan(node_load_map& nodes) {
|
||||
migration_plan plan;
|
||||
table_resize_plan resize_plan;
|
||||
|
||||
@@ -1565,7 +1606,7 @@ public:
|
||||
if (cross_rack_migration(src, dst)) {
|
||||
// FIXME: This is illegal if table has views, as it breaks base-view pairing.
|
||||
// Can happen when RF!=#racks.
|
||||
_stats.for_dc(_dc).cross_rack_collocations++;
|
||||
_current_stats->cross_rack_collocations++;
|
||||
lblogger.debug("Cross-rack co-location migration for {}@{} (rack: {}) to co-habit {}@{} (rack: {})",
|
||||
t2_id, src, rack_of(src), t1_id, dst, rack_of(dst));
|
||||
utils::get_local_injector().inject("forbid_cross_rack_migration_attempt", [&] {
|
||||
@@ -2215,7 +2256,7 @@ public:
|
||||
|
||||
// Evaluates impact on load balance of migrating a tablet set of a given table to dst.
|
||||
migration_badness evaluate_dst_badness(node_load_map& nodes, table_id table, tablet_replica dst, uint64_t tablet_set_disk_size) {
|
||||
_stats.for_dc(_dc).candidates_evaluated++;
|
||||
_current_stats->candidates_evaluated++;
|
||||
|
||||
auto& node_info = nodes[dst.host];
|
||||
|
||||
@@ -2254,7 +2295,7 @@ public:
|
||||
|
||||
// Evaluates impact on load balance of migrating a tablet set of a given table from src.
|
||||
migration_badness evaluate_src_badness(node_load_map& nodes, table_id table, tablet_replica src, uint64_t tablet_set_disk_size) {
|
||||
_stats.for_dc(_dc).candidates_evaluated++;
|
||||
_current_stats->candidates_evaluated++;
|
||||
|
||||
auto& node_info = nodes[src.host];
|
||||
|
||||
@@ -2603,15 +2644,15 @@ public:
|
||||
auto mig_streaming_info = get_migration_streaming_infos(_tm->get_topology(), tmap, mig);
|
||||
|
||||
if (!can_accept_load(nodes, mig_streaming_info)) {
|
||||
_stats.for_dc(node_load.dc()).migrations_skipped++;
|
||||
_current_stats->migrations_skipped++;
|
||||
lblogger.debug("Unable to balance {}: load limit reached", host);
|
||||
break;
|
||||
}
|
||||
|
||||
apply_load(nodes, mig_streaming_info);
|
||||
lblogger.debug("Adding migration: {} size: {}", mig, tablets.tablet_set_disk_size);
|
||||
_stats.for_dc(node_load.dc()).migrations_produced++;
|
||||
_stats.for_dc(node_load.dc()).intranode_migrations_produced++;
|
||||
_current_stats->migrations_produced++;
|
||||
_current_stats->intranode_migrations_produced++;
|
||||
mark_as_scheduled(mig);
|
||||
plan.add(std::move(mig));
|
||||
|
||||
@@ -2718,21 +2759,21 @@ public:
|
||||
auto targets = get_viable_targets();
|
||||
if (rs->is_rack_based(_dc)) {
|
||||
lblogger.debug("candidate tablet {} skipped because RF is rack-based and it's in a different rack", tablet);
|
||||
_stats.for_dc(src_info.dc()).tablets_skipped_rack++;
|
||||
_current_stats->tablets_skipped_rack++;
|
||||
return skip_info{std::move(targets)};
|
||||
}
|
||||
if (!targets.contains(dst_info.id)) {
|
||||
auto new_rack_load = rack_load[dst_info.rack()] + 1;
|
||||
lblogger.debug("candidate tablet {} skipped because it would increase load on rack {} to {}, max={}",
|
||||
tablet, dst_info.rack(), new_rack_load, max_rack_load);
|
||||
_stats.for_dc(src_info.dc()).tablets_skipped_rack++;
|
||||
_current_stats->tablets_skipped_rack++;
|
||||
return skip_info{std::move(targets)};
|
||||
}
|
||||
}
|
||||
|
||||
for (auto&& r : tmap.get_tablet_info(tablet.tablet).replicas) {
|
||||
if (r.host == dst_info.id) {
|
||||
_stats.for_dc(src_info.dc()).tablets_skipped_node++;
|
||||
_current_stats->tablets_skipped_node++;
|
||||
lblogger.debug("candidate tablet {} skipped because it has a replica on target node", tablet);
|
||||
if (need_viable_targets) {
|
||||
return skip_info{get_viable_targets()};
|
||||
@@ -2939,7 +2980,7 @@ public:
|
||||
};
|
||||
|
||||
if (min_candidate.badness.is_bad() && _use_table_aware_balancing) {
|
||||
_stats.for_dc(_dc).bad_first_candidates++;
|
||||
_current_stats->bad_first_candidates++;
|
||||
|
||||
// Consider better alternatives.
|
||||
if (drain_skipped) {
|
||||
@@ -3060,7 +3101,7 @@ public:
|
||||
lblogger.debug("Table {} shard overcommit: {}", table, overcommit);
|
||||
}
|
||||
|
||||
future<migration_plan> make_internode_plan(const dc_name& dc, node_load_map& nodes,
|
||||
future<migration_plan> make_internode_plan(node_load_map& nodes,
|
||||
const std::unordered_set<host_id>& nodes_to_drain,
|
||||
host_id target) {
|
||||
migration_plan plan;
|
||||
@@ -3120,7 +3161,7 @@ public:
|
||||
|
||||
if (nodes_by_load.empty()) {
|
||||
lblogger.debug("No more candidate nodes");
|
||||
_stats.for_dc(dc).stop_no_candidates++;
|
||||
_current_stats->stop_no_candidates++;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -3191,7 +3232,7 @@ public:
|
||||
|
||||
if (nodes_by_load_dst.empty()) {
|
||||
lblogger.debug("No more target nodes");
|
||||
_stats.for_dc(dc).stop_no_candidates++;
|
||||
_current_stats->stop_no_candidates++;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -3221,7 +3262,7 @@ public:
|
||||
const load_type max_load = std::max(max_off_candidate_load, src_node_info.avg_load);
|
||||
if (is_balanced(target_info.avg_load, max_load)) {
|
||||
lblogger.debug("Balance achieved.");
|
||||
_stats.for_dc(dc).stop_balance++;
|
||||
_current_stats->stop_balance++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -3255,7 +3296,7 @@ public:
|
||||
auto& tmap = tmeta.get_tablet_map(source_tablets.table());
|
||||
if (can_check_convergence && !check_convergence(src_node_info, target_info, source_tablets)) {
|
||||
lblogger.debug("No more candidates. Load would be inverted.");
|
||||
_stats.for_dc(dc).stop_load_inversion++;
|
||||
_current_stats->stop_load_inversion++;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -3289,11 +3330,11 @@ public:
|
||||
}
|
||||
}
|
||||
if (candidate.badness.is_bad()) {
|
||||
_stats.for_dc(_dc).bad_migrations++;
|
||||
_current_stats->bad_migrations++;
|
||||
}
|
||||
|
||||
if (drain_skipped) {
|
||||
_stats.for_dc(_dc).migrations_from_skiplist++;
|
||||
_current_stats->migrations_from_skiplist++;
|
||||
}
|
||||
|
||||
if (src_node_info.req && *src_node_info.req == topology_request::leave && src_node_info.excluded) {
|
||||
@@ -3313,7 +3354,7 @@ public:
|
||||
if (can_accept_load(nodes, mig_streaming_info)) {
|
||||
apply_load(nodes, mig_streaming_info);
|
||||
lblogger.debug("Adding migration: {} size: {}", mig, source_tablets.tablet_set_disk_size);
|
||||
_stats.for_dc(dc).migrations_produced++;
|
||||
_current_stats->migrations_produced++;
|
||||
mark_as_scheduled(mig);
|
||||
plan.add(std::move(mig));
|
||||
} else {
|
||||
@@ -3324,10 +3365,10 @@ public:
|
||||
// Just because the next migration is blocked doesn't mean we could not proceed with migrations
|
||||
// for other shards which are produced by the planner subsequently.
|
||||
skipped_migrations++;
|
||||
_stats.for_dc(dc).migrations_skipped++;
|
||||
_current_stats->migrations_skipped++;
|
||||
if (skipped_migrations >= max_skipped_migrations) {
|
||||
lblogger.debug("Too many migrations skipped, aborting balancing");
|
||||
_stats.for_dc(dc).stop_skip_limit++;
|
||||
_current_stats->stop_skip_limit++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -3346,7 +3387,7 @@ public:
|
||||
}
|
||||
|
||||
if (plan.size() == batch_size) {
|
||||
_stats.for_dc(dc).stop_batch_size++;
|
||||
_current_stats->stop_batch_size++;
|
||||
}
|
||||
|
||||
if (plan.empty()) {
|
||||
@@ -3363,7 +3404,13 @@ public:
|
||||
// If there are 7 tablets and RF=3, each node must have 1 tablet replica.
|
||||
// So node3 will have average load of 1, and node1 and node2 will have
|
||||
// average shard load of 7.
|
||||
lblogger.info("Not possible to achieve balance.");
|
||||
|
||||
// Show when this is the final plan with no active migrations left to execute,
|
||||
// otherwise it may just be a temporary situation due to lack of candidates.
|
||||
if (_migrating_candidates == 0) {
|
||||
lblogger.info("Not possible to achieve balance in {}", _location);
|
||||
print_node_stats(nodes, only_active::no);
|
||||
}
|
||||
}
|
||||
|
||||
co_return std::move(plan);
|
||||
@@ -3420,11 +3467,37 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
using only_active = bool_class<struct only_active_tag>;
|
||||
|
||||
void print_node_stats(node_load_map& nodes, only_active only_active_) {
|
||||
for (auto&& [host, load] : nodes) {
|
||||
size_t read = 0;
|
||||
size_t write = 0;
|
||||
for (auto& shard_load : load.shards) {
|
||||
read += shard_load.streaming_read_load;
|
||||
write += shard_load.streaming_write_load;
|
||||
}
|
||||
auto level = !only_active_ || (read + write) > 0 ? seastar::log_level::info : seastar::log_level::debug;
|
||||
lblogger.log(level, "Node {}: {}/{} load={:.6f} tablets={} shards={} tablets/shard={:.3f} state={} cap={}"
|
||||
" rd={} wr={}",
|
||||
host, load.dc(), load.rack(), load.avg_load, load.tablet_count, load.shard_count,
|
||||
load.tablets_per_shard(), load.state(), load.dusage->capacity, read, write);
|
||||
}
|
||||
}
|
||||
|
||||
future<migration_plan> make_plan(dc_name dc, std::optional<sstring> rack = std::nullopt) {
|
||||
migration_plan plan;
|
||||
|
||||
if (utils::get_local_injector().enter("tablet_migration_bypass")) {
|
||||
co_return std::move(plan);
|
||||
}
|
||||
|
||||
_dc = dc;
|
||||
_rack = rack;
|
||||
_location = fmt::format("{}{}", dc, rack ? fmt::format("/{}", *rack) : "");
|
||||
_current_stats = _stats.for_dc(dc);
|
||||
auto _ = seastar::defer([&] { _current_stats = nullptr; });
|
||||
_migrating_candidates = 0;
|
||||
|
||||
auto node_filter = [&] (const locator::node& node) {
|
||||
return node.dc_rack().dc == dc && (!rack || node.dc_rack().rack == *rack);
|
||||
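only_active above is a Seastar bool_class: a strongly typed boolean that keeps call sites like print_node_stats(nodes, only_active::no) self-documenting and prevents accidentally passing a bare true/false in the wrong position. A hedged sketch of the pattern outside this codebase:

#include <seastar/util/bool_class.hh>

// The tag struct never needs a definition; it only distinguishes the type.
using only_active = seastar::bool_class<struct only_active_tag>;

void print_stats(only_active filter) {
    if (filter) { // bool_class is explicitly convertible to bool
        // ... log only nodes with active streaming load ...
    }
}

// Call sites read unambiguously, and print_stats(true) does not compile:
// print_stats(only_active::yes);
// print_stats(only_active::no);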
@@ -3433,7 +3506,7 @@ public:
    // Causes load balancer to move some tablet even though load is balanced.
    auto shuffle = in_shuffle_mode();

-   _stats.for_dc(dc).calls++;
+   _current_stats->calls++;
    lblogger.debug("Examining DC {} rack {} (shuffle={}, balancing={}, tablets_per_shard_goal={}, force_capacity_based_balancing={})",
        dc, rack, shuffle, _tm->tablets().balancing_enabled(), _tablets_per_shard_goal, _force_capacity_based_balancing);

@@ -3529,7 +3602,7 @@ public:

    if (nodes.empty()) {
        lblogger.debug("No nodes to balance.");
-       _stats.for_dc(dc).stop_balance++;
+       _current_stats->stop_balance++;
        co_return plan;
    }

@@ -3552,15 +3625,23 @@ public:

    // If we don't have nodes to drain, remove nodes which don't have complete tablet sizes
    if (nodes_to_drain.empty()) {
        std::optional<host_id> incomplete_host;
        size_t incomplete_count = 0;

        for (auto nodes_i = nodes.begin(); nodes_i != nodes.end();) {
            host_id host = nodes_i->first;
            if (!_load_sketch->has_complete_data(host)) {
-               lblogger.info("Node {} does not have complete tablet stats, ignoring", nodes_i->first);
+               incomplete_host.emplace(host);
+               incomplete_count++;
                nodes_i = nodes.erase(nodes_i);
            } else {
                ++nodes_i;
            }
        }

        if (incomplete_host) {
            lblogger.info("Ignoring {} node(s) with incomplete tablet stats, e.g. {}", incomplete_count, *incomplete_host);
        }
    }

    plan.set_has_nodes_to_drain(!nodes_to_drain.empty());

@@ -3594,11 +3675,11 @@ public:
    });
    if (!has_dest_nodes) {
        for (auto host : nodes_to_drain) {
-           plan.add(drain_failure(host, format("No candidate nodes in DC {} to drain {}."
-               " Consider adding new nodes or reducing replication factor.", dc, host)));
+           plan.add(drain_failure(host, format("No candidate nodes in {} to drain {}."
+               " Consider adding new nodes or reducing replication factor.", _location, host)));
        }
        lblogger.debug("No candidate nodes");
-       _stats.for_dc(dc).stop_no_candidates++;
+       _current_stats->stop_no_candidates++;
        co_return plan;
    }

@@ -3704,6 +3785,8 @@ public:
    if (!migrating(t1) && !migrating(t2)) {
        auto candidate = colocated_tablets{global_tablet_id{table, t1.tid}, global_tablet_id{table, t2->tid}};
        add_candidate(shard_load_info, migration_tablet_set{std::move(candidate), tablet_sizes_sum});
    } else {
        _migrating_candidates++;
    }
} else {
    if (tids.size() != tablet_sizes.size()) {

@@ -3712,6 +3795,8 @@ public:
    for (size_t i = 0; i < tids.size(); i++) {
        if (!migrating(get_table_desc(tids[i]))) { // migrating tablets are not candidates
            add_candidate(shard_load_info, migration_tablet_set{global_tablet_id{table, tids[i]}, tablet_sizes[i]});
        } else {
            _migrating_candidates++;
        }
    }
}

@@ -3749,26 +3834,14 @@ public:
        }
    }

-   for (auto&& [host, load] : nodes) {
-       size_t read = 0;
-       size_t write = 0;
-       for (auto& shard_load : load.shards) {
-           read += shard_load.streaming_read_load;
-           write += shard_load.streaming_write_load;
-       }
-       auto level = (read + write) > 0 ? seastar::log_level::info : seastar::log_level::debug;
-       lblogger.log(level, "Node {}: dc={} rack={} load={} tablets={} shards={} tablets/shard={} state={} cap={}"
-           " stream_read={} stream_write={}",
-           host, dc, load.rack(), load.avg_load, load.tablet_count, load.shard_count,
-           load.tablets_per_shard(), load.state(), load.dusage->capacity, read, write);
-   }
+   print_node_stats(nodes, only_active::yes);

    if (!nodes_to_drain.empty() || (_tm->tablets().balancing_enabled() && (shuffle || !is_balanced(min_load, max_load)))) {
        host_id target = *min_load_node;
        lblogger.info("target node: {}, avg_load: {}, max: {}", target, min_load, max_load);
-       plan.merge(co_await make_internode_plan(dc, nodes, nodes_to_drain, target));
+       plan.merge(co_await make_internode_plan(nodes, nodes_to_drain, target));
    } else {
-       _stats.for_dc(dc).stop_balance++;
+       _current_stats->stop_balance++;
    }

    if (_tm->tablets().balancing_enabled()) {

@@ -3776,9 +3849,9 @@ public:
    }

    if (_tm->tablets().balancing_enabled() && plan.empty() && !ongoing_rack_list_colocation()) {
-       auto dc_merge_plan = co_await make_merge_colocation_plan(dc, nodes);
+       auto dc_merge_plan = co_await make_merge_colocation_plan(nodes);
        auto level = dc_merge_plan.tablet_migration_count() > 0 ? seastar::log_level::info : seastar::log_level::debug;
-       lblogger.log(level, "Prepared {} migrations for co-locating sibling tablets in DC {}", dc_merge_plan.tablet_migration_count(), dc);
+       lblogger.log(level, "Prepared {} migrations for co-locating sibling tablets in {}", dc_merge_plan.tablet_migration_count(), _location);
        plan.merge(std::move(dc_merge_plan));
    }
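A recurring change across these hunks is replacing _stats.for_dc(dc).counter++ with _current_stats->counter++: make_plan now resolves the per-DC stats object once, caches it for the duration of the plan-making scope (DC or DC+rack), and clears it on exit with seastar::defer, exactly as in the diff above. A simplified sketch of the caching pattern (types and names are illustrative):

#include <seastar/core/shared_ptr.hh>
#include <seastar/util/defer.hh>
#include <cstdint>
#include <string>
#include <unordered_map>

struct dc_stats { uint64_t calls = 0; };

struct balancer {
    std::unordered_map<std::string, seastar::lw_shared_ptr<dc_stats>> _dc_stats;
    seastar::lw_shared_ptr<dc_stats> _current_stats;

    // Looks up (or creates) the per-DC stats object.
    seastar::lw_shared_ptr<dc_stats> stats_for_dc(const std::string& dc) {
        auto& s = _dc_stats[dc];
        if (!s) {
            s = seastar::make_lw_shared<dc_stats>();
        }
        return s;
    }

    void make_plan(const std::string& dc) {
        _current_stats = stats_for_dc(dc); // one hash lookup per plan, not one per counter bump
        auto reset = seastar::defer([this] { _current_stats = nullptr; });
        _current_stats->calls++;
        // ... planning; every counter update goes through _current_stats ...
    }
};

This also explains the header change just below: _dc_stats switches from unique_ptr to lw_shared_ptr so that a cached handle can safely outlive a rehash of the map during the plan-making scope.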
@@ -100,7 +100,7 @@ class load_balancer_stats_manager {
    using host_id = locator::host_id;

    sstring group_name;
-   std::unordered_map<dc_name, std::unique_ptr<load_balancer_dc_stats>> _dc_stats;
+   std::unordered_map<dc_name, lw_shared_ptr<load_balancer_dc_stats>> _dc_stats;
    std::unordered_map<host_id, std::unique_ptr<load_balancer_node_stats>> _node_stats;
    load_balancer_cluster_stats _cluster_stats;
    seastar::metrics::label dc_label{"target_dc"};

@@ -113,7 +113,7 @@ class load_balancer_stats_manager {
public:
    load_balancer_stats_manager(sstring group_name);

-   load_balancer_dc_stats& for_dc(const dc_name& dc);
+   const lw_shared_ptr<load_balancer_dc_stats>& for_dc(const dc_name& dc);
    load_balancer_node_stats& for_node(const dc_name& dc, host_id node);
    load_balancer_cluster_stats& for_cluster();

@@ -196,7 +196,7 @@ public:
    bool has_nodes_to_drain() const { return _has_nodes_to_drain; }

    const migrations_vector& migrations() const { return _migrations; }
-   bool empty() const { return _migrations.empty() && !_resize_plan.size() && !_repair_plan.size() && !_rack_list_colocation_plan.size() && _drain_failures.empty(); }
+   bool empty() const { return !size(); }
    size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size() + _rack_list_colocation_plan.size() + _drain_failures.size(); }
    size_t tablet_migration_count() const { return _migrations.size(); }
    size_t resize_decision_count() const { return _resize_plan.size(); }
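Defining empty() as !size(), with size() summing every plan component, removes a maintenance hazard: the previous hand-written conjunction had to enumerate each component separately, so any future component added to size() but forgotten in empty() would let the two disagree. With the new form the invariant holds by construction:

// Always true, no matter which components the plan gains later.
assert(plan.empty() == (plan.size() == 0));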
@@ -331,12 +331,17 @@ class topology_coordinator : public endpoint_lifecycle_subscriber

    auto [id, req] = *next_req;

    auto* server_rs = topo.find(id);
    if (!server_rs) {
        on_internal_error(rtlogger, format("Node {} has a pending {} request but is not found in topology", id, req));
    }

    if (cleanup_needed && (req == topology_request::remove || req == topology_request::leave)) {
        // If the highest prio request is removenode or decommission we need to start cleanup if one is needed
        return start_vnodes_cleanup(std::move(guard), req, id);
    }

-   return node_to_work_on(std::move(guard), &topo, id, &topo.find(id)->second, req, get_request_param(id));
+   return node_to_work_on(std::move(guard), &topo, id, &server_rs->second, req, get_request_param(id));
};

node_to_work_on get_node_to_work_on(group0_guard guard) const {

@@ -373,7 +378,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
    auto& topo = _topo_sm._topology;

    auto it = topo.find(id);
-   SCYLLA_ASSERT(it);
+   if (!it) {
+       on_internal_error(rtlogger, format("retake_node: node {} not found in topology", id));
+   }

    std::optional<topology_request> req;
    auto rit = topo.requests.find(id);

@@ -2492,7 +2499,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber

    switch (node.rs->state) {
    case node_state::bootstrapping: {
-       SCYLLA_ASSERT(!node.rs->ring);
+       if (node.rs->ring) {
+           on_internal_error(rtlogger, format("Bootstrapping node {} owns tokens", node.id));
+       }
        auto num_tokens = std::get<join_param>(node.req_param.value()).num_tokens;
        auto tokens_string = std::get<join_param>(node.req_param.value()).tokens_string;

@@ -2548,11 +2557,23 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
    }
    break;
    case node_state::replacing: {
-       SCYLLA_ASSERT(!node.rs->ring);
+       if (node.rs->ring) {
+           on_internal_error(rtlogger, format("Replacing node {} owns tokens", node.id));
+       }
        auto replaced_id = std::get<replace_param>(node.req_param.value()).replaced_id;
        auto it = _topo_sm._topology.normal_nodes.find(replaced_id);
-       SCYLLA_ASSERT(it != _topo_sm._topology.normal_nodes.end());
-       SCYLLA_ASSERT(it->second.ring && it->second.state == node_state::normal);
+       if (it == _topo_sm._topology.normal_nodes.end()) {
+           on_internal_error(rtlogger,
+               format("Node {} being replaced by {} not found in normal nodes", replaced_id, node.id));
+       }
+       if (!it->second.ring) {
+           on_internal_error(rtlogger,
+               format("Node {} being replaced by {} is missing tokens", replaced_id, node.id));
+       }
+       if (it->second.state != node_state::normal) {
+           on_internal_error(rtlogger,
+               format("Node {} being replaced by {} is not in normal state", replaced_id, node.id));
+       }

        topology_mutation_builder builder(node.guard.write_timestamp());

@@ -2951,7 +2972,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
    }
    break;
    default:
-       on_fatal_internal_error(rtlogger, ::format(
+       on_internal_error(rtlogger, ::format(
            "Ring state on node {} is write_both_read_new while the node is in state {}",
            node.id, node.rs->state));
    }

@@ -3268,7 +3289,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
    rtbuilder.set("start_time", db_clock::now());
    switch (node.request.value()) {
    case topology_request::join: {
-       SCYLLA_ASSERT(!node.rs->ring);
+       if (node.rs->ring) {
+           on_internal_error(rtlogger, ::format("Joining node {} owns tokens", node.id));
+       }
        // Write chosen tokens through raft.
        builder.set_transition_state(topology::transition_state::join_group0)
            .with_node(node.id)

@@ -3280,7 +3303,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
        break;
    }
    case topology_request::leave: {
-       SCYLLA_ASSERT(node.rs->ring);
+       if (!node.rs->ring) {
+           on_internal_error(rtlogger, ::format("Leaving node {} doesn't own tokens", node.id));
+       }

        auto validation_result = validate_removing_node(_db, to_host_id(node.id));
        if (std::holds_alternative<node_validation_failure>(validation_result)) {

@@ -3311,7 +3336,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
        break;
    }
    case topology_request::remove: {
-       SCYLLA_ASSERT(node.rs->ring);
+       if (!node.rs->ring) {
+           on_internal_error(rtlogger, ::format("Node {} being removed doesn't own tokens", node.id));
+       }

        auto validation_result = validate_removing_node(_db, to_host_id(node.id));
        if (std::holds_alternative<node_validation_failure>(validation_result)) {

@@ -3339,7 +3366,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
        break;
    }
    case topology_request::replace: {
-       SCYLLA_ASSERT(!node.rs->ring);
+       if (node.rs->ring) {
+           on_internal_error(rtlogger, ::format("Replacing node {} owns tokens", node.id));
+       }

        builder.set_transition_state(topology::transition_state::join_group0)
            .with_node(node.id)

@@ -3396,12 +3425,12 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
    case node_state::removing:
    case node_state::replacing:
        // Should not get here
-       on_fatal_internal_error(rtlogger, ::format(
+       on_internal_error(rtlogger, ::format(
            "Found node {} in state {} but there is no ongoing topology transition",
            node.id, node.rs->state));
    case node_state::left:
        // Should not get here
-       on_fatal_internal_error(rtlogger, ::format(
+       on_internal_error(rtlogger, ::format(
            "Topology coordinator is called for node {} in state 'left'", node.id));
        break;
    }

@@ -3463,7 +3492,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber

    auto id = node.id;

-   SCYLLA_ASSERT(!_topo_sm._topology.transition_nodes.empty());
+   if (_topo_sm._topology.transition_nodes.empty()) {
+       on_internal_error(rtlogger, format("transition nodes are empty while accepting node {}", node.id));
+   }

    release_node(std::move(node));

@@ -3873,6 +3904,9 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
    for (auto& [table_id, table_stats] : dc_stats.tables) {
        co_await coroutine::maybe_yield();

        if (!_db.column_family_exists(table_id)) {
            continue;
        }
        auto& t = _db.find_column_family(table_id);
        auto& rs = t.get_effective_replication_map()->get_replication_strategy();
        if (!rs.uses_tablets()) {

@@ -3896,6 +3930,9 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
    }

    for (auto& [table_id, table_load_stats] : stats.tables) {
        if (!total_replicas.contains(table_id)) {
            continue;
        }
        auto table_total_replicas = total_replicas.at(table_id);
        if (table_total_replicas == 0) {
            continue;

@@ -20,6 +20,8 @@ namespace db {

namespace service {

extern logging::logger rtlogger;

topology_mutation_builder::topology_mutation_builder(api::timestamp_type ts) :
    _s(db::system_keyspace::topology()),
    _m(_s, partition_key::from_singular(*_s, db::system_keyspace::TOPOLOGY)),

@@ -35,7 +37,9 @@ topology_node_mutation_builder::topology_node_mutation_builder(topology_mutation
template<typename Builder>
Builder& topology_mutation_builder_base<Builder>::apply_atomic(const char* cell, const data_value& value) {
    const column_definition* cdef = self().schema().get_column_definition(cell);
-   SCYLLA_ASSERT(cdef);
+   if (!cdef) {
+       on_internal_error(rtlogger, format("column {} not found in the topology table", cell));
+   }
    self().row().apply(*cdef, atomic_cell::make_live(*cdef->type, self().timestamp(), cdef->type->decompose(value), self().ttl()));
    return self();
}

@@ -45,7 +49,9 @@ template<std::ranges::range C>
requires std::convertible_to<std::ranges::range_value_t<C>, data_value>
Builder& topology_mutation_builder_base<Builder>::apply_set(const char* cell, collection_apply_mode apply_mode, const C& c) {
    const column_definition* cdef = self().schema().get_column_definition(cell);
-   SCYLLA_ASSERT(cdef);
+   if (!cdef) {
+       on_internal_error(rtlogger, format("column {} not found in the topology table", cell));
+   }
    auto vtype = static_pointer_cast<const set_type_impl>(cdef->type)->get_elements_type();

    std::set<bytes, serialized_compare> cset(vtype->as_less_comparator());

@@ -70,7 +76,9 @@ Builder& topology_mutation_builder_base<Builder>::apply_set(const char* cell, co
template<typename Builder>
Builder& topology_mutation_builder_base<Builder>::del(const char* cell) {
    auto cdef = self().schema().get_column_definition(cell);
-   SCYLLA_ASSERT(cdef);
+   if (!cdef) {
+       on_internal_error(rtlogger, format("column {} not found in the topology table", cell));
+   }
    if (!cdef->type->is_multi_cell()) {
        self().row().apply(*cdef, atomic_cell::make_dead(self().timestamp(), gc_clock::now()));
    } else {
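The dominant theme of the topology-coordinator hunks is mechanical: every SCYLLA_ASSERT(cond) becomes an explicit if (!cond) on_internal_error(logger, message), and on_fatal_internal_error is downgraded to on_internal_error. The practical difference is that the assert and the fatal variant abort the process unconditionally, while on_internal_error logs and (depending on configuration) throws instead of aborting, so a coordinator bug surfaces as a failed operation with context rather than a crashed node. The shape of the rewrite, shown on one of the invariants from this diff:

// Before: aborts the whole process whenever the invariant is violated.
SCYLLA_ASSERT(node.rs->ring);

// After: reports the broken invariant with context; by default this throws
// rather than aborting (aborting only if so configured), so the topology
// operation fails instead of the node.
if (!node.rs->ring) {
    on_internal_error(rtlogger, ::format("Leaving node {} doesn't own tokens", node.id));
}

The extern logging::logger rtlogger declaration added to the mutation-builder file exists precisely so these new on_internal_error calls have a logger to report through.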
@@ -61,7 +61,6 @@ PYTEST_RUNNER_DIRECTORIES = [
    TEST_DIR / 'raft',
    TEST_DIR / 'unit',
    TEST_DIR / 'vector_search',
-   TEST_DIR / 'vector_search_validator',
    TEST_DIR / 'alternator',
    TEST_DIR / 'broadcast_tables',
    TEST_DIR / 'cql',

@@ -103,7 +103,6 @@ if(BUILD_TESTING)
    add_subdirectory(raft)
    add_subdirectory(resource/wasm)
    add_subdirectory(vector_search)
-   add_subdirectory(vector_search_validator)

    if(CMAKE_CONFIGURATION_TYPES)
        foreach(config ${CMAKE_CONFIGURATION_TYPES})
@@ -11,6 +11,7 @@
#include "utils/s3/aws_error.hh"
#include <boost/test/unit_test.hpp>
#include <seastar/core/sstring.hh>
#include <seastar/http/exception.hh>

enum class message_style : uint8_t { singular = 1, plural = 2 };

@@ -122,7 +123,7 @@ BOOST_AUTO_TEST_CASE(TestNestedException) {
        std::throw_with_nested(std::logic_error("Higher level logic_error"));
    }
} catch (...) {
-   auto error = aws::aws_error::from_maybe_nested_exception(std::current_exception());
+   auto error = aws::aws_error::from_exception_ptr(std::current_exception());
    BOOST_REQUIRE_EQUAL(aws::aws_error_type::NETWORK_CONNECTION, error.get_error_type());
    BOOST_REQUIRE_EQUAL("Software caused connection abort", error.get_error_message());
    BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::yes);

@@ -136,7 +137,7 @@ BOOST_AUTO_TEST_CASE(TestNestedException) {
        std::throw_with_nested(std::runtime_error("Higher level runtime_error"));
    }
} catch (...) {
-   auto error = aws::aws_error::from_maybe_nested_exception(std::current_exception());
+   auto error = aws::aws_error::from_exception_ptr(std::current_exception());
    BOOST_REQUIRE_EQUAL(aws::aws_error_type::UNKNOWN, error.get_error_type());
    BOOST_REQUIRE_EQUAL("Higher level runtime_error", error.get_error_message());
    BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::no);

@@ -146,7 +147,7 @@ BOOST_AUTO_TEST_CASE(TestNestedException) {
    try {
        throw std::runtime_error("Something bad happened");
    } catch (...) {
-       auto error = aws::aws_error::from_maybe_nested_exception(std::current_exception());
+       auto error = aws::aws_error::from_exception_ptr(std::current_exception());
        BOOST_REQUIRE_EQUAL(aws::aws_error_type::UNKNOWN, error.get_error_type());
        BOOST_REQUIRE_EQUAL("Something bad happened", error.get_error_message());
        BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::no);

@@ -156,9 +157,39 @@ BOOST_AUTO_TEST_CASE(TestNestedException) {
    try {
        throw "foo";
    } catch (...) {
-       auto error = aws::aws_error::from_maybe_nested_exception(std::current_exception());
+       auto error = aws::aws_error::from_exception_ptr(std::current_exception());
        BOOST_REQUIRE_EQUAL(aws::aws_error_type::UNKNOWN, error.get_error_type());
        BOOST_REQUIRE_EQUAL("No error message was provided, exception content: char const*", error.get_error_message());
        BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::no);
    }

    // Test system_error
    try {
        throw std::system_error(std::error_code(ECONNABORTED, std::system_category()));
    } catch (...) {
        auto error = aws::aws_error::from_exception_ptr(std::current_exception());
        BOOST_REQUIRE_EQUAL(aws::aws_error_type::NETWORK_CONNECTION, error.get_error_type());
        BOOST_REQUIRE_EQUAL("Software caused connection abort", error.get_error_message());
        BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::yes);
    }

    // Test aws_exception
    try {
        throw aws::aws_exception(aws::aws_error::get_errors().at("HTTP_TOO_MANY_REQUESTS"));
    } catch (...) {
        auto error = aws::aws_error::from_exception_ptr(std::current_exception());
        BOOST_REQUIRE_EQUAL(aws::aws_error_type::HTTP_TOO_MANY_REQUESTS, error.get_error_type());
        BOOST_REQUIRE_EQUAL("", error.get_error_message());
        BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::yes);
    }

    // Test httpd::unexpected_status_error
    try {
        throw seastar::httpd::unexpected_status_error(seastar::http::reply::status_type::network_connect_timeout);
    } catch (...) {
        auto error = aws::aws_error::from_exception_ptr(std::current_exception());
        BOOST_REQUIRE_EQUAL(aws::aws_error_type::HTTP_NETWORK_CONNECT_TIMEOUT, error.get_error_type());
        BOOST_REQUIRE_EQUAL(" HTTP code: 599 Network Connect Timeout", error.get_error_message());
        BOOST_REQUIRE_EQUAL(error.is_retryable(), aws::retryable::yes);
    }
}
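These tests exercise errors wrapped with std::throw_with_nested, which is presumably why the old name was from_maybe_nested_exception; the rename to from_exception_ptr keeps the behavior of digging through the nesting to find a classifiable cause (the nested connection-abort still maps to NETWORK_CONNECTION). The standard unwrapping loop looks roughly like this; a sketch of the technique, not Scylla's actual implementation:

#include <exception>
#include <system_error>

// Walks a (possibly nested) exception chain and returns the error code of the
// first std::system_error found, unwrapping outermost-first.
std::error_code find_error_code(std::exception_ptr ep) {
    while (ep) {
        try {
            std::rethrow_exception(ep);
        } catch (const std::system_error& e) {
            return e.code(); // found a classifiable cause
        } catch (const std::exception& e) {
            // Descend into the nested exception, if this one wraps another.
            try {
                std::rethrow_if_nested(e);
                ep = nullptr; // not nested: chain exhausted
            } catch (...) {
                ep = std::current_exception();
            }
        } catch (...) {
            ep = nullptr; // non-std exception (e.g. throw "foo"): nothing to unwrap
        }
    }
    return {};
}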
@@ -169,8 +169,127 @@ void check_range_tombstone_end(const fragment& f, std::optional<bound_weight> bo
    }
}

uint64_t prepare_batches(cql_test_env& env, std::string_view batchlog_table_name, uint64_t batch_count, bool replay_fails,
        db::batchlog_manager::post_replay_cleanup cleanup) {
    const bool is_v1 = batchlog_table_name == db::system_keyspace::BATCHLOG;

    uint64_t failed_batches = 0;

    auto& bm = env.batchlog_manager().local();

    env.execute_cql("CREATE TABLE tbl (pk bigint PRIMARY KEY, v text)").get();

    for (uint64_t i = 0; i != batch_count; ++i) {
        std::vector<sstring> queries;
        std::vector<std::string_view> query_views;
        for (uint64_t j = 0; j != i+2; ++j) {
            queries.emplace_back(format("INSERT INTO tbl (pk, v) VALUES ({}, 'value');", j));
            query_views.emplace_back(queries.back());
        }
        const bool fail = i % 2;
        bool injected_exception_thrown = false;

        std::optional<scoped_error_injection> error_injection;
        if (fail) {
            ++failed_batches;
            error_injection.emplace("storage_proxy_fail_send_batch");
        }
        try {
            env.execute_batch(
                query_views,
                cql3::statements::batch_statement::type::LOGGED,
                std::make_unique<cql3::query_options>(db::consistency_level::ONE, std::vector<cql3::raw_value>())).get();
        } catch (std::runtime_error& ex) {
            if (fail) {
                BOOST_REQUIRE_EQUAL(std::string(ex.what()), "Error injection: failing to send batch");
                injected_exception_thrown = true;
            } else {
                throw;
            }
        }
        BOOST_REQUIRE_EQUAL(injected_exception_thrown, fail);
    }

    // v1 (system.batchlog) is partition-oriented, while v2 (system.batchlog_v2) is row oriented. We need to switch the partition-region filter accordingly.
    const auto fragments_query = format("SELECT * FROM MUTATION_FRAGMENTS({}.{}) WHERE partition_region = {} ALLOW FILTERING", db::system_keyspace::NAME, batchlog_table_name, is_v1 ? 0 : 2);

    assert_that(env.execute_cql(format("SELECT id FROM {}.{}", db::system_keyspace::NAME, batchlog_table_name)).get())
        .is_rows()
        .with_size(failed_batches);

    assert_that(env.execute_cql(fragments_query).get())
        .is_rows(tests::dump_to_logs::yes)
        .with_size(batch_count)
        .assert_for_columns_of_each_row([&] (columns_assertions& columns) {
            columns.with_typed_column<sstring>("mutation_source", "memtable:0");
        });

    std::optional<scoped_error_injection> error_injection;
    if (replay_fails) {
        error_injection.emplace("storage_proxy_fail_replay_batch");
    }

    bm.do_batch_log_replay(cleanup).get();

    assert_that(env.execute_cql(format("SELECT id FROM {}.{}", db::system_keyspace::NAME, batchlog_table_name)).get())
        .is_rows(tests::dump_to_logs::yes)
        .with_size(replay_fails ? failed_batches : 0);

    return failed_batches;
}

} // anonymous namespace

future<> run_batchlog_v1_cleanup_with_failed_batches_test(bool replay_fails) {
#ifndef SCYLLA_ENABLE_ERROR_INJECTION
    return make_ready_future<>();
#endif

    cql_test_config cfg;
    cfg.db_config->batchlog_replay_cleanup_after_replays.set_value("9999999", utils::config_file::config_source::Internal);
    cfg.batchlog_replay_timeout = 0s;
    cfg.batchlog_delay = 9999h;
    cfg.disabled_features.insert("BATCHLOG_V2");

    return do_with_cql_env_thread([=] (cql_test_env& env) -> void {
        const uint64_t batch_count = 8;
        const uint64_t failed_batches = prepare_batches(env, db::system_keyspace::BATCHLOG, batch_count, replay_fails, db::batchlog_manager::post_replay_cleanup::no);

        const auto fragments_query = format("SELECT * FROM MUTATION_FRAGMENTS({}.{}) WHERE partition_region = 0 ALLOW FILTERING", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG);

        const auto fragment_results = cql3::untyped_result_set(env.execute_cql(fragments_query).get());

        const auto batchlog_v1_schema = env.local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG);

        size_t live{0};
        size_t dead{0};
        for (const auto& row : fragment_results) {
            const auto metadata = row.get_as<sstring>("metadata");
            auto metadata_json = rjson::parse(metadata);
            if (metadata_json.HasMember("tombstone") && metadata_json["tombstone"].IsObject() && metadata_json["tombstone"].HasMember("deletion_time")) {
                ++dead;
            } else {
                ++live;
            }
        }

        if (replay_fails) {
            BOOST_REQUIRE_EQUAL(failed_batches, live);
        } else {
            BOOST_REQUIRE_EQUAL(0, live);
        }
        BOOST_REQUIRE_EQUAL(batch_count, dead + live);
    }, cfg);
}

SEASTAR_TEST_CASE(test_batchlog_v1_replay_fails) {
    return run_batchlog_v1_cleanup_with_failed_batches_test(true);
}

SEASTAR_TEST_CASE(test_batchlog_v1_replay) {
    return run_batchlog_v1_cleanup_with_failed_batches_test(false);
}

future<> run_batchlog_cleanup_with_failed_batches_test(bool replay_fails, db::batchlog_manager::post_replay_cleanup cleanup) {
#ifndef SCYLLA_ENABLE_ERROR_INJECTION
    return make_ready_future<>();
#endif

@@ -182,68 +301,11 @@ future<> run_batchlog_cleanup_with_failed_batches_test(bool replay_fails, db::ba
    cfg.batchlog_delay = 9999h;

    return do_with_cql_env_thread([=] (cql_test_env& env) -> void {
-       auto& bm = env.batchlog_manager().local();
-
-       env.execute_cql("CREATE TABLE tbl (pk bigint PRIMARY KEY, v text)").get();
-
        const uint64_t batch_count = 8;
-       uint64_t failed_batches = 0;
-
-       for (uint64_t i = 0; i != batch_count; ++i) {
-           std::vector<sstring> queries;
-           std::vector<std::string_view> query_views;
-           for (uint64_t j = 0; j != i+2; ++j) {
-               queries.emplace_back(format("INSERT INTO tbl (pk, v) VALUES ({}, 'value');", j));
-               query_views.emplace_back(queries.back());
-           }
-           const bool fail = i % 2;
-           bool injected_exception_thrown = false;
-
-           std::optional<scoped_error_injection> error_injection;
-           if (fail) {
-               ++failed_batches;
-               error_injection.emplace("storage_proxy_fail_send_batch");
-           }
-           try {
-               env.execute_batch(
-                   query_views,
-                   cql3::statements::batch_statement::type::LOGGED,
-                   std::make_unique<cql3::query_options>(db::consistency_level::ONE, std::vector<cql3::raw_value>())).get();
-           } catch (std::runtime_error& ex) {
-               if (fail) {
-                   BOOST_REQUIRE_EQUAL(std::string(ex.what()), "Error injection: failing to send batch");
-                   injected_exception_thrown = true;
-               } else {
-                   throw;
-               }
-           }
-           BOOST_REQUIRE_EQUAL(injected_exception_thrown, fail);
-       }
+       const uint64_t failed_batches = prepare_batches(env, db::system_keyspace::BATCHLOG_V2, batch_count, replay_fails, cleanup);

        const auto fragments_query = format("SELECT * FROM MUTATION_FRAGMENTS({}.{}) WHERE partition_region = 2 ALLOW FILTERING", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2);

-       assert_that(env.execute_cql(format("SELECT id FROM {}.{}", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2)).get())
-           .is_rows()
-           .with_size(failed_batches);
-
-       assert_that(env.execute_cql(fragments_query).get())
-           .is_rows()
-           .with_size(batch_count)
-           .assert_for_columns_of_each_row([&] (columns_assertions& columns) {
-               columns.with_typed_column<sstring>("mutation_source", "memtable:0");
-           });
-
-       std::optional<scoped_error_injection> error_injection;
-       if (replay_fails) {
-           error_injection.emplace("storage_proxy_fail_replay_batch");
-       }
-
-       bm.do_batch_log_replay(cleanup).get();
-
-       assert_that(env.execute_cql(format("SELECT id FROM {}.{}", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2)).get())
-           .is_rows()
-           .with_size(replay_fails ? failed_batches : 0);

        const auto fragment_results = cql3::untyped_result_set(env.execute_cql(fragments_query).get());

        const auto batchlog_v2_schema = env.local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2);

@@ -693,7 +755,7 @@ SEASTAR_TEST_CASE(test_batchlog_replay_write_time) {

    auto get_write_attempts = [&] () -> uint64_t {
        return env.batchlog_manager().map_reduce0([] (const db::batchlog_manager& bm) {
-           return bm.stats().write_attempts;
+           return bm.get_stats().write_attempts;
        }, uint64_t(0), std::plus<uint64_t>{}).get();
    };
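prepare_batches consolidates the setup that the v1 and v2 cleanup tests previously duplicated line for line. Note the error-injection pattern it relies on: a std::optional of the scoped injection guard is only emplaced on the iterations meant to fail, so the injection point is armed exactly for the duration of that scope and disarms automatically when the optional is destroyed. The RAII shape, reduced to its essentials (illustrative names, not Scylla's API):

#include <optional>
#include <string>

// Hypothetical RAII guard: enables a named injection point on construction,
// disables it again on destruction.
struct scoped_injection {
    std::string name;
    explicit scoped_injection(std::string n) : name(std::move(n)) { /* enable(name) */ }
    ~scoped_injection() { /* disable(name) */ }
};

void run_iteration(bool should_fail) {
    std::optional<scoped_injection> injection;
    if (should_fail) {
        injection.emplace("storage_proxy_fail_send_batch"); // armed only for this iteration
    }
    // ... issue the batch; it fails iff the injection is armed ...
} // optional destroyed here -> injection disarmed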
@@ -391,21 +391,31 @@ SEASTAR_TEST_CASE(select_from_vector_search_system_table) {
    return do_with_cql_env_thread(
        [](auto&& env) {
            create_user_if_not_exists(env, bob);
-           with_user(env, bob, [&env] {
-               BOOST_REQUIRE_EXCEPTION(env.execute_cql("SELECT * FROM system.group0_history").get(), exceptions::unauthorized_exception,
-                   exception_predicate::message_contains("User bob has none of the permissions (VECTOR_SEARCH_INDEXING, SELECT) on"));
-           });
-           with_user(env, bob, [&env] {
-               BOOST_REQUIRE_EXCEPTION(env.execute_cql("SELECT * FROM system.versions").get(), exceptions::unauthorized_exception,
-                   exception_predicate::message_contains("User bob has none of the permissions (VECTOR_SEARCH_INDEXING, SELECT) on"));
-           });

+           // All tables in vector_search_system_resources from client_state.cc
+           const std::vector<sstring> vector_search_system_tables = {
+               "system.group0_history",
+               "system.versions",
+               "system.cdc_streams",
+               "system.cdc_timestamps",
+           };

+           // Without VECTOR_SEARCH_INDEXING permission, bob cannot select from these tables
+           for (const auto& table : vector_search_system_tables) {
+               with_user(env, bob, [&env, &table] {
+                   BOOST_REQUIRE_EXCEPTION(env.execute_cql(format("SELECT * FROM {}", table)).get(), exceptions::unauthorized_exception,
+                       exception_predicate::message_contains("User bob has none of the permissions (VECTOR_SEARCH_INDEXING, SELECT) on"));
+               });
+           }

            cquery_nofail(env, "GRANT VECTOR_SEARCH_INDEXING ON ALL KEYSPACES TO bob");
-           with_user(env, bob, [&env] {
-               cquery_nofail(env, "SELECT * FROM system.group0_history");
-           });
-           with_user(env, bob, [&env] {
-               cquery_nofail(env, "SELECT * FROM system.versions");
-           });

+           // With VECTOR_SEARCH_INDEXING permission, bob can select from these tables
+           for (const auto& table : vector_search_system_tables) {
+               with_user(env, bob, [&env, &table] {
+                   cquery_nofail(env, format("SELECT * FROM {}", table));
+               });
+           }
        },
        db_config_with_auth());
}
@@ -29,6 +29,7 @@
#include "types/list.hh"
#include "types/set.hh"
#include "schema/schema_builder.hh"
#include "cql3/functions/vector_similarity_fcts.hh"

BOOST_AUTO_TEST_SUITE(cql_functions_test)

@@ -422,4 +423,96 @@ SEASTAR_TEST_CASE(test_aggregate_functions_vector_type) {
    });
}

SEASTAR_THREAD_TEST_CASE(test_extract_float_vector) {
    // Compare standard deserialization path vs optimized extraction path
    auto serialize = [](size_t dim, const std::vector<float>& values) {
        auto vector_type = vector_type_impl::get_instance(float_type, dim);
        std::vector<data_value> data_vals;
        data_vals.reserve(values.size());
        for (float f : values) {
            data_vals.push_back(data_value(f));
        }
        return vector_type->decompose(make_list_value(vector_type, data_vals));
    };

    auto deserialize_standard = [](size_t dim, const bytes_opt& serialized) {
        auto vector_type = vector_type_impl::get_instance(float_type, dim);
        data_value v = vector_type->deserialize(*serialized);
        const auto& elements = value_cast<std::vector<data_value>>(v);
        std::vector<float> result;
        result.reserve(elements.size());
        for (const auto& elem : elements) {
            result.push_back(value_cast<float>(elem));
        }
        return result;
    };

    auto compare_vectors = [](const std::vector<float>& a, const std::vector<float>& b) {
        BOOST_REQUIRE_EQUAL(a.size(), b.size());
        for (size_t i = 0; i < a.size(); ++i) {
            if (std::isnan(a[i]) && std::isnan(b[i])) {
                continue; // Both NaN, consider equal
            }
            BOOST_REQUIRE_EQUAL(a[i], b[i]);
        }
    };

    // Prepare test cases
    std::vector<std::vector<float>> test_vectors = {
        // Small vectors with explicit values
        {1.0f, 2.5f},
        {-1.5f, 0.0f, 3.14159f},
        // Special floating-point values
        {
            std::numeric_limits<float>::infinity(),
            -std::numeric_limits<float>::infinity(),
            0.0f,
            -0.0f,
            std::numeric_limits<float>::min(),
            std::numeric_limits<float>::max()
        },
        // NaN values (require special comparison)
        {
            std::numeric_limits<float>::quiet_NaN(),
            1.0f,
            std::numeric_limits<float>::signaling_NaN()
        }
    };

    // Add common embedding dimensions with pattern-generated data
    for (size_t dim : {128, 384, 768, 1024, 1536}) {
        std::vector<float> vec(dim);
        for (size_t i = 0; i < dim; ++i) {
            vec[i] = static_cast<float>(i % 100) * 0.01f;
        }
        test_vectors.push_back(std::move(vec));
    }

    // Run tests for all test vectors
    for (const auto& vec : test_vectors) {
        size_t dim = vec.size();
        auto serialized = serialize(dim, vec);
        auto standard = deserialize_standard(dim, serialized);
        compare_vectors(standard, cql3::functions::detail::extract_float_vector(serialized, dim));
    }

    // Null parameter should throw
    BOOST_REQUIRE_EXCEPTION(
        cql3::functions::detail::extract_float_vector(std::nullopt, 3),
        exceptions::invalid_request_exception,
        seastar::testing::exception_predicate::message_contains("Cannot extract float vector from null parameter")
    );

    // Size mismatch should throw
    for (auto [actual_dim, expected_dim] : {std::pair{2, 3}, {4, 3}}) {
        std::vector<float> vec(actual_dim, 1.0f);
        auto serialized = serialize(actual_dim, vec);
        BOOST_REQUIRE_EXCEPTION(
            cql3::functions::detail::extract_float_vector(serialized, expected_dim),
            exceptions::invalid_request_exception,
            seastar::testing::exception_predicate::message_contains("Invalid vector size")
        );
    }
}

BOOST_AUTO_TEST_SUITE_END()
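The compare_vectors helper in the test above needs its NaN special case because IEEE-754 mandates that NaN != NaN, so BOOST_REQUIRE_EQUAL on two NaNs would fail even when the round-trip was byte-exact. Treating any NaN pair as equal is the usual workaround when only value-level fidelity matters; if bit-level fidelity mattered (for instance distinguishing quiet from signaling NaN payloads, or -0.0f from 0.0f), one would compare representations instead. Both variants, as a standalone sketch:

#include <bit>
#include <cmath>
#include <cstdint>

// Value-level equality: any NaN matches any NaN (what the test above does).
bool float_equal_value(float a, float b) {
    return (std::isnan(a) && std::isnan(b)) || a == b;
}

// Bit-level equality: distinguishes NaN payloads and -0.0f from 0.0f.
bool float_equal_bits(float a, float b) {
    return std::bit_cast<std::uint32_t>(a) == std::bit_cast<std::uint32_t>(b);
}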
@@ -113,15 +113,23 @@ static future<> compare_object_data(const local_gcs_wrapper& env, std::string_vi
    BOOST_REQUIRE_EQUAL(read, total);
}

using namespace std::string_literals;
static constexpr auto prefix = "bork/ninja/"s;

// #28398 include a prefix in all names.
static std::string make_name() {
    return fmt::format("{}{}", prefix, utils::UUID_gen::get_time_UUID());
}

static future<> test_read_write_helper(const local_gcs_wrapper& env, size_t dest_size, std::optional<size_t> specific_buffer_size = std::nullopt) {
    auto& c = env.client();
-   auto uuid = fmt::format("{}", utils::UUID_gen::get_time_UUID());
+   auto name = make_name();
    std::vector<temporary_buffer<char>> written;

    // ensure we remove the object
-   env.objects_to_delete.emplace_back(uuid);
-   co_await create_object_of_size(c, env.bucket, uuid, dest_size, &written, specific_buffer_size);
-   co_await compare_object_data(env, uuid, std::move(written));
+   env.objects_to_delete.emplace_back(name);
+   co_await create_object_of_size(c, env.bucket, name, dest_size, &written, specific_buffer_size);
+   co_await compare_object_data(env, name, std::move(written));
}

BOOST_AUTO_TEST_SUITE(gcs_tests, *seastar::testing::async_fixture<gcs_fixture>())

@@ -147,21 +155,28 @@ SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_list_objects, local_gcs_wrapper, *che
    auto& c = env.client();
    std::unordered_map<std::string, uint64_t> names;
    for (size_t i = 0; i < 10; ++i) {
-       auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
+       auto name = make_name();
        auto size = tests::random::get_int(size_t(1), size_t(2*1024*1024));
        env.objects_to_delete.emplace_back(name);
        co_await create_object_of_size(c, env.bucket, name, size);
        names.emplace(name, size);
    }

-   auto infos = co_await c.list_objects(env.bucket);
+   utils::gcp::storage::bucket_paging paging;
    size_t n_found = 0;

-   for (auto& info : infos) {
-       auto i = names.find(info.name);
-       if (i != names.end()) {
-           BOOST_REQUIRE_EQUAL(info.size, i->second);
-           ++n_found;
+   for (;;) {
+       auto infos = co_await c.list_objects(env.bucket, "", paging);

+       for (auto& info : infos) {
+           auto i = names.find(info.name);
+           if (i != names.end()) {
+               BOOST_REQUIRE_EQUAL(info.size, i->second);
+               ++n_found;
+           }
+       }
+       if (infos.empty()) {
+           break;
+       }
    }
    BOOST_REQUIRE_EQUAL(n_found, names.size());

@@ -170,7 +185,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_list_objects, local_gcs_wrapper, *che
SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_delete_object, local_gcs_wrapper, *check_gcp_storage_test_enabled()) {
    auto& env = *this;
    auto& c = env.client();
-   auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
+   auto name = make_name();
    env.objects_to_delete.emplace_back(name);
    co_await create_object_of_size(c, env.bucket, name, 128);
    {

@@ -190,7 +205,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_delete_object, local_gcs_wrapper, *ch
SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_skip_read, local_gcs_wrapper, *check_gcp_storage_test_enabled()) {
    auto& env = *this;
    auto& c = env.client();
-   auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
+   auto name = make_name();
    std::vector<temporary_buffer<char>> bufs;
    constexpr size_t file_size = 12*1024*1024 + 384*7 + 31;

@@ -243,7 +258,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_merge_objects, local_gcs_wrapper, *check_gcp_stor

    size_t total = 0;
    for (size_t i = 0; i < 32; ++i) {
-       auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
+       auto name = make_name();
        auto size = tests::random::get_int(size_t(1), size_t(2*1024*1024));
        env.objects_to_delete.emplace_back(name);
        co_await create_object_of_size(c, env.bucket, name, size, &bufs);

@@ -251,7 +266,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_merge_objects, local_gcs_wrapper, *check_gcp_stor
        total += size;
    }

-   auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
+   auto name = make_name();
    env.objects_to_delete.emplace_back(name);

    auto info = co_await c.merge_objects(env.bucket, name, names);
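Besides prefixing all object names (#28398), the list-objects test now drives list_objects through a bucket_paging cursor instead of assuming a single call returns every object; the loop keeps requesting pages until an empty page signals the end of the listing. The generic shape of such a loop, as a sketch under the assumption that the paging object carries the continuation token between calls:

#include <string>

template <typename Client, typename PagingState, typename OnItem>
void list_all(Client& client, const std::string& bucket, PagingState& paging, OnItem on_item) {
    for (;;) {
        auto page = client.list_objects(bucket, "", paging); // one page; advances `paging`
        for (auto& info : page) {
            on_item(info);
        }
        if (page.empty()) {
            break; // an empty page marks the end of the listing
        }
    }
}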
@@ -767,7 +767,6 @@ void test_chunked_download_data_source(const client_maker_function& client_maker
#endif

-   cln->delete_object(object_name).get();
    cln->close().get();
}

SEASTAR_THREAD_TEST_CASE(test_chunked_download_data_source_with_delays_minio) {
@@ -981,3 +980,88 @@ BOOST_AUTO_TEST_CASE(s3_fqn_manipulation) {
    BOOST_REQUIRE_EQUAL(bucket_name, "bucket");
    BOOST_REQUIRE_EQUAL(object_name, "prefix1/prefix2/foo.bar");
}

BOOST_AUTO_TEST_CASE(part_size_calculation_test) {
    {
        BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, 5_MiB), std::runtime_error, [](const std::runtime_error& e) {
            return std::string(e.what()).starts_with("too many parts: 100352 > 10000");
        });
    }
    {
        auto [parts, size] = s3::calc_part_size(490_GiB, 100_MiB);
        BOOST_REQUIRE_EQUAL(size, 100_MiB);
        BOOST_REQUIRE(parts == 5018);
    }
    {
        BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(490_GiB, 4_MiB), std::runtime_error, [](const std::runtime_error& e) {
            return std::string(e.what()).starts_with("part_size too small: 4194304 is smaller than minimum part size: 5242880");
        });
    }
    {
        auto [parts, size] = s3::calc_part_size(50_MiB, 0);
        BOOST_REQUIRE_EQUAL(size, 50_MiB);
        BOOST_REQUIRE_EQUAL(parts, 1);
    }
    {
        auto [parts, size] = s3::calc_part_size(49_MiB, 0);
        BOOST_REQUIRE_EQUAL(size, 50_MiB);
        BOOST_REQUIRE_EQUAL(parts, 1);
    }
    {
        auto [parts, size] = s3::calc_part_size(490_GiB, 0);
        BOOST_REQUIRE_EQUAL(size, 51_MiB);
        BOOST_REQUIRE(parts == 9839);
    }
    {
        auto [parts, size] = s3::calc_part_size(50_MiB * 10000, 0);
        BOOST_REQUIRE_EQUAL(size, 50_MiB);
        BOOST_REQUIRE_EQUAL(parts, 10000);
    }
    {
        auto [parts, size] = s3::calc_part_size(50_MiB * 10000 + 1, 0);
        BOOST_REQUIRE(size > 50_MiB);
        BOOST_REQUIRE(parts <= 10000);
    }
    {
        BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(50_TiB, 0), std::runtime_error, [](const std::runtime_error& e) {
            return std::string(e.what()).starts_with("object size too large: 54975581388800 is larger than maximum S3 object size: 53687091200000");
        });
    }
    {
        BOOST_REQUIRE_EXCEPTION(s3::calc_part_size(1_TiB, 5_GiB + 1), std::runtime_error, [](const std::runtime_error& e) {
            return std::string(e.what()).starts_with("part_size too large: 5368709121 is larger than maximum part size: 5368709120");
        });
    }
    {
        auto [parts, size] = s3::calc_part_size(5_TiB, 0);
        BOOST_REQUIRE_EQUAL(parts, 9987);
        BOOST_REQUIRE_EQUAL(size, 525_MiB);
    }
    {
        auto [parts, size] = s3::calc_part_size(5_MiB * 10000, 5_MiB);
        BOOST_REQUIRE_EQUAL(size, 5_MiB);
        BOOST_REQUIRE_EQUAL(parts, 10000);
    }
    {
        size_t total = 5_MiB * 10001; // 10001 parts at 5 MiB
        BOOST_REQUIRE_EXCEPTION(
            s3::calc_part_size(total, 5_MiB), std::runtime_error, [](auto& e) { return std::string(e.what()).starts_with("too many parts: 10001 > 10000"); });
    }
    {
        size_t total = 500_GiB + 123; // odd size to force non-MiB alignment
        auto [parts, size] = s3::calc_part_size(total, 0);

        BOOST_REQUIRE(size % 1_MiB == 0); // aligned
        BOOST_REQUIRE(parts <= 10000);
    }
    {
        auto [parts, size] = s3::calc_part_size(6_MiB, 0);
        BOOST_REQUIRE_EQUAL(size, 50_MiB);
        BOOST_REQUIRE_EQUAL(parts, 1);
    }
    {
        auto [parts, size] = s3::calc_part_size(100_MiB, 200_MiB);
        BOOST_REQUIRE_EQUAL(parts, 1);
        BOOST_REQUIRE_EQUAL(size, 200_MiB);
    }
}
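The expectations in part_size_calculation_test are consistent with a simple rule (an inference from the asserted values, not a quote of the implementation): with no explicit part size, pick the smallest MiB-aligned part size that keeps the part count within S3's 10000-part limit, floored at a 50 MiB default; with an explicit part size, validate it against the 5 MiB minimum, 5 GiB maximum, and the part-count limit. The numbers check out: 490 GiB at explicit 5 MiB parts needs ceil(501760/5) = 100352 parts, hence the error, while the auto-sized cases work out as below:

#include <cassert>
#include <cstdint>

int main() {
    constexpr std::uint64_t MiB = 1024 * 1024;
    constexpr std::uint64_t GiB = 1024 * MiB;
    constexpr std::uint64_t TiB = 1024 * GiB;
    constexpr std::uint64_t max_parts = 10000;

    // 490 GiB = 501760 MiB: ceil(501760 / 10000) = 51 MiB parts,
    // giving ceil(501760 / 51) = 9839 parts, matching the test.
    assert((490 * GiB + 51 * MiB - 1) / (51 * MiB) == 9839);
    // 50 MiB parts would not fit: ceil(501760 / 50) = 10036 > 10000.
    assert((490 * GiB + 50 * MiB - 1) / (50 * MiB) > max_parts);

    // 5 TiB = 5242880 MiB: ceil(5242880 / 10000) = 525 MiB parts,
    // giving ceil(5242880 / 525) = 9987 parts, matching the test.
    assert((5 * TiB + 525 * MiB - 1) / (525 * MiB) == 9987);
}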
@@ -1650,6 +1650,21 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
     }
 }
 
+static future<> apply_repair_transitions(token_metadata& tm, const migration_plan& plan) {
+    for (const auto& repair : plan.repair_plan().repairs()) {
+        co_await tm.tablets().mutate_tablet_map_async(repair.table, [&] (tablet_map& tmap) {
+            auto tablet_info = tmap.get_tablet_info(repair.tablet);
+            tmap.set_tablet_transition_info(repair.tablet, tablet_transition_info{
+                tablet_transition_stage::repair,
+                tablet_transition_kind::repair,
+                tablet_info.replicas,
+                std::nullopt,
+            });
+            return make_ready_future();
+        });
+    }
+}
+
 // Reflects the plan in a given token metadata as if the migrations were fully executed.
 static
 future<> apply_plan(token_metadata& tm, const migration_plan& plan, service::topology& topology, shared_load_stats* load_stats) {
@@ -1674,6 +1689,7 @@ future<> apply_plan(token_metadata& tm, const migration_plan& plan, service::top
     if (auto request_id = plan.rack_list_colocation_plan().request_to_resume(); request_id) {
         topology.paused_rf_change_requests.erase(request_id);
     }
+    co_await apply_repair_transitions(tm, plan);
 }
 
 // Reflects the plan in a given token metadata as if the migrations were started but not yet executed.
@@ -5995,4 +6011,168 @@ SEASTAR_THREAD_TEST_CASE(test_tablets_describe_ring) {
     }, cfg).get();
 }
 
+SEASTAR_THREAD_TEST_CASE(test_tablet_auto_repair_rf1) {
+    cql_test_config cfg_in;
+    cfg_in.db_config->auto_repair_enabled_default(true);
+    cfg_in.db_config->auto_repair_threshold_default_in_seconds(1);
+    do_with_cql_env_thread([] (auto& e) {
+        topology_builder topo(e);
+
+        unsigned shard_count = 1;
+        auto dc1 = topo.dc();
+        auto rack1 = topo.rack();
+        [[maybe_unused]] auto host1 = topo.add_node(node_state::normal, shard_count);
+        auto rack2 = topo.start_new_rack();
+        [[maybe_unused]] auto host2 = topo.add_node(node_state::normal, shard_count);
+
+        auto ks_name = add_keyspace(e, {{dc1, 1}}, 1);
+        auto table1 = add_table(e, ks_name).get();
+
+        tablet_id tablet{0};
+        mutate_tablets(e, [&] (tablet_metadata& tmeta) -> future<> {
+            tablet_map tmap(1);
+            auto tid = tmap.first_tablet();
+            tablet = tid;
+            tmap.set_tablet(tid, tablet_info {
+                tablet_replica_set {
+                    tablet_replica{host1, 0},
+                }
+            });
+            tmeta.set_tablet_map(table1, std::move(tmap));
+            co_return;
+        });
+
+        auto& stm = e.shared_token_metadata().local();
+        bool once = false;
+        rebalance_tablets(e, nullptr, {}, [&once] (const migration_plan& plan) { return std::exchange(once, true); });
+        BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table1).get_tablet_transition_info(tablet) == nullptr);
+    }, std::move(cfg_in)).get();
+}
+
+void run_tablet_manual_repair_rf1(cql_test_env& e) {
+    topology_builder topo(e);
+
+    unsigned shard_count = 1;
+    auto dc1 = topo.dc();
+    auto rack1 = topo.rack();
+    [[maybe_unused]] auto host1 = topo.add_node(node_state::normal, shard_count);
+    auto rack2 = topo.start_new_rack();
+    [[maybe_unused]] auto host2 = topo.add_node(node_state::normal, shard_count);
+
+    auto ks_name = add_keyspace(e, {{dc1, 1}}, 1);
+    auto table1 = add_table(e, ks_name).get();
+
+    tablet_id tablet{0};
+    mutate_tablets(e, [&] (tablet_metadata& tmeta) -> future<> {
+        tablet_map tmap(1);
+        auto tid = tmap.first_tablet();
+        tablet = tid;
+        tablet_info ti{
+            tablet_replica_set {
+                tablet_replica{host1, 0},
+            }
+        };
+        ti.repair_task_info = ti.repair_task_info.make_user_repair_request();
+        tmap.set_tablet(tid, std::move(ti));
+        tmeta.set_tablet_map(table1, std::move(tmap));
+        co_return;
+    });
+
+    auto& stm = e.shared_token_metadata().local();
+    bool once = false;
+    rebalance_tablets(e, nullptr, {}, [&once] (const migration_plan& plan) { return std::exchange(once, true); });
+    BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table1).get_tablet_transition_info(tablet)->transition == tablet_transition_kind::repair);
+}
+
+SEASTAR_THREAD_TEST_CASE(test_tablet_manual_repair_rf1_auto_repair_off) {
+    cql_test_config cfg_in;
+    cfg_in.db_config->auto_repair_enabled_default(false);
+    do_with_cql_env_thread(run_tablet_manual_repair_rf1, std::move(cfg_in)).get();
+}
+
+SEASTAR_THREAD_TEST_CASE(test_tablet_manual_repair_rf1_auto_repair_on) {
+    cql_test_config cfg_in;
+    cfg_in.db_config->auto_repair_enabled_default(true);
+    do_with_cql_env_thread(run_tablet_manual_repair_rf1, std::move(cfg_in)).get();
+}
+
+// Test for tablet_map::get_secondary_replica() and specifically how it
+// relates to get_primary_replica().
+// We never officially documented which replica, given a list of replicas,
+// is to be considered the "primary" - it's not simply the first replica in
+// the list but the first in some reshuffling of the list, a reshuffling
+// whose details changed in commits like 817fdad and d88036d. So this patch
+// doesn't enshrine what get_primary_replica() or get_secondary_replica()
+// should return. It just verifies that get_secondary_replica() returns a
+// *different* replica than get_primary_replica() when there are 2 or more
+// replicas, and throws an error when there's just one replica.
+// Reproduces SCYLLADB-777.
+SEASTAR_THREAD_TEST_CASE(test_get_secondary_replica) {
+    auto h1 = host_id(utils::UUID_gen::get_time_UUID());
+    auto h2 = host_id(utils::UUID_gen::get_time_UUID());
+    auto h3 = host_id(utils::UUID_gen::get_time_UUID());
+
+    locator::topology::config cfg = {
+        .this_endpoint = inet_address("127.0.0.1"),
+        .this_host_id = h1,
+        .local_dc_rack = endpoint_dc_rack::default_location,
+    };
+    auto topo = locator::topology(cfg);
+    topo.add_or_update_endpoint(h1, endpoint_dc_rack::default_location, node::state::normal);
+    topo.add_or_update_endpoint(h2, endpoint_dc_rack::default_location, node::state::normal);
+    topo.add_or_update_endpoint(h3, endpoint_dc_rack::default_location, node::state::normal);
+
+    // With 1 replica, get_secondary_replica should throw.
+    {
+        tablet_map tmap(1);
+        auto tid = tmap.first_tablet();
+        tmap.set_tablet(tid, tablet_info {
+            tablet_replica_set {
+                tablet_replica {h1, 0},
+            }
+        });
+        BOOST_REQUIRE_THROW(tmap.get_secondary_replica(tid, topo), std::runtime_error);
+    }
+
+    // With 2 replicas, get_secondary_replica should return a different replica
+    // than get_primary_replica for every tablet.
+    {
+        tablet_map tmap(4);
+        for (auto tid : tmap.tablet_ids()) {
+            tmap.set_tablet(tid, tablet_info {
+                tablet_replica_set {
+                    tablet_replica {h1, 0},
+                    tablet_replica {h2, 0},
+                }
+            });
+        }
+        for (auto tid : tmap.tablet_ids()) {
+            auto primary = tmap.get_primary_replica(tid, topo);
+            auto secondary = tmap.get_secondary_replica(tid, topo);
+            BOOST_REQUIRE(primary != secondary);
+        }
+    }
+
+    // With 3 replicas, same check.
+    {
+        tablet_map tmap(4);
+        for (auto tid : tmap.tablet_ids()) {
+            tmap.set_tablet(tid, tablet_info {
+                tablet_replica_set {
+                    tablet_replica {h1, 0},
+                    tablet_replica {h2, 0},
+                    tablet_replica {h3, 0},
+                }
+            });
+        }
+        for (auto tid : tmap.tablet_ids()) {
+            auto primary = tmap.get_primary_replica(tid, topo);
+            auto secondary = tmap.get_secondary_replica(tid, topo);
+            BOOST_REQUIRE(primary != secondary);
+        }
+    }
+
+    topo.clear_gently().get();
+}
+
 BOOST_AUTO_TEST_SUITE_END()
test/cluster/auth_cluster/test_startup_response.py (new file, 95 lines)
@@ -0,0 +1,95 @@
#
# Copyright (C) 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#

import pytest
import logging
import asyncio
import concurrent.futures
import time
from cassandra.cluster import Cluster, DefaultConnection
from cassandra.auth import PlainTextAuthProvider
from test.pylib.manager_client import ManagerClient
from test.cluster.auth_cluster import extra_scylla_config_options as auth_config

@pytest.mark.asyncio
async def test_startup_no_auth_response(manager: ManagerClient, build_mode):
    """
    Test behavior when a client hangs on the startup auth response.

    This stresses uninitialized_connections_semaphore_cpu_concurrency
    switching between CPU and network states (1 or 0 semaphore units taken
    per connection).

    The test is probabilistic in the sense that it triggers the bug reliably
    only with a sufficiently large `num_connections`, but empirically even a
    low number suffices.
    """
    server = await manager.server_add(config=auth_config)

    # Define a custom connection class that hangs on the startup response
    class NoOpConnection(DefaultConnection):
        def _handle_startup_response(self, startup_response):
            # Swallow the response so the handshake never completes.
            pass

    auth_provider = PlainTextAuthProvider(username='cassandra', password='cassandra')

    connections_observed = False
    num_connections = 100
    timeout = 360

    def attempt_bad_connection():
        c = Cluster([server.ip_addr], port=9042, auth_provider=auth_provider, connect_timeout=timeout, connection_class=NoOpConnection)
        try:
            logging.info("Attempting bad connection")
            c.connect()
            pytest.fail("Should not connect")
        except Exception:
            # We expect failure or timeout
            pass
        finally:
            c.shutdown()

    def attempt_good_connection():
        nonlocal connections_observed
        c = Cluster([server.ip_addr], port=9042, auth_provider=auth_provider, connect_timeout=timeout/3)
        try:
            logging.info("Attempting good connection")
            session = c.connect()
            logging.info("Counting AUTHENTICATING connections in system.clients")
            res = session.execute("SELECT COUNT(*) FROM system.clients WHERE connection_stage = 'AUTHENTICATING' ALLOW FILTERING;")
            count = res[0][0]
            logging.info(f"Observed {count} AUTHENTICATING connections...")
            if count >= num_connections/2:
                connections_observed = True
        finally:
            c.shutdown()

    loop = asyncio.get_running_loop()

    logging.info("Attempting concurrent connections with custom hanging connection class...")
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=num_connections + 1)

    async def verify_loop():
        logging.info("Verifying server availability concurrently...")
        start_time = time.time()
        while time.time() - start_time < timeout:
            logging.info(f"Good connection attempt at delta {time.time() - start_time:.2f}s")
            try:
                await loop.run_in_executor(executor, attempt_good_connection)
            except Exception as e:
                logging.info(f"Good connection attempt failed: {e}")
            if connections_observed:
                break
            await asyncio.sleep(0.1)
        logging.info("Verification loop completed")

    good_future = asyncio.create_task(verify_loop())
    bad_futures = [loop.run_in_executor(executor, attempt_bad_connection) for _ in range(num_connections)]

    await good_future
    executor.shutdown(wait=False, cancel_futures=True)
    assert connections_observed
@@ -396,7 +396,8 @@ async def test_mv_first_replica_in_dc(manager: ManagerClient, delayed_replica: s
 @pytest.mark.parametrize("migration_type", ["tablets_internode", "tablets_intranode", "vnodes"])
 @pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
 async def test_mv_write_during_migration(manager: ManagerClient, migration_type: str):
-    cmdline = ['--smp', '2', '--logger-log-level', 'raft_topology=debug']
+    # RF=1 and fast boot options with streaming don't play well together, so force RBNO for bootstrap
+    cmdline = ['--smp', '2', '--logger-log-level', 'raft_topology=debug', "--allowed-repair-based-node-ops", "replace,removenode,rebuild,bootstrap,decommission"]
 
     servers = await manager.servers_add(3, cmdline=cmdline)
     cql = manager.get_cql()
@@ -3,14 +3,18 @@
 #
 # SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 #
+from time import time
 
-from test.cluster.tasks.task_manager_types import TaskID, TaskStats, TaskStatus
+from test.cluster.tasks.task_manager_types import TaskID, TaskStats, TaskStatus, State
 from test.pylib.internal_types import IPAddress
 from test.pylib.rest_client import ScyllaRESTAPIClient
 
 import asyncio
 from typing import Optional
 
+from test.pylib.util import wait_for
+
 
 class TaskManagerClient():
     """Async Task Manager client"""

@@ -36,6 +40,27 @@ class TaskManagerClient():
         assert(type(stats_list) == list)
         return [TaskStats(**stats_dict) for stats_dict in stats_list]
 
+    async def wait_task_appears(self, node_ip: IPAddress, module_name: str,
+                                task_type: Optional[str] = None,
+                                entity: Optional[str] = None,
+                                deadline: Optional[float] = None) -> TaskStats:
+        """
+        Waits for a task to appear in "running" state based on the specified task filter.
+        A task matches the filter if all of its fields match the specified attributes.
+        Throws an exception if no such task appears before the deadline.
+
+        :return: stats of the first task matching the filter.
+        """
+        async def get_tasks():
+            tasks = await self.list_tasks(node_ip, module_name)
+            for stats in tasks:
+                if stats.state == State.running and \
+                        (task_type is None or stats.type == task_type) and \
+                        (entity is None or stats.entity == entity):
+                    return stats
+            return None
+        return await wait_for(get_tasks, deadline or (time() + 60), period=0.1, backoff_factor=1.2, max_period=1)
+
     async def get_task_status(self, node_ip: IPAddress, task_id: TaskID) -> TaskStatus:
         """Get status of one task."""
         status = await self.api.client.get_json(f"/task_manager/task_status/{task_id}", host=node_ip)
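Editorial note: a hedged usage sketch of the new helper. The module name, task type string, and the TaskStats field names used here are assumptions for illustration, not taken from this diff; the `wait_for` call inside the helper retries with exponential backoff (period 0.1s, factor 1.2, capped at 1s) until the deadline.

    stats = await tm_client.wait_task_appears(node_ip, "tablets",
                                              task_type="user_repair",
                                              deadline=time() + 30)
    status = await tm_client.get_task_status(node_ip, stats.task_id)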
@@ -184,6 +184,72 @@ async def test_alternator_ttl_scheduling_group(manager: ManagerClient):
 
     table.delete()
 
+@pytest.mark.parametrize("with_down_node", [False, True], ids=["all_nodes_up", "one_node_down"])
+async def test_alternator_ttl_multinode_expiration(manager: ManagerClient, with_down_node):
+    """When the cluster has multiple nodes, different nodes are responsible
+    for checking expiration in different token ranges - each node is
+    responsible for its "primary ranges". Let's check that this expiration
+    really does happen - for the entire token range - by writing many
+    partitions that span the entire token range, and seeing that they
+    all expire. We don't check that nodes don't do more work than they
+    should - an inefficient implementation where every node scans the
+    entire data set would also pass this test.
+    When the test is run a second time with with_down_node=True, we verify
+    that TTL expiration works correctly even when one of the nodes is
+    brought down. This node's TTL scanner is responsible for scanning part
+    of the token range, so when this node is down, part of the data might
+    not get expired. At that point other node(s) should take over
+    expiring data in that range - and this test verifies that this indeed
+    happens. Reproduces issue #9787 and SCYLLADB-777.
+    """
+    servers = await manager.servers_add(3, config=alternator_config, auto_rack_dc='dc1')
+    alternator = get_alternator(servers[0].ip_addr)
+
+    if with_down_node:
+        # Bring down one of the nodes. Everything we do below, like creating a
+        # table, reading and writing, should continue to work with one node
+        # down.
+        await manager.server_stop_gracefully(servers[2].server_id)
+
+    table = alternator.create_table(TableName=unique_table_name(),
+        BillingMode='PAY_PER_REQUEST',
+        KeySchema=[
+            {'AttributeName': 'p', 'KeyType': 'HASH' },
+        ],
+        AttributeDefinitions=[
+            {'AttributeName': 'p', 'AttributeType': 'N' },
+        ])
+    # Set the "expiration" column to mark an item's expiration time
+    table.meta.client.update_time_to_live(TableName=table.name, TimeToLiveSpecification={'AttributeName': 'expiration', 'Enabled': True})
+
+    # Insert 50 rows, in different partitions, so the murmur3 hash maps them
+    # all over the token space and different nodes are responsible for
+    # expiring them. All items are marked to expire 10 seconds in the past,
+    # so all should expire as soon as possible, during this test.
+    expiration = int(time.time()) - 10
+    with table.batch_writer() as batch:
+        for p in range(50):
+            batch.put_item({'p': p, 'expiration': expiration})
+    # Expect that after a short delay, all items in the table will have
+    # expired - so a scan should return no items. This should happen
+    # even though one of the nodes is down and not doing its usual
+    # expiration-scanning work.
+    timeout = time.time() + 60
+    items = -1
+    while items != 0 and time.time() < timeout:
+        response = table.scan(ConsistentRead=True)
+        items = len(response['Items'])
+        # In theory (though probably not in practice in this test), a scan()
+        # can return zero items but have more pages - so we need to be more
+        # diligent and scan all pages to check it's completely empty.
+        while items == 0 and 'LastEvaluatedKey' in response:
+            response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'], ConsistentRead=True)
+            items += len(response['Items'])
+        if items == 0:
+            break
+        time.sleep(0.1)
+    assert items == 0
|
||||
@pytest.mark.asyncio
|
||||
async def test_localnodes_broadcast_rpc_address(manager: ManagerClient):
|
||||
"""Test that if the "broadcast_rpc_address" of a node is set, the
|
||||
|
||||
@@ -26,6 +26,7 @@ import pytest
 from test.pylib.manager_client import ManagerClient
 from test.pylib.rest_client import inject_error
 from test.cluster.conftest import skip_mode
+from test.pylib.internal_types import ServerUpState
 
 logger = logging.getLogger(__name__)
 
@@ -198,8 +199,14 @@ ALTERNATOR_PROXY_SERVER_CONFIG = {
 
 @pytest.fixture(scope="function")
 async def alternator_proxy_server(manager: ManagerClient):
-    """Fixture that creates a server with Alternator proxy protocol ports enabled."""
-    server = await manager.server_add(config=ALTERNATOR_PROXY_SERVER_CONFIG)
+    """Fixture that creates a server with Alternator proxy protocol ports enabled.
+
+    Waits for SERVING state to ensure Alternator ports are ready.
+    """
+    server = await manager.server_add(
+        config=ALTERNATOR_PROXY_SERVER_CONFIG,
+        expected_server_up_state=ServerUpState.SERVING
+    )
     yield (server, manager)
@@ -26,7 +26,7 @@ async def wait_for_expected_client_routes_size(cql, expected_routes_size):
         if len(client_routes) == expected_size:
             return client_routes
         return None
-    await wait_for(lambda: expected_client_routes_size(cql, expected_routes_size), time.time() + 10)
+    await wait_for(lambda: expected_client_routes_size(cql, expected_routes_size), time.time() + 60)
 
 def generate_connection_id(i):
     # Make the string longer than 30 characters to make sure that in C++ the string has a heap allocation
@@ -108,7 +108,7 @@ async def test_client_routes_upgrade(request, manager: ManagerClient):
     servers = await manager.servers_add(num_servers, config={'error_injections_at_startup': config})
     cql, hosts = await manager.get_ready_cql(servers)
     # Empty `system.client_routes` is there even if the feature is disabled.
-    wait_for_expected_client_routes_size(cql, 0)
+    await wait_for_expected_client_routes_size(cql, 0)
 
     with pytest.raises(HTTPError) as exc:
         await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(0)])
@@ -133,7 +133,7 @@ async def test_client_routes_upgrade(request, manager: ManagerClient):
             raise exc
         return None
 
-    wait_for(client_routes_ready, time.time() + 10)
+    await wait_for(client_routes_ready, time.time() + 60)
 
 
 @pytest.mark.asyncio
@@ -194,7 +194,7 @@ async def wait_for_expected_event_num(expected_num, received_events):
         if len(received_events) == num:
            return num
         return None
-    await wait_for(lambda: expected_event_num(expected_num), time.time() + 10)
+    await wait_for(lambda: expected_event_num(expected_num), time.time() + 60)
 
 @pytest.mark.asyncio
 async def test_events(request, manager: ManagerClient, monkeypatch):
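Editorial note: besides the timeout bumps, two of the fixes above add a missing `await`. Calling an async function without awaiting it only creates a coroutine object, so the check silently never runs. A minimal repro of the difference, independent of this test suite:

    import asyncio

    async def check() -> bool:
        return True

    async def main():
        check()            # bug: coroutine created but never scheduled
                           # (CPython warns "coroutine 'check' was never awaited")
        ok = await check() # fix: the coroutine actually runs
        assert ok

    asyncio.run(main())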
test/cluster/test_compaction_backpressure.py (new file, 76 lines)
@@ -0,0 +1,76 @@
#
# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#

from test.pylib.manager_client import ManagerClient
from test.cluster.util import new_test_keyspace, new_test_table

import pytest
import asyncio
import logging

logger = logging.getLogger(__name__)


@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_intranode_migration_not_blocked_by_backpressure(manager: ManagerClient):
    """
    Reproducer for a bug where intra-node tablet migration gets stuck
    in maybe_wait_for_sstable_count_reduction() because the compaction
    manager's regular_compaction_task_executor::do_run() returns early
    (descriptor.sstables is empty) without signaling compaction_done.

    The flush triggered by clone_locally_tablet_storage() calls
    maybe_wait_for_sstable_count_reduction(), which waits on
    compaction_done. If compaction returns early without signaling,
    the wait hangs forever.

    The test uses two error injections:
    1. maybe_wait_for_sstable_count_reduction_wait_on_compaction_done
       (one-shot): Forces the flush path to wait on compaction_done,
       simulating the backpressure condition.
    2. compaction_regular_compaction_task_executor_do_run_empty_sstables
       (non-one-shot): Forces do_run() to always return early with
       empty sstables, simulating the case where the compaction
       strategy finds nothing to compact.

    Without the fix: do_run() returns early without signaling
    compaction_done, the wait hangs forever, blocking the migration.

    With the fix: do_run() signals compaction_done on early return,
    waking up the waiter and allowing the migration to proceed.
    """
    cmdline = ['--smp', '1']
    servers = [await manager.server_add(cmdline=cmdline)]

    cql = manager.get_cql()
    extra_ks_param = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}"
    async with new_test_keyspace(manager, extra_ks_param) as ks:
        extra_table_param = "WITH compaction = {'class' : 'IncrementalCompactionStrategy', 'max_threshold': 64} and compression = {}"
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, t text", extra_table_param) as cf:
            table = cf.split('.')[-1]

            await manager.api.disable_autocompaction(servers[0].ip_addr, ks)

            # Write data and flush each batch to build up sstables.
            logger.info("Writing initial data and flushing")
            for i in range(100):
                await asyncio.gather(*[cql.run_async(f"INSERT INTO {cf} (pk, t) VALUES ({k}, '{'x' * 1020}')") for k in range(10*i, 10*(i+1))])
                await manager.api.keyspace_flush(servers[0].ip_addr, ks, table)

            # Write more data WITHOUT flushing
            logger.info("Writing dirty data to memtable (no flush)")
            await asyncio.gather(*[cql.run_async(f"INSERT INTO {cf} (pk, t) VALUES ({i}, '{'x' * 1020}')") for i in range(400, 410)])

            logger.info("Enable autocompaction and trigger flush to simulate backpressure")
            await manager.api.enable_injection(servers[0].ip_addr, "set_sstable_count_reduction_threshold", one_shot=False, parameters={'value': 32})
            await manager.api.enable_autocompaction(servers[0].ip_addr, ks)
            # With the bug, the flush hangs forever in maybe_wait_for_sstable_count_reduction().
            # With the fix, it completes promptly.
            await manager.api.keyspace_flush(servers[0].ip_addr, ks, table)
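Editorial note: the hang the docstring describes is a classic condition-variable pitfall and is easy to reproduce outside Scylla. A minimal asyncio sketch (not ScyllaDB code; the `fixed` flag toggles whether the early-return path signals):

    import asyncio

    async def compaction_task(cond: asyncio.Condition, sstables: list, fixed: bool):
        async with cond:
            if not sstables:
                # Early return: nothing to compact.
                if fixed:
                    cond.notify_all()  # the fix: signal even on early return
                return
            sstables.clear()           # pretend we compacted everything
            cond.notify_all()

    async def wait_for_count_reduction(cond: asyncio.Condition):
        async with cond:
            await cond.wait()          # hangs forever if nobody notifies

    async def main():
        cond = asyncio.Condition()
        waiter = asyncio.create_task(wait_for_count_reduction(cond))
        await asyncio.sleep(0)         # let the waiter start waiting
        await compaction_task(cond, [], fixed=True)
        await asyncio.wait_for(waiter, timeout=1)  # only completes with the fix

    asyncio.run(main())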
@@ -30,7 +30,7 @@ async def test_different_group0_ids(manager: ManagerClient):
     """
 
     # Consistent topology changes are disabled to use repair based node operations.
-    cfg = {'force_gossip_topology_changes': True, 'tablets_mode_for_new_keyspaces': 'disabled'}
+    cfg = {'force_gossip_topology_changes': True, 'tablets_mode_for_new_keyspaces': 'disabled', 'allowed_repair_based_node_ops': 'bootstrap,decommission,replace,removenode,rebuild'}
     scylla_a = await manager.server_add(config = cfg)
     scylla_b = await manager.server_add(start=False, config = cfg)
     await manager.server_start(scylla_b.server_id, seeds=[scylla_b.ip_addr])
@@ -30,6 +30,7 @@ async def test_crashed_node_substitution(manager: ManagerClient):
 
     log = await manager.server_open_log(failed_server.server_id)
     await log.wait_for("finished do_send_ack2_msg")
+    failed_id = await manager.get_host_id(failed_server.server_id)
     await manager.api.message_injection(failed_server.ip_addr, 'crash_before_group0_join')
 
     await task
@@ -50,7 +51,6 @@ async def test_crashed_node_substitution(manager: ManagerClient):
     [await manager.api.message_injection(s.ip_addr, 'fast_orphan_removal_fiber') for s in servers]
 
     log = await manager.server_open_log(servers[0].server_id)
-    failed_id = await manager.get_host_id(failed_server.server_id)
     await log.wait_for(f"Finished to force remove node {failed_id}")
 
     post_wait_live_eps = await manager.api.client.get_json("/gossiper/endpoint/live", host=servers[0].ip_addr)
@@ -7,15 +7,14 @@
 import asyncio
 import pytest
 import time
 import logging
-import requests
-import re
 
-from cassandra.cluster import ConnectionException, NoHostAvailable  # type: ignore
+from cassandra.cluster import NoHostAvailable  # type: ignore
 from cassandra.query import SimpleStatement, ConsistencyLevel
 
 from test.pylib.internal_types import ServerInfo
+from test.pylib.internal_types import IPAddress
 from test.pylib.manager_client import ManagerClient
-from test.pylib.rest_client import inject_error
+from test.pylib.rest_client import ScyllaMetricsClient, TCPRESTClient, inject_error
 from test.pylib.tablets import get_tablet_replicas
 from test.pylib.scylla_cluster import ReplaceConfig
 from test.pylib.util import wait_for
@@ -26,26 +25,21 @@ from test.cluster.util import get_topology_coordinator, find_server_by_host_id,
 
 logger = logging.getLogger(__name__)
 
-def get_hint_manager_metric(server: ServerInfo, metric_name: str) -> int:
-    result = 0
-    metrics = requests.get(f"http://{server.ip_addr}:9180/metrics").text
-    pattern = re.compile(f"^scylla_hints_manager_{metric_name}")
-    for metric in metrics.split('\n'):
-        if pattern.match(metric) is not None:
-            result += int(float(metric.split()[1]))
-    return result
+async def get_hint_metrics(client: ScyllaMetricsClient, server_ip: IPAddress, metric_name: str):
+    metrics = await client.query(server_ip)
+    return metrics.get(f"scylla_hints_manager_{metric_name}")
 
 # Creates a sync point for ALL hosts.
-def create_sync_point(node: ServerInfo) -> str:
-    return requests.post(f"http://{node.ip_addr}:10000/hinted_handoff/sync_point/").json()
+async def create_sync_point(client: TCPRESTClient, server_ip: IPAddress) -> str:
+    response = await client.post_json("/hinted_handoff/sync_point", host=server_ip, port=10_000)
+    return response
 
-def await_sync_point(node: ServerInfo, sync_point: str, timeout: int) -> bool:
+async def await_sync_point(client: TCPRESTClient, server_ip: IPAddress, sync_point: str, timeout: int) -> bool:
     params = {
         "id": sync_point,
         "timeout": str(timeout)
     }
 
-    response = requests.get(f"http://{node.ip_addr}:10000/hinted_handoff/sync_point", params=params).json()
+    response = await client.get_json("/hinted_handoff/sync_point", host=server_ip, port=10_000, params=params)
     match response:
         case "IN_PROGRESS":
             return False
@@ -67,10 +61,7 @@ async def test_write_cl_any_to_dead_node_generates_hints(manager: ManagerClient)
 
     await manager.server_stop_gracefully(servers[1].server_id)
 
-    def get_hints_written_count(server):
-        return get_hint_manager_metric(server, "written")
-
-    hints_before = get_hints_written_count(servers[0])
+    hints_before = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
 
     # Some of the inserts will be targeted to the dead node.
     # The coordinator doesn't have live targets to send the write to, but it should write a hint.
@@ -78,7 +69,7 @@ async def test_write_cl_any_to_dead_node_generates_hints(manager: ManagerClient)
         await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, v) VALUES ({i}, {i+1})", consistency_level=ConsistencyLevel.ANY))
 
     # Verify hints are written
-    hints_after = get_hints_written_count(servers[0])
+    hints_after = await get_hint_metrics(manager.metrics, servers[0].ip_addr, "written")
    assert hints_after > hints_before
 
    # For dropping the keyspace
@@ -144,24 +135,29 @@ async def test_sync_point(manager: ManagerClient):
     # Mutations need to be applied to hinted handoff's commitlog before we create the sync point.
     # Otherwise, the sync point will correspond to no hints at all.
 
-    # We need to wrap the function in an async function to make `wait_for` be able to use it below.
-    async def check_no_hints_in_progress_node1() -> bool:
-        return get_hint_manager_metric(node1, "size_of_hints_in_progress") == 0
+    async def check_written_hints(min_count: int) -> bool:
+        errors = await get_hint_metrics(manager.metrics, node1.ip_addr, "errors")
+        assert errors == 0, "Writing hints to disk failed"
+
+        hints = await get_hint_metrics(manager.metrics, node1.ip_addr, "written")
+        if hints >= min_count:
+            return True
+        return None
 
     deadline = time.time() + 30
-    await wait_for(check_no_hints_in_progress_node1, deadline)
+    await wait_for(lambda: check_written_hints(2 * mutation_count), deadline)
 
-    sync_point1 = create_sync_point(node1)
+    sync_point1 = await create_sync_point(manager.api.client, node1.ip_addr)
 
     await manager.server_start(node2.server_id)
     await manager.server_sees_other_server(node1.ip_addr, node2.ip_addr)
 
-    assert not await_sync_point(node1, sync_point1, 30)
+    assert not (await await_sync_point(manager.api.client, node1.ip_addr, sync_point1, 3))
 
     await manager.server_start(node3.server_id)
     await manager.server_sees_other_server(node1.ip_addr, node3.ip_addr)
 
-    assert await_sync_point(node1, sync_point1, 30)
+    assert await await_sync_point(manager.api.client, node1.ip_addr, sync_point1, 30)
 
 
 @pytest.mark.asyncio
@@ -207,7 +203,8 @@ async def test_hints_consistency_during_decommission(manager: ManagerClient):
     await manager.servers_see_each_other([server1, server2, server3])
 
     # Record the current position of hints so that we can wait for them later
-    sync_points = [create_sync_point(srv) for srv in (server1, server2)]
+    sync_points = await asyncio.gather(*[create_sync_point(manager.api.client, srv.ip_addr) for srv in (server1, server2)])
+    sync_points = list(sync_points)
 
     async with asyncio.TaskGroup() as tg:
         coord = await get_topology_coordinator(manager)
@@ -233,7 +230,8 @@ async def test_hints_consistency_during_decommission(manager: ManagerClient):
         await manager.api.disable_injection(srv.ip_addr, "hinted_handoff_pause_hint_replay")
 
     logger.info("Wait until hints are replayed from nodes 1 and 2")
-    await asyncio.gather(*(asyncio.to_thread(await_sync_point, srv, pt, timeout=30) for srv, pt in zip((server1, server2), sync_points)))
+    await asyncio.gather(*(await_sync_point(manager.api.client, srv.ip_addr, pt, timeout=30)
+                           for srv, pt in zip((server1, server2), sync_points)))
 
     # Unpause streaming and let decommission finish
     logger.info("Unpause streaming")
@@ -271,11 +269,11 @@ async def test_hints_consistency_during_replace(manager: ManagerClient):
     # Write 100 rows with CL=ANY. Some of the rows will only be stored as hints because of RF=1
     for i in range(100):
         await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, v) VALUES ({i}, {i + 1})", consistency_level=ConsistencyLevel.ANY))
-    sync_point = create_sync_point(servers[0])
+    sync_point = await create_sync_point(manager.api.client, servers[0].ip_addr)
 
     await manager.server_add(replace_cfg=ReplaceConfig(replaced_id = servers[2].server_id, reuse_ip_addr = False, use_host_id = True))
 
-    assert await_sync_point(servers[0], sync_point, 30)
+    assert await await_sync_point(manager.api.client, servers[0].ip_addr, sync_point, 30)
     # Verify that all rows were recovered by the hint replay
     for i in range(100):
         assert list(await cql.run_async(f"SELECT v FROM {table} WHERE pk = {i}")) == [(i + 1,)]
@@ -300,16 +298,12 @@ async def test_draining_hints(manager: ManagerClient):
     for i in range(1000):
         await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, v) VALUES ({i}, {i + 1})", consistency_level=ConsistencyLevel.ANY))
 
-    sync_point = create_sync_point(s1)
+    sync_point = await create_sync_point(manager.api.client, s1.ip_addr)
     await manager.server_start(s2.server_id)
 
-
-    async def wait():
-        assert await_sync_point(s1, sync_point, 60)
-
     async with asyncio.TaskGroup() as tg:
         _ = tg.create_task(manager.decommission_node(s1.server_id, timeout=60))
-        _ = tg.create_task(wait())
+        _ = tg.create_task(await_sync_point(manager.api.client, s1.ip_addr, sync_point, 60))
 
 @pytest.mark.asyncio
 @pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
@@ -335,7 +329,7 @@ async def test_canceling_hint_draining(manager: ManagerClient):
     for i in range(1000):
         await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, v) VALUES ({i}, {i + 1})", consistency_level=ConsistencyLevel.ANY))
 
-    sync_point = create_sync_point(s1)
+    sync_point = await create_sync_point(manager.api.client, s1.ip_addr)
 
     await manager.api.enable_injection(s1.ip_addr, "hinted_handoff_pause_hint_replay", False, {})
     await manager.remove_node(s1.server_id, s2.server_id)
@@ -353,7 +347,7 @@ async def test_canceling_hint_draining(manager: ManagerClient):
     await s1_log.wait_for(f"Draining starts for {host_id2}", from_mark=s1_mark)
 
     # Make sure draining finishes successfully.
-    assert await_sync_point(s1, sync_point, 60)
+    assert await await_sync_point(manager.api.client, s1.ip_addr, sync_point, 60)
     await s1_log.wait_for(f"Removed hint directory for {host_id2}")
 
 @pytest.mark.asyncio
@@ -392,7 +386,7 @@ async def test_hint_to_pending(manager: ManagerClient):
 
     await manager.api.enable_injection(servers[0].ip_addr, "hinted_handoff_pause_hint_replay", False)
     await manager.server_start(servers[1].server_id)
-    sync_point = create_sync_point(servers[0])
+    sync_point = await create_sync_point(manager.api.client, servers[0].ip_addr)
 
     await manager.api.enable_injection(servers[0].ip_addr, "pause_after_streaming_tablet", False)
     tablet_migration = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, "t", host_ids[1], 0, host_ids[0], 0, 0))
@@ -404,7 +398,7 @@ async def test_hint_to_pending(manager: ManagerClient):
     await wait_for(migration_reached_streaming, time.time() + 60)
 
     await manager.api.disable_injection(servers[0].ip_addr, "hinted_handoff_pause_hint_replay")
-    assert await_sync_point(servers[0], sync_point, 30)
+    assert await await_sync_point(manager.api.client, servers[0].ip_addr, sync_point, 30)
 
     await manager.api.message_injection(servers[0].ip_addr, "pause_after_streaming_tablet")
     done, pending = await asyncio.wait([tablet_migration])
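Editorial note: a hedged convenience sketch on top of the two helpers introduced above (their signatures are taken from this diff; the wrapper itself is hypothetical). A sync point snapshots the current hint queues for all hosts; awaiting it lets the server block for up to the given timeout and reports whether replay caught up:

    async def replay_hints_to_sync_point(client, server_ip, timeout_s: int = 60) -> None:
        # Snapshot first, then block until hint replay reaches the snapshot.
        sync_point = await create_sync_point(client, server_ip)
        reached = await await_sync_point(client, server_ip, sync_point, timeout_s)
        assert reached, f"hint replay did not reach sync point {sync_point} in {timeout_s}s"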
@@ -861,3 +861,80 @@ async def test_repair_sigsegv_with_diff_shard_count(manager: ManagerClient, use_
     else:
         logger.info("Starting vnode repair")
         await manager.api.repair(servers[1].ip_addr, ks, "test")
+
+# Reproducer for https://github.com/scylladb/scylladb/issues/27365
+# Incremental repair vs tablet merge
+@pytest.mark.asyncio
+@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
+async def test_tablet_incremental_repair_tablet_merge_compaction_group_gone(manager: ManagerClient):
+    cmdline = ['--logger-log-level', 'repair=debug']
+    servers, cql, hosts, ks, table_id, logs, _, _, _, _ = await preapre_cluster_for_incremental_repair(manager, cmdline=cmdline)
+
+    coord = await get_topology_coordinator(manager)
+    coord_serv = await find_server_by_host_id(manager, servers, coord)
+    coord_log = await manager.server_open_log(coord_serv.server_id)
+
+    # Trigger merge and wait until the merge fiber starts
+    s1_mark = await coord_log.mark()
+    await inject_error_on(manager, "merge_completion_fiber", servers)
+    await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
+    await inject_error_on(manager, "tablet_force_tablet_count_decrease", servers)
+    await coord_log.wait_for('Detected tablet merge for table', from_mark=s1_mark)
+    await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
+    await coord_log.wait_for('merge_completion_fiber: waiting for message', from_mark=s1_mark)
+
+    # Trigger repair and wait for the incremental repair preparation to start
+    s1_mark = await coord_log.mark()
+    await inject_error_on(manager, "wait_after_prepare_sstables_for_incremental_repair", servers)
+    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token=-1, await_completion=False, incremental_mode='incremental')
+    # Wait for preparation to start.
+    await coord_log.wait_for('Disabling compaction for range', from_mark=s1_mark)
+    # Without the serialization, sleep to increase the chances of preparation finishing before the merge fiber.
+    # With the serialization, preparation will wait for the merge fiber to finish.
+    await asyncio.sleep(0.1)
+
+    # Continue to execute the merge fiber so that the compaction group is removed
+    await inject_error_on(manager, "replica_merge_completion_wait", servers)
+    for s in servers:
+        await manager.api.message_injection(s.ip_addr, "merge_completion_fiber")
+
+    await coord_log.wait_for('Merge completion fiber finished', from_mark=s1_mark)
+
+    # Continue the repair to trigger the use-after-free
+    for s in servers:
+        await manager.api.message_injection(s.ip_addr, "wait_after_prepare_sstables_for_incremental_repair")
+
+    await coord_log.wait_for('Finished tablet repair', from_mark=s1_mark)
+
+# Reproducer for https://github.com/scylladb/scylladb/issues/27365
+# Incremental repair vs table drop
+@pytest.mark.asyncio
+@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
+async def test_tablet_incremental_repair_table_drop_compaction_group_gone(manager: ManagerClient):
+    cmdline = ['--logger-log-level', 'repair=debug']
+    servers, cql, hosts, ks, table_id, logs, _, _, _, _ = await preapre_cluster_for_incremental_repair(manager, cmdline=cmdline)
+
+    coord = await get_topology_coordinator(manager)
+    coord_serv = await find_server_by_host_id(manager, servers, coord)
+    coord_log = await manager.server_open_log(coord_serv.server_id)
+
+    # Trigger repair and wait for the incremental repair preparation to finish
+    s1_mark = await coord_log.mark()
+    await inject_error_on(manager, "wait_after_prepare_sstables_for_incremental_repair", servers)
+    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token=-1, await_completion=False, incremental_mode='incremental')
+    # Wait for preparation to finish.
+    await coord_log.wait_for('Re-enabled compaction for range', from_mark=s1_mark)
+
+    s1_mark = await coord_log.mark()
+    drop_future = cql.run_async(f"DROP TABLE {ks}.test;")
+    await coord_log.wait_for(f'Stopping.*ongoing compactions for table {ks}.test', from_mark=s1_mark)
+    await asyncio.sleep(0.2)
+
+    # Continue the repair to trigger the use-after-free
+    for s in servers:
+        await manager.api.message_injection(s.ip_addr, "wait_after_prepare_sstables_for_incremental_repair")
+
+    await drop_future
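Editorial note: both reproducers lean on the same log-mark idiom, worth spelling out once. Take a mark before triggering an action, then wait for the log line strictly after that mark, so earlier occurrences of the same message cannot satisfy the wait. A condensed sketch using only calls that appear in this diff:

    mark = await coord_log.mark()
    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token=-1,
                                    await_completion=False,
                                    incremental_mode='incremental')
    # Only matches occurrences logged after `mark` was taken.
    await coord_log.wait_for('Finished tablet repair', from_mark=mark)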
@@ -95,6 +95,8 @@ async def test_full_shutdown_during_replace(manager: ManagerClient, reuse_ip: bo
     await leader_log.wait_for(
         'topology_coordinator/write_both_read_old/before_global_token_metadata_barrier: waiting for message')
 
+    replacing_host_id = await manager.get_host_id(replacing_server.server_id)
+
     logger.info(f'Stopping {live_servers + [replacing_server]}')
     await gather_safely(*(manager.server_stop(srv.server_id) for srv in live_servers + [replacing_server]))
     replacing_task.cancel()
@@ -135,6 +137,13 @@ async def test_full_shutdown_during_replace(manager: ManagerClient, reuse_ip: bo
         await manager.api.message_injection(
             srv.ip_addr, 'topology_coordinator/write_both_read_old/before_global_token_metadata_barrier')
 
+    logs = [await manager.server_open_log(srv.server_id) for srv in live_servers]
+    logger.info(f'Waiting for {replacing_server} to be removed from gossip after replace rollback')
+    await gather_safely(*[
+        log.wait_for(f'gossip - Finished to force remove node {replacing_host_id}')
+        for log in logs
+    ])
+
     logger.info(f'Retrying replace of {dead_server}')
     new_server = await manager.server_add(replace_cfg, property_file=dead_server.property_file())
     live_servers.append(new_server)
@@ -7,16 +7,19 @@
 from cassandra.protocol import ConfigurationException
 from cassandra.connection import UnixSocketEndPoint
 from cassandra.policies import WhiteListRoundRobinPolicy
+from cassandra.query import SimpleStatement, ConsistencyLevel
 
 from test.pylib.manager_client import ManagerClient
+from test.pylib.tablets import get_all_tablet_replicas
 from test.cluster.conftest import cluster_con
-from test.pylib.util import wait_for_cql_and_get_hosts
-from test.cluster.util import new_test_keyspace
+from test.pylib.util import gather_safely, wait_for_cql_and_get_hosts
+from test.cluster.util import create_new_test_keyspace
 
 import pytest
 import logging
 import socket
 import time
+from typing import TypeAlias
 
 logger = logging.getLogger(__name__)
 
@@ -25,80 +28,166 @@ async def test_maintenance_mode(manager: ManagerClient):
     """
     The test checks that in maintenance mode server A is not available for other nodes and for clients.
     It is possible to connect by the maintenance socket to server A and perform local CQL operations.
-    """
-
-    server_a, server_b = await manager.server_add(), await manager.server_add()
+
+    The test is run with multiple keyspaces with different configurations (replication strategy, RF, tablets enabled).
+    It initially used only SimpleStrategy and RF=1, which hid https://github.com/scylladb/scylladb/issues/27988. To keep
+    the test fast, the tasks for different keyspaces are performed concurrently, and server A is started in maintenance
+    mode only once.
+    """
+    max_rf = 3
+    servers = await manager.servers_add(max_rf, auto_rack_dc='dc1')
+    server_a = servers[0]
+    host_id_a = await manager.get_host_id(server_a.server_id)
     socket_endpoint = UnixSocketEndPoint(await manager.server_get_maintenance_socket_path(server_a.server_id))
 
-    cluster = cluster_con([server_b.ip_addr])
+    # For the move_tablet API.
+    await manager.disable_tablet_balancing()
+
+    # An exclusive connection to server A is needed for requests with LocalStrategy.
+    cluster = cluster_con([server_a.ip_addr], load_balancing_policy=WhiteListRoundRobinPolicy([server_a.ip_addr]))
     cql = cluster.connect()
 
-    async with new_test_keyspace(manager, "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}") as ks:
-        table = f"{ks}.t"
+    # (replication strategy, Optional[replication factor], tablets enabled)
+    KeyspaceOptions: TypeAlias = tuple[str, int | None, bool]
+    keyspace_options: list[KeyspaceOptions] = []
+    keyspace_options.append(('EverywhereStrategy', None, False))
+    keyspace_options.append(('LocalStrategy', None, False))
+    for rf in range(1, max_rf + 1):
+        keyspace_options.append(('SimpleStrategy', rf, False))
+        for tablets_enabled in [True, False]:
+            keyspace_options.append(('NetworkTopologyStrategy', rf, tablets_enabled))
+
+    key_on_server_a_per_table: dict[str, int] = dict()
+
+    async def prepare_table(options: KeyspaceOptions):
+        replication_strategy, rf, tablets_enabled = options
+        rf_string = "" if rf is None else f", 'replication_factor': {rf}"
+        ks = await create_new_test_keyspace(cql,
+                                            f"""WITH REPLICATION = {{'class': '{replication_strategy}'{rf_string}}}
+                                            AND tablets = {{'enabled': {str(tablets_enabled).lower()}, 'initial': 1}}""")
+        rf_tag = "" if rf is None else f"rf{rf}"
+        tablets_tag = "tablets" if tablets_enabled else "vnodes"
+        table_suffix = f"{replication_strategy.lower()}_{rf_tag}_{tablets_tag}"
+        table = f"{ks}.{table_suffix}"
+        await cql.run_async(f"CREATE TABLE {table} (k int PRIMARY KEY, v int)")
+        logger.info(f"Created table {table}")
+
+        async def insert_one(cl: ConsistencyLevel):
+            key = 1
+            insert_stmt = SimpleStatement(f"INSERT INTO {table} (k, v) VALUES ({key}, {key})",
+                                          consistency_level=cl)
+            await cql.run_async(insert_stmt)
+            key_on_server_a_per_table[table] = key
+
+        if replication_strategy == 'LocalStrategy':
+            await insert_one(ConsistencyLevel.ONE)
+            return
+
+        if tablets_enabled:
+            await insert_one(ConsistencyLevel.ALL)
+
+            logger.info(f"Ensuring that a tablet replica is on {server_a} for table {table}")
+            [tablet] = await get_all_tablet_replicas(manager, server_a, ks, table_suffix)
+            if host_id_a not in [r[0] for r in tablet.replicas]:
+                assert rf < max_rf
+                any_replica = tablet.replicas[0]
+                logger.info(f"Moving tablet from {any_replica} to {server_a} for table {table}")
+                await manager.api.move_tablet(server_a.ip_addr, ks, table_suffix,
+                                              any_replica[0], any_replica[1],
+                                              host_id_a, 0,
+                                              tablet.last_token)
+            return
+
+        # This path is executed only for vnodes-based keyspaces.
 
         # Token ranges of the server A
         # [(start_token, end_token)]
-        ranges = [(int(row[0]), int(row[1])) for row in await cql.run_async(f"""SELECT start_token, end_token, endpoint
+        ranges = [(int(row[0]), int(row[1])) for row in await cql.run_async(f"""SELECT start_token, end_token
                                                                             FROM system.token_ring WHERE keyspace_name = '{ks}'
                                                                             AND endpoint = '{server_a.ip_addr}' ALLOW FILTERING""")]
 
-        # Insert data to the cluster and find a key that is stored on server A.
-        for i in range(256):
-            await cql.run_async(f"INSERT INTO {table} (k, v) VALUES ({i}, {i})")
+        # Insert data to the cluster until a key is stored on server A.
+        new_key = 0
+        while table not in key_on_server_a_per_table:
+            if new_key == 1000:
+                # The probability of reaching this code is (2/3)^1000 for RF=1 and lower for greater RFs. This is much
+                # less than, for example, the probability of a UUID collision, so worrying about this would be silly.
+                # It could still happen due to a bug, and then we want to know about it, so we fail the test.
+                pytest.fail(f"Could not find a key on server {server_a} after inserting 1000 keys")
+            new_key += 1
 
-        # [(key, token of this key)]
-        keys_with_tokens = [(int(row[0]), int(row[1])) for row in await cql.run_async(f"SELECT k, token(k) FROM {table}")]
-        key_on_server_a = None
+            insert_stmt = SimpleStatement(f"INSERT INTO {table} (k, v) VALUES ({new_key}, {new_key})",
+                                          consistency_level=ConsistencyLevel.ALL)
+            await cql.run_async(insert_stmt)
 
-        for key, token in keys_with_tokens:
+            res = await cql.run_async(f"SELECT token(k) FROM {table} WHERE k = {new_key}")
+            assert len(res) == 1
+            token = res[0][0]
             for start, end in ranges:
                 if (start < end and start < token <= end) or (start >= end and (token <= end or start < token)):
-                    key_on_server_a = key
+                    logger.info(f"Found key {new_key} with token {token} on server {server_a} for table {table}")
+                    key_on_server_a_per_table[table] = new_key
 
-        if key_on_server_a is None:
-            # There is only a chance ~(1/2)^256 that all keys are stored on the server B
-            # In this case we skip the test
-            pytest.skip("All keys are stored on the server B")
+    logger.info("Preparing tables")
+    await gather_safely(*(prepare_table(options) for options in keyspace_options))
 
-        # Start server A in maintenance mode
-        await manager.server_stop_gracefully(server_a.server_id)
-        await manager.server_update_config(server_a.server_id, "maintenance_mode", "true")
-        await manager.server_start(server_a.server_id)
+    # Start server A in maintenance mode
+    await manager.server_stop_gracefully(server_a.server_id)
+    await manager.server_update_config(server_a.server_id, "maintenance_mode", True)
+    await manager.server_start(server_a.server_id)
 
-        log = await manager.server_open_log(server_a.server_id)
-        await log.wait_for(r"initialization completed \(maintenance mode\)")
+    log = await manager.server_open_log(server_a.server_id)
+    await log.wait_for(r"initialization completed \(maintenance mode\)")
 
-        # Check that the regular CQL port is not available
-        assert socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex((server_a.ip_addr, 9042)) != 0
+    # Check that the regular CQL port is not available
+    assert socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex((server_a.ip_addr, 9042)) != 0
 
-        maintenance_cluster = cluster_con([socket_endpoint],
-                                          load_balancing_policy=WhiteListRoundRobinPolicy([socket_endpoint]))
-        maintenance_cql = maintenance_cluster.connect()
+    maintenance_cluster = cluster_con([socket_endpoint],
+                                      load_balancing_policy=WhiteListRoundRobinPolicy([socket_endpoint]))
+    maintenance_cql = maintenance_cluster.connect()
 
+    async def update_table_in_maintenance_mode(table: str, key: int):
         # Check that local data is available in maintenance mode
-        res = await maintenance_cql.run_async(f"SELECT v FROM {table} WHERE k = {key_on_server_a}")
-        assert res[0][0] == key_on_server_a
+        select_stm = SimpleStatement(f"SELECT v FROM {table} WHERE k = {key}", consistency_level=ConsistencyLevel.ONE)
+        res = await maintenance_cql.run_async(select_stm)
+        assert len(res) == 1 and res[0][0] == key, f"Expected {key} for table {table}"
 
-        # Check that group0 operations are disabled
-        with pytest.raises(ConfigurationException):
-            await maintenance_cql.run_async(f"CREATE TABLE {ks}.t2 (k int PRIMARY KEY, v int)")
+        update_stm = SimpleStatement(f"UPDATE {table} SET v = {key + 1} WHERE k = {key}",
+                                     consistency_level=ConsistencyLevel.ONE)
+        await maintenance_cql.run_async(update_stm)
 
-        await maintenance_cql.run_async(f"UPDATE {table} SET v = {key_on_server_a + 1} WHERE k = {key_on_server_a}")
+    logger.info("Updating tables in maintenance mode")
+    await gather_safely(*(update_table_in_maintenance_mode(table, key)
+                          for table, key in key_on_server_a_per_table.items()))
 
-        # Ensure that server B recognizes server A as being shutdown, not as being alive.
-        res = await cql.run_async(f"SELECT status FROM system.cluster_status WHERE peer = '{server_a.ip_addr}'")
-        assert res[0][0] == "shutdown"
+    # Check that group0 operations are disabled
+    with pytest.raises(ConfigurationException, match="cannot start group0 operation in the maintenance mode"):
+        await create_new_test_keyspace(
+            maintenance_cql, "WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}")
 
-        await manager.server_stop_gracefully(server_a.server_id)
+    # Ensure that another server recognizes server A as being shutdown, not as being alive.
+    cql_b, [host_b] = await manager.get_ready_cql([servers[1]])
+    res = await cql_b.run_async(f"SELECT status FROM system.cluster_status WHERE peer = '{server_a.ip_addr}'",
+                                host=host_b)
+    assert len(res) == 1
+    assert res[0][0] == "shutdown"
 
-        # Restart in normal mode to see if the changes made in maintenance mode are persisted
-        await manager.server_update_config(server_a.server_id, "maintenance_mode", False)
-        await manager.server_start(server_a.server_id, wait_others=1)
-        await wait_for_cql_and_get_hosts(cql, [server_a], time.time() + 60)
-        await manager.servers_see_each_other([server_a, server_b])
+    await manager.server_stop_gracefully(server_a.server_id)
 
-        res = await cql.run_async(f"SELECT v FROM {table} WHERE k = {key_on_server_a}")
-        assert res[0][0] == key_on_server_a + 1
+    # Restart in normal mode
+    await manager.server_update_config(server_a.server_id, "maintenance_mode", False)
+    await manager.server_start(server_a.server_id, wait_others=1)
+    await wait_for_cql_and_get_hosts(cql, [server_a], time.time() + 60)
+    await manager.servers_see_each_other(servers)
 
+    async def check_table_in_normal_mode(table: str, key: int):
+        # Check if the changes made in maintenance mode are persisted
+        select_stm = SimpleStatement(f"SELECT v FROM {table} WHERE k = {key}", consistency_level=ConsistencyLevel.ALL)
+        res = await cql.run_async(select_stm)
+        assert len(res) == 1 and res[0][0] == key + 1, f"Expected {key + 1} for table {table}"
+
+    logger.info("Checking tables in normal mode")
+    await gather_safely(*(check_table_in_normal_mode(table, key) for table, key in key_on_server_a_per_table.items()))
 
     cluster.shutdown()
     maintenance_cluster.shutdown()
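Editorial note: the two-armed range check buried in the vnodes path above deserves a word. token_ring ranges are half-open intervals (start, end] on a circular token space, so a range may wrap past the maximum token, in which case start >= end and the range covers both tails of the ring. The same check as a hypothetical stand-alone helper:

    def token_in_range(token: int, start: int, end: int) -> bool:
        if start < end:
            # Ordinary range: (start, end]
            return start < token <= end
        # Wrapped range: covers (start, MAX] plus [MIN, end]
        return token <= end or start < token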
@@ -5,6 +5,7 @@
 #
 from test.pylib.manager_client import ManagerClient
 from test.cluster.conftest import skip_mode
+from test.pylib.rest_client import read_barrier
 from test.cluster.util import new_test_keyspace
 from collections import defaultdict
 import pytest
@@ -16,7 +17,6 @@ logger = logging.getLogger(__name__)
 GB = 1024 * 1024 * 1024
 
 @pytest.mark.asyncio
-@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
 async def test_balance_empty_tablets(manager: ManagerClient):
 
     # This test checks that size-based load balancing migrates empty tablets of a newly created
@@ -25,7 +25,7 @@ async def test_balance_empty_tablets(manager: ManagerClient):
 
     logger.info('Bootstrapping cluster')
 
-    cfg = { 'error_injections_at_startup': ['short_tablet_stats_refresh_interval'] }
+    cfg = { 'tablet_load_stats_refresh_interval_in_seconds': 1 }
 
     cfg_small = cfg | { 'data_file_capacity': 50 * GB }
     cfg_large = cfg | { 'data_file_capacity': 100 * GB }
@@ -56,6 +56,9 @@ async def test_balance_empty_tablets(manager: ManagerClient):
 
     await manager.api.quiesce_topology(servers[0].ip_addr)
 
+    # Ensure all nodes see the same data in system.tablets
+    await asyncio.gather(*[read_barrier(manager.api, s.ip_addr) for s in servers])
+
     replicas_per_node = defaultdict(int)
     tablets_per_shard = {}
     for row in await cql.run_async('SELECT * FROM system.tablets'):
Some files were not shown because too many files have changed in this diff.