Compare commits

183 commits: copilot/co ... next-2026.

| SHA1 |
|---|
| 233da83dd9 |
| 9b81939a93 |
| 804842e95c |
| 4f77cb621f |
| eb6c333e1b |
| 8d21636a81 |
| 7f236baf61 |
| 4da8641d83 |
| 3ab789e1ca |
| 25a17282bd |
| 7afcc56128 |
| 32443ed6f7 |
| 3e9b984020 |
| 2d199fb609 |
| 35cd7f9239 |
| 32ce43d4b1 |
| fef7750eb6 |
| 213442227d |
| 1398a55d16 |
| a0a2a67634 |
| d4e454b5bc |
| 825a36c97a |
| 45413e99a5 |
| c93a935564 |
| 69f78ce74a |
| 3513ce6069 |
| 0ca7253315 |
| c7ac3b5394 |
| d6ed05efc1 |
| 39fcc83e75 |
| 6250f1e967 |
| b307c9301d |
| f26af8cd30 |
| 2bd10bff5e |
| 1105d83893 |
| 9b9d5cee8a |
| a8fd9936a3 |
| 9190d42863 |
| 09b0e1ba8b |
| 8a7f5f1428 |
| 185288f16e |
| 7dfcb53197 |
| d552244812 |
| 62b344cb55 |
| 173bfd627c |
| bf369326d6 |
| 864774fb00 |
| 49ed97cec8 |
| 81685b0d06 |
| 06013b2377 |
| 4cc5c2605f |
| 021851c5c5 |
| c4aa14c1a7 |
| 0fdb0961a2 |
| 2100ae2d0a |
| 51fc498314 |
| f4b938df09 |
| 0dfefc3f12 |
| 883e3e014a |
| 4ccb795beb |
| 9e02b0f45f |
| eb9b8dbf62 |
| 995df5dec6 |
| beb781b829 |
| 502b7f296d |
| b251ee02a4 |
| f26d08dde2 |
| 9cd1038c7a |
| fdae3e4f3a |
| d47e4898ea |
| 7bc87de838 |
| 2141b9b824 |
| aa50edbf17 |
| 7f836aa3ec |
| bd26803c1a |
| 2feed49285 |
| 3007cb6f37 |
| 1e2d1c7e85 |
| 55ad575c8f |
| 8982140cd9 |
| e90449f770 |
| 98fd5c5e45 |
| cca6a1c3dd |
| 9edd0ae3fb |
| dc3133b031 |
| 86554e6192 |
| 637618560b |
| 8c3c5777da |
| bb9a5261ec |
| d5d81cc066 |
| 6a438543c2 |
| 99a67484bf |
| cabf2845d9 |
| ff4a0fc87e |
| 0a89dbb4d4 |
| 19cbaa1be2 |
| 9cf0f0998d |
| f56e1760d7 |
| db4e3a664d |
| c292892d5f |
| d87467f77b |
| 6e92ee1bb2 |
| 4ecc402b79 |
| b27adefc16 |
| cad92d5100 |
| 44cc5ae30b |
| 05a5bd542a |
| 8a626bb458 |
| 3a56a0cf99 |
| 0cdac69aab |
| f04a3acf33 |
| ad716f9341 |
| 2edd87f2e1 |
| ba10e74523 |
| 5abc2fea9f |
| 2ab81f768b |
| cfebb52db0 |
| 37ef37e8ab |
| fdad814aa3 |
| 0257f7cc89 |
| 07bfd920e7 |
| 698ba5bd0b |
| d8c7303d14 |
| 9365adb2fb |
| 6a55396e90 |
| f4b79c1b1d |
| f633f57163 |
| 09ed4178a6 |
| 2bf7a0f65e |
| 5b15c52f1e |
| 26e17202f6 |
| b62e1b405b |
| f3d2a16e66 |
| eee99ebb3d |
| c248744c5a |
| 4ba3c08d45 |
| c8c21cc29c |
| e95689c96b |
| 6094f4b7b2 |
| ad64dc7c01 |
| bafd185087 |
| 07d1f8f48a |
| 523d529d27 |
| c8dbd43ed5 |
| 0cf9f41649 |
| dc89e2ea37 |
| 797f56cb45 |
| be1d418bc0 |
| 46923f7358 |
| 4032e95715 |
| eab10c00b1 |
| 091c3b4e22 |
| 19eadafdef |
| 358fc15893 |
| 32124d209e |
| c7f4bda459 |
| 568af3cd8d |
| bd694dd1a1 |
| 9672e0171f |
| 8cec41acf2 |
| d207de0d76 |
| edde4e878e |
| be1c674f1a |
| a7cff37024 |
| 9431bc5628 |
| 14db8375ac |
| 614020b5d5 |
| e091afb400 |
| edc46fe6a1 |
| f8b9b767c2 |
| 23d038b385 |
| 3e2d1384bf |
| bd7481e30c |
| 16d7b65754 |
| e30c01eae6 |
| d0f3725887 |
| c12168b7ef |
| 76c0162060 |
| c9620d9573 |
| 91cf77d016 |
| 2c2f0693ab |
| 2c73d0e6b5 |
| f94296e0ae |
@@ -18,7 +18,7 @@ jobs:
             // Regular expression pattern to check for "Fixes" prefix
             // Adjusted to dynamically insert the repository full name
-            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|([A-Z]+-\\d+))`;
+            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|(?:https://scylladb\\.atlassian\\.net/browse/)?([A-Z]+-\\d+))`;
             const regex = new RegExp(pattern);

             if (!regex.test(body)) {
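The regex change above extends the accepted issue references with an optional Jira-URL prefix, so a PR body saying "Fixes https://scylladb.atlassian.net/browse/SCYLLA-123" now passes alongside the GitHub forms. A minimal C++ sketch of the same check (the repository name `scylladb/scylladb` is an assumption for illustration; `std::regex` defaults to ECMAScript syntax, the same flavor the workflow's JavaScript uses):

```cpp
// Sketch only: mirrors the updated workflow pattern against sample PR bodies.
#include <iostream>
#include <regex>
#include <string>

int main() {
    // repo.replace('/', '\\/') in the workflow; hardcoded here as an assumption.
    const std::string escaped = "scylladb\\/scylladb";
    const std::string pattern =
        "Fixes:? ((?:#|" + escaped + "#|https://github\\.com/" + escaped + "/issues/)(\\d+)"
        "|(?:https://scylladb\\.atlassian\\.net/browse/)?([A-Z]+-\\d+))";
    const std::regex re(pattern); // ECMAScript grammar, as in JavaScript

    for (const std::string body : {
             "Fixes #123",                                              // matches (old and new)
             "Fixes scylladb/scylladb#123",                             // matches
             "Fixes: https://github.com/scylladb/scylladb/issues/123",  // matches
             "Fixes SCYLLA-123",                                        // bare Jira key, matches
             "Fixes https://scylladb.atlassian.net/browse/SCYLLA-123",  // Jira URL, new in this change
             "Refs #123",                                               // no "Fixes" prefix, rejected
         }) {
        std::cout << body << " -> " << std::regex_search(body, re) << '\n';
    }
}
```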
.github/workflows/call_backport_with_jira.yaml (vendored, new file, +53 lines)
@@ -0,0 +1,53 @@
+name: Backport with Jira Integration
+
+on:
+  push:
+    branches:
+      - master
+      - next-*.*
+      - branch-*.*
+  pull_request_target:
+    types: [labeled, closed]
+    branches:
+      - master
+      - next
+      - next-*.*
+      - branch-*.*
+
+jobs:
+  backport-on-push:
+    if: github.event_name == 'push'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'push'
+      base_branch: ${{ github.ref }}
+      commits: ${{ github.event.before }}..${{ github.sha }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-on-label:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'labeled'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      head_commit: ${{ github.event.pull_request.base.sha }}
+      label_name: ${{ github.event.label.name }}
+      pr_state: ${{ github.event.pull_request.state }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-chain:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'chain'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      pr_body: ${{ github.event.pull_request.body }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
.github/workflows/trigger-scylla-ci.yaml (vendored, 49 lines changed)
@@ -9,16 +9,57 @@ on:

 jobs:
   trigger-jenkins:
-    if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
+    if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
     runs-on: ubuntu-latest
     steps:
+      - name: Verify Org Membership
+        id: verify_author
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
+          PR_ASSOCIATION: ${{ github.event.pull_request.author_association }}
+          COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
+          COMMENT_ASSOCIATION: ${{ github.event.comment.author_association }}
+        shell: bash
+        run: |
+          if [[ "$EVENT_NAME" == "pull_request_target" ]]; then
+            AUTHOR="$PR_AUTHOR"
+            ASSOCIATION="$PR_ASSOCIATION"
+          else
+            AUTHOR="$COMMENT_AUTHOR"
+            ASSOCIATION="$COMMENT_ASSOCIATION"
+          fi
+          ORG="scylladb"
+          if gh api "/orgs/${ORG}/members/${AUTHOR}" --silent 2>/dev/null; then
+            echo "member=true" >> $GITHUB_OUTPUT
+          else
+            echo "::warning::${AUTHOR} is not a member of ${ORG}; skipping CI trigger."
+            echo "member=false" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Validate Comment Trigger
+        if: github.event_name == 'issue_comment'
+        id: verify_comment
+        env:
+          COMMENT_BODY: ${{ github.event.comment.body }}
+        shell: bash
+        run: |
+          CLEAN_BODY=$(echo "$COMMENT_BODY" | grep -v '^[[:space:]]*>')
+
+          if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
+            echo "trigger=true" >> $GITHUB_OUTPUT
+          else
+            echo "trigger=false" >> $GITHUB_OUTPUT
+          fi
+
       - name: Trigger Scylla-CI-Route Jenkins Job
+        if: steps.verify_author.outputs.member == 'true' && (github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true')
         env:
           JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
           JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
           JENKINS_URL: "https://jenkins.scylladb.com"
+          PR_NUMBER: "${{ github.event.issue.number || github.event.pull_request.number }}"
+          PR_REPO_NAME: "${{ github.event.repository.full_name }}"
         run: |
-          PR_NUMBER=${{ github.event.issue.number }}
-          PR_REPO_NAME=${{ github.event.repository.full_name }}
           curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
-            --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v
+            --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail
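In effect, the single trigger condition is split into three gated steps: a non-bot comment or a `conflicts` label enters the job; the first step confirms the relevant author (the PR author for `pull_request_target` events, the comment author otherwise) is a member of the `scylladb` org via `gh api /orgs/scylladb/members/<user>`; and the second step strips quoted lines (those starting with `>`) before grepping, so a reply that merely quotes someone else's `@scylladbbot trigger-ci` no longer re-triggers CI. The Jenkins call itself now takes `PR_NUMBER`/`PR_REPO_NAME` from the step environment rather than inline template expansion, and drops curl's `-i -v` flags, keeping response headers and verbose traces out of the job logs.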
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2026.1.0-dev
+VERSION=2026.1.1

 if test -f version
 then
@@ -767,7 +767,7 @@ static future<bool> scan_table(
     // by tasking another node to take over scanning of the dead node's primary
     // ranges. What we do here is that this node will also check expiration
     // on its *secondary* ranges - but only those whose primary owner is down.
-    auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
+    auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
     if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
         if (!gossiper.is_alive(tablet_primary_replica.host)) {
             co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
@@ -515,6 +515,15 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
         auto sstables = parsed.GetArray() |
             std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
             std::ranges::to<std::vector>();
+        apilog.info("Restore invoked with following parameters: keyspace={}, table={}, endpoint={}, bucket={}, prefix={}, sstables_count={}, scope={}, primary_replica_only={}",
+            keyspace,
+            table,
+            endpoint,
+            bucket,
+            prefix,
+            sstables.size(),
+            scope,
+            primary_replica_only);
         auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
         co_return json::json_return_type(fmt::to_string(task_id));
     });
@@ -48,6 +48,7 @@
 #include "mutation/mutation_fragment_stream_validator.hh"
 #include "utils/assert.hh"
 #include "utils/error_injection.hh"
+#include "utils/chunked_vector.hh"
 #include "utils/pretty_printers.hh"
 #include "readers/multi_range.hh"
 #include "readers/compacting.hh"
@@ -611,23 +612,23 @@ private:
     }

     // Called in a seastar thread
-    dht::partition_range_vector
+    utils::chunked_vector<dht::partition_range>
     get_ranges_for_invalidation(const std::vector<sstables::shared_sstable>& sstables) {
         // If owned ranges is disengaged, it means no cleanup work was done and
         // so nothing needs to be invalidated.
         if (!_owned_ranges) {
-            return dht::partition_range_vector{};
+            return {};
         }
-        auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
+        auto owned_ranges = dht::to_partition_ranges_chunked(*_owned_ranges).get();

         auto non_owned_ranges = sstables
             | std::views::transform([] (const sstables::shared_sstable& sst) {
                 seastar::thread::maybe_yield();
                 return dht::partition_range::make({sst->get_first_decorated_key(), true},
                                                   {sst->get_last_decorated_key(), true});
-            }) | std::ranges::to<dht::partition_range_vector>();
+            }) | std::ranges::to<utils::chunked_vector<dht::partition_range>>();

-        return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
+        return dht::subtract_ranges(*_schema, std::move(non_owned_ranges), std::move(owned_ranges)).get();
     }
 protected:
     compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
@@ -718,8 +719,8 @@ protected:

     compaction_completion_desc
     get_compaction_completion_desc(std::vector<sstables::shared_sstable> input_sstables, std::vector<sstables::shared_sstable> output_sstables) {
-        auto ranges_for_for_invalidation = get_ranges_for_invalidation(input_sstables);
-        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
+        auto ranges = get_ranges_for_invalidation(input_sstables);
+        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges)};
     }

     // Tombstone expiration is enabled based on the presence of sstable set.
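The type change above swaps the contiguous `dht::partition_range_vector` for `utils::chunked_vector<dht::partition_range>`. A toy sketch (this is not Scylla's implementation, just the idea) of why a chunked container suits potentially huge range lists: appends allocate small fixed-size chunks instead of one ever-doubling contiguous buffer, so no single allocation grows with the number of ranges:

```cpp
// Illustrative toy chunked vector; real chunked_vector semantics are assumed, not copied.
#include <cstddef>
#include <memory>
#include <vector>

template <typename T, std::size_t ChunkSize = 1024>
class toy_chunked_vector {
    std::vector<std::unique_ptr<std::vector<T>>> _chunks; // each chunk holds <= ChunkSize elements
    std::size_t _size = 0;
public:
    void push_back(T value) {
        if (_chunks.empty() || _chunks.back()->size() == ChunkSize) {
            auto chunk = std::make_unique<std::vector<T>>();
            chunk->reserve(ChunkSize); // bounded allocation, never larger than ChunkSize * sizeof(T)
            _chunks.push_back(std::move(chunk));
        }
        _chunks.back()->push_back(std::move(value));
        ++_size;
    }
    T& operator[](std::size_t i) { return (*_chunks[i / ChunkSize])[i % ChunkSize]; }
    std::size_t size() const { return _size; }
};
```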
@@ -16,6 +16,7 @@
 #include "sstables/sstable_set.hh"
 #include "compaction_fwd.hh"
 #include "mutation_writer/token_group_based_splitting_writer.hh"
+#include "utils/chunked_vector.hh"

 namespace compaction {

@@ -38,7 +39,7 @@ struct compaction_completion_desc {
     // New, fresh SSTables that should be added to SSTable set, replacing the old ones.
     std::vector<sstables::shared_sstable> new_sstables;
     // Set of compacted partition ranges that should be invalidated in the cache.
-    dht::partition_range_vector ranges_for_cache_invalidation;
+    utils::chunked_vector<dht::partition_range> ranges_for_cache_invalidation;
 };

 // creates a new SSTable for a given shard
@@ -778,6 +778,7 @@ compaction_manager::get_incremental_repair_read_lock(compaction::compaction_grou
         cmlog.debug("Get get_incremental_repair_read_lock for {} started", reason);
     }
     compaction::compaction_state& cs = get_compaction_state(&t);
+    auto gh = cs.gate.hold();
     auto ret = co_await cs.incremental_repair_lock.hold_read_lock();
     if (!reason.empty()) {
         cmlog.debug("Get get_incremental_repair_read_lock for {} done", reason);
@@ -791,6 +792,7 @@ compaction_manager::get_incremental_repair_write_lock(compaction::compaction_gro
         cmlog.debug("Get get_incremental_repair_write_lock for {} started", reason);
     }
     compaction::compaction_state& cs = get_compaction_state(&t);
+    auto gh = cs.gate.hold();
     auto ret = co_await cs.incremental_repair_lock.hold_write_lock();
     if (!reason.empty()) {
         cmlog.debug("Get get_incremental_repair_write_lock for {} done", reason);
@@ -1519,7 +1521,9 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
             | std::views::transform(std::mem_fn(&sstables::sstable::run_identifier))
             | std::ranges::to<std::unordered_set>());
     };
-    const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
+    const auto threshold = utils::get_local_injector().inject_parameter<size_t>("set_sstable_count_reduction_threshold")
+        .value_or(size_t(std::max(schema->max_compaction_threshold(), 32)));

     auto count = co_await num_runs_for_compaction();
     if (count <= threshold) {
         cmlog.trace("No need to wait for sstable count reduction in {}: {} <= {}",
@@ -1534,9 +1538,7 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
     auto& cstate = get_compaction_state(&t);
     try {
         while (can_perform_regular_compaction(t) && co_await num_runs_for_compaction() > threshold) {
-            co_await cstate.compaction_done.wait([this, &t] {
-                return !can_perform_regular_compaction(t);
-            });
+            co_await cstate.compaction_done.wait();
         }
     } catch (const broken_condition_variable&) {
         co_return;
@@ -2387,6 +2389,8 @@ future<> compaction_manager::remove(compaction_group_view& t, sstring reason) no
     if (!c_state.gate.is_closed()) {
         auto close_gate = c_state.gate.close();
         co_await stop_ongoing_compactions(reason, &t);
+        // Wait for users of incremental repair lock (can be either repair itself or maintenance compactions).
+        co_await c_state.incremental_repair_lock.write_lock();
         co_await std::move(close_gate);
     }
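The wait-loop change above replaces a predicate wait, which only woke the waiter once regular compaction became impossible, with a bare `wait()` per signal, letting the enclosing `while` re-test both the compaction-possible and run-count conditions after every completed compaction. A generic sketch of that pattern using `std::condition_variable` in place of Seastar's (the stand-in functions and threshold are assumptions):

```cpp
// Sketch of the re-check-in-the-loop pattern the patched code adopts.
#include <condition_variable>
#include <cstddef>
#include <mutex>

std::mutex m;
std::condition_variable compaction_done;

// Stand-ins for can_perform_regular_compaction() and the sstable run count.
bool can_compact() { return true; }
std::size_t run_count() { return 0; }
constexpr std::size_t threshold = 32;

void wait_for_count_reduction() {
    std::unique_lock lock(m);
    // Wake on every compaction_done notification and re-check both conditions,
    // instead of encoding a single condition in a wait predicate.
    while (can_compact() && run_count() > threshold) {
        compaction_done.wait(lock);
    }
}
```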
configure.py (47 lines changed)
@@ -730,28 +730,6 @@ vector_search_tests = set([
     'test/vector_search/rescoring_test'
 ])

-vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
-vector_search_validator_deps = set([
-    'test/vector_search_validator/build-validator',
-    'test/vector_search_validator/Cargo.toml',
-    'test/vector_search_validator/crates/validator/Cargo.toml',
-    'test/vector_search_validator/crates/validator/src/main.rs',
-    'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
-    'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
-    'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
-])
-
-vector_store_bin = 'vector-search-validator/bin/vector-store'
-vector_store_deps = set([
-    'test/vector_search_validator/build-env',
-    'test/vector_search_validator/build-vector-store',
-])
-
-vector_search_validator_bins = set([
-    vector_search_validator_bin,
-    vector_store_bin,
-])
-
 wasms = set([
     'wasm/return_input.wat',
     'wasm/test_complex_null_values.wat',
@@ -785,7 +763,7 @@ other = set([
     'iotune',
 ])

-all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins
+all_artifacts = apps | cpp_apps | tests | other | wasms

 arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -2582,11 +2560,10 @@ def write_build_file(f,
         description = RUST_LIB $out
         ''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
     f.write(
-        'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
+        'build {mode}-build: phony {artifacts} {wasms}\n'.format(
             mode=mode,
-            artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
+            artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
             wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
-            vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
         )
     )
     if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2616,7 +2593,7 @@ def write_build_file(f,
             continue
         profile_dep = modes[mode].get('profile_target', "")

-        if binary in other or binary in wasms or binary in vector_search_validator_bins:
+        if binary in other or binary in wasms:
             continue
         srcs = deps[binary]
         # 'scylla'
@@ -2727,11 +2704,10 @@ def write_build_file(f,
     )

     f.write(
-        'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
+        'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
             mode=mode,
             test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
             wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
-            vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
         )
     )
     f.write(
@@ -2899,19 +2875,6 @@ def write_build_file(f,
         'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
     )

-    f.write(textwrap.dedent(f'''\
-        rule build-vector-search-validator
-            command = test/vector_search_validator/build-validator $builddir
-        rule build-vector-store
-            command = test/vector_search_validator/build-vector-store $builddir
-        '''))
-    f.write(
-        'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
-    )
-    f.write(
-        'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
-    )

     f.write(textwrap.dedent(f'''\
         build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
         build dist-unified: phony dist-unified-tar
@@ -10,9 +10,41 @@
 #include "types/types.hh"
 #include "types/vector.hh"
 #include "exceptions/exceptions.hh"
+#include <span>
+#include <bit>

 namespace cql3 {
 namespace functions {

+namespace detail {
+
+std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension) {
+    if (!param) {
+        throw exceptions::invalid_request_exception("Cannot extract float vector from null parameter");
+    }
+
+    const size_t expected_size = dimension * sizeof(float);
+    if (param->size() != expected_size) {
+        throw exceptions::invalid_request_exception(
+            fmt::format("Invalid vector size: expected {} bytes for {} floats, got {} bytes",
+                expected_size, dimension, param->size()));
+    }
+
+    std::vector<float> result;
+    result.reserve(dimension);
+
+    bytes_view view(*param);
+    for (size_t i = 0; i < dimension; ++i) {
+        // read_simple handles network byte order (big-endian) conversion
+        uint32_t raw = read_simple<uint32_t>(view);
+        result.push_back(std::bit_cast<float>(raw));
+    }
+
+    return result;
+}
+
+} // namespace detail
+
 namespace {

 // The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
@@ -22,14 +54,14 @@ namespace {

 // You should only use this function if you need to preserve the original vectors and cannot normalize
 // them in advance.
-float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
+float compute_cosine_similarity(std::span<const float> v1, std::span<const float> v2) {
     double dot_product = 0.0;
     double squared_norm_a = 0.0;
     double squared_norm_b = 0.0;

     for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        double a = v1[i];
+        double b = v2[i];

         dot_product += a * b;
         squared_norm_a += a * a;
@@ -37,7 +69,7 @@ float compute_cosine_similarity(const std::vector<data_value>& v1, const std::ve
     }

     if (squared_norm_a == 0 || squared_norm_b == 0) {
-        throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
+        return std::numeric_limits<float>::quiet_NaN();
     }

     // The cosine similarity is in the range [-1, 1].
@@ -46,12 +78,12 @@ float compute_cosine_similarity(const std::vector<data_value>& v1, const std::ve
     return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
 }

-float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
+float compute_euclidean_similarity(std::span<const float> v1, std::span<const float> v2) {
     double sum = 0.0;

     for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        double a = v1[i];
+        double b = v2[i];

         double diff = a - b;
         sum += diff * diff;
@@ -65,12 +97,12 @@ float compute_euclidean_similarity(const std::vector<data_value>& v1, const std:

 // Assumes that both vectors are L2-normalized.
 // This similarity is intended as an optimized way to perform cosine similarity calculation.
-float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
+float compute_dot_product_similarity(std::span<const float> v1, std::span<const float> v2) {
     double dot_product = 0.0;

     for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        double a = v1[i];
+        double b = v2[i];
         dot_product += a * b;
     }

@@ -136,13 +168,15 @@ bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters)
         return std::nullopt;
     }

-    const auto& type = arg_types()[0];
-    data_value v1 = type->deserialize(*parameters[0]);
-    data_value v2 = type->deserialize(*parameters[1]);
-    const auto& v1_elements = value_cast<std::vector<data_value>>(v1);
-    const auto& v2_elements = value_cast<std::vector<data_value>>(v2);
+    // Extract dimension from the vector type
+    const auto& type = static_cast<const vector_type_impl&>(*arg_types()[0]);
+    size_t dimension = type.get_dimension();

-    float result = SIMILARITY_FUNCTIONS.at(_name)(v1_elements, v2_elements);
+    // Optimized path: extract floats directly from bytes, bypassing data_value overhead
+    std::vector<float> v1 = detail::extract_float_vector(parameters[0], dimension);
+    std::vector<float> v2 = detail::extract_float_vector(parameters[1], dimension);
+
+    float result = SIMILARITY_FUNCTIONS.at(_name)(v1, v2);
     return float_type->decompose(result);
 }
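`extract_float_vector` reads the `vector<float, N>` wire format directly: N IEEE-754 floats, each serialized as a big-endian `uint32_t`. A self-contained sketch of the decode plus the scaled cosine formula used above (standalone code, not Scylla's `bytes_view`/`read_simple` machinery; needs C++23 for `std::byteswap`):

```cpp
#include <bit>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <span>
#include <vector>

// Decode N big-endian IEEE-754 floats: byte-swap to host order, then bit_cast.
std::vector<float> decode_big_endian_floats(std::span<const unsigned char> raw, size_t dimension) {
    std::vector<float> out;
    out.reserve(dimension);
    for (size_t i = 0; i < dimension; ++i) {
        uint32_t be;
        std::memcpy(&be, raw.data() + i * sizeof(uint32_t), sizeof(be));
        if constexpr (std::endian::native == std::endian::little) {
            be = std::byteswap(be); // network (big-endian) -> host order
        }
        out.push_back(std::bit_cast<float>(be));
    }
    return out;
}

// Same formula as the patched compute_cosine_similarity: result scaled to [0, 1].
float cosine_similarity(std::span<const float> a, std::span<const float> b) {
    double dot = 0, na = 0, nb = 0;
    for (size_t i = 0; i < a.size(); ++i) {
        dot += double(a[i]) * b[i];
        na += double(a[i]) * a[i];
        nb += double(b[i]) * b[i];
    }
    return (1 + dot / std::sqrt(na * nb)) / 2;
}

int main() {
    // Round-trip: encode {1.5f, -2.0f} big-endian, decode, self-similarity is exactly 1.
    std::vector<unsigned char> wire;
    for (float f : {1.5f, -2.0f}) {
        uint32_t u = std::bit_cast<uint32_t>(f);
        if constexpr (std::endian::native == std::endian::little) u = std::byteswap(u);
        unsigned char b[4];
        std::memcpy(b, &u, 4);
        wire.insert(wire.end(), b, b + 4);
    }
    auto v = decode_big_endian_floats(wire, 2);
    return cosine_similarity(v, v) == 1.0f ? 0 : 1;
}
```

Note also the behavioral change above: all-zero vectors now yield NaN instead of raising an invalid-request error, matching the comment about compatibility with Cassandra's (jVector's) formulas.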
@@ -11,6 +11,7 @@
 #include "native_scalar_function.hh"
 #include "cql3/assignment_testable.hh"
 #include "cql3/functions/function_name.hh"
+#include <span>

 namespace cql3 {
 namespace functions {
@@ -19,7 +20,7 @@ static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::nati
 static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
 static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");

-using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
+using similarity_function_t = float (*)(std::span<const float>, std::span<const float>);
 extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;

 std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
@@ -33,5 +34,14 @@ public:
     virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
 };

+namespace detail {
+
+// Extract float vector directly from serialized bytes, bypassing data_value overhead.
+// This is an internal API exposed for testing purposes.
+// Vector<float, N> wire format: N floats as big-endian uint32_t values, 4 bytes each.
+std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension);
+
+} // namespace detail
+
 } // namespace functions
 } // namespace cql3
@@ -105,6 +105,7 @@ public:
     static const std::chrono::minutes entry_expiry;

     using key_type = prepared_cache_key_type;
+    using pinned_value_type = cache_value_ptr;
     using value_type = checked_weak_ptr;
     using statement_is_too_big = typename cache_type::entry_is_too_big;

@@ -116,9 +117,14 @@ public:
         : _cache(size, entry_expiry, logger)
     {}

+    template <typename LoadFunc>
+    future<pinned_value_type> get_pinned(const key_type& key, LoadFunc&& load) {
+        return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); });
+    }
+
     template <typename LoadFunc>
     future<value_type> get(const key_type& key, LoadFunc&& load) {
-        return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
+        return get_pinned(key, std::forward<LoadFunc>(load)).then([] (cache_value_ptr v_ptr) {
             return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
         });
     }

@@ -697,7 +697,7 @@
 future<::shared_ptr<cql_transport::messages::result_message::prepared>>
 query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
     try {
         auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
-        auto prep_ptr = co_await _prepared_cache.get(key, [this, &query_string, &client_state, d] {
+        auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
             auto prepared = get_statement(query_string, client_state, d);
             prepared->calculate_metadata_id();
             auto bound_terms = prepared->statement->get_bound_terms();
@@ -711,13 +711,13 @@ query_processor::prepare(sstring query_string, const service::client_state& clie
             return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
         });

         const auto& warnings = prep_ptr->warnings;
-        const auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_ptr),
+        co_await utils::get_local_injector().inject(
+            "query_processor_prepare_wait_after_cache_get",
+            utils::wait_for_message(std::chrono::seconds(60)));
+
+        auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
             client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
         for (const auto& w : warnings) {
             msg->add_warning(w);
         }
-        co_return ::shared_ptr<cql_transport::messages::result_message::prepared>(std::move(msg));
+        co_return std::move(msg);
     } catch(typename prepared_statements_cache::statement_is_too_big&) {
         throw prepared_statement_is_too_big(query_string);
     }
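The cache now distinguishes a pinned handle (`get_pinned`, returning the owning `cache_value_ptr`) from the weak handle `get` hands out, and `prepare` holds the pinned handle across the injectable wait, so an eviction during that suspension cannot invalidate the entry. A toy sketch of the distinction, with `shared_ptr`/`weak_ptr` standing in for Scylla's pointer types (an assumption, for illustration only):

```cpp
#include <memory>
#include <string>
#include <unordered_map>

struct entry { std::string statement; };

class toy_cache {
    std::unordered_map<std::string, std::shared_ptr<entry>> _map;
public:
    // Pinned: owning handle; eviction cannot destroy the entry while it is held.
    std::shared_ptr<entry> get_pinned(const std::string& key) { return _map.at(key); }
    // Weak: observing handle; must be re-validated (lock()) before every use.
    std::weak_ptr<entry> get(const std::string& key) { return _map.at(key); }
    void evict(const std::string& key) { _map.erase(key); }
};

int main() {
    toy_cache c;
    c.evict("q"); // no-op here; a pinned handle obtained earlier would survive eviction,
                  // while a weak handle would return nullptr from lock().
}
```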
cql3/query_result_printer.hh (new file, +20 lines)
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <ostream>
+
+namespace cql3 {
+
+class result;
+
+void print_query_results_text(std::ostream& os, const result& result);
+void print_query_results_json(std::ostream& os, const result& result);
+
+} // namespace cql3
@@ -9,8 +9,10 @@
 */

 #include <cstdint>
+#include "types/json_utils.hh"
 #include "utils/assert.hh"
 #include "utils/hashers.hh"
+#include "utils/rjson.hh"
 #include "cql3/result_set.hh"

 namespace cql3 {
@@ -195,4 +197,85 @@
 make_empty_metadata() {
     return empty_metadata_cache;
 }

+void print_query_results_text(std::ostream& os, const cql3::result& result) {
+    const auto& metadata = result.get_metadata();
+    const auto& column_metadata = metadata.get_names();
+
+    struct column_values {
+        size_t max_size{0};
+        sstring header_format;
+        sstring row_format;
+        std::vector<sstring> values;
+
+        void add(sstring value) {
+            max_size = std::max(max_size, value.size());
+            values.push_back(std::move(value));
+        }
+    };
+
+    std::vector<column_values> columns;
+    columns.resize(column_metadata.size());
+
+    for (size_t i = 0; i < column_metadata.size(); ++i) {
+        columns[i].add(column_metadata[i]->name->text());
+    }
+
+    for (const auto& row : result.result_set().rows()) {
+        for (size_t i = 0; i < row.size(); ++i) {
+            if (row[i]) {
+                columns[i].add(column_metadata[i]->type->to_string(linearized(managed_bytes_view(*row[i]))));
+            } else {
+                columns[i].add("");
+            }
+        }
+    }
+
+    std::vector<sstring> separators(columns.size(), sstring());
+    for (size_t i = 0; i < columns.size(); ++i) {
+        auto& col_values = columns[i];
+        col_values.header_format = seastar::format(" {{:<{}}} ", col_values.max_size);
+        col_values.row_format = seastar::format(" {{:>{}}} ", col_values.max_size);
+        for (size_t c = 0; c < col_values.max_size; ++c) {
+            separators[i] += "-";
+        }
+    }
+
+    for (size_t r = 0; r < result.result_set().rows().size() + 1; ++r) {
+        std::vector<sstring> row;
+        row.reserve(columns.size());
+        for (size_t i = 0; i < columns.size(); ++i) {
+            const auto& format = r == 0 ? columns[i].header_format : columns[i].row_format;
+            row.push_back(fmt::format(fmt::runtime(std::string_view(format)), columns[i].values[r]));
+        }
+        fmt::print(os, "{}\n", fmt::join(row, "|"));
+        if (!r) {
+            fmt::print(os, "-{}-\n", fmt::join(separators, "-+-"));
+        }
+    }
+}
+
+void print_query_results_json(std::ostream& os, const cql3::result& result) {
+    const auto& metadata = result.get_metadata();
+    const auto& column_metadata = metadata.get_names();
+
+    rjson::streaming_writer writer(os);
+
+    writer.StartArray();
+    for (const auto& row : result.result_set().rows()) {
+        writer.StartObject();
+        for (size_t i = 0; i < row.size(); ++i) {
+            writer.Key(column_metadata[i]->name->text());
+            if (!row[i] || row[i]->empty()) {
+                writer.Null();
+                continue;
+            }
+            const auto value = to_json_string(*column_metadata[i]->type, *row[i]);
+            const auto type = to_json_type(*column_metadata[i]->type, *row[i]);
+            writer.RawValue(value, type);
+        }
+        writer.EndObject();
+    }
+    writer.EndArray();
+}
+
 }
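For a hypothetical two-column result with rows (`alice`, 10) and (`bob`, 7), the text printer above produces a cqlsh-style table: headers left-aligned, values right-aligned, each column sized to its widest cell, and a dashed separator joined with `-+-` after the header row:

```text
 name  | value 
-------+-------
 alice |    10 
   bob |     7 
```

The JSON printer streams the same rows as an array of objects, emitting `null` for absent or empty cells.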
@@ -23,6 +23,7 @@
 #include "index/vector_index.hh"
 #include "schema/schema.hh"
 #include "service/client_state.hh"
+#include "service/paxos/paxos_state.hh"
 #include "types/types.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/cql_statement.hh"
@@ -329,6 +330,19 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
             "*/",
             *table_desc.create_statement);

         table_desc.create_statement = std::move(os).to_managed_string();
+    } else if (service::paxos::paxos_store::try_get_base_table(name)) {
+        // Paxos state table is internally managed by Scylla and it shouldn't be exposed to the user.
+        // The table is allowed to be described as a comment to ease administrative work but it's hidden from all listings.
+        fragmented_ostringstream os{};
+
+        fmt::format_to(os.to_iter(),
+            "/* Do NOT execute this statement! It's only for informational purposes.\n"
+            "   A paxos state table is created automatically when enabling LWT on a base table.\n"
+            "\n{}\n"
+            "*/",
+            *table_desc.create_statement);
+
+        table_desc.create_statement = std::move(os).to_managed_string();
     }
     result.push_back(std::move(table_desc));
@@ -364,7 +378,7 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
 future<std::vector<description>> tables(const data_dictionary::database& db, const lw_shared_ptr<keyspace_metadata>& ks, std::optional<bool> with_internals = std::nullopt) {
     auto& replica_db = db.real_database();
     auto tables = ks->tables() | std::views::filter([&replica_db] (const schema_ptr& s) {
-        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name());
+        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name()) && !service::paxos::paxos_store::try_get_base_table(s->cf_name());
     }) | std::ranges::to<std::vector<schema_ptr>>();
     std::ranges::sort(tables, std::ranges::less(), std::mem_fn(&schema::cf_name));
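Net effect on DESCRIBE: paxos state tables are now filtered out of keyspace-wide listings just like CDC log tables, while explicitly describing one still succeeds and wraps the CREATE statement in a `/* Do NOT execute this statement! ... */` comment, mirroring the handling directly above it.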
@@ -55,8 +55,21 @@ int32_t batchlog_shard_of(db_clock::time_point written_at) {
     return hash & ((1ULL << batchlog_shard_bits) - 1);
 }

+bool is_batchlog_v1(const schema& schema) {
+    return schema.cf_name() == system_keyspace::BATCHLOG;
+}
+
 std::pair<partition_key, clustering_key>
 get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, int32_t batchlog_shard, db_clock::time_point written_at, std::optional<utils::UUID> id) {
+    if (is_batchlog_v1(schema)) {
+        if (!id) {
+            on_internal_error(blogger, "get_batchlog_key(): key for batchlog v1 requires batchlog id");
+        }
+        auto pkey = partition_key::from_single_value(schema, {serialized(*id)});
+        auto ckey = clustering_key::make_empty();
+        return std::pair(std::move(pkey), std::move(ckey));
+    }
+
     auto pkey = partition_key::from_exploded(schema, {serialized(version), serialized(int8_t(stage)), serialized(batchlog_shard)});

     std::vector<bytes> ckey_components;
@@ -85,6 +98,14 @@ mutation get_batchlog_mutation_for(schema_ptr schema, managed_bytes data, int32_
     auto cdef_data = schema->get_column_definition(to_bytes("data"));
     m.set_cell(ckey, *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));

+    if (is_batchlog_v1(*schema)) {
+        auto cdef_version = schema->get_column_definition(to_bytes("version"));
+        m.set_cell(ckey, *cdef_version, atomic_cell::make_live(*cdef_version->type, timestamp, serialized(version)));
+
+        auto cdef_written_at = schema->get_column_definition(to_bytes("written_at"));
+        m.set_cell(ckey, *cdef_written_at, atomic_cell::make_live(*cdef_written_at->type, timestamp, serialized(now)));
+    }
+
     return m;
 }
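The v2 batchlog spreads batches across `2^batchlog_shard_bits` partitions keyed by `(version, stage, shard)`, with the shard derived from a hash of `written_at`; v1 keeps one partition per batch id. A sketch of the shard computation (the hash function and `batchlog_shard_bits = 4` are illustrative assumptions; only the masking expression comes from the diff):

```cpp
#include <chrono>
#include <cstdint>
#include <functional>
#include <iostream>

constexpr unsigned batchlog_shard_bits = 4; // assumption: 16 shards for the example

int32_t batchlog_shard_of(std::chrono::milliseconds written_at) {
    // Hash the write timestamp, then mask down to the shard range, as in the diff.
    uint64_t hash = std::hash<int64_t>{}(written_at.count());
    return hash & ((1ULL << batchlog_shard_bits) - 1);
}

int main() {
    using namespace std::chrono;
    auto now = duration_cast<milliseconds>(system_clock::now().time_since_epoch());
    std::cout << "batch written at " << now.count()
              << " ms goes to shard " << batchlog_shard_of(now) << '\n';
}
```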
@@ -122,9 +143,10 @@ mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clo
|
||||
const std::chrono::seconds db::batchlog_manager::replay_interval;
|
||||
const uint32_t db::batchlog_manager::page_size;
|
||||
|
||||
db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
|
||||
db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config)
|
||||
: _qp(qp)
|
||||
, _sys_ks(sys_ks)
|
||||
, _fs(fs)
|
||||
, _replay_timeout(config.replay_timeout)
|
||||
, _replay_rate(config.replay_rate)
|
||||
, _delay(config.delay)
|
||||
@@ -300,23 +322,156 @@ future<> db::batchlog_manager::maybe_migrate_v1_to_v2() {
|
||||
});
|
||||
}
|
||||
|
||||
future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
|
||||
co_await maybe_migrate_v1_to_v2();
|
||||
namespace {
|
||||
|
||||
typedef db_clock::rep clock_type;
|
||||
using clock_type = db_clock::rep;
|
||||
|
||||
struct replay_stats {
|
||||
std::optional<db_clock::time_point> min_too_fresh;
|
||||
bool need_cleanup = false;
|
||||
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
static future<db::all_batches_replayed> process_batch(
|
||||
cql3::query_processor& qp,
|
||||
db::batchlog_manager::stats& stats,
|
||||
db::batchlog_manager::post_replay_cleanup cleanup,
|
||||
utils::rate_limiter& limiter,
|
||||
schema_ptr schema,
|
||||
std::unordered_map<int32_t, replay_stats>& replay_stats_per_shard,
|
||||
const db_clock::time_point now,
|
||||
db_clock::duration replay_timeout,
|
||||
std::chrono::seconds write_timeout,
|
||||
const cql3::untyped_result_set::row& row) {
|
||||
const bool is_v1 = db::is_batchlog_v1(*schema);
|
||||
const auto stage = is_v1 ? db::batchlog_stage::initial : static_cast<db::batchlog_stage>(row.get_as<int8_t>("stage"));
|
||||
const auto batch_shard = is_v1 ? 0 : row.get_as<int32_t>("shard");
|
||||
auto written_at = row.get_as<db_clock::time_point>("written_at");
|
||||
auto id = row.get_as<utils::UUID>("id");
|
||||
// enough time for the actual write + batchlog entry mutation delivery (two separate requests).
|
||||
auto timeout = replay_timeout;
|
||||
|
||||
if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
|
||||
blogger.debug("Skipping batch replay due to skip_batch_replay injection");
|
||||
co_return db::all_batches_replayed::no;
|
||||
}
|
||||
|
||||
auto data = row.get_blob_unfragmented("data");
|
||||
|
||||
blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
|
||||
|
||||
utils::chunked_vector<mutation> mutations;
|
||||
bool send_failed = false;
|
||||
|
||||
auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
|
||||
|
||||
try {
|
||||
utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
|
||||
auto in = ser::as_input_stream(data);
|
||||
while (in.size()) {
|
||||
auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
|
||||
const auto tbl = qp.db().try_find_table(fm.column_family_id());
|
||||
if (!tbl) {
|
||||
continue;
|
||||
}
|
||||
if (written_at <= tbl->get_truncation_time()) {
|
||||
continue;
|
||||
}
|
||||
schema_ptr s = tbl->schema();
|
||||
if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
|
||||
timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
|
||||
}
|
||||
fms.emplace_back(std::move(fm), std::move(s));
|
||||
}
|
||||
|
||||
if (now < written_at + timeout) {
|
||||
blogger.debug("Skipping replay of {}, too fresh", id);
|
||||
|
||||
shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
|
||||
|
||||
co_return db::all_batches_replayed::no;
|
||||
}
|
||||
|
||||
auto size = data.size();
|
||||
|
||||
for (const auto& [fm, s] : fms) {
|
||||
mutations.emplace_back(fm.to_mutation(s));
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
|
||||
if (!mutations.empty()) {
|
||||
const auto ttl = [written_at]() -> clock_type {
|
||||
/*
|
||||
* Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
|
||||
* This ensures that deletes aren't "undone" by an old batch replay.
|
||||
*/
|
||||
auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
|
||||
warn(unimplemented::cause::HINT);
|
||||
#if 0
|
||||
for (auto& m : *mutations) {
|
||||
unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
|
||||
}
|
||||
#endif
|
||||
return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
|
||||
}();
|
||||
|
||||
if (ttl > 0) {
|
||||
// Origin does the send manually, however I can't see a super great reason to do so.
|
||||
// Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
|
||||
// in both cases.
|
||||
// FIXME: verify that the above is reasonably true.
|
||||
co_await limiter.reserve(size);
|
||||
stats.write_attempts += mutations.size();
|
||||
auto timeout = db::timeout_clock::now() + write_timeout;
|
||||
if (cleanup) {
|
||||
co_await qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
|
||||
} else {
|
||||
co_await qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (data_dictionary::no_such_keyspace& ex) {
|
||||
// should probably ignore and drop the batch
|
||||
} catch (const data_dictionary::no_such_column_family&) {
|
||||
// As above -- we should drop the batch if the table doesn't exist anymore.
|
||||
} catch (...) {
|
||||
blogger.warn("Replay failed (will retry): {}", std::current_exception());
|
||||
// timeout, overload etc.
|
||||
// Do _not_ remove the batch, assuning we got a node write error.
|
||||
// Since we don't have hints (which origin is satisfied with),
|
||||
// we have to resort to keeping this batch to next lap.
|
||||
if (is_v1 || !cleanup || stage == db::batchlog_stage::failed_replay) {
|
||||
co_return db::all_batches_replayed::no;
|
||||
}
|
||||
send_failed = true;
|
||||
}
|
||||
|
||||
auto& sp = qp.proxy();
|
||||
|
||||
if (send_failed) {
|
||||
blogger.debug("Moving batch {} to stage failed_replay", id);
|
||||
auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, db::batchlog_stage::failed_replay, written_at, id);
|
||||
co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
|
||||
}
|
||||
|
||||
// delete batch
|
||||
auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
|
||||
co_await qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
|
||||
|
||||
shard_written_at.need_cleanup = true;
|
||||
|
||||
co_return db::all_batches_replayed(!send_failed);
|
||||
}
|
||||
|
||||
future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v1(post_replay_cleanup) {
|
||||
db::all_batches_replayed all_replayed = all_batches_replayed::yes;
|
||||
// rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
|
||||
// max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
|
||||
auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
|
||||
auto limiter = make_lw_shared<utils::rate_limiter>(throttle);
|
||||
utils::rate_limiter limiter(throttle);
|
||||
|
||||
auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
|
||||
|
||||
struct replay_stats {
|
||||
std::optional<db_clock::time_point> min_too_fresh;
|
||||
bool need_cleanup = false;
|
||||
};
|
||||
auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
|
||||
|
||||
std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
|
||||
|
||||
@@ -324,125 +479,49 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
|
||||
// same across a while prefix of written_at (across all ids).
|
||||
const auto now = db_clock::now();
|
||||
|
||||
auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
|
||||
const auto stage = static_cast<batchlog_stage>(row.get_as<int8_t>("stage"));
|
||||
const auto batch_shard = row.get_as<int32_t>("shard");
|
||||
auto written_at = row.get_as<db_clock::time_point>("written_at");
|
||||
auto id = row.get_as<utils::UUID>("id");
|
||||
// enough time for the actual write + batchlog entry mutation delivery (two separate requests).
|
||||
auto timeout = _replay_timeout;
|
||||
auto batch = [this, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
|
||||
all_replayed = all_replayed && co_await process_batch(_qp, _stats, post_replay_cleanup::no, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
|
||||
co_return stop_iteration::no;
|
||||
};
|
||||
|
||||
if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
|
||||
blogger.debug("Skipping batch replay due to skip_batch_replay injection");
|
||||
all_replayed = all_batches_replayed::no;
|
||||
co_return stop_iteration::no;
|
||||
}
|
||||
co_await with_gate(_gate, [this, &all_replayed, batch = std::move(batch)] () mutable -> future<> {
|
||||
blogger.debug("Started replayAllFailedBatches");
|
||||
co_await utils::get_local_injector().inject("add_delay_to_batch_replay", std::chrono::milliseconds(1000));
|
||||
|
||||
auto data = row.get_blob_unfragmented("data");
|
||||
auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
|
||||
|
||||
blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
|
||||
co_await _qp.query_internal(
|
||||
format("SELECT * FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
|
||||
db::consistency_level::ONE,
|
||||
{},
|
||||
page_size,
|
||||
batch);
|
||||
|
||||
utils::chunked_vector<mutation> mutations;
|
||||
bool send_failed = false;
|
||||
blogger.debug("Finished replayAllFailedBatches with all_replayed: {}", all_replayed);
|
||||
});
|
||||
|
||||
auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
|
||||
co_return all_replayed;
|
||||
}
|
||||
|
||||
try {
|
||||
utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
|
||||
auto in = ser::as_input_stream(data);
|
||||
while (in.size()) {
|
||||
auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
|
||||
const auto tbl = _qp.db().try_find_table(fm.column_family_id());
|
||||
if (!tbl) {
|
||||
continue;
|
||||
}
|
||||
if (written_at <= tbl->get_truncation_time()) {
|
||||
continue;
|
||||
}
|
||||
schema_ptr s = tbl->schema();
|
||||
if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
|
||||
timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
|
||||
}
|
||||
fms.emplace_back(std::move(fm), std::move(s));
|
||||
}
|
||||
future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v2(post_replay_cleanup cleanup) {
|
||||
co_await maybe_migrate_v1_to_v2();
|
||||
|
||||
if (now < written_at + timeout) {
|
||||
blogger.debug("Skipping replay of {}, too fresh", id);
|
||||
db::all_batches_replayed all_replayed = all_batches_replayed::yes;
|
||||
// rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
|
||||
// max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
|
||||
auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
|
||||
utils::rate_limiter limiter(throttle);
|
||||
|
||||
shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
|
||||
auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
|
||||
|
||||
co_return stop_iteration::no;
|
||||
}
|
||||
std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
|
||||
|
||||
auto size = data.size();
|
||||
|
||||
for (const auto& [fm, s] : fms) {
|
||||
mutations.emplace_back(fm.to_mutation(s));
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
|
||||
if (!mutations.empty()) {
|
||||
const auto ttl = [written_at]() -> clock_type {
|
||||
/*
|
||||
* Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
|
||||
* This ensures that deletes aren't "undone" by an old batch replay.
|
||||
*/
|
||||
auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
|
||||
            warn(unimplemented::cause::HINT);
#if 0
            for (auto& m : *mutations) {
                unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
            }
#endif
            return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
        }();

        if (ttl > 0) {
            // Origin does the send manually, however I can't see a super great reason to do so.
            // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
            // in both cases.
            // FIXME: verify that the above is reasonably true.
            co_await limiter->reserve(size);
            _stats.write_attempts += mutations.size();
            auto timeout = db::timeout_clock::now() + write_timeout;
            if (cleanup) {
                co_await _qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
            } else {
                co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
            }
        }
    }
} catch (data_dictionary::no_such_keyspace& ex) {
    // should probably ignore and drop the batch
} catch (const data_dictionary::no_such_column_family&) {
    // As above -- we should drop the batch if the table doesn't exist anymore.
} catch (...) {
    blogger.warn("Replay failed (will retry): {}", std::current_exception());
    all_replayed = all_batches_replayed::no;
    // timeout, overload etc.
    // Do _not_ remove the batch, assuming we got a node write error.
    // Since we don't have hints (which origin is satisfied with),
    // we have to resort to keeping this batch to next lap.
    if (!cleanup || stage == batchlog_stage::failed_replay) {
        co_return stop_iteration::no;
    }
    send_failed = true;
}

auto& sp = _qp.proxy();

if (send_failed) {
    blogger.debug("Moving batch {} to stage failed_replay", id);
    auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, batchlog_stage::failed_replay, written_at, id);
    co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
}

// delete batch
auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);

shard_written_at.need_cleanup = true;
// Use a stable `now` across all batches, so skip/replay decisions are the
// same across a whole prefix of written_at (across all ids).
const auto now = db_clock::now();

auto batch = [this, cleanup, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
    all_replayed = all_replayed && co_await process_batch(_qp, _stats, cleanup, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
    co_return stop_iteration::no;
};

@@ -501,3 +580,10 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches

    co_return all_replayed;
}

future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
    if (_fs.batchlog_v2) {
        return replay_all_failed_batches_v2(cleanup);
    }
    return replay_all_failed_batches_v1(cleanup);
}

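For intuition, the remaining-TTL expression near the top of this hunk is "original hint TTL minus time elapsed since the batch was written"; a batch is only re-sent while that difference is positive. A simplified, self-contained sketch of the same arithmetic (plain std::chrono in place of the gc_clock/db_clock types used above; the values are hypothetical):

// Simplified sketch of the replay-TTL check above -- not the real code path.
#include <chrono>
#include <iostream>

int main() {
    using namespace std::chrono;
    const seconds unadjusted_ttl{3600};                         // hypothetical hint TTL
    const auto written_at = system_clock::now() - minutes{10};  // batch written 10 minutes ago
    const auto elapsed = duration_cast<seconds>(system_clock::now() - written_at);
    const auto ttl = unadjusted_ttl - elapsed;                  // remaining time to live
    if (ttl > seconds::zero()) {
        std::cout << "replay batch, " << ttl.count() << "s left\n";
    } else {
        std::cout << "hint TTL elapsed, drop instead of replaying\n";
    }
}
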
@@ -27,6 +27,12 @@ class query_processor;

} // namespace cql3

namespace gms {

class feature_service;

} // namespace gms

namespace db {

class system_keyspace;
@@ -49,6 +55,11 @@ class batchlog_manager : public peering_sharded_service<batchlog_manager> {
public:
    using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;

    struct stats {
        uint64_t write_attempts = 0;
    };

private:
    static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
    static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
@@ -56,14 +67,13 @@ private:

    using clock_type = lowres_clock;

    struct stats {
        uint64_t write_attempts = 0;
    } _stats;
    stats _stats;

    seastar::metrics::metric_groups _metrics;

    cql3::query_processor& _qp;
    db::system_keyspace& _sys_ks;
    gms::feature_service& _fs;
    db_clock::duration _replay_timeout;
    uint64_t _replay_rate;
    std::chrono::milliseconds _delay;
@@ -84,12 +94,14 @@ private:

    future<> maybe_migrate_v1_to_v2();

    future<all_batches_replayed> replay_all_failed_batches_v1(post_replay_cleanup cleanup);
    future<all_batches_replayed> replay_all_failed_batches_v2(post_replay_cleanup cleanup);
    future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
public:
    // Takes a QP, not a distributed one. Because this object is supposed
    // to be per shard and does no dispatching beyond delegating to the
    // shard qp (which is what you feed here).
    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, batchlog_manager_config config);
    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config);

    // abort the replay loop and return its future.
    future<> drain();
@@ -102,7 +114,7 @@ public:
        return _last_replay;
    }

    const stats& stats() const {
    const stats& get_stats() const {
        return _stats;
    }
private:

@@ -1986,13 +1986,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
            }
            continue;
        } catch (shutdown_marker&) {
            _reserve_segments.abort(std::current_exception());
            break;
        } catch (...) {
            clogger.warn("Exception in segment reservation: {}", std::current_exception());
        }
        co_await sleep(100ms);
    }
    _reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
}

future<std::vector<db::commitlog::descriptor>>

@@ -1291,7 +1291,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , ignore_dead_nodes_for_replace(this, "ignore_dead_nodes_for_replace", value_status::Used, "", "List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
    , override_decommission(this, "override_decommission", value_status::Deprecated, false, "Set true to force a decommissioned node to join the cluster (cannot be set if consistent-cluster-management is enabled).")
    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to enable repair based node operations instead of streaming based.")
    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild,bootstrap,decommission", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
    , enable_compacting_data_for_streaming_and_repair(this, "enable_compacting_data_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, true, "Enable the compacting reader, which compacts the data for streaming and repair (load'n'stream included) before sending it to, or synchronizing it with peers. Can reduce the amount of data to be processed by removing dead data, but adds CPU overhead.")
    , enable_tombstone_gc_for_streaming_and_repair(this, "enable_tombstone_gc_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, false,
        "If the compacting reader is enabled for streaming and repair (see enable_compacting_data_for_streaming_and_repair), allow it to garbage-collect tombstones."

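For reference, the options touched by this hunk map to scylla.yaml entries like the following (a sketch; the values shown are simply the defaults quoted in the descriptions above):

# Sketch only -- option names and defaults taken from the config.cc hunk above.
enable_repair_based_node_ops: true
allowed_repair_based_node_ops: "replace,removenode,rebuild"
enable_compacting_data_for_streaming_and_repair: true
enable_tombstone_gc_for_streaming_and_repair: false
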
@@ -29,6 +29,7 @@
#include "utils/assert.hh"
#include "utils/updateable_value.hh"
#include "utils/labels.hh"
#include "utils/chunked_vector.hh"

namespace cache {

@@ -1215,10 +1216,10 @@ future<> row_cache::invalidate(external_updater eu, const dht::decorated_key& dk
}

future<> row_cache::invalidate(external_updater eu, const dht::partition_range& range, cache_invalidation_filter filter) {
    return invalidate(std::move(eu), dht::partition_range_vector({range}), std::move(filter));
    return invalidate(std::move(eu), utils::chunked_vector<dht::partition_range>({range}), std::move(filter));
}

future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&& ranges, cache_invalidation_filter filter) {
future<> row_cache::invalidate(external_updater eu, utils::chunked_vector<dht::partition_range>&& ranges, cache_invalidation_filter filter) {
    return do_update(std::move(eu), [this, ranges = std::move(ranges), filter = std::move(filter)] mutable {
        return seastar::async([this, ranges = std::move(ranges), filter = std::move(filter)] {
            auto on_failure = defer([this] () noexcept {

@@ -17,6 +17,7 @@
#include "utils/histogram.hh"
#include "mutation/partition_version.hh"
#include "utils/double-decker.hh"
#include "utils/chunked_vector.hh"
#include "db/cache_tracker.hh"
#include "readers/empty.hh"
#include "readers/mutation_source.hh"
@@ -457,7 +458,7 @@ public:
    // mutation source made prior to the call to invalidate().
    future<> invalidate(external_updater, const dht::decorated_key&);
    future<> invalidate(external_updater, const dht::partition_range& = query::full_partition_range, cache_invalidation_filter filter = [] (const auto&) { return true; });
    future<> invalidate(external_updater, dht::partition_range_vector&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
    future<> invalidate(external_updater, utils::chunked_vector<dht::partition_range>&&, cache_invalidation_filter filter = [] (const auto&) { return true; });

    // Evicts entries from cache.
    //

@@ -1139,14 +1139,17 @@ future<> schema_applier::finalize_tables_and_views() {
    // was already dropped (see https://github.com/scylladb/scylla/issues/5614)
    for (auto& dropped_view : diff.tables_and_views.local().views.dropped) {
        auto s = dropped_view.get();
        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }
    for (auto& dropped_table : diff.tables_and_views.local().tables.dropped) {
        auto s = dropped_table.get();
        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }
    for (auto& dropped_cdc : diff.tables_and_views.local().cdc.dropped) {
        auto s = dropped_cdc.get();
        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }

@@ -105,7 +105,7 @@ namespace {
    schema_builder::register_schema_initializer([](schema_builder& builder) {
        if (builder.ks_name() == schema_tables::NAME) {
            // all schema tables are group0 tables
            builder.set_is_group0_table(true);
            builder.set_is_group0_table();
        }
    });
}

@@ -87,31 +87,15 @@ namespace {
        static const std::unordered_set<sstring> tables = {
            schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
            system_keyspace::BROADCAST_KV_STORE,
            system_keyspace::CDC_GENERATIONS_V3,
            system_keyspace::RAFT,
            system_keyspace::RAFT_SNAPSHOTS,
            system_keyspace::RAFT_SNAPSHOT_CONFIG,
            system_keyspace::GROUP0_HISTORY,
            system_keyspace::DISCOVERY,
            system_keyspace::TABLETS,
            system_keyspace::TOPOLOGY,
            system_keyspace::TOPOLOGY_REQUESTS,
            system_keyspace::LOCAL,
            system_keyspace::PEERS,
            system_keyspace::SCYLLA_LOCAL,
            system_keyspace::COMMITLOG_CLEANUPS,
            system_keyspace::SERVICE_LEVELS_V2,
            system_keyspace::VIEW_BUILD_STATUS_V2,
            system_keyspace::CDC_STREAMS_STATE,
            system_keyspace::CDC_STREAMS_HISTORY,
            system_keyspace::ROLES,
            system_keyspace::ROLE_MEMBERS,
            system_keyspace::ROLE_ATTRIBUTES,
            system_keyspace::ROLE_PERMISSIONS,
            system_keyspace::CDC_LOCAL,
            system_keyspace::DICTS,
            system_keyspace::VIEW_BUILDING_TASKS,
            system_keyspace::CLIENT_ROUTES,
        };
        if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
            builder.enable_schema_commitlog();
@@ -143,7 +127,7 @@ namespace {
            system_keyspace::REPAIR_TASKS,
        };
        if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
            builder.set_is_group0_table(true);
            builder.set_is_group0_table();
        }
    });
}
@@ -1714,7 +1698,9 @@ std::unordered_set<dht::token> decode_tokens(const set_type_impl::native_type& t
    std::unordered_set<dht::token> tset;
    for (auto& t: tokens) {
        auto str = value_cast<sstring>(t);
        SCYLLA_ASSERT(str == dht::token::from_sstring(str).to_sstring());
        if (str != dht::token::from_sstring(str).to_sstring()) {
            on_internal_error(slogger, format("decode_tokens: invalid token string '{}'", str));
        }
        tset.insert(dht::token::from_sstring(str));
    }
    return tset;
@@ -3191,7 +3177,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
            };
        }
    } else if (must_have_tokens(nstate)) {
        on_fatal_internal_error(slogger, format(
        on_internal_error(slogger, format(
            "load_topology_state: node {} in {} state but missing ring slice", host_id, nstate));
    }
}
@@ -3273,7 +3259,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
    // Currently, at most one node at a time can be in transitioning state.
    if (!map->empty()) {
        const auto& [other_id, other_rs] = *map->begin();
        on_fatal_internal_error(slogger, format(
        on_internal_error(slogger, format(
            "load_topology_state: found two nodes in transitioning state: {} in {} state and {} in {} state",
            other_id, other_rs.state, host_id, nstate));
    }
@@ -3331,8 +3317,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
        format("SELECT count(range_end) as cnt FROM {}.{} WHERE key = '{}' AND id = ?",
            NAME, CDC_GENERATIONS_V3, cdc::CDC_GENERATIONS_V3_KEY),
        gen_id.id);
    SCYLLA_ASSERT(gen_rows);
    if (gen_rows->empty()) {
    if (!gen_rows || gen_rows->empty()) {
        on_internal_error(slogger, format(
            "load_topology_state: last committed CDC generation time UUID ({}) present, but data missing", gen_id.id));
    }

@@ -215,6 +215,8 @@ public:
    static constexpr auto BUILT_VIEWS = "built_views";
    static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
    static constexpr auto CDC_LOCAL = "cdc_local";
    static constexpr auto CDC_TIMESTAMPS = "cdc_timestamps";
    static constexpr auto CDC_STREAMS = "cdc_streams";

    // auth
    static constexpr auto ROLES = "roles";

@@ -930,8 +930,7 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
    const row& existing_row = existing.cells();
    const row& updated_row = update.cells();

    const bool base_has_nonexpiring_marker = update.marker().is_live() && !update.marker().is_expiring();
    return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row, base_has_nonexpiring_marker] (const column_definition& cdef) {
    return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row] (const column_definition& cdef) {
        const auto view_it = _view->columns_by_name().find(cdef.name());
        const bool column_is_selected = view_it != _view->columns_by_name().end();

@@ -939,49 +938,29 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
        // as part of its PK, there are NO virtual columns corresponding to the unselected columns in the view.
        // Because of that, we don't generate view updates when the value in an unselected column is created
        // or changes.
        if (!column_is_selected && _base_info.has_base_non_pk_columns_in_view_pk) {
        if (!column_is_selected) {
            return true;
        }

        //TODO(sarna): Optimize collections case - currently they do not go under optimization
        if (!cdef.is_atomic()) {
            return false;
        }

        // We cannot skip if the value was created or deleted, unless we have a non-expiring marker
        // We cannot skip if the value was created or deleted
        const auto* existing_cell = existing_row.find_cell(cdef.id);
        const auto* updated_cell = updated_row.find_cell(cdef.id);
        if (existing_cell == nullptr || updated_cell == nullptr) {
            return existing_cell == updated_cell || (!column_is_selected && base_has_nonexpiring_marker);
            return existing_cell == updated_cell;
        }

        if (!cdef.is_atomic()) {
            return existing_cell->as_collection_mutation().data == updated_cell->as_collection_mutation().data;
        }

        atomic_cell_view existing_cell_view = existing_cell->as_atomic_cell(cdef);
        atomic_cell_view updated_cell_view = updated_cell->as_atomic_cell(cdef);

        // We cannot skip when a selected column is changed
        if (column_is_selected) {
            if (view_it->second->is_view_virtual()) {
                return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
            }
            return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
        if (view_it->second->is_view_virtual()) {
            return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
        }

        // With non-expiring row marker, liveness checks below are not relevant
        if (base_has_nonexpiring_marker) {
            return true;
        }

        if (existing_cell_view.is_live() != updated_cell_view.is_live()) {
            return false;
        }

        // We cannot skip if the change updates TTL
        const bool existing_has_ttl = existing_cell_view.is_live_and_has_ttl();
        const bool updated_has_ttl = updated_cell_view.is_live_and_has_ttl();
        if (existing_has_ttl || updated_has_ttl) {
            return existing_has_ttl == updated_has_ttl && existing_cell_view.expiry() == updated_cell_view.expiry();
        }

        return true;
        return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
    });
}

@@ -1749,7 +1728,7 @@ static endpoints_to_update get_view_natural_endpoint_vnodes(
    std::vector<std::reference_wrapper<const locator::node>> base_nodes,
    std::vector<std::reference_wrapper<const locator::node>> view_nodes,
    locator::endpoint_dc_rack my_location,
    const locator::network_topology_strategy* network_topology,
    const bool network_topology,
    replica::cf_stats& cf_stats) {
    using node_vector = std::vector<std::reference_wrapper<const locator::node>>;
    node_vector base_endpoints, view_endpoints;
@@ -1902,7 +1881,7 @@ endpoints_to_update get_view_natural_endpoint(
    locator::host_id me,
    const locator::effective_replication_map_ptr& base_erm,
    const locator::effective_replication_map_ptr& view_erm,
    const locator::abstract_replication_strategy& replication_strategy,
    const bool network_topology,
    const dht::token& base_token,
    const dht::token& view_token,
    bool use_tablets,
@@ -1910,7 +1889,6 @@ endpoints_to_update get_view_natural_endpoint(
    auto& topology = base_erm->get_token_metadata_ptr()->get_topology();
    auto& view_topology = view_erm->get_token_metadata_ptr()->get_topology();
    auto& my_location = topology.get_location(me);
    auto* network_topology = dynamic_cast<const locator::network_topology_strategy*>(&replication_strategy);

    auto resolve = [&] (const locator::topology& topology, const locator::host_id& ep, bool is_view) -> const locator::node& {
        if (auto* np = topology.find_node(ep)) {
@@ -1944,7 +1922,7 @@ endpoints_to_update get_view_natural_endpoint(
            // view pairing as the leaving base replica.
            // note that the recursive call will not recurse again because leaving_base is in base_nodes.
            auto leaving_base = it->get().host_id();
            return get_view_natural_endpoint(leaving_base, base_erm, view_erm, replication_strategy, base_token,
            return get_view_natural_endpoint(leaving_base, base_erm, view_erm, network_topology, base_token,
                view_token, use_tablets, cf_stats);
        }
    }
@@ -2040,7 +2018,9 @@ future<> view_update_generator::mutate_MV(
    wait_for_all_updates wait_for_all)
{
    auto& ks = _db.find_keyspace(base->ks_name());
    auto& replication = ks.get_replication_strategy();
    const bool uses_tablets = ks.uses_tablets();
    const bool uses_nts = dynamic_cast<const locator::network_topology_strategy*>(&ks.get_replication_strategy()) != nullptr;
    // The object pointed to by `ks` may disappear after preemption. It should not be touched again after this comment.
    std::unordered_map<table_id, locator::effective_replication_map_ptr> erms;
    auto get_erm = [&] (table_id id) {
        auto it = erms.find(id);
@@ -2059,8 +2039,8 @@ future<> view_update_generator::mutate_MV(
    co_await max_concurrent_for_each(view_updates, max_concurrent_updates, [&] (frozen_mutation_and_schema mut) mutable -> future<> {
        auto view_token = dht::get_token(*mut.s, mut.fm.key());
        auto view_ermp = erms.at(mut.s->id());
        auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, replication, base_token, view_token,
            ks.uses_tablets(), cf_stats);
        auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, uses_nts, base_token, view_token,
            uses_tablets, cf_stats);
        auto remote_endpoints = view_ermp->get_pending_replicas(view_token);
        auto memory_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_update_memory_units.split(memory_usage_of(mut)));
        if (no_pairing_endpoint) {

@@ -303,7 +303,7 @@ endpoints_to_update get_view_natural_endpoint(
    locator::host_id node,
    const locator::effective_replication_map_ptr& base_erm,
    const locator::effective_replication_map_ptr& view_erm,
    const locator::abstract_replication_strategy& replication_strategy,
    const bool network_topology,
    const dht::token& base_token,
    const dht::token& view_token,
    bool use_tablets,

@@ -1345,8 +1345,8 @@ public:

private:
    static schema_ptr build_schema() {
        auto id = generate_legacy_id(system_keyspace::NAME, "cdc_timestamps");
        return schema_builder(system_keyspace::NAME, "cdc_timestamps", std::make_optional(id))
        auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS);
        return schema_builder(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS, std::make_optional(id))
            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
            .with_column("table_name", utf8_type, column_kind::partition_key)
            .with_column("timestamp", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
@@ -1428,8 +1428,8 @@ public:
    }
private:
    static schema_ptr build_schema() {
        auto id = generate_legacy_id(system_keyspace::NAME, "cdc_streams");
        return schema_builder(system_keyspace::NAME, "cdc_streams", std::make_optional(id))
        auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_STREAMS);
        return schema_builder(system_keyspace::NAME, system_keyspace::CDC_STREAMS, std::make_optional(id))
            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
            .with_column("table_name", utf8_type, column_kind::partition_key)
            .with_column("timestamp", timestamp_type, column_kind::clustering_key)

@@ -352,6 +352,16 @@ dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& r
    return prs;
}

future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges) {
    utils::chunked_vector<dht::partition_range> prs;
    prs.reserve(ranges.size());
    for (auto& range : ranges) {
        prs.push_back(dht::to_partition_range(range));
        co_await coroutine::maybe_yield();
    }
    co_return prs;
}

std::map<unsigned, dht::partition_range_vector>
split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& raw_sharder) {
    std::map<unsigned, dht::partition_range_vector> ret;
@@ -364,11 +374,11 @@ split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& r
    return ret;
}

future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& source_ranges, dht::partition_range_vector ranges_to_subtract) {
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> source_ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract) {
    auto cmp = dht::ring_position_comparator(schema);
    // optimize set of potentially overlapping ranges by deoverlapping them.
    auto ranges = dht::partition_range::deoverlap(source_ranges, cmp);
    dht::partition_range_vector res;
    auto ranges = dht::partition_range::deoverlap(std::move(source_ranges), cmp);
    utils::chunked_vector<dht::partition_range> res;
    res.reserve(ranges.size() * 2);

    auto range = ranges.begin();

@@ -91,6 +91,7 @@ inline token get_token(const schema& s, partition_key_view key) {

dht::partition_range to_partition_range(dht::token_range);
dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& ranges, utils::can_yield can_yield = utils::can_yield::no);
future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges);

// Each shard gets a sorted, disjoint vector of ranges
std::map<unsigned, dht::partition_range_vector>
@@ -105,7 +106,7 @@ std::unique_ptr<dht::i_partitioner> make_partitioner(sstring name);
// Returns a sorted and deoverlapped list of ranges that are
// the result of subtracting all ranges from ranges_to_subtract.
// ranges_to_subtract must be sorted and deoverlapped.
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& ranges, dht::partition_range_vector ranges_to_subtract);
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract);

// Returns a token_range vector split based on the given number of most-significant bits
dht::token_range_vector split_token_range_msb(unsigned most_significant_bits);

dht/token.hh
@@ -30,6 +30,31 @@ enum class token_kind {
    after_all_keys,
};

// Represents a token for partition keys.
// Has a disengaged state, which sorts before all engaged states.
struct raw_token {
    int64_t value;

    /// Constructs a disengaged token.
    raw_token() : value(std::numeric_limits<int64_t>::min()) {}

    /// Constructs an engaged token.
    /// The token must be of token_kind::key kind.
    explicit raw_token(const token&);

    explicit raw_token(int64_t v) : value(v) {};

    std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
    std::strong_ordering operator<=>(const token& o) const noexcept;

    /// Returns true iff engaged.
    explicit operator bool() const noexcept {
        return value != std::numeric_limits<int64_t>::min();
    }
};

using raw_token_opt = seastar::optimized_optional<raw_token>;

class token {
    // INT64_MIN is not a legal token, but a special value used to represent
    // infinity in token intervals.
@@ -52,6 +77,10 @@ public:

    constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}

    token(raw_token raw) noexcept
        : token(raw ? kind::key : kind::before_all_keys, raw.value)
    { }

    // This constructor seems redundant with the bytes_view constructor, but
    // it's necessary for IDL, which passes a deserialized_bytes_proxy here.
    // (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
@@ -223,6 +252,29 @@ public:
    }
};

inline
raw_token::raw_token(const token& t)
    : value(t.raw())
{
#ifdef DEBUG
    assert(t._kind == token::kind::key);
#endif
}

inline
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
    switch (o._kind) {
    case token::kind::after_all_keys:
        return std::strong_ordering::less;
    case token::kind::before_all_keys:
        // before_all_keys has a raw value set to the same raw value as a disengaged raw_token, and sorts before all keys.
        // So we can order them by just comparing raw values.
        [[fallthrough]];
    case token::kind::key:
        return value <=> o._data;
    }
}

inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
    if (l1 == l2) {
        return std::strong_ordering::equal;
@@ -329,6 +381,17 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
    }
};

template <>
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const dht::raw_token& t, FormatContext& ctx) const {
        if (!t) {
            return fmt::format_to(ctx.out(), "null");
        }
        return fmt::format_to(ctx.out(), "{}", t.value);
    }
};

namespace std {

template<>
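To illustrate the intended semantics of the new type, here is a minimal self-contained sketch (not the real dht::raw_token; just the disengaged-state behavior described above, with a hypothetical name):

// Minimal sketch mirroring raw_token: INT64_MIN marks the disengaged state,
// which converts to false and sorts before every engaged token.
#include <cassert>
#include <compare>
#include <cstdint>
#include <limits>

struct raw_token_sketch {
    int64_t value = std::numeric_limits<int64_t>::min(); // disengaged by default

    explicit operator bool() const noexcept {
        return value != std::numeric_limits<int64_t>::min();
    }
    auto operator<=>(const raw_token_sketch&) const noexcept = default;
};

int main() {
    raw_token_sketch disengaged;
    raw_token_sketch engaged{42};
    assert(!disengaged);            // disengaged converts to false
    assert(engaged);                // engaged converts to true
    assert(disengaged < engaged);   // disengaged sorts before all engaged tokens
}
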
dist/docker/redhat/build_docker.sh
@@ -97,7 +97,9 @@ bcp LICENSE-ScyllaDB-Source-Available.md /licenses/

run microdnf clean all
run microdnf --setopt=tsflags=nodocs -y update
run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip
run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip cpio
# Extract only systemctl binary from systemd package to avoid installing the whole systemd in the container.
run bash -rc "microdnf download systemd && rpm2cpio systemd-*.rpm | cpio -idmv ./usr/bin/systemctl && rm -rf systemd-*.rpm"
run curl -L --output /etc/yum.repos.d/scylla.repo ${repo_file_url}
run pip3 install --no-cache-dir --prefix /usr supervisor
run bash -ec "echo LANG=C.UTF-8 > /etc/locale.conf"
@@ -106,6 +108,8 @@ run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
run mkdir -p /var/log/scylla
run chown -R scylla:scylla /var/lib/scylla
run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --network-stack posix"/' /etc/sysconfig/scylla-server
# Cleanup packages not needed in the final image and clean package manager cache to reduce image size.
run bash -rc "microdnf remove -y cpio && microdnf clean all"

run mkdir -p /opt/scylladb/supervisor
run touch /opt/scylladb/SCYLLA-CONTAINER-FILE

@@ -5,6 +5,10 @@

/stable/kb/perftune-modes-sync.html: /stable/kb/index.html

# Remove the troubleshooting page relevant for Open Source only

/stable/troubleshooting/missing-dotmount-files.html: /troubleshooting/index.html

# Move the driver information to another project

/stable/using-scylla/drivers/index.html: https://docs.scylladb.com/stable/drivers/index.html

@@ -142,10 +142,6 @@ want modify a non-top-level attribute directly (e.g., a.b[3].c) need RMW:
Alternator implements such requests by reading the entire top-level
attribute a, modifying only a.b[3].c, and then writing back a.

Currently, Alternator doesn't use Tablets. That's because Alternator relies
on LWT (lightweight transactions), and LWT is not supported in keyspaces
with Tablets enabled.

```{eval-rst}
.. toctree::
   :maxdepth: 2

@@ -187,6 +187,23 @@ You can create a keyspace with tablets enabled with the ``tablets = {'enabled':
the keyspace schema with ``tablets = { 'enabled': false }`` or
``tablets = { 'enabled': true }``.

.. _keyspace-rf-rack-valid-to-enforce-rack-list:

Enforcing Rack-List Replication for Tablet Keyspaces
------------------------------------------------------------------

``rf_rack_valid_keyspaces`` is a legacy option that ensures that all keyspaces with tablets enabled are
:term:`RF-rack-valid <RF-rack-valid keyspace>`.

Requiring every tablet keyspace to use the rack list replication factor exclusively is enough to guarantee the keyspace is
:term:`RF-rack-valid <RF-rack-valid keyspace>`. It reduces restrictions and provides stronger guarantees compared
to the ``rf_rack_valid_keyspaces`` option.

To enforce rack list in tablet keyspaces, use the ``enforce_rack_list`` option. It can be set only if all tablet keyspaces use
rack list. To ensure that, follow the procedure for :ref:`conversion to rack list replication factor <conversion-to-rack-list-rf>`.
After that, restart all nodes in the cluster with ``enforce_rack_list`` enabled and ``rf_rack_valid_keyspaces`` disabled. Make
sure to avoid setting or updating the replication factor (with CREATE KEYSPACE or ALTER KEYSPACE) while nodes are being restarted.
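The restart step then amounts to a configuration change like the following (a sketch; the yaml spellings of the two options are assumed from the names used above):

.. code-block:: yaml

   # Assumed scylla.yaml fragment -- option names taken from the prose above.
   enforce_rack_list: true
   rf_rack_valid_keyspaces: false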

.. _tablets-limitations:

Limitations and Unsupported Features

@@ -200,8 +200,6 @@ for two cases. One is setting replication factor to 0, in which case the number
The other is when the numeric replication factor is equal to the current number of replicas
for a given datacenter, in which case the current rack list is preserved.

Altering from a numeric replication factor to a rack list is not supported yet.

Note that when ``ALTER`` ing keyspaces and supplying ``replication_factor``,
auto-expansion will only *add* new datacenters for safety, it will not alter
existing datacenters or remove any even if they are no longer in the cluster.
@@ -424,6 +422,21 @@ Altering from a rack list to a numeric replication factor is not supported.

Keyspaces which use rack lists are :term:`RF-rack-valid <RF-rack-valid keyspace>` if each rack in the rack list contains at least one node (excluding :doc:`zero-token nodes </architecture/zero-token-nodes>`).

.. _conversion-to-rack-list-rf:

Conversion to rack-list replication factor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To migrate a keyspace from a numeric replication factor to a rack-list replication factor, provide the rack-list replication factor explicitly in the ALTER KEYSPACE statement. The number of racks in the list must be equal to the numeric replication factor. The replication factor can be converted in any number of DCs at once. In a statement that converts the replication factor, no replication factor updates (increase or decrease) are allowed in any DC.

.. code-block:: cql

   CREATE KEYSPACE Excelsior
       WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 1} AND tablets = { 'enabled': true };

   ALTER KEYSPACE Excelsior
       WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : ['RAC1', 'RAC2', 'RAC3'], 'dc2' : ['RAC4']} AND tablets = { 'enabled': true };

.. _drop-keyspace-statement:

DROP KEYSPACE

@@ -282,6 +282,7 @@ For example::


Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
See :ref:`WHERE <where-clause>`.

For example::

@@ -289,10 +290,6 @@ For example::
   WHERE user_id = 'user123'
   ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;

The supported operations are equal relations (``=`` and ``IN``) with restrictions as in regular ``WHERE`` clauses. See :ref:`WHERE <where-clause>`.

Other filtering scenarios are currently not supported.

.. note::

   Vector indexes are supported in ScyllaDB Cloud only in clusters that have the Vector Search feature enabled.

@@ -140,17 +140,83 @@ Vector Index :label-note:`ScyllaDB Cloud`
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/>`_.

ScyllaDB supports creating vector indexes on tables, allowing queries on the table to use those indexes for efficient
similarity search on vector data.
similarity search on vector data. Vector indexes can be a global index for indexing vectors per table or a local
index for indexing vectors per partition.

The vector index is the only custom type index supported in ScyllaDB. It is created using
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. Example:
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. It is also possible to
add additional columns to the index for filtering the search results. The partition column
specified in the global vector index definition must be the vector column, and any subsequent
columns are treated as filtering columns. The local vector index requires that the partition key
of the base table is also the partition key of the index, and that the vector column is the first
of the remaining columns.

Example of a simple index:

.. code-block:: cql

   CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
   USING 'vector_index'
   WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};

The vector column (``embedding``) is indexed to enable similarity search using
a global vector index. Additional filtering can be performed on the primary key
columns of the base table.

Example of a global vector index with additional filtering:

.. code-block:: cql

   CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding, category, info)
   USING 'vector_index'
   WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};

The vector column (``embedding``) is indexed to enable similarity search using
a global index. Additional columns are added for filtering the search results.
The filtering is possible on ``category``, ``info`` and all primary key columns
of the base table.

Example of a local vector index:

.. code-block:: cql

   CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings ((id, created_at), embedding, category, info)
   USING 'vector_index'
   WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};

The vector column (``embedding``) is indexed for similarity search (a local
index) and additional columns are added for filtering the search results. The
filtering is possible on ``category``, ``info`` and all primary key columns of
the base table. The columns ``id`` and ``created_at`` must be the partition key
of the base table.

Vector indexes support additional filtering columns of native data types
(excluding counter and duration). The indexed column itself must be a vector
column, while the extra columns can be used to filter search results.

The supported types are:

* ``ascii``
* ``bigint``
* ``blob``
* ``boolean``
* ``date``
* ``decimal``
* ``double``
* ``float``
* ``inet``
* ``int``
* ``smallint``
* ``text``
* ``varchar``
* ``time``
* ``timestamp``
* ``timeuuid``
* ``tinyint``
* ``uuid``
* ``varint``


The following options are supported for vector indexes. All of them are optional.

+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+

docs/features/automatic-repair.rst
@@ -0,0 +1,23 @@
.. _automatic-repair:

Automatic Repair
================

Traditionally, launching :doc:`repairs </operating-scylla/procedures/maintenance/repair>` in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.

Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the :doc:`tablet table </architecture/tablets>` automatically.
Repairs are spread over time and among nodes and shards, to avoid load spikes or any adverse effects on user workloads.

To enable automatic repair, add this to the configuration (``scylla.yaml``):

.. code-block:: yaml

   auto_repair_enabled_default: true
   auto_repair_threshold_default_in_seconds: 86400

This will enable automatic repair for all tables with a repair period of 1 day. This configuration has to be set on each node, to an identical value.
More featureful configuration methods will be implemented in the future.

To disable, set ``auto_repair_enabled_default: false``.

Automatic repair relies on :doc:`Incremental Repair </features/incremental-repair>` and as such it only works with :doc:`tablet </architecture/tablets>` tables.
@@ -3,7 +3,7 @@
Incremental Repair
==================

ScyllaDB's standard repair process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
ScyllaDB's standard :doc:`repair </operating-scylla/procedures/maintenance/repair>` process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.

The core idea of incremental repair is to repair only the data that has been written or changed since the last repair was run. It intelligently skips data that has already been verified, dramatically reducing the time, I/O, and CPU resources required for the repair operation.

@@ -37,7 +37,12 @@ The available modes are:
* ``disabled``: Completely disables the incremental repair logic for the current operation. The repair behaves like a classic, non-incremental repair, and it does not read or update any incremental repair status markers.


The incremental_mode parameter can be specified using nodetool cluster repair, e.g., nodetool cluster repair --incremental-mode incremental. It can also be specified with the REST API, e.g., curl -X POST "http://127.0.0.1:10000/storage_service/tablets/repair?ks=ks1&table=tb1&tokens=all&incremental_mode=incremental"
The incremental_mode parameter can be specified using nodetool cluster repair, e.g., nodetool cluster repair --incremental-mode incremental.
It can also be specified with the REST API, e.g.:

.. code::

   curl -X POST "http://127.0.0.1:10000/storage_service/tablets/repair?ks=ks1&table=tb1&tokens=all&incremental_mode=incremental"

Benefits of Incremental Repair
------------------------------
@@ -46,6 +51,8 @@ Benefits of Incremental Repair
* **Reduced Resource Usage:** Consumes significantly less CPU, I/O, and network bandwidth compared to a full repair.
* **More Frequent Repairs:** The efficiency of incremental repair allows you to run it more frequently, ensuring a higher level of data consistency across your cluster at all times.

Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with :doc:`Automatic Repair </features/automatic-repair>`.

Notes
-----

@@ -17,6 +17,7 @@ This document highlights ScyllaDB's key data modeling features.
   Workload Prioritization </features/workload-prioritization>
   Backup and Restore </features/backup-and-restore>
   Incremental Repair </features/incremental-repair/>
   Automatic Repair </features/automatic-repair/>
   Vector Search </features/vector-search/>

.. panel-box::
@@ -44,5 +45,7 @@ This document highlights ScyllaDB's key data modeling features.
* :doc:`Incremental Repair </features/incremental-repair/>` provides a much more
  efficient and lightweight approach to maintaining data consistency by
  repairing only the data that has changed since the last repair.
* :doc:`Automatic Repair </features/automatic-repair/>` schedules and runs repairs
  directly in ScyllaDB, without external schedulers.
* :doc:`Vector Search in ScyllaDB </features/vector-search/>` enables
  similarity-based queries on vector embeddings.

@@ -10,7 +10,6 @@ Install ScyllaDB |CURRENT_VERSION|
   /getting-started/install-scylla/launch-on-azure
   /getting-started/installation-common/scylla-web-installer
   /getting-started/install-scylla/install-on-linux
   /getting-started/installation-common/install-jmx
   /getting-started/install-scylla/run-in-docker
   /getting-started/installation-common/unified-installer
   /getting-started/installation-common/air-gapped-install
@@ -36,7 +35,6 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa

* :doc:`Install ScyllaDB with Web Installer (recommended) </getting-started/installation-common/scylla-web-installer>`
* :doc:`Install ScyllaDB Linux Packages </getting-started/install-scylla/install-on-linux>`
* :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`
* :doc:`Install ScyllaDB Without root Privileges </getting-started/installation-common/unified-installer>`
* :doc:`Air-gapped Server Installation </getting-started/installation-common/air-gapped-install>`
* :doc:`ScyllaDB Developer Mode </getting-started/installation-common/dev-mod>`

@@ -4,9 +4,9 @@
.. |RHEL_EPEL_8| replace:: https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
.. |RHEL_EPEL_9| replace:: https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm

======================================
Install ScyllaDB Linux Packages
======================================
========================================================
Install ScyllaDB |CURRENT_VERSION| Linux Packages
========================================================

We recommend installing ScyllaDB using :doc:`ScyllaDB Web Installer for Linux </getting-started/installation-common/scylla-web-installer/>`,
a platform-agnostic installation script, to install ScyllaDB on any supported Linux platform.
@@ -46,13 +46,13 @@ Install ScyllaDB

   .. code-block:: console

      sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys a43e06657bac99e3
      sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --export --armor a43e06657bac99e3 | gpg --dearmor > /etc/apt/keyrings/scylladb.gpg
      sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys c503c686b007f39e
      sudo gpg --homedir /tmp --no-default-keyring --keyring /tmp/temp.gpg --export --armor c503c686b007f39e | gpg --dearmor > /etc/apt/keyrings/scylladb.gpg

   .. code-block:: console
      :substitutions:

      sudo wget -O /etc/apt/sources.list.d/scylla.list http://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
      sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|


#. Install ScyllaDB packages.
@@ -94,16 +94,6 @@ Install ScyllaDB

      apt-get install scylla{,-server,-kernel-conf,-node-exporter,-conf,-python3,-cqlsh}=2025.3.1-0.20250907.2bbf3cf669bb-1


#. (Ubuntu only) Set Java 11.

   .. code-block:: console

      sudo apt-get update
      sudo apt-get install -y openjdk-11-jre-headless
      sudo update-java-alternatives --jre-headless -s java-1.11.0-openjdk-amd64


.. group-tab:: Centos/RHEL

   #. Install the EPEL repository.
@@ -135,7 +125,7 @@ Install ScyllaDB
   .. code-block:: console
      :substitutions:

      sudo curl -o /etc/yum.repos.d/scylla.repo -L http://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
      sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|

   #. Install ScyllaDB packages.

@@ -143,27 +133,19 @@ Install ScyllaDB

      sudo yum install scylla

   Running the command installs the latest official version of ScyllaDB Open Source.
   Alternatively, you can to install a specific patch version:
   Running the command installs the latest official version of ScyllaDB.
   Alternatively, you can install a specific patch version:

   .. code-block:: console

      sudo yum install scylla-<your patch version>

   Example: The following example shows the command to install ScyllaDB 5.2.3.
   Example: The following example shows installing ScyllaDB 2025.3.1.

   .. code-block:: console
      :class: hide-copy-button

      sudo yum install scylla-5.2.3

(Optional) Install scylla-jmx
-------------------------------

scylla-jmx is an optional package and is not installed by default.
If you need JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.


      sudo yum install scylla-2025.3.1

.. include:: /getting-started/_common/setup-after-install.rst


@@ -1,6 +1,6 @@
==========================
Launch ScyllaDB on AWS
==========================
===============================================
Launch ScyllaDB |CURRENT_VERSION| on AWS
===============================================

This article will guide you through self-managed ScyllaDB deployment on AWS. For a fully-managed deployment of ScyllaDB
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.

@@ -1,6 +1,6 @@
==========================
Launch ScyllaDB on Azure
==========================
===============================================
Launch ScyllaDB |CURRENT_VERSION| on Azure
===============================================

This article will guide you through self-managed ScyllaDB deployment on Azure. For a fully-managed deployment of ScyllaDB
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.

@@ -1,6 +1,6 @@
==========================
Launch ScyllaDB on GCP
==========================
=============================================
Launch ScyllaDB |CURRENT_VERSION| on GCP
=============================================

This article will guide you through self-managed ScyllaDB deployment on GCP. For a fully-managed deployment of ScyllaDB
as-a-service, see `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/>`_.

@@ -1,78 +0,0 @@

======================================
Install scylla-jmx Package
======================================

scylla-jmx is an optional package and is not installed by default.
If you need JMX server, you can still install it from scylla-jmx GitHub page.

.. tabs::

   .. group-tab:: Debian/Ubuntu

      #. Download .deb package from scylla-jmx page.

         Access to https://github.com/scylladb/scylla-jmx, select latest
         release from "releases", download a file end with ".deb".

      #. (Optional) Transfer the downloaded package to the install node.

         If the pc from which you downloaded the package is different from
         the node where you install scylladb, you will need to transfer
         the files to the node.

      #. Install scylla-jmx package.

         .. code-block:: console

            sudo apt install -y ./scylla-jmx_<version>_all.deb


   .. group-tab:: Centos/RHEL

      #. Download .rpm package from scylla-jmx page.

         Access to https://github.com/scylladb/scylla-jmx, select latest
         release from "releases", download a file end with ".rpm".

      #. (Optional) Transfer the downloaded package to the install node.

         If the pc from which you downloaded the package is different from
         the node where you install scylladb, you will need to transfer
         the files to the node.

      #. Install scylla-jmx package.

         .. code-block:: console

            sudo yum install -y ./scylla-jmx-<version>.noarch.rpm


   .. group-tab:: Install without root privileges

      #. Download .tar.gz package from scylla-jmx page.

         Access to https://github.com/scylladb/scylla-jmx, select latest
         release from "releases", download a file end with ".tar.gz".

      #. (Optional) Transfer the downloaded package to the install node.

         If the pc from which you downloaded the package is different from
         the node where you install scylladb, you will need to transfer
         the files to the node.

      #. Install scylla-jmx package.

         .. code:: console

            tar xpf scylla-jmx-<version>.noarch.tar.gz
            cd scylla-jmx
            ./install.sh --nonroot

Next Steps
-----------

* :doc:`Configure ScyllaDB </getting-started/system-configuration>`
* Manage your clusters with `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_
* Monitor your cluster and data with `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_
* Get familiar with ScyllaDB’s :doc:`command line reference guide </operating-scylla/nodetool>`.
* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
@@ -36,11 +36,8 @@ release versions, run:
   curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases


Versions 2025.1 and Later
==============================

Run the command with the ``--scylla-version`` option to specify the version
you want to install.
To install a non-default version, run the command with the ``--scylla-version``
option to specify the version you want to install.

**Example**

@@ -50,20 +47,4 @@ you want to install.

   curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|


Versions Earlier than 2025.1
================================

To install a supported version of *ScyllaDB Enterprise*, run the command with:

* ``--scylla-product scylla-enterprise`` to specify that you want to install
  ScyllaDB Enterprise.
* ``--scylla-version`` to specify the version you want to install.

For example:

.. code:: console

   curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1


.. include:: /getting-started/_common/setup-after-install.rst
@@ -14,44 +14,35 @@ Prerequisites

Ensure your platform is supported by the ScyllaDB version you want to install.
See :doc:`OS Support </getting-started/os-support>` for information about supported Linux distributions and versions.

Note that if you're on CentOS 7, only root offline installation is supported.

Download and Install
-----------------------

#. Download the latest tar.gz file for the ScyllaDB version (x86 or ARM) from ``https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-<version>/``.

   Example for version 6.1: https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-6.1/
   **Example** for version 2025.1:

   - Go to https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-2025.1/
   - Download the ``scylla-unified`` file for the patch version you want to
     install. For example, to install 2025.1.9 (x86), download
     ``scylla-unified-2025.1.9-0.20251010.6c539463bbda.x86_64.tar.gz``.

#. Uncompress the downloaded package.

   The following example shows the package for ScyllaDB 6.1.1 (x86):
   **Example** for version 2025.1.9 (x86) (downloaded in the previous step):

   .. code:: console
   .. code::

      tar xvfz scylla-unified-6.1.1-0.20240814.8d90b817660a.x86_64.tar.gz
      tar xvfz scylla-unified-2025.1.9-0.20251010.6c539463bbda.x86_64.tar.gz

#. Install OpenJDK 8 or 11.

   The following example shows Java installation on a CentOS-like system:

   .. code:: console

      sudo yum install -y java-11-openjdk-headless

   For root offline installation on Debian-like systems, two additional packages, ``xfsprogs``
   and ``mdadm``, should be installed to be used in RAID setup.
#. (Root offline installation only) For root offline installation on Debian-like
   systems, two additional packages, ``xfsprogs`` and ``mdadm``, should be
   installed to be used in RAID setup.

#. Install ScyllaDB as a user with non-root privileges:

   .. code:: console

      ./install.sh --nonroot --python3 ~/scylladb/python3/bin/python3

#. (Optional) Install scylla-jmx

   scylla-jmx is an optional package and is not installed by default.
   If you need a JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.
      ./install.sh --nonroot

Configure and Run ScyllaDB
----------------------------
@@ -81,19 +72,14 @@ Run nodetool:

.. code:: console

   ~/scylladb/share/cassandra/bin/nodetool status
   ~/scylladb/bin/nodetool status

Run cqlsh:

.. code:: console

   ~/scylladb/share/cassandra/bin/cqlsh
   ~/scylladb/bin/cqlsh

Run cassandra-stress:

.. code:: console

   ~/scylladb/share/cassandra/bin/cassandra-stress write

.. note::

@@ -124,7 +110,7 @@ Nonroot install

   ./install.sh --upgrade --nonroot

.. note:: The installation script does not upgrade scylla-jmx and scylla-tools. You will have to upgrade them separately.
.. note:: The installation script does not upgrade scylla-tools. You will have to upgrade it separately.

Uninstall
===========

@@ -154,4 +140,4 @@ Next Steps

* Manage your clusters with `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_
* Monitor your cluster and data with `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_
* Get familiar with ScyllaDB’s :doc:`command line reference guide </operating-scylla/nodetool>`.
* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
@@ -25,4 +25,8 @@ For Example:

   nodetool rebuild <source-dc-name>

The ``nodetool rebuild`` command works only for vnode keyspaces. For tablet keyspaces, use ``nodetool cluster repair`` instead, as sketched below.

See :doc:`Data Distribution with Tablets </architecture/tablets/>`.
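
For instance, a minimal sketch of the two flavors (``<existing-dc-name>`` is a placeholder for your own datacenter name):

.. code:: console

   # vnode keyspaces: stream data from an existing DC
   nodetool rebuild <existing-dc-name>

   # tablet keyspaces: repair cluster-wide instead
   nodetool cluster repair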

.. include:: nodetool-index.rst
@@ -155,7 +155,6 @@ Add New DC

      UN  54.235.9.159   109.75 KB  256  ?  39798227-9f6f-4868-8193-08570856c09a  RACK1
      UN  54.146.228.25  128.33 KB  256  ?  7a4957a1-9590-4434-9746-9c8a6f796a0c  RACK1

.. TODO possibly provide additional information WRT how ALTER works with tablets

#. When all nodes are up and running, ``ALTER`` the following keyspaces in the new nodes:

@@ -171,26 +170,68 @@ Add New DC

      DESCRIBE KEYSPACE mykeyspace;

      CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3};
      CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3};

   ALTER Command

   .. code-block:: cql

      ALTER KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
      ALTER KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
      ALTER KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
      ALTER KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
      ALTER KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
      ALTER KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};

   After

   .. code-block:: cql

      DESCRIBE KEYSPACE mykeyspace;
      CREATE KEYSPACE mykeyspace WITH REPLICATION = {'class’: 'NetworkTopologyStrategy', <exiting_dc>:3, <new_dc>: 3};
      CREATE KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
      CREATE KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
      CREATE KEYSPACE mykeyspace WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
      CREATE KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
      CREATE KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};

#. Run ``nodetool rebuild`` on each node in the new datacenter, specify the existing datacenter name in the rebuild command.
   For tablet keyspaces, update the replication factor one by one:

   .. code-block:: cql

      DESCRIBE KEYSPACE mykeyspace2;

      CREATE KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3} AND tablets = { 'enabled': true };

   .. code-block:: cql

      ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 1} AND tablets = { 'enabled': true };
      ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 2} AND tablets = { 'enabled': true };
      ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3} AND tablets = { 'enabled': true };

   .. note::
      If the ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use a rack-list replication factor so that a new DC (rack) can be added. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to add a datacenter:

   Before

   .. code-block:: cql

      DESCRIBE KEYSPACE mykeyspace3;

      CREATE KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>']} AND tablets = { 'enabled': true };

   Add all the nodes to the new datacenter and then alter the keyspace one by one:

   .. code-block:: cql

      ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>']} AND tablets = { 'enabled': true };
      ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>']} AND tablets = { 'enabled': true };
      ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };

   After

   .. code-block:: cql

      DESCRIBE KEYSPACE mykeyspace3;
      CREATE KEYSPACE mykeyspace3 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };

   Consider :ref:`upgrading the rf_rack_valid_keyspaces option to the enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.

#. If any vnode keyspace was altered, run ``nodetool rebuild`` on each node in the new datacenter, specifying the existing datacenter name in the rebuild command.

   For example:

@@ -198,7 +239,7 @@ Add New DC

   The rebuild ensures that the new nodes that were just added to the cluster will recognize the existing datacenters in the cluster.

#. Run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_
#. If any vnode keyspace was altered, run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_

#. If you are using ScyllaDB Monitoring, update the `monitoring stack <https://monitoring.docs.scylladb.com/stable/install/monitoring_stack.html#configure-scylla-nodes-from-files>`_ to monitor it. If you are using ScyllaDB Manager, make sure you install the `Manager Agent <https://manager.docs.scylladb.com/stable/install-scylla-manager-agent.html>`_ and Manager can access the new DC.

@@ -0,0 +1,492 @@
=================================================
Cluster Platform Migration Using Node Cycling
=================================================

This procedure describes how to migrate a ScyllaDB cluster to new instance types
using the add-and-replace approach, which is commonly used for:

* Migrating from one CPU architecture to another (e.g., x86_64 to ARM/Graviton)
* Upgrading to newer instance types with better performance
* Changing instance families within the same cloud provider

The add-and-replace approach maintains data replication throughout the migration
and ensures zero downtime for client applications.

.. note::

   This procedure does **not** change the ScyllaDB software version. All nodes
   (both existing and new) must run the same ScyllaDB version. For software
   version upgrades, see :doc:`Upgrade </upgrade/index>`.

Overview
--------

The add-and-replace migration follows these steps:

#. Add new nodes (on target instance type) to the existing cluster
#. Wait for data to stream to the new nodes
#. Decommission old nodes (on source instance type)

This approach keeps the cluster operational throughout the migration while
maintaining the configured replication factor.

Key characteristics
===================

* **Zero downtime**: Client applications continue to operate during migration
* **Data safety**: Replication factor is maintained throughout the process
* **Flexible**: Works with both vnodes and tablets-enabled clusters
* **Multi-DC support**: Can migrate nodes across multiple datacenters

.. warning::

   Ensure your cluster has sufficient capacity during the migration. At the peak
   of the process, your cluster will temporarily have double the number of nodes.

Prerequisites
-------------

Check cluster health
====================

Before starting the migration, verify that your cluster is healthy:

#. Check that all nodes are in Up Normal (UN) status:

   .. code-block:: shell

      nodetool status

   All nodes should show ``UN`` status. Do not proceed if any nodes are down.

#. Ensure no streaming or repair operations are in progress:

   .. code-block:: shell

      nodetool netstats
      nodetool compactionstats

Plan the migration
==================

Before provisioning new instances, plan the following:

**Instance type mapping**: Identify the source and target instance types.
If your cluster uses vnodes (not tablets), consider that mismatched shard
counts between source and target instance types can cause slower repairs.
With tablets enabled, shard count mismatch is fully supported.

**Rack assignment planning**: Each new node must be assigned to the same rack
as the node it will replace. This maintains rack-aware topology for:

* Rack-aware replication (NetworkTopologyStrategy)
* Proper data distribution across failure domains
* Minimizing data movement during decommission

Example mapping for a 3-node cluster:

.. code-block:: none

   Source nodes (to be decommissioned):    Target nodes (to be added):
   192.168.1.10 - RACK0                 →  192.168.2.10 - RACK0
   192.168.1.11 - RACK1                 →  192.168.2.11 - RACK1
   192.168.1.12 - RACK2                 →  192.168.2.12 - RACK2

Create a backup
===============

Back up the data before starting the migration. One of the following
methods can be used:

* **ScyllaDB Manager** (recommended): Use ScyllaDB Manager to perform a
  cluster-wide backup. See the
  `ScyllaDB Manager documentation <https://manager.docs.scylladb.com/stable/backup/>`_
  for details.

* **Snapshots**: On each node in the cluster, create a snapshot:

  .. code-block:: shell

     nodetool snapshot -t pre_migration_backup
     nodetool listsnapshots

  .. note::

     Snapshots are local to each node and do not protect against node or disk
     failure. For full disaster recovery, use ScyllaDB Manager backup.


Procedure
---------

Adding new nodes
================

#. Provision new instances with the target instance type. Ensure:

   * The same ScyllaDB version as existing nodes
   * Same network configuration and security groups
   * Appropriate storage configuration

#. On each new node, configure ``/etc/scylla/scylla.yaml`` to join the existing
   cluster:

   * **cluster_name**: Must match the existing cluster name
   * **seeds**: IP address of an existing node in the cluster (used to discover cluster topology on join)
   * **endpoint_snitch**: Must match the existing cluster configuration
   * **listen_address**: IP address of the new node
   * **rpc_address**: IP address of the new node

   All other cluster-wide settings (tablets configuration, encryption settings,
   experimental features, etc.) must match the existing nodes, as sketched below.
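
   For illustration, a minimal sketch of these settings in ``scylla.yaml``. The
   cluster name and IP addresses are hypothetical placeholders, not values
   mandated by this procedure:

   .. code-block:: yaml

      # Hypothetical values - substitute your own cluster details
      cluster_name: 'prod-cluster'
      endpoint_snitch: GossipingPropertyFileSnitch
      listen_address: 192.168.2.10
      rpc_address: 192.168.2.10
      seed_provider:
          - class_name: org.apache.cassandra.locator.SimpleSeedProvider
            parameters:
                - seeds: "192.168.1.10"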

   .. caution::

      Make sure that the ScyllaDB version on the new node is identical to the
      version on the other nodes in the cluster. Running nodes with different
      versions is not supported.

#. If using ``GossipingPropertyFileSnitch``, configure
   ``/etc/scylla/cassandra-rackdc.properties`` with the correct datacenter
   and rack assignment for this node:

   .. code-block:: none

      dc = <datacenter-name>
      rack = <rack-name>
      prefer_local = true

   .. warning::

      Each node must have the correct rack assignment. Using the same rack for
      all new nodes breaks rack-aware replication topology.

#. Start ScyllaDB on the new node:

   .. code-block:: shell

      sudo systemctl start scylla-server

   For Docker deployments:

   .. code-block:: shell

      docker exec -it <container-name> supervisorctl start scylla

#. Monitor the bootstrap process from an existing node:

   .. code-block:: shell

      nodetool status

   The new node will appear with ``UJ`` (Up, Joining) status while streaming
   data from existing nodes. Wait until it transitions to ``UN`` (Up, Normal).

   **Example output during bootstrap:**

   .. code-block:: shell

      Datacenter: dc1
      Status=Up/Down
      State=Normal/Leaving/Joining/Moving
      --  Address       Load    Tokens  Owns   Host ID                               Rack
      UN  192.168.1.10  500 MB  256     33.3%  8d5ed9f4-7764-4dbd-bad8-43fddce94b7c  RACK0
      UN  192.168.1.11  500 MB  256     33.3%  125ed9f4-7777-1dbn-mac8-43fddce9123e  RACK1
      UN  192.168.1.12  500 MB  256     33.3%  675ed9f4-6564-6dbd-can8-43fddce952gy  RACK2
      UJ  192.168.2.10  250 MB  256     ?      a1b2c3d4-5678-90ab-cdef-112233445566  RACK0

   **Example output after bootstrap completes:**

   .. code-block:: shell

      Datacenter: dc1
      Status=Up/Down
      State=Normal/Leaving/Joining/Moving
      --  Address       Load    Tokens  Owns   Host ID                               Rack
      UN  192.168.1.10  400 MB  256     25.0%  8d5ed9f4-7764-4dbd-bad8-43fddce94b7c  RACK0
      UN  192.168.1.11  400 MB  256     25.0%  125ed9f4-7777-1dbn-mac8-43fddce9123e  RACK1
      UN  192.168.1.12  400 MB  256     25.0%  675ed9f4-6564-6dbd-can8-43fddce952gy  RACK2
      UN  192.168.2.10  400 MB  256     25.0%  a1b2c3d4-5678-90ab-cdef-112233445566  RACK0

#. For tablets-enabled clusters, wait for tablet load balancing to complete.
   After the node reaches ``UN`` status, verify no streaming is in progress:

   .. code-block:: shell

      nodetool netstats

   Wait until output shows "Not sending any streams" and no active receiving
   streams, roughly as sketched below.
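
   A rough sketch of what idle ``nodetool netstats`` output looks like
   (exact wording varies by version; treat this as illustrative only):

   .. code-block:: shell

      Mode: NORMAL
      Not sending any streams.
      Read Repair Statistics:
      Attempted: 0
      Mismatch (Blocking): 0
      Mismatch (Background): 0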

#. Repeat steps 1-6 for each new node to be added.

   .. note::

      You can add multiple nodes in parallel if they are in different datacenters.
      Within a single datacenter, add nodes one at a time for best results.


Updating seed node configuration
================================

If any of your original nodes are configured as seed nodes, you must update
the seed configuration before decommissioning them.

#. Check the current seed configuration on any node:

   .. code-block:: shell

      grep -A 4 "seed_provider" /etc/scylla/scylla.yaml

#. If the seeds include nodes you plan to decommission, update ``scylla.yaml``
   on **all new nodes** to use the new node IPs as seeds:

   .. code-block:: yaml

      seed_provider:
          - class_name: org.apache.cassandra.locator.SimpleSeedProvider
            parameters:
                - seeds: "192.168.2.10,192.168.2.11,192.168.2.12"

   .. note::

      Updating seed configuration on the **old nodes** (that will be
      decommissioned) is optional. Seeds are only used during node startup
      to discover the cluster. If you don't plan to restart the old nodes
      before decommissioning them, their seed configuration doesn't matter.
      However, updating all nodes is recommended for safety in case an old
      node unexpectedly restarts during the migration.

#. Restart ScyllaDB on each new node (one at a time) to apply the new seed
   configuration:

   .. code-block:: shell

      sudo systemctl restart scylla-server

   Wait for the node to fully start before restarting the next node.

#. After restarting the new nodes, verify the cluster is healthy:

   .. code-block:: shell

      nodetool status
      nodetool describecluster

.. warning::

   Complete this seed list update on **all new nodes** before decommissioning
   any old nodes. This ensures the new nodes can reform the cluster after
   the old nodes are removed.


Decommissioning old nodes
=========================

After all new nodes are added and healthy, decommission the old nodes one
at a time.

#. Verify all nodes are healthy before starting decommission:

   .. code-block:: shell

      nodetool status

   All nodes should show ``UN`` status.

#. On the node to be decommissioned, run:

   .. code-block:: shell

      nodetool decommission

   This command blocks until the decommission is complete. The node will
   stream its data to the remaining nodes.

#. Monitor the decommission progress from another node:

   .. code-block:: shell

      nodetool status

   The decommissioning node will transition from ``UN`` → ``UL`` (Up, Leaving)
   → removed from the cluster, roughly as sketched below.
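
   Sketch of the transitional ``nodetool status`` line for the leaving node
   (reusing the hypothetical addresses from the earlier examples):

   .. code-block:: shell

      --  Address       Load    Tokens  Owns  Host ID                               Rack
      UL  192.168.1.10  400 MB  256     ?     8d5ed9f4-7764-4dbd-bad8-43fddce94b7c  RACK0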

   You can also monitor streaming progress:

   .. code-block:: shell

      nodetool netstats

#. After decommission completes, verify the node is no longer in the cluster:

   .. code-block:: shell

      nodetool status

   The decommissioned node should no longer appear in the output.

#. Run ``nodetool cleanup`` on the remaining nodes to remove data that
   no longer belongs to them after the topology change:

   .. code-block:: shell

      nodetool cleanup

   .. note::

      ``nodetool cleanup`` can be resource-intensive. Run it on one node at a
      time during low-traffic periods.

#. Wait for the cluster to stabilize before decommissioning the next node.
   Ensure no streaming operations are in progress.

#. Repeat steps 1-7 for each old node to be decommissioned.


Post-migration verification
---------------------------

After all old nodes are decommissioned, verify the migration was successful.

Verify cluster topology
=======================

.. code-block:: shell

   nodetool status

Confirm:

* All nodes show ``UN`` (Up, Normal) status
* Only the new instance type nodes are present
* Nodes are balanced across racks

Verify schema agreement
=======================

.. code-block:: shell

   nodetool describecluster

All nodes should report the same schema version; illustrative output follows.
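
Illustrative ``nodetool describecluster`` output (cluster name and schema UUID
are hypothetical); a single entry under ``Schema versions`` that lists every
node means the cluster agrees:

.. code-block:: shell

   Cluster Information:
           Name: prod-cluster
           Snitch: org.apache.cassandra.locator.GossipingPropertyFileSnitch
           Schema versions:
                   a1b2c3d4-5678-90ab-cdef-112233445566: [192.168.2.10, 192.168.2.11, 192.168.2.12]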

Verify data connectivity
========================

Connect to the cluster and run a test query:

.. code-block:: shell

   cqlsh <node-ip> -e "SELECT count(*) FROM system_schema.keyspaces;"

.. note::

   If ScyllaDB is configured with ``listen_interface``, you must use the
   node's interface IP address (not localhost) for cqlsh connections.

Verify ScyllaDB version
=======================

Confirm all nodes are running the same ScyllaDB version:

.. code-block:: shell

   scylla --version

Verify data integrity (optional)
================================

Run data validation on each keyspace to verify sstable integrity:

.. code-block:: shell

   nodetool scrub --mode=VALIDATE <keyspace_name>

Rollback
--------

If issues occur during the migration, you can roll back by reversing the
procedure.

During add phase
================

If a new node fails to bootstrap:

#. Stop ScyllaDB on the new node:

   .. code-block:: shell

      sudo systemctl stop scylla-server

#. From an existing node, remove the failed node:

   .. code-block:: shell

      nodetool removenode <host-id-of-failed-node>

During decommission phase
=========================

If a decommission operation gets stuck:

#. If the node is still reachable, try stopping and restarting ScyllaDB
#. If the node is unresponsive, from another node:

   .. code-block:: shell

      nodetool removenode <host-id>

See :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
for more details.

Full rollback
=============

To roll back after the migration is complete (all nodes on new instance type),
apply the same add-and-replace procedure in reverse:

#. Add new nodes on the original instance type
#. Wait for data streaming to complete
#. Decommission the nodes on the new instance type


Troubleshooting
---------------

Node stuck in Joining (UJ) state
================================

If a new node remains in ``UJ`` state for an extended period:

* Check ScyllaDB logs for streaming errors: ``journalctl -u scylla-server`` (see the sketch after this list)
* Verify network connectivity between nodes
* Ensure sufficient disk space on all nodes
* Check for any ongoing operations that may be blocking
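
For instance (a sketch using standard journalctl and coreutils flags):

.. code-block:: shell

   # Follow ScyllaDB logs while the node is joining
   journalctl -u scylla-server -f

   # Check free space on the data mount on each node
   df -h /var/lib/scylla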

Decommission taking too long
============================

Decommission duration depends on data size. If it appears stuck:

* Check streaming progress: ``nodetool netstats``
* Look for errors in ScyllaDB logs
* Verify network bandwidth between nodes

Schema disagreement
===================

If nodes report different schema versions:

* Wait a few minutes for schema to propagate
* If disagreement persists, restart the nodes one by one
* Run ``nodetool describecluster`` to verify agreement


Additional resources
--------------------

* :doc:`Adding a New Node Into an Existing ScyllaDB Cluster </operating-scylla/procedures/cluster-management/add-node-to-cluster>`
* :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
* :doc:`Replace a Running Node in a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/replace-running-node>`
* :doc:`Upgrade </upgrade/index>`
@@ -40,12 +40,14 @@ Prerequisites

Procedure
---------

#. Run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.
#. If there are vnode keyspaces in this DC, run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.

   For example:

   If the ASIA-DC is to be removed, run the ``nodetool repair -pr`` command on all the nodes in the ASIA-DC.

#. If there are tablet keyspaces in this DC, run ``nodetool cluster repair`` on an arbitrary node. The reason for running repair is to ensure that any updates stored only on the about-to-be-decommissioned replicas are propagated to the other replicas, before the replicas on the decommissioned datacenter are dropped. Both flavors are sketched below.
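
   A minimal sketch of the two repair flavors (the DC name follows the example above):

   .. code-block:: shell

      # vnode keyspaces: on every node in ASIA-DC
      nodetool repair -pr

      # tablet keyspaces: on any single node in the cluster
      nodetool cluster repair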

#. ALTER every cluster KEYSPACE so that the keyspaces will no longer replicate data to the decommissioned data-center.

   For example:

@@ -73,6 +75,33 @@ Procedure

      cqlsh> ALTER KEYSPACE nba WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3};

   For tablet keyspaces, update the replication factor one by one:

   .. code-block:: shell

      cqlsh> DESCRIBE nba2
      cqlsh> CREATE KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 2, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };

   .. code-block:: shell

      cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 1, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };
      cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };

   .. note::
      If the ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use a rack-list replication factor so that the DC can be removed. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to remove a datacenter:

   .. code-block:: shell

      cqlsh> DESCRIBE nba3
      cqlsh> CREATE KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4', 'RAC5'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };

   .. code-block:: shell

      cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
      cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };

   Consider :ref:`upgrading the rf_rack_valid_keyspaces option to the enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.

#. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
   Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.

@@ -26,6 +26,7 @@ Cluster Management Procedures

   Safely Restart Your Cluster <safe-start>
   repair-based-node-operation
   Prevent Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>
   Cluster Platform Migration <cluster-platform-migration>


.. panel-box::

@@ -85,6 +86,8 @@ Cluster Management Procedures

* :doc:`Preventing Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>`

* :doc:`Cluster Platform Migration Using Node Cycling </operating-scylla/procedures/cluster-management/cluster-platform-migration>`

.. panel-box::
   :title: Topology Changes
   :id: "getting-started"

@@ -57,12 +57,11 @@ To enable shared dictionaries:

   internode_compression_enable_advanced: true
   rpc_dict_training_when: when_leader

.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
.. note::

   Trained dictionaries contain randomly chosen samples of data transferred between
   nodes. The data samples are persisted in the Raft log, which is not encrypted.
   As a result, some data from otherwise encrypted tables might be stored on disk
   unencrypted.
   Some dictionary training data may be encrypted using storage-level encryption
   (if enabled) instead of database-level encryption, meaning protection is
   applied at the storage layer rather than within the database itself.


Reference

@@ -58,4 +58,12 @@ See also

* `Blog: ScyllaDB Open Source 3.1: Efficiently Maintaining Consistency with Row-Level Repair <https://www.scylladb.com/2019/08/13/scylla-open-source-3-1-efficiently-maintaining-consistency-with-row-level-repair/>`_

Incremental Repair
------------------

Built on top of :ref:`Row-level Repair <row-level-repair>` and :doc:`Tablets </architecture/tablets>`, Incremental Repair enables frequent and quick repairs. For more details, see :doc:`Incremental Repair </features/incremental-repair>`.

Automatic Repair
----------------

Built on top of :doc:`Incremental Repair </features/incremental-repair>`, :doc:`Automatic Repair </features/automatic-repair>` offers repair scheduling and execution directly in ScyllaDB, without external processes.

@@ -8,7 +8,6 @@ Troubleshooting ScyllaDB

   support/index
   startup/index
   upgrade/index
   cluster/index
   modeling/index
   storage/index

@@ -29,7 +28,6 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa

* :doc:`Errors and ScyllaDB Customer Support <support/index>`
* :doc:`ScyllaDB Startup <startup/index>`
* :doc:`ScyllaDB Cluster and Node <cluster/index>`
* :doc:`ScyllaDB Upgrade <upgrade/index>`
* :doc:`Data Modeling <modeling/index>`
* :doc:`Data Storage and SSTables <storage/index>`
* :doc:`CQL errors <CQL/index>`

@@ -1,79 +0,0 @@

Inaccessible "/var/lib/scylla" and "/var/lib/systemd/coredump" after ScyllaDB upgrade
======================================================================================

Problem
^^^^^^^
When you reboot the machine after a ScyllaDB upgrade, you cannot access data directories under ``/var/lib/scylla``, and
coredumps are saved to ``rootfs``.


The problem may occur when you upgrade ScyllaDB Open Source 4.6 or later to a version of ScyllaDB Enterprise if
the ``/etc/systemd/system/var-lib-scylla.mount`` and ``/etc/systemd/system/var-lib-systemd-coredump.mount`` files are
deleted by RPM.

To avoid losing the files, the upgrade procedure includes a step to back up the .mount files. The following
example shows the command to back up the files before the upgrade from version 5.0:

.. code-block:: console

   for conf in $( rpm -qc $(rpm -qa | grep scylla) | grep -v contains ) /etc/systemd/system/{var-lib-scylla,var-lib-systemd-coredump}.mount; do sudo cp -v $conf $conf.backup-5.0; done

If you don't back up the .mount files before the upgrade, the files may be lost.


Solution
^^^^^^^^

If you didn't back up the .mount files before the upgrade and the files were deleted during the upgrade,
you need to restore them manually.

To restore ``/etc/systemd/system/var-lib-systemd-coredump.mount``, run the following:

.. code-block:: console

   $ cat << EOS | sudo tee /etc/systemd/system/var-lib-systemd-coredump.mount
   [Unit]
   Description=Save coredump to scylla data directory
   Conflicts=umount.target
   Before=scylla-server.service
   After=local-fs.target
   DefaultDependencies=no
   [Mount]
   What=/var/lib/scylla/coredump
   Where=/var/lib/systemd/coredump
   Type=none
   Options=bind
   [Install]
   WantedBy=multi-user.target
   EOS

To restore ``/etc/systemd/system/var-lib-scylla.mount``, run the following (specifying your data disk):

.. code-block:: console

   $ UUID=`blkid -s UUID -o value <specify your data disk, eg: /dev/md0>`
   $ cat << EOS | sudo tee /etc/systemd/system/var-lib-scylla.mount
   [Unit]
   Description=ScyllaDB data directory
   Before=scylla-server.service
   After=local-fs.target
   DefaultDependencies=no
   [Mount]
   What=/dev/disk/by-uuid/$UUID
   Where=/var/lib/scylla
   Type=xfs
   Options=noatime
   [Install]
   WantedBy=multi-user.target
   EOS

After restoring the .mount files, you need to enable them:

.. code-block:: console

   $ sudo systemctl daemon-reload
   $ sudo systemctl enable --now var-lib-scylla.mount
   $ sudo systemctl enable --now var-lib-systemd-coredump.mount
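
To confirm the mounts are active afterwards (an extra sanity check, not part of
the original procedure):

.. code-block:: console

   $ findmnt /var/lib/scylla
   $ findmnt /var/lib/systemd/coredump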

.. include:: /troubleshooting/_common/ts-return.rst
@@ -1,16 +0,0 @@

Upgrade
=================

.. toctree::
   :hidden:
   :maxdepth: 2

   Inaccessible configuration files after ScyllaDB upgrade </troubleshooting/missing-dotmount-files>

.. panel-box::
   :title: Upgrade Issues
   :id: "getting-started"
   :class: my-panel

   * :doc:`Inaccessible "/var/lib/scylla" and "/var/lib/systemd/coredump" after ScyllaDB upgrade </troubleshooting/missing-dotmount-files>`

@@ -11,9 +11,13 @@ ScyllaDB. This means that:

* You should follow the upgrade policy:

  * Starting with version **2025.4**, upgrades can skip minor versions as long
    as they remain within the same major version (for example, upgrading directly
    from 2025.1 → 2025.4 is supported).
  * Starting with version **2025.4**, upgrades can **skip minor versions** if:

    * They remain within the same major version (for example, upgrading
      directly from *2025.1 → 2025.4* is supported).
    * You upgrade to the next major version (for example, upgrading
      directly from *2025.3 → 2026.1* is supported).

  * For versions **prior to 2025.4**, upgrades must be performed consecutively—
    each successive X.Y version must be installed in order, **without skipping
    any major or minor version** (for example, upgrading directly from 2025.1 → 2025.3

@@ -4,8 +4,7 @@ Upgrade ScyllaDB

.. toctree::

   ScyllaDB 2025.x to ScyllaDB 2025.4 <upgrade-guide-from-2025.x-to-2025.4/index>
   ScyllaDB 2025.4 Patch Upgrades <upgrade-guide-from-2025.4.x-to-2025.4.y>
   ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
   ScyllaDB Image <ami-upgrade>

@@ -1,266 +0,0 @@

.. |SCYLLA_NAME| replace:: ScyllaDB

.. |SRC_VERSION| replace:: 2025.4.x
.. |NEW_VERSION| replace:: 2025.4.y

==========================================================================
Upgrade - |SCYLLA_NAME| |SRC_VERSION| to |NEW_VERSION| (Patch Upgrades)
==========================================================================

This document describes a step-by-step procedure for upgrading from
|SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION| (where "y" is
the latest available version), and rolling back to version |SRC_VERSION|
if necessary.

This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL),
CentOS, Debian, and Ubuntu.
See :doc:`OS Support by Platform and Version </getting-started/os-support>`
for information about supported versions.

It also applies to the ScyllaDB official image on EC2, GCP, or Azure.

Upgrade Procedure
=================

.. note::
   Apply the following procedure **serially** on each node. Do not move to the next
   node before validating that the node is up and running the new version.

A ScyllaDB upgrade is a rolling procedure that does **not** require a full cluster
shutdown. For each of the nodes in the cluster, you will:

#. Drain the node and back up the data.
#. Back up the configuration file.
#. Stop ScyllaDB.
#. Download and install new ScyllaDB packages.
#. Start ScyllaDB.
#. Validate that the upgrade was successful.

**Before** upgrading, check which version you are running now using
``scylla --version``. Note the current version in case you want to roll back
the upgrade.

**During** the rolling upgrade it is highly recommended:

* Not to use new |NEW_VERSION| features.
* Not to run administration functions, such as repairs, refresh, rebuild, or
  adding or removing nodes. See
  `sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending
  ScyllaDB Manager's scheduled or running repairs.
* Not to apply schema changes.

Upgrade Steps
=============

Back up the data
------------------------------

Back up all the data to an external device. We recommend using
`ScyllaDB Manager <https://manager.docs.scylladb.com/stable/backup/index.html>`_
to create backups.

Alternatively, you can use the ``nodetool snapshot`` command.
For **each** node in the cluster, run the following:

.. code:: sh

   nodetool drain
   nodetool snapshot

Take note of the directory name that nodetool gives you, and copy all
the directories with this name under ``/var/lib/scylla`` to a backup device.

When the upgrade is completed on all nodes, remove the snapshot with the
``nodetool clearsnapshot -t <snapshot>`` command to prevent running out of
space.

Back up the configuration file
------------------------------

Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
in case you need to roll back the upgrade.

.. tabs::

   .. group-tab:: Debian/Ubuntu

      .. code:: sh

         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
         sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup

   .. group-tab:: RHEL/CentOS

      .. code:: sh

         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
         sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup

Gracefully stop the node
------------------------

.. code:: sh

   sudo service scylla-server stop

Download and install the new release
------------------------------------

You don’t need to update the ScyllaDB DEB or RPM repo when you upgrade to
a patch release.

.. tabs::

   .. group-tab:: Debian/Ubuntu

      To install a patch version on Debian or Ubuntu, run:

      .. code:: sh

         sudo apt-get clean all
         sudo apt-get update
         sudo apt-get dist-upgrade scylla

      Answer ‘y’ to the first two questions.

   .. group-tab:: RHEL/CentOS

      To install a patch version on RHEL or CentOS, run:

      .. code:: sh

         sudo yum clean all
         sudo yum update scylla\* -y

   .. group-tab:: EC2/GCP/Azure Ubuntu Image

      If you're using the ScyllaDB official image (recommended), see
      the **Debian/Ubuntu** tab for upgrade instructions.

      If you're using your own image and have installed ScyllaDB packages for
      Ubuntu or Debian, you need to apply an extended upgrade procedure:

      #. Install the new ScyllaDB version with the additional
         ``scylla-machine-image`` package:

         .. code-block:: console

            sudo apt-get clean all
            sudo apt-get update
            sudo apt-get dist-upgrade scylla
            sudo apt-get dist-upgrade scylla-machine-image

      #. Run ``scylla_setup`` without running ``io_setup``.
      #. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.

Start the node
--------------

.. code:: sh

   sudo service scylla-server start

Validate
--------
#. Check cluster status with ``nodetool status`` and make sure **all** nodes,
   including the one you just upgraded, are in UN status.
#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"``
   to check the ScyllaDB version.
#. Use ``journalctl _COMM=scylla`` to check there are no new errors in the log.
#. Check again after 2 minutes to validate that no new issues are introduced
   (the commands are collected in the sketch below).
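
For convenience, the validation commands in one place (a sketch; port 10000
assumes the default REST API configuration):

.. code:: sh

   nodetool status
   curl -X GET "http://localhost:10000/storage_service/scylla_release_version"
   journalctl _COMM=scylla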

Once you are sure the node upgrade is successful, move to the next node in
the cluster.

Rollback Procedure
==================

The following procedure describes a rollback from ScyllaDB release
|NEW_VERSION| to |SRC_VERSION|. Apply this procedure if an upgrade from
|SRC_VERSION| to |NEW_VERSION| failed before completing on all nodes.

* Use this procedure only on nodes you upgraded to |NEW_VERSION|.
* Execute the following commands one node at a time, moving to the next node only
  after the rollback procedure is completed successfully.

ScyllaDB rollback is a rolling procedure that does **not** require a full
cluster shutdown. For each of the nodes to roll back to |SRC_VERSION|, you will:

#. Drain the node and stop ScyllaDB.
#. Downgrade to the previous release.
#. Restore the configuration file.
#. Restart ScyllaDB.
#. Validate the rollback success.

Rollback Steps
==============

Gracefully shutdown ScyllaDB
-----------------------------

.. code:: sh

   nodetool drain
   sudo service scylla-server stop

Downgrade to the previous release
----------------------------------

.. tabs::

   .. group-tab:: Debian/Ubuntu

      To downgrade to |SRC_VERSION| on Debian or Ubuntu, run:

      .. code-block:: console
         :substitutions:

         sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*

      Answer ‘y’ to the first two questions.

   .. group-tab:: RHEL/CentOS

      To downgrade to |SRC_VERSION| on RHEL or CentOS, run:

      .. code-block:: console
         :substitutions:

         sudo yum downgrade scylla\*-|SRC_VERSION|-\* -y

   .. group-tab:: EC2/GCP/Azure Ubuntu Image

      If you’re using the ScyllaDB official image (recommended), see
      the **Debian/Ubuntu** tab for downgrade instructions.

      If you’re using your own image and have installed ScyllaDB packages for
      Ubuntu or Debian, you need to additionally downgrade
      the ``scylla-machine-image`` package.

      .. code-block:: console
         :substitutions:

         sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
         sudo apt-get install scylla-machine-image=|SRC_VERSION|\*

      Answer ‘y’ to the first two questions.


Restore the configuration file
------------------------------

.. code:: sh

   sudo rm -rf /etc/scylla/scylla.yaml
   sudo cp -a /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml

Start the node
--------------

.. code:: sh

   sudo service scylla-server start

Validate
--------
Check the upgrade instructions above for validation. Once you are sure the node
rollback is successful, move to the next node in the cluster.
@@ -1,13 +0,0 @@
==========================================================
Upgrade - ScyllaDB 2025.x to ScyllaDB 2025.4
==========================================================

.. toctree::
   :maxdepth: 2
   :hidden:

   Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2025.4>
   Metrics Update <metric-update-2025.x-to-2025.4>

* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2025.4 <upgrade-guide-from-2025.x-to-2025.4>`
* :doc:`Metrics Update Between 2025.x and 2025.4 <metric-update-2025.x-to-2025.4>`
@@ -1,68 +0,0 @@
.. |SRC_VERSION| replace:: 2025.x
.. |NEW_VERSION| replace:: 2025.4
.. |PRECEDING_VERSION| replace:: 2025.3

================================================================
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
================================================================

.. toctree::
   :maxdepth: 2
   :hidden:

ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.


New Metrics in |NEW_VERSION|
--------------------------------------

The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.

.. list-table::
   :widths: 25 150
   :header-rows: 1

   * - Metric
     - Description
   * - scylla_database_total_view_updates_due_to_replica_count_mismatch
     - The total number of view updates for which there were more view replicas
       than base replicas and we had to generate an extra view update because
       the additional view replica wouldn't get paired with any base replica.
       It should only increase during the Replication Factor (RF) change. It
       should stop increasing shortly after finishing the RF change.
   * - scylla_database_total_writes_rejected_due_to_out_of_space_prevention
     - Counts write operations that were rejected due to disabled user tables
       writes.
   * - scylla_index_query_latencies
     - Index query latencies.
   * - scylla_reactor_aio_retries
     - The total number of IOCB-s re-submitted via thread-pool.
   * - scylla_reactor_io_threaded_fallbacks
     - The total number of io-threaded-fallbacks operations.
   * - scylla_repair_inc_sst_read_bytes
     - The total number of bytes read from SStables for incremental repair
       on this shard.
   * - scylla_repair_inc_sst_skipped_bytes
     - The total number of bytes skipped from SStables for incremental repair
       on this shard.
   * - scylla_repair_tablet_time_ms
     - The time spent on tablet repair on this shard (in milliseconds).
   * - scylla_s3_downloads_blocked_on_memory
     - Counts the number of times the S3 client downloads were delayed due to
       insufficient memory availability.
   * - scylla_s3_memory_usage
     - The total number of bytes consumed by the S3 client.
   * - scylla_s3_total_read_prefetch_bytes
     - The total number of bytes requested from object.
   * - scylla_storage_proxy_replica_fenced_out_requests
     - The number of requests that resulted in a stale_topology_exception.
   * - scylla_vector_store_dns_refreshes
     - The number of DNS refreshes.

New and Updated Metrics in Previous 2025.x Releases
-------------------------------------------------------

* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_

@@ -0,0 +1,13 @@
==========================================================
Upgrade - ScyllaDB 2025.x to ScyllaDB 2026.1
==========================================================

.. toctree::
   :maxdepth: 2
   :hidden:

   Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2026.1>
   Metrics Update <metric-update-2025.x-to-2026.1>

* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1>`
* :doc:`Metrics Update Between 2025.x and 2026.1 <metric-update-2025.x-to-2026.1>`
@@ -0,0 +1,82 @@
.. |SRC_VERSION| replace:: 2025.x
.. |NEW_VERSION| replace:: 2026.1
.. |PRECEDING_VERSION| replace:: 2025.4

================================================================
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
================================================================

.. toctree::
   :maxdepth: 2
   :hidden:

ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.


New Metrics in |NEW_VERSION|
--------------------------------------

The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.

.. list-table::
   :widths: 25 150
   :header-rows: 1

   * - Metric
     - Description
   * - scylla_alternator_operation_size_kb
     - Histogram of item sizes involved in a request.
   * - scylla_column_family_total_disk_space_before_compression
     - Hypothetical total disk space used if data files weren't compressed.
   * - scylla_group_name_auto_repair_enabled_nr
     - Number of tablets with auto repair enabled.
   * - scylla_group_name_auto_repair_needs_repair_nr
     - Number of tablets with auto repair enabled that currently need repair.
   * - scylla_lsa_compact_time_ms
     - Total time spent on segment compaction that was not accounted under ``reclaim_time_ms``.
   * - scylla_lsa_evict_time_ms
     - Total time spent on evicting objects that was not accounted under ``reclaim_time_ms``.
   * - scylla_lsa_reclaim_time_ms
     - Total time spent in reclaiming LSA memory back to std allocator.
   * - scylla_object_storage_memory_usage
     - Total number of bytes consumed by the object storage client.
   * - scylla_tablet_ops_failed
     - Number of failed tablet auto repair attempts.
   * - scylla_tablet_ops_succeeded
     - Number of successful tablet auto repair attempts.

Renamed Metrics in |NEW_VERSION|
--------------------------------------

The following metrics are renamed in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.

.. list-table::
   :widths: 25 150
   :header-rows: 1

   * - Metric Name in |PRECEDING_VERSION|
     - Metric Name in |NEW_VERSION|
   * - scylla_s3_memory_usage
     - scylla_object_storage_memory_usage

Removed Metrics in |NEW_VERSION|
--------------------------------------

The following metrics are removed in ScyllaDB |NEW_VERSION|.

* scylla_redis_current_connections
* scylla_redis_op_latency
* scylla_redis_operation
* scylla_redis_requests_latency
* scylla_redis_requests_served
* scylla_redis_requests_serving

New and Updated Metrics in Previous Releases
-------------------------------------------------------

* `Metrics Update Between 2025.3 and 2025.4 <https://docs.scylladb.com/manual/branch-2025.4/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.html>`_
* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_

@@ -1,13 +1,13 @@
|
||||
.. |SCYLLA_NAME| replace:: ScyllaDB
|
||||
|
||||
.. |SRC_VERSION| replace:: 2025.x
|
||||
.. |NEW_VERSION| replace:: 2025.4
|
||||
.. |NEW_VERSION| replace:: 2026.1
|
||||
|
||||
.. |ROLLBACK| replace:: rollback
|
||||
.. _ROLLBACK: ./#rollback-procedure
|
||||
|
||||
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2025.4
|
||||
.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2025.4
|
||||
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2026.1
|
||||
.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2026.1
|
||||
|
||||
=======================================================================================
|
||||
Upgrade from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|
|
||||
@@ -17,10 +17,12 @@ This document describes a step-by-step procedure for upgrading from |SCYLLA_NAME
|
||||
to |SCYLLA_NAME| |NEW_VERSION| and rollback to version |SRC_VERSION| if necessary.
|
||||
|
||||
This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL), CentOS, Debian,
|
||||
and Ubuntu. See :doc:`OS Support by Platform and Version </getting-started/os-support>`
|
||||
for information about supported versions.
|
||||
and Ubuntu.
|
||||
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
|
||||
for information about supported versions. It also applies when using
|
||||
the ScyllaDB official image on EC2, GCP, or Azure.
|
||||
|
||||
It also applies when using the ScyllaDB official image on EC2, GCP, or Azure.
|
||||
See :doc:`About Upgrade </upgrade/about-upgrade/>` for the ScyllaDB upgrade policy.
|
||||
|
||||
Before You Upgrade ScyllaDB
|
||||
==============================
|
@@ -149,8 +151,9 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
 #. Update the ScyllaDB deb repo to |NEW_VERSION|.

    .. code-block:: console
+      :substitutions:

-      sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/scylla-2025.4.list
+      sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|

 #. Install the new ScyllaDB version:

@@ -167,8 +170,9 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
 #. Update the ScyllaDB rpm repo to |NEW_VERSION|.

    .. code-block:: console
+      :substitutions:

-      sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/scylla-2025.4.repo
+      sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|

 #. Install the new ScyllaDB version:

@@ -198,11 +202,6 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
 #. Run ``scylla_setup`` without running ``io_setup``.
 #. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.

-
-   If you need JMX server, see
-   :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`
-   and get new version.

 Start the node
 --------------
@@ -284,6 +284,7 @@ future<rjson::value> encryption::gcp_host::impl::gcp_auth_post_with_retry(std::s
         }
+        [[fallthrough]];
     case httpclient::reply_status::request_timeout:
     case httpclient::reply_status::too_many_requests:
         if (retry < max_retries) {
             // service unavailable etc -> backoff + retry
             do_backoff = true;

@@ -182,6 +182,7 @@ public:
     gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
     gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
     gms::feature topology_noop_request { *this, "TOPOLOGY_NOOP_REQUEST"sv };
+    gms::feature batchlog_v2 { *this, "BATCHLOG_V2"sv };
 public:

     const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
@@ -17,11 +17,11 @@
 #include "index/secondary_index.hh"
 #include "index/secondary_index_manager.hh"
 #include "types/concrete_types.hh"
 #include "types/types.hh"
-#include "utils/managed_string.hh"
+#include <seastar/core/sstring.hh>
 #include <boost/algorithm/string.hpp>


 namespace secondary_index {

 static void validate_positive_option(int max, const sstring& value_name, const sstring& value) {
@@ -147,17 +147,88 @@ std::optional<cql3::description> vector_index::describe(const index_metadata& im
 }

 void vector_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {
-    if (targets.size() != 1) {
-        throw exceptions::invalid_request_exception("Vector index can only be created on a single column");
-    }
-    auto target = targets[0];
-    auto c_def = schema.get_column_definition(to_bytes(target->column_name()));
-    if (!c_def) {
-        throw exceptions::invalid_request_exception(format("Column {} not found in schema", target->column_name()));
-    }
-    auto type = c_def->type;
-    if (!type->is_vector() || static_cast<const vector_type_impl*>(type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
-        throw exceptions::invalid_request_exception(format("Vector indexes are only supported on columns of vectors of floats", target->column_name()));
+    struct validate_visitor {
+        const class schema& schema;
+        bool& is_vector;
+
+        /// Vector indexes support filtering on native types that can be used as primary key columns.
+        /// There is no counter (it cannot be used with vector columns)
+        /// and no duration (it cannot be used as a primary key or in secondary indexes).
+        static bool is_supported_filtering_column(abstract_type const & kind_type) {
+            switch (kind_type.get_kind()) {
+            case abstract_type::kind::ascii:
+            case abstract_type::kind::boolean:
+            case abstract_type::kind::byte:
+            case abstract_type::kind::bytes:
+            case abstract_type::kind::date:
+            case abstract_type::kind::decimal:
+            case abstract_type::kind::double_kind:
+            case abstract_type::kind::float_kind:
+            case abstract_type::kind::inet:
+            case abstract_type::kind::int32:
+            case abstract_type::kind::long_kind:
+            case abstract_type::kind::short_kind:
+            case abstract_type::kind::simple_date:
+            case abstract_type::kind::time:
+            case abstract_type::kind::timestamp:
+            case abstract_type::kind::timeuuid:
+            case abstract_type::kind::utf8:
+            case abstract_type::kind::uuid:
+            case abstract_type::kind::varint:
+                return true;
+            default:
+                break;
+            }
+            return false;
+        }
+
+        void validate(cql3::column_identifier const& column, bool is_vector) const {
+            auto const& c_name = column.to_string();
+            auto const* c_def = schema.get_column_definition(column.name());
+            if (c_def == nullptr) {
+                throw exceptions::invalid_request_exception(format("Column {} not found in schema", c_name));
+            }
+
+            auto type = c_def->type;
+
+            if (is_vector) {
+                auto const* vector_type = dynamic_cast<const vector_type_impl*>(type.get());
+                if (vector_type == nullptr) {
+                    throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
+                }
+
+                auto elements_type = vector_type->get_elements_type();
+                if (elements_type->get_kind() != abstract_type::kind::float_kind) {
+                    throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
+                }
+                return;
+            }
+
+            if (!is_supported_filtering_column(*type)) {
+                throw exceptions::invalid_request_exception(format("Unsupported vector index filtering column {} type", c_name));
+            }
+        }
+
+        void operator()(const std::vector<::shared_ptr<cql3::column_identifier>>& columns) const {
+            for (const auto& column : columns) {
+                // CQL restricts the secondary local index to have multiple columns with partition key only.
+                // Vectors shouldn't be partition key columns and they aren't supported as a filtering column,
+                // so we can assume here that these are non-vectors filtering columns.
+                validate(*column, false);
+            }
+        }
+
+        void operator()(const ::shared_ptr<cql3::column_identifier>& column) {
+            validate(*column, is_vector);
+            // The first column is the vector column, the rest mustn't be vectors.
+            is_vector = false;
+        }
+    };
+
+    bool is_vector = true;
+    for (const auto& target : targets) {
+        std::visit(validate_visitor{.schema = schema, .is_vector = is_vector}, target->value);
     }
 }
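A minimal standalone sketch (not ScyllaDB source) of the visitor dispatch used in the new check_target above: one std::visit over a target that is either a single column or a list of columns, with a flag that permits only the first single-column target to be the vector column. All names and the toy "is a vector column" check are illustrative.

.. code-block:: cpp

   #include <cassert>
   #include <stdexcept>
   #include <string>
   #include <variant>
   #include <vector>

   // Illustrative stand-ins for the real CQL target types.
   using column = std::string;
   using target_value = std::variant<column, std::vector<column>>;

   struct validate_visitor {
       bool& is_vector; // true only for the first single-column target

       void operator()(const std::vector<column>& columns) const {
           for (const auto& c : columns) {
               // Multi-column targets are partition-key (filtering) columns.
               validate(c, /*vector=*/false);
           }
       }
       void operator()(const column& c) {
           validate(c, is_vector);
           is_vector = false; // only the first single column may be the vector
       }
       static void validate(const column& c, bool vector) {
           if (vector && c.rfind("vec_", 0) != 0) { // toy vector-column check
               throw std::invalid_argument("expected a vector column: " + c);
           }
       }
   };

   int main() {
       std::vector<target_value> targets = {
           std::vector<column>{"pk1", "pk2"}, // local-index partition key columns
           column{"vec_embedding"},           // the indexed vector column
       };
       bool is_vector = true;
       for (const auto& t : targets) {
           std::visit(validate_visitor{is_vector}, t);
       }
       assert(!is_vector);
   }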
@@ -347,8 +347,8 @@ install -d -m755 "$retc"/scylla.d
 scylla_yaml_dir=$(mktemp -d)
 scylla_yaml=$scylla_yaml_dir/scylla.yaml
 grep -v api_ui_dir conf/scylla.yaml | grep -v api_doc_dir > $scylla_yaml
-echo "api_ui_dir: /opt/scylladb/swagger-ui/dist/" >> $scylla_yaml
-echo "api_doc_dir: /opt/scylladb/api/api-doc/" >> $scylla_yaml
+echo "api_ui_dir: $prefix/swagger-ui/dist/" >> $scylla_yaml
+echo "api_doc_dir: $prefix/api/api-doc/" >> $scylla_yaml
 installconfig 644 $scylla_yaml "$retc"/scylla
 rm -rf $scylla_yaml_dir
@@ -612,12 +612,16 @@ tablet_replica tablet_map::get_primary_replica(tablet_id id, const locator::topo
     return maybe_get_primary_replica(id, replicas, topo, [&] (const auto& _) { return true; }).value();
 }

-tablet_replica tablet_map::get_secondary_replica(tablet_id id) const {
-    if (get_tablet_info(id).replicas.size() < 2) {
+tablet_replica tablet_map::get_secondary_replica(tablet_id id, const locator::topology& topo) const {
+    const auto& orig_replicas = get_tablet_info(id).replicas;
+    if (orig_replicas.size() < 2) {
         throw std::runtime_error(format("No secondary replica for tablet id {}", id));
     }
-    const auto& replicas = get_tablet_info(id).replicas;
-    return replicas.at((size_t(id)+1) % replicas.size());
+    tablet_replica_set replicas = orig_replicas;
+    std::ranges::sort(replicas, tablet_replica_comparator(topo));
+    // This formula must match the one in get_primary_replica(),
+    // just with + 1.
+    return replicas.at((size_t(id) + size_t(id) / replicas.size() + 1) % replicas.size());
 }

 std::optional<tablet_replica> tablet_map::maybe_get_selected_replica(tablet_id id, const topology& topo, const tablet_task_info& tablet_task_info) const {

@@ -647,9 +647,10 @@ public:
     /// Returns the primary replica for the tablet
     tablet_replica get_primary_replica(tablet_id id, const locator::topology& topo) const;

-    /// Returns the secondary replica for the tablet, which is assumed to be directly following the primary replica in the replicas vector
-    tablet_replica get_secondary_replica(tablet_id id) const;
+    /// Returns the secondary replica for the tablet: the replica that immediately follows the primary
+    /// replica in the topology-sorted replica list.
+    /// \throws std::runtime_error if the tablet has less than 2 replicas.
+    tablet_replica get_secondary_replica(tablet_id id, const locator::topology& topo) const;

     // Returns the replica that matches hosts and dcs filters for tablet_task_info.
     std::optional<tablet_replica> maybe_get_selected_replica(tablet_id id, const topology& topo, const tablet_task_info& tablet_task_info) const;
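A standalone sketch (not ScyllaDB source) of how the two formulas relate. The comment above states the secondary formula is the primary one "just with + 1", which implies primary = (T + T/R) % R over R topology-sorted replicas. Printing a few tablet ids shows the secondary always trails the primary by one slot:

.. code-block:: cpp

   #include <cstddef>
   #include <cstdio>

   int main() {
       const std::size_t R = 3; // replicas per tablet (illustrative)
       for (std::size_t T = 0; T < 9; ++T) {
           std::size_t primary   = (T + T / R) % R;     // implied primary formula
           std::size_t secondary = (T + T / R + 1) % R; // same, "just with + 1"
           std::printf("tablet %zu -> primary replica %zu, secondary replica %zu\n",
                       T, primary, secondary);
       }
       // secondary == (primary + 1) % R for every tablet id; the T / R term
       // rotates the starting replica as ids wrap around R, spreading primaries.
   }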
main.cc
@@ -2417,7 +2417,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
     bm_cfg.delay = std::chrono::milliseconds(cfg->ring_delay_ms());
     bm_cfg.replay_cleanup_after_replays = cfg->batchlog_replay_cleanup_after_replays();

-    bm.start(std::ref(qp), std::ref(sys_ks), bm_cfg).get();
+    bm.start(std::ref(qp), std::ref(sys_ks), std::ref(feature_service), bm_cfg).get();
     auto stop_batchlog_manager = defer_verbose_shutdown("batchlog manager", [&bm] {
         bm.stop().get();
     });
@@ -261,7 +261,7 @@ static collection_mutation serialize_collection_mutation(

         writev(v.serialize());
     }
-    return collection_mutation(type, ret);
+    return collection_mutation(type, std::move(ret));
 }

 collection_mutation collection_mutation_description::serialize(const abstract_type& type) const {

@@ -103,7 +103,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
         .entity = stats.entity,
         .progress_units = "",
         .progress = tasks::task_manager::task::progress{},
-        .children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
+        .children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr())
     };
 }
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a4710f1f0b0bb329721c21d133618e811e820f2e70553b0aca28fb278bff89c9
-size 6492280
+oid sha256:088a9d7e165d33436eb3029ab092582cbae61f0e17486c226d8947ff44658c78
+size 6535832

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2433f7a1fc5cda0dd990ab59587eb6046dca0fe1ae48d599953d1936fe014ed9
-size 6492176
+oid sha256:d1a869ebfe4e90d9681499246eb86bb032ae402c350357e19d97b989037a5bd3
+size 6528308
@@ -1021,8 +1021,8 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
         on_internal_error_noexcept(rcslog,
                 format("reader_concurrency_semaphore::signal(): semaphore {} detected resource leak, available {} exceeds initial {}", _name,
                         _resources, _initial_resources));
-        _resources.count = std::max(_resources.count, _initial_resources.count);
-        _resources.memory = std::max(_resources.memory, _initial_resources.memory);
+        _resources.count = std::min(_resources.count, _initial_resources.count);
+        _resources.memory = std::min(_resources.memory, _initial_resources.memory);
     }
     maybe_wake_execution_loop();
 }
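A small standalone illustration (not ScyllaDB source) of why the fix above uses std::min. When a leak is detected, the available resources exceed the initial budget; clamping with std::max keeps the inflated value, while std::min caps it back at the budget:

.. code-block:: cpp

   #include <algorithm>
   #include <cassert>

   int main() {
       const int initial = 100; // configured budget
       int available = 112;     // leak detected: available > initial

       // The old code effectively kept the larger (leaked) value:
       assert(std::max(available, initial) == 112);

       // The fix caps available back at the initial budget:
       available = std::min(available, initial);
       assert(available == 100);
   }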
@@ -1211,6 +1211,7 @@ private:
         }

+        co_await utils::get_local_injector().inject("incremental_repair_prepare_wait", utils::wait_for_message(60s));
         rlogger.debug("Disabling compaction for range={} for incremental repair", _range);
         auto reenablers_and_holders = co_await table.get_compaction_reenablers_and_lock_holders_for_repair(_db.local(), _frozen_topology_guard, _range);
         for (auto& lock_holder : reenablers_and_holders.lock_holders) {
             _rs._repair_compaction_locks[gid].push_back(std::move(lock_holder));
@@ -1240,6 +1241,8 @@ private:
             // compaction.
             reenablers_and_holders.cres.clear();
             rlogger.info("Re-enabled compaction for range={} for incremental repair", _range);
+
+            co_await utils::get_local_injector().inject("wait_after_prepare_sstables_for_incremental_repair", utils::wait_for_message(5min));
         }

     // Read rows from sstable until the size of rows exceeds _max_row_buf_size - current_size
@@ -2633,7 +2636,7 @@ future<repair_flush_hints_batchlog_response> repair_service::repair_flush_hints_
             all_replayed = co_await _bm.local().do_batch_log_replay(db::batchlog_manager::post_replay_cleanup::no);
             utils::get_local_injector().set_parameter("repair_flush_hints_batchlog_handler", "issue_flush", fmt::to_string(flush_time));
         }
-        rlogger.info("repair[{}]: Finished to flush batchlog for repair_flush_hints_batchlog_request from node={}, flushed={}", req.repair_uuid, from, issue_flush);
+        rlogger.info("repair[{}]: Finished to flush batchlog for repair_flush_hints_batchlog_request from node={}, flushed={} all_replayed={}", req.repair_uuid, from, issue_flush, all_replayed);
     }
     );
     if (!all_replayed) {
@@ -3953,3 +3956,19 @@ future<std::optional<repair_task_progress>> repair_service::get_tablet_repair_ta
             task_uuid, tid, requested, finished, progress.progress(), finished_nomerge);
     co_return progress;
 }
+
+void repair_service::on_cleanup_for_drop_table(const table_id& id) {
+    // Prevent repair lock from being leaked in repair_service when table is dropped midway.
+    // The RPC verb that removes the lock on success path will not be called by coordinator after table was dropped.
+    // We also cannot move the lock from repair_service to repair_meta, since the lock must outlive the latter.
+    // Since tablet metadata has been erased at this point, we can simply erase all instances for the dropped table.
+    rlogger.debug("Cleaning up state for dropped table {}", id);
+    for (auto it = _repair_compaction_locks.begin(); it != _repair_compaction_locks.end();) {
+        auto& [global_tid, _] = *it;
+        if (global_tid.table == id) {
+            it = _repair_compaction_locks.erase(it);
+        } else {
+            it++;
+        }
+    }
+}

@@ -318,6 +318,8 @@ public:

     future<uint32_t> get_next_repair_meta_id();

+    void on_cleanup_for_drop_table(const table_id& id);
+
     friend class repair::user_requested_repair_task_impl;
     friend class repair::data_sync_repair_task_impl;
     friend class repair::tablet_repair_task_impl;

@@ -432,7 +432,9 @@ public:
     // refresh_mutation_source must be called when there are changes to data source
     // structures but logical state of data is not changed (e.g. when state for a
     // new tablet replica is allocated).
-    virtual void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;
+    virtual void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
+            const locator::effective_replication_map& erm,
+            noncopyable_function<void()> refresh_mutation_source) = 0;

     virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
     virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;
@@ -442,12 +444,13 @@ public:
     virtual storage_group& storage_group_for_token(dht::token) const = 0;
     virtual utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const = 0;

-    virtual locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const = 0;
+    virtual locator::combined_load_stats table_load_stats() const = 0;
     virtual bool all_storage_groups_split() = 0;
     virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
     virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
     virtual future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) = 0;
     virtual dht::token_range get_token_range_after_split(const dht::token&) const noexcept = 0;
+    virtual future<> wait_for_background_tablet_resize_work() = 0;

     virtual lw_shared_ptr<sstables::sstable_set> make_sstable_set() const = 0;
 };
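The new on_cleanup_for_drop_table earlier in this section uses the standard erase-while-iterating idiom for associative containers: erase() returns the iterator to the next element, so the loop advances either through erase() or through ++it, never both. A minimal standalone sketch (not ScyllaDB source):

.. code-block:: cpp

   #include <cassert>
   #include <map>

   int main() {
       std::map<int, char> m{{1, 'a'}, {2, 'b'}, {3, 'c'}, {4, 'd'}};

       // Drop all even keys; erase() hands back the next valid iterator.
       for (auto it = m.begin(); it != m.end();) {
           if (it->first % 2 == 0) {
               it = m.erase(it);
           } else {
               ++it;
           }
       }
       assert(m.size() == 2 && m.count(1) && m.count(3));
   }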
@@ -1697,7 +1697,7 @@ static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit(
         if (!range.is_singular()) {
             continue;
         }
-        auto token = dht::token::to_int64(ranges.front().start()->value().token());
+        auto token = dht::token::to_int64(range.start()->value().token());
         if (limiter.account_operation(read_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) {
             // Don't return immediately - account all ranges first
             ret = can_proceed::no;
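The one-line fix above corrects a classic loop bug: reading the container's front() inside the loop instead of the loop variable, so every iteration accounted the first range's token. A minimal standalone illustration (not ScyllaDB source):

.. code-block:: cpp

   #include <cassert>
   #include <vector>

   int main() {
       std::vector<int> tokens = {10, 20, 30};

       // Buggy: always reads the first element.
       long buggy_sum = 0;
       for (const int& t : tokens) {
           (void)t;
           buggy_sum += tokens.front(); // accounts 10 three times
       }

       // Fixed: use the loop variable.
       long fixed_sum = 0;
       for (const int& t : tokens) {
           fixed_sum += t;
       }

       assert(buggy_sum == 30);
       assert(fixed_sum == 60);
   }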
@@ -1129,9 +1129,7 @@ public:
         return _stats;
     }

-    // The tablet filter is used to not double account migrating tablets, so it's important that
-    // only one of pending or leaving replica is accounted based on current migration stage.
-    locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const;
+    locator::combined_load_stats table_load_stats() const;

     const db::view::stats& get_view_stats() const {
         return _view_stats;
@@ -1368,8 +1366,6 @@ public:
     future<compaction_reenablers_and_lock_holders> get_compaction_reenablers_and_lock_holders_for_repair(replica::database& db,
             const service::frozen_topology_guard& guard, dht::token_range range);
     future<uint64_t> estimated_partitions_in_range(dht::token_range tr) const;
-private:
-    future<std::vector<compaction::compaction_group_view*>> get_compaction_group_views_for_repair(dht::token_range range);
 };

 lw_shared_ptr<sstables::sstable_set> make_tablet_sstable_set(schema_ptr, const storage_group_manager& sgm, const locator::tablet_map&);
replica/table.cc
@@ -711,7 +711,9 @@ public:
         return make_ready_future<>();
     }

-    void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override {}
+    void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
+            const locator::effective_replication_map& erm,
+            noncopyable_function<void()> refresh_mutation_source) override {}

     compaction_group& compaction_group_for_token(dht::token token) const override {
         return get_compaction_group();
@@ -734,7 +736,7 @@ public:
|
||||
return *_single_sg;
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)>) const override {
|
||||
locator::combined_load_stats table_load_stats() const override {
|
||||
return locator::combined_load_stats{
|
||||
.table_ls = locator::table_load_stats{
|
||||
.size_in_bytes = _single_sg->live_disk_space_used(),
|
||||
@@ -750,12 +752,18 @@ public:
|
||||
return make_ready_future<std::vector<sstables::shared_sstable>>(std::vector<sstables::shared_sstable>{sst});
|
||||
}
|
||||
dht::token_range get_token_range_after_split(const dht::token&) const noexcept override { return dht::token_range(); }
|
||||
future<> wait_for_background_tablet_resize_work() override { return make_ready_future<>(); }
|
||||
|
||||
lw_shared_ptr<sstables::sstable_set> make_sstable_set() const override {
|
||||
return get_compaction_group().make_sstable_set();
|
||||
}
|
||||
};
|
||||
|
||||
struct background_merge_guard {
|
||||
compaction::compaction_reenabler compaction_guard;
|
||||
locator::effective_replication_map_ptr erm_guard;
|
||||
};
|
||||
|
||||
class tablet_storage_group_manager final : public storage_group_manager {
|
||||
replica::table& _t;
|
||||
locator::host_id _my_host_id;
|
||||
@@ -768,8 +776,15 @@ class tablet_storage_group_manager final : public storage_group_manager {
|
||||
locator::resize_decision::seq_number_t _split_ready_seq_number = std::numeric_limits<locator::resize_decision::seq_number_t>::min();
|
||||
future<> _merge_completion_fiber;
|
||||
condition_variable _merge_completion_event;
|
||||
// Ensures that processes such as incremental repair will wait for pending work from
|
||||
// merge fiber before proceeding. This guarantees stability on the compaction groups.
|
||||
// NOTE: it's important that we don't await on the barrier with any compaction group
|
||||
// gate held, since merge fiber will stop groups that in turn await on gate,
|
||||
// potentially causing an ABBA deadlock.
|
||||
utils::phased_barrier _merge_fiber_barrier;
|
||||
std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
|
||||
// Holds compaction reenabler which disables compaction temporarily during tablet merge
|
||||
std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
|
||||
std::vector<background_merge_guard> _compaction_reenablers_for_merging;
|
||||
private:
|
||||
const schema_ptr& schema() const {
|
||||
return _t.schema();
|
||||
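A toy sketch (not ScyllaDB's utils::phased_barrier) of the barrier idea in the comment above: background work registers an operation, retires it when done, and anyone who needs a stable view of the groups waits until all previously started operations complete. Names and the threading model are illustrative only; the real code is Seastar futures, not threads.

.. code-block:: cpp

   #include <condition_variable>
   #include <mutex>
   #include <thread>

   // Minimal phased-barrier-like gadget: start() marks pending background work,
   // finish() retires it, and await() blocks until no earlier work remains.
   class toy_barrier {
       std::mutex _m;
       std::condition_variable _cv;
       int _pending = 0;
   public:
       void start()  { std::lock_guard l(_m); ++_pending; }
       void finish() { { std::lock_guard l(_m); --_pending; } _cv.notify_all(); }
       void await()  { std::unique_lock l(_m); _cv.wait(l, [&] { return _pending == 0; }); }
   };

   int main() {
       toy_barrier b;
       b.start(); // the merge fiber has pending work
       std::thread merge_fiber([&] {
           // ... rearrange storage/compaction groups ...
           b.finish();
       });
       // Per the ABBA note above, the waiter must not hold any per-group
       // gate here: it waits for the merge fiber, while the merge fiber
       // may itself wait on those gates.
       b.await(); // groups are now stable
       merge_fiber.join();
   }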
@@ -793,7 +808,8 @@ private:
|
||||
// Called when coordinator executes tablet merge. Tablet ids X and X+1 are merged into
|
||||
// the new tablet id (X >> 1). In practice, that means storage groups for X and X+1
|
||||
// are merged into a new storage group with id (X >> 1).
|
||||
void handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
void handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
|
||||
// When merge completes, compaction groups of sibling tablets are added to same storage
|
||||
// group, but they're not merged yet into one, since the merge completion handler happens
|
||||
@@ -856,6 +872,7 @@ public:
|
||||
, _my_host_id(erm.get_token_metadata().get_my_id())
|
||||
, _tablet_map(&erm.get_token_metadata().tablets().get_tablet_map(schema()->id()))
|
||||
, _merge_completion_fiber(merge_completion_fiber())
|
||||
, _merge_fiber_barrier(format("[table {}.{}] merge_fiber_barrier", _t.schema()->ks_name(), _t.schema()->cf_name()))
|
||||
{
|
||||
storage_group_map ret;
|
||||
|
||||
@@ -886,7 +903,9 @@ public:
|
||||
std::exchange(_stop_fut, make_ready_future())).discard_result();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override;
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override;
|
||||
utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const override;
|
||||
@@ -900,7 +919,7 @@ public:
|
||||
return storage_group_for_id(storage_group_of(token).first);
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const override;
|
||||
locator::combined_load_stats table_load_stats() const override;
|
||||
bool all_storage_groups_split() override;
|
||||
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
|
||||
future<> maybe_split_compaction_group_of(size_t idx) override;
|
||||
@@ -908,6 +927,10 @@ public:
|
||||
dht::token_range get_token_range_after_split(const dht::token& token) const noexcept override {
|
||||
return tablet_map().get_token_range_after_split(token);
|
||||
}
|
||||
future<> wait_for_background_tablet_resize_work() override {
|
||||
co_await _merge_fiber_barrier.advance_and_await();
|
||||
co_return;
|
||||
}
|
||||
|
||||
lw_shared_ptr<sstables::sstable_set> make_sstable_set() const override {
|
||||
// FIXME: avoid recreation of compound_set for groups which had no change. usually, only one group will be changed at a time.
|
||||
@@ -2120,33 +2143,31 @@ compaction_group::update_repaired_at_for_merge() {
     });
 }

-future<std::vector<compaction::compaction_group_view*>> table::get_compaction_group_views_for_repair(dht::token_range range) {
-    std::vector<compaction::compaction_group_view*> ret;
-    auto sgs = storage_groups_for_token_range(range);
-    for (auto& sg : sgs) {
-        co_await coroutine::maybe_yield();
-        sg->for_each_compaction_group([&ret] (const compaction_group_ptr& cg) {
-            ret.push_back(&cg->view_for_unrepaired_data());
-        });
-    }
-    co_return ret;
-}
-
 future<compaction_reenablers_and_lock_holders> table::get_compaction_reenablers_and_lock_holders_for_repair(replica::database& db,
         const service::frozen_topology_guard& guard, dht::token_range range) {
     auto ret = compaction_reenablers_and_lock_holders();
-    auto views = co_await get_compaction_group_views_for_repair(range);
-    for (auto view : views) {
-        auto cre = co_await db.get_compaction_manager().await_and_disable_compaction(*view);
+    // Waits for background tablet resize work like merge that might destroy compaction groups,
+    // providing stability. Essentially, serializes tablet merge completion handling with
+    // the start of incremental repair, from the replica side.
+    co_await _sg_manager->wait_for_background_tablet_resize_work();
+
+    for (auto sg : storage_groups_for_token_range(range)) {
+        // FIXME: indentation
+        auto cgs = sg->compaction_groups_immediate();
+        for (auto& cg : cgs) {
+        auto gate_holder = cg->async_gate().hold();
+        auto& view = cg->view_for_unrepaired_data();
+        auto cre = co_await db.get_compaction_manager().await_and_disable_compaction(view);
         tlogger.info("Disabled compaction for range={} session_id={} for incremental repair", range, guard);
         ret.cres.push_back(std::make_unique<compaction::compaction_reenabler>(std::move(cre)));

         // This lock prevents the unrepaired compaction started by major compaction to run in parallel with repair.
         // The unrepaired compaction started by minor compaction does not need to take the lock since it ignores
         // sstables being repaired, so it can run in parallel with repair.
-        auto lock_holder = co_await db.get_compaction_manager().get_incremental_repair_write_lock(*view, "row_level_repair");
+        auto lock_holder = co_await db.get_compaction_manager().get_incremental_repair_write_lock(view, "row_level_repair");
         tlogger.info("Got unrepaired compaction and repair lock for range={} session_id={} for incremental repair", range, guard);
         ret.lock_holders.push_back(std::move(lock_holder));
+        }
     }
     co_return ret;
 }
|
||||
});
|
||||
}
|
||||
|
||||
locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
|
||||
// The following functions return true if we should return the tablet size of a tablet in
|
||||
// migration depending on its transition stage and whether it is a leaving or pending replica
|
||||
bool has_size_on_leaving (locator::tablet_transition_stage stage) {
|
||||
switch (stage) {
|
||||
case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::streaming: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::use_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::rebuild_repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_repair:
|
||||
return true;
|
||||
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_migration:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool has_size_on_pending (locator::tablet_transition_stage stage) {
|
||||
switch (stage) {
|
||||
case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::streaming: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::rebuild_repair:
|
||||
return false;
|
||||
case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::use_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_repair:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
locator::combined_load_stats tablet_storage_group_manager::table_load_stats() const {
|
||||
locator::table_load_stats table_stats;
|
||||
table_stats.split_ready_seq_number = _split_ready_seq_number;
|
||||
|
||||
locator::tablet_load_stats tablet_stats;
|
||||
|
||||
for_each_storage_group([&] (size_t id, storage_group& sg) {
|
||||
locator::global_tablet_id gid { _t.schema()->id(), locator::tablet_id(id) };
|
||||
if (tablet_filter(*_tablet_map, gid)) {
|
||||
const uint64_t tablet_size = sg.live_disk_space_used();
|
||||
auto tid = locator::tablet_id(id);
|
||||
locator::global_tablet_id gid { _t.schema()->id(), tid };
|
||||
locator::tablet_replica me { _my_host_id, this_shard_id() };
|
||||
const uint64_t tablet_size = sg.live_disk_space_used();
|
||||
|
||||
auto transition = _tablet_map->get_tablet_transition_info(tid);
|
||||
auto& info = _tablet_map->get_tablet_info(tid);
|
||||
bool is_pending = transition && transition->pending_replica == me;
|
||||
bool is_leaving = transition && locator::get_leaving_replica(info, *transition) == me;
|
||||
|
||||
// It's important to tackle the anomaly in reported size, since both leaving and
|
||||
// pending replicas could otherwise be accounted during tablet migration.
|
||||
// If transition hasn't reached write_both_read_new stage, then leaving replicas are accounted.
|
||||
// Otherwise, pending replicas are accounted.
|
||||
// This helps to reduce the discrepancy window.
|
||||
auto table_size_filter = [&] () {
|
||||
// if tablet is not in transit, it's filtered in.
|
||||
if (!transition) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto s = transition->reads; // read selector
|
||||
|
||||
return (!is_pending && !is_leaving)
|
||||
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|
||||
|| (is_pending && s == locator::read_replica_set_selector::next);
|
||||
};
|
||||
|
||||
// When a tablet is in migration, we want to send its size during any migration stage when
|
||||
// we still know the tablet's size. This way the balancer will have better information about
|
||||
// tablet sizes, and we reduce the chance that the node will be ignored during balancing
|
||||
// due to missing tablet size. On the leaving replica we include tablets until the use_new
|
||||
// stage (inclusive), and on the pending we include tablets after the streaming stage.
|
||||
// There is an overlap in tablet sizes (we report sizes on both the leaving and pending
|
||||
// replicas for some stages), but that should not be a problem.
|
||||
auto tablet_size_filter = [&] () {
|
||||
// if tablet is not in transit, it's filtered in.
|
||||
if (!transition) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (is_leaving) {
|
||||
return has_size_on_leaving(transition->stage);
|
||||
} else if (is_pending) {
|
||||
return has_size_on_pending(transition->stage);
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
if (table_size_filter()) {
|
||||
table_stats.size_in_bytes += tablet_size;
|
||||
}
|
||||
|
||||
if (tablet_size_filter()) {
|
||||
const dht::token_range trange = _tablet_map->get_token_range(gid.tablet);
|
||||
// Make sure the token range is in the form (a, b]
|
||||
SCYLLA_ASSERT(!trange.start()->is_inclusive() && trange.end()->is_inclusive());
|
||||
@@ -2945,8 +3057,8 @@ locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std:
|
||||
};
|
||||
}
|
||||
|
||||
locator::combined_load_stats table::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
|
||||
return _sg_manager->table_load_stats(std::move(tablet_filter));
|
||||
locator::combined_load_stats table::table_load_stats() const {
|
||||
return _sg_manager->table_load_stats();
|
||||
}
|
||||
|
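A compact standalone model (not ScyllaDB source) of the double-counting rule described in the comments above: during a migration exactly one of the leaving/pending replicas contributes to the table size, chosen by which side currently serves reads. The enum and names are illustrative.

.. code-block:: cpp

   #include <cassert>

   // Which replica set serves reads during a tablet migration.
   enum class read_selector { previous, next };

   // Model of table_size_filter(): a replica in transition counts the tablet's
   // size only if it is on the side currently selected for reads, so the
   // leaving and pending replicas never both contribute.
   bool counts_for_table_size(bool in_transition, bool is_leaving, bool is_pending,
                              read_selector reads) {
       if (!in_transition) {
           return true; // not migrating: the single owner counts it
       }
       return (!is_leaving && !is_pending)
           || (is_leaving && reads == read_selector::previous)
           || (is_pending && reads == read_selector::next);
   }

   int main() {
       // Before write_both_read_new: reads come from the previous set,
       // so only the leaving replica reports the size.
       assert( counts_for_table_size(true, true,  false, read_selector::previous));
       assert(!counts_for_table_size(true, false, true,  read_selector::previous));

       // After the switch: only the pending replica reports it.
       assert(!counts_for_table_size(true, true,  false, read_selector::next));
       assert( counts_for_table_size(true, false, true,  read_selector::next));
   }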
@@ -3018,7 +3130,7 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {

     while (!_t.async_gate().is_closed()) {
         try {
-            co_await utils::get_local_injector().inject("merge_completion_fiber", utils::wait_for_message(60s));
+            co_await utils::get_local_injector().inject("merge_completion_fiber", utils::wait_for_message(5min));
             auto ks_name = schema()->ks_name();
             auto cf_name = schema()->cf_name();
             // Enable compaction after merge is done.
@@ -3052,12 +3164,15 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
             utils::get_local_injector().inject("replica_merge_completion_wait", [] () {
                 tlogger.info("Merge completion fiber finished, about to sleep");
             });
+            _pending_merge_fiber_work.reset();
             co_await _merge_completion_event.wait();
             tlogger.debug("Merge completion fiber woke up for {}.{}", schema()->ks_name(), schema()->cf_name());
         }
     }
 }

-void tablet_storage_group_manager::handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
+void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
+        const locator::tablet_map& old_tmap,
+        const locator::tablet_map& new_tmap) {
     auto table_id = schema()->id();
     size_t old_tablet_count = old_tmap.tablet_count();
     size_t new_tablet_count = new_tmap.tablet_count();
@@ -3081,7 +3196,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
         auto new_cg = make_lw_shared<compaction_group>(_t, new_tid, new_range, make_repair_sstable_classifier_func());
         for (auto& view : new_cg->all_views()) {
             auto cre = _t.get_compaction_manager().stop_and_disable_compaction_no_wait(*view, "tablet merging");
-            _compaction_reenablers_for_merging.push_back(std::move(cre));
+            _compaction_reenablers_for_merging.push_back(background_merge_guard{std::move(cre), old_erm});
         }
         auto new_sg = make_lw_shared<storage_group>(std::move(new_cg));

@@ -3110,10 +3225,15 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
         new_storage_groups[new_tid] = std::move(new_sg);
     }
     _storage_groups = std::move(new_storage_groups);
+    _pending_merge_fiber_work = _merge_fiber_barrier.start();
     _merge_completion_event.signal();
 }

-void tablet_storage_group_manager::update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) {
+void tablet_storage_group_manager::update_effective_replication_map(
+        const locator::effective_replication_map_ptr& old_erm,
+        const locator::effective_replication_map& erm,
+        noncopyable_function<void()> refresh_mutation_source)
+{
     auto* new_tablet_map = &erm.get_token_metadata().tablets().get_tablet_map(schema()->id());
     auto* old_tablet_map = std::exchange(_tablet_map, new_tablet_map);

@@ -3126,7 +3246,10 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
     } else if (new_tablet_count < old_tablet_count) {
         tlogger.info0("Detected tablet merge for table {}.{}, decreasing from {} to {} tablets",
                 schema()->ks_name(), schema()->cf_name(), old_tablet_count, new_tablet_count);
-        handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
+        if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
+            utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
+        }
+        handle_tablet_merge_completion(old_erm, *old_tablet_map, *new_tablet_map);
     }

     // Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
@@ -3212,7 +3335,7 @@ void table::update_effective_replication_map(locator::effective_replication_map_
     };

     if (uses_tablets()) {
-        _sg_manager->update_effective_replication_map(*_erm, refresh_mutation_source);
+        _sg_manager->update_effective_replication_map(old_erm, *_erm, refresh_mutation_source);
     }
     if (old_erm) {
         old_erm->invalidate();
@@ -3674,7 +3797,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
     tlogger.debug("Taking snapshot of {}.{}: name={}", s->ks_name(), s->cf_name(), name);

     std::vector<snapshot_sstable_set> sstable_sets(smp::count);
-    std::vector<int64_t> tablet_counts(smp::count);

     co_await writer->init();
     co_await smp::invoke_on_all([&] -> future<> {
@@ -3682,7 +3804,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
         auto [tables, permit] = co_await t.snapshot_sstables();
         auto sstables_metadata = co_await t.get_sstables_manager().take_snapshot(std::move(tables), name);
         sstable_sets[this_shard_id()] = make_foreign(std::make_unique<utils::chunked_vector<sstables::sstable_snapshot_metadata>>(std::move(sstables_metadata)));
-        tablet_counts[this_shard_id()] = t.calculate_tablet_count();
     });
     co_await writer->sync();

@@ -3696,12 +3817,13 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
     });
     tlogger.debug("snapshot {}: seal_snapshot", name);
     const auto& topology = sharded_db.local().get_token_metadata().get_topology();
-    std::optional<int64_t> min_tablet_count;
+    std::optional<int64_t> tablet_count;
     if (t.uses_tablets()) {
-        SCYLLA_ASSERT(!tablet_counts.empty());
-        min_tablet_count = *std::ranges::min_element(tablet_counts);
+        auto erm = t.get_effective_replication_map();
+        auto& tm = erm->get_token_metadata().tablets().get_tablet_map(s->id());
+        tablet_count = tm.tablet_count();
     }
-    co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, min_tablet_count).handle_exception([&] (std::exception_ptr ptr) {
+    co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, tablet_count).handle_exception([&] (std::exception_ptr ptr) {
         tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr);
         ex = std::move(ptr);
     });
@@ -3759,6 +3881,7 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
     }

     auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
+    auto close_lister = deferred_close(lister);
     while (auto de = lister.get().get()) {
         auto snapshot_name = de->name;
         all_snapshots.emplace(snapshot_name, snapshot_details());
@@ -3766,6 +3889,9 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
             auto& sd = all_snapshots.at(snapshot_name);
             sd.total += details.total;
             sd.live += details.live;
+            utils::get_local_injector().inject("get_snapshot_details", [&] (auto& handler) -> future<> {
+                throw std::runtime_error("Injected exception in get_snapshot_details");
+            }).get();
         }
     }
     return all_snapshots;
@@ -3785,53 +3911,66 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
     }

     auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
-    while (auto de = co_await lister.get()) {
-        const auto& name = de->name;
-        future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
-        auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
-        auto size = sd.allocated_size;
+    std::exception_ptr ex;
+    try {
+        while (auto de = co_await lister.get()) {
+            const auto& name = de->name;
+            future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
+            auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
+            auto size = sd.allocated_size;

-        // The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
-        //
-        // All the others should just generate an exception: there is something wrong, so don't blindly
-        // add it to the size.
-        if (name != "manifest.json" && name != "schema.cql") {
-            details.total += size;
-            if (sd.number_of_links == 1) {
-                // File exists only in the snapshot directory.
-                details.live += size;
-                continue;
-            }
-            // If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
-            // So check the datadir for the file too.
-        } else {
-            continue;
-        }
+            utils::get_local_injector().inject("per-snapshot-get_snapshot_details", [&] (auto& handler) -> future<> {
+                throw std::runtime_error("Injected exception in per-snapshot-get_snapshot_details");
+            }).get();
+
+            // The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
+            //
+            // All the others should just generate an exception: there is something wrong, so don't blindly
+            // add it to the size.
+            if (name != "manifest.json" && name != "schema.cql") {
+                details.total += size;
+                if (sd.number_of_links == 1) {
+                    // File exists only in the snapshot directory.
+                    details.live += size;
+                    continue;
+                }
+                // If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
+                // So check the datadir for the file too.
+            } else {
+                continue;
+            }

-        auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
-            try {
-                // File exists in the main SSTable directory. Snapshots are not contributing to size
-                auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
-                // File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
-                if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
-                    dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
-                            (path / name).native(), psd.device_id, psd.inode_number, psd.size,
-                            (snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
-                    co_return false;
-                }
-                co_return true;
-            } catch (std::system_error& e) {
-                if (e.code() != std::error_code(ENOENT, std::system_category())) {
-                    throw;
-                }
-                co_return false;
-            }
-        };
-        // Check staging dir first, as files might be moved from there to the datadir concurrently to this check
-        if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
-            !co_await exists_in_dir(data_directory, datadir, name)) {
-            details.live += size;
-        }
+            auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
+                try {
+                    // File exists in the main SSTable directory. Snapshots are not contributing to size
+                    auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
+                    // File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
+                    if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
+                        dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
+                                (path / name).native(), psd.device_id, psd.inode_number, psd.size,
+                                (snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
+                        co_return false;
+                    }
+                    co_return true;
+                } catch (std::system_error& e) {
+                    if (e.code() != std::error_code(ENOENT, std::system_category())) {
+                        throw;
+                    }
+                    co_return false;
+                }
+            };
+            // Check staging dir first, as files might be moved from there to the datadir concurrently to this check
+            if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
+                !co_await exists_in_dir(data_directory, datadir, name)) {
+                details.live += size;
+            }
+        }
+    } catch (...) {
+        ex = std::current_exception();
+    }
+    co_await lister.close();
+    if (ex) {
+        co_await coroutine::return_exception_ptr(std::move(ex));
+    }

     co_return details;
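The live-size accounting above hinges on POSIX hardlink identity: two directory entries refer to the same file exactly when they share a device id and an inode number, and a snapshot file with a link count of 1 exists nowhere else. A minimal standalone sketch (not ScyllaDB source) of that check:

.. code-block:: cpp

   #include <sys/stat.h>
   #include <cstdio>

   // Returns true if the two paths refer to the same underlying file,
   // i.e. they are hardlinks of each other (same device and inode).
   static bool same_file(const char* a, const char* b) {
       struct stat sa{}, sb{};
       if (stat(a, &sa) != 0 || stat(b, &sb) != 0) {
           return false; // missing file: treat as "not the same"
       }
       return sa.st_dev == sb.st_dev && sa.st_ino == sb.st_ino;
   }

   int main(int argc, char** argv) {
       if (argc != 3) {
           std::fprintf(stderr, "usage: %s <file-a> <file-b>\n", argv[0]);
           return 2;
       }
       std::printf("%s\n", same_file(argv[1], argv[2]) ? "hardlinked" : "distinct");
   }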
@@ -263,8 +263,9 @@ public:
|
||||
void enable_schema_commitlog() {
|
||||
_static_props.enable_schema_commitlog();
|
||||
}
|
||||
void set_is_group0_table(bool enabled = true) {
|
||||
_static_props.is_group0_table = enabled;
|
||||
void set_is_group0_table() {
|
||||
_static_props.is_group0_table = true;
|
||||
enable_schema_commitlog();
|
||||
}
|
||||
|
||||
class default_names {
|
||||
|
||||
@@ -227,6 +227,8 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
|
||||
static const std::unordered_set<auth::resource> vector_search_system_resources = {
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_STREAMS),
|
||||
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_TIMESTAMPS),
|
||||
};
|
||||
|
||||
if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||
|
||||
|
||||
@@ -454,7 +454,7 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
|
||||
auto ps_ptr = qp.get_prepared(cache_key);
|
||||
if (!ps_ptr) {
|
||||
const auto msg_ptr = co_await qp.prepare(req, qs, cql3::internal_dialect());
|
||||
ps_ptr = std::move(msg_ptr->get_prepared());
|
||||
ps_ptr = msg_ptr->get_prepared();
|
||||
if (!ps_ptr) {
|
||||
on_internal_error(paxos_state::logger, "prepared statement is null");
|
||||
}
|
||||
|
||||
@@ -72,7 +72,7 @@ void group0_state_id_handler::refresh() {
|
||||
const auto min_state_id = std::ranges::min(group0_members_state_ids, [](auto a, auto b) {
|
||||
if (!a || !b) {
|
||||
// This should never happen, but if it does, it's a bug.
|
||||
on_fatal_internal_error(slogger, "unexpected empty state_id");
|
||||
on_internal_error(slogger, "unexpected empty state_id");
|
||||
}
|
||||
return utils::timeuuid_tri_compare(a, b) < 0;
|
||||
});
|
||||
|
||||
@@ -350,6 +350,10 @@ static void ensure_group0_schema(const group0_command& cmd, const replica::datab
|
||||
if (!schema->static_props().is_group0_table) {
|
||||
on_internal_error(slogger, fmt::format("ensure_group0_schema: schema is not group0: {}", schema->cf_name()));
|
||||
}
|
||||
|
||||
if (!schema->static_props().use_schema_commitlog) {
|
||||
on_internal_error(slogger, fmt::format("ensure_group0_schema: group0 table {} does not use schema commitlog", schema->cf_name()));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
@@ -149,19 +149,31 @@ public:
         const auto& node = nodes_info.at(voter_id);

         if (node.is_alive) {
-            SCYLLA_ASSERT(_alive_nodes_remaining > 0);
+            if (_alive_nodes_remaining == 0) {
+                on_internal_error(rvlogger,
+                        format("rack_info: no alive nodes remaining, but node {} is alive", voter_id));
+            }
             --_alive_nodes_remaining;
             if (node.is_leader) {
-                SCYLLA_ASSERT(_owns_alive_leader);
+                if (!_owns_alive_leader) {
+                    on_internal_error(rvlogger,
+                            format("rack_info: rack doesn't own a live leader, but leader {} is alive", voter_id));
+                }
                 _owns_alive_leader = false;
             }
         }
         if (node.is_voter) {
             if (node.is_alive) {
-                SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
+                if (_existing_alive_voters_remaining == 0) {
+                    on_internal_error(rvlogger,
+                            format("rack_info: no live voters remaining, but voter {} is alive", voter_id));
+                }
                 --_existing_alive_voters_remaining;
             } else {
-                SCYLLA_ASSERT(_existing_dead_voters_remaining > 0);
+                if (_existing_dead_voters_remaining == 0) {
+                    on_internal_error(rvlogger,
+                            format("rack_info: no dead voters remaining, but voter {} is dead", voter_id));
+                }
                 --_existing_dead_voters_remaining;
             }
         }
@@ -279,16 +291,25 @@ public:

         if (node.is_alive) {
             if (node.is_voter) {
-                SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
+                if (_existing_alive_voters_remaining == 0) {
+                    on_internal_error(rvlogger,
+                            format("datacenter_info: no live voters remaining, but voter {} is alive", *voter_id));
+                }
                 --_existing_alive_voters_remaining;
             }
             if (node.is_leader) {
-                SCYLLA_ASSERT(_owns_alive_leader);
+                if (!_owns_alive_leader) {
+                    on_internal_error(rvlogger,
+                            format("datacenter_info: DC doesn't own a live leader, but leader {} is alive", *voter_id));
+                }
                 _owns_alive_leader = false;
             }
         }

-        SCYLLA_ASSERT(_nodes_remaining > 0);
+        if (_nodes_remaining == 0) {
+            on_internal_error(rvlogger,
+                    format("datacenter_info: no nodes remaining, but voter {} belongs to this DC", *voter_id));
+        }

         --_nodes_remaining;
         ++_assigned_voters_count;
@@ -123,12 +123,7 @@ utils::small_vector<locator::host_id, N> addr_vector_to_id(const gms::gossiper&
 // Check the effective replication map consistency:
 // we have an inconsistent effective replication map in case the number of
 // read replicas is higher than the replication factor.
-void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
-    // Skip for non-debug builds.
-    if constexpr (!tools::build_info::is_debug_build()) {
-        return;
-    }
-
+[[maybe_unused]] void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
     const sstring error = erm.get_replication_strategy().sanity_check_read_replicas(erm, read_replicas);
     if (!error.empty()) {
         on_internal_error(slogger, error);

@@ -4291,7 +4286,7 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
 public:
     context(storage_proxy & p, utils::chunked_vector<mutation>&& mutations, lw_shared_ptr<cdc::operation_result_tracker>&& cdc_tracker, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, coordinator_mutate_options options)
         : _p(p)
-        , _schema(_p.local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2))
+        , _schema(_p.local_db().find_schema(db::system_keyspace::NAME, _p.features().batchlog_v2 ? db::system_keyspace::BATCHLOG_V2 : db::system_keyspace::BATCHLOG))
         , _ermp(_p.local_db().find_column_family(_schema->id()).get_effective_replication_map())
         , _mutations(std::move(mutations))
         , _cdc_tracker(std::move(cdc_tracker))

@@ -6972,7 +6967,12 @@ host_id_vector_replica_set storage_proxy::get_endpoints_for_reading(const schema
         return host_id_vector_replica_set{my_host_id(erm)};
     }
     auto endpoints = erm.get_replicas_for_reading(token);
-    validate_read_replicas(erm, endpoints);
+    // Skip for non-debug builds and maintenance mode.
+    if constexpr (tools::build_info::is_debug_build()) {
+        if (!_db.local().get_config().maintenance_mode()) {
+            validate_read_replicas(erm, endpoints);
+        }
+    }
     auto it = std::ranges::remove_if(endpoints, std::not_fn(std::bind_front(&storage_proxy::is_alive, this, std::cref(erm)))).begin();
     endpoints.erase(it, endpoints.end());
     sort_endpoints_by_proximity(erm, endpoints);
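A tiny standalone illustration (not ScyllaDB source) of the if constexpr gating moved to the call site above: when the compile-time predicate is false, the guarded validation is discarded at compile time, so release builds pay nothing for it. The NDEBUG-based predicate is a stand-in for tools::build_info::is_debug_build().

.. code-block:: cpp

   #include <cstdio>

   // Stand-in for tools::build_info::is_debug_build().
   constexpr bool is_debug_build() {
   #ifdef NDEBUG
       return false;
   #else
       return true;
   #endif
   }

   static void expensive_validation() {
       std::puts("validating read replicas...");
   }

   int main() {
       // The branch is resolved at compile time; in release builds (NDEBUG)
       // the call below is not emitted at all.
       if constexpr (is_debug_build()) {
           expensive_validation();
       }
   }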
||||
@@ -532,9 +532,16 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet
co_await when_all_succeed(sys_ks_futures.begin(), sys_ks_futures.end()).discard_result();
}

static std::unordered_set<locator::host_id> get_released_nodes(const service::topology& topology, const locator::token_metadata& tm) {
return boost::join(topology.left_nodes, topology.ignored_nodes)
| std::views::transform([] (const auto& raft_id) { return locator::host_id(raft_id.uuid()); })
| std::views::filter([&] (const auto& h) { return !tm.get_topology().has_node(h); })
| std::ranges::to<std::unordered_set<locator::host_id>>();
}

// Synchronizes the local node state (token_metadata, system.peers/system.local tables,
// gossiper) to align it with the other raft topology nodes.
future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal) {
future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal, std::optional<std::unordered_set<locator::host_id>> prev_released) {
nodes_to_notify_after_sync nodes_to_notify;

rtlogger.trace("Start sync_raft_topology_nodes");
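get_released_nodes() above is a single ranges pipeline ending in std::ranges::to. A runnable C++23 sketch of the same shape, with plain ints standing in for raft server IDs and host IDs (the real code concatenates the two input sets with boost::join; here the second set is folded in with insert_range, which is equivalent for set semantics):

#include <iostream>
#include <ranges>
#include <unordered_set>
#include <vector>

int main() {
    std::vector<int> left_nodes{1, 2, 3};
    std::vector<int> ignored_nodes{3, 4};
    std::unordered_set<int> still_in_topology{2};

    auto released_filter = [&](int id) { return !still_in_topology.contains(id); };
    // filter + collect, mirroring the transform/filter/to chain above
    auto released = left_nodes
                  | std::views::filter(released_filter)
                  | std::ranges::to<std::unordered_set>();
    released.insert_range(ignored_nodes | std::views::filter(released_filter));

    for (int id : released) {
        std::cout << id << ' ';  // 1, 3 and 4, in some order
    }
    std::cout << '\n';
}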
@@ -625,7 +632,9 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
co_await update_topology_change_info(tmptr, ::format("{} {}/{}", rs.state, id, ip));
break;
case node_state::replacing: {
SCYLLA_ASSERT(_topology_state_machine._topology.req_param.contains(id));
if (!_topology_state_machine._topology.req_param.contains(id)) {
on_internal_error(rtlogger, format("No request parameters for replacing node {}", id));
}
auto replaced_id = std::get<replace_param>(_topology_state_machine._topology.req_param[id]).replaced_id;
auto existing_ip = _address_map.find(locator::host_id{replaced_id.uuid()});
const auto replaced_host_id = locator::host_id(replaced_id.uuid());
@@ -642,7 +651,7 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
co_await process_normal_node(id, host_id, ip, rs);
break;
default:
on_fatal_internal_error(rtlogger, ::format("Unexpected state {} for node {}", rs.state, id));
on_internal_error(rtlogger, ::format("Unexpected state {} for node {}", rs.state, id));
}
};

@@ -688,13 +697,10 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
}
}

auto nodes_to_release = t.left_nodes;
nodes_to_release.insert(t.ignored_nodes.begin(), t.ignored_nodes.end());
for (const auto& id: nodes_to_release) {
auto host_id = locator::host_id(id.uuid());
if (!tmptr->get_topology().find_node(host_id)) {
nodes_to_notify.released.push_back(host_id);
}
if (prev_released) {
auto nodes_to_release = get_released_nodes(t, *tmptr);
std::erase_if(nodes_to_release, [&] (const auto& host_id) { return prev_released->contains(host_id); });
std::copy(nodes_to_release.begin(), nodes_to_release.end(), std::back_inserter(nodes_to_notify.released));
}

co_await when_all_succeed(sys_ks_futures.begin(), sys_ks_futures.end()).discard_result();
@@ -732,6 +738,10 @@ future<> storage_service::topology_state_load(state_change_hint hint) {

rtlogger.debug("reload raft topology state");
std::unordered_set<raft::server_id> prev_normal = _topology_state_machine._topology.normal_nodes | std::views::keys | std::ranges::to<std::unordered_set>();
std::optional<std::unordered_set<locator::host_id>> prev_released;
if (!_topology_state_machine._topology.is_empty()) {
prev_released = get_released_nodes(_topology_state_machine._topology, get_token_metadata());
}

std::unordered_set<locator::host_id> tablet_hosts = co_await replica::read_required_hosts(_qp);

@@ -832,7 +842,7 @@ future<> storage_service::topology_state_load(state_change_hint hint) {
}, topology.tstate);
tmptr->set_read_new(read_new);

auto nodes_to_notify = co_await sync_raft_topology_nodes(tmptr, std::move(prev_normal));
auto nodes_to_notify = co_await sync_raft_topology_nodes(tmptr, std::move(prev_normal), std::move(prev_released));

std::optional<locator::tablet_metadata> tablets;
if (hint.tablets_hint) {
@@ -6146,6 +6156,57 @@ future<> storage_service::snitch_reconfigured() {
}
}

future<> storage_service::local_topology_barrier() {
if (this_shard_id() != 0) {
co_await container().invoke_on(0, [] (storage_service& ss) {
return ss.local_topology_barrier();
});
co_return;
}

auto version = _topology_state_machine._topology.version;

utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
});

co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
for (auto& n : _topology_state_machine._topology.transition_nodes) {
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
break;
}
}
}

co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
const auto current_version = ss._shared_token_metadata.get()->get_version();
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
version, current_version);

// This shouldn't happen under normal operation; it's only plausible
// if the topology change coordinator has
// moved to another node and managed to update the topology
// in parallel with this method. The previous coordinator
// should be inactive now, so it won't observe this
// exception. By returning an exception we aim
// to reveal any other conditions where this may arise.
if (current_version != version) {
co_await coroutine::return_exception(std::runtime_error(
::format("raft topology: command::barrier_and_drain, the version has changed, "
"version {}, current_version {}, the topology change coordinator "
" had probably migrated to another node",
version, current_version)));
}

co_await ss._shared_token_metadata.stale_versions_in_use();
co_await get_topology_session_manager().drain_closing_sessions();

rtlogger.info("raft_topology_cmd::barrier_and_drain done");
});
}
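The version check inside local_topology_barrier() is an optimistic-concurrency fence: capture the version before any suspension point, re-validate after resuming, and fail loudly rather than drain against a newer topology. A minimal single-threaded sketch of that pattern (types and names here are illustrative, not Scylla's):

#include <cstdint>
#include <iostream>
#include <stdexcept>

struct topology_state {
    uint64_t version = 7;
};

void barrier_and_drain(const topology_state& t, uint64_t expected) {
    // In the real code, co_awaits may run between capturing `expected`
    // and this check; a concurrent coordinator can bump the version.
    if (t.version != expected) {
        throw std::runtime_error("version changed; coordinator likely migrated");
    }
    // ... drain stale versions here ...
}

int main() {
    topology_state topo;
    const auto captured = topo.version;  // captured before any yield
    topo.version++;                      // simulate a concurrent update
    try {
        barrier_and_drain(topo, captured);
    } catch (const std::exception& e) {
        std::cout << "barrier failed: " << e.what() << '\n';
    }
}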
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, const raft_topology_cmd& cmd) {
raft_topology_cmd_result result;
rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
@@ -6173,12 +6234,6 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
state.last_index = cmd_index;
}

// We capture the topology version right after the checks
// above, before any yields. This is crucial since _topology_state_machine._topology
// might be altered concurrently while this method is running,
// which can cause the fence command to apply an invalid fence version.
const auto version = _topology_state_machine._topology.version;

switch (cmd.cmd) {
case raft_topology_cmd::command::barrier: {
utils::get_local_injector().inject("raft_topology_barrier_fail",
@@ -6217,43 +6272,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
}
break;
case raft_topology_cmd::command::barrier_and_drain: {
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
});
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
for (auto& n : _topology_state_machine._topology.transition_nodes) {
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
break;
}
}
}
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
const auto current_version = ss._shared_token_metadata.get()->get_version();
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
version, current_version);

// This shouldn't happen under normal operation; it's only plausible
// if the topology change coordinator has
// moved to another node and managed to update the topology
// in parallel with this method. The previous coordinator
// should be inactive now, so it won't observe this
// exception. By returning an exception we aim
// to reveal any other conditions where this may arise.
if (current_version != version) {
co_await coroutine::return_exception(std::runtime_error(
::format("raft topology: command::barrier_and_drain, the version has changed, "
"version {}, current_version {}, the topology change coordinator "
" had probably migrated to another node",
version, current_version)));
}

co_await ss._shared_token_metadata.stale_versions_in_use();
co_await get_topology_session_manager().drain_closing_sessions();

rtlogger.info("raft_topology_cmd::barrier_and_drain done");
});
co_await local_topology_barrier();

co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
auto ks = handler.get("keyspace");
@@ -6276,7 +6295,11 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
break;
case raft_topology_cmd::command::stream_ranges: {
co_await with_scheduling_group(_db.local().get_streaming_scheduling_group(), coroutine::lambda([&] () -> future<> {
const auto rs = _topology_state_machine._topology.find(id)->second;
const auto* server_rs = _topology_state_machine._topology.find(id);
if (!server_rs) {
on_internal_error(rtlogger, format("Got {} request for node {} not found in topology", cmd.cmd, id));
}
const auto rs = server_rs->second;
auto tstate = _topology_state_machine._topology.tstate;
auto session = _topology_state_machine._topology.session;
if (!rs.ring || rs.ring->tokens.empty()) {
@@ -7328,11 +7351,15 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(

const locator::host_id this_host = _db.local().get_token_metadata().get_my_id();

uint64_t sum_tablet_sizes = 0;
// Align to 64 bytes to avoid cache line ping-pong when updating size in map_reduce0() below
struct alignas(64) aligned_tablet_size {
uint64_t size = 0;
};
std::vector<aligned_tablet_size> tablet_sizes_per_shard(smp::count);

// Each node combines a per-table load map from all of its shards and returns it to the coordinator.
// So if there are 1k nodes, there will be 1k RPCs in total.
auto load_stats = co_await _db.map_reduce0([&table_ids, &this_host, &sum_tablet_sizes] (replica::database& db) -> future<locator::load_stats> {
auto load_stats = co_await _db.map_reduce0([&table_ids, &this_host, &tablet_sizes_per_shard] (replica::database& db) -> future<locator::load_stats> {
locator::load_stats load_stats{};
auto& tables_metadata = db.get_tables_metadata();
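The alignas(64) wrapper above exists to prevent false sharing: per-shard counters packed into one cache line would make concurrent increments invalidate each other's line. A standalone sketch with std::thread in place of Seastar shards (64 is an assumption about the target CPU's cache line size):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

struct alignas(64) aligned_counter {
    uint64_t value = 0;
};
static_assert(sizeof(aligned_counter) == 64);

int main() {
    const unsigned n = std::max(2u, std::thread::hardware_concurrency());
    std::vector<aligned_counter> per_thread(n);

    std::vector<std::thread> workers;
    for (unsigned i = 0; i < n; i++) {
        workers.emplace_back([&per_thread, i] {
            for (int k = 0; k < 1'000'000; k++) {
                per_thread[i].value++;  // each writer owns a full cache line
            }
        });
    }
    for (auto& w : workers) {
        w.join();
    }

    uint64_t total = 0;
    for (const auto& c : per_thread) {
        total += c.value;  // reduce once at the end, like the sum loop below
    }
    std::cout << total << '\n';
}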
@@ -7341,36 +7368,10 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
if (!table) {
continue;
}
auto erm = table->get_effective_replication_map();
auto& token_metadata = erm->get_token_metadata();
auto me = locator::tablet_replica { token_metadata.get_my_id(), this_shard_id() };

// It's important to tackle the anomaly in reported size, since both leaving and
// pending replicas could otherwise be accounted during tablet migration.
// If transition hasn't reached cleanup stage, then leaving replicas are accounted.
// If transition is past cleanup stage, then pending replicas are accounted.
// This helps to reduce the discrepancy window.
auto tablet_filter = [&me] (const locator::tablet_map& tmap, locator::global_tablet_id id) {
auto transition = tmap.get_tablet_transition_info(id.tablet);
auto& info = tmap.get_tablet_info(id.tablet);

// if tablet is not in transit, it's filtered in.
if (!transition) {
return true;
}

bool is_pending = transition->pending_replica == me;
bool is_leaving = locator::get_leaving_replica(info, *transition) == me;
auto s = transition->reads; // read selector

return (!is_pending && !is_leaving)
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|| (is_pending && s == locator::read_replica_set_selector::next);
};

locator::combined_load_stats combined_ls { table->table_load_stats(tablet_filter) };
locator::combined_load_stats combined_ls { table->table_load_stats() };
load_stats.tables.emplace(id, std::move(combined_ls.table_ls));
sum_tablet_sizes += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);
tablet_sizes_per_shard[this_shard_id()].size += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);

co_await coroutine::maybe_yield();
}
@@ -7389,6 +7390,10 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
if (config_capacity != 0) {
tls.effective_capacity = config_capacity;
} else {
uint64_t sum_tablet_sizes = 0;
for (const auto& ts : tablet_sizes_per_shard) {
sum_tablet_sizes += ts.size;
}
tls.effective_capacity = si.available + sum_tablet_sizes;
}
@@ -8431,6 +8436,7 @@ future<> storage_service::start_maintenance_mode() {
set_mode(mode::MAINTENANCE);

return mutate_token_metadata([this] (mutable_token_metadata_ptr token_metadata) -> future<> {
token_metadata->update_topology(my_host_id(), _snitch.local()->get_location(), locator::node::state::normal, smp::count);
return token_metadata->update_normal_tokens({ dht::token{} }, my_host_id());
}, acquire_merge_lock::yes);
}
@@ -8603,4 +8609,13 @@ future<> storage_service::query_cdc_streams(table_id table, noncopyable_function
return _cdc_gens.local().query_cdc_streams(table, std::move(f));
}

future<> storage_service::on_cleanup_for_drop_table(const table_id& id) {
co_await container().invoke_on_all([id] (storage_service& ss) {
if (ss._repair.local_is_initialized()) {
ss._repair.local().on_cleanup_for_drop_table(id);
}
});
co_return;
}

} // namespace service
@@ -617,6 +617,8 @@ public:
virtual void on_drop_function(const sstring& ks_name, const sstring& function_name) override {}
virtual void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}

future<> on_cleanup_for_drop_table(const table_id& id);
private:
std::optional<db::system_keyspace::peer_info> get_peer_info_for_update(locator::host_id endpoint);
// return an engaged value iff app_state_map has changes to the peer info
@@ -942,6 +944,9 @@ public:
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
future<> initialize_done_topology_upgrade_state();
// Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
// In particular, waits for non-latest local erms to die out.
future<> local_topology_barrier();
private:
// State machine that is responsible for topology change
topology_state_machine& _topology_state_machine;
@@ -1115,7 +1120,7 @@ private:
// gossiper) to align it with the other raft topology nodes.
// Optional target_node can be provided to restrict the synchronization to the specified node.
// Returns a structure that describes which notifications to trigger after token metadata is updated.
future<nodes_to_notify_after_sync> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal);
future<nodes_to_notify_after_sync> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal, std::optional<std::unordered_set<locator::host_id>> prev_released);
// Triggers notifications (on_joined, on_left) based on the recent changes to token metadata, as described by the passed in structure.
// This function should be called on the result of `sync_raft_topology_nodes`, after the global token metadata is updated.
future<> notify_nodes_after_sync(nodes_to_notify_after_sync&& nodes_to_notify);
@@ -90,14 +90,14 @@ load_balancer_stats_manager::load_balancer_stats_manager(sstring group_name):
setup_metrics(_cluster_stats);
}

load_balancer_dc_stats& load_balancer_stats_manager::for_dc(const dc_name& dc) {
const lw_shared_ptr<load_balancer_dc_stats>& load_balancer_stats_manager::for_dc(const dc_name& dc) {
auto it = _dc_stats.find(dc);
if (it == _dc_stats.end()) {
auto stats = std::make_unique<load_balancer_dc_stats>();
auto stats = make_lw_shared<load_balancer_dc_stats>();
setup_metrics(dc, *stats);
it = _dc_stats.emplace(dc, std::move(stats)).first;
}
return *it->second;
return it->second;
}

load_balancer_node_stats& load_balancer_stats_manager::for_node(const dc_name& dc, host_id node) {
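The signature change above is what lets the balancer cache the per-DC stats handle (_current_stats) for a whole plan-making pass instead of re-doing a map lookup on every counter bump. A sketch of the ownership shape, with std::shared_ptr standing in for seastar::lw_shared_ptr:

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct dc_stats {
    int calls = 0;
};

class stats_manager {
    std::unordered_map<std::string, std::shared_ptr<dc_stats>> _dc_stats;
public:
    // Returning the shared handle (not a plain reference) lets callers keep
    // the stats object alive and cache it across a long-running pass.
    const std::shared_ptr<dc_stats>& for_dc(const std::string& dc) {
        auto it = _dc_stats.find(dc);
        if (it == _dc_stats.end()) {
            it = _dc_stats.emplace(dc, std::make_shared<dc_stats>()).first;
        }
        return it->second;
    }
};

int main() {
    stats_manager stats;
    auto current = stats.for_dc("dc1");  // cached once per plan-making scope
    current->calls++;
    current->calls++;
    std::cout << stats.for_dc("dc1")->calls << '\n';  // prints 2
}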
@@ -149,22 +149,22 @@ db::tablet_options combine_tablet_options(R&& opts) {

static std::unordered_set<locator::tablet_id> split_string_to_tablet_id(std::string_view s, char delimiter) {
auto tokens_view = s | std::views::split(delimiter)
| std::views::transform([](auto&& range) {
return std::string_view(&*range.begin(), std::ranges::distance(range));
})
| std::views::transform([](std::string_view sv) {
return locator::tablet_id(std::stoul(std::string(sv)));
});
| std::views::transform([](auto&& range) {
return std::string_view(&*range.begin(), std::ranges::distance(range));
})
| std::views::transform([](std::string_view sv) {
return locator::tablet_id(std::stoul(std::string(sv)));
});
return std::unordered_set<locator::tablet_id>{tokens_view.begin(), tokens_view.end()};
}
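split_string_to_tablet_id() above (apparently a whitespace-only change in this hunk, since both versions read identically) is a split-then-parse pipeline. The same shape as a runnable C++23 sketch over plain unsigned longs:

#include <iostream>
#include <ranges>
#include <string>
#include <string_view>
#include <unordered_set>

int main() {
    std::string_view s = "1,5,9";
    auto ids = s
             | std::views::split(',')
             | std::views::transform([](auto&& token) {
                   // Each split token is a contiguous subrange of the input.
                   return std::stoul(std::string(std::string_view(token)));
               })
             | std::ranges::to<std::unordered_set<unsigned long>>();

    for (auto id : ids) {
        std::cout << id << ' ';  // 1, 5 and 9, in some order
    }
    std::cout << '\n';
}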
struct repair_plan {
locator::global_tablet_id gid;
locator::tablet_info tinfo;
dht::token_range range;
dht::token last_token;
db_clock::duration repair_time_diff;
bool is_user_reuqest;
locator::global_tablet_id gid;
locator::tablet_info tinfo;
dht::token_range range;
dht::token last_token;
db_clock::duration repair_time_diff;
bool is_user_reuqest;
};

// Used to compare different migration choices in regard to impact on load imbalance.
@@ -291,6 +291,12 @@ struct rack_list_colocation_state {
}
};

/// Formattable wrapper for migration_plan, whose formatter prints a short summary of the plan.
struct plan_summary {
migration_plan& plan;
explicit plan_summary(migration_plan& plan) : plan(plan) {}
};

future<rack_list_colocation_state> find_required_rack_list_colocations(
replica::database& db,
token_metadata_ptr tmptr,
@@ -452,7 +458,36 @@ struct fmt::formatter<service::repair_plan> : fmt::formatter<std::string_view> {
template <typename FormatContext>
auto format(const service::repair_plan& p, FormatContext& ctx) const {
auto diff_seconds = std::chrono::duration<float>(p.repair_time_diff).count();
fmt::format_to(ctx.out(), "{{tablet={} last_token={} is_user_req={} diff_seconds={}}}", p.gid, p.last_token, p.is_user_reuqest, diff_seconds);
fmt::format_to(ctx.out(), "{{tablet={} last_token={} is_user_req={} diff_seconds={}}}", p.gid, p.last_token, p.is_user_reuqest, diff_seconds);
return ctx.out();
}
};

template<>
struct fmt::formatter<service::plan_summary> : fmt::formatter<std::string_view> {
template <typename FormatContext>
auto format(const service::plan_summary& p, FormatContext& ctx) const {
auto& plan = p.plan;
std::string_view delim = "";
auto get_delim = [&] { return std::exchange(delim, ", "); };
if (plan.migrations().size()) {
fmt::format_to(ctx.out(), "{}migrations: {}", get_delim(), plan.migrations().size());
}
if (plan.repair_plan().repairs().size()) {
fmt::format_to(ctx.out(), "{}repairs: {}", get_delim(), plan.repair_plan().repairs().size());
}
if (plan.resize_plan().resize.size()) {
fmt::format_to(ctx.out(), "{}resize: {}", get_delim(), plan.resize_plan().resize.size());
}
if (plan.resize_plan().finalize_resize.size()) {
fmt::format_to(ctx.out(), "{}resize-ready: {}", get_delim(), plan.resize_plan().finalize_resize.size());
}
if (plan.rack_list_colocation_plan().size()) {
fmt::format_to(ctx.out(), "{}rack-list colocation ready: {}", get_delim(), plan.rack_list_colocation_plan().request_to_resume());
}
if (delim.empty()) {
fmt::format_to(ctx.out(), "empty");
}
return ctx.out();
}
};
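The plan_summary formatter's one subtlety is the std::exchange delimiter trick: the first printed field gets an empty separator, every later one gets ", ", and an untouched delimiter means the plan was empty. The same trick with std::format (C++20) instead of fmt, on a toy summary type:

#include <format>
#include <iostream>
#include <string_view>
#include <utility>

struct summary {
    int migrations = 0;
    int repairs = 0;
};

template <>
struct std::formatter<summary> : std::formatter<std::string_view> {
    auto format(const summary& s, std::format_context& ctx) const {
        auto out = ctx.out();
        std::string_view delim = "";
        // First call returns "", later calls return ", ".
        auto get_delim = [&] { return std::exchange(delim, ", "); };
        if (s.migrations) out = std::format_to(out, "{}migrations: {}", get_delim(), s.migrations);
        if (s.repairs)    out = std::format_to(out, "{}repairs: {}", get_delim(), s.repairs);
        if (delim.empty()) out = std::format_to(out, "empty");  // nothing was printed
        return out;
    }
};

int main() {
    std::cout << std::format("{}\n", summary{.migrations = 3, .repairs = 1});
    std::cout << std::format("{}\n", summary{});  // prints "empty"
}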
@@ -868,9 +903,12 @@ class load_balancer {
absl::flat_hash_map<table_id, uint64_t> _disk_used_per_table;
dc_name _dc;
std::optional<sstring> _rack; // Set when plan making is limited to a single rack.
sstring _location; // Name of the current scope of plan making. DC or DC+rack.
lw_shared_ptr<load_balancer_dc_stats> _current_stats; // Stats for current scope of plan making.
size_t _total_capacity_shards; // Total number of non-drained shards in the balanced node set.
size_t _total_capacity_nodes; // Total number of non-drained nodes in the balanced node set.
uint64_t _total_capacity_storage; // Total storage of non-drained nodes in the balanced node set.
size_t _migrating_candidates; // Number of candidate replicas skipped because tablet is migrating.
locator::load_stats_ptr _table_load_stats;
load_balancer_stats_manager& _stats;
std::unordered_set<host_id> _skiplist;
@@ -995,22 +1033,21 @@ public:
migration_plan plan;

auto rack_list_colocation = ongoing_rack_list_colocation();
if (!utils::get_local_injector().enter("tablet_migration_bypass")) {
// Prepare plans for each DC separately and combine them to be executed in parallel.
for (auto&& dc : topo.get_datacenters()) {
if (_db.get_config().rf_rack_valid_keyspaces() || _db.get_config().enforce_rack_list() || rack_list_colocation) {
for (auto rack : topo.get_datacenter_racks().at(dc) | std::views::keys) {
auto rack_plan = co_await make_plan(dc, rack);
auto level = rack_plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
lblogger.log(level, "Prepared {} migrations in rack {} in DC {}", rack_plan.size(), rack, dc);
plan.merge(std::move(rack_plan));
}
} else {
auto dc_plan = co_await make_plan(dc);
auto level = dc_plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
lblogger.log(level, "Prepared {} migrations in DC {}", dc_plan.size(), dc);
plan.merge(std::move(dc_plan));

// Prepare plans for each DC separately and combine them to be executed in parallel.
for (auto&& dc : topo.get_datacenters()) {
if (_db.get_config().rf_rack_valid_keyspaces() || _db.get_config().enforce_rack_list() || rack_list_colocation) {
for (auto rack : topo.get_datacenter_racks().at(dc) | std::views::keys) {
auto rack_plan = co_await make_plan(dc, rack);
auto level = rack_plan.empty() ? seastar::log_level::debug : seastar::log_level::info;
lblogger.log(level, "Plan for {}/{}: {}", dc, rack, plan_summary(rack_plan));
plan.merge(std::move(rack_plan));
}
} else {
auto dc_plan = co_await make_plan(dc);
auto level = dc_plan.empty() ? seastar::log_level::debug : seastar::log_level::info;
lblogger.log(level, "Plan for {}: {}", dc, plan_summary(dc_plan));
plan.merge(std::move(dc_plan));
}
}
@@ -1027,9 +1064,8 @@ public:
plan.set_repair_plan(co_await make_repair_plan(plan));
}

auto level = plan.size() > 0 ? seastar::log_level::info : seastar::log_level::debug;
lblogger.log(level, "Prepared {} migration plans, out of which there were {} tablet migration(s) and {} resize decision(s) and {} tablet repair(s) and {} rack-list colocation(s)",
plan.size(), plan.tablet_migration_count(), plan.resize_decision_count(), plan.tablet_repair_count(), plan.tablet_rack_list_colocation_count());
auto level = plan.empty() ? seastar::log_level::debug : seastar::log_level::info;
lblogger.log(level, "Prepared plan: {}", plan_summary(plan));
co_return std::move(plan);
}
@@ -1071,6 +1107,11 @@ public:
if (!is_auto_repair_enabled(config)) {
co_return false;
}
auto size = info.replicas.size();
if (size <= 1) {
lblogger.debug("Skipped auto repair for tablet={} replicas={}", gid, size);
co_return false;
}
auto threshold = _db.get_config().auto_repair_threshold_default_in_seconds();
auto repair_time_threshold = std::chrono::seconds(threshold);
auto& last_repair_time = info.repair_time;
@@ -1408,7 +1449,7 @@ public:
co_return all_colocated;
}

future<migration_plan> make_merge_colocation_plan(const dc_name& dc, node_load_map& nodes) {
future<migration_plan> make_merge_colocation_plan(node_load_map& nodes) {
migration_plan plan;
table_resize_plan resize_plan;

@@ -1565,7 +1606,7 @@ public:
if (cross_rack_migration(src, dst)) {
// FIXME: This is illegal if table has views, as it breaks base-view pairing.
// Can happen when RF!=#racks.
_stats.for_dc(_dc).cross_rack_collocations++;
_current_stats->cross_rack_collocations++;
lblogger.debug("Cross-rack co-location migration for {}@{} (rack: {}) to co-habit {}@{} (rack: {})",
t2_id, src, rack_of(src), t1_id, dst, rack_of(dst));
utils::get_local_injector().inject("forbid_cross_rack_migration_attempt", [&] {
@@ -2215,7 +2256,7 @@ public:

// Evaluates impact on load balance of migrating a tablet set of a given table to dst.
migration_badness evaluate_dst_badness(node_load_map& nodes, table_id table, tablet_replica dst, uint64_t tablet_set_disk_size) {
_stats.for_dc(_dc).candidates_evaluated++;
_current_stats->candidates_evaluated++;

auto& node_info = nodes[dst.host];

@@ -2254,7 +2295,7 @@ public:

// Evaluates impact on load balance of migrating a tablet set of a given table from src.
migration_badness evaluate_src_badness(node_load_map& nodes, table_id table, tablet_replica src, uint64_t tablet_set_disk_size) {
_stats.for_dc(_dc).candidates_evaluated++;
_current_stats->candidates_evaluated++;

auto& node_info = nodes[src.host];

@@ -2603,15 +2644,15 @@ public:
auto mig_streaming_info = get_migration_streaming_infos(_tm->get_topology(), tmap, mig);

if (!can_accept_load(nodes, mig_streaming_info)) {
_stats.for_dc(node_load.dc()).migrations_skipped++;
_current_stats->migrations_skipped++;
lblogger.debug("Unable to balance {}: load limit reached", host);
break;
}

apply_load(nodes, mig_streaming_info);
lblogger.debug("Adding migration: {} size: {}", mig, tablets.tablet_set_disk_size);
_stats.for_dc(node_load.dc()).migrations_produced++;
_stats.for_dc(node_load.dc()).intranode_migrations_produced++;
_current_stats->migrations_produced++;
_current_stats->intranode_migrations_produced++;
mark_as_scheduled(mig);
plan.add(std::move(mig));

@@ -2718,21 +2759,21 @@ public:
auto targets = get_viable_targets();
if (rs->is_rack_based(_dc)) {
lblogger.debug("candidate tablet {} skipped because RF is rack-based and it's in a different rack", tablet);
_stats.for_dc(src_info.dc()).tablets_skipped_rack++;
_current_stats->tablets_skipped_rack++;
return skip_info{std::move(targets)};
}
if (!targets.contains(dst_info.id)) {
auto new_rack_load = rack_load[dst_info.rack()] + 1;
lblogger.debug("candidate tablet {} skipped because it would increase load on rack {} to {}, max={}",
tablet, dst_info.rack(), new_rack_load, max_rack_load);
_stats.for_dc(src_info.dc()).tablets_skipped_rack++;
_current_stats->tablets_skipped_rack++;
return skip_info{std::move(targets)};
}
}

for (auto&& r : tmap.get_tablet_info(tablet.tablet).replicas) {
if (r.host == dst_info.id) {
_stats.for_dc(src_info.dc()).tablets_skipped_node++;
_current_stats->tablets_skipped_node++;
lblogger.debug("candidate tablet {} skipped because it has a replica on target node", tablet);
if (need_viable_targets) {
return skip_info{get_viable_targets()};
@@ -2939,7 +2980,7 @@ public:
};

if (min_candidate.badness.is_bad() && _use_table_aware_balancing) {
_stats.for_dc(_dc).bad_first_candidates++;
_current_stats->bad_first_candidates++;

// Consider better alternatives.
if (drain_skipped) {
@@ -3060,7 +3101,7 @@ public:
lblogger.debug("Table {} shard overcommit: {}", table, overcommit);
}

future<migration_plan> make_internode_plan(const dc_name& dc, node_load_map& nodes,
future<migration_plan> make_internode_plan(node_load_map& nodes,
const std::unordered_set<host_id>& nodes_to_drain,
host_id target) {
migration_plan plan;
@@ -3120,7 +3161,7 @@ public:

if (nodes_by_load.empty()) {
lblogger.debug("No more candidate nodes");
_stats.for_dc(dc).stop_no_candidates++;
_current_stats->stop_no_candidates++;
break;
}

@@ -3191,7 +3232,7 @@ public:

if (nodes_by_load_dst.empty()) {
lblogger.debug("No more target nodes");
_stats.for_dc(dc).stop_no_candidates++;
_current_stats->stop_no_candidates++;
break;
}

@@ -3221,7 +3262,7 @@ public:
const load_type max_load = std::max(max_off_candidate_load, src_node_info.avg_load);
if (is_balanced(target_info.avg_load, max_load)) {
lblogger.debug("Balance achieved.");
_stats.for_dc(dc).stop_balance++;
_current_stats->stop_balance++;
break;
}
}
@@ -3255,7 +3296,7 @@ public:
auto& tmap = tmeta.get_tablet_map(source_tablets.table());
if (can_check_convergence && !check_convergence(src_node_info, target_info, source_tablets)) {
lblogger.debug("No more candidates. Load would be inverted.");
_stats.for_dc(dc).stop_load_inversion++;
_current_stats->stop_load_inversion++;
break;
}

@@ -3289,11 +3330,11 @@ public:
}
}
if (candidate.badness.is_bad()) {
_stats.for_dc(_dc).bad_migrations++;
_current_stats->bad_migrations++;
}

if (drain_skipped) {
_stats.for_dc(_dc).migrations_from_skiplist++;
_current_stats->migrations_from_skiplist++;
}

if (src_node_info.req && *src_node_info.req == topology_request::leave && src_node_info.excluded) {
@@ -3313,7 +3354,7 @@ public:
if (can_accept_load(nodes, mig_streaming_info)) {
apply_load(nodes, mig_streaming_info);
lblogger.debug("Adding migration: {} size: {}", mig, source_tablets.tablet_set_disk_size);
_stats.for_dc(dc).migrations_produced++;
_current_stats->migrations_produced++;
mark_as_scheduled(mig);
plan.add(std::move(mig));
} else {
@@ -3324,10 +3365,10 @@ public:
// Just because the next migration is blocked doesn't mean we could not proceed with migrations
// for other shards which are produced by the planner subsequently.
skipped_migrations++;
_stats.for_dc(dc).migrations_skipped++;
_current_stats->migrations_skipped++;
if (skipped_migrations >= max_skipped_migrations) {
lblogger.debug("Too many migrations skipped, aborting balancing");
_stats.for_dc(dc).stop_skip_limit++;
_current_stats->stop_skip_limit++;
break;
}
}
@@ -3346,7 +3387,7 @@ public:
}

if (plan.size() == batch_size) {
_stats.for_dc(dc).stop_batch_size++;
_current_stats->stop_batch_size++;
}

if (plan.empty()) {
@@ -3363,7 +3404,13 @@ public:
// If there are 7 tablets and RF=3, each node must have 1 tablet replica.
// So node3 will have average load of 1, and node1 and node2 will have
// average shard load of 7.
lblogger.info("Not possible to achieve balance.");

// Show when this is the final plan with no active migrations left to execute,
// otherwise it may just be a temporary situation due to lack of candidates.
if (_migrating_candidates == 0) {
lblogger.info("Not possible to achieve balance in {}", _location);
print_node_stats(nodes, only_active::no);
}
}

co_return std::move(plan);
@@ -3420,11 +3467,37 @@ public:
}
};

using only_active = bool_class<struct only_active_tag>;

void print_node_stats(node_load_map& nodes, only_active only_active_) {
for (auto&& [host, load] : nodes) {
size_t read = 0;
size_t write = 0;
for (auto& shard_load : load.shards) {
read += shard_load.streaming_read_load;
write += shard_load.streaming_write_load;
}
auto level = !only_active_ || (read + write) > 0 ? seastar::log_level::info : seastar::log_level::debug;
lblogger.log(level, "Node {}: {}/{} load={:.6f} tablets={} shards={} tablets/shard={:.3f} state={} cap={}"
" rd={} wr={}",
host, load.dc(), load.rack(), load.avg_load, load.tablet_count, load.shard_count,
load.tablets_per_shard(), load.state(), load.dusage->capacity, read, write);
}
}

future<migration_plan> make_plan(dc_name dc, std::optional<sstring> rack = std::nullopt) {
migration_plan plan;

if (utils::get_local_injector().enter("tablet_migration_bypass")) {
co_return std::move(plan);
}

_dc = dc;
_rack = rack;
_location = fmt::format("{}{}", dc, rack ? fmt::format("/{}", *rack) : "");
_current_stats = _stats.for_dc(dc);
auto _ = seastar::defer([&] { _current_stats = nullptr; });
_migrating_candidates = 0;

auto node_filter = [&] (const locator::node& node) {
return node.dc_rack().dc == dc && (!rack || node.dc_rack().rack == *rack);
@@ -3433,7 +3506,7 @@ public:
// Causes load balancer to move some tablet even though load is balanced.
auto shuffle = in_shuffle_mode();

_stats.for_dc(dc).calls++;
_current_stats->calls++;
lblogger.debug("Examining DC {} rack {} (shuffle={}, balancing={}, tablets_per_shard_goal={}, force_capacity_based_balancing={})",
dc, rack, shuffle, _tm->tablets().balancing_enabled(), _tablets_per_shard_goal, _force_capacity_based_balancing);

@@ -3529,7 +3602,7 @@ public:

if (nodes.empty()) {
lblogger.debug("No nodes to balance.");
_stats.for_dc(dc).stop_balance++;
_current_stats->stop_balance++;
co_return plan;
}

@@ -3552,15 +3625,23 @@ public:

// If we don't have nodes to drain, remove nodes which don't have complete tablet sizes
if (nodes_to_drain.empty()) {
std::optional<host_id> incomplete_host;
size_t incomplete_count = 0;

for (auto nodes_i = nodes.begin(); nodes_i != nodes.end();) {
host_id host = nodes_i->first;
if (!_load_sketch->has_complete_data(host)) {
lblogger.info("Node {} does not have complete tablet stats, ignoring", nodes_i->first);
incomplete_host.emplace(host);
incomplete_count++;
nodes_i = nodes.erase(nodes_i);
} else {
++nodes_i;
}
}

if (incomplete_host) {
lblogger.info("Ignoring {} node(s) with incomplete tablet stats, e.g. {}", incomplete_count, *incomplete_host);
}
}

plan.set_has_nodes_to_drain(!nodes_to_drain.empty());
@@ -3594,11 +3675,11 @@ public:
});
if (!has_dest_nodes) {
for (auto host : nodes_to_drain) {
plan.add(drain_failure(host, format("No candidate nodes in DC {} to drain {}."
" Consider adding new nodes or reducing replication factor.", dc, host)));
plan.add(drain_failure(host, format("No candidate nodes in {} to drain {}."
" Consider adding new nodes or reducing replication factor.", _location, host)));
}
lblogger.debug("No candidate nodes");
_stats.for_dc(dc).stop_no_candidates++;
_current_stats->stop_no_candidates++;
co_return plan;
}
@@ -3704,6 +3785,8 @@ public:
if (!migrating(t1) && !migrating(t2)) {
auto candidate = colocated_tablets{global_tablet_id{table, t1.tid}, global_tablet_id{table, t2->tid}};
add_candidate(shard_load_info, migration_tablet_set{std::move(candidate), tablet_sizes_sum});
} else {
_migrating_candidates++;
}
} else {
if (tids.size() != tablet_sizes.size()) {
@@ -3712,6 +3795,8 @@ public:
for (size_t i = 0; i < tids.size(); i++) {
if (!migrating(get_table_desc(tids[i]))) { // migrating tablets are not candidates
add_candidate(shard_load_info, migration_tablet_set{global_tablet_id{table, tids[i]}, tablet_sizes[i]});
} else {
_migrating_candidates++;
}
}
}
@@ -3749,26 +3834,14 @@ public:
}
}

for (auto&& [host, load] : nodes) {
size_t read = 0;
size_t write = 0;
for (auto& shard_load : load.shards) {
read += shard_load.streaming_read_load;
write += shard_load.streaming_write_load;
}
auto level = (read + write) > 0 ? seastar::log_level::info : seastar::log_level::debug;
lblogger.log(level, "Node {}: dc={} rack={} load={} tablets={} shards={} tablets/shard={} state={} cap={}"
" stream_read={} stream_write={}",
host, dc, load.rack(), load.avg_load, load.tablet_count, load.shard_count,
load.tablets_per_shard(), load.state(), load.dusage->capacity, read, write);
}
print_node_stats(nodes, only_active::yes);

if (!nodes_to_drain.empty() || (_tm->tablets().balancing_enabled() && (shuffle || !is_balanced(min_load, max_load)))) {
host_id target = *min_load_node;
lblogger.info("target node: {}, avg_load: {}, max: {}", target, min_load, max_load);
plan.merge(co_await make_internode_plan(dc, nodes, nodes_to_drain, target));
plan.merge(co_await make_internode_plan(nodes, nodes_to_drain, target));
} else {
_stats.for_dc(dc).stop_balance++;
_current_stats->stop_balance++;
}

if (_tm->tablets().balancing_enabled()) {
@@ -3776,9 +3849,9 @@ public:
}

if (_tm->tablets().balancing_enabled() && plan.empty() && !ongoing_rack_list_colocation()) {
auto dc_merge_plan = co_await make_merge_colocation_plan(dc, nodes);
auto dc_merge_plan = co_await make_merge_colocation_plan(nodes);
auto level = dc_merge_plan.tablet_migration_count() > 0 ? seastar::log_level::info : seastar::log_level::debug;
lblogger.log(level, "Prepared {} migrations for co-locating sibling tablets in DC {}", dc_merge_plan.tablet_migration_count(), dc);
lblogger.log(level, "Prepared {} migrations for co-locating sibling tablets in {}", dc_merge_plan.tablet_migration_count(), _location);
plan.merge(std::move(dc_merge_plan));
}
@@ -100,7 +100,7 @@ class load_balancer_stats_manager {
using host_id = locator::host_id;

sstring group_name;
std::unordered_map<dc_name, std::unique_ptr<load_balancer_dc_stats>> _dc_stats;
std::unordered_map<dc_name, lw_shared_ptr<load_balancer_dc_stats>> _dc_stats;
std::unordered_map<host_id, std::unique_ptr<load_balancer_node_stats>> _node_stats;
load_balancer_cluster_stats _cluster_stats;
seastar::metrics::label dc_label{"target_dc"};
@@ -113,7 +113,7 @@ class load_balancer_stats_manager {
public:
load_balancer_stats_manager(sstring group_name);

load_balancer_dc_stats& for_dc(const dc_name& dc);
const lw_shared_ptr<load_balancer_dc_stats>& for_dc(const dc_name& dc);
load_balancer_node_stats& for_node(const dc_name& dc, host_id node);
load_balancer_cluster_stats& for_cluster();

@@ -196,7 +196,7 @@ public:
bool has_nodes_to_drain() const { return _has_nodes_to_drain; }

const migrations_vector& migrations() const { return _migrations; }
bool empty() const { return _migrations.empty() && !_resize_plan.size() && !_repair_plan.size() && !_rack_list_colocation_plan.size() && _drain_failures.empty(); }
bool empty() const { return !size(); }
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size() + _rack_list_colocation_plan.size() + _drain_failures.size(); }
size_t tablet_migration_count() const { return _migrations.size(); }
size_t resize_decision_count() const { return _resize_plan.size(); }
@@ -21,7 +21,6 @@ namespace service {

struct status_helper {
tasks::task_status status;
utils::chunked_vector<locator::tablet_id> tablets;
std::optional<locator::tablet_replica> pending_replica;
};
@@ -148,18 +147,40 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
}

tasks::tmlogger.info("tablet_virtual_task: wait until tablet operation is finished");
co_await _ss._topology_state_machine.event.wait([&] {
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
if (is_resize_task(task_type)) { // Resize task.
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
} else if (tablet_id_opt.has_value()) { // Migration task.
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
} else { // Repair task.
return std::all_of(res->tablets.begin(), res->tablets.end(), [&] (const locator::tablet_id& tablet) {
return tmap.get_tablet_info(tablet).repair_task_info.tablet_task_id.uuid() != id.uuid();
});
co_await utils::get_local_injector().inject("tablet_virtual_task_wait", utils::wait_for_message(60s));
while (true) {
co_await _ss._topology_state_machine.event.wait([&] {
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
return true;
}
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
if (is_resize_task(task_type)) { // Resize task.
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
} else if (tablet_id_opt.has_value()) { // Migration task.
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
} else { // Repair task.
return true;
}
});

if (!is_repair_task(task_type)) {
break;
}
});

auto tmptr = _ss.get_token_metadata_ptr();
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
break;
}
auto& tmap = tmptr->tablets().get_tablet_map(table);
bool repair_still_running = false;
co_await tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& info) {
repair_still_running = repair_still_running || (info.repair_task_info.is_valid() && info.repair_task_info.tablet_task_id.uuid() == id.uuid());
return make_ready_future();
});
if (!repair_still_running) {
break;
}
}

res->status.state = tasks::task_manager::task_state::done; // Failed repair task is retried.
if (is_migration_task(task_type)) {
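The rewritten wait above runs the event wait inside a while (true) loop and re-checks authoritative state (the tablet map) after every wakeup, because for repair tasks the condition can regress between the wakeup and the check. The classic condition-variable form of the same loop shape, offered here only as a hedged stand-in for the Seastar event:

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

int main() {
    std::mutex m;
    std::condition_variable cv;
    int running_repairs = 3;

    std::thread worker([&] {
        for (int i = 0; i < 3; i++) {
            {
                std::lock_guard lk(m);
                running_repairs--;
            }
            cv.notify_all();  // wake the waiter on every progress signal
        }
    });

    std::unique_lock lk(m);
    while (running_repairs != 0) {
        cv.wait(lk);  // each wakeup is followed by an authoritative re-check
    }
    std::cout << "repair finished\n";
    worker.join();
}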
@@ -169,9 +190,9 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
} else if (is_resize_task(task_type)) {
auto new_tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
res->status.state = new_tablet_count == tablet_count ? tasks::task_manager::task_state::suspended : tasks::task_manager::task_state::done;
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
} else {
res->status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
res->status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
}
res->status.end_time = db_clock::now(); // FIXME: Get precise end time.
co_return res->status;
@@ -257,6 +278,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
auto& tmap = tmptr->tablets().get_tablet_map(table);
bool repair_task_finished = false;
bool repair_task_pending = false;
bool no_tablets_processed = true;
if (is_repair_task(task_type)) {
auto progress = co_await _ss._repair.local().get_tablet_repair_task_progress(id);
if (progress) {
@@ -273,37 +295,37 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
auto& task_info = info.repair_task_info;
if (task_info.tablet_task_id.uuid() == id.uuid()) {
update_status(task_info, res.status, sched_nr);
res.tablets.push_back(tid);
no_tablets_processed = false;
}
return make_ready_future();
});
res.status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
res.status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
} else if (is_migration_task(task_type)) { // Migration task.
auto tablet_id = hint.get_tablet_id();
res.pending_replica = tmap.get_tablet_transition_info(tablet_id)->pending_replica;
auto& task_info = tmap.get_tablet_info(tablet_id).migration_task_info;
if (task_info.tablet_task_id.uuid() == id.uuid()) {
update_status(task_info, res.status, sched_nr);
res.tablets.push_back(tablet_id);
no_tablets_processed = false;
}
} else { // Resize task.
auto& task_info = tmap.resize_task_info();
if (task_info.tablet_task_id.uuid() == id.uuid()) {
update_status(task_info, res.status, sched_nr);
res.status.state = tasks::task_manager::task_state::running;
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
co_return res;
}
}

if (!res.tablets.empty()) {
if (!no_tablets_processed) {
res.status.state = sched_nr == 0 ? tasks::task_manager::task_state::created : tasks::task_manager::task_state::running;
co_return res;
}

if (repair_task_pending) {
// When repair_task_pending is true, the res.tablets will be empty iff the request is aborted by user.
res.status.state = res.tablets.empty() ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
res.status.state = no_tablets_processed ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
co_return res;
}
if (repair_task_finished) {
@@ -331,12 +331,17 @@ class topology_coordinator : public endpoint_lifecycle_subscriber

auto [id, req] = *next_req;

auto* server_rs = topo.find(id);
if (!server_rs) {
on_internal_error(rtlogger, format("Node {} has a pending {} request but is not found in topology", id, req));
}

if (cleanup_needed && (req == topology_request::remove || req == topology_request::leave)) {
// If the highest priority request is removenode or decommission, we need to start cleanup if one is needed
return start_vnodes_cleanup(std::move(guard), req, id);
}

return node_to_work_on(std::move(guard), &topo, id, &topo.find(id)->second, req, get_request_param(id));
return node_to_work_on(std::move(guard), &topo, id, &server_rs->second, req, get_request_param(id));
};

node_to_work_on get_node_to_work_on(group0_guard guard) const {
@@ -373,7 +378,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
auto& topo = _topo_sm._topology;

auto it = topo.find(id);
SCYLLA_ASSERT(it);
if (!it) {
on_internal_error(rtlogger, format("retake_node: node {} not found in topology", id));
}

std::optional<topology_request> req;
auto rit = topo.requests.find(id);
@@ -2186,6 +2193,19 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
_tablet_allocator.set_load_stats(reconciled_stats);
}
}

// Wait for the background storage group merge to finish before releasing the state machine.
// Background merge holds the old erm, so a successful barrier joins with it.
// This guarantees that the background merge doesn't run concurrently with the next merge.
// Replica-side storage group merge takes compaction locks on the tablet's main compaction group, released
// by the background merge. If the next merge starts before the background merge finishes, it can cause a deadlock.
// The background merge fiber will try to stop a compaction group which is locked, and the lock is held
// by the background merge fiber.
tm = nullptr;
if (!guard) {
guard = co_await start_operation();
}
co_await global_tablet_token_metadata_barrier(std::move(guard));
}

future<> handle_truncate_table(group0_guard guard) {
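The `tm = nullptr` just before the barrier above is an instance of a general rule: a fiber that waits for all holders of a shared object to be done must first drop its own handle, or it ends up waiting on itself. A toy illustration with std::shared_ptr standing in for the effective replication map handle:

#include <iostream>
#include <memory>

int main() {
    auto erm = std::make_shared<int>(42);  // stands in for the old token metadata
    std::weak_ptr<int> tracker = erm;

    // Wrong: a "barrier" that waits for tracker.expired() while this fiber
    // still holds erm would never complete.
    // Right: release our own handle first, then the barrier can finish.
    erm.reset();
    if (tracker.expired()) {
        std::cout << "barrier would complete: no stale holders left\n";
    }
}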
@@ -2492,7 +2512,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber

switch (node.rs->state) {
case node_state::bootstrapping: {
SCYLLA_ASSERT(!node.rs->ring);
if (node.rs->ring) {
on_internal_error(rtlogger, format("Bootstrapping node {} owns tokens", node.id));
}
auto num_tokens = std::get<join_param>(node.req_param.value()).num_tokens;
auto tokens_string = std::get<join_param>(node.req_param.value()).tokens_string;

@@ -2548,11 +2570,23 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
}
break;
case node_state::replacing: {
SCYLLA_ASSERT(!node.rs->ring);
if (node.rs->ring) {
on_internal_error(rtlogger, format("Replacing node {} owns tokens", node.id));
}
auto replaced_id = std::get<replace_param>(node.req_param.value()).replaced_id;
auto it = _topo_sm._topology.normal_nodes.find(replaced_id);
SCYLLA_ASSERT(it != _topo_sm._topology.normal_nodes.end());
SCYLLA_ASSERT(it->second.ring && it->second.state == node_state::normal);
if (it == _topo_sm._topology.normal_nodes.end()) {
on_internal_error(rtlogger,
format("Node {} being replaced by {} not found in normal nodes", replaced_id, node.id));
}
if (!it->second.ring) {
on_internal_error(rtlogger,
format("Node {} being replaced by {} is missing tokens", replaced_id, node.id));
}
if (it->second.state != node_state::normal) {
on_internal_error(rtlogger,
format("Node {} being replaced by {} is not in normal state", replaced_id, node.id));
}

topology_mutation_builder builder(node.guard.write_timestamp());

@@ -2951,7 +2985,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
}
break;
default:
on_fatal_internal_error(rtlogger, ::format(
on_internal_error(rtlogger, ::format(
"Ring state on node {} is write_both_read_new while the node is in state {}",
node.id, node.rs->state));
}
@@ -3268,7 +3302,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
rtbuilder.set("start_time", db_clock::now());
switch (node.request.value()) {
case topology_request::join: {
SCYLLA_ASSERT(!node.rs->ring);
if (node.rs->ring) {
on_internal_error(rtlogger, ::format("Joining node {} owns tokens", node.id));
}
// Write chosen tokens through raft.
builder.set_transition_state(topology::transition_state::join_group0)
.with_node(node.id)
@@ -3280,7 +3316,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
break;
}
case topology_request::leave: {
SCYLLA_ASSERT(node.rs->ring);
if (!node.rs->ring) {
on_internal_error(rtlogger, ::format("Leaving node {} doesn't own tokens", node.id));
}

auto validation_result = validate_removing_node(_db, to_host_id(node.id));
if (std::holds_alternative<node_validation_failure>(validation_result)) {
@@ -3311,7 +3349,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
break;
}
case topology_request::remove: {
SCYLLA_ASSERT(node.rs->ring);
if (!node.rs->ring) {
on_internal_error(rtlogger, ::format("Node {} being removed doesn't own tokens", node.id));
}

auto validation_result = validate_removing_node(_db, to_host_id(node.id));
if (std::holds_alternative<node_validation_failure>(validation_result)) {
@@ -3339,7 +3379,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
break;
}
case topology_request::replace: {
SCYLLA_ASSERT(!node.rs->ring);
if (node.rs->ring) {
on_internal_error(rtlogger, ::format("Replacing node {} owns tokens", node.id));
}

builder.set_transition_state(topology::transition_state::join_group0)
.with_node(node.id)
@@ -3396,12 +3438,12 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
case node_state::removing:
case node_state::replacing:
// Should not get here
on_fatal_internal_error(rtlogger, ::format(
on_internal_error(rtlogger, ::format(
"Found node {} in state {} but there is no ongoing topology transition",
node.id, node.rs->state));
case node_state::left:
// Should not get here
on_fatal_internal_error(rtlogger, ::format(
on_internal_error(rtlogger, ::format(
"Topology coordinator is called for node {} in state 'left'", node.id));
break;
}
@@ -3463,7 +3505,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber

auto id = node.id;

SCYLLA_ASSERT(!_topo_sm._topology.transition_nodes.empty());
if (_topo_sm._topology.transition_nodes.empty()) {
on_internal_error(rtlogger, format("transition nodes are empty while accepting node {}", node.id));
}

release_node(std::move(node));
@@ -3873,6 +3917,9 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
for (auto& [table_id, table_stats] : dc_stats.tables) {
co_await coroutine::maybe_yield();

if (!_db.column_family_exists(table_id)) {
continue;
}
auto& t = _db.find_column_family(table_id);
auto& rs = t.get_effective_replication_map()->get_replication_strategy();
if (!rs.uses_tablets()) {
@@ -3896,6 +3943,9 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
}

for (auto& [table_id, table_load_stats] : stats.tables) {
if (!total_replicas.contains(table_id)) {
continue;
}
auto table_total_replicas = total_replicas.at(table_id);
if (table_total_replicas == 0) {
continue;
Some files were not shown because too many files have changed in this diff.