mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-23 01:50:35 +00:00
Compare commits
107 Commits
fix_sl_v2_
...
next-2026.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
270ae28c00 | ||
|
|
819969a66a | ||
|
|
c5a473bf19 | ||
|
|
144fdc6c9f | ||
|
|
53984ce293 | ||
|
|
1689736223 | ||
|
|
3836757486 | ||
|
|
53df1d6c35 | ||
|
|
efb9911e3a | ||
|
|
ad5ba4c643 | ||
|
|
c4681d0975 | ||
|
|
24d8843803 | ||
|
|
5c5fbfaabb | ||
|
|
ec46a8a7d3 | ||
|
|
dae73f4781 | ||
|
|
bcd320a82a | ||
|
|
1cb9d0b245 | ||
|
|
ac7efa2085 | ||
|
|
f61343ca15 | ||
|
|
3386716217 | ||
|
|
cb9c65af43 | ||
|
|
351ed72f5f | ||
|
|
7a080130cf | ||
|
|
164364ed3e | ||
|
|
dd9df62617 | ||
|
|
df56f6bdc2 | ||
|
|
b60985548f | ||
|
|
feba21f868 | ||
|
|
92f8f2c2db | ||
|
|
e992d76489 | ||
|
|
196db8931e | ||
|
|
e436db01e3 | ||
|
|
9020288c79 | ||
|
|
7467dcd30f | ||
|
|
f9be6f4a83 | ||
|
|
d46ff9b405 | ||
|
|
5ca0bc2019 | ||
|
|
e5d82bf857 | ||
|
|
fac9795325 | ||
|
|
3a1d7d2b09 | ||
|
|
fb81acb7aa | ||
|
|
56bf4c8f0e | ||
|
|
bf1f5ee796 | ||
|
|
da53b8798f | ||
|
|
3d167dd36e | ||
|
|
c42799fb01 | ||
|
|
c93c037d39 | ||
|
|
3107d9083e | ||
|
|
04e5fa6c3e | ||
|
|
70b9ae04ff | ||
|
|
eaae2bf0af | ||
|
|
abfa4d0272 | ||
|
|
8bdc97924e | ||
|
|
253fa9519f | ||
|
|
666d0440f1 | ||
|
|
70b7652e64 | ||
|
|
27604deebb | ||
|
|
cd7baebc8b | ||
|
|
c5f57815a5 | ||
|
|
5eabf35824 | ||
|
|
95e422db48 | ||
|
|
b033bbc560 | ||
|
|
faf8ad69f0 | ||
|
|
dc7829a9b5 | ||
|
|
f2111c011f | ||
|
|
d2b12329ab | ||
|
|
b638170a4e | ||
|
|
d5c7f29734 | ||
|
|
a5dd529475 | ||
|
|
b176591488 | ||
|
|
233da83dd9 | ||
|
|
9b81939a93 | ||
|
|
804842e95c | ||
|
|
4f77cb621f | ||
|
|
eb6c333e1b | ||
|
|
8d21636a81 | ||
|
|
7f236baf61 | ||
|
|
4da8641d83 | ||
|
|
3ab789e1ca | ||
|
|
25a17282bd | ||
|
|
7afcc56128 | ||
|
|
32443ed6f7 | ||
|
|
3e9b984020 | ||
|
|
2d199fb609 | ||
|
|
35cd7f9239 | ||
|
|
32ce43d4b1 | ||
|
|
fef7750eb6 | ||
|
|
213442227d | ||
|
|
1398a55d16 | ||
|
|
a0a2a67634 | ||
|
|
d4e454b5bc | ||
|
|
825a36c97a | ||
|
|
45413e99a5 | ||
|
|
c93a935564 | ||
|
|
69f78ce74a | ||
|
|
3513ce6069 | ||
|
|
0ca7253315 | ||
|
|
c7ac3b5394 | ||
|
|
d6ed05efc1 | ||
|
|
39fcc83e75 | ||
|
|
6250f1e967 | ||
|
|
b307c9301d | ||
|
|
f26af8cd30 | ||
|
|
2bd10bff5e | ||
|
|
1105d83893 | ||
|
|
9b9d5cee8a | ||
|
|
a8fd9936a3 |
22
.github/workflows/trigger-scylla-ci.yaml
vendored
22
.github/workflows/trigger-scylla-ci.yaml
vendored
@@ -15,13 +15,19 @@ jobs:
|
||||
- name: Verify Org Membership
|
||||
id: verify_author
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
|
||||
PR_ASSOCIATION: ${{ github.event.pull_request.author_association }}
|
||||
COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
|
||||
COMMENT_ASSOCIATION: ${{ github.event.comment.author_association }}
|
||||
shell: bash
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then
|
||||
AUTHOR="${{ github.event.pull_request.user.login }}"
|
||||
if [[ "$EVENT_NAME" == "pull_request_target" ]]; then
|
||||
AUTHOR="$PR_AUTHOR"
|
||||
ASSOCIATION="$PR_ASSOCIATION"
|
||||
else
|
||||
AUTHOR="${{ github.event.comment.user.login }}"
|
||||
AUTHOR="$COMMENT_AUTHOR"
|
||||
ASSOCIATION="$COMMENT_ASSOCIATION"
|
||||
fi
|
||||
ORG="scylladb"
|
||||
if gh api "/orgs/${ORG}/members/${AUTHOR}" --silent 2>/dev/null; then
|
||||
@@ -34,13 +40,11 @@ jobs:
|
||||
- name: Validate Comment Trigger
|
||||
if: github.event_name == 'issue_comment'
|
||||
id: verify_comment
|
||||
env:
|
||||
COMMENT_BODY: ${{ github.event.comment.body }}
|
||||
shell: bash
|
||||
run: |
|
||||
BODY=$(cat << 'EOF'
|
||||
${{ github.event.comment.body }}
|
||||
EOF
|
||||
)
|
||||
CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
|
||||
CLEAN_BODY=$(echo "$COMMENT_BODY" | grep -v '^[[:space:]]*>')
|
||||
|
||||
if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
|
||||
echo "trigger=true" >> $GITHUB_OUTPUT
|
||||
|
||||
@@ -78,7 +78,7 @@ fi
|
||||
|
||||
# Default scylla product/version tags
|
||||
PRODUCT=scylla
|
||||
VERSION=2026.1.0
|
||||
VERSION=2026.1.2
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -3464,7 +3464,11 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
|
||||
if (should_add_wcu) {
|
||||
rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
|
||||
}
|
||||
_stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
auto duration = std::chrono::steady_clock::now() - start_time;
|
||||
_stats.api_operations.batch_write_item_latency.mark(duration);
|
||||
for (const auto& w : per_table_wcu) {
|
||||
w.first->api_operations.batch_write_item_latency.mark(duration);
|
||||
}
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
@@ -4975,7 +4979,12 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
|
||||
if (!some_succeeded && eptr) {
|
||||
co_await coroutine::return_exception_ptr(std::move(eptr));
|
||||
}
|
||||
_stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
auto duration = std::chrono::steady_clock::now() - start_time;
|
||||
_stats.api_operations.batch_get_item_latency.mark(duration);
|
||||
for (const table_requests& rs : requests) {
|
||||
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
|
||||
per_table_stats->api_operations.batch_get_item_latency.mark(duration);
|
||||
}
|
||||
if (is_big(response)) {
|
||||
co_return make_streamed(std::move(response));
|
||||
} else {
|
||||
|
||||
@@ -32,6 +32,8 @@ namespace {
|
||||
|
||||
logger mylog{"ldap_role_manager"}; // `log` is taken by math.
|
||||
|
||||
constexpr std::string_view user_placeholder = "{USER}";
|
||||
|
||||
struct url_desc_deleter {
|
||||
void operator()(LDAPURLDesc *p) {
|
||||
ldap_free_urldesc(p);
|
||||
@@ -40,9 +42,141 @@ struct url_desc_deleter {
|
||||
|
||||
using url_desc_ptr = std::unique_ptr<LDAPURLDesc, url_desc_deleter>;
|
||||
|
||||
url_desc_ptr parse_url(std::string_view url) {
|
||||
/// Escapes LDAP filter assertion value per RFC 4515 Section 3.
|
||||
/// The characters *, (, ), \, and NUL must be backslash-hex-escaped
|
||||
/// to prevent filter injection when interpolating untrusted input.
|
||||
sstring escape_filter_value(std::string_view value) {
|
||||
size_t escapable_chars = 0;
|
||||
for (unsigned char ch : value) {
|
||||
switch (ch) {
|
||||
case '*':
|
||||
case '(':
|
||||
case ')':
|
||||
case '\\':
|
||||
case '\0':
|
||||
++escapable_chars;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (escapable_chars == 0) {
|
||||
return sstring(value);
|
||||
}
|
||||
|
||||
sstring escaped(value.size() + escapable_chars * 2, 0);
|
||||
size_t pos = 0;
|
||||
for (unsigned char ch : value) {
|
||||
switch (ch) {
|
||||
case '*':
|
||||
escaped[pos++] = '\\';
|
||||
escaped[pos++] = '2';
|
||||
escaped[pos++] = 'a';
|
||||
break;
|
||||
case '(':
|
||||
escaped[pos++] = '\\';
|
||||
escaped[pos++] = '2';
|
||||
escaped[pos++] = '8';
|
||||
break;
|
||||
case ')':
|
||||
escaped[pos++] = '\\';
|
||||
escaped[pos++] = '2';
|
||||
escaped[pos++] = '9';
|
||||
break;
|
||||
case '\\':
|
||||
escaped[pos++] = '\\';
|
||||
escaped[pos++] = '5';
|
||||
escaped[pos++] = 'c';
|
||||
break;
|
||||
case '\0':
|
||||
escaped[pos++] = '\\';
|
||||
escaped[pos++] = '0';
|
||||
escaped[pos++] = '0';
|
||||
break;
|
||||
default:
|
||||
escaped[pos++] = static_cast<char>(ch);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return escaped;
|
||||
}
|
||||
|
||||
/// Percent-encodes characters that are not RFC 3986 "unreserved"
|
||||
/// (ALPHA / DIGIT / '-' / '.' / '_' / '~').
|
||||
///
|
||||
/// Uses explicit ASCII range checks instead of std::isalnum() because
|
||||
/// the latter is locale-dependent and could pass non-ASCII characters
|
||||
/// through unencoded under certain locale settings.
|
||||
///
|
||||
/// This is applied AFTER RFC 4515 filter escaping when the value is
|
||||
/// substituted into an LDAP URL. It serves two purposes:
|
||||
/// 1. Prevents URL-level metacharacters ('?', '#') from breaking
|
||||
/// the URL structure parsed by ldap_url_parse.
|
||||
/// 2. Prevents percent-decoding (which ldap_url_parse performs on
|
||||
/// each component) from undoing the filter escaping, e.g. a
|
||||
/// literal "%2a" in the username would otherwise decode to '*'.
|
||||
sstring percent_encode_for_url(std::string_view value) {
|
||||
static constexpr char hex[] = "0123456789ABCDEF";
|
||||
|
||||
size_t chars_to_encode = 0;
|
||||
for (unsigned char ch : value) {
|
||||
if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
|
||||
|| ch == '-' || ch == '.' || ch == '_' || ch == '~')) {
|
||||
++chars_to_encode;
|
||||
}
|
||||
}
|
||||
|
||||
if (chars_to_encode == 0) {
|
||||
return sstring(value);
|
||||
}
|
||||
|
||||
sstring encoded(value.size() + chars_to_encode * 2, 0);
|
||||
size_t pos = 0;
|
||||
for (unsigned char ch : value) {
|
||||
if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
|
||||
|| ch == '-' || ch == '.' || ch == '_' || ch == '~') {
|
||||
encoded[pos++] = static_cast<char>(ch);
|
||||
} else {
|
||||
encoded[pos++] = '%';
|
||||
encoded[pos++] = hex[ch >> 4];
|
||||
encoded[pos++] = hex[ch & 0x0F];
|
||||
}
|
||||
}
|
||||
|
||||
return encoded;
|
||||
}
|
||||
|
||||
/// Checks whether \p sentinel appears in any parsed URL component
|
||||
/// other than the filter (host, DN, attributes, extensions).
|
||||
bool sentinel_outside_filter(const LDAPURLDesc& desc, std::string_view sentinel) {
|
||||
auto contains = [&](const char* field) {
|
||||
return field && std::string_view(field).find(sentinel) != std::string_view::npos;
|
||||
};
|
||||
if (contains(desc.lud_host) || contains(desc.lud_dn)) {
|
||||
return true;
|
||||
}
|
||||
if (desc.lud_attrs) {
|
||||
for (int i = 0; desc.lud_attrs[i]; ++i) {
|
||||
if (contains(desc.lud_attrs[i])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (desc.lud_exts) {
|
||||
for (int i = 0; desc.lud_exts[i]; ++i) {
|
||||
if (contains(desc.lud_exts[i])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
url_desc_ptr parse_url(const sstring& url) {
|
||||
LDAPURLDesc *desc = nullptr;
|
||||
if (ldap_url_parse(url.data(), &desc)) {
|
||||
if (ldap_url_parse(url.c_str(), &desc)) {
|
||||
mylog.error("error in ldap_url_parse({})", url);
|
||||
}
|
||||
return url_desc_ptr(desc);
|
||||
@@ -115,6 +249,7 @@ const resource_set& ldap_role_manager::protected_resources() const {
|
||||
}
|
||||
|
||||
future<> ldap_role_manager::start() {
|
||||
validate_query_template();
|
||||
if (!parse_url(get_url("dummy-user"))) { // Just need host and port -- any user should do.
|
||||
return make_exception_future(
|
||||
std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
|
||||
@@ -199,7 +334,7 @@ future<> ldap_role_manager::revoke(std::string_view, std::string_view, ::service
|
||||
}
|
||||
|
||||
future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name, recursive_role_query) {
|
||||
const auto url = get_url(grantee_name.data());
|
||||
const auto url = get_url(grantee_name);
|
||||
auto desc = parse_url(url);
|
||||
if (!desc) {
|
||||
return make_exception_future<role_set>(std::runtime_error(format("Error parsing URL {}", url)));
|
||||
@@ -331,7 +466,46 @@ future<> ldap_role_manager::remove_attribute(std::string_view role_name, std::st
|
||||
}
|
||||
|
||||
/// Expands _query_template by replacing every occurrence of the user
/// placeholder with an encoded form of \p user, and returns the result.
///
/// \p user is untrusted, so it is never substituted verbatim.
sstring ldap_role_manager::get_url(std::string_view user) const {
    // Two-layer encoding protects against injection:
    // 1. RFC 4515 filter escaping neutralizes filter metacharacters (*, (, ), \, NUL)
    // 2. URL percent-encoding prevents URL structure injection (?, #) and blocks
    //    ldap_url_parse's percent-decoding from undoing the filter escaping (%2a -> *)
    return boost::replace_all_copy(_query_template, user_placeholder,
            percent_encode_for_url(escape_filter_value(user)));
}
|
||||
|
||||
void ldap_role_manager::validate_query_template() const {
|
||||
if (_query_template.find(user_placeholder) == sstring::npos) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Substitute {USER} with a sentinel and let ldap_url_parse tell us
|
||||
// which URL component it landed in. The sentinel is purely
|
||||
// alphanumeric so it cannot affect URL parsing.
|
||||
static constexpr std::string_view sentinel = "XLDAPSENTINELX";
|
||||
sstring test_url = boost::replace_all_copy(_query_template, user_placeholder, sentinel);
|
||||
auto desc = parse_url(test_url);
|
||||
if (!desc) {
|
||||
throw url_error(format("LDAP URL template is not a valid URL when {{USER}} is substituted: {}", _query_template));
|
||||
}
|
||||
|
||||
// The sentinel must appear in the filter ...
|
||||
if (!desc->lud_filter
|
||||
|| std::string_view(desc->lud_filter).find(sentinel) == std::string_view::npos) {
|
||||
throw url_error(format(
|
||||
"LDAP URL template places {{USER}} outside the filter component. "
|
||||
"RFC 4515 filter escaping only protects the filter; other components "
|
||||
"(e.g. the base DN) require different escaping and are not supported. "
|
||||
"Template: {}", _query_template));
|
||||
}
|
||||
// ... and nowhere else (host, DN, attributes, extensions).
|
||||
if (sentinel_outside_filter(*desc, sentinel)) {
|
||||
throw url_error(format(
|
||||
"LDAP URL template places {{USER}} outside the filter component. "
|
||||
"RFC 4515 filter escaping only protects the filter; other components "
|
||||
"(e.g. the host) require different escaping and are not supported. "
|
||||
"Template: {}", _query_template));
|
||||
}
|
||||
}
|
||||
|
||||
future<std::vector<cql3::description>> ldap_role_manager::describe_role_grants() {
|
||||
|
||||
@@ -107,6 +107,9 @@ class ldap_role_manager : public role_manager {
|
||||
/// Macro-expands _query_template, returning the result.
|
||||
sstring get_url(std::string_view user) const;
|
||||
|
||||
/// Validates that {USER}, if present, is used only in the LDAP filter component.
|
||||
void validate_query_template() const;
|
||||
|
||||
/// Used to auto-create roles returned by ldap.
|
||||
future<> create_role(std::string_view role_name);
|
||||
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
#include "mutation/mutation_fragment_stream_validator.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "utils/pretty_printers.hh"
|
||||
#include "readers/multi_range.hh"
|
||||
#include "readers/compacting.hh"
|
||||
@@ -611,23 +612,23 @@ private:
|
||||
}
|
||||
|
||||
// Called in a seastar thread
|
||||
dht::partition_range_vector
|
||||
utils::chunked_vector<dht::partition_range>
|
||||
get_ranges_for_invalidation(const std::vector<sstables::shared_sstable>& sstables) {
|
||||
// If owned ranges is disengaged, it means no cleanup work was done and
|
||||
// so nothing needs to be invalidated.
|
||||
if (!_owned_ranges) {
|
||||
return dht::partition_range_vector{};
|
||||
return {};
|
||||
}
|
||||
auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
|
||||
auto owned_ranges = dht::to_partition_ranges_chunked(*_owned_ranges).get();
|
||||
|
||||
auto non_owned_ranges = sstables
|
||||
| std::views::transform([] (const sstables::shared_sstable& sst) {
|
||||
seastar::thread::maybe_yield();
|
||||
return dht::partition_range::make({sst->get_first_decorated_key(), true},
|
||||
{sst->get_last_decorated_key(), true});
|
||||
}) | std::ranges::to<dht::partition_range_vector>();
|
||||
}) | std::ranges::to<utils::chunked_vector<dht::partition_range>>();
|
||||
|
||||
return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
|
||||
return dht::subtract_ranges(*_schema, std::move(non_owned_ranges), std::move(owned_ranges)).get();
|
||||
}
|
||||
protected:
|
||||
compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
|
||||
@@ -718,8 +719,8 @@ protected:
|
||||
|
||||
// Bundles the compaction's input sstables, its output sstables and the
// ranges returned by get_ranges_for_invalidation() for the inputs into a
// compaction_completion_desc.
compaction_completion_desc
get_compaction_completion_desc(std::vector<sstables::shared_sstable> input_sstables, std::vector<sstables::shared_sstable> output_sstables) {
    // Compute the ranges before input_sstables is moved into the descriptor.
    auto ranges = get_ranges_for_invalidation(input_sstables);
    return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges)};
}
|
||||
|
||||
// Tombstone expiration is enabled based on the presence of sstable set.
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "sstables/sstable_set.hh"
|
||||
#include "compaction_fwd.hh"
|
||||
#include "mutation_writer/token_group_based_splitting_writer.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
|
||||
namespace compaction {
|
||||
|
||||
@@ -38,7 +39,7 @@ struct compaction_completion_desc {
|
||||
// New, fresh SSTables that should be added to SSTable set, replacing the old ones.
|
||||
std::vector<sstables::shared_sstable> new_sstables;
|
||||
// Set of compacted partition ranges that should be invalidated in the cache.
|
||||
dht::partition_range_vector ranges_for_cache_invalidation;
|
||||
utils::chunked_vector<dht::partition_range> ranges_for_cache_invalidation;
|
||||
};
|
||||
|
||||
// creates a new SSTable for a given shard
|
||||
|
||||
@@ -1268,9 +1268,15 @@ future<> compaction_manager::start(const db::config& cfg, utils::disk_space_moni
|
||||
if (dsm && (this_shard_id() == 0)) {
|
||||
_out_of_space_subscription = dsm->subscribe(cfg.critical_disk_utilization_level, [this] (auto threshold_reached) {
|
||||
if (threshold_reached) {
|
||||
return container().invoke_on_all([] (compaction_manager& cm) { return cm.drain(); });
|
||||
return container().invoke_on_all([] (compaction_manager& cm) {
|
||||
cm._in_critical_disk_utilization_mode = true;
|
||||
return cm.drain();
|
||||
});
|
||||
}
|
||||
return container().invoke_on_all([] (compaction_manager& cm) { cm.enable(); });
|
||||
return container().invoke_on_all([] (compaction_manager& cm) {
|
||||
cm._in_critical_disk_utilization_mode = false;
|
||||
cm.enable();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -2291,6 +2297,16 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
|
||||
return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables), throw_if_stopping::no);
|
||||
}
|
||||
|
||||
// Builds the exception describing why compaction cannot proceed:
//  - a std::runtime_error("critical disk utilization") while the manager is
//    in critical disk utilization mode;
//  - otherwise a compaction_stopped_exception naming the keyspace and table
//    of the given compaction group.
std::exception_ptr compaction_manager::make_disabled_exception(compaction::compaction_group_view& cg) {
    if (!_in_critical_disk_utilization_mode) {
        return std::make_exception_ptr(
                compaction_stopped_exception(cg.schema()->ks_name(), cg.schema()->cf_name(), "compaction disabled"));
    }
    return std::make_exception_ptr(std::runtime_error("critical disk utilization"));
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
|
||||
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
|
||||
@@ -2300,8 +2316,7 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
|
||||
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
|
||||
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
|
||||
if (is_disabled()) {
|
||||
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
|
||||
"reason might be out of space prevention", sst->get_filename()))));
|
||||
co_return coroutine::exception(make_disabled_exception(t));
|
||||
}
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
|
||||
|
||||
@@ -114,6 +114,8 @@ private:
|
||||
uint32_t _disabled_state_count = 0;
|
||||
|
||||
// The manager counts as disabled when it is not in the running state or
// when at least one disable request is outstanding.
bool is_disabled() const {
    const bool running = _state == state::running;
    return !running || _disabled_state_count > 0;
}
|
||||
// precondition: is_disabled() is true.
|
||||
std::exception_ptr make_disabled_exception(compaction::compaction_group_view& cg);
|
||||
|
||||
std::optional<future<>> _stop_future;
|
||||
|
||||
@@ -173,6 +175,7 @@ private:
|
||||
tombstone_gc_state _tombstone_gc_state;
|
||||
|
||||
utils::disk_space_monitor::subscription _out_of_space_subscription;
|
||||
bool _in_critical_disk_utilization_mode = false;
|
||||
private:
|
||||
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
|
||||
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
|
||||
|
||||
@@ -1701,6 +1701,7 @@ deps['test/boost/combined_tests'] += [
|
||||
'test/boost/tracing_test.cc',
|
||||
'test/boost/user_function_test.cc',
|
||||
'test/boost/user_types_test.cc',
|
||||
'test/boost/vector_index_test.cc',
|
||||
'test/boost/view_build_test.cc',
|
||||
'test/boost/view_complex_test.cc',
|
||||
'test/boost/view_schema_ckey_test.cc',
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
#include "utils/assert.hh"
|
||||
#include "cql3/column_specification.hh"
|
||||
#include "cql3/column_identifier.hh"
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
@@ -31,4 +32,12 @@ bool column_specification::all_in_same_table(const std::vector<lw_shared_ptr<col
|
||||
});
|
||||
}
|
||||
|
||||
// Convenience factory: wraps `name` in a column_identifier and returns a new
// column_specification for the given keyspace, table and type.
lw_shared_ptr<column_specification> make_column_spec(std::string_view ks_name, std::string_view cf_name, sstring name, data_type type) {
    // NOTE(review): the `true` flag is presumably "keep case" - confirm
    // against the column_identifier constructor.
    auto identifier = ::make_shared<column_identifier>(std::move(name), true);
    return make_lw_shared<column_specification>(ks_name, cf_name, std::move(identifier), std::move(type));
}
|
||||
|
||||
}
|
||||
|
||||
@@ -42,4 +42,6 @@ public:
|
||||
static bool all_in_same_table(const std::vector<lw_shared_ptr<column_specification>>& names);
|
||||
};
|
||||
|
||||
lw_shared_ptr<column_specification> make_column_spec(std::string_view ks_name, std::string_view cf_name, sstring name, data_type type);
|
||||
|
||||
}
|
||||
|
||||
@@ -105,6 +105,7 @@ public:
|
||||
static const std::chrono::minutes entry_expiry;
|
||||
|
||||
using key_type = prepared_cache_key_type;
|
||||
using pinned_value_type = cache_value_ptr;
|
||||
using value_type = checked_weak_ptr;
|
||||
using statement_is_too_big = typename cache_type::entry_is_too_big;
|
||||
|
||||
@@ -116,9 +117,14 @@ public:
|
||||
: _cache(size, entry_expiry, logger)
|
||||
{}
|
||||
|
||||
template <typename LoadFunc>
|
||||
future<pinned_value_type> get_pinned(const key_type& key, LoadFunc&& load) {
|
||||
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); });
|
||||
}
|
||||
|
||||
template <typename LoadFunc>
|
||||
future<value_type> get(const key_type& key, LoadFunc&& load) {
|
||||
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
|
||||
return get_pinned(key, std::forward<LoadFunc>(load)).then([] (cache_value_ptr v_ptr) {
|
||||
return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
|
||||
});
|
||||
}
|
||||
|
||||
@@ -697,7 +697,7 @@ future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
|
||||
try {
|
||||
auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
|
||||
auto prep_ptr = co_await _prepared_cache.get(key, [this, &query_string, &client_state, d] {
|
||||
auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
|
||||
auto prepared = get_statement(query_string, client_state, d);
|
||||
prepared->calculate_metadata_id();
|
||||
auto bound_terms = prepared->statement->get_bound_terms();
|
||||
@@ -711,13 +711,13 @@ query_processor::prepare(sstring query_string, const service::client_state& clie
|
||||
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
|
||||
});
|
||||
|
||||
const auto& warnings = prep_ptr->warnings;
|
||||
const auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_ptr),
|
||||
co_await utils::get_local_injector().inject(
|
||||
"query_processor_prepare_wait_after_cache_get",
|
||||
utils::wait_for_message(std::chrono::seconds(60)));
|
||||
|
||||
auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
|
||||
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
|
||||
for (const auto& w : warnings) {
|
||||
msg->add_warning(w);
|
||||
}
|
||||
co_return ::shared_ptr<cql_transport::messages::result_message::prepared>(std::move(msg));
|
||||
co_return std::move(msg);
|
||||
} catch(typename prepared_statements_cache::statement_is_too_big&) {
|
||||
throw prepared_statement_is_too_big(query_string);
|
||||
}
|
||||
@@ -1029,6 +1029,11 @@ query_processor::execute_batch_without_checking_exception_message(
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
|
||||
auto access_future = co_await coroutine::as_future(batch->check_access(*this, query_state.get_client_state()));
|
||||
bool failed = access_future.failed();
|
||||
co_await audit::inspect(batch, query_state, options, failed);
|
||||
if (failed) {
|
||||
std::rethrow_exception(access_future.get_exception());
|
||||
}
|
||||
co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state] (auto& e) -> future<> {
|
||||
try {
|
||||
co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
|
||||
@@ -1036,11 +1041,6 @@ query_processor::execute_batch_without_checking_exception_message(
|
||||
log.error("failed to cache the entry: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
bool failed = access_future.failed();
|
||||
co_await audit::inspect(batch, query_state, options, failed);
|
||||
if (access_future.failed()) {
|
||||
std::rethrow_exception(access_future.get_exception());
|
||||
}
|
||||
batch->validate();
|
||||
batch->validate(*this, query_state.get_client_state());
|
||||
_stats.queries_by_cl[size_t(options.get_consistency())] += batch->get_statements().size();
|
||||
|
||||
@@ -201,6 +201,10 @@ public:
|
||||
return _clustering_columns_restrictions;
|
||||
}
|
||||
|
||||
// Read-only access to the stored _nonprimary_key_restrictions expression.
const expr::expression& get_nonprimary_key_restrictions() const {
    return _nonprimary_key_restrictions;
}
|
||||
|
||||
// Get a set of columns restricted by the IS NOT NULL restriction.
|
||||
// IS NOT NULL is a special case that is handled separately from other restrictions.
|
||||
const std::unordered_set<const column_definition*> get_not_null_columns() const;
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
|
||||
*/
|
||||
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include "create_index_statement.hh"
|
||||
#include "db/config.hh"
|
||||
@@ -37,6 +38,7 @@
|
||||
#include "types/concrete_types.hh"
|
||||
#include "db/tags/extension.hh"
|
||||
#include "tombstone_gc_extension.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
@@ -116,6 +118,15 @@ static data_type type_for_computed_column(cql3::statements::index_target::target
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_vector_capable_class(const sstring& class_name) {
|
||||
return boost::iequals(class_name, "vector_index");
|
||||
}
|
||||
|
||||
static bool is_vector_index(const index_options_map& options) {
|
||||
auto class_it = options.find(db::index::secondary_index::custom_class_option_name);
|
||||
return class_it != options.end() && is_vector_capable_class(class_it->second);
|
||||
}
|
||||
|
||||
view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im,
|
||||
const data_dictionary::database& db) const
|
||||
{
|
||||
@@ -266,7 +277,7 @@ create_index_statement::validate(query_processor& qp, const service::client_stat
|
||||
_idx_properties->validate();
|
||||
|
||||
// FIXME: This is ugly and can be improved.
|
||||
const bool is_vector_index = _idx_properties->custom_class && *_idx_properties->custom_class == "vector_index";
|
||||
const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class);
|
||||
const bool uses_view_properties = _view_properties.properties()->count() > 0
|
||||
|| _view_properties.use_compact_storage()
|
||||
|| _view_properties.defined_ordering().size() > 0;
|
||||
@@ -697,7 +708,9 @@ index_metadata create_index_statement::make_index_metadata(const std::vector<::s
|
||||
const index_options_map& options)
|
||||
{
|
||||
index_options_map new_options = options;
|
||||
auto target_option = secondary_index::target_parser::serialize_targets(targets);
|
||||
auto target_option = is_vector_index(options)
|
||||
? secondary_index::vector_index::serialize_targets(targets)
|
||||
: secondary_index::target_parser::serialize_targets(targets);
|
||||
new_options.emplace(index_target::target_option_name, target_option);
|
||||
|
||||
const auto& first_target = targets.front()->value;
|
||||
|
||||
@@ -30,13 +30,14 @@ list_effective_service_level_statement::prepare(data_dictionary::database db, cq
|
||||
return std::make_unique<prepared_statement>(audit_info(), ::make_shared<list_effective_service_level_statement>(*this));
|
||||
}
|
||||
|
||||
static auto make_column(sstring name, const shared_ptr<const abstract_type> type) {
|
||||
return make_lw_shared<column_specification>(
|
||||
"QOS",
|
||||
"effective_service_level",
|
||||
::make_shared<column_identifier>(std::move(name), true),
|
||||
type);
|
||||
};
|
||||
// Describes the result set of this statement: three utf8 columns named
// "service_level_option", "effective_service_level" and "value", all
// attributed to keyspace "QOS" and table "effective_service_level".
shared_ptr<const cql3::metadata> list_effective_service_level_statement::get_result_metadata() const {
    const char* const column_names[] = {
        "service_level_option",
        "effective_service_level",
        "value",
    };

    std::vector<lw_shared_ptr<column_specification>> columns;
    for (const char* column_name : column_names) {
        columns.push_back(make_column_spec("QOS", "effective_service_level", column_name, utf8_type));
    }
    return ::make_shared<cql3::metadata>(std::move(columns));
}
|
||||
|
||||
static bytes_opt decompose_timeout (const qos::service_level_options::timeout_type& duration) {
|
||||
return std::visit(overloaded_functor{
|
||||
@@ -69,11 +70,6 @@ static bytes_opt decompose_shares(const qos::service_level_options::shares_type&
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message>>
|
||||
list_effective_service_level_statement::execute(query_processor& qp, service::query_state& state, const query_options&, std::optional<service::group0_guard>) const {
|
||||
static thread_local const std::vector<lw_shared_ptr<column_specification>> metadata({
|
||||
make_column("service_level_option", utf8_type),
|
||||
make_column("effective_service_level", utf8_type),
|
||||
make_column("value", utf8_type)
|
||||
});
|
||||
auto& role_manager = state.get_client_state().get_auth_service()->underlying_role_manager();
|
||||
|
||||
if (!co_await role_manager.exists(_role_name)) {
|
||||
@@ -87,7 +83,7 @@ list_effective_service_level_statement::execute(query_processor& qp, service::qu
|
||||
throw exceptions::invalid_request_exception(format("Role {} doesn't have assigned any service level", _role_name));
|
||||
}
|
||||
|
||||
auto rs = std::make_unique<result_set>(metadata);
|
||||
auto rs = std::make_unique<result_set>(::make_shared<cql3::metadata>(*get_result_metadata()));
|
||||
rs->add_row({
|
||||
utf8_type->decompose("workload_type"),
|
||||
utf8_type->decompose(slo->effective_names->workload),
|
||||
@@ -110,4 +106,4 @@ list_effective_service_level_statement::execute(query_processor& qp, service::qu
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,9 +21,11 @@ public:
|
||||
|
||||
virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
|
||||
|
||||
virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
|
||||
|
||||
virtual future<::shared_ptr<cql_transport::messages::result_message>>
|
||||
execute(query_processor&, service::query_state&, const query_options&, std::optional<service::group0_guard>) const override;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,9 +15,18 @@
|
||||
#include "auth/authorizer.hh"
|
||||
#include "auth/common.hh"
|
||||
#include "cql3/result_set.hh"
|
||||
#include "cql3/column_identifier.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
|
||||
shared_ptr<const cql3::metadata> cql3::statements::list_permissions_statement::get_result_metadata() const {
|
||||
return ::make_shared<cql3::metadata>(
|
||||
std::vector<lw_shared_ptr<cql3::column_specification>>{
|
||||
make_column_spec(db::system_keyspace::NAME, "permissions", "role", utf8_type),
|
||||
make_column_spec(db::system_keyspace::NAME, "permissions", "username", utf8_type),
|
||||
make_column_spec(db::system_keyspace::NAME, "permissions", "resource", utf8_type),
|
||||
make_column_spec(db::system_keyspace::NAME, "permissions", "permission", utf8_type)});
|
||||
}
|
||||
|
||||
cql3::statements::list_permissions_statement::list_permissions_statement(
|
||||
auth::permission_set permissions,
|
||||
std::optional<auth::resource> resource,
|
||||
@@ -80,18 +89,6 @@ cql3::statements::list_permissions_statement::execute(
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
std::optional<service::group0_guard> guard) const {
|
||||
auto make_column = [auth_ks = auth::get_auth_ks_name(qp)](sstring name) {
|
||||
return make_lw_shared<column_specification>(
|
||||
auth_ks,
|
||||
"permissions",
|
||||
::make_shared<column_identifier>(std::move(name), true),
|
||||
utf8_type);
|
||||
};
|
||||
|
||||
std::vector<lw_shared_ptr<column_specification>> metadata({
|
||||
make_column("role"), make_column("username"), make_column("resource"), make_column("permission")
|
||||
});
|
||||
|
||||
const auto make_resource_filter = [this]()
|
||||
-> std::optional<std::pair<auth::resource, auth::recursive_permissions>> {
|
||||
if (!_resource) {
|
||||
@@ -104,6 +101,7 @@ cql3::statements::list_permissions_statement::execute(
|
||||
};
|
||||
|
||||
const auto& as = *state.get_client_state().get_auth_service();
|
||||
auto metadata = ::make_shared<cql3::metadata>(*get_result_metadata());
|
||||
|
||||
return do_with(make_resource_filter(), [this, &as, metadata = std::move(metadata)](const auto& resource_filter) mutable {
|
||||
return auth::list_filtered_permissions(
|
||||
|
||||
@@ -34,6 +34,8 @@ public:
|
||||
|
||||
std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
|
||||
|
||||
virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
|
||||
|
||||
void validate(query_processor&, const service::client_state&) const override;
|
||||
|
||||
future<> check_access(query_processor& qp, const service::client_state&) const override;
|
||||
|
||||
@@ -35,6 +35,8 @@ public:
|
||||
|
||||
std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
|
||||
|
||||
virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
|
||||
|
||||
virtual future<> check_access(query_processor& qp, const service::client_state&) const override;
|
||||
|
||||
virtual future<::shared_ptr<cql_transport::messages::result_message>>
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
|
||||
#include "seastarx.hh"
|
||||
#include "cql3/statements/list_service_level_attachments_statement.hh"
|
||||
#include "cql3/column_identifier.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "service/client_state.hh"
|
||||
#include "service/query_state.hh"
|
||||
@@ -17,6 +16,15 @@ namespace cql3 {
|
||||
|
||||
namespace statements {
|
||||
|
||||
shared_ptr<const cql3::metadata> list_service_level_attachments_statement::get_result_metadata() const {
|
||||
static thread_local const std::vector<lw_shared_ptr<column_specification>> metadata({
|
||||
make_column_spec("QOS", "service_levels_attachments", "role", utf8_type),
|
||||
make_column_spec("QOS", "service_levels_attachments", "service_level", utf8_type)
|
||||
});
|
||||
|
||||
return ::make_shared<cql3::metadata>(metadata);
|
||||
}
|
||||
|
||||
list_service_level_attachments_statement::list_service_level_attachments_statement(sstring role_name) :
|
||||
_role_name(role_name), _describe_all(false) {
|
||||
}
|
||||
@@ -40,19 +48,7 @@ list_service_level_attachments_statement::execute(query_processor& qp,
|
||||
service::query_state &state,
|
||||
const query_options &,
|
||||
std::optional<service::group0_guard> guard) const {
|
||||
|
||||
static auto make_column = [] (sstring name, const shared_ptr<const abstract_type> type) {
|
||||
return make_lw_shared<column_specification>(
|
||||
"QOS",
|
||||
"service_levels_attachments",
|
||||
::make_shared<column_identifier>(std::move(name), true),
|
||||
type);
|
||||
};
|
||||
|
||||
static thread_local const std::vector<lw_shared_ptr<column_specification>> metadata({
|
||||
make_column("role", utf8_type), make_column("service_level", utf8_type)
|
||||
});
|
||||
|
||||
auto metadata = ::make_shared<cql3::metadata>(*get_result_metadata());
|
||||
|
||||
return make_ready_future().then([this, &state] () {
|
||||
if (_describe_all) {
|
||||
@@ -67,7 +63,7 @@ list_service_level_attachments_statement::execute(query_processor& qp,
|
||||
});
|
||||
|
||||
}
|
||||
}).then([] (std::unordered_map<sstring, sstring> roles_to_att_val) {
|
||||
}).then([metadata = std::move(metadata)] (std::unordered_map<sstring, sstring> roles_to_att_val) {
|
||||
|
||||
auto rs = std::make_unique<result_set>(metadata);
|
||||
for (auto&& role_to_sl : roles_to_att_val) {
|
||||
|
||||
@@ -22,6 +22,7 @@ public:
|
||||
list_service_level_attachments_statement(sstring role_name);
|
||||
list_service_level_attachments_statement();
|
||||
std::unique_ptr<cql3::statements::prepared_statement> prepare(data_dictionary::database db, cql_stats &stats) override;
|
||||
virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
|
||||
virtual future<> check_access(query_processor& qp, const service::client_state&) const override;
|
||||
virtual future<::shared_ptr<cql_transport::messages::result_message>>
|
||||
execute(query_processor&, service::query_state&, const query_options&, std::optional<service::group0_guard> guard) const override;
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
|
||||
#include "seastarx.hh"
|
||||
#include "cql3/statements/list_service_level_statement.hh"
|
||||
#include "cql3/column_identifier.hh"
|
||||
#include "service/qos/service_level_controller.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "utils/overloaded_functor.hh"
|
||||
@@ -19,6 +18,20 @@ namespace cql3 {
|
||||
|
||||
namespace statements {
|
||||
|
||||
shared_ptr<const cql3::metadata> list_service_level_statement::get_result_metadata() const {
|
||||
std::vector<lw_shared_ptr<column_specification>> metadata{
|
||||
make_column_spec("QOS", "service_levels", "service_level", utf8_type),
|
||||
make_column_spec("QOS", "service_levels", "timeout", duration_type),
|
||||
make_column_spec("QOS", "service_levels", "workload_type", utf8_type),
|
||||
make_column_spec("QOS", "service_levels", "shares", int32_type),
|
||||
};
|
||||
if (_describe_all) {
|
||||
metadata.push_back(make_column_spec("QOS", "service_levels", "percentage of all service level shares", utf8_type));
|
||||
}
|
||||
|
||||
return ::make_shared<cql3::metadata>(std::move(metadata));
|
||||
}
|
||||
|
||||
list_service_level_statement::list_service_level_statement(sstring service_level, bool describe_all) :
|
||||
_service_level(service_level), _describe_all(describe_all) {
|
||||
}
|
||||
@@ -38,23 +51,7 @@ list_service_level_statement::execute(query_processor& qp,
|
||||
service::query_state &state,
|
||||
const query_options &,
|
||||
std::optional<service::group0_guard> guard) const {
|
||||
|
||||
static auto make_column = [] (sstring name, const shared_ptr<const abstract_type> type) {
|
||||
return make_lw_shared<column_specification>(
|
||||
"QOS",
|
||||
"service_levels",
|
||||
::make_shared<column_identifier>(std::move(name), true),
|
||||
type);
|
||||
};
|
||||
|
||||
std::vector<lw_shared_ptr<column_specification>> metadata({make_column("service_level", utf8_type),
|
||||
make_column("timeout", duration_type),
|
||||
make_column("workload_type", utf8_type),
|
||||
make_column("shares", int32_type),
|
||||
});
|
||||
if (_describe_all) {
|
||||
metadata.push_back(make_column("percentage of all service level shares", utf8_type));
|
||||
}
|
||||
auto metadata = ::make_shared<cql3::metadata>(*get_result_metadata());
|
||||
|
||||
return make_ready_future().then([this, &state] () {
|
||||
if (_describe_all) {
|
||||
|
||||
@@ -21,6 +21,7 @@ class list_service_level_statement final : public service_level_statement {
|
||||
public:
|
||||
list_service_level_statement(sstring service_level, bool describe_all);
|
||||
std::unique_ptr<cql3::statements::prepared_statement> prepare(data_dictionary::database db, cql_stats &stats) override;
|
||||
virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
|
||||
virtual future<> check_access(query_processor& qp, const service::client_state&) const override;
|
||||
virtual future<::shared_ptr<cql_transport::messages::result_message>>
|
||||
execute(query_processor&, service::query_state&, const query_options&, std::optional<service::group0_guard> guard) const override;
|
||||
|
||||
@@ -12,10 +12,17 @@
|
||||
#include "list_users_statement.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "cql3/query_options.hh"
|
||||
#include "cql3/column_identifier.hh"
|
||||
#include "auth/common.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
|
||||
shared_ptr<const cql3::metadata> cql3::statements::list_users_statement::get_result_metadata() const {
|
||||
return ::make_shared<cql3::metadata>(
|
||||
std::vector<lw_shared_ptr<cql3::column_specification>>{
|
||||
cql3::make_column_spec(db::system_keyspace::NAME, "users", "name", utf8_type),
|
||||
cql3::make_column_spec(db::system_keyspace::NAME, "users", "super", boolean_type)});
|
||||
}
|
||||
|
||||
std::unique_ptr<cql3::statements::prepared_statement> cql3::statements::list_users_statement::prepare(
|
||||
data_dictionary::database db, cql_stats& stats) {
|
||||
return std::make_unique<prepared_statement>(audit_info(), ::make_shared<list_users_statement>(*this));
|
||||
@@ -28,20 +35,7 @@ future<> cql3::statements::list_users_statement::check_access(query_processor& q
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message>>
|
||||
cql3::statements::list_users_statement::execute(query_processor& qp, service::query_state& state, const query_options& options, std::optional<service::group0_guard> guard) const {
|
||||
static const sstring virtual_table_name("users");
|
||||
|
||||
const auto make_column_spec = [auth_ks = auth::get_auth_ks_name(qp)](const sstring& name, const ::shared_ptr<const abstract_type>& ty) {
|
||||
return make_lw_shared<column_specification>(
|
||||
auth_ks,
|
||||
virtual_table_name,
|
||||
::make_shared<column_identifier>(name, true),
|
||||
ty);
|
||||
};
|
||||
|
||||
auto metadata = ::make_shared<cql3::metadata>(
|
||||
std::vector<lw_shared_ptr<column_specification>>{
|
||||
make_column_spec("name", utf8_type),
|
||||
make_column_spec("super", boolean_type)});
|
||||
auto metadata = ::make_shared<cql3::metadata>(*get_result_metadata());
|
||||
|
||||
auto make_results = [metadata = std::move(metadata)](const auth::service& as, std::unordered_set<sstring>&& roles) mutable {
|
||||
using cql_transport::messages::result_message;
|
||||
|
||||
@@ -23,6 +23,8 @@ public:
|
||||
|
||||
std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
|
||||
|
||||
virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
|
||||
|
||||
future<> check_access(query_processor& qp, const service::client_state&) const override;
|
||||
future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor&
|
||||
, service::query_state&
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "cql3/statements/list_roles_statement.hh"
|
||||
#include "cql3/statements/revoke_role_statement.hh"
|
||||
#include "cql3/statements/request_validations.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
@@ -347,6 +348,17 @@ std::unique_ptr<prepared_statement> list_roles_statement::prepare(
|
||||
return std::make_unique<prepared_statement>(audit_info(), ::make_shared<list_roles_statement>(*this));
|
||||
}
|
||||
|
||||
shared_ptr<const cql3::metadata> list_roles_statement::get_result_metadata() const {
|
||||
static const thread_local auto custom_options_type = map_type_impl::get_instance(utf8_type, utf8_type, true);
|
||||
|
||||
return ::make_shared<cql3::metadata>(
|
||||
std::vector<lw_shared_ptr<column_specification>>{
|
||||
make_column_spec(db::system_keyspace::NAME, "roles", "role", utf8_type),
|
||||
make_column_spec(db::system_keyspace::NAME, "roles", "super", boolean_type),
|
||||
make_column_spec(db::system_keyspace::NAME, "roles", "login", boolean_type),
|
||||
make_column_spec(db::system_keyspace::NAME, "roles", "options", custom_options_type)});
|
||||
}
|
||||
|
||||
future<> list_roles_statement::check_access(query_processor& qp, const service::client_state& state) const {
|
||||
state.ensure_not_anonymous();
|
||||
|
||||
@@ -376,24 +388,8 @@ future<> list_roles_statement::check_access(query_processor& qp, const service::
|
||||
|
||||
future<result_message_ptr>
|
||||
list_roles_statement::execute(query_processor& qp, service::query_state& state, const query_options&, std::optional<service::group0_guard> guard) const {
|
||||
static const sstring virtual_table_name("roles");
|
||||
|
||||
const auto make_column_spec = [auth_ks = auth::get_auth_ks_name(qp)](const sstring& name, const ::shared_ptr<const abstract_type>& ty) {
|
||||
return make_lw_shared<column_specification>(
|
||||
auth_ks,
|
||||
virtual_table_name,
|
||||
::make_shared<column_identifier>(name, true),
|
||||
ty);
|
||||
};
|
||||
|
||||
static const thread_local auto custom_options_type = map_type_impl::get_instance(utf8_type, utf8_type, true);
|
||||
|
||||
auto metadata = ::make_shared<cql3::metadata>(
|
||||
std::vector<lw_shared_ptr<column_specification>>{
|
||||
make_column_spec("role", utf8_type),
|
||||
make_column_spec("super", boolean_type),
|
||||
make_column_spec("login", boolean_type),
|
||||
make_column_spec("options", custom_options_type)});
|
||||
auto metadata = ::make_shared<cql3::metadata>(*get_result_metadata());
|
||||
|
||||
auto make_results = [metadata = std::move(metadata)](
|
||||
auth::role_manager& rm,
|
||||
|
||||
@@ -2006,9 +2006,7 @@ static std::optional<ann_ordering_info> get_ann_ordering_info(
|
||||
|
||||
auto indexes = sim.list_indexes();
|
||||
auto it = std::find_if(indexes.begin(), indexes.end(), [&prepared_ann_ordering](const auto& ind) {
|
||||
return (ind.metadata().options().contains(db::index::secondary_index::custom_class_option_name) &&
|
||||
ind.metadata().options().at(db::index::secondary_index::custom_class_option_name) == ANN_CUSTOM_INDEX_OPTION) &&
|
||||
(ind.target_column() == prepared_ann_ordering.first->name_as_text());
|
||||
return secondary_index::vector_index::is_vector_index_on_column(ind.metadata(), prepared_ann_ordering.first->name_as_text());
|
||||
});
|
||||
|
||||
if (it == indexes.end()) {
|
||||
|
||||
@@ -461,7 +461,17 @@ public:
|
||||
}
|
||||
}
|
||||
} else {
|
||||
_background_continuity = true; // Default continuity
|
||||
if (_reversed) [[unlikely]] {
|
||||
if (!rows.empty()) {
|
||||
it = std::prev(rows.end());
|
||||
cont = is_continuous::yes;
|
||||
rt = {};
|
||||
} else {
|
||||
_background_continuity = true;
|
||||
}
|
||||
} else {
|
||||
_background_continuity = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!it) {
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
#include "utils/labels.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
|
||||
namespace cache {
|
||||
|
||||
@@ -1215,10 +1216,10 @@ future<> row_cache::invalidate(external_updater eu, const dht::decorated_key& dk
|
||||
}
|
||||
|
||||
future<> row_cache::invalidate(external_updater eu, const dht::partition_range& range, cache_invalidation_filter filter) {
|
||||
return invalidate(std::move(eu), dht::partition_range_vector({range}), std::move(filter));
|
||||
return invalidate(std::move(eu), utils::chunked_vector<dht::partition_range>({range}), std::move(filter));
|
||||
}
|
||||
|
||||
future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&& ranges, cache_invalidation_filter filter) {
|
||||
future<> row_cache::invalidate(external_updater eu, utils::chunked_vector<dht::partition_range>&& ranges, cache_invalidation_filter filter) {
|
||||
return do_update(std::move(eu), [this, ranges = std::move(ranges), filter = std::move(filter)] mutable {
|
||||
return seastar::async([this, ranges = std::move(ranges), filter = std::move(filter)] {
|
||||
auto on_failure = defer([this] () noexcept {
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "utils/histogram.hh"
|
||||
#include "mutation/partition_version.hh"
|
||||
#include "utils/double-decker.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "db/cache_tracker.hh"
|
||||
#include "readers/empty.hh"
|
||||
#include "readers/mutation_source.hh"
|
||||
@@ -457,7 +458,7 @@ public:
|
||||
// mutation source made prior to the call to invalidate().
|
||||
future<> invalidate(external_updater, const dht::decorated_key&);
|
||||
future<> invalidate(external_updater, const dht::partition_range& = query::full_partition_range, cache_invalidation_filter filter = [] (const auto&) { return true; });
|
||||
future<> invalidate(external_updater, dht::partition_range_vector&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
|
||||
future<> invalidate(external_updater, utils::chunked_vector<dht::partition_range>&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
|
||||
|
||||
// Evicts entries from cache.
|
||||
//
|
||||
|
||||
@@ -105,7 +105,7 @@ namespace {
|
||||
schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if (builder.ks_name() == schema_tables::NAME) {
|
||||
// all schema tables are group0 tables
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -87,31 +87,15 @@ namespace {
|
||||
static const std::unordered_set<sstring> tables = {
|
||||
schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
|
||||
system_keyspace::BROADCAST_KV_STORE,
|
||||
system_keyspace::CDC_GENERATIONS_V3,
|
||||
system_keyspace::RAFT,
|
||||
system_keyspace::RAFT_SNAPSHOTS,
|
||||
system_keyspace::RAFT_SNAPSHOT_CONFIG,
|
||||
system_keyspace::GROUP0_HISTORY,
|
||||
system_keyspace::DISCOVERY,
|
||||
system_keyspace::TABLETS,
|
||||
system_keyspace::TOPOLOGY,
|
||||
system_keyspace::TOPOLOGY_REQUESTS,
|
||||
system_keyspace::LOCAL,
|
||||
system_keyspace::PEERS,
|
||||
system_keyspace::SCYLLA_LOCAL,
|
||||
system_keyspace::COMMITLOG_CLEANUPS,
|
||||
system_keyspace::SERVICE_LEVELS_V2,
|
||||
system_keyspace::VIEW_BUILD_STATUS_V2,
|
||||
system_keyspace::CDC_STREAMS_STATE,
|
||||
system_keyspace::CDC_STREAMS_HISTORY,
|
||||
system_keyspace::ROLES,
|
||||
system_keyspace::ROLE_MEMBERS,
|
||||
system_keyspace::ROLE_ATTRIBUTES,
|
||||
system_keyspace::ROLE_PERMISSIONS,
|
||||
system_keyspace::CDC_LOCAL,
|
||||
system_keyspace::DICTS,
|
||||
system_keyspace::VIEW_BUILDING_TASKS,
|
||||
system_keyspace::CLIENT_ROUTES,
|
||||
};
|
||||
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
|
||||
builder.enable_schema_commitlog();
|
||||
@@ -143,7 +127,7 @@ namespace {
|
||||
system_keyspace::REPAIR_TASKS,
|
||||
};
|
||||
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -930,8 +930,7 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
|
||||
const row& existing_row = existing.cells();
|
||||
const row& updated_row = update.cells();
|
||||
|
||||
const bool base_has_nonexpiring_marker = update.marker().is_live() && !update.marker().is_expiring();
|
||||
return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row, base_has_nonexpiring_marker] (const column_definition& cdef) {
|
||||
return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row] (const column_definition& cdef) {
|
||||
const auto view_it = _view->columns_by_name().find(cdef.name());
|
||||
const bool column_is_selected = view_it != _view->columns_by_name().end();
|
||||
|
||||
@@ -939,49 +938,29 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
|
||||
// as part of its PK, there are NO virtual columns corresponding to the unselected columns in the view.
|
||||
// Because of that, we don't generate view updates when the value in an unselected column is created
|
||||
// or changes.
|
||||
if (!column_is_selected && _base_info.has_base_non_pk_columns_in_view_pk) {
|
||||
if (!column_is_selected) {
|
||||
return true;
|
||||
}
|
||||
|
||||
//TODO(sarna): Optimize collections case - currently they do not go under optimization
|
||||
if (!cdef.is_atomic()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// We cannot skip if the value was created or deleted, unless we have a non-expiring marker
|
||||
// We cannot skip if the value was created or deleted
|
||||
const auto* existing_cell = existing_row.find_cell(cdef.id);
|
||||
const auto* updated_cell = updated_row.find_cell(cdef.id);
|
||||
if (existing_cell == nullptr || updated_cell == nullptr) {
|
||||
return existing_cell == updated_cell || (!column_is_selected && base_has_nonexpiring_marker);
|
||||
return existing_cell == updated_cell;
|
||||
}
|
||||
|
||||
if (!cdef.is_atomic()) {
|
||||
return existing_cell->as_collection_mutation().data == updated_cell->as_collection_mutation().data;
|
||||
}
|
||||
|
||||
atomic_cell_view existing_cell_view = existing_cell->as_atomic_cell(cdef);
|
||||
atomic_cell_view updated_cell_view = updated_cell->as_atomic_cell(cdef);
|
||||
|
||||
// We cannot skip when a selected column is changed
|
||||
if (column_is_selected) {
|
||||
if (view_it->second->is_view_virtual()) {
|
||||
return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
|
||||
}
|
||||
return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
|
||||
if (view_it->second->is_view_virtual()) {
|
||||
return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
|
||||
}
|
||||
|
||||
// With non-expiring row marker, liveness checks below are not relevant
|
||||
if (base_has_nonexpiring_marker) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (existing_cell_view.is_live() != updated_cell_view.is_live()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// We cannot skip if the change updates TTL
|
||||
const bool existing_has_ttl = existing_cell_view.is_live_and_has_ttl();
|
||||
const bool updated_has_ttl = updated_cell_view.is_live_and_has_ttl();
|
||||
if (existing_has_ttl || updated_has_ttl) {
|
||||
return existing_has_ttl == updated_has_ttl && existing_cell_view.expiry() == updated_cell_view.expiry();
|
||||
}
|
||||
|
||||
return true;
|
||||
return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1749,7 +1728,7 @@ static endpoints_to_update get_view_natural_endpoint_vnodes(
|
||||
std::vector<std::reference_wrapper<const locator::node>> base_nodes,
|
||||
std::vector<std::reference_wrapper<const locator::node>> view_nodes,
|
||||
locator::endpoint_dc_rack my_location,
|
||||
const locator::network_topology_strategy* network_topology,
|
||||
const bool network_topology,
|
||||
replica::cf_stats& cf_stats) {
|
||||
using node_vector = std::vector<std::reference_wrapper<const locator::node>>;
|
||||
node_vector base_endpoints, view_endpoints;
|
||||
@@ -1902,7 +1881,7 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
locator::host_id me,
|
||||
const locator::effective_replication_map_ptr& base_erm,
|
||||
const locator::effective_replication_map_ptr& view_erm,
|
||||
const locator::abstract_replication_strategy& replication_strategy,
|
||||
const bool network_topology,
|
||||
const dht::token& base_token,
|
||||
const dht::token& view_token,
|
||||
bool use_tablets,
|
||||
@@ -1910,7 +1889,6 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
auto& topology = base_erm->get_token_metadata_ptr()->get_topology();
|
||||
auto& view_topology = view_erm->get_token_metadata_ptr()->get_topology();
|
||||
auto& my_location = topology.get_location(me);
|
||||
auto* network_topology = dynamic_cast<const locator::network_topology_strategy*>(&replication_strategy);
|
||||
|
||||
auto resolve = [&] (const locator::topology& topology, const locator::host_id& ep, bool is_view) -> const locator::node& {
|
||||
if (auto* np = topology.find_node(ep)) {
|
||||
@@ -1944,7 +1922,7 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
// view pairing as the leaving base replica.
|
||||
// note that the recursive call will not recurse again because leaving_base is in base_nodes.
|
||||
auto leaving_base = it->get().host_id();
|
||||
return get_view_natural_endpoint(leaving_base, base_erm, view_erm, replication_strategy, base_token,
|
||||
return get_view_natural_endpoint(leaving_base, base_erm, view_erm, network_topology, base_token,
|
||||
view_token, use_tablets, cf_stats);
|
||||
}
|
||||
}
|
||||
@@ -2040,7 +2018,9 @@ future<> view_update_generator::mutate_MV(
|
||||
wait_for_all_updates wait_for_all)
|
||||
{
|
||||
auto& ks = _db.find_keyspace(base->ks_name());
|
||||
auto& replication = ks.get_replication_strategy();
|
||||
const bool uses_tablets = ks.uses_tablets();
|
||||
const bool uses_nts = dynamic_cast<const locator::network_topology_strategy*>(&ks.get_replication_strategy()) != nullptr;
|
||||
// The object pointed by `ks` may disappear after preeemption. It should not be touched again after this comment.
|
||||
std::unordered_map<table_id, locator::effective_replication_map_ptr> erms;
|
||||
auto get_erm = [&] (table_id id) {
|
||||
auto it = erms.find(id);
|
||||
@@ -2059,8 +2039,8 @@ future<> view_update_generator::mutate_MV(
|
||||
co_await max_concurrent_for_each(view_updates, max_concurrent_updates, [&] (frozen_mutation_and_schema mut) mutable -> future<> {
|
||||
auto view_token = dht::get_token(*mut.s, mut.fm.key());
|
||||
auto view_ermp = erms.at(mut.s->id());
|
||||
auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, replication, base_token, view_token,
|
||||
ks.uses_tablets(), cf_stats);
|
||||
auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, uses_nts, base_token, view_token,
|
||||
uses_tablets, cf_stats);
|
||||
auto remote_endpoints = view_ermp->get_pending_replicas(view_token);
|
||||
auto memory_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_update_memory_units.split(memory_usage_of(mut)));
|
||||
if (no_pairing_endpoint) {
|
||||
|
||||
@@ -303,7 +303,7 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
locator::host_id node,
|
||||
const locator::effective_replication_map_ptr& base_erm,
|
||||
const locator::effective_replication_map_ptr& view_erm,
|
||||
const locator::abstract_replication_strategy& replication_strategy,
|
||||
const bool network_topology,
|
||||
const dht::token& base_token,
|
||||
const dht::token& view_token,
|
||||
bool use_tablets,
|
||||
|
||||
@@ -200,9 +200,7 @@ future<> view_building_worker::run_staging_sstables_registrator() {
|
||||
while (!_as.abort_requested()) {
|
||||
bool sleep = false;
|
||||
try {
|
||||
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
|
||||
co_await create_staging_sstable_tasks();
|
||||
lock.return_all();
|
||||
_as.check();
|
||||
co_await _sstables_to_register_event.when();
|
||||
} catch (semaphore_aborted&) {
|
||||
@@ -227,13 +225,45 @@ future<> view_building_worker::run_staging_sstables_registrator() {
|
||||
}
|
||||
}
|
||||
|
||||
future<std::vector<foreign_ptr<semaphore_units<>>>> view_building_worker::lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards) {
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
// Collect `_staging_sstables_mutex` locks from multiple shards,
|
||||
// so other shards won't interact with their `_staging_sstables` map
|
||||
// until the caller releases them.
|
||||
std::vector<foreign_ptr<semaphore_units<>>> locks;
|
||||
locks.resize(smp::count);
|
||||
// Locks are acquired from multiple shards in parallel.
|
||||
// This is the only place where multiple-shard locks are acquired at once
|
||||
// and the method is called only once at a time (from `create_staging_sstable_tasks()`
|
||||
// on shard 0), so no deadlock may occur.
|
||||
co_await coroutine::parallel_for_each(shards, [&locks, &sharded_vbw = container()] (auto shard_id) -> future<> {
|
||||
auto lock_ptr = co_await smp::submit_to(shard_id, [&sharded_vbw] () -> future<foreign_ptr<semaphore_units<>>> {
|
||||
auto& vbw = sharded_vbw.local();
|
||||
auto lock = co_await get_units(vbw._staging_sstables_mutex, 1, vbw._as);
|
||||
co_return make_foreign(std::move(lock));
|
||||
});
|
||||
locks[shard_id] = std::move(lock_ptr);
|
||||
});
|
||||
co_return std::move(locks);
|
||||
}
|
||||
|
||||
future<> view_building_worker::create_staging_sstable_tasks() {
|
||||
// Explicitly lock shard0 beforehand to prevent other shards from modifying `_sstables_to_register` from `register_staging_sstable_tasks()`
|
||||
auto lock0 = co_await get_units(_staging_sstables_mutex, 1, _as);
|
||||
|
||||
if (_sstables_to_register.empty()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
auto shards = _sstables_to_register
|
||||
| std::views::values
|
||||
| std::views::join
|
||||
| std::views::transform([] (const auto& sst_info) { return sst_info.shard; })
|
||||
| std::ranges::to<std::flat_set<shard_id>>();
|
||||
shards.erase(0); // We're already holding shard0 lock
|
||||
auto locks = co_await lock_staging_mutex_on_multiple_shards(std::move(shards));
|
||||
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
auto guard = co_await _group0.client().start_operation(_as);
|
||||
auto my_host_id = _db.get_token_metadata().get_topology().my_host_id();
|
||||
for (auto& [table_id, sst_infos]: _sstables_to_register) {
|
||||
@@ -672,24 +702,34 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
|
||||
}
|
||||
|
||||
future<> view_building_worker::do_process_staging(table_id table_id, dht::token last_token) {
|
||||
if (_staging_sstables[table_id].empty()) {
|
||||
auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
|
||||
std::vector<sstables::shared_sstable> sstables_to_process;
|
||||
|
||||
try {
|
||||
// Acquire `_staging_sstables_mutex` to prevent `create_staging_sstable_tasks()` from
|
||||
// concurrently modifying `_staging_sstables` (moving entries from `_sstables_to_register`)
|
||||
// while we read them.
|
||||
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
|
||||
auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
|
||||
auto tid = tablet_map.get_tablet_id(last_token);
|
||||
auto tablet_range = tablet_map.get_token_range(tid);
|
||||
|
||||
// Select sstables belonging to the tablet (identified by `last_token`)
|
||||
for (auto& sst: _staging_sstables[table_id]) {
|
||||
auto sst_last_token = sst->get_last_decorated_key().token();
|
||||
if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
|
||||
sstables_to_process.push_back(sst);
|
||||
}
|
||||
}
|
||||
lock.return_all();
|
||||
} catch (semaphore_aborted&) {
|
||||
vbw_logger.warn("Semaphore was aborted while waiting to removed processed sstables for table {}", table_id);
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
|
||||
auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
|
||||
auto tid = tablet_map.get_tablet_id(last_token);
|
||||
auto tablet_range = tablet_map.get_token_range(tid);
|
||||
|
||||
// Select sstables belonging to the tablet (identified by `last_token`)
|
||||
std::vector<sstables::shared_sstable> sstables_to_process;
|
||||
for (auto& sst: _staging_sstables[table_id]) {
|
||||
auto sst_last_token = sst->get_last_decorated_key().token();
|
||||
if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
|
||||
sstables_to_process.push_back(sst);
|
||||
}
|
||||
if (sstables_to_process.empty()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
co_await _vug.process_staging_sstables(std::move(table), sstables_to_process);
|
||||
|
||||
try {
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <flat_set>
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "raft/raft.hh"
|
||||
@@ -169,10 +170,15 @@ private:
|
||||
future<> do_process_staging(table_id base_id, dht::token last_token);
|
||||
|
||||
future<> run_staging_sstables_registrator();
|
||||
// Caller must hold units from `_staging_sstables_mutex`
|
||||
// Acquires `_staging_sstables_mutex` on all shards internally,
|
||||
// so callers must not hold `_staging_sstables_mutex` when invoking it.
|
||||
future<> create_staging_sstable_tasks();
|
||||
future<> discover_existing_staging_sstables();
|
||||
std::unordered_map<table_id, std::vector<staging_sstable_task_info>> discover_local_staging_sstables(building_tasks building_tasks);
|
||||
// Acquire `_staging_sstables_mutex` on multiple shards in parallel.
|
||||
// Must be called only from shard 0.
|
||||
// Must be called ONLY by `create_staging_sstable_tasks()` and only once at a time to avoid deadlock.
|
||||
future<std::vector<foreign_ptr<semaphore_units<>>>> lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards);
|
||||
|
||||
void init_messaging_service();
|
||||
future<> uninit_messaging_service();
|
||||
|
||||
@@ -352,6 +352,16 @@ dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& r
|
||||
return prs;
|
||||
}
|
||||
|
||||
// Coroutine variant of to_partition_ranges() that collects into a
// utils::chunked_vector: converts every token range in `ranges` with
// dht::to_partition_range(), in input order, and returns the result.
// Calls coroutine::maybe_yield() after each converted element.
future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges) {
|
||||
utils::chunked_vector<dht::partition_range> prs;
|
||||
// The output has exactly one entry per input range.
prs.reserve(ranges.size());
|
||||
for (auto& range : ranges) {
|
||||
prs.push_back(dht::to_partition_range(range));
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
co_return prs;
|
||||
}
|
||||
|
||||
std::map<unsigned, dht::partition_range_vector>
|
||||
split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& raw_sharder) {
|
||||
std::map<unsigned, dht::partition_range_vector> ret;
|
||||
@@ -364,11 +374,11 @@ split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& r
|
||||
return ret;
|
||||
}
|
||||
|
||||
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& source_ranges, dht::partition_range_vector ranges_to_subtract) {
|
||||
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> source_ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract) {
|
||||
auto cmp = dht::ring_position_comparator(schema);
|
||||
// optimize set of potentially overlapping ranges by deoverlapping them.
|
||||
auto ranges = dht::partition_range::deoverlap(source_ranges, cmp);
|
||||
dht::partition_range_vector res;
|
||||
auto ranges = dht::partition_range::deoverlap(std::move(source_ranges), cmp);
|
||||
utils::chunked_vector<dht::partition_range> res;
|
||||
res.reserve(ranges.size() * 2);
|
||||
|
||||
auto range = ranges.begin();
|
||||
|
||||
@@ -91,6 +91,7 @@ inline token get_token(const schema& s, partition_key_view key) {
|
||||
|
||||
dht::partition_range to_partition_range(dht::token_range);
|
||||
dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& ranges, utils::can_yield can_yield = utils::can_yield::no);
|
||||
future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges);
|
||||
|
||||
// Each shard gets a sorted, disjoint vector of ranges
|
||||
std::map<unsigned, dht::partition_range_vector>
|
||||
@@ -105,7 +106,7 @@ std::unique_ptr<dht::i_partitioner> make_partitioner(sstring name);
|
||||
// Returns a sorted and deoverlapped list of ranges that are
|
||||
// the result of subtracting all ranges from ranges_to_subtract.
|
||||
// ranges_to_subtract must be sorted and deoverlapped.
|
||||
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& ranges, dht::partition_range_vector ranges_to_subtract);
|
||||
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract);
|
||||
|
||||
// Returns a token_range vector split based on the given number of most-significant bits
|
||||
dht::token_range_vector split_token_range_msb(unsigned most_significant_bits);
|
||||
|
||||
63
dht/token.hh
63
dht/token.hh
@@ -30,6 +30,31 @@ enum class token_kind {
|
||||
after_all_keys,
|
||||
};
|
||||
|
||||
// Represents a token for partition keys.
|
||||
// Has a disengaged state, which sorts before all engaged states.
|
||||
struct raw_token {
|
||||
// Raw token value. std::numeric_limits<int64_t>::min() marks the disengaged state
// (see the default constructor and operator bool below).
int64_t value;
|
||||
|
||||
/// Constructs a disengaged token.
|
||||
raw_token() : value(std::numeric_limits<int64_t>::min()) {}
|
||||
|
||||
/// Constructs an engaged token.
|
||||
/// The token must be of token_kind::key kind.
|
||||
explicit raw_token(const token&);
|
||||
|
||||
/// Wraps a raw value as-is, without any check.
/// Passing std::numeric_limits<int64_t>::min() therefore yields a disengaged token.
explicit raw_token(int64_t v) : value(v) {};
|
||||
|
||||
/// Defaulted comparison: orders raw tokens by `value`, the only data member.
std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
|
||||
/// Orders this raw token against a full `token`; declared here, defined outside the struct.
std::strong_ordering operator<=>(const token& o) const noexcept;
|
||||
|
||||
/// Returns true iff engaged.
|
||||
explicit operator bool() const noexcept {
|
||||
return value != std::numeric_limits<int64_t>::min();
|
||||
}
|
||||
};
|
||||
|
||||
using raw_token_opt = seastar::optimized_optional<raw_token>;
|
||||
|
||||
class token {
|
||||
// INT64_MIN is not a legal token, but a special value used to represent
|
||||
// infinity in token intervals.
|
||||
@@ -52,6 +77,10 @@ public:
|
||||
|
||||
constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}
|
||||
|
||||
// Converts a raw_token into a token: an engaged raw token becomes a kind::key
// token, a disengaged one becomes kind::before_all_keys. `raw.value` is passed
// through unchanged in both cases.
token(raw_token raw) noexcept
|
||||
: token(raw ? kind::key : kind::before_all_keys, raw.value)
|
||||
{ }
|
||||
|
||||
// This constructor seems redundant with the bytes_view constructor, but
|
||||
// it's necessary for IDL, which passes a deserialized_bytes_proxy here.
|
||||
// (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
|
||||
@@ -223,6 +252,29 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// Builds a raw_token from a full token by copying the value returned by t.raw().
// The token's kind is not stored; it is only checked in DEBUG builds.
inline
|
||||
raw_token::raw_token(const token& t)
|
||||
: value(t.raw())
|
||||
{
|
||||
// Only DEBUG builds verify that the source token is of kind `key`.
#ifdef DEBUG
|
||||
assert(t._kind == token::kind::key);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Three-way comparison of a raw_token (left) against a full token (right).
// Any raw_token orders before an after_all_keys token; for the other two kinds
// the result is the comparison of `value` with the token's `_data`.
// NOTE(review): there is no return after the switch, so this relies on the three
// cases below covering every token kind - confirm against the enum definition.
inline
|
||||
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
|
||||
switch (o._kind) {
|
||||
case token::kind::after_all_keys:
|
||||
return std::strong_ordering::less;
|
||||
case token::kind::before_all_keys:
|
||||
// before_all_keys has a raw value set to the same raw value as a disengaged raw_token, and sorts before all keys.
|
||||
// So we can order them by just comparing raw values.
|
||||
[[fallthrough]];
|
||||
case token::kind::key:
|
||||
return value <=> o._data;
|
||||
}
|
||||
}
|
||||
|
||||
inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
|
||||
if (l1 == l2) {
|
||||
return std::strong_ordering::equal;
|
||||
@@ -329,6 +381,17 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
|
||||
}
|
||||
};
|
||||
|
||||
// fmt formatter for dht::raw_token: a disengaged token is printed as "null",
// an engaged one as its integer `value`.
template <>
|
||||
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
|
||||
template <typename FormatContext>
|
||||
auto format(const dht::raw_token& t, FormatContext& ctx) const {
|
||||
// `!t` uses raw_token's explicit operator bool, i.e. it is true for a disengaged token.
if (!t) {
|
||||
return fmt::format_to(ctx.out(), "null");
|
||||
}
|
||||
return fmt::format_to(ctx.out(), "{}", t.value);
|
||||
}
|
||||
};
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
|
||||
28
dist/common/scripts/scylla_swap_setup
vendored
28
dist/common/scripts/scylla_swap_setup
vendored
@@ -9,6 +9,7 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shlex
|
||||
import argparse
|
||||
import psutil
|
||||
from pathlib import Path
|
||||
@@ -103,16 +104,41 @@ if __name__ == '__main__':
|
||||
run('dd if=/dev/zero of={} bs=1M count={}'.format(swapfile, swapsize_mb), shell=True, check=True)
|
||||
swapfile.chmod(0o600)
|
||||
run('mkswap -f {}'.format(swapfile), shell=True, check=True)
|
||||
|
||||
mount_point = find_mount_point(swap_directory)
|
||||
mount_unit = out(f'systemd-escape -p --suffix=mount {shlex.quote(str(mount_point))}')
|
||||
|
||||
# Add DefaultDependencies=no to the swap unit to avoid getting the default
|
||||
# Before=swap.target dependency. We apply this to all clouds, but the
|
||||
# requirement came from Azure:
|
||||
#
|
||||
# On Azure, the swap directory is on the Azure ephemeral disk (mounted on /mnt).
|
||||
# However, cloud-init makes this mount (i.e., the mnt.mount unit) depend on
|
||||
# the network (After=network-online.target). By extension, this means that
|
||||
# the swap unit depends on the network. If we didn't use DefaultDependencies=no,
|
||||
# then the swap unit would be part of the swap.target which other services
|
||||
# assume to be a local boot target, so we would end up with dependency cycles
|
||||
# such as:
|
||||
#
|
||||
# swap.target -> mnt-swapfile.swap -> mnt.mount -> network-online.target -> network.target -> systemd-resolved.service -> tmp.mount -> swap.target
|
||||
#
|
||||
# By removing the automatic Before=swap.target, the swap unit is no longer
|
||||
# part of swap.target, avoiding such cycles. The swap will still be
|
||||
# activated via WantedBy=multi-user.target.
|
||||
unit_data = '''
|
||||
[Unit]
|
||||
Description=swapfile
|
||||
DefaultDependencies=no
|
||||
After={}
|
||||
Conflicts=umount.target
|
||||
Before=umount.target
|
||||
|
||||
[Swap]
|
||||
What={}
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
'''[1:-1].format(swapfile)
|
||||
'''[1:-1].format(mount_unit, swapfile)
|
||||
with swapunit.open('w') as f:
|
||||
f.write(unit_data)
|
||||
systemd_unit.reload()
|
||||
|
||||
@@ -31,7 +31,7 @@ was used. Alternator currently supports two compression algorithms, `gzip`
|
||||
and `deflate`, both standardized in ([RFC 9110](https://www.rfc-editor.org/rfc/rfc9110.html)).
|
||||
Other standard compression types which are listed in
|
||||
[IANA's HTTP Content Coding Registry](https://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding),
|
||||
including `zstd` ([RFC 8878][https://www.rfc-editor.org/rfc/rfc8878.html]),
|
||||
including `zstd` ([RFC 8878](https://www.rfc-editor.org/rfc/rfc8878.html)),
|
||||
are not yet supported by Alternator.
|
||||
|
||||
Note that HTTP's compression only compresses the request's _body_ - not the
|
||||
|
||||
@@ -437,6 +437,36 @@ To migrate a keyspace from a numeric replication factor to a rack-list replicati
|
||||
ALTER KEYSPACE Excelsior
|
||||
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : ['RAC1', 'RAC2', 'RAC3'], 'dc2' : ['RAC4']} AND tablets = { 'enabled': true };
|
||||
|
||||
.. _fix-rf-change-tablet-rebuilds:
|
||||
|
||||
Fixing invalid replica state with RF change
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If a tablet rebuild fails during an RF change, the state of replicas will be invalid, even though the RF change is marked as successful. The missing replicas will eventually be added in the background. However, until then, subsequent RF changes will fail.
|
||||
|
||||
To fix the state of replicas in the foreground, retry the previous ALTER KEYSPACE statement, i.e. update the replication factor to the same value it currently has.
|
||||
|
||||
For example, if the following statement fails due to invalid replica state:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 1} AND tablets = { 'enabled': true };
|
||||
|
||||
Check the current replication factor with DESCRIBE KEYSPACE:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE Excelsior;
|
||||
CREATE KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 2} AND tablets = { 'enabled': true };
|
||||
|
||||
Ensure that a valid replica state is reachable (e.g., there are enough non-excluded racks), then alter the keyspace using its current replication factor:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 2} AND tablets = { 'enabled': true };
|
||||
|
||||
This should fix the state of replicas and allow future RF changes to succeed.
|
||||
|
||||
.. _drop-keyspace-statement:
|
||||
|
||||
DROP KEYSPACE
|
||||
|
||||
@@ -281,8 +281,8 @@ For example::
|
||||
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
|
||||
|
||||
|
||||
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
|
||||
or columns provided in a definition of the index.
|
||||
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
|
||||
See :ref:`WHERE <where-clause>`.
|
||||
|
||||
For example::
|
||||
|
||||
@@ -290,10 +290,6 @@ For example::
|
||||
WHERE user_id = 'user123'
|
||||
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
|
||||
|
||||
The supported operations are equal relations (``=`` and ``IN``) with restrictions as in regular ``WHERE`` clauses. See :ref:`WHERE <where-clause>`.
|
||||
|
||||
Other filtering scenarios are currently not supported.
|
||||
|
||||
.. note::
|
||||
|
||||
Vector indexes are supported in ScyllaDB Cloud only in clusters that have the Vector Search feature enabled.
|
||||
|
||||
@@ -37,8 +37,17 @@ Global index's target is usually just the indexed column name, unless the index
|
||||
- index on map, set or list values: VALUES(v)
|
||||
- index on map entries: ENTRIES(v)
|
||||
|
||||
Their serialization is just string representation, so:
|
||||
"v", "FULL(v)", "KEYS(v)", "VALUES(v)", "ENTRIES(v)" are all valid targets.
|
||||
Their serialization uses lowercase type names as prefixes, except for `full` which is serialized
|
||||
as just the column name (without any prefix):
|
||||
`"v"`, `"keys(v)"`, `"values(v)"`, `"entries(v)"` are valid targets; a frozen full collection
|
||||
index on column `v` is stored simply as `"v"` (same as a regular index).
|
||||
|
||||
If the column name contains characters that could be confused with the above formats
|
||||
(e.g., a name containing parentheses or braces), it is escaped using the CQL
|
||||
quoted-identifier syntax (column_identifier::to_cql_string()), which wraps the
|
||||
name in double quotes and doubles any embedded double-quote characters. For example,
|
||||
a column named `hEllo` is stored as `"hEllo"`, and a column named `keys(m)` is
|
||||
stored as `"keys(m)"`.
|
||||
|
||||
## Local index
|
||||
|
||||
|
||||
10
docs/dev/vector_index.md
Normal file
10
docs/dev/vector_index.md
Normal file
@@ -0,0 +1,10 @@
|
||||
# Vector index in Scylla
|
||||
|
||||
Vector indexes are custom indexes (`USING 'vector_index'`). Their `target` option in `system_schema.indexes` uses the following format:
|
||||
|
||||
- Simple single-column vector index `(v)`: just the (escaped) column name, e.g. `v`
|
||||
- Vector index with filtering columns `(v, f1, f2)`: JSON with `tc` (target column) and `fc` (filtering columns): `{"tc":"v","fc":["f1","f2"]}`
|
||||
- Local vector index `((p1, p2), v)`: JSON with `tc` and `pk` (partition key columns): `{"tc":"v","pk":["p1","p2"]}`
|
||||
- Local vector index with filtering columns `((p1, p2), v, f1, f2)`: JSON with `tc`, `pk`, and `fc`: `{"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}`
|
||||
|
||||
The `target` option acts as the interface for the vector-store service, providing the metadata necessary to determine which columns are indexed and how they are structured.
|
||||
@@ -52,7 +52,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list http://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
@@ -125,7 +125,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L http://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
|
||||
@@ -133,19 +133,19 @@ Install ScyllaDB
|
||||
|
||||
sudo yum install scylla
|
||||
|
||||
Running the command installs the latest official version of ScyllaDB Open Source.
|
||||
Alternatively, you can to install a specific patch version:
|
||||
Running the command installs the latest official version of ScyllaDB.
|
||||
Alternatively, you can install a specific patch version:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo yum install scylla-<your patch version>
|
||||
|
||||
Example: The following example shows the command to install ScyllaDB 5.2.3.
|
||||
Example: The following example shows installing ScyllaDB 2025.3.1.
|
||||
|
||||
.. code-block:: console
|
||||
:class: hide-copy-button
|
||||
|
||||
sudo yum install scylla-5.2.3
|
||||
sudo yum install scylla-2025.3.1
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
|
||||
|
||||
@@ -36,11 +36,8 @@ release versions, run:
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
|
||||
|
||||
|
||||
Versions 2025.1 and Later
|
||||
==============================
|
||||
|
||||
Run the command with the ``--scylla-version`` option to specify the version
|
||||
you want to install.
|
||||
To install a non-default version, run the command with the ``--scylla-version``
|
||||
option to specify the version you want to install.
|
||||
|
||||
**Example**
|
||||
|
||||
@@ -50,20 +47,4 @@ you want to install.
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|
|
||||
|
||||
|
||||
Versions Earlier than 2025.1
|
||||
================================
|
||||
|
||||
To install a supported version of *ScyllaDB Enterprise*, run the command with:
|
||||
|
||||
* ``--scylla-product scylla-enterprise`` to specify that you want to install
|
||||
ScyllaDB Enterprise.
|
||||
* ``--scylla-version`` to specify the version you want to install.
|
||||
|
||||
For example:
|
||||
|
||||
.. code:: console
|
||||
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1
|
||||
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
@@ -0,0 +1,492 @@
|
||||
=================================================
|
||||
Cluster Platform Migration Using Node Cycling
|
||||
=================================================
|
||||
|
||||
This procedure describes how to migrate a ScyllaDB cluster to new instance types
|
||||
using the add-and-replace approach, which is commonly used for:
|
||||
|
||||
* Migrating from one CPU architecture to another (e.g., x86_64 to ARM/Graviton)
|
||||
* Upgrading to newer instance types with better performance
|
||||
* Changing instance families within the same cloud provider
|
||||
|
||||
The add-and-replace approach maintains data replication throughout the migration
|
||||
and ensures zero downtime for client applications.
|
||||
|
||||
.. note::
|
||||
|
||||
This procedure does **not** change the ScyllaDB software version. All nodes
|
||||
(both existing and new) must run the same ScyllaDB version. For software
|
||||
version upgrades, see :doc:`Upgrade </upgrade/index>`.
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
The add-and-replace migration follows these steps:
|
||||
|
||||
#. Add new nodes (on target instance type) to the existing cluster
|
||||
#. Wait for data to stream to the new nodes
|
||||
#. Decommission old nodes (on source instance type)
|
||||
|
||||
This approach keeps the cluster operational throughout the migration while
|
||||
maintaining the configured replication factor.
|
||||
|
||||
Key characteristics
|
||||
===================
|
||||
|
||||
* **Zero downtime**: Client applications continue to operate during migration
|
||||
* **Data safety**: Replication factor is maintained throughout the process
|
||||
* **Flexible**: Works with both vnodes and tablets-enabled clusters
|
||||
* **Multi-DC support**: Can migrate nodes across multiple datacenters
|
||||
|
||||
.. warning::
|
||||
|
||||
Ensure your cluster has sufficient capacity during the migration. At the peak
|
||||
of the process, your cluster will temporarily have double the number of nodes.
|
||||
|
||||
Prerequisites
|
||||
-------------
|
||||
|
||||
Check cluster health
|
||||
====================
|
||||
|
||||
Before starting the migration, verify that your cluster is healthy:
|
||||
|
||||
#. Check that all nodes are in Up Normal (UN) status:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
All nodes should show ``UN`` status. Do not proceed if any nodes are down.
|
||||
|
||||
#. Ensure no streaming or repair operations are in progress:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool netstats
|
||||
nodetool compactionstats
|
||||
|
||||
Plan the migration
|
||||
==================
|
||||
|
||||
Before provisioning new instances, plan the following:
|
||||
|
||||
**Instance type mapping**: Identify the source and target instance types.
|
||||
If your cluster uses vnodes (not tablets), consider that mismatched shard
|
||||
counts between source and target instance types can cause slower repairs.
|
||||
With tablets enabled, shard count mismatch is fully supported.
|
||||
|
||||
**Rack assignment planning**: Each new node must be assigned to the same rack
|
||||
as the node it will replace. This maintains rack-aware topology for:
|
||||
|
||||
* Rack-aware replication (NetworkTopologyStrategy)
|
||||
* Proper data distribution across failure domains
|
||||
* Minimizing data movement during decommission
|
||||
|
||||
Example mapping for a 3-node cluster:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
Source nodes (to be decommissioned): Target nodes (to be added):
|
||||
192.168.1.10 - RACK0 → 192.168.2.10 - RACK0
|
||||
192.168.1.11 - RACK1 → 192.168.2.11 - RACK1
|
||||
192.168.1.12 - RACK2 → 192.168.2.12 - RACK2
|
||||
|
||||
Create a backup
|
||||
===============
|
||||
|
||||
Back up the data before starting the migration. One of the following
|
||||
methods can be used:
|
||||
|
||||
* **ScyllaDB Manager** (recommended): Use ScyllaDB Manager to perform a
|
||||
cluster-wide backup. See the
|
||||
`ScyllaDB Manager documentation <https://manager.docs.scylladb.com/stable/backup/>`_
|
||||
for details.
|
||||
|
||||
* **Snapshots**: On each node in the cluster, create a snapshot:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool snapshot -t pre_migration_backup
|
||||
nodetool listsnapshots
|
||||
|
||||
.. note::
|
||||
|
||||
Snapshots are local to each node and do not protect against node or disk
|
||||
failure. For full disaster recovery, use ScyllaDB Manager backup.
|
||||
|
||||
|
||||
Procedure
|
||||
---------
|
||||
|
||||
Adding new nodes
|
||||
================
|
||||
|
||||
#. Provision new instances with the target instance type. Ensure:
|
||||
|
||||
* The same ScyllaDB version as existing nodes
|
||||
* Same network configuration and security groups
|
||||
* Appropriate storage configuration
|
||||
|
||||
#. On each new node, configure ``/etc/scylla/scylla.yaml`` to join the existing
|
||||
cluster:
|
||||
|
||||
* **cluster_name**: Must match the existing cluster name
|
||||
* **seeds**: IP address of an existing node in the cluster (used to discover cluster topology on join)
|
||||
* **endpoint_snitch**: Must match the existing cluster configuration
|
||||
* **listen_address**: IP address of the new node
|
||||
* **rpc_address**: IP address of the new node
|
||||
|
||||
All other cluster-wide settings (tablets configuration, encryption settings,
|
||||
experimental features, etc.) must match the existing nodes.
|
||||
|
||||
.. caution::
|
||||
|
||||
Make sure that the ScyllaDB version on the new node is identical to the
|
||||
version on the other nodes in the cluster. Running nodes with different
|
||||
versions is not supported.
|
||||
|
||||
#. If using ``GossipingPropertyFileSnitch``, configure
|
||||
``/etc/scylla/cassandra-rackdc.properties`` with the correct datacenter
|
||||
and rack assignment for this node:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
dc = <datacenter-name>
|
||||
rack = <rack-name>
|
||||
prefer_local = true
|
||||
|
||||
.. warning::
|
||||
|
||||
Each node must have the correct rack assignment. Using the same rack for
|
||||
all new nodes breaks rack-aware replication topology.
|
||||
|
||||
#. Start ScyllaDB on the new node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo systemctl start scylla-server
|
||||
|
||||
For Docker deployments:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker exec -it <container-name> supervisorctl start scylla
|
||||
|
||||
#. Monitor the bootstrap process from an existing node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
The new node will appear with ``UJ`` (Up, Joining) status while streaming
|
||||
data from existing nodes. Wait until it transitions to ``UN`` (Up, Normal).
|
||||
|
||||
**Example output during bootstrap:**
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
Datacenter: dc1
|
||||
Status=Up/Down
|
||||
State=Normal/Leaving/Joining/Moving
|
||||
-- Address Load Tokens Owns Host ID Rack
|
||||
UN 192.168.1.10 500 MB 256 33.3% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
|
||||
UN 192.168.1.11 500 MB 256 33.3% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
|
||||
UN 192.168.1.12 500 MB 256 33.3% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
|
||||
UJ 192.168.2.10 250 MB 256 ? a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
|
||||
|
||||
**Example output after bootstrap completes:**
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
Datacenter: dc1
|
||||
Status=Up/Down
|
||||
State=Normal/Leaving/Joining/Moving
|
||||
-- Address Load Tokens Owns Host ID Rack
|
||||
UN 192.168.1.10 400 MB 256 25.0% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
|
||||
UN 192.168.1.11 400 MB 256 25.0% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
|
||||
UN 192.168.1.12 400 MB 256 25.0% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
|
||||
UN 192.168.2.10 400 MB 256 25.0% a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
|
||||
|
||||
#. For tablets-enabled clusters, wait for tablet load balancing to complete.
|
||||
After the node reaches ``UN`` status, verify no streaming is in progress:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool netstats
|
||||
|
||||
Wait until the output shows "Not sending any streams" and there are no active receiving streams.
|
||||
|
||||
#. Repeat steps 1-6 for each new node to be added.
|
||||
|
||||
.. note::
|
||||
|
||||
You can add multiple nodes in parallel if they are in different datacenters.
|
||||
Within a single datacenter, add nodes one at a time for best results.
|
||||
|
||||
|
||||
Updating seed node configuration
|
||||
================================
|
||||
|
||||
If any of your original nodes are configured as seed nodes, you must update
|
||||
the seed configuration before decommissioning them.
|
||||
|
||||
#. Check the current seed configuration on any node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
grep -A 4 "seed_provider" /etc/scylla/scylla.yaml
|
||||
|
||||
#. If the seeds include nodes you plan to decommission, update ``scylla.yaml``
|
||||
on **all new nodes** to use the new node IPs as seeds:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
seed_provider:
|
||||
- class_name: org.apache.cassandra.locator.SimpleSeedProvider
|
||||
parameters:
|
||||
- seeds: "192.168.2.10,192.168.2.11,192.168.2.12"
|
||||
|
||||
.. note::
|
||||
|
||||
Updating seed configuration on the **old nodes** (that will be
|
||||
decommissioned) is optional. Seeds are only used during node startup
|
||||
to discover the cluster. If you don't plan to restart the old nodes
|
||||
before decommissioning them, their seed configuration doesn't matter.
|
||||
However, updating all nodes is recommended for safety in case an old
|
||||
node unexpectedly restarts during the migration.
|
||||
|
||||
#. Restart ScyllaDB on each new node (one at a time) to apply the new seed
|
||||
configuration:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo systemctl restart scylla-server
|
||||
|
||||
Wait for the node to fully start before restarting the next node.
|
||||
|
||||
#. After restarting the new nodes, verify the cluster is healthy:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
nodetool describecluster
|
||||
|
||||
.. warning::
|
||||
|
||||
Complete this seed list update on **all new nodes** before decommissioning
|
||||
any old nodes. This ensures the new nodes can reform the cluster after
|
||||
the old nodes are removed.
|
||||
|
||||
|
||||
Decommissioning old nodes
|
||||
=========================
|
||||
|
||||
After all new nodes are added and healthy, decommission the old nodes one
|
||||
at a time.
|
||||
|
||||
#. Verify all nodes are healthy before starting decommission:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
All nodes should show ``UN`` status.
|
||||
|
||||
#. On the node to be decommissioned, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool decommission
|
||||
|
||||
This command blocks until the decommission is complete. The node will
|
||||
stream its data to the remaining nodes.
|
||||
|
||||
#. Monitor the decommission progress from another node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
The decommissioning node will transition from ``UN`` → ``UL`` (Up, Leaving)
|
||||
→ removed from the cluster.
|
||||
|
||||
You can also monitor streaming progress:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool netstats
|
||||
|
||||
#. After decommission completes, verify the node is no longer in the cluster:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
The decommissioned node should no longer appear in the output.
|
||||
|
||||
#. Run ``nodetool cleanup`` on the remaining nodes to remove data that
|
||||
no longer belongs to them after the topology change:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool cleanup
|
||||
|
||||
.. note::
|
||||
|
||||
``nodetool cleanup`` can be resource-intensive. Run it on one node at a
|
||||
time during low-traffic periods.
|
||||
|
||||
#. Wait for the cluster to stabilize before decommissioning the next node.
|
||||
Ensure no streaming operations are in progress.
|
||||
|
||||
#. Repeat steps 1-6 for each old node to be decommissioned.
|
||||
|
||||
|
||||
Post-migration verification
|
||||
---------------------------
|
||||
|
||||
After all old nodes are decommissioned, verify the migration was successful.
|
||||
|
||||
Verify cluster topology
|
||||
=======================
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
Confirm:
|
||||
|
||||
* All nodes show ``UN`` (Up, Normal) status
|
||||
* Only the new instance type nodes are present
|
||||
* Nodes are balanced across racks
|
||||
|
||||
Verify schema agreement
|
||||
=======================
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool describecluster
|
||||
|
||||
All nodes should report the same schema version.
|
||||
|
||||
Verify data connectivity
|
||||
========================
|
||||
|
||||
Connect to the cluster and run a test query:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh <node-ip> -e "SELECT count(*) FROM system_schema.keyspaces;"
|
||||
|
||||
.. note::
|
||||
|
||||
If ScyllaDB is configured with ``listen_interface``, you must use the
|
||||
node's interface IP address (not localhost) for cqlsh connections.
|
||||
|
||||
Verify ScyllaDB version
|
||||
=======================
|
||||
|
||||
Confirm all nodes are running the same ScyllaDB version:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
scylla --version
|
||||
|
||||
Verify data integrity (optional)
|
||||
================================
|
||||
|
||||
Run data validation on each keyspace to verify sstable integrity:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool scrub --mode=VALIDATE <keyspace_name>
|
||||
|
||||
Rollback
|
||||
--------
|
||||
|
||||
If issues occur during the migration, you can roll back by reversing the
|
||||
procedure.
|
||||
|
||||
During add phase
|
||||
================
|
||||
|
||||
If a new node fails to bootstrap:
|
||||
|
||||
#. Stop ScyllaDB on the new node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo systemctl stop scylla-server
|
||||
|
||||
#. From an existing node, remove the failed node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool removenode <host-id-of-failed-node>
|
||||
|
||||
During decommission phase
|
||||
=========================
|
||||
|
||||
If a decommission operation gets stuck:
|
||||
|
||||
#. If the node is still reachable, try stopping and restarting ScyllaDB
|
||||
#. If the node is unresponsive, from another node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool removenode <host-id>
|
||||
|
||||
See :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
|
||||
for more details.
|
||||
|
||||
Full rollback
|
||||
=============
|
||||
|
||||
To roll back after the migration is complete (all nodes on new instance type),
|
||||
apply the same add-and-replace procedure in reverse:
|
||||
|
||||
#. Add new nodes on the original instance type
|
||||
#. Wait for data streaming to complete
|
||||
#. Decommission the nodes on the new instance type
|
||||
|
||||
|
||||
Troubleshooting
|
||||
---------------
|
||||
|
||||
Node stuck in Joining (UJ) state
|
||||
================================
|
||||
|
||||
If a new node remains in ``UJ`` state for an extended period:
|
||||
|
||||
* Check ScyllaDB logs for streaming errors: ``journalctl -u scylla-server``
|
||||
* Verify network connectivity between nodes
|
||||
* Ensure sufficient disk space on all nodes
|
||||
* Check for any ongoing operations that may be blocking
|
||||
|
||||
Decommission taking too long
|
||||
============================
|
||||
|
||||
Decommission duration depends on data size. If it appears stuck:
|
||||
|
||||
* Check streaming progress: ``nodetool netstats``
|
||||
* Look for errors in ScyllaDB logs
|
||||
* Verify network bandwidth between nodes
|
||||
|
||||
Schema disagreement
|
||||
===================
|
||||
|
||||
If nodes report different schema versions:
|
||||
|
||||
* Wait a few minutes for schema to propagate
|
||||
* If disagreement persists, restart the nodes one by one
|
||||
* Run ``nodetool describecluster`` to verify agreement
|
||||
|
||||
|
||||
Additional resources
|
||||
--------------------
|
||||
|
||||
* :doc:`Adding a New Node Into an Existing ScyllaDB Cluster </operating-scylla/procedures/cluster-management/add-node-to-cluster>`
|
||||
* :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
|
||||
* :doc:`Replace a Running Node in a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/replace-running-node>`
|
||||
* :doc:`Upgrade </upgrade/index>`
|
||||
@@ -26,6 +26,7 @@ Cluster Management Procedures
|
||||
Safely Restart Your Cluster <safe-start>
|
||||
repair-based-node-operation
|
||||
Prevent Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>
|
||||
Cluster Platform Migration <cluster-platform-migration>
|
||||
|
||||
|
||||
.. panel-box::
|
||||
@@ -85,6 +86,8 @@ Cluster Management Procedures
|
||||
|
||||
* :doc:`Preventing Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>`
|
||||
|
||||
* :doc:`Cluster Platform Migration Using Node Cycling </operating-scylla/procedures/cluster-management/cluster-platform-migration>`
|
||||
|
||||
.. panel-box::
|
||||
:title: Topology Changes
|
||||
:id: "getting-started"
|
||||
|
||||
@@ -57,12 +57,11 @@ To enable shared dictionaries:
|
||||
internode_compression_enable_advanced: true
|
||||
rpc_dict_training_when: when_leader
|
||||
|
||||
.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
|
||||
.. note::
|
||||
|
||||
Trained dictionaries contain randomly chosen samples of data transferred between
|
||||
nodes. The data samples are persisted in the Raft log, which is not encrypted.
|
||||
As a result, some data from otherwise encrypted tables might be stored on disk
|
||||
unencrypted.
|
||||
Some dictionary training data may be encrypted using storage-level encryption
|
||||
(if enabled) instead of database-level encryption, meaning protection is
|
||||
applied at the storage layer rather than within the database itself.
|
||||
|
||||
|
||||
Reference
|
||||
|
||||
@@ -27,6 +27,16 @@ This configuration takes the form of a query template which is defined in the sc
|
||||
The value of ``ldap_url_template`` parameter should contain a valid LDAP URL (e.g., as returned by the ldapurl utility from OpenLDAP) representing an LDAP query that returns entries for all the user's roles.
|
||||
Scylla will replace the text ``{USER}`` in the URL with the user's Scylla username before querying LDAP.
|
||||
|
||||
.. note:: Usernames substituted into ``{USER}`` are automatically escaped
|
||||
using RFC 4515 filter escaping and URL percent-encoding, so LDAP filter
|
||||
metacharacters (``*``, ``(``, ``)``, ``\``, NUL) and URL metacharacters
|
||||
(``%``, ``?``, ``#``) in usernames are handled safely.
|
||||
|
||||
``{USER}`` must appear only in the **filter** component of the LDAP URL
|
||||
(the part after the third ``?``). Templates that place ``{USER}`` in the
|
||||
host, base DN, attributes, or extensions are rejected at startup, because
|
||||
filter escaping is not the correct encoding for those components.
|
||||
|
||||
Workflow
|
||||
--------
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ Upgrade ScyllaDB
|
||||
.. toctree::
|
||||
|
||||
ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
|
||||
ScyllaDB 2026.x Patch Upgrades <upgrade-guide-from-2026.x.y-to-2026.x.z>
|
||||
ScyllaDB Image <ami-upgrade>
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,268 @@
|
||||
.. |SCYLLA_NAME| replace:: ScyllaDB
|
||||
|
||||
.. |SRC_VERSION| replace:: 2026.x.y
|
||||
.. |NEW_VERSION| replace:: 2026.x.z
|
||||
|
||||
==========================================================================
|
||||
Upgrade - |SCYLLA_NAME| |SRC_VERSION| to |NEW_VERSION| (Patch Upgrades)
|
||||
==========================================================================
|
||||
|
||||
This document describes a step-by-step procedure for upgrading from
|
||||
|SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION| (where "z" is
|
||||
the latest available version), and rolling back to version |SRC_VERSION|
|
||||
if necessary.
|
||||
|
||||
This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL),
|
||||
CentOS, Debian, and Ubuntu.
|
||||
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
|
||||
for information about supported versions.
|
||||
|
||||
It also applies to the ScyllaDB official image on EC2, GCP, or Azure.
|
||||
|
||||
See `Upgrade Policy <https://docs.scylladb.com/stable/versioning/upgrade-policy.html>`_ for the ScyllaDB upgrade policy.
|
||||
|
||||
Upgrade Procedure
|
||||
=================
|
||||
|
||||
.. note::
|
||||
Apply the following procedure **serially** on each node. Do not move to the next
|
||||
node before validating that the node is up and running the new version.
|
||||
|
||||
A ScyllaDB upgrade is a rolling procedure that does **not** require a full cluster
|
||||
shutdown. For each of the nodes in the cluster, you will:
|
||||
|
||||
#. Drain the node and back up the data.
|
||||
#. Back up the configuration file.
|
||||
#. Stop ScyllaDB.
|
||||
#. Download and install new ScyllaDB packages.
|
||||
#. Start ScyllaDB.
|
||||
#. Validate that the upgrade was successful.
|
||||
|
||||
**Before** upgrading, check which version you are running now using
|
||||
``scylla --version``. Note the current version in case you want to roll back
|
||||
the upgrade.
|
||||
|
||||
**During** the rolling upgrade it is highly recommended:
|
||||
|
||||
* Not to use new |NEW_VERSION| features.
|
||||
* Not to run administration functions, like repairs, refresh, rebuild or add
|
||||
or remove nodes. See
|
||||
`sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending
|
||||
ScyllaDB Manager's scheduled or running repairs.
|
||||
* Not to apply schema changes.
|
||||
|
||||
Upgrade Steps
|
||||
=============
|
||||
|
||||
Back up the data
|
||||
------------------------------
|
||||
|
||||
Back up all the data to an external device. We recommend using
|
||||
`ScyllaDB Manager <https://manager.docs.scylladb.com/stable/backup/index.html>`_
|
||||
to create backups.
|
||||
|
||||
Alternatively, you can use the ``nodetool snapshot`` command.
|
||||
For **each** node in the cluster, run the following:
|
||||
|
||||
.. code:: sh
|
||||
|
||||
nodetool drain
|
||||
nodetool snapshot
|
||||
|
||||
Take note of the directory name that nodetool gives you, and copy all
|
||||
the directories with this name under ``/var/lib/scylla`` to a backup device.
|
||||
|
||||
When the upgrade is completed on all nodes, remove the snapshot with the
|
||||
``nodetool clearsnapshot -t <snapshot>`` command to prevent running out of
|
||||
space.
|
||||
|
||||
Back up the configuration file
|
||||
------------------------------
|
||||
|
||||
Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
|
||||
in case you need to roll back the upgrade.
|
||||
|
||||
.. tabs::
|
||||
|
||||
.. group-tab:: Debian/Ubuntu
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
|
||||
sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup
|
||||
|
||||
.. group-tab:: RHEL/CentOS
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
|
||||
sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup
|
||||
|
||||
Gracefully stop the node
|
||||
------------------------
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo service scylla-server stop
|
||||
|
||||
Download and install the new release
|
||||
------------------------------------
|
||||
|
||||
You don’t need to update the ScyllaDB DEB or RPM repo when you upgrade to
|
||||
a patch release.
|
||||
|
||||
.. tabs::
|
||||
|
||||
.. group-tab:: Debian/Ubuntu
|
||||
|
||||
To install a patch version on Debian or Ubuntu, run:
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo apt-get clean all
|
||||
sudo apt-get update
|
||||
sudo apt-get dist-upgrade scylla
|
||||
|
||||
Answer ‘y’ to the first two questions.
|
||||
|
||||
.. group-tab:: RHEL/CentOS
|
||||
|
||||
To install a patch version on RHEL or CentOS, run:
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo yum clean all
|
||||
sudo yum update scylla\* -y
|
||||
|
||||
.. group-tab:: EC2/GCP/Azure Ubuntu Image
|
||||
|
||||
If you're using the ScyllaDB official image (recommended), see
|
||||
the **Debian/Ubuntu** tab for upgrade instructions.
|
||||
|
||||
If you're using your own image and have installed ScyllaDB packages for
|
||||
Ubuntu or Debian, you need to apply an extended upgrade procedure:
|
||||
|
||||
#. Install the new ScyllaDB version with the additional
|
||||
``scylla-machine-image`` package:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo apt-get clean all
|
||||
sudo apt-get update
|
||||
sudo apt-get dist-upgrade scylla
|
||||
sudo apt-get dist-upgrade scylla-machine-image
|
||||
#. Run ``scylla_setup`` without running ``io_setup``.
|
||||
#. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.
|
||||
|
||||
Start the node
|
||||
--------------
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo service scylla-server start
|
||||
|
||||
Validate
|
||||
--------
|
||||
#. Check cluster status with ``nodetool status`` and make sure **all** nodes,
|
||||
including the one you just upgraded, are in UN status.
|
||||
#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"``
|
||||
to check the ScyllaDB version.
|
||||
#. Use ``journalctl _COMM=scylla`` to check there are no new errors in the log.
|
||||
#. Check again after 2 minutes to validate that no new issues are introduced.
|
||||
|
||||
Once you are sure the node upgrade is successful, move to the next node in
|
||||
the cluster.
|
||||
|
||||
Rollback Procedure
|
||||
==================
|
||||
|
||||
The following procedure describes a rollback from ScyllaDB release
|
||||
|NEW_VERSION| to |SRC_VERSION|. Apply this procedure if an upgrade from
|
||||
|SRC_VERSION| to |NEW_VERSION| failed before completing on all nodes.
|
||||
|
||||
* Use this procedure only on nodes you upgraded to |NEW_VERSION|.
|
||||
* Execute the following commands one node at a time, moving to the next node only
|
||||
after the rollback procedure is completed successfully.
|
||||
|
||||
ScyllaDB rollback is a rolling procedure that does **not** require a full
|
||||
cluster shutdown. For each of the nodes to roll back to |SRC_VERSION|, you will:
|
||||
|
||||
#. Drain the node and stop ScyllaDB.
|
||||
#. Downgrade to the previous release.
|
||||
#. Restore the configuration file.
|
||||
#. Restart ScyllaDB.
|
||||
#. Validate the rollback success.
|
||||
|
||||
Rollback Steps
|
||||
==============
|
||||
|
||||
Gracefully shutdown ScyllaDB
|
||||
-----------------------------
|
||||
|
||||
.. code:: sh
|
||||
|
||||
nodetool drain
|
||||
sudo service scylla-server stop
|
||||
|
||||
Downgrade to the previous release
|
||||
----------------------------------
|
||||
|
||||
.. tabs::
|
||||
|
||||
.. group-tab:: Debian/Ubuntu
|
||||
|
||||
To downgrade to |SRC_VERSION| on Debian or Ubuntu, run:
|
||||
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
|
||||
|
||||
Answer ‘y’ to the first two questions.
|
||||
|
||||
.. group-tab:: RHEL/CentOS
|
||||
|
||||
To downgrade to |SRC_VERSION| on RHEL or CentOS, run:
|
||||
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo yum downgrade scylla\*-|SRC_VERSION|-\* -y
|
||||
|
||||
.. group-tab:: EC2/GCP/Azure Ubuntu Image
|
||||
|
||||
If you’re using the ScyllaDB official image (recommended), see
|
||||
the **Debian/Ubuntu** tab for downgrade instructions.
|
||||
|
||||
If you’re using your own image and have installed ScyllaDB packages for
|
||||
Ubuntu or Debian, you need to additionally downgrade
|
||||
the ``scylla-machine-image`` package.
|
||||
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
|
||||
sudo apt-get install scylla-machine-image=|SRC_VERSION|\*
|
||||
|
||||
Answer ‘y’ to the first two questions.
|
||||
|
||||
|
||||
Restore the configuration file
|
||||
------------------------------
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo rm -rf /etc/scylla/scylla.yaml
|
||||
sudo cp -a /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml
|
||||
|
||||
Start the node
|
||||
--------------
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo service scylla-server start
|
||||
|
||||
Validate
|
||||
--------
|
||||
Follow the validation steps in the upgrade procedure above. Once you are sure the node
|
||||
rollback is successful, move to the next node in the cluster.
|
||||
@@ -727,7 +727,12 @@ public:
|
||||
|
||||
// now we need one page more to be able to save one for next lap
|
||||
auto fill_size = align_up(buf1.size(), block_size) + block_size - buf1.size();
|
||||
auto buf2 = co_await _input.read_exactly(fill_size);
|
||||
// If the underlying stream is already at EOF (e.g. buf1 came from
|
||||
// cached _next while the previous read_exactly drained the source),
|
||||
// skip the read_exactly call — it would return empty anyway.
|
||||
auto buf2 = _input.eof()
|
||||
? temporary_buffer<char>()
|
||||
: co_await _input.read_exactly(fill_size);
|
||||
|
||||
temporary_buffer<char> output(buf1.size() + buf2.size());
|
||||
|
||||
|
||||
@@ -1042,7 +1042,7 @@ future<seastar::shared_ptr<encryption_context>> register_extensions(const db::co
|
||||
// Since we are in pre-init phase, this should be safe.
|
||||
co_await smp::invoke_on_all([&opts, &exts] () mutable {
|
||||
auto& f = exts.schema_extensions().at(encryption_attribute);
|
||||
for (auto& s : { db::system_keyspace::paxos(), db::system_keyspace::batchlog(), db::system_keyspace::dicts() }) {
|
||||
for (auto& s : { db::system_keyspace::paxos(), db::system_keyspace::batchlog(), db::system_keyspace::dicts(), db::system_keyspace::raft() }) {
|
||||
exts.add_extension_to_schema(s, encryption_attribute, f(*opts));
|
||||
}
|
||||
});
|
||||
|
||||
@@ -38,7 +38,7 @@ This directory should have 700 permissions and belong to the scylla user)foo")
|
||||
R"foo(System information encryption settings
|
||||
|
||||
If enabled, system tables that may contain sensitive information (system.batchlog,
|
||||
system.paxos), hints files and commit logs are encrypted with the
|
||||
system.paxos, system.raft), hints files and commit logs are encrypted with the
|
||||
encryption settings below.
|
||||
|
||||
When enabling system table encryption on a node with existing data, run
|
||||
|
||||
@@ -437,7 +437,6 @@ void ldap_connection::poll_results() {
|
||||
const auto found = _msgid_to_promise.find(id);
|
||||
if (found == _msgid_to_promise.end()) {
|
||||
mylog.error("poll_results: got valid result for unregistered id {}, dropping it", id);
|
||||
ldap_msgfree(result);
|
||||
} else {
|
||||
found->second.set_value(std::move(result_ptr));
|
||||
_msgid_to_promise.erase(found);
|
||||
|
||||
@@ -16,9 +16,11 @@
|
||||
#include "index/vector_index.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "index/target_parser.hh"
|
||||
#include "types/concrete_types.hh"
|
||||
#include "types/types.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
#include <ranges>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
@@ -102,7 +104,123 @@ const static std::unordered_map<sstring, std::function<void(const sstring&, cons
|
||||
{"oversampling", std::bind_front(validate_factor_option, 1.0f, 100.0f)},
|
||||
// 'rescoring' enables recalculating of similarity scores of candidates retrieved from vector store when quantization is used.
|
||||
{"rescoring", std::bind_front(validate_enumerated_option, boolean_values)},
|
||||
};
|
||||
};
|
||||
|
||||
static constexpr auto TC_TARGET_KEY = "tc";
|
||||
static constexpr auto PK_TARGET_KEY = "pk";
|
||||
static constexpr auto FC_TARGET_KEY = "fc";
|
||||
|
||||
// Convert a serialized targets string (as produced by serialize_targets())
|
||||
// back into the CQL column list used inside CREATE INDEX ... ON table(<here>).
|
||||
//
|
||||
// JSON examples:
|
||||
// {"tc":"v","fc":["f1","f2"]} -> "v, f1, f2"
|
||||
// {"tc":"v","pk":["p1","p2"]} -> "(p1, p2), v"
|
||||
// {"tc":"v","pk":["p1","p2"],"fc":["f1"]} -> "(p1, p2), v, f1"
|
||||
static sstring targets_to_cql(const sstring& targets) {
|
||||
std::optional<rjson::value> json_value = rjson::try_parse(targets);
|
||||
if (!json_value || !json_value->IsObject()) {
|
||||
return cql3::util::maybe_quote(cql3::statements::index_target::column_name_from_target_string(targets));
|
||||
}
|
||||
|
||||
sstring result;
|
||||
|
||||
const rjson::value* pk = rjson::find(*json_value, PK_TARGET_KEY);
|
||||
if (pk && pk->IsArray() && !pk->Empty()) {
|
||||
result += "(";
|
||||
auto pk_cols = std::views::all(pk->GetArray()) | std::views::transform([&](const rjson::value& col) {
|
||||
return cql3::util::maybe_quote(sstring(rjson::to_string_view(col)));
|
||||
}) | std::ranges::to<std::vector<sstring>>();
|
||||
result += boost::algorithm::join(pk_cols, ", ");
|
||||
result += "), ";
|
||||
}
|
||||
|
||||
const rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY);
|
||||
if (tc && tc->IsString()) {
|
||||
result += cql3::util::maybe_quote(sstring(rjson::to_string_view(*tc)));
|
||||
}
|
||||
|
||||
const rjson::value* fc = rjson::find(*json_value, FC_TARGET_KEY);
|
||||
if (fc && fc->IsArray()) {
|
||||
for (rapidjson::SizeType i = 0; i < fc->Size(); ++i) {
|
||||
result += ", ";
|
||||
result += cql3::util::maybe_quote(sstring(rjson::to_string_view((*fc)[i])));
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Serialize vector index targets into a format using:
|
||||
// "tc" for the target (vector) column,
|
||||
// "pk" for partition key columns (local index),
|
||||
// "fc" for filtering columns.
|
||||
// For a simple single-column vector index, returns just the column name.
|
||||
// Examples:
|
||||
// (v) -> "v"
|
||||
// (v, f1, f2) -> {"tc":"v","fc":["f1","f2"]}
|
||||
// ((p1, p2), v) -> {"tc":"v","pk":["p1","p2"]}
|
||||
// ((p1, p2), v, f1, f2) -> {"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}
|
||||
sstring vector_index::serialize_targets(const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) {
|
||||
using cql3::statements::index_target;
|
||||
|
||||
if (targets.size() == 0) {
|
||||
throw exceptions::invalid_request_exception("Vector index must have at least one target column");
|
||||
}
|
||||
|
||||
if (targets.size() == 1) {
|
||||
auto tc = targets[0]->value;
|
||||
if (!std::holds_alternative<index_target::single_column>(tc)) {
|
||||
throw exceptions::invalid_request_exception("Missing vector column target for local vector index");
|
||||
}
|
||||
return index_target::escape_target_column(*std::get<index_target::single_column>(tc));
|
||||
}
|
||||
|
||||
const bool has_pk = std::holds_alternative<index_target::multiple_columns>(targets.front()->value);
|
||||
const size_t tc_idx = has_pk ? 1 : 0;
|
||||
const size_t fc_count = targets.size() - tc_idx - 1;
|
||||
|
||||
if (!std::holds_alternative<index_target::single_column>(targets[tc_idx]->value)) {
|
||||
throw exceptions::invalid_request_exception("Vector index target column must be a single column");
|
||||
}
|
||||
|
||||
rjson::value json_map = rjson::empty_object();
|
||||
rjson::add_with_string_name(json_map, TC_TARGET_KEY, rjson::from_string(std::get<index_target::single_column>(targets[tc_idx]->value)->text()));
|
||||
|
||||
if (has_pk) {
|
||||
rjson::value pk_json = rjson::empty_array();
|
||||
for (const auto& col : std::get<index_target::multiple_columns>(targets.front()->value)) {
|
||||
rjson::push_back(pk_json, rjson::from_string(col->text()));
|
||||
}
|
||||
rjson::add_with_string_name(json_map, PK_TARGET_KEY, std::move(pk_json));
|
||||
}
|
||||
|
||||
if (fc_count > 0) {
|
||||
rjson::value fc_json = rjson::empty_array();
|
||||
for (size_t i = tc_idx + 1; i < targets.size(); ++i) {
|
||||
if (!std::holds_alternative<index_target::single_column>(targets[i]->value)) {
|
||||
throw exceptions::invalid_request_exception("Vector index filtering column must be a single column");
|
||||
}
|
||||
rjson::push_back(fc_json, rjson::from_string(std::get<index_target::single_column>(targets[i]->value)->text()));
|
||||
}
|
||||
rjson::add_with_string_name(json_map, FC_TARGET_KEY, std::move(fc_json));
|
||||
}
|
||||
|
||||
return rjson::print(json_map);
|
||||
}
|
||||
|
||||
sstring vector_index::get_target_column(const sstring& targets) {
|
||||
std::optional<rjson::value> json_value = rjson::try_parse(targets);
|
||||
if (!json_value || !json_value->IsObject()) {
|
||||
return cql3::statements::index_target::column_name_from_target_string(targets);
|
||||
}
|
||||
|
||||
rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY);
|
||||
if (tc && tc->IsString()) {
|
||||
return sstring(rjson::to_string_view(*tc));
|
||||
}
|
||||
return cql3::statements::index_target::column_name_from_target_string(targets);
|
||||
}
|
||||
|
||||
bool vector_index::is_rescoring_enabled(const index_options_map& properties) {
|
||||
auto q = properties.find("quantization");
|
||||
@@ -133,9 +251,8 @@ bool vector_index::view_should_exist() const {
|
||||
|
||||
std::optional<cql3::description> vector_index::describe(const index_metadata& im, const schema& base_schema) const {
|
||||
fragmented_ostringstream os;
|
||||
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON "
|
||||
<< cql3::util::maybe_quote(base_schema.ks_name()) << "." << cql3::util::maybe_quote(base_schema.cf_name())
|
||||
<< "(" << cql3::util::maybe_quote(im.options().at(cql3::statements::index_target::target_option_name)) << ")"
|
||||
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " << cql3::util::maybe_quote(base_schema.ks_name()) << "."
|
||||
<< cql3::util::maybe_quote(base_schema.cf_name()) << "(" << targets_to_cql(im.options().at(cql3::statements::index_target::target_option_name)) << ")"
|
||||
<< " USING 'vector_index'";
|
||||
|
||||
return cql3::description{
|
||||
@@ -320,16 +437,23 @@ bool vector_index::has_vector_index(const schema& s) {
|
||||
|
||||
bool vector_index::has_vector_index_on_column(const schema& s, const sstring& target_name) {
|
||||
for (const auto& index : s.indices()) {
|
||||
auto class_it = index.options().find(db::index::secondary_index::custom_class_option_name);
|
||||
auto target_it = index.options().find(cql3_parser::index_target::target_option_name);
|
||||
if (class_it != index.options().end() && target_it != index.options().end()) {
|
||||
auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second);
|
||||
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && target_it->second == target_name;
|
||||
if (is_vector_index_on_column(index, target_name)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool vector_index::is_vector_index_on_column(const index_metadata& im, const sstring& target_name) {
|
||||
auto class_it = im.options().find(db::index::secondary_index::custom_class_option_name);
|
||||
auto target_it = im.options().find(cql3_parser::index_target::target_option_name);
|
||||
if (class_it != im.options().end() && target_it != im.options().end()) {
|
||||
auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second);
|
||||
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && get_target_column(target_it->second) == target_name;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Returns the schema version of the base table at which the index was created.
|
||||
/// This is used to determine if the index needs to be rebuilt after a schema change.
|
||||
/// The CREATE INDEX and DROP INDEX statements do change the schema version.
|
||||
|
||||
@@ -34,8 +34,12 @@ public:
|
||||
table_schema_version index_version(const schema& schema) override;
|
||||
static bool has_vector_index(const schema& s);
|
||||
static bool has_vector_index_on_column(const schema& s, const sstring& target_name);
|
||||
static bool is_vector_index_on_column(const index_metadata& im, const sstring& target_name);
|
||||
static void check_cdc_options(const schema& schema);
|
||||
|
||||
static sstring serialize_targets(const std::vector<::shared_ptr<cql3::statements::index_target>>& targets);
|
||||
static sstring get_target_column(const sstring& targets);
|
||||
|
||||
static bool is_rescoring_enabled(const index_options_map& properties);
|
||||
static float get_oversampling(const index_options_map& properties);
|
||||
static sstring get_cql_similarity_function_name(const index_options_map& properties);
|
||||
|
||||
@@ -42,7 +42,14 @@ void everywhere_replication_strategy::validate_options(const gms::feature_servic
|
||||
|
||||
sstring everywhere_replication_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
|
||||
const auto replication_factor = erm.get_replication_factor();
|
||||
if (read_replicas.size() > replication_factor) {
|
||||
if (const auto& topo_info = erm.get_token_metadata().get_topology_change_info(); topo_info && topo_info->read_new) {
|
||||
if (read_replicas.size() > replication_factor + 1) {
|
||||
return seastar::format(
|
||||
"everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, "
|
||||
"cannot be higher than replication factor {} + 1 during the 'read from new replicas' stage of a topology change",
|
||||
read_replicas.size(), replication_factor);
|
||||
}
|
||||
} else if (read_replicas.size() > replication_factor) {
|
||||
return seastar::format("everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, cannot be higher than replication factor {}", read_replicas.size(), replication_factor);
|
||||
}
|
||||
return {};
|
||||
|
||||
@@ -261,7 +261,7 @@ static collection_mutation serialize_collection_mutation(
|
||||
|
||||
writev(v.serialize());
|
||||
}
|
||||
return collection_mutation(type, ret);
|
||||
return collection_mutation(type, std::move(ret));
|
||||
}
|
||||
|
||||
collection_mutation collection_mutation_description::serialize(const abstract_type& type) const {
|
||||
|
||||
@@ -103,7 +103,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
|
||||
.entity = stats.entity,
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
|
||||
.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr())
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:52c9772c9ac334650d8b179b591c47769ee38d34fad784b61c682e11c03f2506
|
||||
size 6530196
|
||||
oid sha256:762ffcd253ff9a784fc58e36e1cbe83643e3fe576ac60eb1ce6e4bf8ac2eda8c
|
||||
size 6548000
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:d1a869ebfe4e90d9681499246eb86bb032ae402c350357e19d97b989037a5bd3
|
||||
size 6528308
|
||||
oid sha256:3f788e2b36a4b87328997c60f0903e197bd193f977e02b5fc8888d79c364e21d
|
||||
size 6540076
|
||||
|
||||
@@ -459,7 +459,7 @@ future<> server_impl::wait_for_state_change(seastar::abort_source* as) {
|
||||
}
|
||||
|
||||
try {
|
||||
return as ? _state_change_promise->get_shared_future(*as) : _state_change_promise->get_shared_future();
|
||||
co_await (as ? _state_change_promise->get_shared_future(*as) : _state_change_promise->get_shared_future());
|
||||
} catch (abort_requested_exception&) {
|
||||
throw request_aborted(fmt::format(
|
||||
"Aborted while waiting for state change on server: {}, latest applied entry: {}, current state: {}", _id, _applied_idx, _fsm->current_state()));
|
||||
@@ -1101,6 +1101,18 @@ future<> server_impl::process_fsm_output(index_t& last_stable, fsm_output&& batc
|
||||
// case.
|
||||
co_await _persistence->store_term_and_vote(batch.term_and_vote->first, batch.term_and_vote->second);
|
||||
_stats.store_term_and_vote++;
|
||||
|
||||
// When the term advances, any in-flight snapshot transfers
|
||||
// belong to an outdated term: the progress tracker has been
|
||||
// reset in become_leader() or we are now a follower.
|
||||
// Abort them before we dispatch this batch's messages, which
|
||||
// may start fresh transfers for the new term.
|
||||
//
|
||||
// A vote may also change independently of the term (e.g. a
|
||||
// follower voting for a candidate at the same term), but in
|
||||
// that case there are no in-flight transfers and the abort
|
||||
// is a no-op.
|
||||
abort_snapshot_transfers();
|
||||
}
|
||||
|
||||
if (batch.snp) {
|
||||
@@ -1210,8 +1222,6 @@ future<> server_impl::process_fsm_output(index_t& last_stable, fsm_output&& batc
|
||||
// quickly) stop happening (we're outside the config after all).
|
||||
co_await _apply_entries.push_eventually(removed_from_config{});
|
||||
}
|
||||
// request aborts of snapshot transfers
|
||||
abort_snapshot_transfers();
|
||||
// abort all read barriers
|
||||
for (auto& r : _reads) {
|
||||
r.promise.set_value(not_a_leader{_fsm->current_leader()});
|
||||
|
||||
@@ -252,6 +252,10 @@ public:
|
||||
//
|
||||
// The caller may pass a pointer to an abort_source to make the function abortable.
|
||||
// If it passes nullptr, the function is unabortable.
|
||||
//
|
||||
// Exceptions:
|
||||
// raft::request_aborted
|
||||
// Thrown if abort is requested before the operation finishes.
|
||||
virtual future<> wait_for_state_change(seastar::abort_source* as) = 0;
|
||||
|
||||
// The returned future is resolved when a leader is elected for the current term.
|
||||
@@ -262,6 +266,10 @@ public:
|
||||
//
|
||||
// The caller may pass a pointer to an abort_source to make the function abortable.
|
||||
// If it passes nullptr, the function is unabortable.
|
||||
//
|
||||
// Exceptions:
|
||||
// raft::request_aborted
|
||||
// Thrown if abort is requested before the operation finishes.
|
||||
virtual future<> wait_for_leader(seastar::abort_source* as) = 0;
|
||||
|
||||
// Manually trigger snapshot creation and log truncation.
|
||||
|
||||
@@ -1021,8 +1021,8 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
|
||||
on_internal_error_noexcept(rcslog,
|
||||
format("reader_concurrency_semaphore::signal(): semaphore {} detected resource leak, available {} exceeds initial {}", _name,
|
||||
_resources, _initial_resources));
|
||||
_resources.count = std::max(_resources.count, _initial_resources.count);
|
||||
_resources.memory = std::max(_resources.memory, _initial_resources.memory);
|
||||
_resources.count = std::min(_resources.count, _initial_resources.count);
|
||||
_resources.memory = std::min(_resources.memory, _initial_resources.memory);
|
||||
}
|
||||
maybe_wake_execution_loop();
|
||||
}
|
||||
|
||||
@@ -432,7 +432,9 @@ public:
|
||||
// refresh_mutation_source must be called when there are changes to data source
|
||||
// structures but logical state of data is not changed (e.g. when state for a
|
||||
// new tablet replica is allocated).
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
|
||||
virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
|
||||
virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;
|
||||
@@ -442,7 +444,7 @@ public:
|
||||
virtual storage_group& storage_group_for_token(dht::token) const = 0;
|
||||
virtual utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const = 0;
|
||||
|
||||
virtual locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const = 0;
|
||||
virtual locator::combined_load_stats table_load_stats() const = 0;
|
||||
virtual bool all_storage_groups_split() = 0;
|
||||
virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
|
||||
virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
|
||||
|
||||
@@ -1697,7 +1697,7 @@ static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit(
|
||||
if (!range.is_singular()) {
|
||||
continue;
|
||||
}
|
||||
auto token = dht::token::to_int64(ranges.front().start()->value().token());
|
||||
auto token = dht::token::to_int64(range.start()->value().token());
|
||||
if (limiter.account_operation(read_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) {
|
||||
// Don't return immediately - account all ranges first
|
||||
ret = can_proceed::no;
|
||||
|
||||
@@ -1129,9 +1129,7 @@ public:
|
||||
return _stats;
|
||||
}
|
||||
|
||||
// The tablet filter is used to not double account migrating tablets, so it's important that
|
||||
// only one of pending or leaving replica is accounted based on current migration stage.
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const;
|
||||
locator::combined_load_stats table_load_stats() const;
|
||||
|
||||
const db::view::stats& get_view_stats() const {
|
||||
return _view_stats;
|
||||
|
||||
278
replica/table.cc
278
replica/table.cc
@@ -711,7 +711,9 @@ public:
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override {
|
||||
return get_compaction_group();
|
||||
@@ -734,7 +736,7 @@ public:
|
||||
return *_single_sg;
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)>) const override {
|
||||
locator::combined_load_stats table_load_stats() const override {
|
||||
return locator::combined_load_stats{
|
||||
.table_ls = locator::table_load_stats{
|
||||
.size_in_bytes = _single_sg->live_disk_space_used(),
|
||||
@@ -757,6 +759,11 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// Pairs a compaction reenabler with an effective replication map pointer.
// NOTE(review): presumably erm_guard keeps the pre-merge replication map alive for as long as
// the compaction guard is held during a background tablet merge — confirm against the merge fiber.
struct background_merge_guard {
|
||||
// Reenabler for compaction that was stopped and disabled on a compaction group view.
compaction::compaction_reenabler compaction_guard;
|
||||
// Replication map pointer stored alongside the reenabler.
locator::effective_replication_map_ptr erm_guard;
|
||||
};
|
||||
|
||||
class tablet_storage_group_manager final : public storage_group_manager {
|
||||
replica::table& _t;
|
||||
locator::host_id _my_host_id;
|
||||
@@ -777,7 +784,7 @@ class tablet_storage_group_manager final : public storage_group_manager {
|
||||
utils::phased_barrier _merge_fiber_barrier;
|
||||
std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
|
||||
// Holds compaction reenabler which disables compaction temporarily during tablet merge
|
||||
std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
|
||||
std::vector<background_merge_guard> _compaction_reenablers_for_merging;
|
||||
private:
|
||||
const schema_ptr& schema() const {
|
||||
return _t.schema();
|
||||
@@ -801,7 +808,8 @@ private:
|
||||
// Called when coordinator executes tablet merge. Tablet ids X and X+1 are merged into
|
||||
// the new tablet id (X >> 1). In practice, that means storage groups for X and X+1
|
||||
// are merged into a new storage group with id (X >> 1).
|
||||
void handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
void handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
|
||||
// When merge completes, compaction groups of sibling tablets are added to same storage
|
||||
// group, but they're not merged yet into one, since the merge completion handler happens
|
||||
@@ -895,7 +903,9 @@ public:
|
||||
std::exchange(_stop_fut, make_ready_future())).discard_result();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override;
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override;
|
||||
utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const override;
|
||||
@@ -909,7 +919,7 @@ public:
|
||||
return storage_group_for_id(storage_group_of(token).first);
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const override;
|
||||
locator::combined_load_stats table_load_stats() const override;
|
||||
bool all_storage_groups_split() override;
|
||||
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
|
||||
future<> maybe_split_compaction_group_of(size_t idx) override;
|
||||
@@ -1006,6 +1016,11 @@ bool storage_group::set_split_mode() {
|
||||
return false;
|
||||
}
|
||||
if (!splitting_mode()) {
|
||||
// Don't create new compaction groups if the main cg has compaction disabled
|
||||
if (_main_cg->compaction_disabled()) {
|
||||
tlogger.debug("storage_group::set_split_mode: split ready groups not created due to compaction disabled on the main group");
|
||||
return false;
|
||||
}
|
||||
auto create_cg = [this] () -> compaction_group_ptr {
|
||||
// TODO: use the actual sub-ranges instead, to help incremental selection on the read path.
|
||||
return compaction_group::make_empty_group(*_main_cg);
|
||||
@@ -1443,6 +1458,7 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
sstables::offstrategy offstrategy) {
|
||||
std::vector<sstables::shared_sstable> ret, ssts;
|
||||
std::exception_ptr ex;
|
||||
log_level failure_log_level = log_level::error;
|
||||
try {
|
||||
bool trigger_compaction = offstrategy == sstables::offstrategy::no;
|
||||
auto& cg = compaction_group_for_sstable(new_sst);
|
||||
@@ -1464,6 +1480,9 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
|
||||
sst = nullptr;
|
||||
}
|
||||
} catch (compaction::compaction_stopped_exception&) {
|
||||
failure_log_level = log_level::warn;
|
||||
ex = std::current_exception();
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
@@ -1471,13 +1490,13 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
if (ex) {
|
||||
// on failed split, input sstable is unlinked here.
|
||||
if (new_sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
co_await new_sst->unlink();
|
||||
}
|
||||
// on failure after successful split, sstables not attached yet will be unlinked
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
@@ -1491,6 +1510,7 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
|
||||
std::function<future<>(sstables::shared_sstable)> on_add) {
|
||||
std::exception_ptr ex;
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
log_level failure_log_level = log_level::error;
|
||||
|
||||
// We rely on add_new_sstable_and_update_cache() to unlink the sstable fed into it,
|
||||
// so the exception handling below will only have to unlink sstables not processed yet.
|
||||
@@ -1500,14 +1520,17 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
|
||||
std::ranges::move(ssts, std::back_inserter(ret));
|
||||
|
||||
}
|
||||
} catch (compaction::compaction_stopped_exception&) {
|
||||
failure_log_level = log_level::warn;
|
||||
ex = std::current_exception();
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (ex) {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
@@ -1743,7 +1766,9 @@ table::seal_active_memtable(compaction_group& cg, flush_permit&& flush_permit) n
|
||||
utils::get_local_injector().inject("table_seal_active_memtable_try_flush", []() {
|
||||
throw std::system_error(ENOSPC, std::system_category(), "Injected error");
|
||||
});
|
||||
co_return co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
|
||||
co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
|
||||
// signal a memtable was sealed
|
||||
utils::get_local_injector().receive_message("table_seal_post_flush_waiters");
|
||||
});
|
||||
|
||||
undo_stats.reset();
|
||||
@@ -2933,17 +2958,108 @@ void table::on_flush_timer() {
|
||||
});
|
||||
}
|
||||
|
||||
locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
|
||||
// The following functions return true if we should return the tablet size of a tablet in
|
||||
// migration depending on its transition stage and whether it is a leaving or pending replica
|
||||
// Returns true if the leaving replica of a tablet in transition should still report the tablet's
// size at the given stage: every listed stage except cleanup and end_migration.
// NOTE(review): there is no return after the switch, so correctness relies on every
// tablet_transition_stage enumerator being listed here — confirm whenever a stage is added.
bool has_size_on_leaving (locator::tablet_transition_stage stage) {
|
||||
switch (stage) {
|
||||
case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::streaming: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::use_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::rebuild_repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_repair:
|
||||
return true;
|
||||
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_migration:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true if the pending replica of a tablet in transition should report the tablet's size
// at the given stage: write_both_read_new, use_new, cleanup, end_migration, repair and end_repair.
// All other listed stages return false.
// NOTE(review): there is no return after the switch, so correctness relies on every
// tablet_transition_stage enumerator being listed here — confirm whenever a stage is added.
bool has_size_on_pending (locator::tablet_transition_stage stage) {
|
||||
switch (stage) {
|
||||
case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::streaming: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::rebuild_repair:
|
||||
return false;
|
||||
case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::use_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_repair:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
locator::combined_load_stats tablet_storage_group_manager::table_load_stats() const {
|
||||
locator::table_load_stats table_stats;
|
||||
table_stats.split_ready_seq_number = _split_ready_seq_number;
|
||||
|
||||
locator::tablet_load_stats tablet_stats;
|
||||
|
||||
for_each_storage_group([&] (size_t id, storage_group& sg) {
|
||||
locator::global_tablet_id gid { _t.schema()->id(), locator::tablet_id(id) };
|
||||
if (tablet_filter(*_tablet_map, gid)) {
|
||||
const uint64_t tablet_size = sg.live_disk_space_used();
|
||||
auto tid = locator::tablet_id(id);
|
||||
locator::global_tablet_id gid { _t.schema()->id(), tid };
|
||||
locator::tablet_replica me { _my_host_id, this_shard_id() };
|
||||
const uint64_t tablet_size = sg.live_disk_space_used();
|
||||
|
||||
auto transition = _tablet_map->get_tablet_transition_info(tid);
|
||||
auto& info = _tablet_map->get_tablet_info(tid);
|
||||
bool is_pending = transition && transition->pending_replica == me;
|
||||
bool is_leaving = transition && locator::get_leaving_replica(info, *transition) == me;
|
||||
|
||||
// It's important to tackle the anomaly in reported size, since both leaving and
|
||||
// pending replicas could otherwise be accounted during tablet migration.
|
||||
// If transition hasn't reached write_both_read_new stage, then leaving replicas are accounted.
|
||||
// Otherwise, pending replicas are accounted.
|
||||
// This helps to reduce the discrepancy window.
|
||||
auto table_size_filter = [&] () {
|
||||
// if tablet is not in transit, it's filtered in.
|
||||
if (!transition) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto s = transition->reads; // read selector
|
||||
|
||||
return (!is_pending && !is_leaving)
|
||||
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|
||||
|| (is_pending && s == locator::read_replica_set_selector::next);
|
||||
};
|
||||
|
||||
// When a tablet is in migration, we want to send its size during any migration stage when
|
||||
// we still know the tablet's size. This way the balancer will have better information about
|
||||
// tablet sizes, and we reduce the chance that the node will be ignored during balancing
|
||||
// due to missing tablet size. On the leaving replica we include tablets until the use_new
|
||||
// stage (inclusive), and on the pending we include tablets after the streaming stage.
|
||||
// There is an overlap in tablet sizes (we report sizes on both the leaving and pending
|
||||
// replicas for some stages), but that should not be a problem.
|
||||
auto tablet_size_filter = [&] () {
|
||||
// if tablet is not in transit, it's filtered in.
|
||||
if (!transition) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (is_leaving) {
|
||||
return has_size_on_leaving(transition->stage);
|
||||
} else if (is_pending) {
|
||||
return has_size_on_pending(transition->stage);
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
if (table_size_filter()) {
|
||||
table_stats.size_in_bytes += tablet_size;
|
||||
}
|
||||
|
||||
if (tablet_size_filter()) {
|
||||
const dht::token_range trange = _tablet_map->get_token_range(gid.tablet);
|
||||
// Make sure the token range is in the form (a, b]
|
||||
SCYLLA_ASSERT(!trange.start()->is_inclusive() && trange.end()->is_inclusive());
|
||||
@@ -2956,8 +3072,8 @@ locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std:
|
||||
};
|
||||
}
|
||||
|
||||
locator::combined_load_stats table::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
|
||||
return _sg_manager->table_load_stats(std::move(tablet_filter));
|
||||
locator::combined_load_stats table::table_load_stats() const {
|
||||
return _sg_manager->table_load_stats();
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::handle_tablet_split_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
|
||||
@@ -3069,7 +3185,9 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
|
||||
}
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap,
|
||||
const locator::tablet_map& new_tmap) {
|
||||
auto table_id = schema()->id();
|
||||
size_t old_tablet_count = old_tmap.tablet_count();
|
||||
size_t new_tablet_count = new_tmap.tablet_count();
|
||||
@@ -3093,7 +3211,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
auto new_cg = make_lw_shared<compaction_group>(_t, new_tid, new_range, make_repair_sstable_classifier_func());
|
||||
for (auto& view : new_cg->all_views()) {
|
||||
auto cre = _t.get_compaction_manager().stop_and_disable_compaction_no_wait(*view, "tablet merging");
|
||||
_compaction_reenablers_for_merging.push_back(std::move(cre));
|
||||
_compaction_reenablers_for_merging.push_back(background_merge_guard{std::move(cre), old_erm});
|
||||
}
|
||||
auto new_sg = make_lw_shared<storage_group>(std::move(new_cg));
|
||||
|
||||
@@ -3102,7 +3220,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
|
||||
auto it = _storage_groups.find(group_id);
|
||||
if (it == _storage_groups.end()) {
|
||||
throw std::runtime_error(format("Unable to find sibling tablet of id for table {}", group_id, table_id));
|
||||
throw std::runtime_error(format("Unable to find sibling tablet of id {} for table {}", group_id, table_id));
|
||||
}
|
||||
auto& sg = it->second;
|
||||
sg->for_each_compaction_group([&new_sg, new_range, new_tid, group_id] (const compaction_group_ptr& cg) {
|
||||
@@ -3126,7 +3244,11 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
_merge_completion_event.signal();
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) {
|
||||
void tablet_storage_group_manager::update_effective_replication_map(
|
||||
const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source)
|
||||
{
|
||||
auto* new_tablet_map = &erm.get_token_metadata().tablets().get_tablet_map(schema()->id());
|
||||
auto* old_tablet_map = std::exchange(_tablet_map, new_tablet_map);
|
||||
|
||||
@@ -3142,7 +3264,7 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
|
||||
if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
|
||||
utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
|
||||
}
|
||||
handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
|
||||
handle_tablet_merge_completion(old_erm, *old_tablet_map, *new_tablet_map);
|
||||
}
|
||||
|
||||
// Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
|
||||
@@ -3228,7 +3350,7 @@ void table::update_effective_replication_map(locator::effective_replication_map_
|
||||
};
|
||||
|
||||
if (uses_tablets()) {
|
||||
_sg_manager->update_effective_replication_map(*_erm, refresh_mutation_source);
|
||||
_sg_manager->update_effective_replication_map(old_erm, *_erm, refresh_mutation_source);
|
||||
}
|
||||
if (old_erm) {
|
||||
old_erm->invalidate();
|
||||
@@ -3690,7 +3812,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
|
||||
tlogger.debug("Taking snapshot of {}.{}: name={}", s->ks_name(), s->cf_name(), name);
|
||||
|
||||
std::vector<snapshot_sstable_set> sstable_sets(smp::count);
|
||||
std::vector<int64_t> tablet_counts(smp::count);
|
||||
|
||||
co_await writer->init();
|
||||
co_await smp::invoke_on_all([&] -> future<> {
|
||||
@@ -3698,7 +3819,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
|
||||
auto [tables, permit] = co_await t.snapshot_sstables();
|
||||
auto sstables_metadata = co_await t.get_sstables_manager().take_snapshot(std::move(tables), name);
|
||||
sstable_sets[this_shard_id()] = make_foreign(std::make_unique<utils::chunked_vector<sstables::sstable_snapshot_metadata>>(std::move(sstables_metadata)));
|
||||
tablet_counts[this_shard_id()] = t.calculate_tablet_count();
|
||||
});
|
||||
co_await writer->sync();
|
||||
|
||||
@@ -3712,12 +3832,13 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
|
||||
});
|
||||
tlogger.debug("snapshot {}: seal_snapshot", name);
|
||||
const auto& topology = sharded_db.local().get_token_metadata().get_topology();
|
||||
std::optional<int64_t> min_tablet_count;
|
||||
std::optional<int64_t> tablet_count;
|
||||
if (t.uses_tablets()) {
|
||||
SCYLLA_ASSERT(!tablet_counts.empty());
|
||||
min_tablet_count = *std::ranges::min_element(tablet_counts);
|
||||
auto erm = t.get_effective_replication_map();
|
||||
auto& tm = erm->get_token_metadata().tablets().get_tablet_map(s->id());
|
||||
tablet_count = tm.tablet_count();
|
||||
}
|
||||
co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, min_tablet_count).handle_exception([&] (std::exception_ptr ptr) {
|
||||
co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, tablet_count).handle_exception([&] (std::exception_ptr ptr) {
|
||||
tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr);
|
||||
ex = std::move(ptr);
|
||||
});
|
||||
@@ -3775,6 +3896,7 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
|
||||
auto close_lister = deferred_close(lister);
|
||||
while (auto de = lister.get().get()) {
|
||||
auto snapshot_name = de->name;
|
||||
all_snapshots.emplace(snapshot_name, snapshot_details());
|
||||
@@ -3782,6 +3904,9 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
utils::get_local_injector().inject("get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in get_snapshot_details");
|
||||
}).get();
|
||||
}
|
||||
}
|
||||
return all_snapshots;
|
||||
@@ -3801,53 +3926,66 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
|
||||
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
utils::get_local_injector().inject("per-snapshot-get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in per-snapshot-get_snapshot_details");
|
||||
}).get();
|
||||
|
||||
// The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
co_return false;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await lister.close();
|
||||
if (ex) {
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
|
||||
co_return details;
|
||||
|
||||
@@ -263,8 +263,9 @@ public:
|
||||
// Forwards to _static_props.enable_schema_commitlog(); takes no arguments and returns nothing.
void enable_schema_commitlog() {
|
||||
_static_props.enable_schema_commitlog();
|
||||
}
|
||||
void set_is_group0_table(bool enabled = true) {
|
||||
_static_props.is_group0_table = enabled;
|
||||
void set_is_group0_table() {
|
||||
_static_props.is_group0_table = true;
|
||||
enable_schema_commitlog();
|
||||
}
|
||||
|
||||
class default_names {
|
||||
|
||||
@@ -454,7 +454,7 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
|
||||
auto ps_ptr = qp.get_prepared(cache_key);
|
||||
if (!ps_ptr) {
|
||||
const auto msg_ptr = co_await qp.prepare(req, qs, cql3::internal_dialect());
|
||||
ps_ptr = std::move(msg_ptr->get_prepared());
|
||||
ps_ptr = msg_ptr->get_prepared();
|
||||
if (!ps_ptr) {
|
||||
on_internal_error(paxos_state::logger, "prepared statement is null");
|
||||
}
|
||||
|
||||
@@ -350,6 +350,10 @@ static void ensure_group0_schema(const group0_command& cmd, const replica::datab
|
||||
if (!schema->static_props().is_group0_table) {
|
||||
on_internal_error(slogger, fmt::format("ensure_group0_schema: schema is not group0: {}", schema->cf_name()));
|
||||
}
|
||||
|
||||
if (!schema->static_props().use_schema_commitlog) {
|
||||
on_internal_error(slogger, fmt::format("ensure_group0_schema: group0 table {} does not use schema commitlog", schema->cf_name()));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -559,6 +559,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
|
||||
group0_id = g0_info.group0_id;
|
||||
raft::server_address my_addr{my_id, {}};
|
||||
|
||||
bool starting_server_as_follower = false;
|
||||
if (server == nullptr) {
|
||||
// This is the first time discovery is run. Create and start a Raft server for group 0 on this node.
|
||||
raft::configuration initial_configuration;
|
||||
@@ -586,6 +587,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
|
||||
// trigger an empty snapshot transfer.
|
||||
nontrivial_snapshot = true;
|
||||
} else {
|
||||
starting_server_as_follower = true;
|
||||
co_await handshaker->pre_server_start(g0_info);
|
||||
}
|
||||
|
||||
@@ -614,7 +616,9 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
|
||||
}
|
||||
|
||||
SCYLLA_ASSERT(server);
|
||||
if (server->get_configuration().contains(my_id)) {
|
||||
co_await utils::get_local_injector().inject("join_group0_pause_before_config_check",
|
||||
utils::wait_for_message(std::chrono::minutes{5}));
|
||||
if (!starting_server_as_follower && server->get_configuration().contains(my_id)) {
|
||||
// True if we started a new group or completed a configuration change initiated earlier.
|
||||
group0_log.info("server {} already in group 0 (id {}) as {}", my_id, group0_id,
|
||||
server->get_configuration().can_vote(my_id)? "voter" : "non-voter");
|
||||
|
||||
@@ -987,7 +987,7 @@ future<> storage_service::merge_topology_snapshot(raft_snapshot snp) {
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
|
||||
} else {
|
||||
co_await for_each_split_mutation(std::move(mut), max_size, [&] (mutation m) -> future<> {
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(m));
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -5003,6 +5003,8 @@ future<> storage_service::drain() {
|
||||
}
|
||||
|
||||
future<> storage_service::do_drain() {
|
||||
co_await utils::get_local_injector().inject("storage_service_drain_wait", utils::wait_for_message(60s));
|
||||
|
||||
// Need to stop transport before group0, otherwise RPCs may fail with raft_group_not_found.
|
||||
co_await stop_transport();
|
||||
|
||||
@@ -6056,6 +6058,8 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
|
||||
});
|
||||
};
|
||||
|
||||
co_await utils::get_local_injector().inject("tablet_split_monitor_wait", utils::wait_for_message(1min));
|
||||
|
||||
exponential_backoff_retry split_retry = exponential_backoff_retry(std::chrono::seconds(5), std::chrono::seconds(300));
|
||||
|
||||
while (!_async_gate.is_closed() && !_group0_as.abort_requested()) {
|
||||
@@ -6090,6 +6094,9 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
|
||||
} catch (raft::request_aborted& ex) {
|
||||
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
|
||||
break;
|
||||
} catch (seastar::gate_closed_exception& ex) {
|
||||
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
|
||||
break;
|
||||
} catch (...) {
|
||||
slogger.error("Failed to complete splitting of table {} due to {}, retrying after {} seconds",
|
||||
table, std::current_exception(), split_retry.sleep_time());
|
||||
@@ -6156,6 +6163,57 @@ future<> storage_service::snitch_reconfigured() {
|
||||
}
|
||||
}
|
||||
|
||||
// Runs the node-local part of a topology barrier-and-drain.
//
// A call made on a shard other than 0 is forwarded to shard 0 and simply awaits that result.
// On shard 0 the current topology version is recorded, and then every shard:
//   - compares its shared token metadata version against the recorded one,
//   - awaits _shared_token_metadata.stale_versions_in_use(),
//   - awaits drain_closing_sessions() on the topology session manager.
//
// The returned future fails with std::runtime_error if any shard sees a token metadata
// version different from the recorded topology version, or if the
// "raft_topology_barrier_and_drain_fail_before" error injection fires.
future<> storage_service::local_topology_barrier() {
|
||||
// Only shard 0 does the work below; other shards delegate to it.
if (this_shard_id() != 0) {
|
||||
co_await container().invoke_on(0, [] (storage_service& ss) {
|
||||
return ss.local_topology_barrier();
|
||||
});
|
||||
co_return;
|
||||
}
|
||||
|
||||
// Read the version before the first co_await in this function. The value is copied into
// the per-shard lambda below, so changes made to _topology while this coroutine is
// suspended do not affect what each shard is compared against.
auto version = _topology_state_machine._topology.version;
|
||||
|
||||
// Error-injection hook: the handler throws when it is run.
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
|
||||
// Error-injection hook; presumably pauses here until a message arrives or 5 minutes elapse
// (semantics of utils::wait_for_message are not visible here -- TODO confirm).
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
// Diagnostic only: in the write_both_read_old state, log an error (at most once) if some
// transition node has no entry in _address_map. Nothing is thrown; the barrier proceeds.
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run the check-and-drain step on every shard, passing the recorded version by value.
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
|
||||
version, current_version);
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// in parallel with this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
// Versions match: wait on stale token metadata versions, then drain closing sessions.
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
}
|
||||
|
||||
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, const raft_topology_cmd& cmd) {
|
||||
raft_topology_cmd_result result;
|
||||
rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
|
||||
@@ -6183,12 +6241,6 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
state.last_index = cmd_index;
|
||||
}
|
||||
|
||||
// We capture the topology version right after the checks
|
||||
// above, before any yields. This is crucial since _topology_state_machine._topology
|
||||
// might be altered concurrently while this method is running,
|
||||
// which can cause the fence command to apply an invalid fence version.
|
||||
const auto version = _topology_state_machine._topology.version;
|
||||
|
||||
switch (cmd.cmd) {
|
||||
case raft_topology_cmd::command::barrier: {
|
||||
utils::get_local_injector().inject("raft_topology_barrier_fail",
|
||||
@@ -6227,43 +6279,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
}
|
||||
break;
|
||||
case raft_topology_cmd::command::barrier_and_drain: {
|
||||
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
|
||||
version, current_version);
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// parallel to this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
co_await local_topology_barrier();
|
||||
|
||||
co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
|
||||
auto ks = handler.get("keyspace");
|
||||
@@ -7359,34 +7375,8 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
|
||||
if (!table) {
|
||||
continue;
|
||||
}
|
||||
auto erm = table->get_effective_replication_map();
|
||||
auto& token_metadata = erm->get_token_metadata();
|
||||
auto me = locator::tablet_replica { token_metadata.get_my_id(), this_shard_id() };
|
||||
|
||||
// It's important to tackle the anomaly in reported size, since both leaving and
|
||||
// pending replicas could otherwise be accounted during tablet migration.
|
||||
// If transition hasn't reached cleanup stage, then leaving replicas are accounted.
|
||||
// If transition is past cleanup stage, then pending replicas are accounted.
|
||||
// This helps to reduce the discrepancy window.
|
||||
auto tablet_filter = [&me] (const locator::tablet_map& tmap, locator::global_tablet_id id) {
|
||||
auto transition = tmap.get_tablet_transition_info(id.tablet);
|
||||
auto& info = tmap.get_tablet_info(id.tablet);
|
||||
|
||||
// if tablet is not in transit, it's filtered in.
|
||||
if (!transition) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool is_pending = transition->pending_replica == me;
|
||||
bool is_leaving = locator::get_leaving_replica(info, *transition) == me;
|
||||
auto s = transition->reads; // read selector
|
||||
|
||||
return (!is_pending && !is_leaving)
|
||||
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|
||||
|| (is_pending && s == locator::read_replica_set_selector::next);
|
||||
};
|
||||
|
||||
locator::combined_load_stats combined_ls { table->table_load_stats(tablet_filter) };
|
||||
locator::combined_load_stats combined_ls { table->table_load_stats() };
|
||||
load_stats.tables.emplace(id, std::move(combined_ls.table_ls));
|
||||
tablet_sizes_per_shard[this_shard_id()].size += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);
|
||||
|
||||
|
||||
@@ -944,6 +944,9 @@ public:
|
||||
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
|
||||
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
|
||||
future<> initialize_done_topology_upgrade_state();
|
||||
// Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
|
||||
// In particular, waits for non-latest local erms to die.
|
||||
future<> local_topology_barrier();
|
||||
private:
|
||||
// State machine that is responsible for topology change
|
||||
topology_state_machine& _topology_state_machine;
|
||||
|
||||
@@ -21,7 +21,6 @@ namespace service {
|
||||
|
||||
struct status_helper {
|
||||
tasks::task_status status;
|
||||
utils::chunked_vector<locator::tablet_id> tablets;
|
||||
std::optional<locator::tablet_replica> pending_replica;
|
||||
};
|
||||
|
||||
@@ -141,27 +140,54 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
|
||||
auto task_type = hint.get_task_type();
|
||||
auto tablet_id_opt = tablet_id_provided(task_type) ? std::make_optional(hint.get_tablet_id()) : std::nullopt;
|
||||
|
||||
size_t tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
|
||||
const auto& tablets = _ss.get_token_metadata().tablets();
|
||||
size_t tablet_count = tablets.has_tablet_map(table) ? tablets.get_tablet_map(table).tablet_count() : 0;
|
||||
auto res = co_await get_status_helper(id, std::move(hint));
|
||||
if (!res) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
tasks::tmlogger.info("tablet_virtual_task: wait until tablet operation is finished");
|
||||
co_await _ss._topology_state_machine.event.wait([&] {
|
||||
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
|
||||
if (is_resize_task(task_type)) { // Resize task.
|
||||
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
|
||||
} else if (tablet_id_opt.has_value()) { // Migration task.
|
||||
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
|
||||
} else { // Repair task.
|
||||
return std::all_of(res->tablets.begin(), res->tablets.end(), [&] (const locator::tablet_id& tablet) {
|
||||
return tmap.get_tablet_info(tablet).repair_task_info.tablet_task_id.uuid() != id.uuid();
|
||||
});
|
||||
co_await utils::get_local_injector().inject("tablet_virtual_task_wait", utils::wait_for_message(60s));
|
||||
while (true) {
|
||||
co_await _ss._topology_state_machine.event.wait([&] {
|
||||
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
|
||||
return true;
|
||||
}
|
||||
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
|
||||
if (is_resize_task(task_type)) { // Resize task.
|
||||
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
|
||||
} else if (tablet_id_opt.has_value()) { // Migration task.
|
||||
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
|
||||
} else { // Repair task.
|
||||
return true;
|
||||
}
|
||||
});
|
||||
|
||||
if (!is_repair_task(task_type)) {
|
||||
break;
|
||||
}
|
||||
});
|
||||
|
||||
auto tmptr = _ss.get_token_metadata_ptr();
|
||||
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
|
||||
break;
|
||||
}
|
||||
auto& tmap = tmptr->tablets().get_tablet_map(table);
|
||||
bool repair_still_running = false;
|
||||
co_await tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& info) {
|
||||
repair_still_running = repair_still_running || (info.repair_task_info.is_valid() && info.repair_task_info.tablet_task_id.uuid() == id.uuid());
|
||||
return make_ready_future();
|
||||
});
|
||||
if (!repair_still_running) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
res->status.state = tasks::task_manager::task_state::done; // Failed repair task is retried.
|
||||
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
|
||||
res->status.end_time = db_clock::now();
|
||||
co_return res->status;
|
||||
}
|
||||
if (is_migration_task(task_type)) {
|
||||
auto& replicas = _ss.get_token_metadata().tablets().get_tablet_map(table).get_tablet_info(tablet_id_opt.value()).replicas;
|
||||
auto migration_failed = std::all_of(replicas.begin(), replicas.end(), [&] (const auto& replica) { return res->pending_replica.has_value() && replica != res->pending_replica.value(); });
|
||||
@@ -169,9 +195,9 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
|
||||
} else if (is_resize_task(task_type)) {
|
||||
auto new_tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
|
||||
res->status.state = new_tablet_count == tablet_count ? tasks::task_manager::task_state::suspended : tasks::task_manager::task_state::done;
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
} else {
|
||||
res->status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
res->status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
}
|
||||
res->status.end_time = db_clock::now(); // FIXME: Get precise end time.
|
||||
co_return res->status;
|
||||
@@ -244,7 +270,15 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
status_helper res;
|
||||
auto table = hint.get_table_id();
|
||||
auto task_type = hint.get_task_type();
|
||||
auto schema = _ss._db.local().get_tables_metadata().get_table(table).schema();
|
||||
auto table_ptr = _ss._db.local().get_tables_metadata().get_table_if_exists(table);
|
||||
if (!table_ptr) {
|
||||
co_return tasks::task_status {
|
||||
.task_id = id,
|
||||
.kind = tasks::task_kind::cluster,
|
||||
.is_abortable = co_await is_abortable(std::move(hint)),
|
||||
};
|
||||
}
|
||||
auto schema = table_ptr->schema();
|
||||
res.status = {
|
||||
.task_id = id,
|
||||
.kind = tasks::task_kind::cluster,
|
||||
@@ -257,6 +291,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
auto& tmap = tmptr->tablets().get_tablet_map(table);
|
||||
bool repair_task_finished = false;
|
||||
bool repair_task_pending = false;
|
||||
bool no_tablets_processed = true;
|
||||
if (is_repair_task(task_type)) {
|
||||
auto progress = co_await _ss._repair.local().get_tablet_repair_task_progress(id);
|
||||
if (progress) {
|
||||
@@ -273,37 +308,37 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
auto& task_info = info.repair_task_info;
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.tablets.push_back(tid);
|
||||
no_tablets_processed = false;
|
||||
}
|
||||
return make_ready_future();
|
||||
});
|
||||
res.status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
res.status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
} else if (is_migration_task(task_type)) { // Migration task.
|
||||
auto tablet_id = hint.get_tablet_id();
|
||||
res.pending_replica = tmap.get_tablet_transition_info(tablet_id)->pending_replica;
|
||||
auto& task_info = tmap.get_tablet_info(tablet_id).migration_task_info;
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.tablets.push_back(tablet_id);
|
||||
no_tablets_processed = false;
|
||||
}
|
||||
} else { // Resize task.
|
||||
auto& task_info = tmap.resize_task_info();
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.status.state = tasks::task_manager::task_state::running;
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
co_return res;
|
||||
}
|
||||
}
|
||||
|
||||
if (!res.tablets.empty()) {
|
||||
if (!no_tablets_processed) {
|
||||
res.status.state = sched_nr == 0 ? tasks::task_manager::task_state::created : tasks::task_manager::task_state::running;
|
||||
co_return res;
|
||||
}
|
||||
|
||||
if (repair_task_pending) {
|
||||
// When repair_task_pending is true, no tablet will have matched this task id iff the request was aborted by the user.
|
||||
res.status.state = res.tablets.empty() ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
|
||||
res.status.state = no_tablets_processed ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
|
||||
co_return res;
|
||||
}
|
||||
if (repair_task_finished) {
|
||||
|
||||
@@ -1070,6 +1070,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
|
||||
co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> {
|
||||
auto last_token = new_tablet_map.get_last_token(tablet_id);
|
||||
auto old_tablet_info = old_tablets.get_tablet_info(last_token);
|
||||
auto abandoning_replicas = locator::substract_sets(old_tablet_info.replicas, tablet_info.replicas);
|
||||
auto new_replicas = locator::substract_sets(tablet_info.replicas, old_tablet_info.replicas);
|
||||
if (abandoning_replicas.size() + new_replicas.size() > 1) {
|
||||
throw std::runtime_error(fmt::format("Invalid state of a tablet {} of a table {}.{}. Expected replication factor: {}, but the tablet has replicas only on {}. "
|
||||
"Try again later or use the \"Fixing invalid replica state with RF change\" procedure to fix the problem.", tablet_id, ks_name, table_or_mv->cf_name(),
|
||||
ks.get_replication_strategy().get_replication_factor(*tmptr), old_tablet_info.replicas));
|
||||
}
|
||||
|
||||
updates.emplace_back(co_await make_canonical_mutation_gently(
|
||||
replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id())
|
||||
.set_new_replicas(last_token, tablet_info.replicas)
|
||||
@@ -1079,8 +1088,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
));
|
||||
|
||||
// Calculate abandoning replica and abort view building tasks on them
|
||||
auto old_tablet_info = old_tablets.get_tablet_info(last_token);
|
||||
auto abandoning_replicas = locator::substract_sets(old_tablet_info.replicas, tablet_info.replicas);
|
||||
if (!abandoning_replicas.empty()) {
|
||||
if (abandoning_replicas.size() != 1) {
|
||||
on_internal_error(rtlogger, fmt::format("Keyspace RF abandons {} replicas for table {} and tablet id {}", abandoning_replicas.size(), table_or_mv->id(), tablet_id));
|
||||
@@ -2193,6 +2200,19 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
_tablet_allocator.set_load_stats(reconciled_stats);
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for the background storage group merge to finish before releasing the state machine.
|
||||
// Background merge holds the old erm, so a successful barrier joins with it.
|
||||
// This guarantees that the background merge doesn't run concurrently with the next merge.
|
||||
// Replica-side storage group merge takes compaction locks on the tablet's main compaction group, released
|
||||
// by the background merge. If the next merge starts before the background merge finishes, it can cause a deadlock.
|
||||
// The background merge fiber will try to stop a compaction group which is locked, and the lock is held
|
||||
// by the background merge fiber.
|
||||
tm = nullptr;
|
||||
if (!guard) {
|
||||
guard = co_await start_operation();
|
||||
}
|
||||
co_await global_tablet_token_metadata_barrier(std::move(guard));
|
||||
}
|
||||
|
||||
future<> handle_truncate_table(group0_guard guard) {
|
||||
@@ -2469,7 +2489,13 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
}
|
||||
|
||||
// If there is no other work, evaluate load and start tablet migration if there is imbalance.
|
||||
if (co_await maybe_start_tablet_migration(std::move(guard))) {
|
||||
if (auto guard_opt = co_await maybe_start_tablet_migration(std::move(guard)); !guard_opt) {
|
||||
co_return true;
|
||||
} else {
|
||||
guard = std::move(*guard_opt);
|
||||
}
|
||||
|
||||
if (co_await maybe_retry_failed_rf_change_tablet_rebuilds(std::move(guard))) {
|
||||
co_return true;
|
||||
}
|
||||
co_return false;
|
||||
@@ -3674,11 +3700,14 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
// Returns the guard if no work done. Otherwise, performs a table migration and consumes the guard.
|
||||
future<std::optional<group0_guard>> maybe_migrate_system_tables(group0_guard guard);
|
||||
|
||||
// Returns true if the state machine was transitioned into tablet migration path.
|
||||
future<bool> maybe_start_tablet_migration(group0_guard);
|
||||
// Returns the guard if no work done. Otherwise, transitions the state machine into tablet migration path.
|
||||
future<std::optional<group0_guard>> maybe_start_tablet_migration(group0_guard);
|
||||
|
||||
// Returns true if the state machine was transitioned into tablet resize finalization path.
|
||||
future<bool> maybe_start_tablet_resize_finalization(group0_guard, const table_resize_plan& plan);
|
||||
// Returns the guard if no work done. Otherwise, transitions the state machine into tablet resize finalization path.
|
||||
future<std::optional<group0_guard>> maybe_start_tablet_resize_finalization(group0_guard, const table_resize_plan& plan);
|
||||
|
||||
// Returns true if the state machine was transitioned into tablet migration path.
|
||||
future<bool> maybe_retry_failed_rf_change_tablet_rebuilds(group0_guard guard);
|
||||
|
||||
future<> refresh_tablet_load_stats();
|
||||
future<> start_tablet_load_stats_refresher();
|
||||
@@ -3790,14 +3819,14 @@ future<std::optional<group0_guard>> topology_coordinator::maybe_migrate_system_t
|
||||
co_return std::move(guard);
|
||||
}
|
||||
|
||||
future<bool> topology_coordinator::maybe_start_tablet_migration(group0_guard guard) {
|
||||
future<std::optional<group0_guard>> topology_coordinator::maybe_start_tablet_migration(group0_guard guard) {
|
||||
rtlogger.debug("Evaluating tablet balance");
|
||||
|
||||
auto tm = get_token_metadata_ptr();
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
if (plan.empty()) {
|
||||
rtlogger.debug("Tablet load balancer did not make any plan");
|
||||
co_return false;
|
||||
co_return std::move(guard);
|
||||
}
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
@@ -3817,15 +3846,15 @@ future<bool> topology_coordinator::maybe_start_tablet_migration(group0_guard gua
|
||||
.build());
|
||||
|
||||
co_await update_topology_state(std::move(guard), std::move(updates), "Starting tablet migration");
|
||||
co_return true;
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
future<bool> topology_coordinator::maybe_start_tablet_resize_finalization(group0_guard guard, const table_resize_plan& plan) {
|
||||
future<std::optional<group0_guard>> topology_coordinator::maybe_start_tablet_resize_finalization(group0_guard guard, const table_resize_plan& plan) {
|
||||
if (plan.finalize_resize.empty()) {
|
||||
co_return false;
|
||||
co_return std::move(guard);
|
||||
}
|
||||
if (utils::get_local_injector().enter("tablet_split_finalization_postpone")) {
|
||||
co_return false;
|
||||
co_return std::move(guard);
|
||||
}
|
||||
|
||||
auto resize_finalization_transition_state = [this] {
|
||||
@@ -3841,6 +3870,73 @@ future<bool> topology_coordinator::maybe_start_tablet_resize_finalization(group0
|
||||
.build());
|
||||
|
||||
co_await update_topology_state(std::move(guard), std::move(updates), "Started tablet resize finalization");
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
future<bool> topology_coordinator::maybe_retry_failed_rf_change_tablet_rebuilds(group0_guard guard) {
|
||||
rtlogger.debug("Retrying failed rebuilds");
|
||||
|
||||
if (utils::get_local_injector().enter("maybe_retry_failed_rf_change_tablet_rebuilds_skip")) {
|
||||
rtlogger.debug("Skipping retrying failed rebuilds due to error injection");
|
||||
co_return false;
|
||||
}
|
||||
|
||||
auto tmptr = get_token_metadata_ptr();
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
for (auto& ks_name : _db.get_tablets_keyspaces()) {
|
||||
auto& ks = _db.find_keyspace(ks_name);
|
||||
auto& strategy = ks.get_replication_strategy();
|
||||
auto tables_with_mvs = ks.metadata()->tables();
|
||||
auto views = ks.metadata()->views();
|
||||
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
|
||||
for (const auto& table_or_mv : tables_with_mvs) {
|
||||
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto& tablet_map = tmptr->tablets().get_tablet_map(table_or_mv->id());
|
||||
auto new_tablet_map = co_await strategy.maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, co_await tablet_map.clone_gently());
|
||||
|
||||
replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
|
||||
co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> {
|
||||
auto& replicas = tablet_map.get_tablet_info(tablet_id).replicas;
|
||||
auto it = std::find_if(tablet_info.replicas.begin(), tablet_info.replicas.end(), [&](const auto& replica) {
|
||||
return std::find(replicas.begin(), replicas.end(), replica) == replicas.end();
|
||||
});
|
||||
if (it == tablet_info.replicas.end()) {
|
||||
co_return;
|
||||
}
|
||||
auto new_replicas = replicas;
|
||||
new_replicas.push_back(*it);
|
||||
auto last_token = new_tablet_map.get_last_token(tablet_id);
|
||||
updates.emplace_back(co_await make_canonical_mutation_gently(
|
||||
replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id())
|
||||
.set_new_replicas(last_token, new_replicas)
|
||||
.set_stage(last_token, locator::tablet_transition_stage::allow_write_both_read_old)
|
||||
.set_transition(last_token, locator::choose_rebuild_transition_kind(_feature_service))
|
||||
.build()
|
||||
));
|
||||
});
|
||||
}
|
||||
|
||||
if (!updates.empty()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (updates.empty()) {
|
||||
rtlogger.debug("No failed RF change rebuilds to retry");
|
||||
co_return false;
|
||||
}
|
||||
|
||||
updates.emplace_back(
|
||||
topology_mutation_builder(guard.write_timestamp())
|
||||
.set_transition_state(topology::transition_state::tablet_migration)
|
||||
.set_version(_topo_sm._topology.version + 1)
|
||||
.build());
|
||||
|
||||
sstring reason = "Retry failed tablet rebuilds";
|
||||
co_await update_topology_state(std::move(guard), std::move(updates), reason);
|
||||
co_return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -18,6 +18,13 @@ class service_permit {
|
||||
friend service_permit empty_service_permit();
|
||||
public:
|
||||
size_t count() const { return _permit ? _permit->count() : 0; };
|
||||
// Merge additional semaphore units into this permit.
|
||||
// Used to grow the permit after the actual resource cost is known.
|
||||
void adopt(seastar::semaphore_units<>&& units) {
|
||||
if (_permit) {
|
||||
_permit->adopt(std::move(units));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
inline service_permit make_service_permit(seastar::semaphore_units<>&& permit) {
|
||||
|
||||
@@ -201,95 +201,47 @@ public:
|
||||
virtual future<std::optional<entry_info>> next_entry() = 0;
|
||||
};
|
||||
|
||||
// Allocated inside LSA.
|
||||
class promoted_index {
|
||||
deletion_time _del_time;
|
||||
uint64_t _promoted_index_start;
|
||||
uint32_t _promoted_index_size;
|
||||
uint32_t _num_blocks;
|
||||
public:
|
||||
promoted_index(const schema& s,
|
||||
deletion_time del_time,
|
||||
uint64_t promoted_index_start,
|
||||
uint32_t promoted_index_size,
|
||||
uint32_t num_blocks)
|
||||
: _del_time{del_time}
|
||||
, _promoted_index_start(promoted_index_start)
|
||||
, _promoted_index_size(promoted_index_size)
|
||||
, _num_blocks(num_blocks)
|
||||
{ }
|
||||
|
||||
[[nodiscard]] deletion_time get_deletion_time() const { return _del_time; }
|
||||
[[nodiscard]] uint32_t get_promoted_index_size() const { return _promoted_index_size; }
|
||||
|
||||
// Call under allocating_section.
|
||||
// For sstable versions >= mc the returned cursor will be of type `bsearch_clustered_cursor`.
|
||||
std::unique_ptr<clustered_index_cursor> make_cursor(shared_sstable,
|
||||
reader_permit,
|
||||
tracing::trace_state_ptr,
|
||||
file_input_stream_options,
|
||||
use_caching);
|
||||
// Promoted index information produced by the parser.
|
||||
struct parsed_promoted_index_entry {
|
||||
deletion_time del_time;
|
||||
uint64_t promoted_index_start;
|
||||
uint32_t promoted_index_size;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
using promoted_index = parsed_promoted_index_entry;
|
||||
|
||||
// A partition index element.
|
||||
// Allocated inside LSA.
|
||||
class index_entry {
|
||||
private:
|
||||
managed_bytes _key;
|
||||
mutable std::optional<dht::token> _token;
|
||||
uint64_t _position;
|
||||
managed_ref<promoted_index> _index;
|
||||
struct [[gnu::packed]] index_entry {
|
||||
mutable int64_t raw_token;
|
||||
uint64_t data_file_offset;
|
||||
uint32_t key_offset;
|
||||
|
||||
public:
|
||||
|
||||
key_view get_key() const {
|
||||
return key_view{_key};
|
||||
}
|
||||
|
||||
// May allocate so must be called under allocating_section.
|
||||
decorated_key_view get_decorated_key(const schema& s) const {
|
||||
if (!_token) {
|
||||
_token.emplace(s.get_partitioner().get_token(get_key()));
|
||||
}
|
||||
return decorated_key_view(*_token, get_key());
|
||||
}
|
||||
|
||||
uint64_t position() const { return _position; };
|
||||
|
||||
std::optional<deletion_time> get_deletion_time() const {
|
||||
if (_index) {
|
||||
return _index->get_deletion_time();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
index_entry(managed_bytes&& key, uint64_t position, managed_ref<promoted_index>&& index)
|
||||
: _key(std::move(key))
|
||||
, _position(position)
|
||||
, _index(std::move(index))
|
||||
{}
|
||||
|
||||
index_entry(index_entry&&) = default;
|
||||
index_entry& operator=(index_entry&&) = default;
|
||||
|
||||
// Can be nullptr
|
||||
const managed_ref<promoted_index>& get_promoted_index() const { return _index; }
|
||||
managed_ref<promoted_index>& get_promoted_index() { return _index; }
|
||||
uint32_t get_promoted_index_size() const { return _index ? _index->get_promoted_index_size() : 0; }
|
||||
|
||||
size_t external_memory_usage() const {
|
||||
return _key.external_memory_usage() + _index.external_memory_usage();
|
||||
}
|
||||
uint64_t position() const { return data_file_offset; }
|
||||
dht::raw_token token() const { return dht::raw_token(raw_token); }
|
||||
};
|
||||
|
||||
// Required for optimized LSA migration of storage of managed_vector.
|
||||
static_assert(std::is_trivially_move_assignable_v<index_entry>);
|
||||
static_assert(std::is_trivially_move_assignable_v<parsed_promoted_index_entry>);
|
||||
|
||||
// A partition index page.
|
||||
//
|
||||
// Allocated in the standard allocator space but with an LSA allocator as the current allocator.
|
||||
// So the shallow part is in the standard allocator but all indirect objects are inside LSA.
|
||||
class partition_index_page {
|
||||
public:
|
||||
lsa::chunked_managed_vector<managed_ref<index_entry>> _entries;
|
||||
lsa::chunked_managed_vector<index_entry> _entries;
|
||||
managed_bytes _key_storage;
|
||||
|
||||
// Stores promoted index information of index entries.
|
||||
// The i-th element corresponds to the i-th entry in _entries.
|
||||
// Can be smaller than _entries. If _entries[i] doesn't have a matching element in _promoted_indexes then
|
||||
// that entry doesn't have a promoted index.
|
||||
// Kept separately to avoid paying for storage cost in pages where no entry has a promoted index,
|
||||
// which is typical in workloads with small partitions.
|
||||
lsa::chunked_managed_vector<promoted_index> _promoted_indexes;
|
||||
public:
|
||||
partition_index_page() = default;
|
||||
partition_index_page(partition_index_page&&) noexcept = default;
|
||||
@@ -298,15 +250,68 @@ public:
|
||||
bool empty() const { return _entries.empty(); }
|
||||
size_t size() const { return _entries.size(); }
|
||||
|
||||
stop_iteration clear_gently() {
|
||||
// Vectors have trivial storage, so are fast to destroy.
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
|
||||
void clear_one_entry() {
|
||||
_entries.pop_back();
|
||||
}
|
||||
|
||||
bool has_promoted_index(size_t i) const {
|
||||
return i < _promoted_indexes.size() && _promoted_indexes[i].promoted_index_size > 0;
|
||||
}
|
||||
|
||||
/// Get promoted index for the i-th entry.
|
||||
/// Call only when has_promoted_index(i) is true.
|
||||
const promoted_index& get_promoted_index(size_t i) const {
|
||||
return _promoted_indexes[i];
|
||||
}
|
||||
|
||||
/// Get promoted index for the i-th entry.
|
||||
/// Call only when has_promoted_index(i) is true.
|
||||
promoted_index& get_promoted_index(size_t i) {
|
||||
return _promoted_indexes[i];
|
||||
}
|
||||
|
||||
/// Get promoted index size for the i-th entry.
|
||||
uint32_t get_promoted_index_size(size_t i) const {
|
||||
return has_promoted_index(i) ? get_promoted_index(i).promoted_index_size : 0;
|
||||
}
|
||||
|
||||
/// Get deletion_time for partition represented by the i-th entry.
|
||||
/// Returns disengaged optional if the entry doesn't have a promoted index, so we don't know the deletion_time.
|
||||
/// It has to be read from the data file.
|
||||
std::optional<deletion_time> get_deletion_time(size_t i) const {
|
||||
if (has_promoted_index(i)) {
|
||||
return get_promoted_index(i).del_time;
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
key_view get_key(size_t i) const {
|
||||
auto start = _entries[i].key_offset;
|
||||
auto end = i + 1 < _entries.size() ? _entries[i + 1].key_offset : _key_storage.size();
|
||||
auto v = managed_bytes_view(_key_storage).prefix(end);
|
||||
v.remove_prefix(start);
|
||||
return key_view(v);
|
||||
}
|
||||
|
||||
decorated_key_view get_decorated_key(const schema& s, size_t i) const {
|
||||
auto key = get_key(i);
|
||||
auto t = _entries[i].token();
|
||||
if (!t) {
|
||||
t = dht::raw_token(s.get_partitioner().get_token(key));
|
||||
_entries[i].raw_token = t.value;
|
||||
}
|
||||
return decorated_key_view(dht::token(t), key);
|
||||
}
|
||||
|
||||
size_t external_memory_usage() const {
|
||||
size_t size = _entries.external_memory_usage();
|
||||
for (auto&& e : _entries) {
|
||||
size += sizeof(index_entry) + e->external_memory_usage();
|
||||
}
|
||||
size += _promoted_indexes.external_memory_usage();
|
||||
size += _key_storage.external_memory_usage();
|
||||
return size;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -25,14 +25,6 @@ namespace sstables {
|
||||
extern seastar::logger sstlog;
|
||||
extern thread_local mc::cached_promoted_index::metrics promoted_index_cache_metrics;
|
||||
|
||||
// Promoted index information produced by the parser.
|
||||
struct parsed_promoted_index_entry {
|
||||
deletion_time del_time;
|
||||
uint64_t promoted_index_start;
|
||||
uint32_t promoted_index_size;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
// Partition index entry information produced by the parser.
|
||||
struct parsed_partition_index_entry {
|
||||
temporary_buffer<char> key;
|
||||
@@ -53,9 +45,10 @@ class index_consumer {
|
||||
schema_ptr _s;
|
||||
logalloc::allocating_section _alloc_section;
|
||||
logalloc::region& _region;
|
||||
utils::chunked_vector<parsed_partition_index_entry> _parsed_entries;
|
||||
size_t _max_promoted_index_entry_plus_one = 0; // Highest index +1 in _parsed_entries which has a promoted index.
|
||||
size_t _key_storage_size = 0;
|
||||
public:
|
||||
index_list indexes;
|
||||
|
||||
index_consumer(logalloc::region& r, schema_ptr s)
|
||||
: _s(s)
|
||||
, _alloc_section(abstract_formatter([s] (fmt::format_context& ctx) {
|
||||
@@ -64,36 +57,63 @@ public:
|
||||
, _region(r)
|
||||
{ }
|
||||
|
||||
~index_consumer() {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
indexes._entries.clear_and_release();
|
||||
});
|
||||
void consume_entry(parsed_partition_index_entry&& e) {
|
||||
_key_storage_size += e.key.size();
|
||||
_parsed_entries.emplace_back(std::move(e));
|
||||
if (e.promoted_index) {
|
||||
_max_promoted_index_entry_plus_one = std::max(_max_promoted_index_entry_plus_one, _parsed_entries.size());
|
||||
}
|
||||
}
|
||||
|
||||
void consume_entry(parsed_partition_index_entry&& e) {
|
||||
_alloc_section(_region, [&] {
|
||||
future<index_list> finalize() {
|
||||
index_list result;
|
||||
// In case of exception, need to deallocate under region allocator.
|
||||
auto delete_result = seastar::defer([&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
managed_ref<promoted_index> pi;
|
||||
if (e.promoted_index) {
|
||||
pi = make_managed<promoted_index>(*_s,
|
||||
e.promoted_index->del_time,
|
||||
e.promoted_index->promoted_index_start,
|
||||
e.promoted_index->promoted_index_size,
|
||||
e.promoted_index->num_blocks);
|
||||
}
|
||||
auto key = managed_bytes(reinterpret_cast<const bytes::value_type*>(e.key.get()), e.key.size());
|
||||
indexes._entries.emplace_back(make_managed<index_entry>(std::move(key), e.data_file_offset, std::move(pi)));
|
||||
result._entries = {};
|
||||
result._promoted_indexes = {};
|
||||
result._key_storage = {};
|
||||
});
|
||||
});
|
||||
auto i = _parsed_entries.begin();
|
||||
size_t key_offset = 0;
|
||||
while (i != _parsed_entries.end()) {
|
||||
_alloc_section(_region, [&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
result._entries.reserve(_parsed_entries.size());
|
||||
result._promoted_indexes.resize(_max_promoted_index_entry_plus_one);
|
||||
if (result._key_storage.empty()) {
|
||||
result._key_storage = managed_bytes(managed_bytes::initialized_later(), _key_storage_size);
|
||||
}
|
||||
managed_bytes_mutable_view key_out(result._key_storage);
|
||||
key_out.remove_prefix(key_offset);
|
||||
while (i != _parsed_entries.end()) {
|
||||
parsed_partition_index_entry& e = *i;
|
||||
if (e.promoted_index) {
|
||||
result._promoted_indexes[result._entries.size()] = *e.promoted_index;
|
||||
}
|
||||
write_fragmented(key_out, std::string_view(e.key.begin(), e.key.size()));
|
||||
result._entries.emplace_back(index_entry{dht::raw_token().value, e.data_file_offset, key_offset});
|
||||
++i;
|
||||
key_offset += e.key.size();
|
||||
if (need_preempt()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
delete_result.cancel();
|
||||
_parsed_entries.clear();
|
||||
co_return std::move(result);
|
||||
}
|
||||
|
||||
void prepare(uint64_t size) {
|
||||
_alloc_section = logalloc::allocating_section();
|
||||
_alloc_section(_region, [&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
indexes._entries.reserve(size);
|
||||
});
|
||||
});
|
||||
_max_promoted_index_entry_plus_one = 0;
|
||||
_key_storage_size = 0;
|
||||
_parsed_entries.clear();
|
||||
_parsed_entries.reserve(size);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -198,10 +218,14 @@ public:
|
||||
|
||||
switch (_state) {
|
||||
// START comes first, to make the handling of the 0-quantity case simpler
|
||||
state_START:
|
||||
case state::START:
|
||||
sstlog.trace("{}: pos {} state {} - data.size()={}", fmt::ptr(this), current_pos(), state::START, data.size());
|
||||
_state = state::KEY_SIZE;
|
||||
break;
|
||||
if (data.size() == 0) {
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case state::KEY_SIZE:
|
||||
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::KEY_SIZE);
|
||||
_entry_offset = current_pos();
|
||||
@@ -227,7 +251,16 @@ public:
|
||||
case state::PROMOTED_SIZE:
|
||||
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::PROMOTED_SIZE);
|
||||
_position = this->_u64;
|
||||
if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
|
||||
if (is_mc_format() && data.size() && *data.begin() == 0) { // promoted_index_size == 0
|
||||
data.trim_front(1);
|
||||
_consumer.consume_entry(parsed_partition_index_entry{
|
||||
.key = std::move(_key),
|
||||
.data_file_offset = _position,
|
||||
.index_offset = _entry_offset,
|
||||
.promoted_index = std::nullopt
|
||||
});
|
||||
goto state_START;
|
||||
} else if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
|
||||
_state = state::PARTITION_HEADER_LENGTH_1;
|
||||
break;
|
||||
}
|
||||
@@ -339,33 +372,6 @@ inline file make_tracked_index_file(sstable& sst, reader_permit permit, tracing:
|
||||
return tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", sst.index_filename()));
|
||||
}
|
||||
|
||||
inline
|
||||
std::unique_ptr<clustered_index_cursor> promoted_index::make_cursor(shared_sstable sst,
|
||||
reader_permit permit,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
file_input_stream_options options,
|
||||
use_caching caching)
|
||||
{
|
||||
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
|
||||
seastar::shared_ptr<cached_file> cached_file_ptr = caching
|
||||
? sst->_cached_index_file
|
||||
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
|
||||
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
|
||||
sst->manager().get_cache_tracker().get_lru(),
|
||||
sst->manager().get_cache_tracker().region(),
|
||||
sst->_index_file_size);
|
||||
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
|
||||
_promoted_index_start, _promoted_index_size,
|
||||
promoted_index_cache_metrics, permit,
|
||||
sst->get_column_translation(), cached_file_ptr, _num_blocks, trace_state, sst->features());
|
||||
}
|
||||
|
||||
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
|
||||
auto promoted_index_stream = make_file_input_stream(std::move(file), _promoted_index_start, _promoted_index_size,options);
|
||||
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
|
||||
std::move(promoted_index_stream), _promoted_index_size, _num_blocks, std::nullopt);
|
||||
}
|
||||
|
||||
// Less-comparator for lookups in the partition index.
|
||||
class index_comparator {
|
||||
dht::ring_position_comparator_for_sstables _tri_cmp;
|
||||
@@ -376,27 +382,17 @@ public:
|
||||
return _tri_cmp(e.get_decorated_key(), rp) < 0;
|
||||
}
|
||||
|
||||
bool operator()(const index_entry& e, dht::ring_position_view rp) const {
|
||||
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) < 0;
|
||||
}
|
||||
|
||||
bool operator()(const managed_ref<index_entry>& e, dht::ring_position_view rp) const {
|
||||
return operator()(*e, rp);
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const managed_ref<index_entry>& e) const {
|
||||
return operator()(rp, *e);
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
|
||||
return _tri_cmp(e.get_decorated_key(), rp) > 0;
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const index_entry& e) const {
|
||||
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) > 0;
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
std::strong_ordering index_entry_tri_cmp(const schema& s, partition_index_page& page, size_t idx, dht::ring_position_view rp) {
|
||||
dht::ring_position_comparator_for_sstables tri_cmp(s);
|
||||
return tri_cmp(page.get_decorated_key(s, idx), rp);
|
||||
}
|
||||
|
||||
// Contains information about index_reader position in the index file
|
||||
struct index_bound {
|
||||
index_bound() = default;
|
||||
@@ -537,7 +533,7 @@ private:
|
||||
if (ex) {
|
||||
return make_exception_future<index_list>(std::move(ex));
|
||||
}
|
||||
return make_ready_future<index_list>(std::move(bound.consumer->indexes));
|
||||
return bound.consumer->finalize();
|
||||
});
|
||||
});
|
||||
};
|
||||
@@ -550,17 +546,18 @@ private:
|
||||
if (bound.current_list->empty()) {
|
||||
throw malformed_sstable_exception(format("missing index entry for summary index {} (bound {})", summary_idx, fmt::ptr(&bound)), _sstable->index_filename());
|
||||
}
|
||||
bound.data_file_position = bound.current_list->_entries[0]->position();
|
||||
bound.data_file_position = bound.current_list->_entries[0].position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
|
||||
if (sstlog.is_enabled(seastar::log_level::trace)) {
|
||||
sstlog.trace("index {} bound {}: page:", fmt::ptr(this), fmt::ptr(&bound));
|
||||
logalloc::reclaim_lock rl(_region);
|
||||
for (auto&& e : bound.current_list->_entries) {
|
||||
for (size_t i = 0; i < bound.current_list->_entries.size(); ++i) {
|
||||
auto& e = bound.current_list->_entries[i];
|
||||
auto dk = dht::decorate_key(*_sstable->_schema,
|
||||
e->get_key().to_partition_key(*_sstable->_schema));
|
||||
sstlog.trace(" {} -> {}", dk, e->position());
|
||||
bound.current_list->get_key(i).to_partition_key(*_sstable->_schema));
|
||||
sstlog.trace(" {} -> {}", dk, e.position());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -604,7 +601,13 @@ private:
|
||||
// Valid if partition_data_ready(bound)
|
||||
index_entry& current_partition_entry(index_bound& bound) {
|
||||
parse_assert(bool(bound.current_list), _sstable->index_filename());
|
||||
return *bound.current_list->_entries[bound.current_index_idx];
|
||||
return bound.current_list->_entries[bound.current_index_idx];
|
||||
}
|
||||
|
||||
// Valid if partition_data_ready(bound)
|
||||
partition_index_page& current_page(index_bound& bound) {
|
||||
parse_assert(bool(bound.current_list), _sstable->index_filename());
|
||||
return *bound.current_list;
|
||||
}
|
||||
|
||||
future<> advance_to_next_partition(index_bound& bound) {
|
||||
@@ -617,7 +620,7 @@ private:
|
||||
if (bound.current_index_idx + 1 < bound.current_list->size()) {
|
||||
++bound.current_index_idx;
|
||||
bound.current_pi_idx = 0;
|
||||
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx]->position();
|
||||
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx].position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
return reset_clustered_cursor(bound);
|
||||
@@ -680,9 +683,13 @@ private:
|
||||
return advance_to_page(bound, summary_idx).then([this, &bound, pos, summary_idx] {
|
||||
sstlog.trace("index {}: old page index = {}", fmt::ptr(this), bound.current_index_idx);
|
||||
auto i = _alloc_section(_region, [&] {
|
||||
auto& entries = bound.current_list->_entries;
|
||||
return std::lower_bound(std::begin(entries) + bound.current_index_idx, std::end(entries), pos,
|
||||
index_comparator(*_sstable->_schema));
|
||||
auto& page = *bound.current_list;
|
||||
auto& s = *_sstable->_schema;
|
||||
auto r = std::views::iota(bound.current_index_idx, page._entries.size());
|
||||
auto it = std::ranges::partition_point(r, [&] (int idx) {
|
||||
return index_entry_tri_cmp(s, page, idx, pos) < 0;
|
||||
});
|
||||
return page._entries.begin() + bound.current_index_idx + std::ranges::distance(r.begin(), it);
|
||||
});
|
||||
// i is valid until next allocation point
|
||||
auto& entries = bound.current_list->_entries;
|
||||
@@ -697,7 +704,7 @@ private:
|
||||
}
|
||||
bound.current_index_idx = std::distance(std::begin(entries), i);
|
||||
bound.current_pi_idx = 0;
|
||||
bound.data_file_position = (*i)->position();
|
||||
bound.data_file_position = (*i).position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
sstlog.trace("index {}: new page index = {}, pos={}", fmt::ptr(this), bound.current_index_idx, bound.data_file_position);
|
||||
@@ -800,6 +807,34 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
std::unique_ptr<clustered_index_cursor> make_cursor(const parsed_promoted_index_entry& pi,
|
||||
shared_sstable sst,
|
||||
reader_permit permit,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
file_input_stream_options options,
|
||||
use_caching caching)
|
||||
{
|
||||
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
|
||||
seastar::shared_ptr<cached_file> cached_file_ptr = caching
|
||||
? sst->_cached_index_file
|
||||
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
|
||||
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
|
||||
sst->manager().get_cache_tracker().get_lru(),
|
||||
sst->manager().get_cache_tracker().region(),
|
||||
sst->_index_file_size);
|
||||
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
|
||||
pi.promoted_index_start, pi.promoted_index_size,
|
||||
promoted_index_cache_metrics, permit,
|
||||
sst->get_column_translation(), cached_file_ptr, pi.num_blocks, trace_state, sst->features());
|
||||
}
|
||||
|
||||
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
|
||||
auto promoted_index_stream = make_file_input_stream(std::move(file), pi.promoted_index_start, pi.promoted_index_size,options);
|
||||
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
|
||||
std::move(promoted_index_stream), pi.promoted_index_size, pi.num_blocks, std::nullopt);
|
||||
}
|
||||
|
||||
// Ensures that partition_data_ready() returns true.
|
||||
// Can be called only when !eof()
|
||||
future<> read_partition_data() override {
|
||||
@@ -835,10 +870,10 @@ public:
|
||||
clustered_index_cursor* current_clustered_cursor(index_bound& bound) {
|
||||
if (!bound.clustered_cursor) {
|
||||
_alloc_section(_region, [&] {
|
||||
index_entry& e = current_partition_entry(bound);
|
||||
promoted_index* pi = e.get_promoted_index().get();
|
||||
if (pi) {
|
||||
bound.clustered_cursor = pi->make_cursor(_sstable, _permit, _trace_state,
|
||||
partition_index_page& page = current_page(bound);
|
||||
if (page.has_promoted_index(bound.current_index_idx)) {
|
||||
promoted_index& pi = page.get_promoted_index(bound.current_index_idx);
|
||||
bound.clustered_cursor = make_cursor(pi, _sstable, _permit, _trace_state,
|
||||
get_file_input_stream_options(), _use_caching);
|
||||
}
|
||||
});
|
||||
@@ -861,15 +896,15 @@ public:
|
||||
// It may be unavailable for old sstables for which this information was not generated.
|
||||
// Can be called only when partition_data_ready().
|
||||
std::optional<sstables::deletion_time> partition_tombstone() override {
|
||||
return current_partition_entry(_lower_bound).get_deletion_time();
|
||||
return current_page(_lower_bound).get_deletion_time(_lower_bound.current_index_idx);
|
||||
}
|
||||
|
||||
// Returns the key for current partition.
|
||||
// Can be called only when partition_data_ready().
|
||||
std::optional<partition_key> get_partition_key() override {
|
||||
return _alloc_section(_region, [this] {
|
||||
index_entry& e = current_partition_entry(_lower_bound);
|
||||
return e.get_key().to_partition_key(*_sstable->_schema);
|
||||
return current_page(_lower_bound).get_key(_lower_bound.current_index_idx)
|
||||
.to_partition_key(*_sstable->_schema);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -883,8 +918,8 @@ public:
|
||||
// Returns the number of promoted index entries for the current partition.
|
||||
// Can be called only when partition_data_ready().
|
||||
uint64_t get_promoted_index_size() {
|
||||
index_entry& e = current_partition_entry(_lower_bound);
|
||||
return e.get_promoted_index_size();
|
||||
partition_index_page& page = current_page(_lower_bound);
|
||||
return page.get_promoted_index_size(_lower_bound.current_index_idx);
|
||||
}
|
||||
|
||||
bool partition_data_ready() const override {
|
||||
@@ -975,9 +1010,9 @@ public:
|
||||
return make_ready_future<bool>(false);
|
||||
}
|
||||
return read_partition_data().then([this, key] {
|
||||
index_comparator cmp(*_sstable->_schema);
|
||||
bool found = _alloc_section(_region, [&] {
|
||||
return cmp(key, current_partition_entry(_lower_bound)) == 0;
|
||||
auto& page = current_page(_lower_bound);
|
||||
return index_entry_tri_cmp(*_sstable->_schema, page, _lower_bound.current_index_idx, key) == 0;
|
||||
});
|
||||
return make_ready_future<bool>(found);
|
||||
});
|
||||
|
||||
@@ -189,10 +189,11 @@ public:
|
||||
{}
|
||||
future<std::optional<directory_entry>> get() override {
|
||||
std::filesystem::path dir(_prefix);
|
||||
do {
|
||||
while (true) {
|
||||
if (_pos == _info.size()) {
|
||||
_info.clear();
|
||||
_info = co_await _client->list_objects(_bucket, _prefix, _paging);
|
||||
_pos = 0;
|
||||
}
|
||||
if (_info.empty()) {
|
||||
break;
|
||||
@@ -203,7 +204,7 @@ public:
|
||||
continue;
|
||||
}
|
||||
co_return ent;
|
||||
} while (false);
|
||||
}
|
||||
|
||||
co_return std::nullopt;
|
||||
}
|
||||
@@ -276,7 +277,7 @@ public:
|
||||
co_await f.close();
|
||||
|
||||
auto names = ranges | std::views::transform([](auto& p) { return p.name; }) | std::ranges::to<std::vector<std::string>>();
|
||||
co_await _client->merge_objects(bucket, object, std::move(names), {}, as);
|
||||
co_await _client->merge_objects(bucket, object, names, {}, as);
|
||||
|
||||
co_await parallel_for_each(names, [this, bucket](auto& name) -> future<> {
|
||||
co_await _client->delete_object(bucket, name);
|
||||
|
||||
@@ -257,14 +257,11 @@ public:
|
||||
while (partial_page || i != _cache.end()) {
|
||||
if (partial_page) {
|
||||
auto preempted = with_allocator(_region.allocator(), [&] {
|
||||
while (!partial_page->empty()) {
|
||||
partial_page->clear_one_entry();
|
||||
if (need_preempt()) {
|
||||
return true;
|
||||
}
|
||||
while (partial_page->clear_gently() != stop_iteration::yes) {
|
||||
return true;
|
||||
}
|
||||
partial_page.reset();
|
||||
return false;
|
||||
return need_preempt();
|
||||
});
|
||||
if (preempted) {
|
||||
auto key = (i != _cache.end()) ? std::optional(i->key()) : std::nullopt;
|
||||
|
||||
@@ -1094,7 +1094,6 @@ public:
|
||||
|
||||
friend class mc::writer;
|
||||
friend class index_reader;
|
||||
friend class promoted_index;
|
||||
friend class sstables_manager;
|
||||
template <typename DataConsumeRowsContext>
|
||||
friend future<std::unique_ptr<DataConsumeRowsContext>>
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "message/messaging_service.hh"
|
||||
#include "service/storage_service.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
|
||||
#include <cfloat>
|
||||
#include <algorithm>
|
||||
@@ -410,6 +411,9 @@ future<> sstable_streamer::stream_sstables(const dht::partition_range& pr, std::
|
||||
size_t nr_sst_current = 0;
|
||||
|
||||
while (!sstables.empty()) {
|
||||
co_await utils::get_local_injector().inject("load_and_stream_before_streaming_batch",
|
||||
utils::wait_for_message(60s));
|
||||
|
||||
const size_t batch_sst_nr = std::min(16uz, sstables.size());
|
||||
auto sst_processed = sstables
|
||||
| std::views::reverse
|
||||
@@ -581,6 +585,16 @@ future<> sstables_loader::load_and_stream(sstring ks_name, sstring cf_name,
|
||||
// throughout its lifetime.
|
||||
auto erm = co_await await_topology_quiesced_and_get_erm(table_id);
|
||||
|
||||
// Obtain a phaser guard to prevent the table from being destroyed
|
||||
// while streaming is in progress. table::stop() calls
|
||||
// _pending_streams_phaser.close() which blocks until all outstanding
|
||||
// stream_in_progress() guards are released, so holding this guard
|
||||
// keeps the table alive for the entire streaming operation.
|
||||
// find_column_family throws no_such_column_family if the table was
|
||||
// already dropped before we got here.
|
||||
auto& tbl = _db.local().find_column_family(table_id);
|
||||
auto stream_guard = tbl.stream_in_progress();
|
||||
|
||||
auto streamer = make_sstable_streamer(_db.local().find_column_family(table_id).uses_tablets(),
|
||||
_messaging, _db.local(), table_id, std::move(erm), std::move(sstables),
|
||||
primary, unlink_sstables(unlink), scope);
|
||||
|
||||
@@ -436,7 +436,10 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
|
||||
stream_options.buffer_size = file_stream_buffer_size;
|
||||
stream_options.read_ahead = file_stream_read_ahead;
|
||||
|
||||
for (auto& info : sources) {
|
||||
for (auto&& source_info : sources) {
|
||||
// Keep stream_blob_info alive only at duration of streaming. Allowing the file descriptor
|
||||
// of the sstable component to be released right after it has been streamed.
|
||||
auto info = std::exchange(source_info, {});
|
||||
auto& filename = info.filename;
|
||||
std::optional<input_stream<char>> fstream;
|
||||
bool fstream_closed = false;
|
||||
@@ -617,6 +620,7 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
|
||||
ops_id, filename, targets, total_size, get_bw(total_size, start_time));
|
||||
}
|
||||
}
|
||||
co_await utils::get_local_injector().inject("tablet_stream_files_end_wait", utils::wait_for_message(std::chrono::seconds(60)));
|
||||
if (error) {
|
||||
blogger.warn("fstream[{}] Master failed sending files_nr={} files={} targets={} send_size={} bw={} error={}",
|
||||
ops_id, sources.size(), sources, targets, ops_total_size, get_bw(ops_total_size, ops_start_time), error);
|
||||
@@ -632,7 +636,9 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
|
||||
future<stream_files_response> tablet_stream_files_handler(replica::database& db, netw::messaging_service& ms, streaming::stream_files_request req) {
|
||||
stream_files_response resp;
|
||||
auto& table = db.find_column_family(req.table);
|
||||
auto table_stream_op = table.stream_in_progress();
|
||||
auto sstables = co_await table.take_storage_snapshot(req.range);
|
||||
co_await utils::get_local_injector().inject("wait_before_tablet_stream_files_after_snapshot", utils::wait_for_message(std::chrono::seconds(60)));
|
||||
co_await utils::get_local_injector().inject("order_sstables_for_streaming", [&sstables] (auto& handler) -> future<> {
|
||||
if (sstables.size() == 3) {
|
||||
// make sure the sstables are ordered so that the sstable containing shadowed data is streamed last
|
||||
@@ -680,15 +686,22 @@ future<stream_files_response> tablet_stream_files_handler(replica::database& db,
|
||||
if (files.empty()) {
|
||||
co_return resp;
|
||||
}
|
||||
auto sstable_nr = sstables.size();
|
||||
// Release reference to sstables to be streamed here. Since one sstable is streamed at a time,
|
||||
// a sstable - that has been compacted - can have its space released from disk right after
|
||||
// that sstable's content has been fully streamed.
|
||||
sstables.clear();
|
||||
// Release the table - we don't need to access it anymore and the files are held by the snapshot.
|
||||
table_stream_op = {};
|
||||
blogger.debug("stream_sstables[{}] Started sending sstable_nr={} files_nr={} files={} range={}",
|
||||
req.ops_id, sstables.size(), files.size(), files, req.range);
|
||||
req.ops_id, sstable_nr, files.size(), files, req.range);
|
||||
auto ops_start_time = std::chrono::steady_clock::now();
|
||||
auto files_nr = files.size();
|
||||
size_t stream_bytes = co_await tablet_stream_files(ms, std::move(files), req.targets, req.table, req.ops_id, req.topo_guard);
|
||||
resp.stream_bytes = stream_bytes;
|
||||
auto duration = std::chrono::steady_clock::now() - ops_start_time;
|
||||
blogger.info("stream_sstables[{}] Finished sending sstable_nr={} files_nr={} range={} stream_bytes={} stream_time={} stream_bw={}",
|
||||
req.ops_id, sstables.size(), files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
|
||||
req.ops_id, sstable_nr, files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
|
||||
co_return resp;
|
||||
}
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ future<bool> table_helper::try_prepare(bool fallback, cql3::query_processor& qp,
|
||||
auto& stmt = fallback ? _insert_cql_fallback.value() : _insert_cql;
|
||||
try {
|
||||
shared_ptr<cql_transport::messages::result_message::prepared> msg_ptr = co_await qp.prepare(stmt, qs.get_client_state(), dialect);
|
||||
_prepared_stmt = std::move(msg_ptr->get_prepared());
|
||||
_prepared_stmt = msg_ptr->get_prepared();
|
||||
shared_ptr<cql3::cql_statement> cql_stmt = _prepared_stmt->statement;
|
||||
_insert_stmt = dynamic_pointer_cast<cql3::statements::modification_statement>(cql_stmt);
|
||||
_is_fallback_stmt = fallback;
|
||||
|
||||
@@ -400,7 +400,7 @@ task_manager::virtual_task::impl::impl(module_ptr module) noexcept
|
||||
: _module(std::move(module))
|
||||
{}
|
||||
|
||||
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive) {
|
||||
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr) {
|
||||
auto ms = module->get_task_manager()._messaging;
|
||||
if (!ms) {
|
||||
auto ids = co_await module->get_task_manager().get_virtual_task_children(parent_id);
|
||||
@@ -417,19 +417,18 @@ future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::g
|
||||
tmlogger.info("tasks_vt_get_children: waiting");
|
||||
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{10});
|
||||
});
|
||||
co_return co_await map_reduce(nodes, [ms, parent_id, is_host_alive = std::move(is_host_alive)] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
|
||||
if (is_host_alive(host_id)) {
|
||||
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
|
||||
return resp | std::views::transform([host_id] (auto id) {
|
||||
return task_identity{
|
||||
.host_id = host_id,
|
||||
.task_id = id
|
||||
};
|
||||
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
|
||||
});
|
||||
} else {
|
||||
return make_ready_future<utils::chunked_vector<task_identity>>();
|
||||
}
|
||||
co_return co_await map_reduce(nodes, [ms, parent_id] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
|
||||
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
|
||||
return resp | std::views::transform([host_id] (auto id) {
|
||||
return task_identity{
|
||||
.host_id = host_id,
|
||||
.task_id = id
|
||||
};
|
||||
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
|
||||
}).handle_exception_type([host_id, parent_id] (const rpc::closed_error& ex) {
|
||||
tmlogger.warn("Failed to get children of virtual task with id={} from node {}: {}", parent_id, host_id, ex);
|
||||
return utils::chunked_vector<task_identity>{};
|
||||
});
|
||||
}, utils::chunked_vector<task_identity>{}, [] (auto a, auto&& b) {
|
||||
std::move(b.begin(), b.end(), std::back_inserter(a));
|
||||
return a;
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "db_clock.hh"
|
||||
#include "utils/log.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "locator/token_metadata_fwd.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "tasks/types.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
@@ -282,7 +283,7 @@ public:
|
||||
impl& operator=(impl&&) = delete;
|
||||
virtual ~impl() = default;
|
||||
protected:
|
||||
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive);
|
||||
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr);
|
||||
public:
|
||||
virtual task_group get_group() const noexcept = 0;
|
||||
// Returns std::nullopt if an operation with task_id isn't tracked by this virtual_task.
|
||||
|
||||
@@ -423,14 +423,17 @@ def test_streams_operations(test_table_s, dynamodbstreams, metrics):
|
||||
# to update latencies for one kind of operation (#17616, and compare #9406),
|
||||
# and to do that checking that ..._count increases for that op is enough.
|
||||
@contextmanager
|
||||
def check_sets_latency(metrics, operation_names):
|
||||
def check_sets_latency_by_metric(metrics, operation_names, metric_name):
|
||||
the_metrics = get_metrics(metrics)
|
||||
saved_latency_count = { x: get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': x}, the_metrics) for x in operation_names }
|
||||
saved_latency_count = { x: get_metric(metrics, f'{metric_name}_count', {'op': x}, the_metrics) for x in operation_names }
|
||||
yield
|
||||
the_metrics = get_metrics(metrics)
|
||||
for op in operation_names:
|
||||
# The total "count" on all shards should strictly increase
|
||||
assert saved_latency_count[op] < get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': op}, the_metrics)
|
||||
assert saved_latency_count[op] < get_metric(metrics, f'{metric_name}_count', {'op': op}, the_metrics)
|
||||
|
||||
def check_sets_latency(metrics, operation_names):
|
||||
return check_sets_latency_by_metric(metrics, operation_names, 'scylla_alternator_op_latency')
|
||||
|
||||
# Test latency metrics for PutItem, GetItem, DeleteItem, UpdateItem.
|
||||
# We can't check what exactly the latency is - just that it gets updated.
|
||||
@@ -446,6 +449,18 @@ def test_item_latency(test_table_s, metrics):
|
||||
test_table_s.meta.client.batch_get_item(RequestItems = {
|
||||
test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
|
||||
|
||||
def test_item_latency_per_table(test_table_s, metrics):
|
||||
with check_sets_latency_by_metric(metrics, ['DeleteItem', 'GetItem', 'PutItem', 'UpdateItem', 'BatchWriteItem', 'BatchGetItem'], 'scylla_alternator_table_op_latency'):
|
||||
p = random_string()
|
||||
test_table_s.put_item(Item={'p': p})
|
||||
test_table_s.get_item(Key={'p': p})
|
||||
test_table_s.delete_item(Key={'p': p})
|
||||
test_table_s.update_item(Key={'p': p})
|
||||
test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
test_table_s.name: [{'PutRequest': {'Item': {'p': random_string(), 'a': 'hi'}}}]})
|
||||
test_table_s.meta.client.batch_get_item(RequestItems = {
|
||||
test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
|
||||
|
||||
# Test latency metrics for GetRecords. Other Streams-related operations -
|
||||
# ListStreams, DescribeStream, and GetShardIterator, have an operation
|
||||
# count (tested above) but do NOT currently have a latency histogram.
|
||||
|
||||
@@ -378,6 +378,7 @@ add_scylla_test(combined_tests
|
||||
tracing_test.cc
|
||||
user_function_test.cc
|
||||
user_types_test.cc
|
||||
vector_index_test.cc
|
||||
view_build_test.cc
|
||||
view_complex_test.cc
|
||||
view_schema_ckey_test.cc
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/core/manual_clock.hh>
|
||||
#include <seastar/util/later.hh>
|
||||
#include <seastar/core/timer.hh>
|
||||
#include <seastar/util/defer.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include <seastar/util/alloc_failure_injector.hh>
|
||||
@@ -290,12 +290,17 @@ SEASTAR_THREAD_TEST_CASE(test_address_map_replication) {
|
||||
m.set_expiring(id1);
|
||||
BOOST_CHECK(m.find(id1) && *m.find(id1) == addr1);
|
||||
m.barrier().get();
|
||||
promise<> shard0_timer_expired;
|
||||
timer<manual_clock> shard0_timer([&shard0_timer_expired] {
|
||||
shard0_timer_expired.set_value();
|
||||
});
|
||||
shard0_timer.arm(manual_clock::now() + expiration_time);
|
||||
m_svc.invoke_on(1, [] (address_map_t<manual_clock>& m) {
|
||||
BOOST_CHECK(m.find(id1) && *m.find(id1) == addr1);
|
||||
manual_clock::advance(expiration_time);
|
||||
BOOST_CHECK(!m.find(id1));
|
||||
return smp::submit_to(0, []{}); // Ensure shard 0 notices timer is expired.
|
||||
}).get();
|
||||
shard0_timer_expired.get_future().get();
|
||||
BOOST_CHECK(!m.find(id1));
|
||||
|
||||
// Expiring entries are replicated
|
||||
|
||||
@@ -62,7 +62,11 @@ SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) {
|
||||
// cfg.db_config->index_cache_fraction.set(1.0);
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
// We disable compactions because they cause confusing cache mispopulations.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
|
||||
// We disable compression because the sstable writer targets a specific
|
||||
// (*compressed* data file size : summary file size) ratio,
|
||||
// so the number of keys per index page becomes hard to control,
|
||||
// and might be arbitrarily large.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
|
||||
auto insert_query = e.prepare("INSERT INTO ks.t(pk) VALUES (?)").get();
|
||||
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ?").get();
|
||||
|
||||
@@ -154,7 +158,11 @@ SEASTAR_TEST_CASE(test_index_is_cached_in_big_partition_workload) {
|
||||
// cfg.db_config->index_cache_fraction.set(0.0);
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
// We disable compactions because they cause confusing cache mispopulations.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
|
||||
// We disable compression because the sstable writer targets a specific
|
||||
// (*compressed* data file size : summary file size) ratio,
|
||||
// so the number of keys per index page becomes hard to control,
|
||||
// and might be arbitrarily large.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
|
||||
auto insert_query = e.prepare("INSERT INTO ks.t(pk, ck, v) VALUES (?, ?, ?)").get();
|
||||
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ? AND ck = ?").get();
|
||||
|
||||
|
||||
@@ -1111,6 +1111,30 @@ SEASTAR_TEST_CASE(test_snapshot_ctl_true_snapshots_size) {
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_snapshot_ctl_details_exception_handling) {
|
||||
#ifndef SCYLLA_ENABLE_ERROR_INJECTION
|
||||
testlog.debug("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev).\n");
|
||||
return make_ready_future();
|
||||
#endif
|
||||
return do_with_some_data_in_thread({"cf"}, [] (cql_test_env& e) {
|
||||
sharded<db::snapshot_ctl> sc;
|
||||
sc.start(std::ref(e.db()), std::ref(e.get_task_manager()), std::ref(e.get_sstorage_manager()), db::snapshot_ctl::config{}).get();
|
||||
auto stop_sc = deferred_stop(sc);
|
||||
|
||||
auto& cf = e.local_db().find_column_family("ks", "cf");
|
||||
take_snapshot(e).get();
|
||||
|
||||
utils::get_local_injector().enable("get_snapshot_details", true);
|
||||
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
|
||||
|
||||
utils::get_local_injector().enable("per-snapshot-get_snapshot_details", true);
|
||||
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
|
||||
|
||||
auto details = cf.get_snapshot_details().get();
|
||||
BOOST_REQUIRE_EQUAL(details.size(), 1);
|
||||
});
|
||||
}
|
||||
|
||||
// toppartitions_query caused a lw_shared_ptr to cross shards when moving results, #5104
|
||||
SEASTAR_TEST_CASE(toppartitions_cross_shard_schema_ptr) {
|
||||
return do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
|
||||
@@ -1857,7 +1881,7 @@ SEASTAR_THREAD_TEST_CASE(test_tombstone_gc_state_snapshot) {
|
||||
|
||||
schema_builder::register_schema_initializer([] (schema_builder& builder) {
|
||||
if (builder.ks_name() == "test" && builder.cf_name() == "table_gc_mode_group0") {
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
auto table_gc_mode_group0 = schema_builder("test", "table_gc_mode_group0")
|
||||
|
||||
@@ -23,8 +23,11 @@
|
||||
#include "test/lib/tmpdir.hh"
|
||||
#include "test/lib/random_utils.hh"
|
||||
#include "test/lib/exception_utils.hh"
|
||||
#include "utils/limiting_data_source.hh"
|
||||
#include "utils/io-wrappers.hh"
|
||||
|
||||
#include <seastar/util/memory-data-source.hh>
|
||||
|
||||
using namespace encryption;
|
||||
|
||||
static tmpdir dir;
|
||||
@@ -595,6 +598,113 @@ SEASTAR_TEST_CASE(test_encrypted_data_source_simple) {
|
||||
co_await test_random_data_source(sizes);
|
||||
}
|
||||
|
||||
// Reproduces the production deadlock where encrypted SSTable component downloads
|
||||
// got stuck during restore. The encrypted_data_source::get() caches a block in
|
||||
// _next, then on the next call bypasses input_stream::read()'s _eof check and
|
||||
// calls input_stream::read_exactly() — which does NOT check _eof when _buf is
|
||||
// empty. This causes a second get() on the underlying source after EOS.
|
||||
//
|
||||
// In production the underlying source was chunked_download_source whose get()
|
||||
// hung forever. Here we simulate it with a strict source that fails the test.
|
||||
//
|
||||
// The fix belongs in seastar's input_stream::read_exactly(): check _eof before
|
||||
// calling _fd.get(), consistent with read(), read_up_to(), and consume().
|
||||
static future<> test_encrypted_source_copy(size_t plaintext_size) {
|
||||
testlog.info("test_encrypted_source_copy: plaintext_size={}", plaintext_size);
|
||||
|
||||
key_info info{"AES/CBC", 256};
|
||||
auto k = ::make_shared<symmetric_key>(info);
|
||||
|
||||
// Step 1: Encrypt the plaintext into memory buffers
|
||||
auto plaintext = generate_random<char>(plaintext_size);
|
||||
std::vector<temporary_buffer<char>> encrypted_bufs;
|
||||
{
|
||||
data_sink sink(make_encrypted_sink(create_memory_sink(encrypted_bufs), k));
|
||||
co_await sink.put(plaintext.clone());
|
||||
co_await sink.close();
|
||||
}
|
||||
|
||||
// Flatten encrypted buffers into a single contiguous buffer
|
||||
size_t encrypted_total = 0;
|
||||
for (const auto& b : encrypted_bufs) {
|
||||
encrypted_total += b.size();
|
||||
}
|
||||
temporary_buffer<char> encrypted(encrypted_total);
|
||||
size_t pos = 0;
|
||||
for (const auto& b : encrypted_bufs) {
|
||||
std::copy(b.begin(), b.end(), encrypted.get_write() + pos);
|
||||
pos += b.size();
|
||||
}
|
||||
|
||||
// Step 2: Create a data source from the encrypted data that fails on
|
||||
// post-EOS get() — simulating a source like chunked_download_source
|
||||
// that would hang forever in this situation.
|
||||
class strict_memory_source final : public limiting_data_source_impl {
|
||||
bool _eof = false;
|
||||
public:
|
||||
strict_memory_source(temporary_buffer<char> data, size_t chunk_size)
|
||||
: limiting_data_source_impl(
|
||||
data_source(std::make_unique<util::temporary_buffer_data_source>(std::move(data))),
|
||||
[chunk_size] { return chunk_size; }) {}
|
||||
|
||||
future<temporary_buffer<char>> get() override {
|
||||
BOOST_REQUIRE_MESSAGE(!_eof,
|
||||
"get() called on source after it already returned EOS — "
|
||||
"this is the production deadlock: read_exactly() does not "
|
||||
"check _eof before calling _fd.get()");
|
||||
auto buf = co_await limiting_data_source_impl::get();
|
||||
_eof = buf.empty();
|
||||
co_return buf;
|
||||
}
|
||||
};
|
||||
|
||||
// Step 3: Wrap in encrypted_data_source and drain via consume() —
|
||||
// the exact code path used by seastar::copy() which is what
|
||||
// sstables_loader_helpers::download_sstable() calls.
|
||||
// Try multiple chunk sizes to hit different alignment scenarios.
|
||||
for (size_t chunk_size : {1ul, 7ul, 4096ul, 8192ul, encrypted_total, encrypted_total + 1}) {
|
||||
if (chunk_size == 0) continue;
|
||||
auto src = data_source(make_encrypted_source(
|
||||
data_source(std::make_unique<strict_memory_source>(encrypted.clone(), chunk_size)), k));
|
||||
auto in = input_stream<char>(std::move(src));
|
||||
|
||||
// consume() is what seastar::copy() uses internally. It calls
|
||||
// encrypted_data_source::get() via _fd.get() until EOF.
|
||||
size_t total_decrypted = 0;
|
||||
co_await in.consume([&total_decrypted](temporary_buffer<char> buf) {
|
||||
total_decrypted += buf.size();
|
||||
return make_ready_future<consumption_result<char>>(continue_consuming{});
|
||||
});
|
||||
co_await in.close();
|
||||
|
||||
BOOST_REQUIRE_EQUAL(total_decrypted, plaintext_size);
|
||||
}
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_8k) {
|
||||
co_await test_encrypted_source_copy(8192);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_4k) {
|
||||
co_await test_encrypted_source_copy(4096);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_small) {
|
||||
co_await test_encrypted_source_copy(100);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_12k) {
|
||||
co_await test_encrypted_source_copy(12288);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_unaligned) {
|
||||
co_await test_encrypted_source_copy(8193);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_1byte) {
|
||||
co_await test_encrypted_source_copy(1);
|
||||
}
|
||||
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_data_source_fuzzy) {
|
||||
std::mt19937_64 rand_gen(std::random_device{}());
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#include "test/lib/azure_kms_fixture.hh"
|
||||
#include "db/config.hh"
|
||||
#include "db/extensions.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "db/commitlog/commitlog.hh"
|
||||
#include "db/commitlog/commitlog_replayer.hh"
|
||||
#include "init.hh"
|
||||
@@ -1075,6 +1076,39 @@ SEASTAR_FIXTURE_TEST_CASE(test_kms_network_error, local_aws_kms_wrapper, *check_
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
||||
SEASTAR_TEST_CASE(test_system_info_encryption_includes_raft_tables) {
|
||||
tmpdir tmp;
|
||||
auto sysdir = tmp.path() / "system_keys";
|
||||
auto syskey = sysdir / "system" / "system_table_keytab";
|
||||
auto yaml = fmt::format("system_key_directory: {}", sysdir.string());
|
||||
|
||||
co_await create_key_file(syskey, { { "AES/CBC/PKCSPadding", 128 }});
|
||||
|
||||
test_provider_args args{
|
||||
.tmp = tmp,
|
||||
.extra_yaml = yaml,
|
||||
};
|
||||
|
||||
auto [cfg, ext] = make_commitlog_config(args, {});
|
||||
|
||||
co_await do_with_cql_env_thread(
|
||||
[](cql_test_env& env) {
|
||||
auto check_has_encryption = [&](schema_ptr s) {
|
||||
auto it = s->extensions().find("scylla_encryption_options");
|
||||
BOOST_REQUIRE_MESSAGE(it != s->extensions().end(),
|
||||
fmt::format("Expected encryption extension on {}.{}",
|
||||
s->ks_name(), s->cf_name()));
|
||||
BOOST_REQUIRE_MESSAGE(!it->second->is_placeholder(),
|
||||
fmt::format("Encryption extension on {}.{} "
|
||||
"should not be a placeholder",
|
||||
s->ks_name(), s->cf_name()));
|
||||
};
|
||||
|
||||
check_has_encryption(db::system_keyspace::raft());
|
||||
},
|
||||
cfg, {}, cql_test_init_configurables{ *ext });
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_user_info_encryption) {
|
||||
tmpdir tmp;
|
||||
auto keyfile = tmp.path() / "secret_key";
|
||||
|
||||
@@ -252,7 +252,7 @@ SEASTAR_TEST_CASE(test_group0_batch) {
|
||||
// (group0 mutations are not allowed on non-group0 tables)
|
||||
schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if (builder.cf_name() == "test_group0_batch") {
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
|
||||
@@ -345,4 +345,29 @@ SEASTAR_TEST_CASE(test_group0_batch) {
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_group0_tables_use_schema_commitlog) {
|
||||
return do_with_cql_env([] (cql_test_env& e) {
|
||||
schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if (builder.cf_name() == "test_group0_tables_use_schema_commitlog1") {
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
|
||||
auto test_group0_tables_use_schema_commitlog1 = schema_builder("test", "test_group0_tables_use_schema_commitlog1")
|
||||
.with_column("pk", utf8_type, column_kind::partition_key)
|
||||
.build();
|
||||
|
||||
auto test_group0_tables_use_schema_commitlog2 = schema_builder("test", "test_group0_tables_use_schema_commitlog2")
|
||||
.with_column("pk", utf8_type, column_kind::partition_key)
|
||||
.build();
|
||||
|
||||
BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().is_group0_table);
|
||||
BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().use_schema_commitlog);
|
||||
BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().is_group0_table);
|
||||
BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().use_schema_commitlog);
|
||||
|
||||
return make_ready_future();
|
||||
});
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user