mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-03 14:45:49 +00:00
Compare commits
95 Commits
fix_sl_v2_
...
scylla-202
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c4681d0975 | ||
|
|
24d8843803 | ||
|
|
5c5fbfaabb | ||
|
|
ec46a8a7d3 | ||
|
|
dae73f4781 | ||
|
|
bcd320a82a | ||
|
|
1cb9d0b245 | ||
|
|
ac7efa2085 | ||
|
|
f61343ca15 | ||
|
|
3386716217 | ||
|
|
cb9c65af43 | ||
|
|
351ed72f5f | ||
|
|
7a080130cf | ||
|
|
164364ed3e | ||
|
|
dd9df62617 | ||
|
|
df56f6bdc2 | ||
|
|
92f8f2c2db | ||
|
|
e992d76489 | ||
|
|
196db8931e | ||
|
|
e436db01e3 | ||
|
|
9020288c79 | ||
|
|
7467dcd30f | ||
|
|
f9be6f4a83 | ||
|
|
d46ff9b405 | ||
|
|
5ca0bc2019 | ||
|
|
e5d82bf857 | ||
|
|
fac9795325 | ||
|
|
3a1d7d2b09 | ||
|
|
fb81acb7aa | ||
|
|
56bf4c8f0e | ||
|
|
bf1f5ee796 | ||
|
|
da53b8798f | ||
|
|
3d167dd36e | ||
|
|
c42799fb01 | ||
|
|
c93c037d39 | ||
|
|
3107d9083e | ||
|
|
04e5fa6c3e | ||
|
|
70b9ae04ff | ||
|
|
eaae2bf0af | ||
|
|
abfa4d0272 | ||
|
|
8bdc97924e | ||
|
|
253fa9519f | ||
|
|
666d0440f1 | ||
|
|
70b7652e64 | ||
|
|
27604deebb | ||
|
|
cd7baebc8b | ||
|
|
c5f57815a5 | ||
|
|
5eabf35824 | ||
|
|
95e422db48 | ||
|
|
b033bbc560 | ||
|
|
faf8ad69f0 | ||
|
|
dc7829a9b5 | ||
|
|
f2111c011f | ||
|
|
d2b12329ab | ||
|
|
b638170a4e | ||
|
|
d5c7f29734 | ||
|
|
a5dd529475 | ||
|
|
b176591488 | ||
|
|
233da83dd9 | ||
|
|
9b81939a93 | ||
|
|
804842e95c | ||
|
|
4f77cb621f | ||
|
|
eb6c333e1b | ||
|
|
8d21636a81 | ||
|
|
7f236baf61 | ||
|
|
4da8641d83 | ||
|
|
3ab789e1ca | ||
|
|
25a17282bd | ||
|
|
7afcc56128 | ||
|
|
32443ed6f7 | ||
|
|
3e9b984020 | ||
|
|
2d199fb609 | ||
|
|
35cd7f9239 | ||
|
|
32ce43d4b1 | ||
|
|
fef7750eb6 | ||
|
|
213442227d | ||
|
|
1398a55d16 | ||
|
|
a0a2a67634 | ||
|
|
d4e454b5bc | ||
|
|
825a36c97a | ||
|
|
45413e99a5 | ||
|
|
c93a935564 | ||
|
|
69f78ce74a | ||
|
|
3513ce6069 | ||
|
|
0ca7253315 | ||
|
|
c7ac3b5394 | ||
|
|
d6ed05efc1 | ||
|
|
39fcc83e75 | ||
|
|
6250f1e967 | ||
|
|
b307c9301d | ||
|
|
f26af8cd30 | ||
|
|
2bd10bff5e | ||
|
|
1105d83893 | ||
|
|
9b9d5cee8a | ||
|
|
a8fd9936a3 |
22
.github/workflows/trigger-scylla-ci.yaml
vendored
22
.github/workflows/trigger-scylla-ci.yaml
vendored
@@ -15,13 +15,19 @@ jobs:
|
||||
- name: Verify Org Membership
|
||||
id: verify_author
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
|
||||
PR_ASSOCIATION: ${{ github.event.pull_request.author_association }}
|
||||
COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
|
||||
COMMENT_ASSOCIATION: ${{ github.event.comment.author_association }}
|
||||
shell: bash
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then
|
||||
AUTHOR="${{ github.event.pull_request.user.login }}"
|
||||
if [[ "$EVENT_NAME" == "pull_request_target" ]]; then
|
||||
AUTHOR="$PR_AUTHOR"
|
||||
ASSOCIATION="$PR_ASSOCIATION"
|
||||
else
|
||||
AUTHOR="${{ github.event.comment.user.login }}"
|
||||
AUTHOR="$COMMENT_AUTHOR"
|
||||
ASSOCIATION="$COMMENT_ASSOCIATION"
|
||||
fi
|
||||
ORG="scylladb"
|
||||
if gh api "/orgs/${ORG}/members/${AUTHOR}" --silent 2>/dev/null; then
|
||||
@@ -34,13 +40,11 @@ jobs:
|
||||
- name: Validate Comment Trigger
|
||||
if: github.event_name == 'issue_comment'
|
||||
id: verify_comment
|
||||
env:
|
||||
COMMENT_BODY: ${{ github.event.comment.body }}
|
||||
shell: bash
|
||||
run: |
|
||||
BODY=$(cat << 'EOF'
|
||||
${{ github.event.comment.body }}
|
||||
EOF
|
||||
)
|
||||
CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
|
||||
CLEAN_BODY=$(echo "$COMMENT_BODY" | grep -v '^[[:space:]]*>')
|
||||
|
||||
if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
|
||||
echo "trigger=true" >> $GITHUB_OUTPUT
|
||||
|
||||
@@ -78,7 +78,7 @@ fi
|
||||
|
||||
# Default scylla product/version tags
|
||||
PRODUCT=scylla
|
||||
VERSION=2026.1.0
|
||||
VERSION=2026.1.2
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -3464,7 +3464,11 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
|
||||
if (should_add_wcu) {
|
||||
rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
|
||||
}
|
||||
_stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
auto duration = std::chrono::steady_clock::now() - start_time;
|
||||
_stats.api_operations.batch_write_item_latency.mark(duration);
|
||||
for (const auto& w : per_table_wcu) {
|
||||
w.first->api_operations.batch_write_item_latency.mark(duration);
|
||||
}
|
||||
co_return rjson::print(std::move(ret));
|
||||
}
|
||||
|
||||
@@ -4975,7 +4979,12 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
|
||||
if (!some_succeeded && eptr) {
|
||||
co_await coroutine::return_exception_ptr(std::move(eptr));
|
||||
}
|
||||
_stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
|
||||
auto duration = std::chrono::steady_clock::now() - start_time;
|
||||
_stats.api_operations.batch_get_item_latency.mark(duration);
|
||||
for (const table_requests& rs : requests) {
|
||||
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
|
||||
per_table_stats->api_operations.batch_get_item_latency.mark(duration);
|
||||
}
|
||||
if (is_big(response)) {
|
||||
co_return make_streamed(std::move(response));
|
||||
} else {
|
||||
|
||||
@@ -32,6 +32,8 @@ namespace {
|
||||
|
||||
logger mylog{"ldap_role_manager"}; // `log` is taken by math.
|
||||
|
||||
constexpr std::string_view user_placeholder = "{USER}";
|
||||
|
||||
struct url_desc_deleter {
|
||||
void operator()(LDAPURLDesc *p) {
|
||||
ldap_free_urldesc(p);
|
||||
@@ -40,9 +42,141 @@ struct url_desc_deleter {
|
||||
|
||||
using url_desc_ptr = std::unique_ptr<LDAPURLDesc, url_desc_deleter>;
|
||||
|
||||
url_desc_ptr parse_url(std::string_view url) {
|
||||
/// Escapes LDAP filter assertion value per RFC 4515 Section 3.
|
||||
/// The characters *, (, ), \, and NUL must be backslash-hex-escaped
|
||||
/// to prevent filter injection when interpolating untrusted input.
|
||||
sstring escape_filter_value(std::string_view value) {
|
||||
size_t escapable_chars = 0;
|
||||
for (unsigned char ch : value) {
|
||||
switch (ch) {
|
||||
case '*':
|
||||
case '(':
|
||||
case ')':
|
||||
case '\\':
|
||||
case '\0':
|
||||
++escapable_chars;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (escapable_chars == 0) {
|
||||
return sstring(value);
|
||||
}
|
||||
|
||||
sstring escaped(value.size() + escapable_chars * 2, 0);
|
||||
size_t pos = 0;
|
||||
for (unsigned char ch : value) {
|
||||
switch (ch) {
|
||||
case '*':
|
||||
escaped[pos++] = '\\';
|
||||
escaped[pos++] = '2';
|
||||
escaped[pos++] = 'a';
|
||||
break;
|
||||
case '(':
|
||||
escaped[pos++] = '\\';
|
||||
escaped[pos++] = '2';
|
||||
escaped[pos++] = '8';
|
||||
break;
|
||||
case ')':
|
||||
escaped[pos++] = '\\';
|
||||
escaped[pos++] = '2';
|
||||
escaped[pos++] = '9';
|
||||
break;
|
||||
case '\\':
|
||||
escaped[pos++] = '\\';
|
||||
escaped[pos++] = '5';
|
||||
escaped[pos++] = 'c';
|
||||
break;
|
||||
case '\0':
|
||||
escaped[pos++] = '\\';
|
||||
escaped[pos++] = '0';
|
||||
escaped[pos++] = '0';
|
||||
break;
|
||||
default:
|
||||
escaped[pos++] = static_cast<char>(ch);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return escaped;
|
||||
}
|
||||
|
||||
/// Percent-encodes characters that are not RFC 3986 "unreserved"
|
||||
/// (ALPHA / DIGIT / '-' / '.' / '_' / '~').
|
||||
///
|
||||
/// Uses explicit ASCII range checks instead of std::isalnum() because
|
||||
/// the latter is locale-dependent and could pass non-ASCII characters
|
||||
/// through unencoded under certain locale settings.
|
||||
///
|
||||
/// This is applied AFTER RFC 4515 filter escaping when the value is
|
||||
/// substituted into an LDAP URL. It serves two purposes:
|
||||
/// 1. Prevents URL-level metacharacters ('?', '#') from breaking
|
||||
/// the URL structure parsed by ldap_url_parse.
|
||||
/// 2. Prevents percent-decoding (which ldap_url_parse performs on
|
||||
/// each component) from undoing the filter escaping, e.g. a
|
||||
/// literal "%2a" in the username would otherwise decode to '*'.
|
||||
sstring percent_encode_for_url(std::string_view value) {
|
||||
static constexpr char hex[] = "0123456789ABCDEF";
|
||||
|
||||
size_t chars_to_encode = 0;
|
||||
for (unsigned char ch : value) {
|
||||
if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
|
||||
|| ch == '-' || ch == '.' || ch == '_' || ch == '~')) {
|
||||
++chars_to_encode;
|
||||
}
|
||||
}
|
||||
|
||||
if (chars_to_encode == 0) {
|
||||
return sstring(value);
|
||||
}
|
||||
|
||||
sstring encoded(value.size() + chars_to_encode * 2, 0);
|
||||
size_t pos = 0;
|
||||
for (unsigned char ch : value) {
|
||||
if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
|
||||
|| ch == '-' || ch == '.' || ch == '_' || ch == '~') {
|
||||
encoded[pos++] = static_cast<char>(ch);
|
||||
} else {
|
||||
encoded[pos++] = '%';
|
||||
encoded[pos++] = hex[ch >> 4];
|
||||
encoded[pos++] = hex[ch & 0x0F];
|
||||
}
|
||||
}
|
||||
|
||||
return encoded;
|
||||
}
|
||||
|
||||
/// Checks whether \p sentinel appears in any parsed URL component
|
||||
/// other than the filter (host, DN, attributes, extensions).
|
||||
bool sentinel_outside_filter(const LDAPURLDesc& desc, std::string_view sentinel) {
|
||||
auto contains = [&](const char* field) {
|
||||
return field && std::string_view(field).find(sentinel) != std::string_view::npos;
|
||||
};
|
||||
if (contains(desc.lud_host) || contains(desc.lud_dn)) {
|
||||
return true;
|
||||
}
|
||||
if (desc.lud_attrs) {
|
||||
for (int i = 0; desc.lud_attrs[i]; ++i) {
|
||||
if (contains(desc.lud_attrs[i])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (desc.lud_exts) {
|
||||
for (int i = 0; desc.lud_exts[i]; ++i) {
|
||||
if (contains(desc.lud_exts[i])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
url_desc_ptr parse_url(const sstring& url) {
|
||||
LDAPURLDesc *desc = nullptr;
|
||||
if (ldap_url_parse(url.data(), &desc)) {
|
||||
if (ldap_url_parse(url.c_str(), &desc)) {
|
||||
mylog.error("error in ldap_url_parse({})", url);
|
||||
}
|
||||
return url_desc_ptr(desc);
|
||||
@@ -115,6 +249,7 @@ const resource_set& ldap_role_manager::protected_resources() const {
|
||||
}
|
||||
|
||||
future<> ldap_role_manager::start() {
|
||||
validate_query_template();
|
||||
if (!parse_url(get_url("dummy-user"))) { // Just need host and port -- any user should do.
|
||||
return make_exception_future(
|
||||
std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
|
||||
@@ -199,7 +334,7 @@ future<> ldap_role_manager::revoke(std::string_view, std::string_view, ::service
|
||||
}
|
||||
|
||||
future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name, recursive_role_query) {
|
||||
const auto url = get_url(grantee_name.data());
|
||||
const auto url = get_url(grantee_name);
|
||||
auto desc = parse_url(url);
|
||||
if (!desc) {
|
||||
return make_exception_future<role_set>(std::runtime_error(format("Error parsing URL {}", url)));
|
||||
@@ -331,7 +466,46 @@ future<> ldap_role_manager::remove_attribute(std::string_view role_name, std::st
|
||||
}
|
||||
|
||||
sstring ldap_role_manager::get_url(std::string_view user) const {
|
||||
return boost::replace_all_copy(_query_template, "{USER}", user);
|
||||
// Two-layer encoding protects against injection:
|
||||
// 1. RFC 4515 filter escaping neutralizes filter metacharacters (*, (, ), \, NUL)
|
||||
// 2. URL percent-encoding prevents URL structure injection (?, #) and blocks
|
||||
// ldap_url_parse's percent-decoding from undoing the filter escaping (%2a -> *)
|
||||
return boost::replace_all_copy(_query_template, user_placeholder,
|
||||
percent_encode_for_url(escape_filter_value(user)));
|
||||
}
|
||||
|
||||
void ldap_role_manager::validate_query_template() const {
|
||||
if (_query_template.find(user_placeholder) == sstring::npos) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Substitute {USER} with a sentinel and let ldap_url_parse tell us
|
||||
// which URL component it landed in. The sentinel is purely
|
||||
// alphanumeric so it cannot affect URL parsing.
|
||||
static constexpr std::string_view sentinel = "XLDAPSENTINELX";
|
||||
sstring test_url = boost::replace_all_copy(_query_template, user_placeholder, sentinel);
|
||||
auto desc = parse_url(test_url);
|
||||
if (!desc) {
|
||||
throw url_error(format("LDAP URL template is not a valid URL when {{USER}} is substituted: {}", _query_template));
|
||||
}
|
||||
|
||||
// The sentinel must appear in the filter ...
|
||||
if (!desc->lud_filter
|
||||
|| std::string_view(desc->lud_filter).find(sentinel) == std::string_view::npos) {
|
||||
throw url_error(format(
|
||||
"LDAP URL template places {{USER}} outside the filter component. "
|
||||
"RFC 4515 filter escaping only protects the filter; other components "
|
||||
"(e.g. the base DN) require different escaping and are not supported. "
|
||||
"Template: {}", _query_template));
|
||||
}
|
||||
// ... and nowhere else (host, DN, attributes, extensions).
|
||||
if (sentinel_outside_filter(*desc, sentinel)) {
|
||||
throw url_error(format(
|
||||
"LDAP URL template places {{USER}} outside the filter component. "
|
||||
"RFC 4515 filter escaping only protects the filter; other components "
|
||||
"(e.g. the host) require different escaping and are not supported. "
|
||||
"Template: {}", _query_template));
|
||||
}
|
||||
}
|
||||
|
||||
future<std::vector<cql3::description>> ldap_role_manager::describe_role_grants() {
|
||||
|
||||
@@ -107,6 +107,9 @@ class ldap_role_manager : public role_manager {
|
||||
/// Macro-expands _query_template, returning the result.
|
||||
sstring get_url(std::string_view user) const;
|
||||
|
||||
/// Validates that {USER}, if present, is used only in the LDAP filter component.
|
||||
void validate_query_template() const;
|
||||
|
||||
/// Used to auto-create roles returned by ldap.
|
||||
future<> create_role(std::string_view role_name);
|
||||
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
#include "mutation/mutation_fragment_stream_validator.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "utils/pretty_printers.hh"
|
||||
#include "readers/multi_range.hh"
|
||||
#include "readers/compacting.hh"
|
||||
@@ -611,23 +612,23 @@ private:
|
||||
}
|
||||
|
||||
// Called in a seastar thread
|
||||
dht::partition_range_vector
|
||||
utils::chunked_vector<dht::partition_range>
|
||||
get_ranges_for_invalidation(const std::vector<sstables::shared_sstable>& sstables) {
|
||||
// If owned ranges is disengaged, it means no cleanup work was done and
|
||||
// so nothing needs to be invalidated.
|
||||
if (!_owned_ranges) {
|
||||
return dht::partition_range_vector{};
|
||||
return {};
|
||||
}
|
||||
auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
|
||||
auto owned_ranges = dht::to_partition_ranges_chunked(*_owned_ranges).get();
|
||||
|
||||
auto non_owned_ranges = sstables
|
||||
| std::views::transform([] (const sstables::shared_sstable& sst) {
|
||||
seastar::thread::maybe_yield();
|
||||
return dht::partition_range::make({sst->get_first_decorated_key(), true},
|
||||
{sst->get_last_decorated_key(), true});
|
||||
}) | std::ranges::to<dht::partition_range_vector>();
|
||||
}) | std::ranges::to<utils::chunked_vector<dht::partition_range>>();
|
||||
|
||||
return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
|
||||
return dht::subtract_ranges(*_schema, std::move(non_owned_ranges), std::move(owned_ranges)).get();
|
||||
}
|
||||
protected:
|
||||
compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
|
||||
@@ -718,8 +719,8 @@ protected:
|
||||
|
||||
compaction_completion_desc
|
||||
get_compaction_completion_desc(std::vector<sstables::shared_sstable> input_sstables, std::vector<sstables::shared_sstable> output_sstables) {
|
||||
auto ranges_for_for_invalidation = get_ranges_for_invalidation(input_sstables);
|
||||
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
|
||||
auto ranges = get_ranges_for_invalidation(input_sstables);
|
||||
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges)};
|
||||
}
|
||||
|
||||
// Tombstone expiration is enabled based on the presence of sstable set.
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "sstables/sstable_set.hh"
|
||||
#include "compaction_fwd.hh"
|
||||
#include "mutation_writer/token_group_based_splitting_writer.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
|
||||
namespace compaction {
|
||||
|
||||
@@ -38,7 +39,7 @@ struct compaction_completion_desc {
|
||||
// New, fresh SSTables that should be added to SSTable set, replacing the old ones.
|
||||
std::vector<sstables::shared_sstable> new_sstables;
|
||||
// Set of compacted partition ranges that should be invalidated in the cache.
|
||||
dht::partition_range_vector ranges_for_cache_invalidation;
|
||||
utils::chunked_vector<dht::partition_range> ranges_for_cache_invalidation;
|
||||
};
|
||||
|
||||
// creates a new SSTable for a given shard
|
||||
|
||||
@@ -1268,9 +1268,15 @@ future<> compaction_manager::start(const db::config& cfg, utils::disk_space_moni
|
||||
if (dsm && (this_shard_id() == 0)) {
|
||||
_out_of_space_subscription = dsm->subscribe(cfg.critical_disk_utilization_level, [this] (auto threshold_reached) {
|
||||
if (threshold_reached) {
|
||||
return container().invoke_on_all([] (compaction_manager& cm) { return cm.drain(); });
|
||||
return container().invoke_on_all([] (compaction_manager& cm) {
|
||||
cm._in_critical_disk_utilization_mode = true;
|
||||
return cm.drain();
|
||||
});
|
||||
}
|
||||
return container().invoke_on_all([] (compaction_manager& cm) { cm.enable(); });
|
||||
return container().invoke_on_all([] (compaction_manager& cm) {
|
||||
cm._in_critical_disk_utilization_mode = false;
|
||||
cm.enable();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -2291,6 +2297,16 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
|
||||
return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables), throw_if_stopping::no);
|
||||
}
|
||||
|
||||
std::exception_ptr compaction_manager::make_disabled_exception(compaction::compaction_group_view& cg) {
|
||||
std::exception_ptr ex;
|
||||
if (_in_critical_disk_utilization_mode) {
|
||||
ex = std::make_exception_ptr(std::runtime_error("critical disk utilization"));
|
||||
} else {
|
||||
ex = std::make_exception_ptr(compaction_stopped_exception(cg.schema()->ks_name(), cg.schema()->cf_name(), "compaction disabled"));
|
||||
}
|
||||
return ex;
|
||||
}
|
||||
|
||||
future<std::vector<sstables::shared_sstable>>
|
||||
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
|
||||
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
|
||||
@@ -2300,8 +2316,7 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
|
||||
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
|
||||
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
|
||||
if (is_disabled()) {
|
||||
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
|
||||
"reason might be out of space prevention", sst->get_filename()))));
|
||||
co_return coroutine::exception(make_disabled_exception(t));
|
||||
}
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
|
||||
|
||||
@@ -114,6 +114,8 @@ private:
|
||||
uint32_t _disabled_state_count = 0;
|
||||
|
||||
bool is_disabled() const { return _state != state::running || _disabled_state_count > 0; }
|
||||
// precondition: is_disabled() is true.
|
||||
std::exception_ptr make_disabled_exception(compaction::compaction_group_view& cg);
|
||||
|
||||
std::optional<future<>> _stop_future;
|
||||
|
||||
@@ -173,6 +175,7 @@ private:
|
||||
tombstone_gc_state _tombstone_gc_state;
|
||||
|
||||
utils::disk_space_monitor::subscription _out_of_space_subscription;
|
||||
bool _in_critical_disk_utilization_mode = false;
|
||||
private:
|
||||
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
|
||||
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
|
||||
|
||||
@@ -1701,6 +1701,7 @@ deps['test/boost/combined_tests'] += [
|
||||
'test/boost/tracing_test.cc',
|
||||
'test/boost/user_function_test.cc',
|
||||
'test/boost/user_types_test.cc',
|
||||
'test/boost/vector_index_test.cc',
|
||||
'test/boost/view_build_test.cc',
|
||||
'test/boost/view_complex_test.cc',
|
||||
'test/boost/view_schema_ckey_test.cc',
|
||||
|
||||
@@ -105,6 +105,7 @@ public:
|
||||
static const std::chrono::minutes entry_expiry;
|
||||
|
||||
using key_type = prepared_cache_key_type;
|
||||
using pinned_value_type = cache_value_ptr;
|
||||
using value_type = checked_weak_ptr;
|
||||
using statement_is_too_big = typename cache_type::entry_is_too_big;
|
||||
|
||||
@@ -116,9 +117,14 @@ public:
|
||||
: _cache(size, entry_expiry, logger)
|
||||
{}
|
||||
|
||||
template <typename LoadFunc>
|
||||
future<pinned_value_type> get_pinned(const key_type& key, LoadFunc&& load) {
|
||||
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); });
|
||||
}
|
||||
|
||||
template <typename LoadFunc>
|
||||
future<value_type> get(const key_type& key, LoadFunc&& load) {
|
||||
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
|
||||
return get_pinned(key, std::forward<LoadFunc>(load)).then([] (cache_value_ptr v_ptr) {
|
||||
return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
|
||||
});
|
||||
}
|
||||
|
||||
@@ -697,7 +697,7 @@ future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
|
||||
try {
|
||||
auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
|
||||
auto prep_ptr = co_await _prepared_cache.get(key, [this, &query_string, &client_state, d] {
|
||||
auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
|
||||
auto prepared = get_statement(query_string, client_state, d);
|
||||
prepared->calculate_metadata_id();
|
||||
auto bound_terms = prepared->statement->get_bound_terms();
|
||||
@@ -711,13 +711,13 @@ query_processor::prepare(sstring query_string, const service::client_state& clie
|
||||
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
|
||||
});
|
||||
|
||||
const auto& warnings = prep_ptr->warnings;
|
||||
const auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_ptr),
|
||||
co_await utils::get_local_injector().inject(
|
||||
"query_processor_prepare_wait_after_cache_get",
|
||||
utils::wait_for_message(std::chrono::seconds(60)));
|
||||
|
||||
auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
|
||||
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
|
||||
for (const auto& w : warnings) {
|
||||
msg->add_warning(w);
|
||||
}
|
||||
co_return ::shared_ptr<cql_transport::messages::result_message::prepared>(std::move(msg));
|
||||
co_return std::move(msg);
|
||||
} catch(typename prepared_statements_cache::statement_is_too_big&) {
|
||||
throw prepared_statement_is_too_big(query_string);
|
||||
}
|
||||
@@ -1029,6 +1029,11 @@ query_processor::execute_batch_without_checking_exception_message(
|
||||
query_options& options,
|
||||
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
|
||||
auto access_future = co_await coroutine::as_future(batch->check_access(*this, query_state.get_client_state()));
|
||||
bool failed = access_future.failed();
|
||||
co_await audit::inspect(batch, query_state, options, failed);
|
||||
if (failed) {
|
||||
std::rethrow_exception(access_future.get_exception());
|
||||
}
|
||||
co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state] (auto& e) -> future<> {
|
||||
try {
|
||||
co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
|
||||
@@ -1036,11 +1041,6 @@ query_processor::execute_batch_without_checking_exception_message(
|
||||
log.error("failed to cache the entry: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
bool failed = access_future.failed();
|
||||
co_await audit::inspect(batch, query_state, options, failed);
|
||||
if (access_future.failed()) {
|
||||
std::rethrow_exception(access_future.get_exception());
|
||||
}
|
||||
batch->validate();
|
||||
batch->validate(*this, query_state.get_client_state());
|
||||
_stats.queries_by_cl[size_t(options.get_consistency())] += batch->get_statements().size();
|
||||
|
||||
@@ -201,6 +201,10 @@ public:
|
||||
return _clustering_columns_restrictions;
|
||||
}
|
||||
|
||||
const expr::expression& get_nonprimary_key_restrictions() const {
|
||||
return _nonprimary_key_restrictions;
|
||||
}
|
||||
|
||||
// Get a set of columns restricted by the IS NOT NULL restriction.
|
||||
// IS NOT NULL is a special case that is handled separately from other restrictions.
|
||||
const std::unordered_set<const column_definition*> get_not_null_columns() const;
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
|
||||
*/
|
||||
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include "create_index_statement.hh"
|
||||
#include "db/config.hh"
|
||||
@@ -37,6 +38,7 @@
|
||||
#include "types/concrete_types.hh"
|
||||
#include "db/tags/extension.hh"
|
||||
#include "tombstone_gc_extension.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
@@ -116,6 +118,15 @@ static data_type type_for_computed_column(cql3::statements::index_target::target
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_vector_capable_class(const sstring& class_name) {
|
||||
return boost::iequals(class_name, "vector_index");
|
||||
}
|
||||
|
||||
static bool is_vector_index(const index_options_map& options) {
|
||||
auto class_it = options.find(db::index::secondary_index::custom_class_option_name);
|
||||
return class_it != options.end() && is_vector_capable_class(class_it->second);
|
||||
}
|
||||
|
||||
view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im,
|
||||
const data_dictionary::database& db) const
|
||||
{
|
||||
@@ -266,7 +277,7 @@ create_index_statement::validate(query_processor& qp, const service::client_stat
|
||||
_idx_properties->validate();
|
||||
|
||||
// FIXME: This is ugly and can be improved.
|
||||
const bool is_vector_index = _idx_properties->custom_class && *_idx_properties->custom_class == "vector_index";
|
||||
const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class);
|
||||
const bool uses_view_properties = _view_properties.properties()->count() > 0
|
||||
|| _view_properties.use_compact_storage()
|
||||
|| _view_properties.defined_ordering().size() > 0;
|
||||
@@ -697,7 +708,9 @@ index_metadata create_index_statement::make_index_metadata(const std::vector<::s
|
||||
const index_options_map& options)
|
||||
{
|
||||
index_options_map new_options = options;
|
||||
auto target_option = secondary_index::target_parser::serialize_targets(targets);
|
||||
auto target_option = is_vector_index(options)
|
||||
? secondary_index::vector_index::serialize_targets(targets)
|
||||
: secondary_index::target_parser::serialize_targets(targets);
|
||||
new_options.emplace(index_target::target_option_name, target_option);
|
||||
|
||||
const auto& first_target = targets.front()->value;
|
||||
|
||||
@@ -2006,9 +2006,7 @@ static std::optional<ann_ordering_info> get_ann_ordering_info(
|
||||
|
||||
auto indexes = sim.list_indexes();
|
||||
auto it = std::find_if(indexes.begin(), indexes.end(), [&prepared_ann_ordering](const auto& ind) {
|
||||
return (ind.metadata().options().contains(db::index::secondary_index::custom_class_option_name) &&
|
||||
ind.metadata().options().at(db::index::secondary_index::custom_class_option_name) == ANN_CUSTOM_INDEX_OPTION) &&
|
||||
(ind.target_column() == prepared_ann_ordering.first->name_as_text());
|
||||
return secondary_index::vector_index::is_vector_index_on_column(ind.metadata(), prepared_ann_ordering.first->name_as_text());
|
||||
});
|
||||
|
||||
if (it == indexes.end()) {
|
||||
|
||||
@@ -461,7 +461,17 @@ public:
|
||||
}
|
||||
}
|
||||
} else {
|
||||
_background_continuity = true; // Default continuity
|
||||
if (_reversed) [[unlikely]] {
|
||||
if (!rows.empty()) {
|
||||
it = std::prev(rows.end());
|
||||
cont = is_continuous::yes;
|
||||
rt = {};
|
||||
} else {
|
||||
_background_continuity = true;
|
||||
}
|
||||
} else {
|
||||
_background_continuity = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!it) {
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
#include "utils/labels.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
|
||||
namespace cache {
|
||||
|
||||
@@ -1215,10 +1216,10 @@ future<> row_cache::invalidate(external_updater eu, const dht::decorated_key& dk
|
||||
}
|
||||
|
||||
future<> row_cache::invalidate(external_updater eu, const dht::partition_range& range, cache_invalidation_filter filter) {
|
||||
return invalidate(std::move(eu), dht::partition_range_vector({range}), std::move(filter));
|
||||
return invalidate(std::move(eu), utils::chunked_vector<dht::partition_range>({range}), std::move(filter));
|
||||
}
|
||||
|
||||
future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&& ranges, cache_invalidation_filter filter) {
|
||||
future<> row_cache::invalidate(external_updater eu, utils::chunked_vector<dht::partition_range>&& ranges, cache_invalidation_filter filter) {
|
||||
return do_update(std::move(eu), [this, ranges = std::move(ranges), filter = std::move(filter)] mutable {
|
||||
return seastar::async([this, ranges = std::move(ranges), filter = std::move(filter)] {
|
||||
auto on_failure = defer([this] () noexcept {
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "utils/histogram.hh"
|
||||
#include "mutation/partition_version.hh"
|
||||
#include "utils/double-decker.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "db/cache_tracker.hh"
|
||||
#include "readers/empty.hh"
|
||||
#include "readers/mutation_source.hh"
|
||||
@@ -457,7 +458,7 @@ public:
|
||||
// mutation source made prior to the call to invalidate().
|
||||
future<> invalidate(external_updater, const dht::decorated_key&);
|
||||
future<> invalidate(external_updater, const dht::partition_range& = query::full_partition_range, cache_invalidation_filter filter = [] (const auto&) { return true; });
|
||||
future<> invalidate(external_updater, dht::partition_range_vector&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
|
||||
future<> invalidate(external_updater, utils::chunked_vector<dht::partition_range>&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
|
||||
|
||||
// Evicts entries from cache.
|
||||
//
|
||||
|
||||
@@ -105,7 +105,7 @@ namespace {
|
||||
schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if (builder.ks_name() == schema_tables::NAME) {
|
||||
// all schema tables are group0 tables
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -87,31 +87,15 @@ namespace {
|
||||
static const std::unordered_set<sstring> tables = {
|
||||
schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
|
||||
system_keyspace::BROADCAST_KV_STORE,
|
||||
system_keyspace::CDC_GENERATIONS_V3,
|
||||
system_keyspace::RAFT,
|
||||
system_keyspace::RAFT_SNAPSHOTS,
|
||||
system_keyspace::RAFT_SNAPSHOT_CONFIG,
|
||||
system_keyspace::GROUP0_HISTORY,
|
||||
system_keyspace::DISCOVERY,
|
||||
system_keyspace::TABLETS,
|
||||
system_keyspace::TOPOLOGY,
|
||||
system_keyspace::TOPOLOGY_REQUESTS,
|
||||
system_keyspace::LOCAL,
|
||||
system_keyspace::PEERS,
|
||||
system_keyspace::SCYLLA_LOCAL,
|
||||
system_keyspace::COMMITLOG_CLEANUPS,
|
||||
system_keyspace::SERVICE_LEVELS_V2,
|
||||
system_keyspace::VIEW_BUILD_STATUS_V2,
|
||||
system_keyspace::CDC_STREAMS_STATE,
|
||||
system_keyspace::CDC_STREAMS_HISTORY,
|
||||
system_keyspace::ROLES,
|
||||
system_keyspace::ROLE_MEMBERS,
|
||||
system_keyspace::ROLE_ATTRIBUTES,
|
||||
system_keyspace::ROLE_PERMISSIONS,
|
||||
system_keyspace::CDC_LOCAL,
|
||||
system_keyspace::DICTS,
|
||||
system_keyspace::VIEW_BUILDING_TASKS,
|
||||
system_keyspace::CLIENT_ROUTES,
|
||||
};
|
||||
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
|
||||
builder.enable_schema_commitlog();
|
||||
@@ -143,7 +127,7 @@ namespace {
|
||||
system_keyspace::REPAIR_TASKS,
|
||||
};
|
||||
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -930,8 +930,7 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
|
||||
const row& existing_row = existing.cells();
|
||||
const row& updated_row = update.cells();
|
||||
|
||||
const bool base_has_nonexpiring_marker = update.marker().is_live() && !update.marker().is_expiring();
|
||||
return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row, base_has_nonexpiring_marker] (const column_definition& cdef) {
|
||||
return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row] (const column_definition& cdef) {
|
||||
const auto view_it = _view->columns_by_name().find(cdef.name());
|
||||
const bool column_is_selected = view_it != _view->columns_by_name().end();
|
||||
|
||||
@@ -939,49 +938,29 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
|
||||
// as part of its PK, there are NO virtual columns corresponding to the unselected columns in the view.
|
||||
// Because of that, we don't generate view updates when the value in an unselected column is created
|
||||
// or changes.
|
||||
if (!column_is_selected && _base_info.has_base_non_pk_columns_in_view_pk) {
|
||||
if (!column_is_selected) {
|
||||
return true;
|
||||
}
|
||||
|
||||
//TODO(sarna): Optimize collections case - currently they do not go under optimization
|
||||
if (!cdef.is_atomic()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// We cannot skip if the value was created or deleted, unless we have a non-expiring marker
|
||||
// We cannot skip if the value was created or deleted
|
||||
const auto* existing_cell = existing_row.find_cell(cdef.id);
|
||||
const auto* updated_cell = updated_row.find_cell(cdef.id);
|
||||
if (existing_cell == nullptr || updated_cell == nullptr) {
|
||||
return existing_cell == updated_cell || (!column_is_selected && base_has_nonexpiring_marker);
|
||||
return existing_cell == updated_cell;
|
||||
}
|
||||
|
||||
if (!cdef.is_atomic()) {
|
||||
return existing_cell->as_collection_mutation().data == updated_cell->as_collection_mutation().data;
|
||||
}
|
||||
|
||||
atomic_cell_view existing_cell_view = existing_cell->as_atomic_cell(cdef);
|
||||
atomic_cell_view updated_cell_view = updated_cell->as_atomic_cell(cdef);
|
||||
|
||||
// We cannot skip when a selected column is changed
|
||||
if (column_is_selected) {
|
||||
if (view_it->second->is_view_virtual()) {
|
||||
return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
|
||||
}
|
||||
return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
|
||||
if (view_it->second->is_view_virtual()) {
|
||||
return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
|
||||
}
|
||||
|
||||
// With non-expiring row marker, liveness checks below are not relevant
|
||||
if (base_has_nonexpiring_marker) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (existing_cell_view.is_live() != updated_cell_view.is_live()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// We cannot skip if the change updates TTL
|
||||
const bool existing_has_ttl = existing_cell_view.is_live_and_has_ttl();
|
||||
const bool updated_has_ttl = updated_cell_view.is_live_and_has_ttl();
|
||||
if (existing_has_ttl || updated_has_ttl) {
|
||||
return existing_has_ttl == updated_has_ttl && existing_cell_view.expiry() == updated_cell_view.expiry();
|
||||
}
|
||||
|
||||
return true;
|
||||
return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1749,7 +1728,7 @@ static endpoints_to_update get_view_natural_endpoint_vnodes(
|
||||
std::vector<std::reference_wrapper<const locator::node>> base_nodes,
|
||||
std::vector<std::reference_wrapper<const locator::node>> view_nodes,
|
||||
locator::endpoint_dc_rack my_location,
|
||||
const locator::network_topology_strategy* network_topology,
|
||||
const bool network_topology,
|
||||
replica::cf_stats& cf_stats) {
|
||||
using node_vector = std::vector<std::reference_wrapper<const locator::node>>;
|
||||
node_vector base_endpoints, view_endpoints;
|
||||
@@ -1902,7 +1881,7 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
locator::host_id me,
|
||||
const locator::effective_replication_map_ptr& base_erm,
|
||||
const locator::effective_replication_map_ptr& view_erm,
|
||||
const locator::abstract_replication_strategy& replication_strategy,
|
||||
const bool network_topology,
|
||||
const dht::token& base_token,
|
||||
const dht::token& view_token,
|
||||
bool use_tablets,
|
||||
@@ -1910,7 +1889,6 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
auto& topology = base_erm->get_token_metadata_ptr()->get_topology();
|
||||
auto& view_topology = view_erm->get_token_metadata_ptr()->get_topology();
|
||||
auto& my_location = topology.get_location(me);
|
||||
auto* network_topology = dynamic_cast<const locator::network_topology_strategy*>(&replication_strategy);
|
||||
|
||||
auto resolve = [&] (const locator::topology& topology, const locator::host_id& ep, bool is_view) -> const locator::node& {
|
||||
if (auto* np = topology.find_node(ep)) {
|
||||
@@ -1944,7 +1922,7 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
// view pairing as the leaving base replica.
|
||||
// note that the recursive call will not recurse again because leaving_base is in base_nodes.
|
||||
auto leaving_base = it->get().host_id();
|
||||
return get_view_natural_endpoint(leaving_base, base_erm, view_erm, replication_strategy, base_token,
|
||||
return get_view_natural_endpoint(leaving_base, base_erm, view_erm, network_topology, base_token,
|
||||
view_token, use_tablets, cf_stats);
|
||||
}
|
||||
}
|
||||
@@ -2040,7 +2018,9 @@ future<> view_update_generator::mutate_MV(
|
||||
wait_for_all_updates wait_for_all)
|
||||
{
|
||||
auto& ks = _db.find_keyspace(base->ks_name());
|
||||
auto& replication = ks.get_replication_strategy();
|
||||
const bool uses_tablets = ks.uses_tablets();
|
||||
const bool uses_nts = dynamic_cast<const locator::network_topology_strategy*>(&ks.get_replication_strategy()) != nullptr;
|
||||
// The object pointed by `ks` may disappear after preeemption. It should not be touched again after this comment.
|
||||
std::unordered_map<table_id, locator::effective_replication_map_ptr> erms;
|
||||
auto get_erm = [&] (table_id id) {
|
||||
auto it = erms.find(id);
|
||||
@@ -2059,8 +2039,8 @@ future<> view_update_generator::mutate_MV(
|
||||
co_await max_concurrent_for_each(view_updates, max_concurrent_updates, [&] (frozen_mutation_and_schema mut) mutable -> future<> {
|
||||
auto view_token = dht::get_token(*mut.s, mut.fm.key());
|
||||
auto view_ermp = erms.at(mut.s->id());
|
||||
auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, replication, base_token, view_token,
|
||||
ks.uses_tablets(), cf_stats);
|
||||
auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, uses_nts, base_token, view_token,
|
||||
uses_tablets, cf_stats);
|
||||
auto remote_endpoints = view_ermp->get_pending_replicas(view_token);
|
||||
auto memory_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_update_memory_units.split(memory_usage_of(mut)));
|
||||
if (no_pairing_endpoint) {
|
||||
|
||||
@@ -303,7 +303,7 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
locator::host_id node,
|
||||
const locator::effective_replication_map_ptr& base_erm,
|
||||
const locator::effective_replication_map_ptr& view_erm,
|
||||
const locator::abstract_replication_strategy& replication_strategy,
|
||||
const bool network_topology,
|
||||
const dht::token& base_token,
|
||||
const dht::token& view_token,
|
||||
bool use_tablets,
|
||||
|
||||
@@ -200,9 +200,7 @@ future<> view_building_worker::run_staging_sstables_registrator() {
|
||||
while (!_as.abort_requested()) {
|
||||
bool sleep = false;
|
||||
try {
|
||||
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
|
||||
co_await create_staging_sstable_tasks();
|
||||
lock.return_all();
|
||||
_as.check();
|
||||
co_await _sstables_to_register_event.when();
|
||||
} catch (semaphore_aborted&) {
|
||||
@@ -227,13 +225,45 @@ future<> view_building_worker::run_staging_sstables_registrator() {
|
||||
}
|
||||
}
|
||||
|
||||
future<std::vector<foreign_ptr<semaphore_units<>>>> view_building_worker::lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards) {
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
// Collect `_staging_sstables_mutex` locks from multiple shards,
|
||||
// so other shards won't interact with their `_staging_sstables` map
|
||||
// until the caller releases them.
|
||||
std::vector<foreign_ptr<semaphore_units<>>> locks;
|
||||
locks.resize(smp::count);
|
||||
// Locks are acquired from multiple shards in parallel.
|
||||
// This is the only place where multiple-shard locks are acquired at once
|
||||
// and the method is called only once at a time (from `create_staging_sstable_tasks()`
|
||||
// on shard 0), so no deadlock may occur.
|
||||
co_await coroutine::parallel_for_each(shards, [&locks, &sharded_vbw = container()] (auto shard_id) -> future<> {
|
||||
auto lock_ptr = co_await smp::submit_to(shard_id, [&sharded_vbw] () -> future<foreign_ptr<semaphore_units<>>> {
|
||||
auto& vbw = sharded_vbw.local();
|
||||
auto lock = co_await get_units(vbw._staging_sstables_mutex, 1, vbw._as);
|
||||
co_return make_foreign(std::move(lock));
|
||||
});
|
||||
locks[shard_id] = std::move(lock_ptr);
|
||||
});
|
||||
co_return std::move(locks);
|
||||
}
|
||||
|
||||
future<> view_building_worker::create_staging_sstable_tasks() {
|
||||
// Explicitly lock shard0 beforehand to prevent other shards from modifying `_sstables_to_register` from `register_staging_sstable_tasks()`
|
||||
auto lock0 = co_await get_units(_staging_sstables_mutex, 1, _as);
|
||||
|
||||
if (_sstables_to_register.empty()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
auto shards = _sstables_to_register
|
||||
| std::views::values
|
||||
| std::views::join
|
||||
| std::views::transform([] (const auto& sst_info) { return sst_info.shard; })
|
||||
| std::ranges::to<std::flat_set<shard_id>>();
|
||||
shards.erase(0); // We're already holding shard0 lock
|
||||
auto locks = co_await lock_staging_mutex_on_multiple_shards(std::move(shards));
|
||||
|
||||
utils::chunked_vector<canonical_mutation> cmuts;
|
||||
auto guard = co_await _group0.client().start_operation(_as);
|
||||
auto my_host_id = _db.get_token_metadata().get_topology().my_host_id();
|
||||
for (auto& [table_id, sst_infos]: _sstables_to_register) {
|
||||
@@ -672,24 +702,34 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
|
||||
}
|
||||
|
||||
future<> view_building_worker::do_process_staging(table_id table_id, dht::token last_token) {
|
||||
if (_staging_sstables[table_id].empty()) {
|
||||
auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
|
||||
std::vector<sstables::shared_sstable> sstables_to_process;
|
||||
|
||||
try {
|
||||
// Acquire `_staging_sstables_mutex` to prevent `create_staging_sstable_tasks()` from
|
||||
// concurrently modifying `_staging_sstables` (moving entries from `_sstables_to_register`)
|
||||
// while we read them.
|
||||
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
|
||||
auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
|
||||
auto tid = tablet_map.get_tablet_id(last_token);
|
||||
auto tablet_range = tablet_map.get_token_range(tid);
|
||||
|
||||
// Select sstables belonging to the tablet (identified by `last_token`)
|
||||
for (auto& sst: _staging_sstables[table_id]) {
|
||||
auto sst_last_token = sst->get_last_decorated_key().token();
|
||||
if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
|
||||
sstables_to_process.push_back(sst);
|
||||
}
|
||||
}
|
||||
lock.return_all();
|
||||
} catch (semaphore_aborted&) {
|
||||
vbw_logger.warn("Semaphore was aborted while waiting to removed processed sstables for table {}", table_id);
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
|
||||
auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
|
||||
auto tid = tablet_map.get_tablet_id(last_token);
|
||||
auto tablet_range = tablet_map.get_token_range(tid);
|
||||
|
||||
// Select sstables belonging to the tablet (identified by `last_token`)
|
||||
std::vector<sstables::shared_sstable> sstables_to_process;
|
||||
for (auto& sst: _staging_sstables[table_id]) {
|
||||
auto sst_last_token = sst->get_last_decorated_key().token();
|
||||
if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
|
||||
sstables_to_process.push_back(sst);
|
||||
}
|
||||
if (sstables_to_process.empty()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
co_await _vug.process_staging_sstables(std::move(table), sstables_to_process);
|
||||
|
||||
try {
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <flat_set>
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
#include "locator/tablets.hh"
|
||||
#include "raft/raft.hh"
|
||||
@@ -169,10 +170,15 @@ private:
|
||||
future<> do_process_staging(table_id base_id, dht::token last_token);
|
||||
|
||||
future<> run_staging_sstables_registrator();
|
||||
// Caller must hold units from `_staging_sstables_mutex`
|
||||
// Acquires `_staging_sstables_mutex` on all shards internally,
|
||||
// so callers must not hold `_staging_sstables_mutex` when invoking it.
|
||||
future<> create_staging_sstable_tasks();
|
||||
future<> discover_existing_staging_sstables();
|
||||
std::unordered_map<table_id, std::vector<staging_sstable_task_info>> discover_local_staging_sstables(building_tasks building_tasks);
|
||||
// Acquire `_staging_sstables_mutex` on multiple shards in parallel.
|
||||
// Must be called only from shard 0.
|
||||
// Must be called ONLY by `create_staging_sstable_tasks()` and only once at a time to avoid deadlock.
|
||||
future<std::vector<foreign_ptr<semaphore_units<>>>> lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards);
|
||||
|
||||
void init_messaging_service();
|
||||
future<> uninit_messaging_service();
|
||||
|
||||
@@ -352,6 +352,16 @@ dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& r
|
||||
return prs;
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges) {
|
||||
utils::chunked_vector<dht::partition_range> prs;
|
||||
prs.reserve(ranges.size());
|
||||
for (auto& range : ranges) {
|
||||
prs.push_back(dht::to_partition_range(range));
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
co_return prs;
|
||||
}
|
||||
|
||||
std::map<unsigned, dht::partition_range_vector>
|
||||
split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& raw_sharder) {
|
||||
std::map<unsigned, dht::partition_range_vector> ret;
|
||||
@@ -364,11 +374,11 @@ split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& r
|
||||
return ret;
|
||||
}
|
||||
|
||||
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& source_ranges, dht::partition_range_vector ranges_to_subtract) {
|
||||
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> source_ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract) {
|
||||
auto cmp = dht::ring_position_comparator(schema);
|
||||
// optimize set of potentially overlapping ranges by deoverlapping them.
|
||||
auto ranges = dht::partition_range::deoverlap(source_ranges, cmp);
|
||||
dht::partition_range_vector res;
|
||||
auto ranges = dht::partition_range::deoverlap(std::move(source_ranges), cmp);
|
||||
utils::chunked_vector<dht::partition_range> res;
|
||||
res.reserve(ranges.size() * 2);
|
||||
|
||||
auto range = ranges.begin();
|
||||
|
||||
@@ -91,6 +91,7 @@ inline token get_token(const schema& s, partition_key_view key) {
|
||||
|
||||
dht::partition_range to_partition_range(dht::token_range);
|
||||
dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& ranges, utils::can_yield can_yield = utils::can_yield::no);
|
||||
future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges);
|
||||
|
||||
// Each shard gets a sorted, disjoint vector of ranges
|
||||
std::map<unsigned, dht::partition_range_vector>
|
||||
@@ -105,7 +106,7 @@ std::unique_ptr<dht::i_partitioner> make_partitioner(sstring name);
|
||||
// Returns a sorted and deoverlapped list of ranges that are
|
||||
// the result of subtracting all ranges from ranges_to_subtract.
|
||||
// ranges_to_subtract must be sorted and deoverlapped.
|
||||
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& ranges, dht::partition_range_vector ranges_to_subtract);
|
||||
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract);
|
||||
|
||||
// Returns a token_range vector split based on the given number of most-significant bits
|
||||
dht::token_range_vector split_token_range_msb(unsigned most_significant_bits);
|
||||
|
||||
63
dht/token.hh
63
dht/token.hh
@@ -30,6 +30,31 @@ enum class token_kind {
|
||||
after_all_keys,
|
||||
};
|
||||
|
||||
// Represents a token for partition keys.
|
||||
// Has a disengaged state, which sorts before all engaged states.
|
||||
struct raw_token {
|
||||
int64_t value;
|
||||
|
||||
/// Constructs a disengaged token.
|
||||
raw_token() : value(std::numeric_limits<int64_t>::min()) {}
|
||||
|
||||
/// Constructs an engaged token.
|
||||
/// The token must be of token_kind::key kind.
|
||||
explicit raw_token(const token&);
|
||||
|
||||
explicit raw_token(int64_t v) : value(v) {};
|
||||
|
||||
std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
|
||||
std::strong_ordering operator<=>(const token& o) const noexcept;
|
||||
|
||||
/// Returns true iff engaged.
|
||||
explicit operator bool() const noexcept {
|
||||
return value != std::numeric_limits<int64_t>::min();
|
||||
}
|
||||
};
|
||||
|
||||
using raw_token_opt = seastar::optimized_optional<raw_token>;
|
||||
|
||||
class token {
|
||||
// INT64_MIN is not a legal token, but a special value used to represent
|
||||
// infinity in token intervals.
|
||||
@@ -52,6 +77,10 @@ public:
|
||||
|
||||
constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}
|
||||
|
||||
token(raw_token raw) noexcept
|
||||
: token(raw ? kind::key : kind::before_all_keys, raw.value)
|
||||
{ }
|
||||
|
||||
// This constructor seems redundant with the bytes_view constructor, but
|
||||
// it's necessary for IDL, which passes a deserialized_bytes_proxy here.
|
||||
// (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
|
||||
@@ -223,6 +252,29 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
raw_token::raw_token(const token& t)
|
||||
: value(t.raw())
|
||||
{
|
||||
#ifdef DEBUG
|
||||
assert(t._kind == token::kind::key);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline
|
||||
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
|
||||
switch (o._kind) {
|
||||
case token::kind::after_all_keys:
|
||||
return std::strong_ordering::less;
|
||||
case token::kind::before_all_keys:
|
||||
// before_all_keys has a raw value set to the same raw value as a disengaged raw_token, and sorts before all keys.
|
||||
// So we can order them by just comparing raw values.
|
||||
[[fallthrough]];
|
||||
case token::kind::key:
|
||||
return value <=> o._data;
|
||||
}
|
||||
}
|
||||
|
||||
inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
|
||||
if (l1 == l2) {
|
||||
return std::strong_ordering::equal;
|
||||
@@ -329,6 +381,17 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
|
||||
template <typename FormatContext>
|
||||
auto format(const dht::raw_token& t, FormatContext& ctx) const {
|
||||
if (!t) {
|
||||
return fmt::format_to(ctx.out(), "null");
|
||||
}
|
||||
return fmt::format_to(ctx.out(), "{}", t.value);
|
||||
}
|
||||
};
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
|
||||
28
dist/common/scripts/scylla_swap_setup
vendored
28
dist/common/scripts/scylla_swap_setup
vendored
@@ -9,6 +9,7 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shlex
|
||||
import argparse
|
||||
import psutil
|
||||
from pathlib import Path
|
||||
@@ -103,16 +104,41 @@ if __name__ == '__main__':
|
||||
run('dd if=/dev/zero of={} bs=1M count={}'.format(swapfile, swapsize_mb), shell=True, check=True)
|
||||
swapfile.chmod(0o600)
|
||||
run('mkswap -f {}'.format(swapfile), shell=True, check=True)
|
||||
|
||||
mount_point = find_mount_point(swap_directory)
|
||||
mount_unit = out(f'systemd-escape -p --suffix=mount {shlex.quote(str(mount_point))}')
|
||||
|
||||
# Add DefaultDependencies=no to the swap unit to avoid getting the default
|
||||
# Before=swap.target dependency. We apply this to all clouds, but the
|
||||
# requirement came from Azure:
|
||||
#
|
||||
# On Azure, the swap directory is on the Azure ephemeral disk (mounted on /mnt).
|
||||
# However, cloud-init makes this mount (i.e., the mnt.mount unit) depend on
|
||||
# the network (After=network-online.target). By extension, this means that
|
||||
# the swap unit depends on the network. If we didn't use DefaultDependencies=no,
|
||||
# then the swap unit would be part of the swap.target which other services
|
||||
# assume to be a local boot target, so we would end up with dependency cycles
|
||||
# such as:
|
||||
#
|
||||
# swap.target -> mnt-swapfile.swap -> mnt.mount -> network-online.target -> network.target -> systemd-resolved.service -> tmp.mount -> swap.target
|
||||
#
|
||||
# By removing the automatic Before=swap.target, the swap unit is no longer
|
||||
# part of swap.target, avoiding such cycles. The swap will still be
|
||||
# activated via WantedBy=multi-user.target.
|
||||
unit_data = '''
|
||||
[Unit]
|
||||
Description=swapfile
|
||||
DefaultDependencies=no
|
||||
After={}
|
||||
Conflicts=umount.target
|
||||
Before=umount.target
|
||||
|
||||
[Swap]
|
||||
What={}
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
'''[1:-1].format(swapfile)
|
||||
'''[1:-1].format(mount_unit, swapfile)
|
||||
with swapunit.open('w') as f:
|
||||
f.write(unit_data)
|
||||
systemd_unit.reload()
|
||||
|
||||
@@ -31,7 +31,7 @@ was used. Alternator currently supports two compression algorithms, `gzip`
|
||||
and `deflate`, both standardized in ([RFC 9110](https://www.rfc-editor.org/rfc/rfc9110.html)).
|
||||
Other standard compression types which are listed in
|
||||
[IANA's HTTP Content Coding Registry](https://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding),
|
||||
including `zstd` ([RFC 8878][https://www.rfc-editor.org/rfc/rfc8878.html]),
|
||||
including `zstd` ([RFC 8878](https://www.rfc-editor.org/rfc/rfc8878.html)),
|
||||
are not yet supported by Alternator.
|
||||
|
||||
Note that HTTP's compression only compresses the request's _body_ - not the
|
||||
|
||||
@@ -437,6 +437,36 @@ To migrate a keyspace from a numeric replication factor to a rack-list replicati
|
||||
ALTER KEYSPACE Excelsior
|
||||
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : ['RAC1', 'RAC2', 'RAC3'], 'dc2' : ['RAC4']} AND tablets = { 'enabled': true };
|
||||
|
||||
.. _fix-rf-change-tablet-rebuilds:
|
||||
|
||||
Fixing invalid replica state with RF change
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If a tablet rebuild fails during an RF change, the state of replicas will be invalid, even though the RF change is marked as successful. The missing replicas will be eventually added in the background. However, until then, the following RF changes will fail.
|
||||
|
||||
To fix the state of replicas in the foreground, retry the previous ALTER KEYSPACE statement, i.e. update the replication factor to the same value it currently has.
|
||||
|
||||
For example, if the following statement fails due to invalid replica state:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 1} AND tablets = { 'enabled': true };
|
||||
|
||||
Check the current replication factor with DESCRIBE KEYSPACE:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE Excelsior;
|
||||
CREATE KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 2} AND tablets = { 'enabled': true };
|
||||
|
||||
Ensure that reaching the valid replicas state is possible (e.g. there is enough non-excluded racks) and alter keyspace with the current replication factor:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 2} AND tablets = { 'enabled': true };
|
||||
|
||||
This should fix the state of replicas and allow future RF changes to succeed.
|
||||
|
||||
.. _drop-keyspace-statement:
|
||||
|
||||
DROP KEYSPACE
|
||||
|
||||
@@ -281,8 +281,8 @@ For example::
|
||||
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
|
||||
|
||||
|
||||
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
|
||||
or columns provided in a definition of the index.
|
||||
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
|
||||
See :ref:`WHERE <where-clause>`.
|
||||
|
||||
For example::
|
||||
|
||||
@@ -290,10 +290,6 @@ For example::
|
||||
WHERE user_id = 'user123'
|
||||
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
|
||||
|
||||
The supported operations are equal relations (``=`` and ``IN``) with restrictions as in regular ``WHERE`` clauses. See :ref:`WHERE <where-clause>`.
|
||||
|
||||
Other filtering scenarios are currently not supported.
|
||||
|
||||
.. note::
|
||||
|
||||
Vector indexes are supported in ScyllaDB Cloud only in clusters that have the Vector Search feature enabled.
|
||||
|
||||
@@ -37,8 +37,17 @@ Global index's target is usually just the indexed column name, unless the index
|
||||
- index on map, set or list values: VALUES(v)
|
||||
- index on map entries: ENTRIES(v)
|
||||
|
||||
Their serialization is just string representation, so:
|
||||
"v", "FULL(v)", "KEYS(v)", "VALUES(v)", "ENTRIES(v)" are all valid targets.
|
||||
Their serialization uses lowercase type names as prefixes, except for `full` which is serialized
|
||||
as just the column name (without any prefix):
|
||||
`"v"`, `"keys(v)"`, `"values(v)"`, `"entries(v)"` are valid targets; a frozen full collection
|
||||
index on column `v` is stored simply as `"v"` (same as a regular index).
|
||||
|
||||
If the column name contains characters that could be confused with the above formats
|
||||
(e.g., a name containing parentheses or braces), it is escaped using the CQL
|
||||
quoted-identifier syntax (column_identifier::to_cql_string()), which wraps the
|
||||
name in double quotes and doubles any embedded double-quote characters. For example,
|
||||
a column named `hEllo` is stored as `"hEllo"`, and a column named `keys(m)` is
|
||||
stored as `"keys(m)"`.
|
||||
|
||||
## Local index
|
||||
|
||||
|
||||
10
docs/dev/vector_index.md
Normal file
10
docs/dev/vector_index.md
Normal file
@@ -0,0 +1,10 @@
|
||||
# Vector index in Scylla
|
||||
|
||||
Vector indexes are custom indexes (USING 'vector\_index'). Their `target` option in `system_schema.indexes` uses following format:
|
||||
|
||||
- Simple single-column vector index `(v)`: just the (escaped) column name, e.g. `v`
|
||||
- Vector index with filtering columns `(v, f1, f2)`: JSON with `tc` (target column) and `fc` (filtering columns): `{"tc":"v","fc":["f1","f2"]}`
|
||||
- Local vector index `((p1, p2), v)`: JSON with `tc` and `pk` (partition key columns): `{"tc":"v","pk":["p1","p2"]}`
|
||||
- Local vector index with filtering columns `((p1, p2), v, f1, f2)`: JSON with `tc`, `pk`, and `fc`: `{"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}`
|
||||
|
||||
The `target` option acts as the interface for the vector-store service, providing the metadata necessary to determine which columns are indexed and how they are structured.
|
||||
@@ -52,7 +52,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list http://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
@@ -125,7 +125,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L http://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
|
||||
@@ -133,19 +133,19 @@ Install ScyllaDB
|
||||
|
||||
sudo yum install scylla
|
||||
|
||||
Running the command installs the latest official version of ScyllaDB Open Source.
|
||||
Alternatively, you can install a specific patch version:
|
||||
Running the command installs the latest official version of ScyllaDB.
|
||||
Alternatively, you can install a specific patch version:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo yum install scylla-<your patch version>
|
||||
|
||||
Example: The following example shows the command to install ScyllaDB 5.2.3.
|
||||
Example: The following example shows installing ScyllaDB 2025.3.1.
|
||||
|
||||
.. code-block:: console
|
||||
:class: hide-copy-button
|
||||
|
||||
sudo yum install scylla-5.2.3
|
||||
sudo yum install scylla-2025.3.1
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
|
||||
|
||||
@@ -36,11 +36,8 @@ release versions, run:
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
|
||||
|
||||
|
||||
Versions 2025.1 and Later
|
||||
==============================
|
||||
|
||||
Run the command with the ``--scylla-version`` option to specify the version
|
||||
you want to install.
|
||||
To install a non-default version, run the command with the ``--scylla-version``
|
||||
option to specify the version you want to install.
|
||||
|
||||
**Example**
|
||||
|
||||
@@ -50,20 +47,4 @@ you want to install.
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|
|
||||
|
||||
|
||||
Versions Earlier than 2025.1
|
||||
================================
|
||||
|
||||
To install a supported version of *ScyllaDB Enterprise*, run the command with:
|
||||
|
||||
* ``--scylla-product scylla-enterprise`` to specify that you want to install
|
||||
ScyllaDB Enterprise.
|
||||
* ``--scylla-version`` to specify the version you want to install.
|
||||
|
||||
For example:
|
||||
|
||||
.. code:: console
|
||||
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1
|
||||
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
@@ -0,0 +1,492 @@
|
||||
=================================================
|
||||
Cluster Platform Migration Using Node Cycling
|
||||
=================================================
|
||||
|
||||
This procedure describes how to migrate a ScyllaDB cluster to new instance types
|
||||
using the add-and-replace approach, which is commonly used for:
|
||||
|
||||
* Migrating from one CPU architecture to another (e.g., x86_64 to ARM/Graviton)
|
||||
* Upgrading to newer instance types with better performance
|
||||
* Changing instance families within the same cloud provider
|
||||
|
||||
The add-and-replace approach maintains data replication throughout the migration
|
||||
and ensures zero downtime for client applications.
|
||||
|
||||
.. note::
|
||||
|
||||
This procedure does **not** change the ScyllaDB software version. All nodes
|
||||
(both existing and new) must run the same ScyllaDB version. For software
|
||||
version upgrades, see :doc:`Upgrade </upgrade/index>`.
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
The add-and-replace migration follows these steps:
|
||||
|
||||
#. Add new nodes (on target instance type) to the existing cluster
|
||||
#. Wait for data to stream to the new nodes
|
||||
#. Decommission old nodes (on source instance type)
|
||||
|
||||
This approach keeps the cluster operational throughout the migration while
|
||||
maintaining the configured replication factor.
|
||||
|
||||
Key characteristics
|
||||
===================
|
||||
|
||||
* **Zero downtime**: Client applications continue to operate during migration
|
||||
* **Data safety**: Replication factor is maintained throughout the process
|
||||
* **Flexible**: Works with both vnodes and tablets-enabled clusters
|
||||
* **Multi-DC support**: Can migrate nodes across multiple datacenters
|
||||
|
||||
.. warning::
|
||||
|
||||
Ensure your cluster has sufficient capacity during the migration. At the peak
|
||||
of the process, your cluster will temporarily have double the number of nodes.
|
||||
|
||||
Prerequisites
|
||||
-------------
|
||||
|
||||
Check cluster health
|
||||
====================
|
||||
|
||||
Before starting the migration, verify that your cluster is healthy:
|
||||
|
||||
#. Check that all nodes are in Up Normal (UN) status:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
All nodes should show ``UN`` status. Do not proceed if any nodes are down.
|
||||
|
||||
#. Ensure no streaming or repair operations are in progress:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool netstats
|
||||
nodetool compactionstats
|
||||
|
||||
Plan the migration
|
||||
==================
|
||||
|
||||
Before provisioning new instances, plan the following:
|
||||
|
||||
**Instance type mapping**: Identify the source and target instance types.
|
||||
If your cluster uses vnodes (not tablets), consider that mismatched shard
|
||||
counts between source and target instance types can cause slower repairs.
|
||||
With tablets enabled, shard count mismatch is fully supported.
|
||||
|
||||
**Rack assignment planning**: Each new node must be assigned to the same rack
|
||||
as the node it will replace. This maintains rack-aware topology for:
|
||||
|
||||
* Rack-aware replication (NetworkTopologyStrategy)
|
||||
* Proper data distribution across failure domains
|
||||
* Minimizing data movement during decommission
|
||||
|
||||
Example mapping for a 3-node cluster:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
Source nodes (to be decommissioned): Target nodes (to be added):
|
||||
192.168.1.10 - RACK0 → 192.168.2.10 - RACK0
|
||||
192.168.1.11 - RACK1 → 192.168.2.11 - RACK1
|
||||
192.168.1.12 - RACK2 → 192.168.2.12 - RACK2
|
||||
|
||||
Create a backup
|
||||
===============
|
||||
|
||||
Back up the data before starting the migration. One of the following
|
||||
methods can be used:
|
||||
|
||||
* **ScyllaDB Manager** (recommended): Use ScyllaDB Manager to perform a
|
||||
cluster-wide backup. See the
|
||||
`ScyllaDB Manager documentation <https://manager.docs.scylladb.com/stable/backup/>`_
|
||||
for details.
|
||||
|
||||
* **Snapshots**: On each node in the cluster, create a snapshot:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool snapshot -t pre_migration_backup
|
||||
nodetool listsnapshots
|
||||
|
||||
.. note::
|
||||
|
||||
Snapshots are local to each node and do not protect against node or disk
|
||||
failure. For full disaster recovery, use ScyllaDB Manager backup.
|
||||
|
||||
|
||||
Procedure
|
||||
---------
|
||||
|
||||
Adding new nodes
|
||||
================
|
||||
|
||||
#. Provision new instances with the target instance type. Ensure:
|
||||
|
||||
* The same ScyllaDB version as existing nodes
|
||||
* Same network configuration and security groups
|
||||
* Appropriate storage configuration
|
||||
|
||||
#. On each new node, configure ``/etc/scylla/scylla.yaml`` to join the existing
|
||||
cluster:
|
||||
|
||||
* **cluster_name**: Must match the existing cluster name
|
||||
* **seeds**: IP address of an existing node in the cluster (used to discover cluster topology on join)
|
||||
* **endpoint_snitch**: Must match the existing cluster configuration
|
||||
* **listen_address**: IP address of the new node
|
||||
* **rpc_address**: IP address of the new node
|
||||
|
||||
All other cluster-wide settings (tablets configuration, encryption settings,
|
||||
experimental features, etc.) must match the existing nodes.
|
||||
|
||||
.. caution::
|
||||
|
||||
Make sure that the ScyllaDB version on the new node is identical to the
|
||||
version on the other nodes in the cluster. Running nodes with different
|
||||
versions is not supported.
|
||||
|
||||
#. If using ``GossipingPropertyFileSnitch``, configure
|
||||
``/etc/scylla/cassandra-rackdc.properties`` with the correct datacenter
|
||||
and rack assignment for this node:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
dc = <datacenter-name>
|
||||
rack = <rack-name>
|
||||
prefer_local = true
|
||||
|
||||
.. warning::
|
||||
|
||||
Each node must have the correct rack assignment. Using the same rack for
|
||||
all new nodes breaks rack-aware replication topology.
|
||||
|
||||
#. Start ScyllaDB on the new node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo systemctl start scylla-server
|
||||
|
||||
For Docker deployments:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker exec -it <container-name> supervisorctl start scylla
|
||||
|
||||
#. Monitor the bootstrap process from an existing node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
The new node will appear with ``UJ`` (Up, Joining) status while streaming
|
||||
data from existing nodes. Wait until it transitions to ``UN`` (Up, Normal).
|
||||
|
||||
**Example output during bootstrap:**
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
Datacenter: dc1
|
||||
Status=Up/Down
|
||||
State=Normal/Leaving/Joining/Moving
|
||||
-- Address Load Tokens Owns Host ID Rack
|
||||
UN 192.168.1.10 500 MB 256 33.3% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
|
||||
UN 192.168.1.11 500 MB 256 33.3% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
|
||||
UN 192.168.1.12 500 MB 256 33.3% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
|
||||
UJ 192.168.2.10 250 MB 256 ? a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
|
||||
|
||||
**Example output after bootstrap completes:**
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
Datacenter: dc1
|
||||
Status=Up/Down
|
||||
State=Normal/Leaving/Joining/Moving
|
||||
-- Address Load Tokens Owns Host ID Rack
|
||||
UN 192.168.1.10 400 MB 256 25.0% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
|
||||
UN 192.168.1.11 400 MB 256 25.0% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
|
||||
UN 192.168.1.12 400 MB 256 25.0% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
|
||||
UN 192.168.2.10 400 MB 256 25.0% a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
|
||||
|
||||
#. For tablets-enabled clusters, wait for tablet load balancing to complete.
|
||||
After the node reaches ``UN`` status, verify no streaming is in progress:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool netstats
|
||||
|
||||
Wait until output shows "Not sending any streams" and no active receiving streams.
|
||||
|
||||
#. Repeat steps 1-6 for each new node to be added.
|
||||
|
||||
.. note::
|
||||
|
||||
You can add multiple nodes in parallel if they are in different datacenters.
|
||||
Within a single datacenter, add nodes one at a time for best results.
|
||||
|
||||
|
||||
Updating seed node configuration
|
||||
================================
|
||||
|
||||
If any of your original nodes are configured as seed nodes, you must update
|
||||
the seed configuration before decommissioning them.
|
||||
|
||||
#. Check the current seed configuration on any node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
grep -A 4 "seed_provider" /etc/scylla/scylla.yaml
|
||||
|
||||
#. If the seeds include nodes you plan to decommission, update ``scylla.yaml``
|
||||
on **all new nodes** to use the new node IPs as seeds:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
seed_provider:
|
||||
- class_name: org.apache.cassandra.locator.SimpleSeedProvider
|
||||
parameters:
|
||||
- seeds: "192.168.2.10,192.168.2.11,192.168.2.12"
|
||||
|
||||
.. note::
|
||||
|
||||
Updating seed configuration on the **old nodes** (that will be
|
||||
decommissioned) is optional. Seeds are only used during node startup
|
||||
to discover the cluster. If you don't plan to restart the old nodes
|
||||
before decommissioning them, their seed configuration doesn't matter.
|
||||
However, updating all nodes is recommended for safety in case an old
|
||||
node unexpectedly restarts during the migration.
|
||||
|
||||
#. Restart ScyllaDB on each new node (one at a time) to apply the new seed
|
||||
configuration:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo systemctl restart scylla-server
|
||||
|
||||
Wait for the node to fully start before restarting the next node.
|
||||
|
||||
#. After restarting the new nodes, verify the cluster is healthy:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
nodetool describecluster
|
||||
|
||||
.. warning::
|
||||
|
||||
Complete this seed list update on **all new nodes** before decommissioning
|
||||
any old nodes. This ensures the new nodes can reform the cluster after
|
||||
the old nodes are removed.
|
||||
|
||||
|
||||
Decommissioning old nodes
|
||||
=========================
|
||||
|
||||
After all new nodes are added and healthy, decommission the old nodes one
|
||||
at a time.
|
||||
|
||||
#. Verify all nodes are healthy before starting decommission:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
All nodes should show ``UN`` status.
|
||||
|
||||
#. On the node to be decommissioned, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool decommission
|
||||
|
||||
This command blocks until the decommission is complete. The node will
|
||||
stream its data to the remaining nodes.
|
||||
|
||||
#. Monitor the decommission progress from another node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
The decommissioning node will transition from ``UN`` → ``UL`` (Up, Leaving)
|
||||
→ removed from the cluster.
|
||||
|
||||
You can also monitor streaming progress:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool netstats
|
||||
|
||||
#. After decommission completes, verify the node is no longer in the cluster:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
The decommissioned node should no longer appear in the output.
|
||||
|
||||
#. Run ``nodetool cleanup`` on the remaining nodes to remove data that
|
||||
no longer belongs to them after the topology change:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool cleanup
|
||||
|
||||
.. note::
|
||||
|
||||
``nodetool cleanup`` can be resource-intensive. Run it on one node at a
|
||||
time during low-traffic periods.
|
||||
|
||||
#. Wait for the cluster to stabilize before decommissioning the next node.
|
||||
Ensure no streaming operations are in progress.
|
||||
|
||||
#. Repeat steps 1-7 for each old node to be decommissioned.
|
||||
|
||||
|
||||
Post-migration verification
|
||||
---------------------------
|
||||
|
||||
After all old nodes are decommissioned, verify the migration was successful.
|
||||
|
||||
Verify cluster topology
|
||||
=======================
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
Confirm:
|
||||
|
||||
* All nodes show ``UN`` (Up, Normal) status
|
||||
* Only the new instance type nodes are present
|
||||
* Nodes are balanced across racks
|
||||
|
||||
Verify schema agreement
|
||||
=======================
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool describecluster
|
||||
|
||||
All nodes should report the same schema version.
|
||||
|
||||
Verify data connectivity
|
||||
========================
|
||||
|
||||
Connect to the cluster and run a test query:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh <node-ip> -e "SELECT count(*) FROM system_schema.keyspaces;"
|
||||
|
||||
.. note::
|
||||
|
||||
If ScyllaDB is configured with ``listen_interface``, you must use the
|
||||
node's interface IP address (not localhost) for cqlsh connections.
|
||||
|
||||
Verify ScyllaDB version
|
||||
=======================
|
||||
|
||||
Confirm all nodes are running the same ScyllaDB version:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
scylla --version
|
||||
|
||||
Verify data integrity (optional)
|
||||
================================
|
||||
|
||||
Run data validation on each keyspace to verify sstable integrity:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool scrub --mode=VALIDATE <keyspace_name>
|
||||
|
||||
Rollback
|
||||
--------
|
||||
|
||||
If issues occur during the migration, you can roll back by reversing the
|
||||
procedure.
|
||||
|
||||
During add phase
|
||||
================
|
||||
|
||||
If a new node fails to bootstrap:
|
||||
|
||||
#. Stop ScyllaDB on the new node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo systemctl stop scylla-server
|
||||
|
||||
#. From an existing node, remove the failed node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool removenode <host-id-of-failed-node>
|
||||
|
||||
During decommission phase
|
||||
=========================
|
||||
|
||||
If a decommission operation gets stuck:
|
||||
|
||||
#. If the node is still reachable, try stopping and restarting ScyllaDB
|
||||
#. If the node is unresponsive, from another node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool removenode <host-id>
|
||||
|
||||
See :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
|
||||
for more details.
|
||||
|
||||
Full rollback
|
||||
=============
|
||||
|
||||
To roll back after the migration is complete (all nodes on new instance type),
|
||||
apply the same add-and-replace procedure in reverse:
|
||||
|
||||
#. Add new nodes on the original instance type
|
||||
#. Wait for data streaming to complete
|
||||
#. Decommission the nodes on the new instance type
|
||||
|
||||
|
||||
Troubleshooting
|
||||
---------------
|
||||
|
||||
Node stuck in Joining (UJ) state
|
||||
================================
|
||||
|
||||
If a new node remains in ``UJ`` state for an extended period:
|
||||
|
||||
* Check ScyllaDB logs for streaming errors: ``journalctl -u scylla-server``
|
||||
* Verify network connectivity between nodes
|
||||
* Ensure sufficient disk space on all nodes
|
||||
* Check for any ongoing operations that may be blocking
|
||||
|
||||
Decommission taking too long
|
||||
============================
|
||||
|
||||
Decommission duration depends on data size. If it appears stuck:
|
||||
|
||||
* Check streaming progress: ``nodetool netstats``
|
||||
* Look for errors in ScyllaDB logs
|
||||
* Verify network bandwidth between nodes
|
||||
|
||||
Schema disagreement
|
||||
===================
|
||||
|
||||
If nodes report different schema versions:
|
||||
|
||||
* Wait a few minutes for schema to propagate
|
||||
* If disagreement persists, restart the nodes one by one
|
||||
* Run ``nodetool describecluster`` to verify agreement
|
||||
|
||||
|
||||
Additional resources
|
||||
--------------------
|
||||
|
||||
* :doc:`Adding a New Node Into an Existing ScyllaDB Cluster </operating-scylla/procedures/cluster-management/add-node-to-cluster>`
|
||||
* :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
|
||||
* :doc:`Replace a Running Node in a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/replace-running-node>`
|
||||
* :doc:`Upgrade </upgrade/index>`
|
||||
@@ -26,6 +26,7 @@ Cluster Management Procedures
|
||||
Safely Restart Your Cluster <safe-start>
|
||||
repair-based-node-operation
|
||||
Prevent Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>
|
||||
Cluster Platform Migration <cluster-platform-migration>
|
||||
|
||||
|
||||
.. panel-box::
|
||||
@@ -85,6 +86,8 @@ Cluster Management Procedures
|
||||
|
||||
* :doc:`Preventing Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>`
|
||||
|
||||
* :doc:`Cluster Platform Migration Using Node Cycling </operating-scylla/procedures/cluster-management/cluster-platform-migration>`
|
||||
|
||||
.. panel-box::
|
||||
:title: Topology Changes
|
||||
:id: "getting-started"
|
||||
|
||||
@@ -57,12 +57,11 @@ To enable shared dictionaries:
|
||||
internode_compression_enable_advanced: true
|
||||
rpc_dict_training_when: when_leader
|
||||
|
||||
.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
|
||||
.. note::
|
||||
|
||||
Trained dictionaries contain randomly chosen samples of data transferred between
|
||||
nodes. The data samples are persisted in the Raft log, which is not encrypted.
|
||||
As a result, some data from otherwise encrypted tables might be stored on disk
|
||||
unencrypted.
|
||||
Some dictionary training data may be encrypted using storage-level encryption
|
||||
(if enabled) instead of database-level encryption, meaning protection is
|
||||
applied at the storage layer rather than within the database itself.
|
||||
|
||||
|
||||
Reference
|
||||
|
||||
@@ -27,6 +27,16 @@ This configuration takes the form of a query template which is defined in the sc
|
||||
The value of ``ldap_url_template`` parameter should contain a valid LDAP URL (e.g., as returned by the ldapurl utility from OpenLDAP) representing an LDAP query that returns entries for all the user's roles.
|
||||
Scylla will replace the text ``{USER}`` in the URL with the user's Scylla username before querying LDAP.
|
||||
|
||||
.. note:: Usernames substituted into ``{USER}`` are automatically escaped
|
||||
using RFC 4515 filter escaping and URL percent-encoding, so LDAP filter
|
||||
metacharacters (``*``, ``(``, ``)``, ``\``, NUL) and URL metacharacters
|
||||
(``%``, ``?``, ``#``) in usernames are handled safely.
|
||||
|
||||
``{USER}`` must appear only in the **filter** component of the LDAP URL
|
||||
(the part after the third ``?``). Templates that place ``{USER}`` in the
|
||||
host, base DN, attributes, or extensions are rejected at startup, because
|
||||
filter escaping is not the correct encoding for those components.
|
||||
|
||||
Workflow
|
||||
--------
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ Upgrade ScyllaDB
|
||||
.. toctree::
|
||||
|
||||
ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
|
||||
ScyllaDB 2026.x Patch Upgrades <upgrade-guide-from-2026.x.y-to-2026.x.z>
|
||||
ScyllaDB Image <ami-upgrade>
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,268 @@
|
||||
.. |SCYLLA_NAME| replace:: ScyllaDB
|
||||
|
||||
.. |SRC_VERSION| replace:: 2026.x.y
|
||||
.. |NEW_VERSION| replace:: 2026.x.z
|
||||
|
||||
==========================================================================
|
||||
Upgrade - |SCYLLA_NAME| |SRC_VERSION| to |NEW_VERSION| (Patch Upgrades)
|
||||
==========================================================================
|
||||
|
||||
This document describes a step-by-step procedure for upgrading from
|
||||
|SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION| (where "z" is
|
||||
the latest available version), and rolling back to version |SRC_VERSION|
|
||||
if necessary.
|
||||
|
||||
This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL),
|
||||
CentOS, Debian, and Ubuntu.
|
||||
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
|
||||
for information about supported versions.
|
||||
|
||||
It also applies to the ScyllaDB official image on EC2, GCP, or Azure.
|
||||
|
||||
See `Upgrade Policy <https://docs.scylladb.com/stable/versioning/upgrade-policy.html>`_ for the ScyllaDB upgrade policy.
|
||||
|
||||
Upgrade Procedure
|
||||
=================
|
||||
|
||||
.. note::
|
||||
Apply the following procedure **serially** on each node. Do not move to the next
|
||||
node before validating that the node is up and running the new version.
|
||||
|
||||
A ScyllaDB upgrade is a rolling procedure that does **not** require a full cluster
|
||||
shutdown. For each of the nodes in the cluster, you will:
|
||||
|
||||
#. Drain the node and back up the data.
|
||||
#. Backup configuration file.
|
||||
#. Stop ScyllaDB.
|
||||
#. Download and install new ScyllaDB packages.
|
||||
#. Start ScyllaDB.
|
||||
#. Validate that the upgrade was successful.
|
||||
|
||||
**Before** upgrading, check which version you are running now using
|
||||
``scylla --version``. Note the current version in case you want to roll back
|
||||
the upgrade.
|
||||
|
||||
**During** the rolling upgrade it is highly recommended:
|
||||
|
||||
* Not to use new |NEW_VERSION| features.
|
||||
* Not to run administration functions, like repairs, refresh, rebuild or add
|
||||
or remove nodes. See
|
||||
`sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending
|
||||
ScyllaDB Manager's scheduled or running repairs.
|
||||
* Not to apply schema changes.
|
||||
|
||||
Upgrade Steps
|
||||
=============
|
||||
|
||||
Back up the data
|
||||
------------------------------
|
||||
|
||||
Back up all the data to an external device. We recommend using
|
||||
`ScyllaDB Manager <https://manager.docs.scylladb.com/stable/backup/index.html>`_
|
||||
to create backups.
|
||||
|
||||
Alternatively, you can use the ``nodetool snapshot`` command.
|
||||
For **each** node in the cluster, run the following:
|
||||
|
||||
.. code:: sh
|
||||
|
||||
nodetool drain
|
||||
nodetool snapshot
|
||||
|
||||
Take note of the directory name that nodetool gives you, and copy all
|
||||
the directories with this name under ``/var/lib/scylla`` to a backup device.
|
||||
|
||||
When the upgrade is completed on all nodes, remove the snapshot with the
|
||||
``nodetool clearsnapshot -t <snapshot>`` command to prevent running out of
|
||||
space.
|
||||
|
||||
Back up the configuration file
|
||||
------------------------------
|
||||
|
||||
Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
|
||||
in case you need to roll back the upgrade.
|
||||
|
||||
.. tabs::
|
||||
|
||||
.. group-tab:: Debian/Ubuntu
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
|
||||
sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup
|
||||
|
||||
.. group-tab:: RHEL/CentOS
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
|
||||
sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup
|
||||
|
||||
Gracefully stop the node
|
||||
------------------------
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo service scylla-server stop
|
||||
|
||||
Download and install the new release
|
||||
------------------------------------
|
||||
|
||||
You don’t need to update the ScyllaDB DEB or RPM repo when you upgrade to
|
||||
a patch release.
|
||||
|
||||
.. tabs::
|
||||
|
||||
.. group-tab:: Debian/Ubuntu
|
||||
|
||||
To install a patch version on Debian or Ubuntu, run:
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo apt-get clean all
|
||||
sudo apt-get update
|
||||
sudo apt-get dist-upgrade scylla
|
||||
|
||||
Answer ‘y’ to the first two questions.
|
||||
|
||||
.. group-tab:: RHEL/CentOS
|
||||
|
||||
To install a patch version on RHEL or CentOS, run:
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo yum clean all
|
||||
sudo yum update scylla\* -y
|
||||
|
||||
.. group-tab:: EC2/GCP/Azure Ubuntu Image
|
||||
|
||||
If you're using the ScyllaDB official image (recommended), see
|
||||
the **Debian/Ubuntu** tab for upgrade instructions.
|
||||
|
||||
If you're using your own image and have installed ScyllaDB packages for
|
||||
Ubuntu or Debian, you need to apply an extended upgrade procedure:
|
||||
|
||||
#. Install the new ScyllaDB version with the additional
|
||||
``scylla-machine-image`` package:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo apt-get clean all
|
||||
sudo apt-get update
|
||||
sudo apt-get dist-upgrade scylla
|
||||
sudo apt-get dist-upgrade scylla-machine-image
|
||||
#. Run ``scylla_setup`` without running ``io_setup``.
|
||||
#. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.
|
||||
|
||||
Start the node
|
||||
--------------
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo service scylla-server start
|
||||
|
||||
Validate
|
||||
--------
|
||||
#. Check cluster status with ``nodetool status`` and make sure **all** nodes,
|
||||
including the one you just upgraded, are in UN status.
|
||||
#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"``
|
||||
to check the ScyllaDB version.
|
||||
#. Use ``journalctl _COMM=scylla`` to check there are no new errors in the log.
|
||||
#. Check again after 2 minutes to validate that no new issues are introduced.
|
||||
|
||||
Once you are sure the node upgrade is successful, move to the next node in
|
||||
the cluster.
|
||||
|
||||
Rollback Procedure
|
||||
==================
|
||||
|
||||
The following procedure describes a rollback from ScyllaDB release
|
||||
|NEW_VERSION| to |SRC_VERSION|. Apply this procedure if an upgrade from
|
||||
|SRC_VERSION| to |NEW_VERSION| failed before completing on all nodes.
|
||||
|
||||
* Use this procedure only on nodes you upgraded to |NEW_VERSION|.
|
||||
* Execute the following commands one node at a time, moving to the next node only
|
||||
after the rollback procedure is completed successfully.
|
||||
|
||||
ScyllaDB rollback is a rolling procedure that does **not** require a full
|
||||
cluster shutdown. For each of the nodes to roll back to |SRC_VERSION|, you will:
|
||||
|
||||
#. Drain the node and stop ScyllaDB.
|
||||
#. Downgrade to the previous release.
|
||||
#. Restore the configuration file.
|
||||
#. Restart ScyllaDB.
|
||||
#. Validate the rollback success.
|
||||
|
||||
Rollback Steps
|
||||
==============
|
||||
|
||||
Gracefully shutdown ScyllaDB
|
||||
-----------------------------
|
||||
|
||||
.. code:: sh
|
||||
|
||||
nodetool drain
|
||||
sudo service scylla-server stop
|
||||
|
||||
Downgrade to the previous release
|
||||
----------------------------------
|
||||
|
||||
.. tabs::
|
||||
|
||||
.. group-tab:: Debian/Ubuntu
|
||||
|
||||
To downgrade to |SRC_VERSION| on Debian or Ubuntu, run:
|
||||
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
|
||||
|
||||
Answer ‘y’ to the first two questions.
|
||||
|
||||
.. group-tab:: RHEL/CentOS
|
||||
|
||||
To downgrade to |SRC_VERSION| on RHEL or CentOS, run:
|
||||
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo yum downgrade scylla\*-|SRC_VERSION|-\* -y
|
||||
|
||||
.. group-tab:: EC2/GCP/Azure Ubuntu Image
|
||||
|
||||
If you’re using the ScyllaDB official image (recommended), see
|
||||
the **Debian/Ubuntu** tab for upgrade instructions.
|
||||
|
||||
If you’re using your own image and have installed ScyllaDB packages for
|
||||
Ubuntu or Debian, you need to additionally downgrade
|
||||
the ``scylla-machine-image`` package.
|
||||
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
|
||||
sudo apt-get install scylla-machine-image=|SRC_VERSION|\*
|
||||
|
||||
Answer ‘y’ to the first two questions.
|
||||
|
||||
|
||||
Restore the configuration file
|
||||
------------------------------
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo rm -rf /etc/scylla/scylla.yaml
|
||||
sudo cp -a /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml
|
||||
|
||||
Start the node
|
||||
--------------
|
||||
|
||||
.. code:: sh
|
||||
|
||||
sudo service scylla-server start
|
||||
|
||||
Validate
|
||||
--------
|
||||
Check upgrade instruction above for validation. Once you are sure the node
|
||||
rollback is successful, move to the next node in the cluster.
|
||||
@@ -727,7 +727,12 @@ public:
|
||||
|
||||
// now we need one page more to be able to save one for next lap
|
||||
auto fill_size = align_up(buf1.size(), block_size) + block_size - buf1.size();
|
||||
auto buf2 = co_await _input.read_exactly(fill_size);
|
||||
// If the underlying stream is already at EOF (e.g. buf1 came from
|
||||
// cached _next while the previous read_exactly drained the source),
|
||||
// skip the read_exactly call — it would return empty anyway.
|
||||
auto buf2 = _input.eof()
|
||||
? temporary_buffer<char>()
|
||||
: co_await _input.read_exactly(fill_size);
|
||||
|
||||
temporary_buffer<char> output(buf1.size() + buf2.size());
|
||||
|
||||
|
||||
@@ -437,7 +437,6 @@ void ldap_connection::poll_results() {
|
||||
const auto found = _msgid_to_promise.find(id);
|
||||
if (found == _msgid_to_promise.end()) {
|
||||
mylog.error("poll_results: got valid result for unregistered id {}, dropping it", id);
|
||||
ldap_msgfree(result);
|
||||
} else {
|
||||
found->second.set_value(std::move(result_ptr));
|
||||
_msgid_to_promise.erase(found);
|
||||
|
||||
@@ -16,9 +16,11 @@
|
||||
#include "index/vector_index.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "index/target_parser.hh"
|
||||
#include "types/concrete_types.hh"
|
||||
#include "types/types.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
#include <ranges>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
@@ -102,7 +104,123 @@ const static std::unordered_map<sstring, std::function<void(const sstring&, cons
|
||||
{"oversampling", std::bind_front(validate_factor_option, 1.0f, 100.0f)},
|
||||
// 'rescoring' enables recalculating of similarity scores of candidates retrieved from vector store when quantization is used.
|
||||
{"rescoring", std::bind_front(validate_enumerated_option, boolean_values)},
|
||||
};
|
||||
};
|
||||
|
||||
static constexpr auto TC_TARGET_KEY = "tc";
|
||||
static constexpr auto PK_TARGET_KEY = "pk";
|
||||
static constexpr auto FC_TARGET_KEY = "fc";
|
||||
|
||||
// Convert a serialized targets string (as produced by serialize_targets())
|
||||
// back into the CQL column list used inside CREATE INDEX ... ON table(<here>).
|
||||
//
|
||||
// JSON examples:
|
||||
// {"tc":"v","fc":["f1","f2"]} -> "v, f1, f2"
|
||||
// {"tc":"v","pk":["p1","p2"]} -> "(p1, p2), v"
|
||||
// {"tc":"v","pk":["p1","p2"],"fc":["f1"]} -> "(p1, p2), v, f1"
|
||||
static sstring targets_to_cql(const sstring& targets) {
|
||||
std::optional<rjson::value> json_value = rjson::try_parse(targets);
|
||||
if (!json_value || !json_value->IsObject()) {
|
||||
return cql3::util::maybe_quote(cql3::statements::index_target::column_name_from_target_string(targets));
|
||||
}
|
||||
|
||||
sstring result;
|
||||
|
||||
const rjson::value* pk = rjson::find(*json_value, PK_TARGET_KEY);
|
||||
if (pk && pk->IsArray() && !pk->Empty()) {
|
||||
result += "(";
|
||||
auto pk_cols = std::views::all(pk->GetArray()) | std::views::transform([&](const rjson::value& col) {
|
||||
return cql3::util::maybe_quote(sstring(rjson::to_string_view(col)));
|
||||
}) | std::ranges::to<std::vector<sstring>>();
|
||||
result += boost::algorithm::join(pk_cols, ", ");
|
||||
result += "), ";
|
||||
}
|
||||
|
||||
const rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY);
|
||||
if (tc && tc->IsString()) {
|
||||
result += cql3::util::maybe_quote(sstring(rjson::to_string_view(*tc)));
|
||||
}
|
||||
|
||||
const rjson::value* fc = rjson::find(*json_value, FC_TARGET_KEY);
|
||||
if (fc && fc->IsArray()) {
|
||||
for (rapidjson::SizeType i = 0; i < fc->Size(); ++i) {
|
||||
result += ", ";
|
||||
result += cql3::util::maybe_quote(sstring(rjson::to_string_view((*fc)[i])));
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Serialize vector index targets into a format using:
|
||||
// "tc" for the target (vector) column,
|
||||
// "pk" for partition key columns (local index),
|
||||
// "fc" for filtering columns.
|
||||
// For a simple single-column vector index, returns just the column name.
|
||||
// Examples:
|
||||
// (v) -> "v"
|
||||
// (v, f1, f2) -> {"tc":"v","fc":["f1","f2"]}
|
||||
// ((p1, p2), v) -> {"tc":"v","pk":["p1","p2"]}
|
||||
// ((p1, p2), v, f1, f2) -> {"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}
|
||||
sstring vector_index::serialize_targets(const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) {
|
||||
using cql3::statements::index_target;
|
||||
|
||||
if (targets.size() == 0) {
|
||||
throw exceptions::invalid_request_exception("Vector index must have at least one target column");
|
||||
}
|
||||
|
||||
if (targets.size() == 1) {
|
||||
auto tc = targets[0]->value;
|
||||
if (!std::holds_alternative<index_target::single_column>(tc)) {
|
||||
throw exceptions::invalid_request_exception("Missing vector column target for local vector index");
|
||||
}
|
||||
return index_target::escape_target_column(*std::get<index_target::single_column>(tc));
|
||||
}
|
||||
|
||||
const bool has_pk = std::holds_alternative<index_target::multiple_columns>(targets.front()->value);
|
||||
const size_t tc_idx = has_pk ? 1 : 0;
|
||||
const size_t fc_count = targets.size() - tc_idx - 1;
|
||||
|
||||
if (!std::holds_alternative<index_target::single_column>(targets[tc_idx]->value)) {
|
||||
throw exceptions::invalid_request_exception("Vector index target column must be a single column");
|
||||
}
|
||||
|
||||
rjson::value json_map = rjson::empty_object();
|
||||
rjson::add_with_string_name(json_map, TC_TARGET_KEY, rjson::from_string(std::get<index_target::single_column>(targets[tc_idx]->value)->text()));
|
||||
|
||||
if (has_pk) {
|
||||
rjson::value pk_json = rjson::empty_array();
|
||||
for (const auto& col : std::get<index_target::multiple_columns>(targets.front()->value)) {
|
||||
rjson::push_back(pk_json, rjson::from_string(col->text()));
|
||||
}
|
||||
rjson::add_with_string_name(json_map, PK_TARGET_KEY, std::move(pk_json));
|
||||
}
|
||||
|
||||
if (fc_count > 0) {
|
||||
rjson::value fc_json = rjson::empty_array();
|
||||
for (size_t i = tc_idx + 1; i < targets.size(); ++i) {
|
||||
if (!std::holds_alternative<index_target::single_column>(targets[i]->value)) {
|
||||
throw exceptions::invalid_request_exception("Vector index filtering column must be a single column");
|
||||
}
|
||||
rjson::push_back(fc_json, rjson::from_string(std::get<index_target::single_column>(targets[i]->value)->text()));
|
||||
}
|
||||
rjson::add_with_string_name(json_map, FC_TARGET_KEY, std::move(fc_json));
|
||||
}
|
||||
|
||||
return rjson::print(json_map);
|
||||
}
|
||||
|
||||
sstring vector_index::get_target_column(const sstring& targets) {
|
||||
std::optional<rjson::value> json_value = rjson::try_parse(targets);
|
||||
if (!json_value || !json_value->IsObject()) {
|
||||
return cql3::statements::index_target::column_name_from_target_string(targets);
|
||||
}
|
||||
|
||||
rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY);
|
||||
if (tc && tc->IsString()) {
|
||||
return sstring(rjson::to_string_view(*tc));
|
||||
}
|
||||
return cql3::statements::index_target::column_name_from_target_string(targets);
|
||||
}
|
||||
|
||||
bool vector_index::is_rescoring_enabled(const index_options_map& properties) {
|
||||
auto q = properties.find("quantization");
|
||||
@@ -133,9 +251,8 @@ bool vector_index::view_should_exist() const {
|
||||
|
||||
std::optional<cql3::description> vector_index::describe(const index_metadata& im, const schema& base_schema) const {
|
||||
fragmented_ostringstream os;
|
||||
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON "
|
||||
<< cql3::util::maybe_quote(base_schema.ks_name()) << "." << cql3::util::maybe_quote(base_schema.cf_name())
|
||||
<< "(" << cql3::util::maybe_quote(im.options().at(cql3::statements::index_target::target_option_name)) << ")"
|
||||
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " << cql3::util::maybe_quote(base_schema.ks_name()) << "."
|
||||
<< cql3::util::maybe_quote(base_schema.cf_name()) << "(" << targets_to_cql(im.options().at(cql3::statements::index_target::target_option_name)) << ")"
|
||||
<< " USING 'vector_index'";
|
||||
|
||||
return cql3::description{
|
||||
@@ -320,16 +437,23 @@ bool vector_index::has_vector_index(const schema& s) {
|
||||
|
||||
bool vector_index::has_vector_index_on_column(const schema& s, const sstring& target_name) {
|
||||
for (const auto& index : s.indices()) {
|
||||
auto class_it = index.options().find(db::index::secondary_index::custom_class_option_name);
|
||||
auto target_it = index.options().find(cql3_parser::index_target::target_option_name);
|
||||
if (class_it != index.options().end() && target_it != index.options().end()) {
|
||||
auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second);
|
||||
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && target_it->second == target_name;
|
||||
if (is_vector_index_on_column(index, target_name)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool vector_index::is_vector_index_on_column(const index_metadata& im, const sstring& target_name) {
|
||||
auto class_it = im.options().find(db::index::secondary_index::custom_class_option_name);
|
||||
auto target_it = im.options().find(cql3_parser::index_target::target_option_name);
|
||||
if (class_it != im.options().end() && target_it != im.options().end()) {
|
||||
auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second);
|
||||
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && get_target_column(target_it->second) == target_name;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Returns the schema version of the base table at which the index was created.
|
||||
/// This is used to determine if the index needs to be rebuilt after a schema change.
|
||||
/// The CREATE INDEX and DROP INDEX statements does change the schema version.
|
||||
|
||||
@@ -34,8 +34,12 @@ public:
|
||||
table_schema_version index_version(const schema& schema) override;
|
||||
static bool has_vector_index(const schema& s);
|
||||
static bool has_vector_index_on_column(const schema& s, const sstring& target_name);
|
||||
static bool is_vector_index_on_column(const index_metadata& im, const sstring& target_name);
|
||||
static void check_cdc_options(const schema& schema);
|
||||
|
||||
static sstring serialize_targets(const std::vector<::shared_ptr<cql3::statements::index_target>>& targets);
|
||||
static sstring get_target_column(const sstring& targets);
|
||||
|
||||
static bool is_rescoring_enabled(const index_options_map& properties);
|
||||
static float get_oversampling(const index_options_map& properties);
|
||||
static sstring get_cql_similarity_function_name(const index_options_map& properties);
|
||||
|
||||
@@ -42,7 +42,14 @@ void everywhere_replication_strategy::validate_options(const gms::feature_servic
|
||||
|
||||
sstring everywhere_replication_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
|
||||
const auto replication_factor = erm.get_replication_factor();
|
||||
if (read_replicas.size() > replication_factor) {
|
||||
if (const auto& topo_info = erm.get_token_metadata().get_topology_change_info(); topo_info && topo_info->read_new) {
|
||||
if (read_replicas.size() > replication_factor + 1) {
|
||||
return seastar::format(
|
||||
"everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, "
|
||||
"cannot be higher than replication factor {} + 1 during the 'read from new replicas' stage of a topology change",
|
||||
read_replicas.size(), replication_factor);
|
||||
}
|
||||
} else if (read_replicas.size() > replication_factor) {
|
||||
return seastar::format("everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, cannot be higher than replication factor {}", read_replicas.size(), replication_factor);
|
||||
}
|
||||
return {};
|
||||
|
||||
@@ -261,7 +261,7 @@ static collection_mutation serialize_collection_mutation(
|
||||
|
||||
writev(v.serialize());
|
||||
}
|
||||
return collection_mutation(type, ret);
|
||||
return collection_mutation(type, std::move(ret));
|
||||
}
|
||||
|
||||
collection_mutation collection_mutation_description::serialize(const abstract_type& type) const {
|
||||
|
||||
@@ -103,7 +103,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
|
||||
.entity = stats.entity,
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
|
||||
.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr())
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:52c9772c9ac334650d8b179b591c47769ee38d34fad784b61c682e11c03f2506
|
||||
size 6530196
|
||||
oid sha256:762ffcd253ff9a784fc58e36e1cbe83643e3fe576ac60eb1ce6e4bf8ac2eda8c
|
||||
size 6548000
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:d1a869ebfe4e90d9681499246eb86bb032ae402c350357e19d97b989037a5bd3
|
||||
size 6528308
|
||||
oid sha256:3f788e2b36a4b87328997c60f0903e197bd193f977e02b5fc8888d79c364e21d
|
||||
size 6540076
|
||||
|
||||
@@ -1101,6 +1101,18 @@ future<> server_impl::process_fsm_output(index_t& last_stable, fsm_output&& batc
|
||||
// case.
|
||||
co_await _persistence->store_term_and_vote(batch.term_and_vote->first, batch.term_and_vote->second);
|
||||
_stats.store_term_and_vote++;
|
||||
|
||||
// When the term advances, any in-flight snapshot transfers
|
||||
// belong to an outdated term: the progress tracker has been
|
||||
// reset in become_leader() or we are now a follower.
|
||||
// Abort them before we dispatch this batch's messages, which
|
||||
// may start fresh transfers for the new term.
|
||||
//
|
||||
// A vote may also change independently of the term (e.g. a
|
||||
// follower voting for a candidate at the same term), but in
|
||||
// that case there are no in-flight transfers and the abort
|
||||
// is a no-op.
|
||||
abort_snapshot_transfers();
|
||||
}
|
||||
|
||||
if (batch.snp) {
|
||||
@@ -1210,8 +1222,6 @@ future<> server_impl::process_fsm_output(index_t& last_stable, fsm_output&& batc
|
||||
// quickly) stop happening (we're outside the config after all).
|
||||
co_await _apply_entries.push_eventually(removed_from_config{});
|
||||
}
|
||||
// request aborts of snapshot transfers
|
||||
abort_snapshot_transfers();
|
||||
// abort all read barriers
|
||||
for (auto& r : _reads) {
|
||||
r.promise.set_value(not_a_leader{_fsm->current_leader()});
|
||||
|
||||
@@ -1021,8 +1021,8 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
|
||||
on_internal_error_noexcept(rcslog,
|
||||
format("reader_concurrency_semaphore::signal(): semaphore {} detected resource leak, available {} exceeds initial {}", _name,
|
||||
_resources, _initial_resources));
|
||||
_resources.count = std::max(_resources.count, _initial_resources.count);
|
||||
_resources.memory = std::max(_resources.memory, _initial_resources.memory);
|
||||
_resources.count = std::min(_resources.count, _initial_resources.count);
|
||||
_resources.memory = std::min(_resources.memory, _initial_resources.memory);
|
||||
}
|
||||
maybe_wake_execution_loop();
|
||||
}
|
||||
|
||||
@@ -432,7 +432,9 @@ public:
|
||||
// refresh_mutation_source must be called when there are changes to data source
|
||||
// structures but logical state of data is not changed (e.g. when state for a
|
||||
// new tablet replica is allocated).
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
|
||||
virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
|
||||
virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;
|
||||
@@ -442,7 +444,7 @@ public:
|
||||
virtual storage_group& storage_group_for_token(dht::token) const = 0;
|
||||
virtual utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const = 0;
|
||||
|
||||
virtual locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const = 0;
|
||||
virtual locator::combined_load_stats table_load_stats() const = 0;
|
||||
virtual bool all_storage_groups_split() = 0;
|
||||
virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
|
||||
virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
|
||||
|
||||
@@ -1697,7 +1697,7 @@ static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit(
|
||||
if (!range.is_singular()) {
|
||||
continue;
|
||||
}
|
||||
auto token = dht::token::to_int64(ranges.front().start()->value().token());
|
||||
auto token = dht::token::to_int64(range.start()->value().token());
|
||||
if (limiter.account_operation(read_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) {
|
||||
// Don't return immediately - account all ranges first
|
||||
ret = can_proceed::no;
|
||||
|
||||
@@ -1129,9 +1129,7 @@ public:
|
||||
return _stats;
|
||||
}
|
||||
|
||||
// The tablet filter is used to not double account migrating tablets, so it's important that
|
||||
// only one of pending or leaving replica is accounted based on current migration stage.
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const;
|
||||
locator::combined_load_stats table_load_stats() const;
|
||||
|
||||
const db::view::stats& get_view_stats() const {
|
||||
return _view_stats;
|
||||
|
||||
278
replica/table.cc
278
replica/table.cc
@@ -711,7 +711,9 @@ public:
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override {
|
||||
return get_compaction_group();
|
||||
@@ -734,7 +736,7 @@ public:
|
||||
return *_single_sg;
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)>) const override {
|
||||
locator::combined_load_stats table_load_stats() const override {
|
||||
return locator::combined_load_stats{
|
||||
.table_ls = locator::table_load_stats{
|
||||
.size_in_bytes = _single_sg->live_disk_space_used(),
|
||||
@@ -757,6 +759,11 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct background_merge_guard {
|
||||
compaction::compaction_reenabler compaction_guard;
|
||||
locator::effective_replication_map_ptr erm_guard;
|
||||
};
|
||||
|
||||
class tablet_storage_group_manager final : public storage_group_manager {
|
||||
replica::table& _t;
|
||||
locator::host_id _my_host_id;
|
||||
@@ -777,7 +784,7 @@ class tablet_storage_group_manager final : public storage_group_manager {
|
||||
utils::phased_barrier _merge_fiber_barrier;
|
||||
std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
|
||||
// Holds compaction reenabler which disables compaction temporarily during tablet merge
|
||||
std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
|
||||
std::vector<background_merge_guard> _compaction_reenablers_for_merging;
|
||||
private:
|
||||
const schema_ptr& schema() const {
|
||||
return _t.schema();
|
||||
@@ -801,7 +808,8 @@ private:
|
||||
// Called when coordinator executes tablet merge. Tablet ids X and X+1 are merged into
|
||||
// the new tablet id (X >> 1). In practice, that means storage groups for X and X+1
|
||||
// are merged into a new storage group with id (X >> 1).
|
||||
void handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
void handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
|
||||
// When merge completes, compaction groups of sibling tablets are added to same storage
|
||||
// group, but they're not merged yet into one, since the merge completion handler happens
|
||||
@@ -895,7 +903,9 @@ public:
|
||||
std::exchange(_stop_fut, make_ready_future())).discard_result();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override;
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override;
|
||||
utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const override;
|
||||
@@ -909,7 +919,7 @@ public:
|
||||
return storage_group_for_id(storage_group_of(token).first);
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const override;
|
||||
locator::combined_load_stats table_load_stats() const override;
|
||||
bool all_storage_groups_split() override;
|
||||
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
|
||||
future<> maybe_split_compaction_group_of(size_t idx) override;
|
||||
@@ -1006,6 +1016,11 @@ bool storage_group::set_split_mode() {
|
||||
return false;
|
||||
}
|
||||
if (!splitting_mode()) {
|
||||
// Don't create new compaction groups if the main cg has compaction disabled
|
||||
if (_main_cg->compaction_disabled()) {
|
||||
tlogger.debug("storage_group::set_split_mode: split ready groups not created due to compaction disabled on the main group");
|
||||
return false;
|
||||
}
|
||||
auto create_cg = [this] () -> compaction_group_ptr {
|
||||
// TODO: use the actual sub-ranges instead, to help incremental selection on the read path.
|
||||
return compaction_group::make_empty_group(*_main_cg);
|
||||
@@ -1443,6 +1458,7 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
sstables::offstrategy offstrategy) {
|
||||
std::vector<sstables::shared_sstable> ret, ssts;
|
||||
std::exception_ptr ex;
|
||||
log_level failure_log_level = log_level::error;
|
||||
try {
|
||||
bool trigger_compaction = offstrategy == sstables::offstrategy::no;
|
||||
auto& cg = compaction_group_for_sstable(new_sst);
|
||||
@@ -1464,6 +1480,9 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
|
||||
sst = nullptr;
|
||||
}
|
||||
} catch (compaction::compaction_stopped_exception&) {
|
||||
failure_log_level = log_level::warn;
|
||||
ex = std::current_exception();
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
@@ -1471,13 +1490,13 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
|
||||
if (ex) {
|
||||
// on failed split, input sstable is unlinked here.
|
||||
if (new_sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
|
||||
co_await new_sst->unlink();
|
||||
}
|
||||
// on failure after successful split, sstables not attached yet will be unlinked
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
co_await coroutine::parallel_for_each(ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
@@ -1491,6 +1510,7 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
|
||||
std::function<future<>(sstables::shared_sstable)> on_add) {
|
||||
std::exception_ptr ex;
|
||||
std::vector<sstables::shared_sstable> ret;
|
||||
log_level failure_log_level = log_level::error;
|
||||
|
||||
// We rely on add_new_sstable_and_update_cache() to unlink the sstable fed into it,
|
||||
// so the exception handling below will only have to unlink sstables not processed yet.
|
||||
@@ -1500,14 +1520,17 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
|
||||
std::ranges::move(ssts, std::back_inserter(ret));
|
||||
|
||||
}
|
||||
} catch (compaction::compaction_stopped_exception&) {
|
||||
failure_log_level = log_level::warn;
|
||||
ex = std::current_exception();
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (ex) {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
|
||||
co_await coroutine::parallel_for_each(new_ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
|
||||
if (sst) {
|
||||
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
|
||||
co_await sst->unlink();
|
||||
}
|
||||
});
|
||||
@@ -1743,7 +1766,9 @@ table::seal_active_memtable(compaction_group& cg, flush_permit&& flush_permit) n
|
||||
utils::get_local_injector().inject("table_seal_active_memtable_try_flush", []() {
|
||||
throw std::system_error(ENOSPC, std::system_category(), "Injected error");
|
||||
});
|
||||
co_return co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
|
||||
co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
|
||||
// signal a memtable was sealed
|
||||
utils::get_local_injector().receive_message("table_seal_post_flush_waiters");
|
||||
});
|
||||
|
||||
undo_stats.reset();
|
||||
@@ -2933,17 +2958,108 @@ void table::on_flush_timer() {
|
||||
});
|
||||
}
|
||||
|
||||
locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
|
||||
// The following functions return true if we should return the tablet size of a tablet in
|
||||
// migration depending on its transition stage and whether it is a leaving or pending replica
|
||||
bool has_size_on_leaving (locator::tablet_transition_stage stage) {
|
||||
switch (stage) {
|
||||
case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::streaming: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::use_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::rebuild_repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_repair:
|
||||
return true;
|
||||
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_migration:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool has_size_on_pending (locator::tablet_transition_stage stage) {
|
||||
switch (stage) {
|
||||
case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::streaming: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::rebuild_repair:
|
||||
return false;
|
||||
case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::use_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_repair:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
locator::combined_load_stats tablet_storage_group_manager::table_load_stats() const {
|
||||
locator::table_load_stats table_stats;
|
||||
table_stats.split_ready_seq_number = _split_ready_seq_number;
|
||||
|
||||
locator::tablet_load_stats tablet_stats;
|
||||
|
||||
for_each_storage_group([&] (size_t id, storage_group& sg) {
|
||||
locator::global_tablet_id gid { _t.schema()->id(), locator::tablet_id(id) };
|
||||
if (tablet_filter(*_tablet_map, gid)) {
|
||||
const uint64_t tablet_size = sg.live_disk_space_used();
|
||||
auto tid = locator::tablet_id(id);
|
||||
locator::global_tablet_id gid { _t.schema()->id(), tid };
|
||||
locator::tablet_replica me { _my_host_id, this_shard_id() };
|
||||
const uint64_t tablet_size = sg.live_disk_space_used();
|
||||
|
||||
auto transition = _tablet_map->get_tablet_transition_info(tid);
|
||||
auto& info = _tablet_map->get_tablet_info(tid);
|
||||
bool is_pending = transition && transition->pending_replica == me;
|
||||
bool is_leaving = transition && locator::get_leaving_replica(info, *transition) == me;
|
||||
|
||||
// It's important to tackle the anomaly in reported size, since both leaving and
|
||||
// pending replicas could otherwise be accounted during tablet migration.
|
||||
// If transition hasn't reached write_both_read_new stage, then leaving replicas are accounted.
|
||||
// Otherwise, pending replicas are accounted.
|
||||
// This helps to reduce the discrepancy window.
|
||||
auto table_size_filter = [&] () {
|
||||
// if tablet is not in transit, it's filtered in.
|
||||
if (!transition) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto s = transition->reads; // read selector
|
||||
|
||||
return (!is_pending && !is_leaving)
|
||||
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|
||||
|| (is_pending && s == locator::read_replica_set_selector::next);
|
||||
};
|
||||
|
||||
// When a tablet is in migration, we want to send its size during any migration stage when
|
||||
// we still know the tablet's size. This way the balancer will have better information about
|
||||
// tablet sizes, and we reduce the chance that the node will be ignored during balancing
|
||||
// due to missing tablet size. On the leaving replica we include tablets until the use_new
|
||||
// stage (inclusive), and on the pending we include tablets after the streaming stage.
|
||||
// There is an overlap in tablet sizes (we report sizes on both the leaving and pending
|
||||
// replicas for some stages), but that should not be a problem.
|
||||
auto tablet_size_filter = [&] () {
|
||||
// if tablet is not in transit, it's filtered in.
|
||||
if (!transition) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (is_leaving) {
|
||||
return has_size_on_leaving(transition->stage);
|
||||
} else if (is_pending) {
|
||||
return has_size_on_pending(transition->stage);
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
if (table_size_filter()) {
|
||||
table_stats.size_in_bytes += tablet_size;
|
||||
}
|
||||
|
||||
if (tablet_size_filter()) {
|
||||
const dht::token_range trange = _tablet_map->get_token_range(gid.tablet);
|
||||
// Make sure the token range is in the form (a, b]
|
||||
SCYLLA_ASSERT(!trange.start()->is_inclusive() && trange.end()->is_inclusive());
|
||||
@@ -2956,8 +3072,8 @@ locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std:
|
||||
};
|
||||
}
|
||||
|
||||
locator::combined_load_stats table::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
|
||||
return _sg_manager->table_load_stats(std::move(tablet_filter));
|
||||
locator::combined_load_stats table::table_load_stats() const {
|
||||
return _sg_manager->table_load_stats();
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::handle_tablet_split_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
|
||||
@@ -3069,7 +3185,9 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
|
||||
}
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap,
|
||||
const locator::tablet_map& new_tmap) {
|
||||
auto table_id = schema()->id();
|
||||
size_t old_tablet_count = old_tmap.tablet_count();
|
||||
size_t new_tablet_count = new_tmap.tablet_count();
|
||||
@@ -3093,7 +3211,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
auto new_cg = make_lw_shared<compaction_group>(_t, new_tid, new_range, make_repair_sstable_classifier_func());
|
||||
for (auto& view : new_cg->all_views()) {
|
||||
auto cre = _t.get_compaction_manager().stop_and_disable_compaction_no_wait(*view, "tablet merging");
|
||||
_compaction_reenablers_for_merging.push_back(std::move(cre));
|
||||
_compaction_reenablers_for_merging.push_back(background_merge_guard{std::move(cre), old_erm});
|
||||
}
|
||||
auto new_sg = make_lw_shared<storage_group>(std::move(new_cg));
|
||||
|
||||
@@ -3102,7 +3220,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
|
||||
auto it = _storage_groups.find(group_id);
|
||||
if (it == _storage_groups.end()) {
|
||||
throw std::runtime_error(format("Unable to find sibling tablet of id for table {}", group_id, table_id));
|
||||
throw std::runtime_error(format("Unable to find sibling tablet of id {} for table {}", group_id, table_id));
|
||||
}
|
||||
auto& sg = it->second;
|
||||
sg->for_each_compaction_group([&new_sg, new_range, new_tid, group_id] (const compaction_group_ptr& cg) {
|
||||
@@ -3126,7 +3244,11 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
_merge_completion_event.signal();
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) {
|
||||
void tablet_storage_group_manager::update_effective_replication_map(
|
||||
const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source)
|
||||
{
|
||||
auto* new_tablet_map = &erm.get_token_metadata().tablets().get_tablet_map(schema()->id());
|
||||
auto* old_tablet_map = std::exchange(_tablet_map, new_tablet_map);
|
||||
|
||||
@@ -3142,7 +3264,7 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
|
||||
if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
|
||||
utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
|
||||
}
|
||||
handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
|
||||
handle_tablet_merge_completion(old_erm, *old_tablet_map, *new_tablet_map);
|
||||
}
|
||||
|
||||
// Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
|
||||
@@ -3228,7 +3350,7 @@ void table::update_effective_replication_map(locator::effective_replication_map_
|
||||
};
|
||||
|
||||
if (uses_tablets()) {
|
||||
_sg_manager->update_effective_replication_map(*_erm, refresh_mutation_source);
|
||||
_sg_manager->update_effective_replication_map(old_erm, *_erm, refresh_mutation_source);
|
||||
}
|
||||
if (old_erm) {
|
||||
old_erm->invalidate();
|
||||
@@ -3690,7 +3812,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
|
||||
tlogger.debug("Taking snapshot of {}.{}: name={}", s->ks_name(), s->cf_name(), name);
|
||||
|
||||
std::vector<snapshot_sstable_set> sstable_sets(smp::count);
|
||||
std::vector<int64_t> tablet_counts(smp::count);
|
||||
|
||||
co_await writer->init();
|
||||
co_await smp::invoke_on_all([&] -> future<> {
|
||||
@@ -3698,7 +3819,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
|
||||
auto [tables, permit] = co_await t.snapshot_sstables();
|
||||
auto sstables_metadata = co_await t.get_sstables_manager().take_snapshot(std::move(tables), name);
|
||||
sstable_sets[this_shard_id()] = make_foreign(std::make_unique<utils::chunked_vector<sstables::sstable_snapshot_metadata>>(std::move(sstables_metadata)));
|
||||
tablet_counts[this_shard_id()] = t.calculate_tablet_count();
|
||||
});
|
||||
co_await writer->sync();
|
||||
|
||||
@@ -3712,12 +3832,13 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
|
||||
});
|
||||
tlogger.debug("snapshot {}: seal_snapshot", name);
|
||||
const auto& topology = sharded_db.local().get_token_metadata().get_topology();
|
||||
std::optional<int64_t> min_tablet_count;
|
||||
std::optional<int64_t> tablet_count;
|
||||
if (t.uses_tablets()) {
|
||||
SCYLLA_ASSERT(!tablet_counts.empty());
|
||||
min_tablet_count = *std::ranges::min_element(tablet_counts);
|
||||
auto erm = t.get_effective_replication_map();
|
||||
auto& tm = erm->get_token_metadata().tablets().get_tablet_map(s->id());
|
||||
tablet_count = tm.tablet_count();
|
||||
}
|
||||
co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, min_tablet_count).handle_exception([&] (std::exception_ptr ptr) {
|
||||
co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, tablet_count).handle_exception([&] (std::exception_ptr ptr) {
|
||||
tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr);
|
||||
ex = std::move(ptr);
|
||||
});
|
||||
@@ -3775,6 +3896,7 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
|
||||
auto close_lister = deferred_close(lister);
|
||||
while (auto de = lister.get().get()) {
|
||||
auto snapshot_name = de->name;
|
||||
all_snapshots.emplace(snapshot_name, snapshot_details());
|
||||
@@ -3782,6 +3904,9 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
utils::get_local_injector().inject("get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in get_snapshot_details");
|
||||
}).get();
|
||||
}
|
||||
}
|
||||
return all_snapshots;
|
||||
@@ -3801,53 +3926,66 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
|
||||
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
utils::get_local_injector().inject("per-snapshot-get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in per-snapshot-get_snapshot_details");
|
||||
}).get();
|
||||
|
||||
// The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
co_return false;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await lister.close();
|
||||
if (ex) {
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
|
||||
co_return details;
|
||||
|
||||
@@ -263,8 +263,9 @@ public:
|
||||
void enable_schema_commitlog() {
|
||||
_static_props.enable_schema_commitlog();
|
||||
}
|
||||
void set_is_group0_table(bool enabled = true) {
|
||||
_static_props.is_group0_table = enabled;
|
||||
void set_is_group0_table() {
|
||||
_static_props.is_group0_table = true;
|
||||
enable_schema_commitlog();
|
||||
}
|
||||
|
||||
class default_names {
|
||||
|
||||
@@ -454,7 +454,7 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
|
||||
auto ps_ptr = qp.get_prepared(cache_key);
|
||||
if (!ps_ptr) {
|
||||
const auto msg_ptr = co_await qp.prepare(req, qs, cql3::internal_dialect());
|
||||
ps_ptr = std::move(msg_ptr->get_prepared());
|
||||
ps_ptr = msg_ptr->get_prepared();
|
||||
if (!ps_ptr) {
|
||||
on_internal_error(paxos_state::logger, "prepared statement is null");
|
||||
}
|
||||
|
||||
@@ -350,6 +350,10 @@ static void ensure_group0_schema(const group0_command& cmd, const replica::datab
|
||||
if (!schema->static_props().is_group0_table) {
|
||||
on_internal_error(slogger, fmt::format("ensure_group0_schema: schema is not group0: {}", schema->cf_name()));
|
||||
}
|
||||
|
||||
if (!schema->static_props().use_schema_commitlog) {
|
||||
on_internal_error(slogger, fmt::format("ensure_group0_schema: group0 table {} does not use schema commitlog", schema->cf_name()));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -559,6 +559,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
|
||||
group0_id = g0_info.group0_id;
|
||||
raft::server_address my_addr{my_id, {}};
|
||||
|
||||
bool starting_server_as_follower = false;
|
||||
if (server == nullptr) {
|
||||
// This is the first time discovery is run. Create and start a Raft server for group 0 on this node.
|
||||
raft::configuration initial_configuration;
|
||||
@@ -586,6 +587,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
|
||||
// trigger an empty snapshot transfer.
|
||||
nontrivial_snapshot = true;
|
||||
} else {
|
||||
starting_server_as_follower = true;
|
||||
co_await handshaker->pre_server_start(g0_info);
|
||||
}
|
||||
|
||||
@@ -614,7 +616,9 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
|
||||
}
|
||||
|
||||
SCYLLA_ASSERT(server);
|
||||
if (server->get_configuration().contains(my_id)) {
|
||||
co_await utils::get_local_injector().inject("join_group0_pause_before_config_check",
|
||||
utils::wait_for_message(std::chrono::minutes{5}));
|
||||
if (!starting_server_as_follower && server->get_configuration().contains(my_id)) {
|
||||
// True if we started a new group or completed a configuration change initiated earlier.
|
||||
group0_log.info("server {} already in group 0 (id {}) as {}", my_id, group0_id,
|
||||
server->get_configuration().can_vote(my_id)? "voter" : "non-voter");
|
||||
|
||||
@@ -987,7 +987,7 @@ future<> storage_service::merge_topology_snapshot(raft_snapshot snp) {
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
|
||||
} else {
|
||||
co_await for_each_split_mutation(std::move(mut), max_size, [&] (mutation m) -> future<> {
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
|
||||
frozen_muts_to_apply.push_back(co_await freeze_gently(m));
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -5003,6 +5003,8 @@ future<> storage_service::drain() {
|
||||
}
|
||||
|
||||
future<> storage_service::do_drain() {
|
||||
co_await utils::get_local_injector().inject("storage_service_drain_wait", utils::wait_for_message(60s));
|
||||
|
||||
// Need to stop transport before group0, otherwise RPCs may fail with raft_group_not_found.
|
||||
co_await stop_transport();
|
||||
|
||||
@@ -6056,6 +6058,8 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
|
||||
});
|
||||
};
|
||||
|
||||
co_await utils::get_local_injector().inject("tablet_split_monitor_wait", utils::wait_for_message(1min));
|
||||
|
||||
exponential_backoff_retry split_retry = exponential_backoff_retry(std::chrono::seconds(5), std::chrono::seconds(300));
|
||||
|
||||
while (!_async_gate.is_closed() && !_group0_as.abort_requested()) {
|
||||
@@ -6090,6 +6094,9 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
|
||||
} catch (raft::request_aborted& ex) {
|
||||
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
|
||||
break;
|
||||
} catch (seastar::gate_closed_exception& ex) {
|
||||
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
|
||||
break;
|
||||
} catch (...) {
|
||||
slogger.error("Failed to complete splitting of table {} due to {}, retrying after {} seconds",
|
||||
table, std::current_exception(), split_retry.sleep_time());
|
||||
@@ -6156,6 +6163,57 @@ future<> storage_service::snitch_reconfigured() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::local_topology_barrier() {
|
||||
if (this_shard_id() != 0) {
|
||||
co_await container().invoke_on(0, [] (storage_service& ss) {
|
||||
return ss.local_topology_barrier();
|
||||
});
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto version = _topology_state_machine._topology.version;
|
||||
|
||||
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
|
||||
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
|
||||
version, current_version);
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// parallel to this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
}
|
||||
|
||||
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, const raft_topology_cmd& cmd) {
|
||||
raft_topology_cmd_result result;
|
||||
rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
|
||||
@@ -6183,12 +6241,6 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
state.last_index = cmd_index;
|
||||
}
|
||||
|
||||
// We capture the topology version right after the checks
|
||||
// above, before any yields. This is crucial since _topology_state_machine._topology
|
||||
// might be altered concurrently while this method is running,
|
||||
// which can cause the fence command to apply an invalid fence version.
|
||||
const auto version = _topology_state_machine._topology.version;
|
||||
|
||||
switch (cmd.cmd) {
|
||||
case raft_topology_cmd::command::barrier: {
|
||||
utils::get_local_injector().inject("raft_topology_barrier_fail",
|
||||
@@ -6227,43 +6279,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
}
|
||||
break;
|
||||
case raft_topology_cmd::command::barrier_and_drain: {
|
||||
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
|
||||
version, current_version);
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// parallel to this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
co_await local_topology_barrier();
|
||||
|
||||
co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
|
||||
auto ks = handler.get("keyspace");
|
||||
@@ -7359,34 +7375,8 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
|
||||
if (!table) {
|
||||
continue;
|
||||
}
|
||||
auto erm = table->get_effective_replication_map();
|
||||
auto& token_metadata = erm->get_token_metadata();
|
||||
auto me = locator::tablet_replica { token_metadata.get_my_id(), this_shard_id() };
|
||||
|
||||
// It's important to tackle the anomaly in reported size, since both leaving and
|
||||
// pending replicas could otherwise be accounted during tablet migration.
|
||||
// If transition hasn't reached cleanup stage, then leaving replicas are accounted.
|
||||
// If transition is past cleanup stage, then pending replicas are accounted.
|
||||
// This helps to reduce the discrepancy window.
|
||||
auto tablet_filter = [&me] (const locator::tablet_map& tmap, locator::global_tablet_id id) {
|
||||
auto transition = tmap.get_tablet_transition_info(id.tablet);
|
||||
auto& info = tmap.get_tablet_info(id.tablet);
|
||||
|
||||
// if tablet is not in transit, it's filtered in.
|
||||
if (!transition) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool is_pending = transition->pending_replica == me;
|
||||
bool is_leaving = locator::get_leaving_replica(info, *transition) == me;
|
||||
auto s = transition->reads; // read selector
|
||||
|
||||
return (!is_pending && !is_leaving)
|
||||
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|
||||
|| (is_pending && s == locator::read_replica_set_selector::next);
|
||||
};
|
||||
|
||||
locator::combined_load_stats combined_ls { table->table_load_stats(tablet_filter) };
|
||||
locator::combined_load_stats combined_ls { table->table_load_stats() };
|
||||
load_stats.tables.emplace(id, std::move(combined_ls.table_ls));
|
||||
tablet_sizes_per_shard[this_shard_id()].size += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);
|
||||
|
||||
|
||||
@@ -944,6 +944,9 @@ public:
|
||||
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
|
||||
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
|
||||
future<> initialize_done_topology_upgrade_state();
|
||||
// Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
|
||||
// In particular, waits for non-latest local erms to go die.
|
||||
future<> local_topology_barrier();
|
||||
private:
|
||||
// State machine that is responsible for topology change
|
||||
topology_state_machine& _topology_state_machine;
|
||||
|
||||
@@ -21,7 +21,6 @@ namespace service {
|
||||
|
||||
struct status_helper {
|
||||
tasks::task_status status;
|
||||
utils::chunked_vector<locator::tablet_id> tablets;
|
||||
std::optional<locator::tablet_replica> pending_replica;
|
||||
};
|
||||
|
||||
@@ -141,27 +140,54 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
|
||||
auto task_type = hint.get_task_type();
|
||||
auto tablet_id_opt = tablet_id_provided(task_type) ? std::make_optional(hint.get_tablet_id()) : std::nullopt;
|
||||
|
||||
size_t tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
|
||||
const auto& tablets = _ss.get_token_metadata().tablets();
|
||||
size_t tablet_count = tablets.has_tablet_map(table) ? tablets.get_tablet_map(table).tablet_count() : 0;
|
||||
auto res = co_await get_status_helper(id, std::move(hint));
|
||||
if (!res) {
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
tasks::tmlogger.info("tablet_virtual_task: wait until tablet operation is finished");
|
||||
co_await _ss._topology_state_machine.event.wait([&] {
|
||||
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
|
||||
if (is_resize_task(task_type)) { // Resize task.
|
||||
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
|
||||
} else if (tablet_id_opt.has_value()) { // Migration task.
|
||||
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
|
||||
} else { // Repair task.
|
||||
return std::all_of(res->tablets.begin(), res->tablets.end(), [&] (const locator::tablet_id& tablet) {
|
||||
return tmap.get_tablet_info(tablet).repair_task_info.tablet_task_id.uuid() != id.uuid();
|
||||
});
|
||||
co_await utils::get_local_injector().inject("tablet_virtual_task_wait", utils::wait_for_message(60s));
|
||||
while (true) {
|
||||
co_await _ss._topology_state_machine.event.wait([&] {
|
||||
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
|
||||
return true;
|
||||
}
|
||||
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
|
||||
if (is_resize_task(task_type)) { // Resize task.
|
||||
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
|
||||
} else if (tablet_id_opt.has_value()) { // Migration task.
|
||||
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
|
||||
} else { // Repair task.
|
||||
return true;
|
||||
}
|
||||
});
|
||||
|
||||
if (!is_repair_task(task_type)) {
|
||||
break;
|
||||
}
|
||||
});
|
||||
|
||||
auto tmptr = _ss.get_token_metadata_ptr();
|
||||
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
|
||||
break;
|
||||
}
|
||||
auto& tmap = tmptr->tablets().get_tablet_map(table);
|
||||
bool repair_still_running = false;
|
||||
co_await tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& info) {
|
||||
repair_still_running = repair_still_running || (info.repair_task_info.is_valid() && info.repair_task_info.tablet_task_id.uuid() == id.uuid());
|
||||
return make_ready_future();
|
||||
});
|
||||
if (!repair_still_running) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
res->status.state = tasks::task_manager::task_state::done; // Failed repair task is retried.
|
||||
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
|
||||
res->status.end_time = db_clock::now();
|
||||
co_return res->status;
|
||||
}
|
||||
if (is_migration_task(task_type)) {
|
||||
auto& replicas = _ss.get_token_metadata().tablets().get_tablet_map(table).get_tablet_info(tablet_id_opt.value()).replicas;
|
||||
auto migration_failed = std::all_of(replicas.begin(), replicas.end(), [&] (const auto& replica) { return res->pending_replica.has_value() && replica != res->pending_replica.value(); });
|
||||
@@ -169,9 +195,9 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
|
||||
} else if (is_resize_task(task_type)) {
|
||||
auto new_tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
|
||||
res->status.state = new_tablet_count == tablet_count ? tasks::task_manager::task_state::suspended : tasks::task_manager::task_state::done;
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
} else {
|
||||
res->status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
res->status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
}
|
||||
res->status.end_time = db_clock::now(); // FIXME: Get precise end time.
|
||||
co_return res->status;
|
||||
@@ -244,7 +270,15 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
status_helper res;
|
||||
auto table = hint.get_table_id();
|
||||
auto task_type = hint.get_task_type();
|
||||
auto schema = _ss._db.local().get_tables_metadata().get_table(table).schema();
|
||||
auto table_ptr = _ss._db.local().get_tables_metadata().get_table_if_exists(table);
|
||||
if (!table_ptr) {
|
||||
co_return tasks::task_status {
|
||||
.task_id = id,
|
||||
.kind = tasks::task_kind::cluster,
|
||||
.is_abortable = co_await is_abortable(std::move(hint)),
|
||||
};
|
||||
}
|
||||
auto schema = table_ptr->schema();
|
||||
res.status = {
|
||||
.task_id = id,
|
||||
.kind = tasks::task_kind::cluster,
|
||||
@@ -257,6 +291,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
auto& tmap = tmptr->tablets().get_tablet_map(table);
|
||||
bool repair_task_finished = false;
|
||||
bool repair_task_pending = false;
|
||||
bool no_tablets_processed = true;
|
||||
if (is_repair_task(task_type)) {
|
||||
auto progress = co_await _ss._repair.local().get_tablet_repair_task_progress(id);
|
||||
if (progress) {
|
||||
@@ -273,37 +308,37 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
auto& task_info = info.repair_task_info;
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.tablets.push_back(tid);
|
||||
no_tablets_processed = false;
|
||||
}
|
||||
return make_ready_future();
|
||||
});
|
||||
res.status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
res.status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
} else if (is_migration_task(task_type)) { // Migration task.
|
||||
auto tablet_id = hint.get_tablet_id();
|
||||
res.pending_replica = tmap.get_tablet_transition_info(tablet_id)->pending_replica;
|
||||
auto& task_info = tmap.get_tablet_info(tablet_id).migration_task_info;
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.tablets.push_back(tablet_id);
|
||||
no_tablets_processed = false;
|
||||
}
|
||||
} else { // Resize task.
|
||||
auto& task_info = tmap.resize_task_info();
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.status.state = tasks::task_manager::task_state::running;
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
co_return res;
|
||||
}
|
||||
}
|
||||
|
||||
if (!res.tablets.empty()) {
|
||||
if (!no_tablets_processed) {
|
||||
res.status.state = sched_nr == 0 ? tasks::task_manager::task_state::created : tasks::task_manager::task_state::running;
|
||||
co_return res;
|
||||
}
|
||||
|
||||
if (repair_task_pending) {
|
||||
// When repair_task_pending is true, the res.tablets will be empty iff the request is aborted by user.
|
||||
res.status.state = res.tablets.empty() ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
|
||||
res.status.state = no_tablets_processed ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
|
||||
co_return res;
|
||||
}
|
||||
if (repair_task_finished) {
|
||||
|
||||
@@ -1070,6 +1070,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
|
||||
co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> {
|
||||
auto last_token = new_tablet_map.get_last_token(tablet_id);
|
||||
auto old_tablet_info = old_tablets.get_tablet_info(last_token);
|
||||
auto abandoning_replicas = locator::substract_sets(old_tablet_info.replicas, tablet_info.replicas);
|
||||
auto new_replicas = locator::substract_sets(tablet_info.replicas, old_tablet_info.replicas);
|
||||
if (abandoning_replicas.size() + new_replicas.size() > 1) {
|
||||
throw std::runtime_error(fmt::format("Invalid state of a tablet {} of a table {}.{}. Expected replication factor: {}, but the tablet has replicas only on {}. "
|
||||
"Try again later or use the \"Fixing invalid replica state with RF change\" procedure to fix the problem.", tablet_id, ks_name, table_or_mv->cf_name(),
|
||||
ks.get_replication_strategy().get_replication_factor(*tmptr), old_tablet_info.replicas));
|
||||
}
|
||||
|
||||
updates.emplace_back(co_await make_canonical_mutation_gently(
|
||||
replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id())
|
||||
.set_new_replicas(last_token, tablet_info.replicas)
|
||||
@@ -1079,8 +1088,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
));
|
||||
|
||||
// Calculate abandoning replica and abort view building tasks on them
|
||||
auto old_tablet_info = old_tablets.get_tablet_info(last_token);
|
||||
auto abandoning_replicas = locator::substract_sets(old_tablet_info.replicas, tablet_info.replicas);
|
||||
if (!abandoning_replicas.empty()) {
|
||||
if (abandoning_replicas.size() != 1) {
|
||||
on_internal_error(rtlogger, fmt::format("Keyspace RF abandons {} replicas for table {} and tablet id {}", abandoning_replicas.size(), table_or_mv->id(), tablet_id));
|
||||
@@ -2193,6 +2200,19 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
_tablet_allocator.set_load_stats(reconciled_stats);
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for the background storage group merge to finish before releasing the state machine.
|
||||
// Background merge holds the old erm, so a successful barrier joins with it.
|
||||
// This guarantees that the background merge doesn't run concurrently with the next merge.
|
||||
// Replica-side storage group merge takes compaction locks on the tablet's main compaction group, released
|
||||
// by the background merge. If the next merge starts before the background merge finishes, it can cause a deadlock.
|
||||
// The background merge fiber will try to stop a compaction group which is locked, and the lock is held
|
||||
// by the background merge fiber.
|
||||
tm = nullptr;
|
||||
if (!guard) {
|
||||
guard = co_await start_operation();
|
||||
}
|
||||
co_await global_tablet_token_metadata_barrier(std::move(guard));
|
||||
}
|
||||
|
||||
future<> handle_truncate_table(group0_guard guard) {
|
||||
@@ -2469,7 +2489,13 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
}
|
||||
|
||||
// If there is no other work, evaluate load and start tablet migration if there is imbalance.
|
||||
if (co_await maybe_start_tablet_migration(std::move(guard))) {
|
||||
if (auto guard_opt = co_await maybe_start_tablet_migration(std::move(guard)); !guard_opt) {
|
||||
co_return true;
|
||||
} else {
|
||||
guard = std::move(*guard_opt);
|
||||
}
|
||||
|
||||
if (co_await maybe_retry_failed_rf_change_tablet_rebuilds(std::move(guard))) {
|
||||
co_return true;
|
||||
}
|
||||
co_return false;
|
||||
@@ -3674,11 +3700,14 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
// Returns the guard if no work done. Otherwise, performs a table migration and consumes the guard.
|
||||
future<std::optional<group0_guard>> maybe_migrate_system_tables(group0_guard guard);
|
||||
|
||||
// Returns true if the state machine was transitioned into tablet migration path.
|
||||
future<bool> maybe_start_tablet_migration(group0_guard);
|
||||
// Returns the guard if no work done. Otherwise, transitions the state machine into tablet migration path.
|
||||
future<std::optional<group0_guard>> maybe_start_tablet_migration(group0_guard);
|
||||
|
||||
// Returns true if the state machine was transitioned into tablet resize finalization path.
|
||||
future<bool> maybe_start_tablet_resize_finalization(group0_guard, const table_resize_plan& plan);
|
||||
// Returns the guard if no work done. Otherwise, transitions the state machine into tablet resize finalization path.
|
||||
future<std::optional<group0_guard>> maybe_start_tablet_resize_finalization(group0_guard, const table_resize_plan& plan);
|
||||
|
||||
// Returns true if the state machine was transitioned into tablet migration path.
|
||||
future<bool> maybe_retry_failed_rf_change_tablet_rebuilds(group0_guard guard);
|
||||
|
||||
future<> refresh_tablet_load_stats();
|
||||
future<> start_tablet_load_stats_refresher();
|
||||
@@ -3790,14 +3819,14 @@ future<std::optional<group0_guard>> topology_coordinator::maybe_migrate_system_t
|
||||
co_return std::move(guard);
|
||||
}
|
||||
|
||||
future<bool> topology_coordinator::maybe_start_tablet_migration(group0_guard guard) {
|
||||
future<std::optional<group0_guard>> topology_coordinator::maybe_start_tablet_migration(group0_guard guard) {
|
||||
rtlogger.debug("Evaluating tablet balance");
|
||||
|
||||
auto tm = get_token_metadata_ptr();
|
||||
auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes());
|
||||
if (plan.empty()) {
|
||||
rtlogger.debug("Tablet load balancer did not make any plan");
|
||||
co_return false;
|
||||
co_return std::move(guard);
|
||||
}
|
||||
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
@@ -3817,15 +3846,15 @@ future<bool> topology_coordinator::maybe_start_tablet_migration(group0_guard gua
|
||||
.build());
|
||||
|
||||
co_await update_topology_state(std::move(guard), std::move(updates), "Starting tablet migration");
|
||||
co_return true;
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
future<bool> topology_coordinator::maybe_start_tablet_resize_finalization(group0_guard guard, const table_resize_plan& plan) {
|
||||
future<std::optional<group0_guard>> topology_coordinator::maybe_start_tablet_resize_finalization(group0_guard guard, const table_resize_plan& plan) {
|
||||
if (plan.finalize_resize.empty()) {
|
||||
co_return false;
|
||||
co_return std::move(guard);
|
||||
}
|
||||
if (utils::get_local_injector().enter("tablet_split_finalization_postpone")) {
|
||||
co_return false;
|
||||
co_return std::move(guard);
|
||||
}
|
||||
|
||||
auto resize_finalization_transition_state = [this] {
|
||||
@@ -3841,6 +3870,73 @@ future<bool> topology_coordinator::maybe_start_tablet_resize_finalization(group0
|
||||
.build());
|
||||
|
||||
co_await update_topology_state(std::move(guard), std::move(updates), "Started tablet resize finalization");
|
||||
co_return std::nullopt;
|
||||
}
|
||||
|
||||
future<bool> topology_coordinator::maybe_retry_failed_rf_change_tablet_rebuilds(group0_guard guard) {
|
||||
rtlogger.debug("Retrying failed rebuilds");
|
||||
|
||||
if (utils::get_local_injector().enter("maybe_retry_failed_rf_change_tablet_rebuilds_skip")) {
|
||||
rtlogger.debug("Skipping retrying failed rebuilds due to error injection");
|
||||
co_return false;
|
||||
}
|
||||
|
||||
auto tmptr = get_token_metadata_ptr();
|
||||
utils::chunked_vector<canonical_mutation> updates;
|
||||
for (auto& ks_name : _db.get_tablets_keyspaces()) {
|
||||
auto& ks = _db.find_keyspace(ks_name);
|
||||
auto& strategy = ks.get_replication_strategy();
|
||||
auto tables_with_mvs = ks.metadata()->tables();
|
||||
auto views = ks.metadata()->views();
|
||||
tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
|
||||
for (const auto& table_or_mv : tables_with_mvs) {
|
||||
if (!tmptr->tablets().is_base_table(table_or_mv->id())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto& tablet_map = tmptr->tablets().get_tablet_map(table_or_mv->id());
|
||||
auto new_tablet_map = co_await strategy.maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, co_await tablet_map.clone_gently());
|
||||
|
||||
replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
|
||||
co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> {
|
||||
auto& replicas = tablet_map.get_tablet_info(tablet_id).replicas;
|
||||
auto it = std::find_if(tablet_info.replicas.begin(), tablet_info.replicas.end(), [&](const auto& replica) {
|
||||
return std::find(replicas.begin(), replicas.end(), replica) == replicas.end();
|
||||
});
|
||||
if (it == tablet_info.replicas.end()) {
|
||||
co_return;
|
||||
}
|
||||
auto new_replicas = replicas;
|
||||
new_replicas.push_back(*it);
|
||||
auto last_token = new_tablet_map.get_last_token(tablet_id);
|
||||
updates.emplace_back(co_await make_canonical_mutation_gently(
|
||||
replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id())
|
||||
.set_new_replicas(last_token, new_replicas)
|
||||
.set_stage(last_token, locator::tablet_transition_stage::allow_write_both_read_old)
|
||||
.set_transition(last_token, locator::choose_rebuild_transition_kind(_feature_service))
|
||||
.build()
|
||||
));
|
||||
});
|
||||
}
|
||||
|
||||
if (!updates.empty()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (updates.empty()) {
|
||||
rtlogger.debug("No failed RF change rebuilds to retry");
|
||||
co_return false;
|
||||
}
|
||||
|
||||
updates.emplace_back(
|
||||
topology_mutation_builder(guard.write_timestamp())
|
||||
.set_transition_state(topology::transition_state::tablet_migration)
|
||||
.set_version(_topo_sm._topology.version + 1)
|
||||
.build());
|
||||
|
||||
sstring reason = "Retry failed tablet rebuilds";
|
||||
co_await update_topology_state(std::move(guard), std::move(updates), reason);
|
||||
co_return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -18,6 +18,13 @@ class service_permit {
|
||||
friend service_permit empty_service_permit();
|
||||
public:
|
||||
size_t count() const { return _permit ? _permit->count() : 0; };
|
||||
// Merge additional semaphore units into this permit.
|
||||
// Used to grow the permit after the actual resource cost is known.
|
||||
void adopt(seastar::semaphore_units<>&& units) {
|
||||
if (_permit) {
|
||||
_permit->adopt(std::move(units));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
inline service_permit make_service_permit(seastar::semaphore_units<>&& permit) {
|
||||
|
||||
@@ -201,95 +201,47 @@ public:
|
||||
virtual future<std::optional<entry_info>> next_entry() = 0;
|
||||
};
|
||||
|
||||
// Allocated inside LSA.
|
||||
class promoted_index {
|
||||
deletion_time _del_time;
|
||||
uint64_t _promoted_index_start;
|
||||
uint32_t _promoted_index_size;
|
||||
uint32_t _num_blocks;
|
||||
public:
|
||||
promoted_index(const schema& s,
|
||||
deletion_time del_time,
|
||||
uint64_t promoted_index_start,
|
||||
uint32_t promoted_index_size,
|
||||
uint32_t num_blocks)
|
||||
: _del_time{del_time}
|
||||
, _promoted_index_start(promoted_index_start)
|
||||
, _promoted_index_size(promoted_index_size)
|
||||
, _num_blocks(num_blocks)
|
||||
{ }
|
||||
|
||||
[[nodiscard]] deletion_time get_deletion_time() const { return _del_time; }
|
||||
[[nodiscard]] uint32_t get_promoted_index_size() const { return _promoted_index_size; }
|
||||
|
||||
// Call under allocating_section.
|
||||
// For sstable versions >= mc the returned cursor will be of type `bsearch_clustered_cursor`.
|
||||
std::unique_ptr<clustered_index_cursor> make_cursor(shared_sstable,
|
||||
reader_permit,
|
||||
tracing::trace_state_ptr,
|
||||
file_input_stream_options,
|
||||
use_caching);
|
||||
// Promoted index information produced by the parser.
|
||||
struct parsed_promoted_index_entry {
|
||||
deletion_time del_time;
|
||||
uint64_t promoted_index_start;
|
||||
uint32_t promoted_index_size;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
using promoted_index = parsed_promoted_index_entry;
|
||||
|
||||
// A partition index element.
|
||||
// Allocated inside LSA.
|
||||
class index_entry {
|
||||
private:
|
||||
managed_bytes _key;
|
||||
mutable std::optional<dht::token> _token;
|
||||
uint64_t _position;
|
||||
managed_ref<promoted_index> _index;
|
||||
struct [[gnu::packed]] index_entry {
|
||||
mutable int64_t raw_token;
|
||||
uint64_t data_file_offset;
|
||||
uint32_t key_offset;
|
||||
|
||||
public:
|
||||
|
||||
key_view get_key() const {
|
||||
return key_view{_key};
|
||||
}
|
||||
|
||||
// May allocate so must be called under allocating_section.
|
||||
decorated_key_view get_decorated_key(const schema& s) const {
|
||||
if (!_token) {
|
||||
_token.emplace(s.get_partitioner().get_token(get_key()));
|
||||
}
|
||||
return decorated_key_view(*_token, get_key());
|
||||
}
|
||||
|
||||
uint64_t position() const { return _position; };
|
||||
|
||||
std::optional<deletion_time> get_deletion_time() const {
|
||||
if (_index) {
|
||||
return _index->get_deletion_time();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
index_entry(managed_bytes&& key, uint64_t position, managed_ref<promoted_index>&& index)
|
||||
: _key(std::move(key))
|
||||
, _position(position)
|
||||
, _index(std::move(index))
|
||||
{}
|
||||
|
||||
index_entry(index_entry&&) = default;
|
||||
index_entry& operator=(index_entry&&) = default;
|
||||
|
||||
// Can be nullptr
|
||||
const managed_ref<promoted_index>& get_promoted_index() const { return _index; }
|
||||
managed_ref<promoted_index>& get_promoted_index() { return _index; }
|
||||
uint32_t get_promoted_index_size() const { return _index ? _index->get_promoted_index_size() : 0; }
|
||||
|
||||
size_t external_memory_usage() const {
|
||||
return _key.external_memory_usage() + _index.external_memory_usage();
|
||||
}
|
||||
uint64_t position() const { return data_file_offset; }
|
||||
dht::raw_token token() const { return dht::raw_token(raw_token); }
|
||||
};
|
||||
|
||||
// Required for optimized LSA migration of storage of managed_vector.
|
||||
static_assert(std::is_trivially_move_assignable_v<index_entry>);
|
||||
static_assert(std::is_trivially_move_assignable_v<parsed_promoted_index_entry>);
|
||||
|
||||
// A partition index page.
|
||||
//
|
||||
// Allocated in the standard allocator space but with an LSA allocator as the current allocator.
|
||||
// So the shallow part is in the standard allocator but all indirect objects are inside LSA.
|
||||
class partition_index_page {
|
||||
public:
|
||||
lsa::chunked_managed_vector<managed_ref<index_entry>> _entries;
|
||||
lsa::chunked_managed_vector<index_entry> _entries;
|
||||
managed_bytes _key_storage;
|
||||
|
||||
// Stores promoted index information of index entries.
|
||||
// The i-th element corresponds to the i-th entry in _entries.
|
||||
// Can be smaller than _entries. If _entries[i] doesn't have a matching element in _promoted_indexes then
|
||||
// that entry doesn't have a promoted index.
|
||||
// Kept separately to avoid paying for storage cost in pages where no entry has a promoted index,
|
||||
// which is typical in workloads with small partitions.
|
||||
lsa::chunked_managed_vector<promoted_index> _promoted_indexes;
|
||||
public:
|
||||
partition_index_page() = default;
|
||||
partition_index_page(partition_index_page&&) noexcept = default;
|
||||
@@ -298,15 +250,68 @@ public:
|
||||
bool empty() const { return _entries.empty(); }
|
||||
size_t size() const { return _entries.size(); }
|
||||
|
||||
stop_iteration clear_gently() {
|
||||
// Vectors have trivial storage, so are fast to destroy.
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
|
||||
void clear_one_entry() {
|
||||
_entries.pop_back();
|
||||
}
|
||||
|
||||
bool has_promoted_index(size_t i) const {
|
||||
return i < _promoted_indexes.size() && _promoted_indexes[i].promoted_index_size > 0;
|
||||
}
|
||||
|
||||
/// Get promoted index for the i-th entry.
|
||||
/// Call only when has_promoted_index(i) is true.
|
||||
const promoted_index& get_promoted_index(size_t i) const {
|
||||
return _promoted_indexes[i];
|
||||
}
|
||||
|
||||
/// Get promoted index for the i-th entry.
|
||||
/// Call only when has_promoted_index(i) is true.
|
||||
promoted_index& get_promoted_index(size_t i) {
|
||||
return _promoted_indexes[i];
|
||||
}
|
||||
|
||||
/// Get promoted index size for the i-th entry.
|
||||
uint32_t get_promoted_index_size(size_t i) const {
|
||||
return has_promoted_index(i) ? get_promoted_index(i).promoted_index_size : 0;
|
||||
}
|
||||
|
||||
/// Get deletion_time for partition represented by the i-th entry.
|
||||
/// Returns disengaged optional if the entry doesn't have a promoted index, so we don't know the deletion_time.
|
||||
/// It has to be read from the data file.
|
||||
std::optional<deletion_time> get_deletion_time(size_t i) const {
|
||||
if (has_promoted_index(i)) {
|
||||
return get_promoted_index(i).del_time;
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
key_view get_key(size_t i) const {
|
||||
auto start = _entries[i].key_offset;
|
||||
auto end = i + 1 < _entries.size() ? _entries[i + 1].key_offset : _key_storage.size();
|
||||
auto v = managed_bytes_view(_key_storage).prefix(end);
|
||||
v.remove_prefix(start);
|
||||
return key_view(v);
|
||||
}
|
||||
|
||||
decorated_key_view get_decorated_key(const schema& s, size_t i) const {
|
||||
auto key = get_key(i);
|
||||
auto t = _entries[i].token();
|
||||
if (!t) {
|
||||
t = dht::raw_token(s.get_partitioner().get_token(key));
|
||||
_entries[i].raw_token = t.value;
|
||||
}
|
||||
return decorated_key_view(dht::token(t), key);
|
||||
}
|
||||
|
||||
size_t external_memory_usage() const {
|
||||
size_t size = _entries.external_memory_usage();
|
||||
for (auto&& e : _entries) {
|
||||
size += sizeof(index_entry) + e->external_memory_usage();
|
||||
}
|
||||
size += _promoted_indexes.external_memory_usage();
|
||||
size += _key_storage.external_memory_usage();
|
||||
return size;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -25,14 +25,6 @@ namespace sstables {
|
||||
extern seastar::logger sstlog;
|
||||
extern thread_local mc::cached_promoted_index::metrics promoted_index_cache_metrics;
|
||||
|
||||
// Promoted index information produced by the parser.
|
||||
struct parsed_promoted_index_entry {
|
||||
deletion_time del_time;
|
||||
uint64_t promoted_index_start;
|
||||
uint32_t promoted_index_size;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
// Partition index entry information produced by the parser.
|
||||
struct parsed_partition_index_entry {
|
||||
temporary_buffer<char> key;
|
||||
@@ -53,9 +45,10 @@ class index_consumer {
|
||||
schema_ptr _s;
|
||||
logalloc::allocating_section _alloc_section;
|
||||
logalloc::region& _region;
|
||||
utils::chunked_vector<parsed_partition_index_entry> _parsed_entries;
|
||||
size_t _max_promoted_index_entry_plus_one = 0; // Highest index +1 in _parsed_entries which has a promoted index.
|
||||
size_t _key_storage_size = 0;
|
||||
public:
|
||||
index_list indexes;
|
||||
|
||||
index_consumer(logalloc::region& r, schema_ptr s)
|
||||
: _s(s)
|
||||
, _alloc_section(abstract_formatter([s] (fmt::format_context& ctx) {
|
||||
@@ -64,36 +57,63 @@ public:
|
||||
, _region(r)
|
||||
{ }
|
||||
|
||||
~index_consumer() {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
indexes._entries.clear_and_release();
|
||||
});
|
||||
void consume_entry(parsed_partition_index_entry&& e) {
|
||||
_key_storage_size += e.key.size();
|
||||
_parsed_entries.emplace_back(std::move(e));
|
||||
if (e.promoted_index) {
|
||||
_max_promoted_index_entry_plus_one = std::max(_max_promoted_index_entry_plus_one, _parsed_entries.size());
|
||||
}
|
||||
}
|
||||
|
||||
void consume_entry(parsed_partition_index_entry&& e) {
|
||||
_alloc_section(_region, [&] {
|
||||
future<index_list> finalize() {
|
||||
index_list result;
|
||||
// In case of exception, need to deallocate under region allocator.
|
||||
auto delete_result = seastar::defer([&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
managed_ref<promoted_index> pi;
|
||||
if (e.promoted_index) {
|
||||
pi = make_managed<promoted_index>(*_s,
|
||||
e.promoted_index->del_time,
|
||||
e.promoted_index->promoted_index_start,
|
||||
e.promoted_index->promoted_index_size,
|
||||
e.promoted_index->num_blocks);
|
||||
}
|
||||
auto key = managed_bytes(reinterpret_cast<const bytes::value_type*>(e.key.get()), e.key.size());
|
||||
indexes._entries.emplace_back(make_managed<index_entry>(std::move(key), e.data_file_offset, std::move(pi)));
|
||||
result._entries = {};
|
||||
result._promoted_indexes = {};
|
||||
result._key_storage = {};
|
||||
});
|
||||
});
|
||||
auto i = _parsed_entries.begin();
|
||||
size_t key_offset = 0;
|
||||
while (i != _parsed_entries.end()) {
|
||||
_alloc_section(_region, [&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
result._entries.reserve(_parsed_entries.size());
|
||||
result._promoted_indexes.resize(_max_promoted_index_entry_plus_one);
|
||||
if (result._key_storage.empty()) {
|
||||
result._key_storage = managed_bytes(managed_bytes::initialized_later(), _key_storage_size);
|
||||
}
|
||||
managed_bytes_mutable_view key_out(result._key_storage);
|
||||
key_out.remove_prefix(key_offset);
|
||||
while (i != _parsed_entries.end()) {
|
||||
parsed_partition_index_entry& e = *i;
|
||||
if (e.promoted_index) {
|
||||
result._promoted_indexes[result._entries.size()] = *e.promoted_index;
|
||||
}
|
||||
write_fragmented(key_out, std::string_view(e.key.begin(), e.key.size()));
|
||||
result._entries.emplace_back(index_entry{dht::raw_token().value, e.data_file_offset, key_offset});
|
||||
++i;
|
||||
key_offset += e.key.size();
|
||||
if (need_preempt()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
delete_result.cancel();
|
||||
_parsed_entries.clear();
|
||||
co_return std::move(result);
|
||||
}
|
||||
|
||||
void prepare(uint64_t size) {
|
||||
_alloc_section = logalloc::allocating_section();
|
||||
_alloc_section(_region, [&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
indexes._entries.reserve(size);
|
||||
});
|
||||
});
|
||||
_max_promoted_index_entry_plus_one = 0;
|
||||
_key_storage_size = 0;
|
||||
_parsed_entries.clear();
|
||||
_parsed_entries.reserve(size);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -198,10 +218,14 @@ public:
|
||||
|
||||
switch (_state) {
|
||||
// START comes first, to make the handling of the 0-quantity case simpler
|
||||
state_START:
|
||||
case state::START:
|
||||
sstlog.trace("{}: pos {} state {} - data.size()={}", fmt::ptr(this), current_pos(), state::START, data.size());
|
||||
_state = state::KEY_SIZE;
|
||||
break;
|
||||
if (data.size() == 0) {
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case state::KEY_SIZE:
|
||||
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::KEY_SIZE);
|
||||
_entry_offset = current_pos();
|
||||
@@ -227,7 +251,16 @@ public:
|
||||
case state::PROMOTED_SIZE:
|
||||
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::PROMOTED_SIZE);
|
||||
_position = this->_u64;
|
||||
if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
|
||||
if (is_mc_format() && data.size() && *data.begin() == 0) { // promoted_index_size == 0
|
||||
data.trim_front(1);
|
||||
_consumer.consume_entry(parsed_partition_index_entry{
|
||||
.key = std::move(_key),
|
||||
.data_file_offset = _position,
|
||||
.index_offset = _entry_offset,
|
||||
.promoted_index = std::nullopt
|
||||
});
|
||||
goto state_START;
|
||||
} else if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
|
||||
_state = state::PARTITION_HEADER_LENGTH_1;
|
||||
break;
|
||||
}
|
||||
@@ -339,33 +372,6 @@ inline file make_tracked_index_file(sstable& sst, reader_permit permit, tracing:
|
||||
return tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", sst.index_filename()));
|
||||
}
|
||||
|
||||
inline
|
||||
std::unique_ptr<clustered_index_cursor> promoted_index::make_cursor(shared_sstable sst,
|
||||
reader_permit permit,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
file_input_stream_options options,
|
||||
use_caching caching)
|
||||
{
|
||||
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
|
||||
seastar::shared_ptr<cached_file> cached_file_ptr = caching
|
||||
? sst->_cached_index_file
|
||||
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
|
||||
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
|
||||
sst->manager().get_cache_tracker().get_lru(),
|
||||
sst->manager().get_cache_tracker().region(),
|
||||
sst->_index_file_size);
|
||||
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
|
||||
_promoted_index_start, _promoted_index_size,
|
||||
promoted_index_cache_metrics, permit,
|
||||
sst->get_column_translation(), cached_file_ptr, _num_blocks, trace_state, sst->features());
|
||||
}
|
||||
|
||||
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
|
||||
auto promoted_index_stream = make_file_input_stream(std::move(file), _promoted_index_start, _promoted_index_size,options);
|
||||
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
|
||||
std::move(promoted_index_stream), _promoted_index_size, _num_blocks, std::nullopt);
|
||||
}
|
||||
|
||||
// Less-comparator for lookups in the partition index.
|
||||
class index_comparator {
|
||||
dht::ring_position_comparator_for_sstables _tri_cmp;
|
||||
@@ -376,27 +382,17 @@ public:
|
||||
return _tri_cmp(e.get_decorated_key(), rp) < 0;
|
||||
}
|
||||
|
||||
bool operator()(const index_entry& e, dht::ring_position_view rp) const {
|
||||
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) < 0;
|
||||
}
|
||||
|
||||
bool operator()(const managed_ref<index_entry>& e, dht::ring_position_view rp) const {
|
||||
return operator()(*e, rp);
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const managed_ref<index_entry>& e) const {
|
||||
return operator()(rp, *e);
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
|
||||
return _tri_cmp(e.get_decorated_key(), rp) > 0;
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const index_entry& e) const {
|
||||
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) > 0;
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
std::strong_ordering index_entry_tri_cmp(const schema& s, partition_index_page& page, size_t idx, dht::ring_position_view rp) {
|
||||
dht::ring_position_comparator_for_sstables tri_cmp(s);
|
||||
return tri_cmp(page.get_decorated_key(s, idx), rp);
|
||||
}
|
||||
|
||||
// Contains information about index_reader position in the index file
|
||||
struct index_bound {
|
||||
index_bound() = default;
|
||||
@@ -537,7 +533,7 @@ private:
|
||||
if (ex) {
|
||||
return make_exception_future<index_list>(std::move(ex));
|
||||
}
|
||||
return make_ready_future<index_list>(std::move(bound.consumer->indexes));
|
||||
return bound.consumer->finalize();
|
||||
});
|
||||
});
|
||||
};
|
||||
@@ -550,17 +546,18 @@ private:
|
||||
if (bound.current_list->empty()) {
|
||||
throw malformed_sstable_exception(format("missing index entry for summary index {} (bound {})", summary_idx, fmt::ptr(&bound)), _sstable->index_filename());
|
||||
}
|
||||
bound.data_file_position = bound.current_list->_entries[0]->position();
|
||||
bound.data_file_position = bound.current_list->_entries[0].position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
|
||||
if (sstlog.is_enabled(seastar::log_level::trace)) {
|
||||
sstlog.trace("index {} bound {}: page:", fmt::ptr(this), fmt::ptr(&bound));
|
||||
logalloc::reclaim_lock rl(_region);
|
||||
for (auto&& e : bound.current_list->_entries) {
|
||||
for (size_t i = 0; i < bound.current_list->_entries.size(); ++i) {
|
||||
auto& e = bound.current_list->_entries[i];
|
||||
auto dk = dht::decorate_key(*_sstable->_schema,
|
||||
e->get_key().to_partition_key(*_sstable->_schema));
|
||||
sstlog.trace(" {} -> {}", dk, e->position());
|
||||
bound.current_list->get_key(i).to_partition_key(*_sstable->_schema));
|
||||
sstlog.trace(" {} -> {}", dk, e.position());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -604,7 +601,13 @@ private:
|
||||
// Valid if partition_data_ready(bound)
|
||||
index_entry& current_partition_entry(index_bound& bound) {
|
||||
parse_assert(bool(bound.current_list), _sstable->index_filename());
|
||||
return *bound.current_list->_entries[bound.current_index_idx];
|
||||
return bound.current_list->_entries[bound.current_index_idx];
|
||||
}
|
||||
|
||||
// Valid if partition_data_ready(bound)
|
||||
partition_index_page& current_page(index_bound& bound) {
|
||||
parse_assert(bool(bound.current_list), _sstable->index_filename());
|
||||
return *bound.current_list;
|
||||
}
|
||||
|
||||
future<> advance_to_next_partition(index_bound& bound) {
|
||||
@@ -617,7 +620,7 @@ private:
|
||||
if (bound.current_index_idx + 1 < bound.current_list->size()) {
|
||||
++bound.current_index_idx;
|
||||
bound.current_pi_idx = 0;
|
||||
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx]->position();
|
||||
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx].position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
return reset_clustered_cursor(bound);
|
||||
@@ -680,9 +683,13 @@ private:
|
||||
return advance_to_page(bound, summary_idx).then([this, &bound, pos, summary_idx] {
|
||||
sstlog.trace("index {}: old page index = {}", fmt::ptr(this), bound.current_index_idx);
|
||||
auto i = _alloc_section(_region, [&] {
|
||||
auto& entries = bound.current_list->_entries;
|
||||
return std::lower_bound(std::begin(entries) + bound.current_index_idx, std::end(entries), pos,
|
||||
index_comparator(*_sstable->_schema));
|
||||
auto& page = *bound.current_list;
|
||||
auto& s = *_sstable->_schema;
|
||||
auto r = std::views::iota(bound.current_index_idx, page._entries.size());
|
||||
auto it = std::ranges::partition_point(r, [&] (int idx) {
|
||||
return index_entry_tri_cmp(s, page, idx, pos) < 0;
|
||||
});
|
||||
return page._entries.begin() + bound.current_index_idx + std::ranges::distance(r.begin(), it);
|
||||
});
|
||||
// i is valid until next allocation point
|
||||
auto& entries = bound.current_list->_entries;
|
||||
@@ -697,7 +704,7 @@ private:
|
||||
}
|
||||
bound.current_index_idx = std::distance(std::begin(entries), i);
|
||||
bound.current_pi_idx = 0;
|
||||
bound.data_file_position = (*i)->position();
|
||||
bound.data_file_position = (*i).position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
sstlog.trace("index {}: new page index = {}, pos={}", fmt::ptr(this), bound.current_index_idx, bound.data_file_position);
|
||||
@@ -800,6 +807,34 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
std::unique_ptr<clustered_index_cursor> make_cursor(const parsed_promoted_index_entry& pi,
|
||||
shared_sstable sst,
|
||||
reader_permit permit,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
file_input_stream_options options,
|
||||
use_caching caching)
|
||||
{
|
||||
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
|
||||
seastar::shared_ptr<cached_file> cached_file_ptr = caching
|
||||
? sst->_cached_index_file
|
||||
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
|
||||
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
|
||||
sst->manager().get_cache_tracker().get_lru(),
|
||||
sst->manager().get_cache_tracker().region(),
|
||||
sst->_index_file_size);
|
||||
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
|
||||
pi.promoted_index_start, pi.promoted_index_size,
|
||||
promoted_index_cache_metrics, permit,
|
||||
sst->get_column_translation(), cached_file_ptr, pi.num_blocks, trace_state, sst->features());
|
||||
}
|
||||
|
||||
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
|
||||
auto promoted_index_stream = make_file_input_stream(std::move(file), pi.promoted_index_start, pi.promoted_index_size,options);
|
||||
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
|
||||
std::move(promoted_index_stream), pi.promoted_index_size, pi.num_blocks, std::nullopt);
|
||||
}
|
||||
|
||||
// Ensures that partition_data_ready() returns true.
|
||||
// Can be called only when !eof()
|
||||
future<> read_partition_data() override {
|
||||
@@ -835,10 +870,10 @@ public:
|
||||
clustered_index_cursor* current_clustered_cursor(index_bound& bound) {
|
||||
if (!bound.clustered_cursor) {
|
||||
_alloc_section(_region, [&] {
|
||||
index_entry& e = current_partition_entry(bound);
|
||||
promoted_index* pi = e.get_promoted_index().get();
|
||||
if (pi) {
|
||||
bound.clustered_cursor = pi->make_cursor(_sstable, _permit, _trace_state,
|
||||
partition_index_page& page = current_page(bound);
|
||||
if (page.has_promoted_index(bound.current_index_idx)) {
|
||||
promoted_index& pi = page.get_promoted_index(bound.current_index_idx);
|
||||
bound.clustered_cursor = make_cursor(pi, _sstable, _permit, _trace_state,
|
||||
get_file_input_stream_options(), _use_caching);
|
||||
}
|
||||
});
|
||||
@@ -861,15 +896,15 @@ public:
|
||||
// It may be unavailable for old sstables for which this information was not generated.
|
||||
// Can be called only when partition_data_ready().
|
||||
std::optional<sstables::deletion_time> partition_tombstone() override {
|
||||
return current_partition_entry(_lower_bound).get_deletion_time();
|
||||
return current_page(_lower_bound).get_deletion_time(_lower_bound.current_index_idx);
|
||||
}
|
||||
|
||||
// Returns the key for current partition.
|
||||
// Can be called only when partition_data_ready().
|
||||
std::optional<partition_key> get_partition_key() override {
|
||||
return _alloc_section(_region, [this] {
|
||||
index_entry& e = current_partition_entry(_lower_bound);
|
||||
return e.get_key().to_partition_key(*_sstable->_schema);
|
||||
return current_page(_lower_bound).get_key(_lower_bound.current_index_idx)
|
||||
.to_partition_key(*_sstable->_schema);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -883,8 +918,8 @@ public:
|
||||
// Returns the number of promoted index entries for the current partition.
|
||||
// Can be called only when partition_data_ready().
|
||||
uint64_t get_promoted_index_size() {
|
||||
index_entry& e = current_partition_entry(_lower_bound);
|
||||
return e.get_promoted_index_size();
|
||||
partition_index_page& page = current_page(_lower_bound);
|
||||
return page.get_promoted_index_size(_lower_bound.current_index_idx);
|
||||
}
|
||||
|
||||
bool partition_data_ready() const override {
|
||||
@@ -975,9 +1010,9 @@ public:
|
||||
return make_ready_future<bool>(false);
|
||||
}
|
||||
return read_partition_data().then([this, key] {
|
||||
index_comparator cmp(*_sstable->_schema);
|
||||
bool found = _alloc_section(_region, [&] {
|
||||
return cmp(key, current_partition_entry(_lower_bound)) == 0;
|
||||
auto& page = current_page(_lower_bound);
|
||||
return index_entry_tri_cmp(*_sstable->_schema, page, _lower_bound.current_index_idx, key) == 0;
|
||||
});
|
||||
return make_ready_future<bool>(found);
|
||||
});
|
||||
|
||||
@@ -189,10 +189,11 @@ public:
|
||||
{}
|
||||
future<std::optional<directory_entry>> get() override {
|
||||
std::filesystem::path dir(_prefix);
|
||||
do {
|
||||
while (true) {
|
||||
if (_pos == _info.size()) {
|
||||
_info.clear();
|
||||
_info = co_await _client->list_objects(_bucket, _prefix, _paging);
|
||||
_pos = 0;
|
||||
}
|
||||
if (_info.empty()) {
|
||||
break;
|
||||
@@ -203,7 +204,7 @@ public:
|
||||
continue;
|
||||
}
|
||||
co_return ent;
|
||||
} while (false);
|
||||
}
|
||||
|
||||
co_return std::nullopt;
|
||||
}
|
||||
@@ -276,7 +277,7 @@ public:
|
||||
co_await f.close();
|
||||
|
||||
auto names = ranges | std::views::transform([](auto& p) { return p.name; }) | std::ranges::to<std::vector<std::string>>();
|
||||
co_await _client->merge_objects(bucket, object, std::move(names), {}, as);
|
||||
co_await _client->merge_objects(bucket, object, names, {}, as);
|
||||
|
||||
co_await parallel_for_each(names, [this, bucket](auto& name) -> future<> {
|
||||
co_await _client->delete_object(bucket, name);
|
||||
|
||||
@@ -257,14 +257,11 @@ public:
|
||||
while (partial_page || i != _cache.end()) {
|
||||
if (partial_page) {
|
||||
auto preempted = with_allocator(_region.allocator(), [&] {
|
||||
while (!partial_page->empty()) {
|
||||
partial_page->clear_one_entry();
|
||||
if (need_preempt()) {
|
||||
return true;
|
||||
}
|
||||
while (partial_page->clear_gently() != stop_iteration::yes) {
|
||||
return true;
|
||||
}
|
||||
partial_page.reset();
|
||||
return false;
|
||||
return need_preempt();
|
||||
});
|
||||
if (preempted) {
|
||||
auto key = (i != _cache.end()) ? std::optional(i->key()) : std::nullopt;
|
||||
|
||||
@@ -1094,7 +1094,6 @@ public:
|
||||
|
||||
friend class mc::writer;
|
||||
friend class index_reader;
|
||||
friend class promoted_index;
|
||||
friend class sstables_manager;
|
||||
template <typename DataConsumeRowsContext>
|
||||
friend future<std::unique_ptr<DataConsumeRowsContext>>
|
||||
|
||||
@@ -436,7 +436,10 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
|
||||
stream_options.buffer_size = file_stream_buffer_size;
|
||||
stream_options.read_ahead = file_stream_read_ahead;
|
||||
|
||||
for (auto& info : sources) {
|
||||
for (auto&& source_info : sources) {
|
||||
// Keep stream_blob_info alive only at duration of streaming. Allowing the file descriptor
|
||||
// of the sstable component to be released right after it has been streamed.
|
||||
auto info = std::exchange(source_info, {});
|
||||
auto& filename = info.filename;
|
||||
std::optional<input_stream<char>> fstream;
|
||||
bool fstream_closed = false;
|
||||
@@ -617,6 +620,7 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
|
||||
ops_id, filename, targets, total_size, get_bw(total_size, start_time));
|
||||
}
|
||||
}
|
||||
co_await utils::get_local_injector().inject("tablet_stream_files_end_wait", utils::wait_for_message(std::chrono::seconds(60)));
|
||||
if (error) {
|
||||
blogger.warn("fstream[{}] Master failed sending files_nr={} files={} targets={} send_size={} bw={} error={}",
|
||||
ops_id, sources.size(), sources, targets, ops_total_size, get_bw(ops_total_size, ops_start_time), error);
|
||||
@@ -632,7 +636,9 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
|
||||
future<stream_files_response> tablet_stream_files_handler(replica::database& db, netw::messaging_service& ms, streaming::stream_files_request req) {
|
||||
stream_files_response resp;
|
||||
auto& table = db.find_column_family(req.table);
|
||||
auto table_stream_op = table.stream_in_progress();
|
||||
auto sstables = co_await table.take_storage_snapshot(req.range);
|
||||
co_await utils::get_local_injector().inject("wait_before_tablet_stream_files_after_snapshot", utils::wait_for_message(std::chrono::seconds(60)));
|
||||
co_await utils::get_local_injector().inject("order_sstables_for_streaming", [&sstables] (auto& handler) -> future<> {
|
||||
if (sstables.size() == 3) {
|
||||
// make sure the sstables are ordered so that the sstable containing shadowed data is streamed last
|
||||
@@ -680,15 +686,22 @@ future<stream_files_response> tablet_stream_files_handler(replica::database& db,
|
||||
if (files.empty()) {
|
||||
co_return resp;
|
||||
}
|
||||
auto sstable_nr = sstables.size();
|
||||
// Release reference to sstables to be streamed here. Since one sstable is streamed at a time,
|
||||
// a sstable - that has been compacted - can have its space released from disk right after
|
||||
// that sstable's content has been fully streamed.
|
||||
sstables.clear();
|
||||
// Release the table - we don't need to access it anymore and the files are held by the snapshot.
|
||||
table_stream_op = {};
|
||||
blogger.debug("stream_sstables[{}] Started sending sstable_nr={} files_nr={} files={} range={}",
|
||||
req.ops_id, sstables.size(), files.size(), files, req.range);
|
||||
req.ops_id, sstable_nr, files.size(), files, req.range);
|
||||
auto ops_start_time = std::chrono::steady_clock::now();
|
||||
auto files_nr = files.size();
|
||||
size_t stream_bytes = co_await tablet_stream_files(ms, std::move(files), req.targets, req.table, req.ops_id, req.topo_guard);
|
||||
resp.stream_bytes = stream_bytes;
|
||||
auto duration = std::chrono::steady_clock::now() - ops_start_time;
|
||||
blogger.info("stream_sstables[{}] Finished sending sstable_nr={} files_nr={} range={} stream_bytes={} stream_time={} stream_bw={}",
|
||||
req.ops_id, sstables.size(), files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
|
||||
req.ops_id, sstable_nr, files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
|
||||
co_return resp;
|
||||
}
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ future<bool> table_helper::try_prepare(bool fallback, cql3::query_processor& qp,
|
||||
auto& stmt = fallback ? _insert_cql_fallback.value() : _insert_cql;
|
||||
try {
|
||||
shared_ptr<cql_transport::messages::result_message::prepared> msg_ptr = co_await qp.prepare(stmt, qs.get_client_state(), dialect);
|
||||
_prepared_stmt = std::move(msg_ptr->get_prepared());
|
||||
_prepared_stmt = msg_ptr->get_prepared();
|
||||
shared_ptr<cql3::cql_statement> cql_stmt = _prepared_stmt->statement;
|
||||
_insert_stmt = dynamic_pointer_cast<cql3::statements::modification_statement>(cql_stmt);
|
||||
_is_fallback_stmt = fallback;
|
||||
|
||||
@@ -400,7 +400,7 @@ task_manager::virtual_task::impl::impl(module_ptr module) noexcept
|
||||
: _module(std::move(module))
|
||||
{}
|
||||
|
||||
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive) {
|
||||
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr) {
|
||||
auto ms = module->get_task_manager()._messaging;
|
||||
if (!ms) {
|
||||
auto ids = co_await module->get_task_manager().get_virtual_task_children(parent_id);
|
||||
@@ -417,19 +417,18 @@ future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::g
|
||||
tmlogger.info("tasks_vt_get_children: waiting");
|
||||
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{10});
|
||||
});
|
||||
co_return co_await map_reduce(nodes, [ms, parent_id, is_host_alive = std::move(is_host_alive)] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
|
||||
if (is_host_alive(host_id)) {
|
||||
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
|
||||
return resp | std::views::transform([host_id] (auto id) {
|
||||
return task_identity{
|
||||
.host_id = host_id,
|
||||
.task_id = id
|
||||
};
|
||||
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
|
||||
});
|
||||
} else {
|
||||
return make_ready_future<utils::chunked_vector<task_identity>>();
|
||||
}
|
||||
co_return co_await map_reduce(nodes, [ms, parent_id] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
|
||||
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
|
||||
return resp | std::views::transform([host_id] (auto id) {
|
||||
return task_identity{
|
||||
.host_id = host_id,
|
||||
.task_id = id
|
||||
};
|
||||
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
|
||||
}).handle_exception_type([host_id, parent_id] (const rpc::closed_error& ex) {
|
||||
tmlogger.warn("Failed to get children of virtual task with id={} from node {}: {}", parent_id, host_id, ex);
|
||||
return utils::chunked_vector<task_identity>{};
|
||||
});
|
||||
}, utils::chunked_vector<task_identity>{}, [] (auto a, auto&& b) {
|
||||
std::move(b.begin(), b.end(), std::back_inserter(a));
|
||||
return a;
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "db_clock.hh"
|
||||
#include "utils/log.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "locator/token_metadata_fwd.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "tasks/types.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
@@ -282,7 +283,7 @@ public:
|
||||
impl& operator=(impl&&) = delete;
|
||||
virtual ~impl() = default;
|
||||
protected:
|
||||
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive);
|
||||
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr);
|
||||
public:
|
||||
virtual task_group get_group() const noexcept = 0;
|
||||
// Returns std::nullopt if an operation with task_id isn't tracked by this virtual_task.
|
||||
|
||||
@@ -423,14 +423,17 @@ def test_streams_operations(test_table_s, dynamodbstreams, metrics):
|
||||
# to update latencies for one kind of operation (#17616, and compare #9406),
|
||||
# and to do that checking that ..._count increases for that op is enough.
|
||||
@contextmanager
|
||||
def check_sets_latency(metrics, operation_names):
|
||||
def check_sets_latency_by_metric(metrics, operation_names, metric_name):
|
||||
the_metrics = get_metrics(metrics)
|
||||
saved_latency_count = { x: get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': x}, the_metrics) for x in operation_names }
|
||||
saved_latency_count = { x: get_metric(metrics, f'{metric_name}_count', {'op': x}, the_metrics) for x in operation_names }
|
||||
yield
|
||||
the_metrics = get_metrics(metrics)
|
||||
for op in operation_names:
|
||||
# The total "count" on all shards should strictly increase
|
||||
assert saved_latency_count[op] < get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': op}, the_metrics)
|
||||
assert saved_latency_count[op] < get_metric(metrics, f'{metric_name}_count', {'op': op}, the_metrics)
|
||||
|
||||
def check_sets_latency(metrics, operation_names):
|
||||
return check_sets_latency_by_metric(metrics, operation_names, 'scylla_alternator_op_latency')
|
||||
|
||||
# Test latency metrics for PutItem, GetItem, DeleteItem, UpdateItem.
|
||||
# We can't check what exactly the latency is - just that it gets updated.
|
||||
@@ -446,6 +449,18 @@ def test_item_latency(test_table_s, metrics):
|
||||
test_table_s.meta.client.batch_get_item(RequestItems = {
|
||||
test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
|
||||
|
||||
def test_item_latency_per_table(test_table_s, metrics):
|
||||
with check_sets_latency_by_metric(metrics, ['DeleteItem', 'GetItem', 'PutItem', 'UpdateItem', 'BatchWriteItem', 'BatchGetItem'], 'scylla_alternator_table_op_latency'):
|
||||
p = random_string()
|
||||
test_table_s.put_item(Item={'p': p})
|
||||
test_table_s.get_item(Key={'p': p})
|
||||
test_table_s.delete_item(Key={'p': p})
|
||||
test_table_s.update_item(Key={'p': p})
|
||||
test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
test_table_s.name: [{'PutRequest': {'Item': {'p': random_string(), 'a': 'hi'}}}]})
|
||||
test_table_s.meta.client.batch_get_item(RequestItems = {
|
||||
test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
|
||||
|
||||
# Test latency metrics for GetRecords. Other Streams-related operations -
|
||||
# ListStreams, DescribeStream, and GetShardIterator, have an operation
|
||||
# count (tested above) but do NOT currently have a latency histogram.
|
||||
|
||||
@@ -378,6 +378,7 @@ add_scylla_test(combined_tests
|
||||
tracing_test.cc
|
||||
user_function_test.cc
|
||||
user_types_test.cc
|
||||
vector_index_test.cc
|
||||
view_build_test.cc
|
||||
view_complex_test.cc
|
||||
view_schema_ckey_test.cc
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/core/manual_clock.hh>
|
||||
#include <seastar/util/later.hh>
|
||||
#include <seastar/core/timer.hh>
|
||||
#include <seastar/util/defer.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include <seastar/util/alloc_failure_injector.hh>
|
||||
@@ -290,12 +290,17 @@ SEASTAR_THREAD_TEST_CASE(test_address_map_replication) {
|
||||
m.set_expiring(id1);
|
||||
BOOST_CHECK(m.find(id1) && *m.find(id1) == addr1);
|
||||
m.barrier().get();
|
||||
promise<> shard0_timer_expired;
|
||||
timer<manual_clock> shard0_timer([&shard0_timer_expired] {
|
||||
shard0_timer_expired.set_value();
|
||||
});
|
||||
shard0_timer.arm(manual_clock::now() + expiration_time);
|
||||
m_svc.invoke_on(1, [] (address_map_t<manual_clock>& m) {
|
||||
BOOST_CHECK(m.find(id1) && *m.find(id1) == addr1);
|
||||
manual_clock::advance(expiration_time);
|
||||
BOOST_CHECK(!m.find(id1));
|
||||
return smp::submit_to(0, []{}); // Ensure shard 0 notices timer is expired.
|
||||
}).get();
|
||||
shard0_timer_expired.get_future().get();
|
||||
BOOST_CHECK(!m.find(id1));
|
||||
|
||||
// Expiring entries are replicated
|
||||
|
||||
@@ -62,7 +62,11 @@ SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) {
|
||||
// cfg.db_config->index_cache_fraction.set(1.0);
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
// We disable compactions because they cause confusing cache mispopulations.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
|
||||
// We disable compression because the sstable writer targets a specific
|
||||
// (*compressed* data file size : summary file size) ratio,
|
||||
// so the number of keys per index page becomes hard to control,
|
||||
// and might be arbitrarily large.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
|
||||
auto insert_query = e.prepare("INSERT INTO ks.t(pk) VALUES (?)").get();
|
||||
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ?").get();
|
||||
|
||||
@@ -154,7 +158,11 @@ SEASTAR_TEST_CASE(test_index_is_cached_in_big_partition_workload) {
|
||||
// cfg.db_config->index_cache_fraction.set(0.0);
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
// We disable compactions because they cause confusing cache mispopulations.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
|
||||
// We disable compression because the sstable writer targets a specific
|
||||
// (*compressed* data file size : summary file size) ratio,
|
||||
// so the number of keys per index page becomes hard to control,
|
||||
// and might be arbitrarily large.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
|
||||
auto insert_query = e.prepare("INSERT INTO ks.t(pk, ck, v) VALUES (?, ?, ?)").get();
|
||||
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ? AND ck = ?").get();
|
||||
|
||||
|
||||
@@ -1111,6 +1111,30 @@ SEASTAR_TEST_CASE(test_snapshot_ctl_true_snapshots_size) {
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_snapshot_ctl_details_exception_handling) {
|
||||
#ifndef SCYLLA_ENABLE_ERROR_INJECTION
|
||||
testlog.debug("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev).\n");
|
||||
return make_ready_future();
|
||||
#endif
|
||||
return do_with_some_data_in_thread({"cf"}, [] (cql_test_env& e) {
|
||||
sharded<db::snapshot_ctl> sc;
|
||||
sc.start(std::ref(e.db()), std::ref(e.get_task_manager()), std::ref(e.get_sstorage_manager()), db::snapshot_ctl::config{}).get();
|
||||
auto stop_sc = deferred_stop(sc);
|
||||
|
||||
auto& cf = e.local_db().find_column_family("ks", "cf");
|
||||
take_snapshot(e).get();
|
||||
|
||||
utils::get_local_injector().enable("get_snapshot_details", true);
|
||||
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
|
||||
|
||||
utils::get_local_injector().enable("per-snapshot-get_snapshot_details", true);
|
||||
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
|
||||
|
||||
auto details = cf.get_snapshot_details().get();
|
||||
BOOST_REQUIRE_EQUAL(details.size(), 1);
|
||||
});
|
||||
}
|
||||
|
||||
// toppartitions_query caused a lw_shared_ptr to cross shards when moving results, #5104
|
||||
SEASTAR_TEST_CASE(toppartitions_cross_shard_schema_ptr) {
|
||||
return do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
|
||||
@@ -1857,7 +1881,7 @@ SEASTAR_THREAD_TEST_CASE(test_tombstone_gc_state_snapshot) {
|
||||
|
||||
schema_builder::register_schema_initializer([] (schema_builder& builder) {
|
||||
if (builder.ks_name() == "test" && builder.cf_name() == "table_gc_mode_group0") {
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
auto table_gc_mode_group0 = schema_builder("test", "table_gc_mode_group0")
|
||||
|
||||
@@ -23,8 +23,11 @@
|
||||
#include "test/lib/tmpdir.hh"
|
||||
#include "test/lib/random_utils.hh"
|
||||
#include "test/lib/exception_utils.hh"
|
||||
#include "utils/limiting_data_source.hh"
|
||||
#include "utils/io-wrappers.hh"
|
||||
|
||||
#include <seastar/util/memory-data-source.hh>
|
||||
|
||||
using namespace encryption;
|
||||
|
||||
static tmpdir dir;
|
||||
@@ -595,6 +598,113 @@ SEASTAR_TEST_CASE(test_encrypted_data_source_simple) {
|
||||
co_await test_random_data_source(sizes);
|
||||
}
|
||||
|
||||
// Reproduces the production deadlock where encrypted SSTable component downloads
|
||||
// got stuck during restore. The encrypted_data_source::get() caches a block in
|
||||
// _next, then on the next call bypasses input_stream::read()'s _eof check and
|
||||
// calls input_stream::read_exactly() — which does NOT check _eof when _buf is
|
||||
// empty. This causes a second get() on the underlying source after EOS.
|
||||
//
|
||||
// In production the underlying source was chunked_download_source whose get()
|
||||
// hung forever. Here we simulate it with a strict source that fails the test.
|
||||
//
|
||||
// The fix belongs in seastar's input_stream::read_exactly(): check _eof before
|
||||
// calling _fd.get(), consistent with read(), read_up_to(), and consume().
|
||||
static future<> test_encrypted_source_copy(size_t plaintext_size) {
|
||||
testlog.info("test_encrypted_source_copy: plaintext_size={}", plaintext_size);
|
||||
|
||||
key_info info{"AES/CBC", 256};
|
||||
auto k = ::make_shared<symmetric_key>(info);
|
||||
|
||||
// Step 1: Encrypt the plaintext into memory buffers
|
||||
auto plaintext = generate_random<char>(plaintext_size);
|
||||
std::vector<temporary_buffer<char>> encrypted_bufs;
|
||||
{
|
||||
data_sink sink(make_encrypted_sink(create_memory_sink(encrypted_bufs), k));
|
||||
co_await sink.put(plaintext.clone());
|
||||
co_await sink.close();
|
||||
}
|
||||
|
||||
// Flatten encrypted buffers into a single contiguous buffer
|
||||
size_t encrypted_total = 0;
|
||||
for (const auto& b : encrypted_bufs) {
|
||||
encrypted_total += b.size();
|
||||
}
|
||||
temporary_buffer<char> encrypted(encrypted_total);
|
||||
size_t pos = 0;
|
||||
for (const auto& b : encrypted_bufs) {
|
||||
std::copy(b.begin(), b.end(), encrypted.get_write() + pos);
|
||||
pos += b.size();
|
||||
}
|
||||
|
||||
// Step 2: Create a data source from the encrypted data that fails on
|
||||
// post-EOS get() — simulating a source like chunked_download_source
|
||||
// that would hang forever in this situation.
|
||||
class strict_memory_source final : public limiting_data_source_impl {
|
||||
bool _eof = false;
|
||||
public:
|
||||
strict_memory_source(temporary_buffer<char> data, size_t chunk_size)
|
||||
: limiting_data_source_impl(
|
||||
data_source(std::make_unique<util::temporary_buffer_data_source>(std::move(data))),
|
||||
[chunk_size] { return chunk_size; }) {}
|
||||
|
||||
future<temporary_buffer<char>> get() override {
|
||||
BOOST_REQUIRE_MESSAGE(!_eof,
|
||||
"get() called on source after it already returned EOS — "
|
||||
"this is the production deadlock: read_exactly() does not "
|
||||
"check _eof before calling _fd.get()");
|
||||
auto buf = co_await limiting_data_source_impl::get();
|
||||
_eof = buf.empty();
|
||||
co_return buf;
|
||||
}
|
||||
};
|
||||
|
||||
// Step 3: Wrap in encrypted_data_source and drain via consume() —
|
||||
// the exact code path used by seastar::copy() which is what
|
||||
// sstables_loader_helpers::download_sstable() calls.
|
||||
// Try multiple chunk sizes to hit different alignment scenarios.
|
||||
for (size_t chunk_size : {1ul, 7ul, 4096ul, 8192ul, encrypted_total, encrypted_total + 1}) {
|
||||
if (chunk_size == 0) continue;
|
||||
auto src = data_source(make_encrypted_source(
|
||||
data_source(std::make_unique<strict_memory_source>(encrypted.clone(), chunk_size)), k));
|
||||
auto in = input_stream<char>(std::move(src));
|
||||
|
||||
// consume() is what seastar::copy() uses internally. It calls
|
||||
// encrypted_data_source::get() via _fd.get() until EOF.
|
||||
size_t total_decrypted = 0;
|
||||
co_await in.consume([&total_decrypted](temporary_buffer<char> buf) {
|
||||
total_decrypted += buf.size();
|
||||
return make_ready_future<consumption_result<char>>(continue_consuming{});
|
||||
});
|
||||
co_await in.close();
|
||||
|
||||
BOOST_REQUIRE_EQUAL(total_decrypted, plaintext_size);
|
||||
}
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_8k) {
|
||||
co_await test_encrypted_source_copy(8192);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_4k) {
|
||||
co_await test_encrypted_source_copy(4096);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_small) {
|
||||
co_await test_encrypted_source_copy(100);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_12k) {
|
||||
co_await test_encrypted_source_copy(12288);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_unaligned) {
|
||||
co_await test_encrypted_source_copy(8193);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_source_copy_1byte) {
|
||||
co_await test_encrypted_source_copy(1);
|
||||
}
|
||||
|
||||
|
||||
SEASTAR_TEST_CASE(test_encrypted_data_source_fuzzy) {
|
||||
std::mt19937_64 rand_gen(std::random_device{}());
|
||||
|
||||
@@ -252,7 +252,7 @@ SEASTAR_TEST_CASE(test_group0_batch) {
|
||||
// (group0 mutations are not allowed on non-group0 tables)
|
||||
schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if (builder.cf_name() == "test_group0_batch") {
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
|
||||
@@ -345,4 +345,29 @@ SEASTAR_TEST_CASE(test_group0_batch) {
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_group0_tables_use_schema_commitlog) {
|
||||
return do_with_cql_env([] (cql_test_env& e) {
|
||||
schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if (builder.cf_name() == "test_group0_tables_use_schema_commitlog1") {
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
|
||||
auto test_group0_tables_use_schema_commitlog1 = schema_builder("test", "test_group0_tables_use_schema_commitlog1")
|
||||
.with_column("pk", utf8_type, column_kind::partition_key)
|
||||
.build();
|
||||
|
||||
auto test_group0_tables_use_schema_commitlog2 = schema_builder("test", "test_group0_tables_use_schema_commitlog2")
|
||||
.with_column("pk", utf8_type, column_kind::partition_key)
|
||||
.build();
|
||||
|
||||
BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().is_group0_table);
|
||||
BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().use_schema_commitlog);
|
||||
BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().is_group0_table);
|
||||
BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().use_schema_commitlog);
|
||||
|
||||
return make_ready_future();
|
||||
});
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
||||
@@ -1004,7 +1004,20 @@ SEASTAR_TEST_CASE(memtable_flush_compresses_mutations) {
|
||||
}, db_config);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(memtable_flush_period) {
|
||||
static auto check_has_error_injection() {
|
||||
return boost::unit_test::precondition([](auto){
|
||||
return
|
||||
#ifdef SCYLLA_ENABLE_ERROR_INJECTION
|
||||
true
|
||||
#else
|
||||
false
|
||||
#endif
|
||||
;
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(memtable_flush_period, *check_has_error_injection()) {
|
||||
#ifdef SCYLLA_ENABLE_ERROR_INJECTION
|
||||
auto db_config = make_shared<db::config>();
|
||||
db_config->enable_cache.set(false);
|
||||
return do_with_cql_env_thread([](cql_test_env& env) {
|
||||
@@ -1028,6 +1041,9 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
|
||||
t.apply(m);
|
||||
BOOST_REQUIRE_EQUAL(t.sstables_count(), 0); // add mutation and check there are no sstables for this table
|
||||
|
||||
auto& errj = utils::get_local_injector();
|
||||
errj.enable("table_seal_post_flush_waiters", true);
|
||||
|
||||
// change schema to set memtable flush period
|
||||
// we use small value in this test but it is impossible to set the period less than 60000ms using ALTER TABLE construction
|
||||
schema_builder b(t.schema());
|
||||
@@ -1035,8 +1051,10 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
|
||||
schema_ptr s2 = b.build();
|
||||
t.set_schema(s2);
|
||||
|
||||
sleep(500ms).get(); // wait until memtable flush starts at least once
|
||||
BOOST_REQUIRE(t.sstables_count() == 1 || t.get_stats().pending_flushes > 0); // flush started
|
||||
BOOST_TEST_MESSAGE("Wait for flush");
|
||||
errj.inject("table_seal_post_flush_waiters", utils::wait_for_message(std::chrono::minutes(2))).get();
|
||||
BOOST_TEST_MESSAGE("Flush received");
|
||||
|
||||
BOOST_REQUIRE(eventually_true([&] { // wait until memtable will be flushed at least once
|
||||
return t.sstables_count() == 1;
|
||||
}));
|
||||
@@ -1047,6 +1065,10 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
|
||||
.produces(m)
|
||||
.produces_end_of_stream();
|
||||
}, db_config);
|
||||
#else
|
||||
BOOST_TEST_MESSAGE("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev)");
|
||||
return make_ready_future<>();
|
||||
#endif
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(sstable_compaction_does_not_resurrect_data) {
|
||||
|
||||
@@ -1990,6 +1990,116 @@ SEASTAR_TEST_CASE(test_reverse_cursor_refreshing_on_nonevictable_snapshot_with_e
|
||||
});
|
||||
}
|
||||
|
||||
// Reproducer for SCYLLADB-1253 (https://github.com/scylladb/scylladb/issues/18732)
|
||||
// A reversed query with many overlapping range tombstones and a single live row
|
||||
// near the end of the tombstone range fails to return the live row.
|
||||
//
|
||||
// The bug is in partition_snapshot_row_cursor::maybe_refresh(), in the
|
||||
// !is_in_latest_version() path. When the cursor is reversed and positioned
|
||||
// above all entries in the latest version (in table order), it incorrectly
|
||||
// removes the latest version's entry from the heap, causing the live row
|
||||
// to be skipped.
|
||||
//
|
||||
// This test creates a multi-version partition snapshot:
|
||||
// - v0 (older): contains overlapping range tombstones
|
||||
// - v1 (latest): contains the live row with a higher timestamp
|
||||
// and directly exercises the cursor to verify that advance+maybe_refresh
|
||||
// correctly keeps the live row in the heap during reversed traversal.
|
||||
SEASTAR_TEST_CASE(test_reversed_maybe_refresh_keeps_latest_version_entry) {
|
||||
return seastar::async([] {
|
||||
logalloc::region region;
|
||||
mutation_application_stats app_stats;
|
||||
mutation_cleaner cleaner(region, no_cache_tracker, app_stats);
|
||||
|
||||
simple_schema ss(simple_schema::with_static::no);
|
||||
auto s = ss.schema();
|
||||
auto rev_s = s->make_reversed();
|
||||
|
||||
const int num_tombstones = 100;
|
||||
const int range_span = num_tombstones; // each range covers [i, i + range_span)
|
||||
const int row_ck = 5; // below all range tombstone boundary entries
|
||||
|
||||
// Step 1: Create a partition_entry with all range tombstones.
|
||||
auto pe_ptr = with_allocator(region.allocator(), [&] {
|
||||
logalloc::allocating_section as;
|
||||
return as(region, [&] () -> std::unique_ptr<partition_entry> {
|
||||
mutation m(s, ss.make_pkey(0));
|
||||
for (int i = 0; i < num_tombstones; ++i) {
|
||||
auto range = query::clustering_range::make(
|
||||
query::clustering_range::bound(ss.make_ckey(i), true),
|
||||
query::clustering_range::bound(ss.make_ckey(i + range_span), false));
|
||||
ss.delete_range(m, range);
|
||||
}
|
||||
return std::make_unique<partition_entry>(*s, std::move(m.partition()));
|
||||
});
|
||||
});
|
||||
|
||||
// Step 2: Take a snapshot to pin the current version (v0 with tombstones).
|
||||
auto snap1 = with_allocator(region.allocator(), [&] {
|
||||
logalloc::allocating_section as;
|
||||
return as(region, [&] {
|
||||
return pe_ptr->read(region, cleaner, no_cache_tracker);
|
||||
});
|
||||
});
|
||||
|
||||
// Step 3: Apply the live row. Since v0 is pinned by snap1,
|
||||
// this creates v1 (latest version) with just the live row.
|
||||
with_allocator(region.allocator(), [&] {
|
||||
logalloc::allocating_section as;
|
||||
as(region, [&] {
|
||||
mutation m(s, ss.make_pkey(0));
|
||||
ss.add_row(m, ss.make_ckey(row_ck), "live_value");
|
||||
pe_ptr->apply(region, cleaner, *s, m.partition(), *m.schema(), app_stats);
|
||||
});
|
||||
});
|
||||
|
||||
// Step 4: Take a second snapshot (sees both versions) and test cursor.
|
||||
auto snap2 = with_allocator(region.allocator(), [&] {
|
||||
logalloc::allocating_section as;
|
||||
return as(region, [&] {
|
||||
return pe_ptr->read(region, cleaner, no_cache_tracker);
|
||||
});
|
||||
});
|
||||
|
||||
{
|
||||
logalloc::reclaim_lock rl(region);
|
||||
|
||||
partition_snapshot_row_cursor cursor(*rev_s, *snap2, false /* unique_owner */, true /* reversed */);
|
||||
|
||||
// Position cursor at the very end (in reversed/query order, this means
|
||||
// the highest table-order position).
|
||||
cursor.maybe_advance_to(position_in_partition_view::before_all_clustered_rows());
|
||||
bool has_row = cursor.at_a_row();
|
||||
|
||||
// Traverse all entries in reversed order, calling maybe_refresh()
|
||||
// before processing each row. This simulates what
|
||||
// partition_snapshot_reader::next_interval() does and is where
|
||||
// the bug manifests.
|
||||
bool found_live_row = false;
|
||||
while (has_row) {
|
||||
cursor.maybe_refresh();
|
||||
if (!cursor.dummy()) {
|
||||
found_live_row = true;
|
||||
break;
|
||||
}
|
||||
has_row = cursor.next();
|
||||
}
|
||||
|
||||
BOOST_REQUIRE_MESSAGE(found_live_row,
|
||||
fmt::format("Reversed cursor failed to find the live row at ck={}. "
|
||||
"The !is_in_latest_version() path in maybe_refresh() "
|
||||
"incorrectly removed the latest version's entry from the heap.",
|
||||
row_ck));
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
snap2 = {};
|
||||
snap1 = {};
|
||||
with_allocator(region.allocator(), [&] {
|
||||
pe_ptr.reset();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_apply_to_incomplete_with_dummies) {
|
||||
return seastar::async([] {
|
||||
|
||||
@@ -1499,7 +1499,7 @@ SEASTAR_THREAD_TEST_CASE(tablets_simple_rack_aware_view_pairing_test) {
|
||||
base_host,
|
||||
base_erm,
|
||||
view_erm,
|
||||
*ars_ptr,
|
||||
true, // uses NTS
|
||||
base_token,
|
||||
view_token,
|
||||
use_tablets,
|
||||
|
||||
@@ -719,7 +719,7 @@ SEASTAR_THREAD_TEST_CASE(test_dht_subtract_ranges) {
|
||||
|
||||
auto get_random_ranges = [&] (size_t max_count) {
|
||||
auto count = tests::random::get_int<size_t>(1, max_count);
|
||||
dht::partition_range_vector ranges;
|
||||
utils::chunked_vector<dht::partition_range> ranges;
|
||||
ranges.reserve(count);
|
||||
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
|
||||
@@ -2644,7 +2644,10 @@ SEASTAR_TEST_CASE(test_exception_safety_of_update_from_memtable) {
|
||||
return rd;
|
||||
};
|
||||
|
||||
populate_range(cache, population_range);
|
||||
{
|
||||
memory::scoped_critical_alloc_section dfg;
|
||||
populate_range(cache, population_range);
|
||||
}
|
||||
auto rd1_v1 = assert_that(make_reader(population_range));
|
||||
mutation_reader_opt snap;
|
||||
auto close_snap = defer([&snap] {
|
||||
|
||||
@@ -20,16 +20,24 @@ static void add_entry(logalloc::region& r,
|
||||
const schema& s,
|
||||
partition_index_page& page,
|
||||
const partition_key& key,
|
||||
uint64_t position)
|
||||
uint64_t position,
|
||||
std::optional<parsed_promoted_index_entry> promoted_index = std::nullopt)
|
||||
{
|
||||
logalloc::allocating_section as;
|
||||
as(r, [&] {
|
||||
with_allocator(r.allocator(), [&] {
|
||||
sstables::key sst_key = sstables::key::from_partition_key(s, key);
|
||||
page._entries.push_back(make_managed<index_entry>(
|
||||
managed_bytes(sst_key.get_bytes()),
|
||||
position,
|
||||
managed_ref<promoted_index>()));
|
||||
auto key_offset = page._key_storage.size();
|
||||
auto old_storage = std::move(page._key_storage);
|
||||
page._key_storage = managed_bytes(managed_bytes::initialized_later(), key_offset + sst_key.get_bytes().size());
|
||||
auto out = managed_bytes_mutable_view(page._key_storage);
|
||||
write_fragmented(out, managed_bytes_view(old_storage));
|
||||
write_fragmented(out, single_fragmented_view(bytes_view(sst_key)));
|
||||
page._entries.push_back(index_entry{dht::raw_token_opt()->value, position, key_offset});
|
||||
if (promoted_index) {
|
||||
page._promoted_indexes.resize(page._entries.size());
|
||||
page._promoted_indexes[page._entries.size() - 1] = *promoted_index;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -54,10 +62,10 @@ static partition_index_page make_page0(logalloc::region& r, simple_schema& s) {
|
||||
static void has_page0(partition_index_cache::entry_ptr ptr) {
|
||||
BOOST_REQUIRE(!ptr->empty());
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries.size(), 4);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[0]->position(), 0);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[1]->position(), 1);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[2]->position(), 2);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[3]->position(), 3);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[0].position(), 0);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[1].position(), 1);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[2].position(), 2);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[3].position(), 3);
|
||||
};
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_caching) {
|
||||
@@ -139,6 +147,59 @@ SEASTAR_THREAD_TEST_CASE(test_caching) {
|
||||
}
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_sparse_promoted_index) {
|
||||
::lru lru;
|
||||
simple_schema s;
|
||||
logalloc::region r;
|
||||
partition_index_cache_stats stats;
|
||||
partition_index_cache cache(lru, r, stats);
|
||||
|
||||
auto page0_loader = [&] (partition_index_cache::key_type k) -> future<partition_index_page> {
|
||||
partition_index_page page;
|
||||
auto destroy_page = defer([&] {
|
||||
with_allocator(r.allocator(), [&] {
|
||||
auto p = std::move(page);
|
||||
});
|
||||
});
|
||||
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(0).key(), 0);
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(1).key(), 1, parsed_promoted_index_entry{
|
||||
.promoted_index_start = 1,
|
||||
.promoted_index_size = 10,
|
||||
.num_blocks = 3
|
||||
});
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(2).key(), 2);
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(3).key(), 3, parsed_promoted_index_entry{
|
||||
.promoted_index_start = 2,
|
||||
.promoted_index_size = 13,
|
||||
.num_blocks = 1
|
||||
});
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(4).key(), 4);
|
||||
destroy_page.cancel();
|
||||
co_return std::move(page);
|
||||
};
|
||||
|
||||
auto page = cache.get_or_load(0, page0_loader).get();
|
||||
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(0), false);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(1), true);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(2), false);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(3), true);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(4), false);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_start, 1);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_size, 10);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).num_blocks, 3);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_start, 2);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_size, 13);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).num_blocks, 1);
|
||||
|
||||
with_allocator(r.allocator(), [&] {
|
||||
lru.evict_all();
|
||||
});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static future<> ignore_result(future<T>&& f) {
|
||||
return f.then_wrapped([] (auto&& f) {
|
||||
|
||||
@@ -1607,6 +1607,29 @@ future<> apply_resize_plan(token_metadata& tm, const migration_plan& plan) {
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
future<group0_guard> save_token_metadata(cql_test_env& e, group0_guard guard) {
|
||||
auto& stm = e.local_db().get_shared_token_metadata();
|
||||
auto tm = stm.get();
|
||||
|
||||
e.get_topology_state_machine().local()._topology.version = tm->get_version();
|
||||
|
||||
co_await save_tablet_metadata(e.local_db(), tm->tablets(), guard.write_timestamp());
|
||||
utils::chunked_vector<frozen_mutation> muts;
|
||||
muts.push_back(freeze(topology_mutation_builder(guard.write_timestamp())
|
||||
.set_version(tm->get_version())
|
||||
.build().to_mutation(db::system_keyspace::topology())));
|
||||
co_await e.local_db().apply(muts, db::no_timeout);
|
||||
co_await e.get_storage_service().local().update_tablet_metadata({});
|
||||
|
||||
// Need a new guard to make sure later changes use later timestamp.
|
||||
// Also, so that the table layer processes the changes we persisted, which is important for splits.
|
||||
// Before we can finalize a split, the storage group needs to process the split by creating split-ready compaction groups.
|
||||
release_guard(std::move(guard));
|
||||
abort_source as;
|
||||
co_return co_await e.get_raft_group0_client().start_operation(as);
|
||||
}
|
||||
|
||||
static
|
||||
future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migration_plan& plan, shared_load_stats* load_stats) {
|
||||
auto& talloc = e.get_tablet_allocator().local();
|
||||
@@ -1626,19 +1649,14 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
|
||||
co_await stm.mutate_token_metadata([table_id, &new_tmap, &changed] (token_metadata& tm) {
|
||||
changed = true;
|
||||
tm.tablets().set_tablet_map(table_id, std::move(new_tmap));
|
||||
tm.set_version(tm.get_version() + 1);
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
|
||||
if (changed) {
|
||||
// Need to reload on each resize because table object expects tablet count to change by a factor of 2.
|
||||
co_await save_tablet_metadata(e.local_db(), stm.get()->tablets(), guard.write_timestamp());
|
||||
co_await e.get_storage_service().local().update_tablet_metadata({});
|
||||
|
||||
// Need a new guard to make sure later changes use later timestamp.
|
||||
release_guard(std::move(guard));
|
||||
abort_source as;
|
||||
guard = co_await e.get_raft_group0_client().start_operation(as);
|
||||
guard = co_await save_token_metadata(e, std::move(guard));
|
||||
|
||||
if (load_stats) {
|
||||
auto new_tm = stm.get();
|
||||
@@ -1647,6 +1665,11 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
|
||||
load_stats->stats = *reconciled_stats;
|
||||
}
|
||||
}
|
||||
|
||||
testlog.debug("Calling local_topology_barrier()");
|
||||
old_tm = nullptr;
|
||||
co_await e.get_storage_service().local().local_topology_barrier();
|
||||
testlog.debug("Finished local_topology_barrier()");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1750,13 +1773,22 @@ void do_rebalance_tablets(cql_test_env& e,
|
||||
}).get();
|
||||
|
||||
if (auto_split && load_stats) {
|
||||
bool reload = false;
|
||||
auto& tm = *stm.get();
|
||||
for (const auto& [table, tmap]: tm.tablets().all_tables_ungrouped()) {
|
||||
if (std::holds_alternative<resize_decision::split>(tmap->resize_decision().way)) {
|
||||
testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
|
||||
load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
|
||||
if (load_stats->stats.tables[table].split_ready_seq_number != tmap->resize_decision().sequence_number) {
|
||||
testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
|
||||
load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
|
||||
reload = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Need to order split-ack before split finalization, storage_group assumes that.
|
||||
if (reload) {
|
||||
guard = save_token_metadata(e, std::move(guard)).get();
|
||||
}
|
||||
}
|
||||
|
||||
handle_resize_finalize(e, guard, plan, load_stats).get();
|
||||
|
||||
@@ -331,4 +331,28 @@ SEASTAR_THREAD_TEST_CASE(test_stale_version_notification) {
|
||||
std::cerr.rdbuf(oldCerr);
|
||||
|
||||
BOOST_TEST(my_stream.str().find("topology version 0 held for") != std::string::npos);
|
||||
}
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_raw_token) {
|
||||
const auto t1 = dht::token::from_int64(1);
|
||||
const auto t2 = dht::token::from_int64(2);
|
||||
|
||||
dht::raw_token_opt rt_opt;
|
||||
BOOST_REQUIRE(!rt_opt);
|
||||
rt_opt = dht::raw_token(t1);
|
||||
BOOST_REQUIRE(*rt_opt == t1);
|
||||
|
||||
BOOST_REQUIRE(dht::raw_token() == dht::minimum_token());
|
||||
BOOST_REQUIRE(dht::raw_token() < dht::raw_token(dht::first_token()));
|
||||
BOOST_REQUIRE(dht::raw_token() < dht::first_token());
|
||||
BOOST_REQUIRE(dht::raw_token() < dht::maximum_token());
|
||||
|
||||
auto rt1 = dht::raw_token(t1);
|
||||
BOOST_REQUIRE(bool(rt1));
|
||||
BOOST_REQUIRE(rt1 > dht::raw_token());
|
||||
BOOST_REQUIRE(rt1 > dht::minimum_token());
|
||||
BOOST_REQUIRE_EQUAL(rt1, t1);
|
||||
BOOST_REQUIRE(rt1 == t1);
|
||||
BOOST_REQUIRE(rt1 < t2);
|
||||
BOOST_REQUIRE(rt1 < dht::maximum_token());
|
||||
}
|
||||
|
||||
@@ -57,6 +57,20 @@ BOOST_AUTO_TEST_CASE(test_null_is_not_empty) {
|
||||
BOOST_REQUIRE(empty != null);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(test_null_data_value_to_parsable_string) {
|
||||
auto null_utf8 = data_value::make_null(utf8_type);
|
||||
BOOST_REQUIRE_EQUAL(null_utf8.to_parsable_string(), "null");
|
||||
|
||||
auto null_int = data_value::make_null(int32_type);
|
||||
BOOST_REQUIRE_EQUAL(null_int.to_parsable_string(), "null");
|
||||
|
||||
auto null_list = data_value::make_null(list_type_impl::get_instance(int32_type, true));
|
||||
BOOST_REQUIRE_EQUAL(null_list.to_parsable_string(), "null");
|
||||
|
||||
auto null_map = data_value::make_null(map_type_impl::get_instance(utf8_type, int32_type, true));
|
||||
BOOST_REQUIRE_EQUAL(null_map.to_parsable_string(), "null");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(test_bytes_type_string_conversions) {
|
||||
BOOST_REQUIRE(bytes_type->equal(bytes_type->from_string("616263646566"), bytes_type->decompose(data_value(bytes{"abcdef"}))));
|
||||
}
|
||||
|
||||
189
test/boost/vector_index_test.cc
Normal file
189
test/boost/vector_index_test.cc
Normal file
@@ -0,0 +1,189 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include <boost/test/unit_test.hpp>
|
||||
#include "cql3/column_identifier.hh"
|
||||
#include "cql3/statements/index_target.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "index/vector_index.hh"
|
||||
#include "utils/rjson.hh"
|
||||
|
||||
using namespace secondary_index;
|
||||
using namespace cql3;
|
||||
|
||||
using statements::index_target;
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(vector_index_test)
|
||||
|
||||
namespace {
|
||||
|
||||
::shared_ptr<index_target> make_single(const sstring& name) {
|
||||
auto col = ::make_shared<cql3::column_identifier>(name, true);
|
||||
return ::make_shared<index_target>(col, index_target::target_type::regular_values);
|
||||
}
|
||||
|
||||
::shared_ptr<index_target> make_multi(const std::vector<sstring>& names) {
|
||||
std::vector<::shared_ptr<column_identifier>> cols;
|
||||
cols.reserve(names.size());
|
||||
for (const auto& n : names) {
|
||||
cols.push_back(::make_shared<column_identifier>(n, true));
|
||||
}
|
||||
return ::make_shared<index_target>(std::move(cols), index_target::target_type::regular_values);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_empty_targets_throws) {
|
||||
std::vector<::shared_ptr<index_target>> targets;
|
||||
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_single_pk_only_target_throws) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {make_multi({"p1", "p2"})};
|
||||
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
|
||||
}
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_single_column) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_single("v"),
|
||||
};
|
||||
BOOST_CHECK_EQUAL(vector_index::serialize_targets(targets), "v");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_with_filtering_columns) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_single("v"),
|
||||
make_single("f1"),
|
||||
make_single("f2"),
|
||||
};
|
||||
|
||||
const auto result = vector_index::serialize_targets(targets);
|
||||
const auto json = rjson::parse(result);
|
||||
|
||||
BOOST_REQUIRE(json.IsObject());
|
||||
|
||||
BOOST_REQUIRE(!rjson::find(json, "pk"));
|
||||
|
||||
auto tc = rjson::find(json, "tc");
|
||||
BOOST_REQUIRE(tc);
|
||||
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
|
||||
|
||||
const auto* fc = rjson::find(json, "fc");
|
||||
BOOST_REQUIRE(fc);
|
||||
BOOST_REQUIRE(fc->IsArray());
|
||||
BOOST_REQUIRE_EQUAL(fc->Size(), 2);
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1");
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_local_index) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_multi({"p1", "p2"}),
|
||||
make_single("v"),
|
||||
};
|
||||
|
||||
const auto result = vector_index::serialize_targets(targets);
|
||||
const auto json = rjson::parse(result);
|
||||
|
||||
BOOST_REQUIRE(json.IsObject());
|
||||
|
||||
const auto* pk = rjson::find(json, "pk");
|
||||
BOOST_REQUIRE(pk);
|
||||
BOOST_REQUIRE(pk->IsArray());
|
||||
BOOST_REQUIRE_EQUAL(pk->Size(), 2);
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1");
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2");
|
||||
|
||||
auto tc = rjson::find(json, "tc");
|
||||
BOOST_REQUIRE(tc);
|
||||
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
|
||||
|
||||
BOOST_REQUIRE(!rjson::find(json, "fc"));
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_local_index_with_filtering_columns) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_multi({"p1", "p2"}),
|
||||
make_single("v"),
|
||||
make_single("f1"),
|
||||
make_single("f2"),
|
||||
};
|
||||
|
||||
const auto result = vector_index::serialize_targets(targets);
|
||||
const auto json = rjson::parse(result);
|
||||
|
||||
BOOST_REQUIRE(json.IsObject());
|
||||
|
||||
const auto* pk = rjson::find(json, "pk");
|
||||
BOOST_REQUIRE(pk);
|
||||
BOOST_REQUIRE(pk->IsArray());
|
||||
BOOST_REQUIRE_EQUAL(pk->Size(), 2);
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1");
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2");
|
||||
|
||||
auto tc = rjson::find(json, "tc");
|
||||
BOOST_REQUIRE(tc);
|
||||
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
|
||||
|
||||
const auto* fc = rjson::find(json, "fc");
|
||||
BOOST_REQUIRE(fc);
|
||||
BOOST_REQUIRE(fc->IsArray());
|
||||
BOOST_REQUIRE_EQUAL(fc->Size(), 2);
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1");
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_target_throws) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {make_multi({"p1"}), make_multi({"c1"})};
|
||||
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_filtering_throws) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {make_single("v"), make_multi({"c1"})};
|
||||
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_escaped_string) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {make_single("MyCol")};
|
||||
const auto serialized = vector_index::serialize_targets(targets);
|
||||
|
||||
BOOST_CHECK_EQUAL(serialized, "\"MyCol\"");
|
||||
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_escaped_string) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {make_single("a\"b")};
|
||||
const auto serialized = vector_index::serialize_targets(targets);
|
||||
|
||||
BOOST_CHECK_EQUAL(serialized, "\"a\"\"b\"");
|
||||
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_json) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_single("MyCol"),
|
||||
make_single("f1"),
|
||||
};
|
||||
const auto serialized = vector_index::serialize_targets(targets);
|
||||
|
||||
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_json) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_single("a\"b"),
|
||||
make_single("f1"),
|
||||
};
|
||||
const auto serialized = vector_index::serialize_targets(targets);
|
||||
|
||||
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b");
|
||||
}
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
@@ -3221,6 +3221,87 @@ SEASTAR_TEST_CASE(test_view_update_generating_writetime) {
|
||||
});
|
||||
}
|
||||
|
||||
// Usually if only an unselected column in the base table is modified, we expect an optimization that a view
|
||||
// update is not done, but we had an bug(https://scylladb.atlassian.net/browse/SCYLLADB-808) where the existence
|
||||
// of a collection selected in the view caused us to skip this optimization, even when it was not modified.
|
||||
// This test reproduces this bug.
|
||||
SEASTAR_TEST_CASE(test_view_update_unmodified_collection) {
|
||||
// In this test we verify that we correctly skip (or not) view updates to a view that selects
|
||||
// a collection column. We use two MVs, similarly as in the test above test.
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
|
||||
auto f1 = e.local_view_builder().wait_until_built("ks", "mv1");
|
||||
auto f2 = e.local_view_builder().wait_until_built("ks", "mv2");
|
||||
|
||||
e.execute_cql("CREATE TABLE t (k int, c int, a int, b list<int>, g int, primary key(k, c))").get();
|
||||
e.execute_cql("CREATE MATERIALIZED VIEW mv1 AS SELECT k,c,a,b FROM t "
|
||||
"WHERE k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, k)").get();
|
||||
e.execute_cql("CREATE MATERIALIZED VIEW mv2 AS SELECT k,c,a,b FROM t "
|
||||
"WHERE k IS NOT NULL AND c IS NOT NULL AND a IS NOT NULL PRIMARY KEY (c, k, a)").get();
|
||||
|
||||
f1.get();
|
||||
f2.get();
|
||||
|
||||
auto total_t_view_updates = [&] {
|
||||
return e.db().map_reduce0([] (replica::database& local_db) {
|
||||
const db::view::stats& local_stats = local_db.find_column_family("ks", "t").get_view_stats();
|
||||
return local_stats.view_updates_pushed_local + local_stats.view_updates_pushed_remote;
|
||||
}, 0, std::plus<int64_t>()).get();
|
||||
};
|
||||
|
||||
auto total_mv1_updates = [&] {
|
||||
return e.db().map_reduce0([] (replica::database& local_db) {
|
||||
return local_db.find_column_family("ks", "mv1").get_stats().writes.hist.count;
|
||||
}, 0, std::plus<int64_t>()).get();
|
||||
};
|
||||
|
||||
auto total_mv2_updates = [&] {
|
||||
return e.db().map_reduce0([] (replica::database& local_db) {
|
||||
return local_db.find_column_family("ks", "mv2").get_stats().writes.hist.count;
|
||||
}, 0, std::plus<int64_t>()).get();
|
||||
};
|
||||
|
||||
::shared_ptr<cql_transport::messages::result_message> msg;
|
||||
|
||||
e.execute_cql("INSERT INTO t (k, c, a) VALUES (1, 1, 1)").get();
|
||||
eventually([&] {
|
||||
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
|
||||
const update_counter expected{1, 1, 2};
|
||||
|
||||
BOOST_REQUIRE_EQUAL(results, expected);
|
||||
});
|
||||
|
||||
// We update an unselected column and the collection remains NULL, so we should generate an
|
||||
// update to the virtual column in mv1 but not to mv2.
|
||||
e.execute_cql("UPDATE t SET g=1 WHERE k=1 AND c=1;").get();
|
||||
eventually([&] {
|
||||
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
|
||||
const update_counter expected{2, 1, 3};
|
||||
|
||||
BOOST_REQUIRE_EQUAL(results, expected);
|
||||
});
|
||||
|
||||
// We update the collection with an initial value
|
||||
e.execute_cql("UPDATE t SET b=[1] WHERE k=1 AND c=1;").get();
|
||||
eventually([&] {
|
||||
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
|
||||
const update_counter expected{3, 2, 5};
|
||||
|
||||
BOOST_REQUIRE_EQUAL(results, expected);
|
||||
});
|
||||
|
||||
// We update an unselected column again with a non-NULL selected collection. Because the liveness of the updated column is unchanged
|
||||
// and no other selected column is updated (in particular, the collection column), we should generate no view updates.
|
||||
e.execute_cql("UPDATE t SET g=2 WHERE k=1 AND c=1;").get();
|
||||
eventually([&] {
|
||||
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
|
||||
const update_counter expected{3, 2, 5};
|
||||
|
||||
BOOST_REQUIRE_EQUAL(results, expected);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_conflicting_batch) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
|
||||
|
||||
@@ -254,27 +254,3 @@ async def test_node_ops_task_wait(manager: ManagerClient):
|
||||
|
||||
await decommission_task
|
||||
await waiting_task
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_children(manager: ManagerClient):
|
||||
module_name = "node_ops"
|
||||
tm = TaskManagerClient(manager.api)
|
||||
servers = [await manager.server_add(cmdline=cmdline) for _ in range(2)]
|
||||
|
||||
injection = "tasks_vt_get_children"
|
||||
handler = await inject_error_one_shot(manager.api, servers[0].ip_addr, injection)
|
||||
|
||||
log = await manager.server_open_log(servers[0].server_id)
|
||||
mark = await log.mark()
|
||||
|
||||
bootstrap_task = [task for task in await tm.list_tasks(servers[0].ip_addr, module_name) if task.kind == "cluster"][0]
|
||||
|
||||
async def _decommission():
|
||||
await log.wait_for('tasks_vt_get_children: waiting', from_mark=mark)
|
||||
await manager.decommission_node(servers[1].server_id)
|
||||
await handler.message()
|
||||
|
||||
async def _get_status():
|
||||
await tm.get_task_status(servers[0].ip_addr, bootstrap_task.task_id)
|
||||
|
||||
await asyncio.gather(*(_decommission(), _get_status()))
|
||||
|
||||
@@ -12,9 +12,11 @@ import pytest
|
||||
from test.pylib.internal_types import ServerInfo
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.repair import create_table_insert_data_for_repair, get_tablet_task_id
|
||||
from test.pylib.rest_client import read_barrier
|
||||
from test.pylib.tablets import get_all_tablet_replicas
|
||||
from test.cluster.conftest import skip_mode
|
||||
from test.cluster.util import create_new_test_keyspace, new_test_keyspace
|
||||
from test.cluster.util import create_new_test_keyspace, new_test_keyspace, get_topology_coordinator, find_server_by_host_id
|
||||
from test.cluster.test_incremental_repair import trigger_tablet_merge
|
||||
from test.cluster.test_tablets2 import inject_error_on
|
||||
from test.cluster.tasks.task_manager_client import TaskManagerClient
|
||||
from test.cluster.tasks.task_manager_types import TaskStatus, TaskStats
|
||||
@@ -96,6 +98,50 @@ async def test_tablet_repair_task(manager: ManagerClient):
|
||||
|
||||
await asyncio.gather(repair_task(), check_and_abort_repair_task(manager, tm, servers, module_name, ks))
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_tablet_repair_wait_with_table_drop(manager: ManagerClient):
|
||||
module_name = "tablets"
|
||||
tm = TaskManagerClient(manager.api)
|
||||
injection = "tablet_virtual_task_wait"
|
||||
|
||||
cmdline = [
|
||||
'--logger-log-level', 'debug_error_injection=debug',
|
||||
]
|
||||
servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager, cmdline=cmdline)
|
||||
assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered"
|
||||
|
||||
token = -1
|
||||
await enable_injection(manager, servers, "repair_tablet_fail_on_rpc_call")
|
||||
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, await_completion=False)
|
||||
|
||||
repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", keyspace=ks)
|
||||
|
||||
task = repair_tasks[0]
|
||||
assert task.scope == "table"
|
||||
assert task.keyspace == ks
|
||||
assert task.table == "test"
|
||||
assert task.state in ["created", "running"]
|
||||
|
||||
log = await manager.server_open_log(servers[0].server_id)
|
||||
mark = await log.mark()
|
||||
|
||||
await enable_injection(manager, [servers[0]], injection)
|
||||
|
||||
async def wait_for_task():
|
||||
status_wait = await tm.wait_for_task(servers[0].ip_addr, task.task_id)
|
||||
assert status_wait.state == "done"
|
||||
|
||||
async def drop_table():
|
||||
await log.wait_for(f'"{injection}"', from_mark=mark)
|
||||
await disable_injection(manager, servers, "repair_tablet_fail_on_rpc_call")
|
||||
await manager.get_cql().run_async(f"DROP TABLE {ks}.test")
|
||||
await manager.api.message_injection(servers[0].ip_addr, injection)
|
||||
|
||||
await asyncio.gather(wait_for_task(), drop_table())
|
||||
|
||||
await disable_injection(manager, servers, injection)
|
||||
|
||||
async def check_repair_task_list(tm: TaskManagerClient, servers: list[ServerInfo], module_name: str, keyspace: str):
|
||||
def get_task_with_id(repair_tasks, task_id):
|
||||
tasks_with_id1 = [task for task in repair_tasks if task.task_id == task_id]
|
||||
@@ -151,6 +197,45 @@ async def test_tablet_repair_task_list(manager: ManagerClient):
|
||||
|
||||
await asyncio.gather(run_repair(0, "test"), run_repair(1, "test2"), run_repair(2, "test3"), check_repair_task_list(tm, servers, module_name, ks))
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_tablet_repair_wait(manager: ManagerClient):
|
||||
module_name = "tablets"
|
||||
tm = TaskManagerClient(manager.api)
|
||||
|
||||
stop_repair_injection = "repair_tablet_repair_task_impl_run"
|
||||
servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager)
|
||||
assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered"
|
||||
|
||||
await inject_error_on(manager, stop_repair_injection, servers)
|
||||
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", "all", await_completion=False)
|
||||
|
||||
repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", keyspace=ks)
|
||||
task = repair_tasks[0]
|
||||
|
||||
log = await manager.server_open_log(servers[0].server_id)
|
||||
mark = await log.mark()
|
||||
|
||||
async def wait_for_task():
|
||||
await enable_injection(manager, servers, "tablet_virtual_task_wait")
|
||||
status_wait = await tm.wait_for_task(servers[0].ip_addr, task.task_id)
|
||||
|
||||
async def merge_tablets():
|
||||
await log.wait_for('tablet_virtual_task: wait until tablet operation is finished', from_mark=mark)
|
||||
|
||||
# Resume repair.
|
||||
await message_injection(manager, servers, stop_repair_injection)
|
||||
|
||||
# Merge tablets.
|
||||
coord = await find_server_by_host_id(manager, servers, await get_topology_coordinator(manager))
|
||||
log2 = await manager.server_open_log(coord.server_id)
|
||||
await trigger_tablet_merge(manager, servers, [log2])
|
||||
|
||||
await read_barrier(manager.api, servers[0].ip_addr)
|
||||
await message_injection(manager, servers, "tablet_virtual_task_wait")
|
||||
|
||||
await asyncio.gather(wait_for_task(), merge_tablets())
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_tablet_repair_task_children(manager: ManagerClient):
|
||||
|
||||
70
test/cluster/test_bootstrap_with_quick_group0_join.py
Normal file
70
test/cluster/test_bootstrap_with_quick_group0_join.py
Normal file
@@ -0,0 +1,70 @@
|
||||
#
|
||||
# Copyright (C) 2026-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
#
|
||||
import logging
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
from test.cluster.util import get_current_group0_config
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.rest_client import read_barrier
|
||||
from test.pylib.util import wait_for
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_bootstrap_with_quick_group0_join(manager: ManagerClient):
|
||||
"""Regression test for https://scylladb.atlassian.net/browse/SCYLLADB-959.
|
||||
|
||||
The bug was that when the bootstrapping node joined group0 before reaching
|
||||
post_server_start, it skipped post_server_start and thus hung forever.
|
||||
|
||||
The test simulates the scenario by starting the second node with the
|
||||
join_group0_pause_before_config_check injection. Without the fix, the
|
||||
startup times out.
|
||||
"""
|
||||
logger.info("Adding first server")
|
||||
s1 = await manager.server_add()
|
||||
|
||||
logger.info("Adding second server with join_group0_pause_before_config_check enabled")
|
||||
s2 = await manager.server_add(start=False, config={
|
||||
'error_injections_at_startup': ['join_group0_pause_before_config_check']
|
||||
})
|
||||
|
||||
logger.info(f"Starting {s2}")
|
||||
start_task = asyncio.create_task(manager.server_start(s2.server_id))
|
||||
|
||||
s2_log = await manager.server_open_log(s2.server_id)
|
||||
|
||||
await s2_log.wait_for("join_group0_pause_before_config_check: waiting for message", timeout=60)
|
||||
|
||||
s1_host_id = await manager.get_host_id(s1.server_id)
|
||||
s2_host_id = await manager.get_host_id(s2.server_id)
|
||||
|
||||
async def s2_in_group0_config_on_s1():
|
||||
config = await get_current_group0_config(manager, s1)
|
||||
ids = {m[0] for m in config}
|
||||
assert s1_host_id in ids # sanity check
|
||||
return True if s2_host_id in ids else None
|
||||
|
||||
# Note: we would like to wait for s2 to see itself in the group0 config, but we can't execute
|
||||
# get_current_group0_config for s2, as s2 doesn't handle CQL requests at this point. As a workaround, we wait for s1
|
||||
# to see s2 and then perform a read barrier on s2.
|
||||
logger.info(f"Waiting for {s1} to see {s2} in the group0 config")
|
||||
await wait_for(s2_in_group0_config_on_s1, deadline=time.time() + 60, period=0.1)
|
||||
|
||||
logger.info(f"Performing read barrier on {s2} to make sure it sees itself in the group0 config")
|
||||
await read_barrier(manager.api, s2.ip_addr)
|
||||
|
||||
logger.info(f"Unblocking {s2}")
|
||||
await manager.api.message_injection(s2.ip_addr, 'join_group0_pause_before_config_check')
|
||||
|
||||
logger.info(f"Waiting for {s2} to complete bootstrap")
|
||||
await asyncio.wait_for(start_task, timeout=60)
|
||||
@@ -54,9 +54,9 @@ async def run_test_cache_tombstone_gc(manager: ManagerClient, statement_pairs: l
|
||||
" AND compaction = {'class': 'NullCompactionStrategy'}")
|
||||
|
||||
for write_statement, delete_statement in statement_pairs:
|
||||
execute_with_tracing(cql, write_statement.format(ks=ks), log = True)
|
||||
execute_with_tracing(cql, SimpleStatement(write_statement.format(ks=ks), consistency_level=ConsistencyLevel.ALL), log = True)
|
||||
await manager.api.enable_injection(node3.ip_addr, "database_apply", one_shot=False)
|
||||
execute_with_tracing(cql, delete_statement.format(ks=ks), log = True)
|
||||
execute_with_tracing(cql, SimpleStatement(delete_statement.format(ks=ks), consistency_level=ConsistencyLevel.LOCAL_QUORUM), log = True)
|
||||
await manager.api.disable_injection(node3.ip_addr, "database_apply")
|
||||
|
||||
def check_data(host, data):
|
||||
|
||||
@@ -117,17 +117,18 @@ async def create_encrypted_cf(manager: ManagerClient, ks: str,
|
||||
|
||||
return new_test_table(manager, ks, columns, extra)
|
||||
|
||||
async def prepare_write_workload(cql: CassandraSession, table_name, flush=True, n: int = None):
|
||||
"""write some data"""
|
||||
keys = list(range(n if n else 100))
|
||||
async def prepare_write_workload(cql: CassandraSession, table_name, flush=True, n: int = None) -> list[str]:
|
||||
"""write some data, returns list of written partition keys"""
|
||||
key_ids = list(range(n if n else 100))
|
||||
c1_values = ['value1']
|
||||
c2_values = ['value2']
|
||||
|
||||
statement = cql.prepare(f"INSERT INTO {table_name} (key, c1, c2) VALUES (?, ?, ?)")
|
||||
statement.consistency_level = ConsistencyLevel.ALL
|
||||
|
||||
keys = [f"k{x}" for x in key_ids]
|
||||
await asyncio.gather(*[cql.run_async(statement, params) for params in
|
||||
list(map(lambda x, y, z: [f"k{x}", y, z], keys,
|
||||
list(map(lambda x, y, z: [x, y, z], keys,
|
||||
itertools.cycle(c1_values),
|
||||
itertools.cycle(c2_values)))]
|
||||
)
|
||||
@@ -135,10 +136,14 @@ async def prepare_write_workload(cql: CassandraSession, table_name, flush=True,
|
||||
if flush:
|
||||
nodetool.flush(cql, table_name)
|
||||
|
||||
async def read_verify_workload(cql: CassandraSession, table_name: str, expected_len: int = 100):
|
||||
"""check written data"""
|
||||
rows = list(cql.execute(f"SELECT c1, c2 FROM {table_name}"))
|
||||
assert len(rows) == expected_len
|
||||
return keys
|
||||
|
||||
async def read_verify_workload(cql: CassandraSession, table_name: str, keys: list[str]):
|
||||
"""check written data using single-partition queries"""
|
||||
statement = cql.prepare(f"SELECT c1, c2 FROM {table_name} WHERE key = ?")
|
||||
rows = await asyncio.gather(*[cql.run_async(statement, [key]) for key in keys])
|
||||
for key, result in zip(keys, rows):
|
||||
assert len(list(result)) == 1, f"Expected 1 row for key={key}, got {len(list(result))}"
|
||||
|
||||
async def _smoke_test(manager: ManagerClient, key_provider: KeyProviderFactory,
|
||||
ciphers: dict[str, list[int]], compression: str = None,
|
||||
@@ -167,8 +172,8 @@ async def _smoke_test(manager: ManagerClient, key_provider: KeyProviderFactory,
|
||||
compression=compression,
|
||||
additional_options=additional_options
|
||||
))
|
||||
await prepare_write_workload(cql, table_name=table_name)
|
||||
cfs.append(table_name)
|
||||
keys = await prepare_write_workload(cql, table_name=table_name)
|
||||
cfs.append((table_name, keys))
|
||||
except Exception as e:
|
||||
if exception_handler:
|
||||
exception_handler(e, cipher_algorithm, secret_key_strength)
|
||||
@@ -176,12 +181,12 @@ async def _smoke_test(manager: ManagerClient, key_provider: KeyProviderFactory,
|
||||
raise e
|
||||
# restart the cluster
|
||||
if restart:
|
||||
await restart(manager, servers, cfs)
|
||||
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
|
||||
await restart(manager, servers, [table_name for table_name, _ in cfs])
|
||||
cql, _ = await manager.get_ready_cql(servers)
|
||||
else:
|
||||
await manager.rolling_restart(servers)
|
||||
for table_name in cfs:
|
||||
await read_verify_workload(cql, table_name=table_name)
|
||||
for table_name, keys in cfs:
|
||||
await read_verify_workload(cql, table_name=table_name, keys=keys)
|
||||
|
||||
# default: 'AES/CBC/PKCS5Padding', length 128
|
||||
supported_cipher_algorithms = {
|
||||
@@ -363,7 +368,7 @@ async def test_alter(manager, key_provider):
|
||||
table_names[0], False,
|
||||
expected_data=expected_data)
|
||||
|
||||
await read_verify_workload(cql, table_name=table_names[0])
|
||||
await read_verify_workload(cql, table_name=table_names[0], keys=[row[0] for row in expected_data])
|
||||
# enable encryption again
|
||||
options = key_provider.additional_cf_options()
|
||||
cql.execute(f"ALTER TABLE {table_names[0]} with scylla_encryption_options={options}")
|
||||
@@ -433,7 +438,8 @@ async def test_non_existant_table_master_key(manager: ManagerClient, tmpdir):
|
||||
|
||||
async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
|
||||
cfg = {"authenticator": "org.apache.cassandra.auth.PasswordAuthenticator",
|
||||
"authorizer": "org.apache.cassandra.auth.CassandraAuthorizer"}
|
||||
"authorizer": "org.apache.cassandra.auth.CassandraAuthorizer",
|
||||
"commitlog_sync": "batch" }
|
||||
|
||||
servers: list[ServerInfo] = await manager.servers_add(servers_num = 1, config=cfg,
|
||||
driver_connect_opts={'auth_provider': PlainTextAuthProvider(username='cassandra', password='cassandra')})
|
||||
@@ -450,11 +456,14 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
|
||||
file_paths = [f for f in file_paths if os.path.isfile(f) and not os.path.islink(f)]
|
||||
|
||||
for file_path in file_paths:
|
||||
with open(file_path, 'rb') as f:
|
||||
data = f.read()
|
||||
if pbytes in data:
|
||||
pattern_found_counter += 1
|
||||
logger.debug("Pattern '%s' found in %s", pattern, file_path)
|
||||
try:
|
||||
with open(file_path, 'rb') as f:
|
||||
data = f.read()
|
||||
if pbytes in data:
|
||||
pattern_found_counter += 1
|
||||
logger.debug("Pattern '%s' found in %s", pattern, file_path)
|
||||
except FileNotFoundError:
|
||||
pass # assume just compacted away
|
||||
|
||||
if expect:
|
||||
assert pattern_found_counter > 0
|
||||
@@ -462,15 +471,15 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
|
||||
assert pattern_found_counter == 0
|
||||
|
||||
async def verify_system_info(expect: bool):
|
||||
user = f"user_{str(uuid.uuid4())}"
|
||||
user = f"user_{str(uuid.uuid4())}".replace('-','_')
|
||||
pwd = f"pwd_{str(uuid.uuid4())}"
|
||||
cql.execute(f"CREATE USER {user} WITH PASSWORD '{pwd}' NOSUPERUSER")
|
||||
assert_one(cql, f"LIST ROLES of {user}", [user, False, True, {}])
|
||||
|
||||
logger.debug("Verify PART 1: check commitlogs -------------")
|
||||
|
||||
grep_database_files(pwd, "commitlog", "**/*.log", expect)
|
||||
grep_database_files(user, "commitlog", "**/*.log", True)
|
||||
await grep_database_files(pwd, "commitlog", "**/*.log", False)
|
||||
await grep_database_files(user, "commitlog", "**/*.log", expect)
|
||||
|
||||
salted_hash = None
|
||||
system_auth = None
|
||||
@@ -487,39 +496,38 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
|
||||
|
||||
assert salted_hash is not None
|
||||
assert system_auth is not None
|
||||
grep_database_files(salted_hash, "commitlog", "**/*.log", expect)
|
||||
await grep_database_files(salted_hash, "commitlog", "**/*.log", expect)
|
||||
|
||||
rand_comment = f"comment_{str(uuid.uuid4())}"
|
||||
|
||||
async with await create_ks(manager) as ks:
|
||||
async with await new_test_table(cql, ks, "key text PRIMARY KEY, c1 text, c2 text") as table:
|
||||
async with new_test_table(manager, ks, "key text PRIMARY KEY, c1 text, c2 text") as table:
|
||||
cql.execute(f"ALTER TABLE {table} WITH comment = '{rand_comment}'")
|
||||
grep_database_files(rand_comment, "commitlog/schema", "**/*.log", expect)
|
||||
nodetool.flush_all(cql)
|
||||
await grep_database_files(rand_comment, "commitlog/schema", "**/*.log", expect)
|
||||
# Note: original test did greping in sstables. This does no longer work
|
||||
# since all system tables are compressed, and thus binary greping will
|
||||
# not work. We could do scylla sstable dump-data and grep in the json,
|
||||
# but this is somewhat pointless as this would, if it handles it, just
|
||||
# decrypt the info from the sstable, thus we can't really verify anything.
|
||||
# We could maybe check that the expected system tables are in fact encrypted,
|
||||
# though this is more a promise than guarantee... Also, the only tables
|
||||
# encrypted are paxos and batchlog -> pointless
|
||||
|
||||
logger.debug("Verify PART 2: check sstable files -------------\n`system_info_encryption` won't encrypt sstable files on disk")
|
||||
logger.debug("GREP_DB_FILES: Check PM key user in sstable file ....")
|
||||
grep_database_files(user, f"data/{system_auth}/", "**/*-Data.db", expect=True)
|
||||
logger.debug("GREP_DB_FILES: Check original password in commitlogs .... Original password should never be saved")
|
||||
grep_database_files(pwd, f"data/{system_auth}/", "**/*-Data.db", expect=False)
|
||||
logger.debug("GREP_DB_FILES: Check salted_hash of password in sstable file ....")
|
||||
grep_database_files(salted_hash, f"data/{system_auth}/", "**/*-Data.db", expect=False)
|
||||
logger.debug("GREP_DB_FILES: Check table comment in sstable file ....")
|
||||
grep_database_files(rand_comment, "data/system_schema/", "**/*-Data.db", expect=True)
|
||||
|
||||
verify_system_info(True) # not encrypted
|
||||
await verify_system_info(True) # not encrypted
|
||||
|
||||
cfg = {"system_info_encryption": {
|
||||
"enabled": True,
|
||||
"key_provider": "LocalFileSystemKeyProviderFactory"}
|
||||
"key_provider": "LocalFileSystemKeyProviderFactory"},
|
||||
"system_key_directory": os.path.join(tmpdir, "resources/system_keys")
|
||||
}
|
||||
|
||||
for server in servers:
|
||||
manager.server_update_config(server.server_id, config_options=cfg)
|
||||
await manager.server_update_config(server.server_id, config_options=cfg)
|
||||
await manager.server_restart(server.server_id)
|
||||
|
||||
await manager.rolling_restart(servers)
|
||||
|
||||
verify_system_info(False) # should not see stuff now
|
||||
await verify_system_info(False) # should not see stuff now
|
||||
|
||||
|
||||
async def test_system_encryption_reboot(manager: ManagerClient, tmpdir):
|
||||
|
||||
@@ -8,8 +8,11 @@ import asyncio
|
||||
import time
|
||||
import pytest
|
||||
import logging
|
||||
from functools import partial
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.conftest import skip_mode
|
||||
from test.pylib.util import wait_for
|
||||
from test.pylib.internal_types import ServerInfo
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -17,6 +20,26 @@ logger = logging.getLogger(__name__)
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_crashed_node_substitution(manager: ManagerClient):
|
||||
"""Test that a node which crashed after starting gossip but before joining group0
|
||||
(an 'orphan' node) is eventually removed from gossip by the gossiper_orphan_remover_fiber.
|
||||
|
||||
The scenario:
|
||||
1. Start 3 nodes with the 'fast_orphan_removal_fiber' injection enabled. This freezes
|
||||
the gossiper_orphan_remover_fiber on each node before it enters its polling loop,
|
||||
so it cannot remove any orphan until explicitly unblocked.
|
||||
2. Start a 4th node with the 'crash_before_group0_join' injection enabled. This node
|
||||
starts gossip normally but blocks inside pre_server_start(), just before sending
|
||||
the join RPC to the topology coordinator. It never joins group0.
|
||||
3. Wait until the 4th node's gossip state has fully propagated to all 3 running peers,
|
||||
then trigger its crash via the injection. At this point all peers see it as an orphan:
|
||||
present in gossip but absent from the group0 topology.
|
||||
4. Assert the orphan is visible in gossip (live or down) on the surviving nodes.
|
||||
5. Unblock the gossiper_orphan_remover_fiber on all 3 nodes (via message_injection) and
|
||||
enable the 'speedup_orphan_removal' injection so the fiber removes the orphan immediately
|
||||
without waiting for the normal 60-second age threshold.
|
||||
6. Wait for the 'Finished to force remove node' log line confirming removal, then assert
|
||||
the orphan is no longer present in gossip.
|
||||
"""
|
||||
servers = await manager.servers_add(3, config={
|
||||
'error_injections_at_startup': ['fast_orphan_removal_fiber']
|
||||
})
|
||||
@@ -31,10 +54,24 @@ async def test_crashed_node_substitution(manager: ManagerClient):
|
||||
log = await manager.server_open_log(failed_server.server_id)
|
||||
await log.wait_for("finished do_send_ack2_msg")
|
||||
failed_id = await manager.get_host_id(failed_server.server_id)
|
||||
|
||||
# Wait until the failed server's gossip state has propagated to all running peers.
|
||||
# "finished do_send_ack2_msg" only guarantees that one peer completed a gossip round
|
||||
# with the failed server; other nodes learn about it only in subsequent gossip rounds.
|
||||
# Querying gossip before propagation completes would cause the assertion below to fail
|
||||
# because the orphan node would not yet appear as live or down on every peer.
|
||||
async def gossip_has_node(server: ServerInfo):
|
||||
live = await manager.api.client.get_json("/gossiper/endpoint/live", host=server.ip_addr)
|
||||
down = await manager.api.client.get_json("/gossiper/endpoint/down", host=server.ip_addr)
|
||||
return True if failed_server.ip_addr in live + down else None
|
||||
|
||||
for s in servers:
|
||||
await wait_for(partial(gossip_has_node, s), deadline=time.time() + 30)
|
||||
|
||||
await manager.api.message_injection(failed_server.ip_addr, 'crash_before_group0_join')
|
||||
|
||||
|
||||
await task
|
||||
|
||||
|
||||
live_eps = await manager.api.client.get_json("/gossiper/endpoint/live", host=servers[0].ip_addr)
|
||||
down_eps = await manager.api.client.get_json("/gossiper/endpoint/down", host=servers[0].ip_addr)
|
||||
|
||||
|
||||
@@ -87,7 +87,7 @@ async def test_limited_concurrency_of_writes(manager: ManagerClient):
|
||||
})
|
||||
node2 = await manager.server_add()
|
||||
|
||||
cql = manager.get_cql()
|
||||
cql = await manager.get_cql_exclusive(node1)
|
||||
async with new_test_keyspace(manager, "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") as ks:
|
||||
table = f"{ks}.t"
|
||||
await cql.run_async(f"CREATE TABLE {table} (pk int primary key, v int)")
|
||||
|
||||
@@ -312,14 +312,28 @@ async def test_tablet_incremental_repair_error(manager: ManagerClient):
|
||||
token = -1
|
||||
map0 = await load_tablet_sstables_repaired_at(manager, cql, servers[0], hosts[0], table_id)
|
||||
|
||||
# Repair should not finish with error
|
||||
# Repair should not finish while the injection is enabled. We abort the task
|
||||
# before turning the injection off, otherwise it may continue in background
|
||||
# and increase sstables_repaired_at.
|
||||
await inject_error_on(manager, "repair_tablet_fail_on_rpc_call", servers)
|
||||
try:
|
||||
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental', timeout=10)
|
||||
assert False # Check the tablet repair is not supposed to finish
|
||||
except TimeoutError:
|
||||
logger.info("Repair timeout as expected")
|
||||
await inject_error_off(manager, "repair_tablet_fail_on_rpc_call", servers)
|
||||
repair_response = await manager.api.tablet_repair(
|
||||
servers[0].ip_addr,
|
||||
ks,
|
||||
"test",
|
||||
token,
|
||||
await_completion=False,
|
||||
incremental_mode='incremental',
|
||||
)
|
||||
task_id = repair_response['tablet_task_id']
|
||||
|
||||
with pytest.raises(asyncio.TimeoutError):
|
||||
await asyncio.wait_for(manager.api.wait_task(servers[0].ip_addr, task_id), timeout=10)
|
||||
|
||||
await manager.api.abort_task(servers[0].ip_addr, task_id)
|
||||
await manager.api.wait_task(servers[0].ip_addr, task_id)
|
||||
finally:
|
||||
await inject_error_off(manager, "repair_tablet_fail_on_rpc_call", servers)
|
||||
|
||||
map1 = await load_tablet_sstables_repaired_at(manager, cql, servers[0], hosts[0], table_id)
|
||||
|
||||
@@ -609,14 +623,19 @@ async def do_test_tablet_incremental_repair_merge_error(manager, error):
|
||||
|
||||
scylla_path = get_scylla_path(cql)
|
||||
|
||||
coord = await get_topology_coordinator(manager)
|
||||
coord_serv = await find_server_by_host_id(manager, servers, coord)
|
||||
coord_log = await manager.server_open_log(coord_serv.server_id)
|
||||
|
||||
# Trigger merge and error in merge
|
||||
s1_mark = await logs[0].mark()
|
||||
await inject_error_on(manager, error, servers[:1])
|
||||
mark = await coord_log.mark()
|
||||
await inject_error_on(manager, error, [coord_serv])
|
||||
await inject_error_on(manager, "tablet_force_tablet_count_decrease", servers)
|
||||
await logs[0].wait_for(f'Got {error}', from_mark=s1_mark)
|
||||
await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
|
||||
await coord_log.wait_for(f'Got {error}', from_mark=mark)
|
||||
await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
|
||||
await manager.server_stop(servers[0].server_id)
|
||||
await manager.server_start(servers[0].server_id)
|
||||
await manager.server_stop(coord_serv.server_id)
|
||||
await manager.server_start(coord_serv.server_id)
|
||||
|
||||
for server in servers:
|
||||
await manager.server_stop_gracefully(server.server_id)
|
||||
@@ -862,50 +881,6 @@ async def test_repair_sigsegv_with_diff_shard_count(manager: ManagerClient, use_
|
||||
logger.info("Starting vnode repair")
|
||||
await manager.api.repair(servers[1].ip_addr, ks, "test")
|
||||
|
||||
# Reproducer for https://github.com/scylladb/scylladb/issues/27365
|
||||
# Incremental repair vs tablet merge
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_tablet_incremental_repair_tablet_merge_compaction_group_gone(manager: ManagerClient):
|
||||
cmdline = ['--logger-log-level', 'repair=debug']
|
||||
servers, cql, hosts, ks, table_id, logs, _, _, _, _ = await preapre_cluster_for_incremental_repair(manager, cmdline=cmdline)
|
||||
|
||||
coord = await get_topology_coordinator(manager)
|
||||
coord_serv = await find_server_by_host_id(manager, servers, coord)
|
||||
coord_log = await manager.server_open_log(coord_serv.server_id)
|
||||
|
||||
# Trigger merge and wait until the merge fiber starts
|
||||
s1_mark = await coord_log.mark()
|
||||
await inject_error_on(manager, "merge_completion_fiber", servers)
|
||||
await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
|
||||
await inject_error_on(manager, "tablet_force_tablet_count_decrease", servers)
|
||||
await coord_log.wait_for(f'Detected tablet merge for table', from_mark=s1_mark)
|
||||
await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
|
||||
await coord_log.wait_for(f'merge_completion_fiber: waiting for message', from_mark=s1_mark)
|
||||
|
||||
# Trigger repair and wait for the inc repair prepare preparation to start
|
||||
s1_mark = await coord_log.mark()
|
||||
await inject_error_on(manager, "wait_after_prepare_sstables_for_incremental_repair", servers)
|
||||
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token=-1, await_completion=False, incremental_mode='incremental')
|
||||
# Wait for preparation to start.
|
||||
await coord_log.wait_for('Disabling compaction for range', from_mark=s1_mark)
|
||||
# Without the serialization, sleep to increase chances of preparation finishing before merge fiber.
|
||||
# With the serialization, preparation will wait for merge fiber to finish.
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
# Continue to execute the merge fiber so that the compaction group is removed
|
||||
await inject_error_on(manager, "replica_merge_completion_wait", servers)
|
||||
for s in servers:
|
||||
await manager.api.message_injection(s.ip_addr, "merge_completion_fiber")
|
||||
|
||||
await coord_log.wait_for(f'Merge completion fiber finished', from_mark=s1_mark)
|
||||
|
||||
# Continue the repair to trigger use-after-free
|
||||
for s in servers:
|
||||
await manager.api.message_injection(s.ip_addr, "wait_after_prepare_sstables_for_incremental_repair")
|
||||
|
||||
await coord_log.wait_for(f'Finished tablet repair', from_mark=s1_mark)
|
||||
|
||||
# Reproducer for https://github.com/scylladb/scylladb/issues/27365
|
||||
# Incremental repair vs table drop
|
||||
@pytest.mark.asyncio
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user