mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-19 16:15:07 +00:00
Compare commits
41 Commits
fix_sl_v2_
...
scylla-202
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b638170a4e | ||
|
|
d5c7f29734 | ||
|
|
a5dd529475 | ||
|
|
b176591488 | ||
|
|
233da83dd9 | ||
|
|
9b81939a93 | ||
|
|
804842e95c | ||
|
|
4f77cb621f | ||
|
|
eb6c333e1b | ||
|
|
8d21636a81 | ||
|
|
7f236baf61 | ||
|
|
4da8641d83 | ||
|
|
3ab789e1ca | ||
|
|
25a17282bd | ||
|
|
7afcc56128 | ||
|
|
32443ed6f7 | ||
|
|
3e9b984020 | ||
|
|
2d199fb609 | ||
|
|
35cd7f9239 | ||
|
|
32ce43d4b1 | ||
|
|
fef7750eb6 | ||
|
|
213442227d | ||
|
|
1398a55d16 | ||
|
|
a0a2a67634 | ||
|
|
d4e454b5bc | ||
|
|
825a36c97a | ||
|
|
45413e99a5 | ||
|
|
c93a935564 | ||
|
|
69f78ce74a | ||
|
|
3513ce6069 | ||
|
|
0ca7253315 | ||
|
|
c7ac3b5394 | ||
|
|
d6ed05efc1 | ||
|
|
39fcc83e75 | ||
|
|
6250f1e967 | ||
|
|
b307c9301d | ||
|
|
f26af8cd30 | ||
|
|
2bd10bff5e | ||
|
|
1105d83893 | ||
|
|
9b9d5cee8a | ||
|
|
a8fd9936a3 |
22
.github/workflows/trigger-scylla-ci.yaml
vendored
22
.github/workflows/trigger-scylla-ci.yaml
vendored
@@ -15,13 +15,19 @@ jobs:
|
||||
- name: Verify Org Membership
|
||||
id: verify_author
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
|
||||
PR_ASSOCIATION: ${{ github.event.pull_request.author_association }}
|
||||
COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
|
||||
COMMENT_ASSOCIATION: ${{ github.event.comment.author_association }}
|
||||
shell: bash
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then
|
||||
AUTHOR="${{ github.event.pull_request.user.login }}"
|
||||
if [[ "$EVENT_NAME" == "pull_request_target" ]]; then
|
||||
AUTHOR="$PR_AUTHOR"
|
||||
ASSOCIATION="$PR_ASSOCIATION"
|
||||
else
|
||||
AUTHOR="${{ github.event.comment.user.login }}"
|
||||
AUTHOR="$COMMENT_AUTHOR"
|
||||
ASSOCIATION="$COMMENT_ASSOCIATION"
|
||||
fi
|
||||
ORG="scylladb"
|
||||
if gh api "/orgs/${ORG}/members/${AUTHOR}" --silent 2>/dev/null; then
|
||||
@@ -34,13 +40,11 @@ jobs:
|
||||
- name: Validate Comment Trigger
|
||||
if: github.event_name == 'issue_comment'
|
||||
id: verify_comment
|
||||
env:
|
||||
COMMENT_BODY: ${{ github.event.comment.body }}
|
||||
shell: bash
|
||||
run: |
|
||||
BODY=$(cat << 'EOF'
|
||||
${{ github.event.comment.body }}
|
||||
EOF
|
||||
)
|
||||
CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
|
||||
CLEAN_BODY=$(echo "$COMMENT_BODY" | grep -v '^[[:space:]]*>')
|
||||
|
||||
if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
|
||||
echo "trigger=true" >> $GITHUB_OUTPUT
|
||||
|
||||
@@ -78,7 +78,7 @@ fi
|
||||
|
||||
# Default scylla product/version tags
|
||||
PRODUCT=scylla
|
||||
VERSION=2026.1.0
|
||||
VERSION=2026.1.1
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
#include "mutation/mutation_fragment_stream_validator.hh"
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "utils/pretty_printers.hh"
|
||||
#include "readers/multi_range.hh"
|
||||
#include "readers/compacting.hh"
|
||||
@@ -611,23 +612,23 @@ private:
|
||||
}
|
||||
|
||||
// Called in a seastar thread
|
||||
dht::partition_range_vector
|
||||
utils::chunked_vector<dht::partition_range>
|
||||
get_ranges_for_invalidation(const std::vector<sstables::shared_sstable>& sstables) {
|
||||
// If owned ranges is disengaged, it means no cleanup work was done and
|
||||
// so nothing needs to be invalidated.
|
||||
if (!_owned_ranges) {
|
||||
return dht::partition_range_vector{};
|
||||
return {};
|
||||
}
|
||||
auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
|
||||
auto owned_ranges = dht::to_partition_ranges_chunked(*_owned_ranges).get();
|
||||
|
||||
auto non_owned_ranges = sstables
|
||||
| std::views::transform([] (const sstables::shared_sstable& sst) {
|
||||
seastar::thread::maybe_yield();
|
||||
return dht::partition_range::make({sst->get_first_decorated_key(), true},
|
||||
{sst->get_last_decorated_key(), true});
|
||||
}) | std::ranges::to<dht::partition_range_vector>();
|
||||
}) | std::ranges::to<utils::chunked_vector<dht::partition_range>>();
|
||||
|
||||
return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
|
||||
return dht::subtract_ranges(*_schema, std::move(non_owned_ranges), std::move(owned_ranges)).get();
|
||||
}
|
||||
protected:
|
||||
compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
|
||||
@@ -718,8 +719,8 @@ protected:
|
||||
|
||||
compaction_completion_desc
|
||||
get_compaction_completion_desc(std::vector<sstables::shared_sstable> input_sstables, std::vector<sstables::shared_sstable> output_sstables) {
|
||||
auto ranges_for_for_invalidation = get_ranges_for_invalidation(input_sstables);
|
||||
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
|
||||
auto ranges = get_ranges_for_invalidation(input_sstables);
|
||||
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges)};
|
||||
}
|
||||
|
||||
// Tombstone expiration is enabled based on the presence of sstable set.
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "sstables/sstable_set.hh"
|
||||
#include "compaction_fwd.hh"
|
||||
#include "mutation_writer/token_group_based_splitting_writer.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
|
||||
namespace compaction {
|
||||
|
||||
@@ -38,7 +39,7 @@ struct compaction_completion_desc {
|
||||
// New, fresh SSTables that should be added to SSTable set, replacing the old ones.
|
||||
std::vector<sstables::shared_sstable> new_sstables;
|
||||
// Set of compacted partition ranges that should be invalidated in the cache.
|
||||
dht::partition_range_vector ranges_for_cache_invalidation;
|
||||
utils::chunked_vector<dht::partition_range> ranges_for_cache_invalidation;
|
||||
};
|
||||
|
||||
// creates a new SSTable for a given shard
|
||||
|
||||
@@ -105,6 +105,7 @@ public:
|
||||
static const std::chrono::minutes entry_expiry;
|
||||
|
||||
using key_type = prepared_cache_key_type;
|
||||
using pinned_value_type = cache_value_ptr;
|
||||
using value_type = checked_weak_ptr;
|
||||
using statement_is_too_big = typename cache_type::entry_is_too_big;
|
||||
|
||||
@@ -116,9 +117,14 @@ public:
|
||||
: _cache(size, entry_expiry, logger)
|
||||
{}
|
||||
|
||||
template <typename LoadFunc>
|
||||
future<pinned_value_type> get_pinned(const key_type& key, LoadFunc&& load) {
|
||||
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); });
|
||||
}
|
||||
|
||||
template <typename LoadFunc>
|
||||
future<value_type> get(const key_type& key, LoadFunc&& load) {
|
||||
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
|
||||
return get_pinned(key, std::forward<LoadFunc>(load)).then([] (cache_value_ptr v_ptr) {
|
||||
return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
|
||||
});
|
||||
}
|
||||
|
||||
@@ -697,7 +697,7 @@ future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
|
||||
try {
|
||||
auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
|
||||
auto prep_ptr = co_await _prepared_cache.get(key, [this, &query_string, &client_state, d] {
|
||||
auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
|
||||
auto prepared = get_statement(query_string, client_state, d);
|
||||
prepared->calculate_metadata_id();
|
||||
auto bound_terms = prepared->statement->get_bound_terms();
|
||||
@@ -711,13 +711,13 @@ query_processor::prepare(sstring query_string, const service::client_state& clie
|
||||
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
|
||||
});
|
||||
|
||||
const auto& warnings = prep_ptr->warnings;
|
||||
const auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_ptr),
|
||||
co_await utils::get_local_injector().inject(
|
||||
"query_processor_prepare_wait_after_cache_get",
|
||||
utils::wait_for_message(std::chrono::seconds(60)));
|
||||
|
||||
auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
|
||||
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
|
||||
for (const auto& w : warnings) {
|
||||
msg->add_warning(w);
|
||||
}
|
||||
co_return ::shared_ptr<cql_transport::messages::result_message::prepared>(std::move(msg));
|
||||
co_return std::move(msg);
|
||||
} catch(typename prepared_statements_cache::statement_is_too_big&) {
|
||||
throw prepared_statement_is_too_big(query_string);
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include "utils/assert.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
#include "utils/labels.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
|
||||
namespace cache {
|
||||
|
||||
@@ -1215,10 +1216,10 @@ future<> row_cache::invalidate(external_updater eu, const dht::decorated_key& dk
|
||||
}
|
||||
|
||||
future<> row_cache::invalidate(external_updater eu, const dht::partition_range& range, cache_invalidation_filter filter) {
|
||||
return invalidate(std::move(eu), dht::partition_range_vector({range}), std::move(filter));
|
||||
return invalidate(std::move(eu), utils::chunked_vector<dht::partition_range>({range}), std::move(filter));
|
||||
}
|
||||
|
||||
future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&& ranges, cache_invalidation_filter filter) {
|
||||
future<> row_cache::invalidate(external_updater eu, utils::chunked_vector<dht::partition_range>&& ranges, cache_invalidation_filter filter) {
|
||||
return do_update(std::move(eu), [this, ranges = std::move(ranges), filter = std::move(filter)] mutable {
|
||||
return seastar::async([this, ranges = std::move(ranges), filter = std::move(filter)] {
|
||||
auto on_failure = defer([this] () noexcept {
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "utils/histogram.hh"
|
||||
#include "mutation/partition_version.hh"
|
||||
#include "utils/double-decker.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
#include "db/cache_tracker.hh"
|
||||
#include "readers/empty.hh"
|
||||
#include "readers/mutation_source.hh"
|
||||
@@ -457,7 +458,7 @@ public:
|
||||
// mutation source made prior to the call to invalidate().
|
||||
future<> invalidate(external_updater, const dht::decorated_key&);
|
||||
future<> invalidate(external_updater, const dht::partition_range& = query::full_partition_range, cache_invalidation_filter filter = [] (const auto&) { return true; });
|
||||
future<> invalidate(external_updater, dht::partition_range_vector&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
|
||||
future<> invalidate(external_updater, utils::chunked_vector<dht::partition_range>&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
|
||||
|
||||
// Evicts entries from cache.
|
||||
//
|
||||
|
||||
@@ -105,7 +105,7 @@ namespace {
|
||||
schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if (builder.ks_name() == schema_tables::NAME) {
|
||||
// all schema tables are group0 tables
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -87,31 +87,15 @@ namespace {
|
||||
static const std::unordered_set<sstring> tables = {
|
||||
schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
|
||||
system_keyspace::BROADCAST_KV_STORE,
|
||||
system_keyspace::CDC_GENERATIONS_V3,
|
||||
system_keyspace::RAFT,
|
||||
system_keyspace::RAFT_SNAPSHOTS,
|
||||
system_keyspace::RAFT_SNAPSHOT_CONFIG,
|
||||
system_keyspace::GROUP0_HISTORY,
|
||||
system_keyspace::DISCOVERY,
|
||||
system_keyspace::TABLETS,
|
||||
system_keyspace::TOPOLOGY,
|
||||
system_keyspace::TOPOLOGY_REQUESTS,
|
||||
system_keyspace::LOCAL,
|
||||
system_keyspace::PEERS,
|
||||
system_keyspace::SCYLLA_LOCAL,
|
||||
system_keyspace::COMMITLOG_CLEANUPS,
|
||||
system_keyspace::SERVICE_LEVELS_V2,
|
||||
system_keyspace::VIEW_BUILD_STATUS_V2,
|
||||
system_keyspace::CDC_STREAMS_STATE,
|
||||
system_keyspace::CDC_STREAMS_HISTORY,
|
||||
system_keyspace::ROLES,
|
||||
system_keyspace::ROLE_MEMBERS,
|
||||
system_keyspace::ROLE_ATTRIBUTES,
|
||||
system_keyspace::ROLE_PERMISSIONS,
|
||||
system_keyspace::CDC_LOCAL,
|
||||
system_keyspace::DICTS,
|
||||
system_keyspace::VIEW_BUILDING_TASKS,
|
||||
system_keyspace::CLIENT_ROUTES,
|
||||
};
|
||||
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
|
||||
builder.enable_schema_commitlog();
|
||||
@@ -143,7 +127,7 @@ namespace {
|
||||
system_keyspace::REPAIR_TASKS,
|
||||
};
|
||||
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -930,8 +930,7 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
|
||||
const row& existing_row = existing.cells();
|
||||
const row& updated_row = update.cells();
|
||||
|
||||
const bool base_has_nonexpiring_marker = update.marker().is_live() && !update.marker().is_expiring();
|
||||
return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row, base_has_nonexpiring_marker] (const column_definition& cdef) {
|
||||
return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row] (const column_definition& cdef) {
|
||||
const auto view_it = _view->columns_by_name().find(cdef.name());
|
||||
const bool column_is_selected = view_it != _view->columns_by_name().end();
|
||||
|
||||
@@ -939,49 +938,29 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
|
||||
// as part of its PK, there are NO virtual columns corresponding to the unselected columns in the view.
|
||||
// Because of that, we don't generate view updates when the value in an unselected column is created
|
||||
// or changes.
|
||||
if (!column_is_selected && _base_info.has_base_non_pk_columns_in_view_pk) {
|
||||
if (!column_is_selected) {
|
||||
return true;
|
||||
}
|
||||
|
||||
//TODO(sarna): Optimize collections case - currently they do not go under optimization
|
||||
if (!cdef.is_atomic()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// We cannot skip if the value was created or deleted, unless we have a non-expiring marker
|
||||
// We cannot skip if the value was created or deleted
|
||||
const auto* existing_cell = existing_row.find_cell(cdef.id);
|
||||
const auto* updated_cell = updated_row.find_cell(cdef.id);
|
||||
if (existing_cell == nullptr || updated_cell == nullptr) {
|
||||
return existing_cell == updated_cell || (!column_is_selected && base_has_nonexpiring_marker);
|
||||
return existing_cell == updated_cell;
|
||||
}
|
||||
|
||||
if (!cdef.is_atomic()) {
|
||||
return existing_cell->as_collection_mutation().data == updated_cell->as_collection_mutation().data;
|
||||
}
|
||||
|
||||
atomic_cell_view existing_cell_view = existing_cell->as_atomic_cell(cdef);
|
||||
atomic_cell_view updated_cell_view = updated_cell->as_atomic_cell(cdef);
|
||||
|
||||
// We cannot skip when a selected column is changed
|
||||
if (column_is_selected) {
|
||||
if (view_it->second->is_view_virtual()) {
|
||||
return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
|
||||
}
|
||||
return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
|
||||
if (view_it->second->is_view_virtual()) {
|
||||
return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
|
||||
}
|
||||
|
||||
// With non-expiring row marker, liveness checks below are not relevant
|
||||
if (base_has_nonexpiring_marker) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (existing_cell_view.is_live() != updated_cell_view.is_live()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// We cannot skip if the change updates TTL
|
||||
const bool existing_has_ttl = existing_cell_view.is_live_and_has_ttl();
|
||||
const bool updated_has_ttl = updated_cell_view.is_live_and_has_ttl();
|
||||
if (existing_has_ttl || updated_has_ttl) {
|
||||
return existing_has_ttl == updated_has_ttl && existing_cell_view.expiry() == updated_cell_view.expiry();
|
||||
}
|
||||
|
||||
return true;
|
||||
return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1749,7 +1728,7 @@ static endpoints_to_update get_view_natural_endpoint_vnodes(
|
||||
std::vector<std::reference_wrapper<const locator::node>> base_nodes,
|
||||
std::vector<std::reference_wrapper<const locator::node>> view_nodes,
|
||||
locator::endpoint_dc_rack my_location,
|
||||
const locator::network_topology_strategy* network_topology,
|
||||
const bool network_topology,
|
||||
replica::cf_stats& cf_stats) {
|
||||
using node_vector = std::vector<std::reference_wrapper<const locator::node>>;
|
||||
node_vector base_endpoints, view_endpoints;
|
||||
@@ -1902,7 +1881,7 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
locator::host_id me,
|
||||
const locator::effective_replication_map_ptr& base_erm,
|
||||
const locator::effective_replication_map_ptr& view_erm,
|
||||
const locator::abstract_replication_strategy& replication_strategy,
|
||||
const bool network_topology,
|
||||
const dht::token& base_token,
|
||||
const dht::token& view_token,
|
||||
bool use_tablets,
|
||||
@@ -1910,7 +1889,6 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
auto& topology = base_erm->get_token_metadata_ptr()->get_topology();
|
||||
auto& view_topology = view_erm->get_token_metadata_ptr()->get_topology();
|
||||
auto& my_location = topology.get_location(me);
|
||||
auto* network_topology = dynamic_cast<const locator::network_topology_strategy*>(&replication_strategy);
|
||||
|
||||
auto resolve = [&] (const locator::topology& topology, const locator::host_id& ep, bool is_view) -> const locator::node& {
|
||||
if (auto* np = topology.find_node(ep)) {
|
||||
@@ -1944,7 +1922,7 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
// view pairing as the leaving base replica.
|
||||
// note that the recursive call will not recurse again because leaving_base is in base_nodes.
|
||||
auto leaving_base = it->get().host_id();
|
||||
return get_view_natural_endpoint(leaving_base, base_erm, view_erm, replication_strategy, base_token,
|
||||
return get_view_natural_endpoint(leaving_base, base_erm, view_erm, network_topology, base_token,
|
||||
view_token, use_tablets, cf_stats);
|
||||
}
|
||||
}
|
||||
@@ -2040,7 +2018,9 @@ future<> view_update_generator::mutate_MV(
|
||||
wait_for_all_updates wait_for_all)
|
||||
{
|
||||
auto& ks = _db.find_keyspace(base->ks_name());
|
||||
auto& replication = ks.get_replication_strategy();
|
||||
const bool uses_tablets = ks.uses_tablets();
|
||||
const bool uses_nts = dynamic_cast<const locator::network_topology_strategy*>(&ks.get_replication_strategy()) != nullptr;
|
||||
// The object pointed by `ks` may disappear after preeemption. It should not be touched again after this comment.
|
||||
std::unordered_map<table_id, locator::effective_replication_map_ptr> erms;
|
||||
auto get_erm = [&] (table_id id) {
|
||||
auto it = erms.find(id);
|
||||
@@ -2059,8 +2039,8 @@ future<> view_update_generator::mutate_MV(
|
||||
co_await max_concurrent_for_each(view_updates, max_concurrent_updates, [&] (frozen_mutation_and_schema mut) mutable -> future<> {
|
||||
auto view_token = dht::get_token(*mut.s, mut.fm.key());
|
||||
auto view_ermp = erms.at(mut.s->id());
|
||||
auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, replication, base_token, view_token,
|
||||
ks.uses_tablets(), cf_stats);
|
||||
auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, uses_nts, base_token, view_token,
|
||||
uses_tablets, cf_stats);
|
||||
auto remote_endpoints = view_ermp->get_pending_replicas(view_token);
|
||||
auto memory_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_update_memory_units.split(memory_usage_of(mut)));
|
||||
if (no_pairing_endpoint) {
|
||||
|
||||
@@ -303,7 +303,7 @@ endpoints_to_update get_view_natural_endpoint(
|
||||
locator::host_id node,
|
||||
const locator::effective_replication_map_ptr& base_erm,
|
||||
const locator::effective_replication_map_ptr& view_erm,
|
||||
const locator::abstract_replication_strategy& replication_strategy,
|
||||
const bool network_topology,
|
||||
const dht::token& base_token,
|
||||
const dht::token& view_token,
|
||||
bool use_tablets,
|
||||
|
||||
@@ -352,6 +352,16 @@ dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& r
|
||||
return prs;
|
||||
}
|
||||
|
||||
future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges) {
|
||||
utils::chunked_vector<dht::partition_range> prs;
|
||||
prs.reserve(ranges.size());
|
||||
for (auto& range : ranges) {
|
||||
prs.push_back(dht::to_partition_range(range));
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
co_return prs;
|
||||
}
|
||||
|
||||
std::map<unsigned, dht::partition_range_vector>
|
||||
split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& raw_sharder) {
|
||||
std::map<unsigned, dht::partition_range_vector> ret;
|
||||
@@ -364,11 +374,11 @@ split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& r
|
||||
return ret;
|
||||
}
|
||||
|
||||
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& source_ranges, dht::partition_range_vector ranges_to_subtract) {
|
||||
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> source_ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract) {
|
||||
auto cmp = dht::ring_position_comparator(schema);
|
||||
// optimize set of potentially overlapping ranges by deoverlapping them.
|
||||
auto ranges = dht::partition_range::deoverlap(source_ranges, cmp);
|
||||
dht::partition_range_vector res;
|
||||
auto ranges = dht::partition_range::deoverlap(std::move(source_ranges), cmp);
|
||||
utils::chunked_vector<dht::partition_range> res;
|
||||
res.reserve(ranges.size() * 2);
|
||||
|
||||
auto range = ranges.begin();
|
||||
|
||||
@@ -91,6 +91,7 @@ inline token get_token(const schema& s, partition_key_view key) {
|
||||
|
||||
dht::partition_range to_partition_range(dht::token_range);
|
||||
dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& ranges, utils::can_yield can_yield = utils::can_yield::no);
|
||||
future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges);
|
||||
|
||||
// Each shard gets a sorted, disjoint vector of ranges
|
||||
std::map<unsigned, dht::partition_range_vector>
|
||||
@@ -105,7 +106,7 @@ std::unique_ptr<dht::i_partitioner> make_partitioner(sstring name);
|
||||
// Returns a sorted and deoverlapped list of ranges that are
|
||||
// the result of subtracting all ranges from ranges_to_subtract.
|
||||
// ranges_to_subtract must be sorted and deoverlapped.
|
||||
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& ranges, dht::partition_range_vector ranges_to_subtract);
|
||||
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract);
|
||||
|
||||
// Returns a token_range vector split based on the given number of most-significant bits
|
||||
dht::token_range_vector split_token_range_msb(unsigned most_significant_bits);
|
||||
|
||||
63
dht/token.hh
63
dht/token.hh
@@ -30,6 +30,31 @@ enum class token_kind {
|
||||
after_all_keys,
|
||||
};
|
||||
|
||||
// Represents a token for partition keys.
|
||||
// Has a disengaged state, which sorts before all engaged states.
|
||||
struct raw_token {
|
||||
int64_t value;
|
||||
|
||||
/// Constructs a disengaged token.
|
||||
raw_token() : value(std::numeric_limits<int64_t>::min()) {}
|
||||
|
||||
/// Constructs an engaged token.
|
||||
/// The token must be of token_kind::key kind.
|
||||
explicit raw_token(const token&);
|
||||
|
||||
explicit raw_token(int64_t v) : value(v) {};
|
||||
|
||||
std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
|
||||
std::strong_ordering operator<=>(const token& o) const noexcept;
|
||||
|
||||
/// Returns true iff engaged.
|
||||
explicit operator bool() const noexcept {
|
||||
return value != std::numeric_limits<int64_t>::min();
|
||||
}
|
||||
};
|
||||
|
||||
using raw_token_opt = seastar::optimized_optional<raw_token>;
|
||||
|
||||
class token {
|
||||
// INT64_MIN is not a legal token, but a special value used to represent
|
||||
// infinity in token intervals.
|
||||
@@ -52,6 +77,10 @@ public:
|
||||
|
||||
constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}
|
||||
|
||||
token(raw_token raw) noexcept
|
||||
: token(raw ? kind::key : kind::before_all_keys, raw.value)
|
||||
{ }
|
||||
|
||||
// This constructor seems redundant with the bytes_view constructor, but
|
||||
// it's necessary for IDL, which passes a deserialized_bytes_proxy here.
|
||||
// (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
|
||||
@@ -223,6 +252,29 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
raw_token::raw_token(const token& t)
|
||||
: value(t.raw())
|
||||
{
|
||||
#ifdef DEBUG
|
||||
assert(t._kind == token::kind::key);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline
|
||||
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
|
||||
switch (o._kind) {
|
||||
case token::kind::after_all_keys:
|
||||
return std::strong_ordering::less;
|
||||
case token::kind::before_all_keys:
|
||||
// before_all_keys has a raw value set to the same raw value as a disengaged raw_token, and sorts before all keys.
|
||||
// So we can order them by just comparing raw values.
|
||||
[[fallthrough]];
|
||||
case token::kind::key:
|
||||
return value <=> o._data;
|
||||
}
|
||||
}
|
||||
|
||||
inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
|
||||
if (l1 == l2) {
|
||||
return std::strong_ordering::equal;
|
||||
@@ -329,6 +381,17 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
|
||||
template <typename FormatContext>
|
||||
auto format(const dht::raw_token& t, FormatContext& ctx) const {
|
||||
if (!t) {
|
||||
return fmt::format_to(ctx.out(), "null");
|
||||
}
|
||||
return fmt::format_to(ctx.out(), "{}", t.value);
|
||||
}
|
||||
};
|
||||
|
||||
namespace std {
|
||||
|
||||
template<>
|
||||
|
||||
@@ -281,8 +281,8 @@ For example::
|
||||
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
|
||||
|
||||
|
||||
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
|
||||
or columns provided in a definition of the index.
|
||||
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
|
||||
See :ref:`WHERE <where-clause>`.
|
||||
|
||||
For example::
|
||||
|
||||
@@ -290,10 +290,6 @@ For example::
|
||||
WHERE user_id = 'user123'
|
||||
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
|
||||
|
||||
The supported operations are equal relations (``=`` and ``IN``) with restrictions as in regular ``WHERE`` clauses. See :ref:`WHERE <where-clause>`.
|
||||
|
||||
Other filtering scenarios are currently not supported.
|
||||
|
||||
.. note::
|
||||
|
||||
Vector indexes are supported in ScyllaDB Cloud only in clusters that have the Vector Search feature enabled.
|
||||
|
||||
@@ -52,7 +52,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list http://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
|
||||
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
@@ -125,7 +125,7 @@ Install ScyllaDB
|
||||
.. code-block:: console
|
||||
:substitutions:
|
||||
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L http://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
|
||||
|
||||
#. Install ScyllaDB packages.
|
||||
|
||||
@@ -133,19 +133,19 @@ Install ScyllaDB
|
||||
|
||||
sudo yum install scylla
|
||||
|
||||
Running the command installs the latest official version of ScyllaDB Open Source.
|
||||
Alternatively, you can to install a specific patch version:
|
||||
Running the command installs the latest official version of ScyllaDB.
|
||||
Alternatively, you can install a specific patch version:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo yum install scylla-<your patch version>
|
||||
|
||||
Example: The following example shows the command to install ScyllaDB 5.2.3.
|
||||
Example: The following example shows installing ScyllaDB 2025.3.1.
|
||||
|
||||
.. code-block:: console
|
||||
:class: hide-copy-button
|
||||
|
||||
sudo yum install scylla-5.2.3
|
||||
sudo yum install scylla-2025.3.1
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
|
||||
|
||||
@@ -36,11 +36,8 @@ release versions, run:
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
|
||||
|
||||
|
||||
Versions 2025.1 and Later
|
||||
==============================
|
||||
|
||||
Run the command with the ``--scylla-version`` option to specify the version
|
||||
you want to install.
|
||||
To install a non-default version, run the command with the ``--scylla-version``
|
||||
option to specify the version you want to install.
|
||||
|
||||
**Example**
|
||||
|
||||
@@ -50,20 +47,4 @@ you want to install.
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|
|
||||
|
||||
|
||||
Versions Earlier than 2025.1
|
||||
================================
|
||||
|
||||
To install a supported version of *ScyllaDB Enterprise*, run the command with:
|
||||
|
||||
* ``--scylla-product scylla-enterprise`` to specify that you want to install
|
||||
ScyllaDB Entrprise.
|
||||
* ``--scylla-version`` to specify the version you want to install.
|
||||
|
||||
For example:
|
||||
|
||||
.. code:: console
|
||||
|
||||
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1
|
||||
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
@@ -0,0 +1,492 @@
|
||||
=================================================
|
||||
Cluster Platform Migration Using Node Cycling
|
||||
=================================================
|
||||
|
||||
This procedure describes how to migrate a ScyllaDB cluster to new instance types
|
||||
using the add-and-replace approach, which is commonly used for:
|
||||
|
||||
* Migrating from one CPU architecture to another (e.g., x86_64 to ARM/Graviton)
|
||||
* Upgrading to newer instance types with better performance
|
||||
* Changing instance families within the same cloud provider
|
||||
|
||||
The add-and-replace approach maintains data replication throughout the migration
|
||||
and ensures zero downtime for client applications.
|
||||
|
||||
.. note::
|
||||
|
||||
This procedure does **not** change the ScyllaDB software version. All nodes
|
||||
(both existing and new) must run the same ScyllaDB version. For software
|
||||
version upgrades, see :doc:`Upgrade </upgrade/index>`.
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
The add-and-replace migration follows these steps:
|
||||
|
||||
#. Add new nodes (on target instance type) to the existing cluster
|
||||
#. Wait for data to stream to the new nodes
|
||||
#. Decommission old nodes (on source instance type)
|
||||
|
||||
This approach keeps the cluster operational throughout the migration while
|
||||
maintaining the configured replication factor.
|
||||
|
||||
Key characteristics
|
||||
===================
|
||||
|
||||
* **Zero downtime**: Client applications continue to operate during migration
|
||||
* **Data safety**: Replication factor is maintained throughout the process
|
||||
* **Flexible**: Works with both vnodes and tablets-enabled clusters
|
||||
* **Multi-DC support**: Can migrate nodes across multiple datacenters
|
||||
|
||||
.. warning::
|
||||
|
||||
Ensure your cluster has sufficient capacity during the migration. At the peak
|
||||
of the process, your cluster will temporarily have double the number of nodes.
|
||||
|
||||
Prerequisites
|
||||
-------------
|
||||
|
||||
Check cluster health
|
||||
====================
|
||||
|
||||
Before starting the migration, verify that your cluster is healthy:
|
||||
|
||||
#. Check that all nodes are in Up Normal (UN) status:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
All nodes should show ``UN`` status. Do not proceed if any nodes are down.
|
||||
|
||||
#. Ensure no streaming or repair operations are in progress:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool netstats
|
||||
nodetool compactionstats
|
||||
|
||||
Plan the migration
|
||||
==================
|
||||
|
||||
Before provisioning new instances, plan the following:
|
||||
|
||||
**Instance type mapping**: Identify the source and target instance types.
|
||||
If your cluster uses vnodes (not tablets), consider that mismatched shard
|
||||
counts between source and target instance types can cause slower repairs.
|
||||
With tablets enabled, shard count mismatch is fully supported.
|
||||
|
||||
**Rack assignment planning**: Each new node must be assigned to the same rack
|
||||
as the node it will replace. This maintains rack-aware topology for:
|
||||
|
||||
* Rack-aware replication (NetworkTopologyStrategy)
|
||||
* Proper data distribution across failure domains
|
||||
* Minimizing data movement during decommission
|
||||
|
||||
Example mapping for a 3-node cluster:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
Source nodes (to be decommissioned): Target nodes (to be added):
|
||||
192.168.1.10 - RACK0 → 192.168.2.10 - RACK0
|
||||
192.168.1.11 - RACK1 → 192.168.2.11 - RACK1
|
||||
192.168.1.12 - RACK2 → 192.168.2.12 - RACK2
|
||||
|
||||
Create a backup
|
||||
===============
|
||||
|
||||
Back up the data before starting the migration. One of the following
|
||||
methods can be used:
|
||||
|
||||
* **ScyllaDB Manager** (recommended): Use ScyllaDB Manager to perform a
|
||||
cluster-wide backup. See the
|
||||
`ScyllaDB Manager documentation <https://manager.docs.scylladb.com/stable/backup/>`_
|
||||
for details.
|
||||
|
||||
* **Snapshots**: On each node in the cluster, create a snapshot:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool snapshot -t pre_migration_backup
|
||||
nodetool listsnapshots
|
||||
|
||||
.. note::
|
||||
|
||||
Snapshots are local to each node and do not protect against node or disk
|
||||
failure. For full disaster recovery, use ScyllaDB Manager backup.
|
||||
|
||||
|
||||
Procedure
|
||||
---------
|
||||
|
||||
Adding new nodes
|
||||
================
|
||||
|
||||
#. Provision new instances with the target instance type. Ensure:
|
||||
|
||||
* The same ScyllaDB version as existing nodes
|
||||
* Same network configuration and security groups
|
||||
* Appropriate storage configuration
|
||||
|
||||
#. On each new node, configure ``/etc/scylla/scylla.yaml`` to join the existing
|
||||
cluster:
|
||||
|
||||
* **cluster_name**: Must match the existing cluster name
|
||||
* **seeds**: IP address of an existing node in the cluster (used to discover cluster topology on join)
|
||||
* **endpoint_snitch**: Must match the existing cluster configuration
|
||||
* **listen_address**: IP address of the new node
|
||||
* **rpc_address**: IP address of the new node
|
||||
|
||||
All other cluster-wide settings (tablets configuration, encryption settings,
|
||||
experimental features, etc.) must match the existing nodes.
|
||||
|
||||
.. caution::
|
||||
|
||||
Make sure that the ScyllaDB version on the new node is identical to the
|
||||
version on the other nodes in the cluster. Running nodes with different
|
||||
versions is not supported.
|
||||
|
||||
#. If using ``GossipingPropertyFileSnitch``, configure
|
||||
``/etc/scylla/cassandra-rackdc.properties`` with the correct datacenter
|
||||
and rack assignment for this node:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
dc = <datacenter-name>
|
||||
rack = <rack-name>
|
||||
prefer_local = true
|
||||
|
||||
.. warning::
|
||||
|
||||
Each node must have the correct rack assignment. Using the same rack for
|
||||
all new nodes breaks rack-aware replication topology.
|
||||
|
||||
#. Start ScyllaDB on the new node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo systemctl start scylla-server
|
||||
|
||||
For Docker deployments:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker exec -it <container-name> supervisorctl start scylla
|
||||
|
||||
#. Monitor the bootstrap process from an existing node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
The new node will appear with ``UJ`` (Up, Joining) status while streaming
|
||||
data from existing nodes. Wait until it transitions to ``UN`` (Up, Normal).
|
||||
|
||||
**Example output during bootstrap:**
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
Datacenter: dc1
|
||||
Status=Up/Down
|
||||
State=Normal/Leaving/Joining/Moving
|
||||
-- Address Load Tokens Owns Host ID Rack
|
||||
UN 192.168.1.10 500 MB 256 33.3% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
|
||||
UN 192.168.1.11 500 MB 256 33.3% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
|
||||
UN 192.168.1.12 500 MB 256 33.3% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
|
||||
UJ 192.168.2.10 250 MB 256 ? a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
|
||||
|
||||
**Example output after bootstrap completes:**
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
Datacenter: dc1
|
||||
Status=Up/Down
|
||||
State=Normal/Leaving/Joining/Moving
|
||||
-- Address Load Tokens Owns Host ID Rack
|
||||
UN 192.168.1.10 400 MB 256 25.0% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
|
||||
UN 192.168.1.11 400 MB 256 25.0% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
|
||||
UN 192.168.1.12 400 MB 256 25.0% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
|
||||
UN 192.168.2.10 400 MB 256 25.0% a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
|
||||
|
||||
#. For tablets-enabled clusters, wait for tablet load balancing to complete.
|
||||
After the node reaches ``UN`` status, verify no streaming is in progress:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool netstats
|
||||
|
||||
Wait until output shows "Not sending any streams" and no active receiving streams.
|
||||
|
||||
#. Repeat steps 1-6 for each new node to be added.
|
||||
|
||||
.. note::
|
||||
|
||||
You can add multiple nodes in parallel if they are in different datacenters.
|
||||
Within a single datacenter, add nodes one at a time for best results.
|
||||
|
||||
|
||||
Updating seed node configuration
|
||||
================================
|
||||
|
||||
If any of your original nodes are configured as seed nodes, you must update
|
||||
the seed configuration before decommissioning them.
|
||||
|
||||
#. Check the current seed configuration on any node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
grep -A 4 "seed_provider" /etc/scylla/scylla.yaml
|
||||
|
||||
#. If the seeds include nodes you plan to decommission, update ``scylla.yaml``
|
||||
on **all new nodes** to use the new node IPs as seeds:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
seed_provider:
|
||||
- class_name: org.apache.cassandra.locator.SimpleSeedProvider
|
||||
parameters:
|
||||
- seeds: "192.168.2.10,192.168.2.11,192.168.2.12"
|
||||
|
||||
.. note::
|
||||
|
||||
Updating seed configuration on the **old nodes** (that will be
|
||||
decommissioned) is optional. Seeds are only used during node startup
|
||||
to discover the cluster. If you don't plan to restart the old nodes
|
||||
before decommissioning them, their seed configuration doesn't matter.
|
||||
However, updating all nodes is recommended for safety in case an old
|
||||
node unexpectedly restarts during the migration.
|
||||
|
||||
#. Restart ScyllaDB on each new node (one at a time) to apply the new seed
|
||||
configuration:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo systemctl restart scylla-server
|
||||
|
||||
Wait for the node to fully start before restarting the next node.
|
||||
|
||||
#. After restarting the new nodes, verify the cluster is healthy:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
nodetool describecluster
|
||||
|
||||
.. warning::
|
||||
|
||||
Complete this seed list update on **all new nodes** before decommissioning
|
||||
any old nodes. This ensures the new nodes can reform the cluster after
|
||||
the old nodes are removed.
|
||||
|
||||
|
||||
Decommissioning old nodes
|
||||
=========================
|
||||
|
||||
After all new nodes are added and healthy, decommission the old nodes one
|
||||
at a time.
|
||||
|
||||
#. Verify all nodes are healthy before starting decommission:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
All nodes should show ``UN`` status.
|
||||
|
||||
#. On the node to be decommissioned, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool decommission
|
||||
|
||||
This command blocks until the decommission is complete. The node will
|
||||
stream its data to the remaining nodes.
|
||||
|
||||
#. Monitor the decommission progress from another node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
The decommissioning node will transition from ``UN`` → ``UL`` (Up, Leaving)
|
||||
→ removed from the cluster.
|
||||
|
||||
You can also monitor streaming progress:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool netstats
|
||||
|
||||
#. After decommission completes, verify the node is no longer in the cluster:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
The decommissioned node should no longer appear in the output.
|
||||
|
||||
#. Run ``nodetool cleanup`` on the remaining nodes to remove data that
|
||||
no longer belongs to them after the topology change:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool cleanup
|
||||
|
||||
.. note::
|
||||
|
||||
``nodetool cleanup`` can be resource-intensive. Run it on one node at a
|
||||
time during low-traffic periods.
|
||||
|
||||
#. Wait for the cluster to stabilize before decommissioning the next node.
|
||||
Ensure no streaming operations are in progress.
|
||||
|
||||
#. Repeat steps 1-7 for each old node to be decommissioned.
|
||||
|
||||
|
||||
Post-migration verification
|
||||
---------------------------
|
||||
|
||||
After all old nodes are decommissioned, verify the migration was successful.
|
||||
|
||||
Verify cluster topology
|
||||
=======================
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool status
|
||||
|
||||
Confirm:
|
||||
|
||||
* All nodes show ``UN`` (Up, Normal) status
|
||||
* Only the new instance type nodes are present
|
||||
* Nodes are balanced across racks
|
||||
|
||||
Verify schema agreement
|
||||
=======================
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool describecluster
|
||||
|
||||
All nodes should report the same schema version.
|
||||
|
||||
Verify data connectivity
|
||||
========================
|
||||
|
||||
Connect to the cluster and run a test query:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh <node-ip> -e "SELECT count(*) FROM system_schema.keyspaces;"
|
||||
|
||||
.. note::
|
||||
|
||||
If ScyllaDB is configured with ``listen_interface``, you must use the
|
||||
node's interface IP address (not localhost) for cqlsh connections.
|
||||
|
||||
Verify ScyllaDB version
|
||||
=======================
|
||||
|
||||
Confirm all nodes are running the same ScyllaDB version:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
scylla --version
|
||||
|
||||
Verify data integrity (optional)
|
||||
================================
|
||||
|
||||
Run data validation on each keyspace to verify sstable integrity:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool scrub --mode=VALIDATE <keyspace_name>
|
||||
|
||||
Rollback
|
||||
--------
|
||||
|
||||
If issues occur during the migration, you can roll back by reversing the
|
||||
procedure.
|
||||
|
||||
During add phase
|
||||
================
|
||||
|
||||
If a new node fails to bootstrap:
|
||||
|
||||
#. Stop ScyllaDB on the new node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo systemctl stop scylla-server
|
||||
|
||||
#. From an existing node, remove the failed node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool removenode <host-id-of-failed-node>
|
||||
|
||||
During decommission phase
|
||||
=========================
|
||||
|
||||
If a decommission operation gets stuck:
|
||||
|
||||
#. If the node is still reachable, try stopping and restarting ScyllaDB
|
||||
#. If the node is unresponsive, from another node:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
nodetool removenode <host-id>
|
||||
|
||||
See :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
|
||||
for more details.
|
||||
|
||||
Full rollback
|
||||
=============
|
||||
|
||||
To roll back after the migration is complete (all nodes on new instance type),
|
||||
apply the same add-and-replace procedure in reverse:
|
||||
|
||||
#. Add new nodes on the original instance type
|
||||
#. Wait for data streaming to complete
|
||||
#. Decommission the nodes on the new instance type
|
||||
|
||||
|
||||
Troubleshooting
|
||||
---------------
|
||||
|
||||
Node stuck in Joining (UJ) state
|
||||
================================
|
||||
|
||||
If a new node remains in ``UJ`` state for an extended period:
|
||||
|
||||
* Check ScyllaDB logs for streaming errors: ``journalctl -u scylla-server``
|
||||
* Verify network connectivity between nodes
|
||||
* Ensure sufficient disk space on all nodes
|
||||
* Check for any ongoing operations that may be blocking
|
||||
|
||||
Decommission taking too long
|
||||
============================
|
||||
|
||||
Decommission duration depends on data size. If it appears stuck:
|
||||
|
||||
* Check streaming progress: ``nodetool netstats``
|
||||
* Look for errors in ScyllaDB logs
|
||||
* Verify network bandwidth between nodes
|
||||
|
||||
Schema disagreement
|
||||
===================
|
||||
|
||||
If nodes report different schema versions:
|
||||
|
||||
* Wait a few minutes for schema to propagate
|
||||
* If disagreement persists, restart the nodes one by one
|
||||
* Run ``nodetool describecluster`` to verify agreement
|
||||
|
||||
|
||||
Additional resources
|
||||
--------------------
|
||||
|
||||
* :doc:`Adding a New Node Into an Existing ScyllaDB Cluster </operating-scylla/procedures/cluster-management/add-node-to-cluster>`
|
||||
* :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
|
||||
* :doc:`Replace a Running Node in a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/replace-running-node>`
|
||||
* :doc:`Upgrade </upgrade/index>`
|
||||
@@ -26,6 +26,7 @@ Cluster Management Procedures
|
||||
Safely Restart Your Cluster <safe-start>
|
||||
repair-based-node-operation
|
||||
Prevent Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>
|
||||
Cluster Platform Migration <cluster-platform-migration>
|
||||
|
||||
|
||||
.. panel-box::
|
||||
@@ -85,6 +86,8 @@ Cluster Management Procedures
|
||||
|
||||
* :doc:`Preventing Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>`
|
||||
|
||||
* :doc:`Cluster Platform Migration Using Node Cycling </operating-scylla/procedures/cluster-management/cluster-platform-migration>`
|
||||
|
||||
.. panel-box::
|
||||
:title: Topology Changes
|
||||
:id: "getting-started"
|
||||
|
||||
@@ -57,12 +57,11 @@ To enable shared dictionaries:
|
||||
internode_compression_enable_advanced: true
|
||||
rpc_dict_training_when: when_leader
|
||||
|
||||
.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
|
||||
.. note::
|
||||
|
||||
Trained dictionaries contain randomly chosen samples of data transferred between
|
||||
nodes. The data samples are persisted in the Raft log, which is not encrypted.
|
||||
As a result, some data from otherwise encrypted tables might be stored on disk
|
||||
unencrypted.
|
||||
Some dictionary training data may be encrypted using storage-level encryption
|
||||
(if enabled) instead of database-level encryption, meaning protection is
|
||||
applied at the storage layer rather than within the database itself.
|
||||
|
||||
|
||||
Reference
|
||||
|
||||
@@ -42,7 +42,14 @@ void everywhere_replication_strategy::validate_options(const gms::feature_servic
|
||||
|
||||
sstring everywhere_replication_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
|
||||
const auto replication_factor = erm.get_replication_factor();
|
||||
if (read_replicas.size() > replication_factor) {
|
||||
if (const auto& topo_info = erm.get_token_metadata().get_topology_change_info(); topo_info && topo_info->read_new) {
|
||||
if (read_replicas.size() > replication_factor + 1) {
|
||||
return seastar::format(
|
||||
"everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, "
|
||||
"cannot be higher than replication factor {} + 1 during the 'read from new replicas' stage of a topology change",
|
||||
read_replicas.size(), replication_factor);
|
||||
}
|
||||
} else if (read_replicas.size() > replication_factor) {
|
||||
return seastar::format("everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, cannot be higher than replication factor {}", read_replicas.size(), replication_factor);
|
||||
}
|
||||
return {};
|
||||
|
||||
@@ -261,7 +261,7 @@ static collection_mutation serialize_collection_mutation(
|
||||
|
||||
writev(v.serialize());
|
||||
}
|
||||
return collection_mutation(type, ret);
|
||||
return collection_mutation(type, std::move(ret));
|
||||
}
|
||||
|
||||
collection_mutation collection_mutation_description::serialize(const abstract_type& type) const {
|
||||
|
||||
@@ -103,7 +103,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
|
||||
.entity = stats.entity,
|
||||
.progress_units = "",
|
||||
.progress = tasks::task_manager::task::progress{},
|
||||
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
|
||||
.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr())
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:52c9772c9ac334650d8b179b591c47769ee38d34fad784b61c682e11c03f2506
|
||||
size 6530196
|
||||
oid sha256:088a9d7e165d33436eb3029ab092582cbae61f0e17486c226d8947ff44658c78
|
||||
size 6535832
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:d1a869ebfe4e90d9681499246eb86bb032ae402c350357e19d97b989037a5bd3
|
||||
size 6528308
|
||||
oid sha256:5f0c0709f9724cd3a545ebcc50ed587f28b2424d55e2334ac2db5d917903bcaf
|
||||
size 6536892
|
||||
|
||||
@@ -1021,8 +1021,8 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
|
||||
on_internal_error_noexcept(rcslog,
|
||||
format("reader_concurrency_semaphore::signal(): semaphore {} detected resource leak, available {} exceeds initial {}", _name,
|
||||
_resources, _initial_resources));
|
||||
_resources.count = std::max(_resources.count, _initial_resources.count);
|
||||
_resources.memory = std::max(_resources.memory, _initial_resources.memory);
|
||||
_resources.count = std::min(_resources.count, _initial_resources.count);
|
||||
_resources.memory = std::min(_resources.memory, _initial_resources.memory);
|
||||
}
|
||||
maybe_wake_execution_loop();
|
||||
}
|
||||
|
||||
@@ -432,7 +432,9 @@ public:
|
||||
// refresh_mutation_source must be called when there are changes to data source
|
||||
// structures but logical state of data is not changed (e.g. when state for a
|
||||
// new tablet replica is allocated).
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
virtual void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) = 0;
|
||||
|
||||
virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
|
||||
virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;
|
||||
@@ -442,7 +444,7 @@ public:
|
||||
virtual storage_group& storage_group_for_token(dht::token) const = 0;
|
||||
virtual utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const = 0;
|
||||
|
||||
virtual locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const = 0;
|
||||
virtual locator::combined_load_stats table_load_stats() const = 0;
|
||||
virtual bool all_storage_groups_split() = 0;
|
||||
virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
|
||||
virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
|
||||
|
||||
@@ -1697,7 +1697,7 @@ static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit(
|
||||
if (!range.is_singular()) {
|
||||
continue;
|
||||
}
|
||||
auto token = dht::token::to_int64(ranges.front().start()->value().token());
|
||||
auto token = dht::token::to_int64(range.start()->value().token());
|
||||
if (limiter.account_operation(read_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) {
|
||||
// Don't return immediately - account all ranges first
|
||||
ret = can_proceed::no;
|
||||
|
||||
@@ -1129,9 +1129,7 @@ public:
|
||||
return _stats;
|
||||
}
|
||||
|
||||
// The tablet filter is used to not double account migrating tablets, so it's important that
|
||||
// only one of pending or leaving replica is accounted based on current migration stage.
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const;
|
||||
locator::combined_load_stats table_load_stats() const;
|
||||
|
||||
const db::view::stats& get_view_stats() const {
|
||||
return _view_stats;
|
||||
|
||||
249
replica/table.cc
249
replica/table.cc
@@ -711,7 +711,9 @@ public:
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override {}
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override {
|
||||
return get_compaction_group();
|
||||
@@ -734,7 +736,7 @@ public:
|
||||
return *_single_sg;
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)>) const override {
|
||||
locator::combined_load_stats table_load_stats() const override {
|
||||
return locator::combined_load_stats{
|
||||
.table_ls = locator::table_load_stats{
|
||||
.size_in_bytes = _single_sg->live_disk_space_used(),
|
||||
@@ -757,6 +759,11 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct background_merge_guard {
|
||||
compaction::compaction_reenabler compaction_guard;
|
||||
locator::effective_replication_map_ptr erm_guard;
|
||||
};
|
||||
|
||||
class tablet_storage_group_manager final : public storage_group_manager {
|
||||
replica::table& _t;
|
||||
locator::host_id _my_host_id;
|
||||
@@ -777,7 +784,7 @@ class tablet_storage_group_manager final : public storage_group_manager {
|
||||
utils::phased_barrier _merge_fiber_barrier;
|
||||
std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
|
||||
// Holds compaction reenabler which disables compaction temporarily during tablet merge
|
||||
std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
|
||||
std::vector<background_merge_guard> _compaction_reenablers_for_merging;
|
||||
private:
|
||||
const schema_ptr& schema() const {
|
||||
return _t.schema();
|
||||
@@ -801,7 +808,8 @@ private:
|
||||
// Called when coordinator executes tablet merge. Tablet ids X and X+1 are merged into
|
||||
// the new tablet id (X >> 1). In practice, that means storage groups for X and X+1
|
||||
// are merged into a new storage group with id (X >> 1).
|
||||
void handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
void handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
|
||||
|
||||
// When merge completes, compaction groups of sibling tablets are added to same storage
|
||||
// group, but they're not merged yet into one, since the merge completion handler happens
|
||||
@@ -895,7 +903,9 @@ public:
|
||||
std::exchange(_stop_fut, make_ready_future())).discard_result();
|
||||
}
|
||||
|
||||
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;
|
||||
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source) override;
|
||||
|
||||
compaction_group& compaction_group_for_token(dht::token token) const override;
|
||||
utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const override;
|
||||
@@ -909,7 +919,7 @@ public:
|
||||
return storage_group_for_id(storage_group_of(token).first);
|
||||
}
|
||||
|
||||
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const override;
|
||||
locator::combined_load_stats table_load_stats() const override;
|
||||
bool all_storage_groups_split() override;
|
||||
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
|
||||
future<> maybe_split_compaction_group_of(size_t idx) override;
|
||||
@@ -2933,17 +2943,108 @@ void table::on_flush_timer() {
|
||||
});
|
||||
}
|
||||
|
||||
locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
|
||||
// The following functions return true if we should return the tablet size of a tablet in
|
||||
// migration depending on its transition stage and whether it is a leaving or pending replica
|
||||
bool has_size_on_leaving (locator::tablet_transition_stage stage) {
|
||||
switch (stage) {
|
||||
case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::streaming: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::use_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::rebuild_repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_repair:
|
||||
return true;
|
||||
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_migration:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool has_size_on_pending (locator::tablet_transition_stage stage) {
|
||||
switch (stage) {
|
||||
case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::streaming: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::rebuild_repair:
|
||||
return false;
|
||||
case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::use_new: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_migration: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::repair: [[fallthrough]];
|
||||
case locator::tablet_transition_stage::end_repair:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
locator::combined_load_stats tablet_storage_group_manager::table_load_stats() const {
|
||||
locator::table_load_stats table_stats;
|
||||
table_stats.split_ready_seq_number = _split_ready_seq_number;
|
||||
|
||||
locator::tablet_load_stats tablet_stats;
|
||||
|
||||
for_each_storage_group([&] (size_t id, storage_group& sg) {
|
||||
locator::global_tablet_id gid { _t.schema()->id(), locator::tablet_id(id) };
|
||||
if (tablet_filter(*_tablet_map, gid)) {
|
||||
const uint64_t tablet_size = sg.live_disk_space_used();
|
||||
auto tid = locator::tablet_id(id);
|
||||
locator::global_tablet_id gid { _t.schema()->id(), tid };
|
||||
locator::tablet_replica me { _my_host_id, this_shard_id() };
|
||||
const uint64_t tablet_size = sg.live_disk_space_used();
|
||||
|
||||
auto transition = _tablet_map->get_tablet_transition_info(tid);
|
||||
auto& info = _tablet_map->get_tablet_info(tid);
|
||||
bool is_pending = transition && transition->pending_replica == me;
|
||||
bool is_leaving = transition && locator::get_leaving_replica(info, *transition) == me;
|
||||
|
||||
// It's important to tackle the anomaly in reported size, since both leaving and
|
||||
// pending replicas could otherwise be accounted during tablet migration.
|
||||
// If transition hasn't reached write_both_read_new stage, then leaving replicas are accounted.
|
||||
// Otherwise, pending replicas are accounted.
|
||||
// This helps to reduce the discrepancy window.
|
||||
auto table_size_filter = [&] () {
|
||||
// if tablet is not in transit, it's filtered in.
|
||||
if (!transition) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto s = transition->reads; // read selector
|
||||
|
||||
return (!is_pending && !is_leaving)
|
||||
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|
||||
|| (is_pending && s == locator::read_replica_set_selector::next);
|
||||
};
|
||||
|
||||
// When a tablet is in migration, we want to send its size during any migration stage when
|
||||
// we still know the tablet's size. This way the balancer will have better information about
|
||||
// tablet sizes, and we reduce the chance that the node will be ignored during balancing
|
||||
// due to missing tablet size. On the leaving replica we include tablets until the use_new
|
||||
// stage (inclusive), and on the pending we include tablets after the streaming stage.
|
||||
// There is an overlap in tablet sizes (we report sizes on both the leaving and pending
|
||||
// replicas for some stages), but that should not be a problem.
|
||||
auto tablet_size_filter = [&] () {
|
||||
// if tablet is not in transit, it's filtered in.
|
||||
if (!transition) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (is_leaving) {
|
||||
return has_size_on_leaving(transition->stage);
|
||||
} else if (is_pending) {
|
||||
return has_size_on_pending(transition->stage);
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
if (table_size_filter()) {
|
||||
table_stats.size_in_bytes += tablet_size;
|
||||
}
|
||||
|
||||
if (tablet_size_filter()) {
|
||||
const dht::token_range trange = _tablet_map->get_token_range(gid.tablet);
|
||||
// Make sure the token range is in the form (a, b]
|
||||
SCYLLA_ASSERT(!trange.start()->is_inclusive() && trange.end()->is_inclusive());
|
||||
@@ -2956,8 +3057,8 @@ locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std:
|
||||
};
|
||||
}
|
||||
|
||||
locator::combined_load_stats table::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
|
||||
return _sg_manager->table_load_stats(std::move(tablet_filter));
|
||||
locator::combined_load_stats table::table_load_stats() const {
|
||||
return _sg_manager->table_load_stats();
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::handle_tablet_split_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
|
||||
@@ -3069,7 +3170,9 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
|
||||
}
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
|
||||
void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
|
||||
const locator::tablet_map& old_tmap,
|
||||
const locator::tablet_map& new_tmap) {
|
||||
auto table_id = schema()->id();
|
||||
size_t old_tablet_count = old_tmap.tablet_count();
|
||||
size_t new_tablet_count = new_tmap.tablet_count();
|
||||
@@ -3093,7 +3196,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
auto new_cg = make_lw_shared<compaction_group>(_t, new_tid, new_range, make_repair_sstable_classifier_func());
|
||||
for (auto& view : new_cg->all_views()) {
|
||||
auto cre = _t.get_compaction_manager().stop_and_disable_compaction_no_wait(*view, "tablet merging");
|
||||
_compaction_reenablers_for_merging.push_back(std::move(cre));
|
||||
_compaction_reenablers_for_merging.push_back(background_merge_guard{std::move(cre), old_erm});
|
||||
}
|
||||
auto new_sg = make_lw_shared<storage_group>(std::move(new_cg));
|
||||
|
||||
@@ -3126,7 +3229,11 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
|
||||
_merge_completion_event.signal();
|
||||
}
|
||||
|
||||
void tablet_storage_group_manager::update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) {
|
||||
void tablet_storage_group_manager::update_effective_replication_map(
|
||||
const locator::effective_replication_map_ptr& old_erm,
|
||||
const locator::effective_replication_map& erm,
|
||||
noncopyable_function<void()> refresh_mutation_source)
|
||||
{
|
||||
auto* new_tablet_map = &erm.get_token_metadata().tablets().get_tablet_map(schema()->id());
|
||||
auto* old_tablet_map = std::exchange(_tablet_map, new_tablet_map);
|
||||
|
||||
@@ -3142,7 +3249,7 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
|
||||
if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
|
||||
utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
|
||||
}
|
||||
handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
|
||||
handle_tablet_merge_completion(old_erm, *old_tablet_map, *new_tablet_map);
|
||||
}
|
||||
|
||||
// Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
|
||||
@@ -3228,7 +3335,7 @@ void table::update_effective_replication_map(locator::effective_replication_map_
|
||||
};
|
||||
|
||||
if (uses_tablets()) {
|
||||
_sg_manager->update_effective_replication_map(*_erm, refresh_mutation_source);
|
||||
_sg_manager->update_effective_replication_map(old_erm, *_erm, refresh_mutation_source);
|
||||
}
|
||||
if (old_erm) {
|
||||
old_erm->invalidate();
|
||||
@@ -3690,7 +3797,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
|
||||
tlogger.debug("Taking snapshot of {}.{}: name={}", s->ks_name(), s->cf_name(), name);
|
||||
|
||||
std::vector<snapshot_sstable_set> sstable_sets(smp::count);
|
||||
std::vector<int64_t> tablet_counts(smp::count);
|
||||
|
||||
co_await writer->init();
|
||||
co_await smp::invoke_on_all([&] -> future<> {
|
||||
@@ -3698,7 +3804,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
|
||||
auto [tables, permit] = co_await t.snapshot_sstables();
|
||||
auto sstables_metadata = co_await t.get_sstables_manager().take_snapshot(std::move(tables), name);
|
||||
sstable_sets[this_shard_id()] = make_foreign(std::make_unique<utils::chunked_vector<sstables::sstable_snapshot_metadata>>(std::move(sstables_metadata)));
|
||||
tablet_counts[this_shard_id()] = t.calculate_tablet_count();
|
||||
});
|
||||
co_await writer->sync();
|
||||
|
||||
@@ -3712,12 +3817,13 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
|
||||
});
|
||||
tlogger.debug("snapshot {}: seal_snapshot", name);
|
||||
const auto& topology = sharded_db.local().get_token_metadata().get_topology();
|
||||
std::optional<int64_t> min_tablet_count;
|
||||
std::optional<int64_t> tablet_count;
|
||||
if (t.uses_tablets()) {
|
||||
SCYLLA_ASSERT(!tablet_counts.empty());
|
||||
min_tablet_count = *std::ranges::min_element(tablet_counts);
|
||||
auto erm = t.get_effective_replication_map();
|
||||
auto& tm = erm->get_token_metadata().tablets().get_tablet_map(s->id());
|
||||
tablet_count = tm.tablet_count();
|
||||
}
|
||||
co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, min_tablet_count).handle_exception([&] (std::exception_ptr ptr) {
|
||||
co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, tablet_count).handle_exception([&] (std::exception_ptr ptr) {
|
||||
tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr);
|
||||
ex = std::move(ptr);
|
||||
});
|
||||
@@ -3775,6 +3881,7 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
|
||||
auto close_lister = deferred_close(lister);
|
||||
while (auto de = lister.get().get()) {
|
||||
auto snapshot_name = de->name;
|
||||
all_snapshots.emplace(snapshot_name, snapshot_details());
|
||||
@@ -3782,6 +3889,9 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
|
||||
auto& sd = all_snapshots.at(snapshot_name);
|
||||
sd.total += details.total;
|
||||
sd.live += details.live;
|
||||
utils::get_local_injector().inject("get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in get_snapshot_details");
|
||||
}).get();
|
||||
}
|
||||
}
|
||||
return all_snapshots;
|
||||
@@ -3801,53 +3911,66 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
|
||||
}
|
||||
|
||||
auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
while (auto de = co_await lister.get()) {
|
||||
const auto& name = de->name;
|
||||
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
|
||||
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
|
||||
auto size = sd.allocated_size;
|
||||
|
||||
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
utils::get_local_injector().inject("per-snapshot-get_snapshot_details", [&] (auto& handler) -> future<> {
|
||||
throw std::runtime_error("Injected exception in per-snapshot-get_snapshot_details");
|
||||
}).get();
|
||||
|
||||
// The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
|
||||
//
|
||||
// All the others should just generate an exception: there is something wrong, so don't blindly
|
||||
// add it to the size.
|
||||
if (name != "manifest.json" && name != "schema.cql") {
|
||||
details.total += size;
|
||||
if (sd.number_of_links == 1) {
|
||||
// File exists only in the snapshot directory.
|
||||
details.live += size;
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
|
||||
// So check the datadir for the file too.
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
|
||||
try {
|
||||
// File exists in the main SSTable directory. Snapshots are not contributing to size
|
||||
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
|
||||
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
|
||||
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
|
||||
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
|
||||
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
|
||||
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
|
||||
co_return false;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
co_return true;
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
co_return false;
|
||||
}
|
||||
};
|
||||
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
|
||||
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
|
||||
!co_await exists_in_dir(data_directory, datadir, name)) {
|
||||
details.live += size;
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await lister.close();
|
||||
if (ex) {
|
||||
co_await coroutine::return_exception_ptr(std::move(ex));
|
||||
}
|
||||
|
||||
co_return details;
|
||||
|
||||
@@ -263,8 +263,9 @@ public:
|
||||
void enable_schema_commitlog() {
|
||||
_static_props.enable_schema_commitlog();
|
||||
}
|
||||
void set_is_group0_table(bool enabled = true) {
|
||||
_static_props.is_group0_table = enabled;
|
||||
void set_is_group0_table() {
|
||||
_static_props.is_group0_table = true;
|
||||
enable_schema_commitlog();
|
||||
}
|
||||
|
||||
class default_names {
|
||||
|
||||
@@ -454,7 +454,7 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
|
||||
auto ps_ptr = qp.get_prepared(cache_key);
|
||||
if (!ps_ptr) {
|
||||
const auto msg_ptr = co_await qp.prepare(req, qs, cql3::internal_dialect());
|
||||
ps_ptr = std::move(msg_ptr->get_prepared());
|
||||
ps_ptr = msg_ptr->get_prepared();
|
||||
if (!ps_ptr) {
|
||||
on_internal_error(paxos_state::logger, "prepared statement is null");
|
||||
}
|
||||
|
||||
@@ -350,6 +350,10 @@ static void ensure_group0_schema(const group0_command& cmd, const replica::datab
|
||||
if (!schema->static_props().is_group0_table) {
|
||||
on_internal_error(slogger, fmt::format("ensure_group0_schema: schema is not group0: {}", schema->cf_name()));
|
||||
}
|
||||
|
||||
if (!schema->static_props().use_schema_commitlog) {
|
||||
on_internal_error(slogger, fmt::format("ensure_group0_schema: group0 table {} does not use schema commitlog", schema->cf_name()));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -559,6 +559,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
|
||||
group0_id = g0_info.group0_id;
|
||||
raft::server_address my_addr{my_id, {}};
|
||||
|
||||
bool starting_server_as_follower = false;
|
||||
if (server == nullptr) {
|
||||
// This is the first time discovery is run. Create and start a Raft server for group 0 on this node.
|
||||
raft::configuration initial_configuration;
|
||||
@@ -586,6 +587,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
|
||||
// trigger an empty snapshot transfer.
|
||||
nontrivial_snapshot = true;
|
||||
} else {
|
||||
starting_server_as_follower = true;
|
||||
co_await handshaker->pre_server_start(g0_info);
|
||||
}
|
||||
|
||||
@@ -614,7 +616,9 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
|
||||
}
|
||||
|
||||
SCYLLA_ASSERT(server);
|
||||
if (server->get_configuration().contains(my_id)) {
|
||||
co_await utils::get_local_injector().inject("join_group0_pause_before_config_check",
|
||||
utils::wait_for_message(std::chrono::minutes{5}));
|
||||
if (!starting_server_as_follower && server->get_configuration().contains(my_id)) {
|
||||
// True if we started a new group or completed a configuration change initiated earlier.
|
||||
group0_log.info("server {} already in group 0 (id {}) as {}", my_id, group0_id,
|
||||
server->get_configuration().can_vote(my_id)? "voter" : "non-voter");
|
||||
|
||||
@@ -6156,6 +6156,57 @@ future<> storage_service::snitch_reconfigured() {
|
||||
}
|
||||
}
|
||||
|
||||
future<> storage_service::local_topology_barrier() {
|
||||
if (this_shard_id() != 0) {
|
||||
co_await container().invoke_on(0, [] (storage_service& ss) {
|
||||
return ss.local_topology_barrier();
|
||||
});
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto version = _topology_state_machine._topology.version;
|
||||
|
||||
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
|
||||
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
|
||||
version, current_version);
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// parallel to this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
}
|
||||
|
||||
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, const raft_topology_cmd& cmd) {
|
||||
raft_topology_cmd_result result;
|
||||
rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
|
||||
@@ -6183,12 +6234,6 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
state.last_index = cmd_index;
|
||||
}
|
||||
|
||||
// We capture the topology version right after the checks
|
||||
// above, before any yields. This is crucial since _topology_state_machine._topology
|
||||
// might be altered concurrently while this method is running,
|
||||
// which can cause the fence command to apply an invalid fence version.
|
||||
const auto version = _topology_state_machine._topology.version;
|
||||
|
||||
switch (cmd.cmd) {
|
||||
case raft_topology_cmd::command::barrier: {
|
||||
utils::get_local_injector().inject("raft_topology_barrier_fail",
|
||||
@@ -6227,43 +6272,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
}
|
||||
break;
|
||||
case raft_topology_cmd::command::barrier_and_drain: {
|
||||
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
|
||||
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
|
||||
});
|
||||
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
|
||||
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
|
||||
for (auto& n : _topology_state_machine._topology.transition_nodes) {
|
||||
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
|
||||
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
|
||||
version, current_version);
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
// moved to another node and managed to update the topology
|
||||
// parallel to this method. The previous coordinator
|
||||
// should be inactive now, so it won't observe this
|
||||
// exception. By returning exception we aim
|
||||
// to reveal any other conditions where this may arise.
|
||||
if (current_version != version) {
|
||||
co_await coroutine::return_exception(std::runtime_error(
|
||||
::format("raft topology: command::barrier_and_drain, the version has changed, "
|
||||
"version {}, current_version {}, the topology change coordinator "
|
||||
" had probably migrated to another node",
|
||||
version, current_version)));
|
||||
}
|
||||
|
||||
co_await ss._shared_token_metadata.stale_versions_in_use();
|
||||
co_await get_topology_session_manager().drain_closing_sessions();
|
||||
|
||||
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
|
||||
});
|
||||
co_await local_topology_barrier();
|
||||
|
||||
co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
|
||||
auto ks = handler.get("keyspace");
|
||||
@@ -7359,34 +7368,8 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
|
||||
if (!table) {
|
||||
continue;
|
||||
}
|
||||
auto erm = table->get_effective_replication_map();
|
||||
auto& token_metadata = erm->get_token_metadata();
|
||||
auto me = locator::tablet_replica { token_metadata.get_my_id(), this_shard_id() };
|
||||
|
||||
// It's important to tackle the anomaly in reported size, since both leaving and
|
||||
// pending replicas could otherwise be accounted during tablet migration.
|
||||
// If transition hasn't reached cleanup stage, then leaving replicas are accounted.
|
||||
// If transition is past cleanup stage, then pending replicas are accounted.
|
||||
// This helps to reduce the discrepancy window.
|
||||
auto tablet_filter = [&me] (const locator::tablet_map& tmap, locator::global_tablet_id id) {
|
||||
auto transition = tmap.get_tablet_transition_info(id.tablet);
|
||||
auto& info = tmap.get_tablet_info(id.tablet);
|
||||
|
||||
// if tablet is not in transit, it's filtered in.
|
||||
if (!transition) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool is_pending = transition->pending_replica == me;
|
||||
bool is_leaving = locator::get_leaving_replica(info, *transition) == me;
|
||||
auto s = transition->reads; // read selector
|
||||
|
||||
return (!is_pending && !is_leaving)
|
||||
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|
||||
|| (is_pending && s == locator::read_replica_set_selector::next);
|
||||
};
|
||||
|
||||
locator::combined_load_stats combined_ls { table->table_load_stats(tablet_filter) };
|
||||
locator::combined_load_stats combined_ls { table->table_load_stats() };
|
||||
load_stats.tables.emplace(id, std::move(combined_ls.table_ls));
|
||||
tablet_sizes_per_shard[this_shard_id()].size += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);
|
||||
|
||||
|
||||
@@ -944,6 +944,9 @@ public:
|
||||
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
|
||||
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
|
||||
future<> initialize_done_topology_upgrade_state();
|
||||
// Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
|
||||
// In particular, waits for non-latest local erms to go die.
|
||||
future<> local_topology_barrier();
|
||||
private:
|
||||
// State machine that is responsible for topology change
|
||||
topology_state_machine& _topology_state_machine;
|
||||
|
||||
@@ -21,7 +21,6 @@ namespace service {
|
||||
|
||||
struct status_helper {
|
||||
tasks::task_status status;
|
||||
utils::chunked_vector<locator::tablet_id> tablets;
|
||||
std::optional<locator::tablet_replica> pending_replica;
|
||||
};
|
||||
|
||||
@@ -148,18 +147,40 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
|
||||
}
|
||||
|
||||
tasks::tmlogger.info("tablet_virtual_task: wait until tablet operation is finished");
|
||||
co_await _ss._topology_state_machine.event.wait([&] {
|
||||
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
|
||||
if (is_resize_task(task_type)) { // Resize task.
|
||||
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
|
||||
} else if (tablet_id_opt.has_value()) { // Migration task.
|
||||
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
|
||||
} else { // Repair task.
|
||||
return std::all_of(res->tablets.begin(), res->tablets.end(), [&] (const locator::tablet_id& tablet) {
|
||||
return tmap.get_tablet_info(tablet).repair_task_info.tablet_task_id.uuid() != id.uuid();
|
||||
});
|
||||
co_await utils::get_local_injector().inject("tablet_virtual_task_wait", utils::wait_for_message(60s));
|
||||
while (true) {
|
||||
co_await _ss._topology_state_machine.event.wait([&] {
|
||||
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
|
||||
return true;
|
||||
}
|
||||
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
|
||||
if (is_resize_task(task_type)) { // Resize task.
|
||||
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
|
||||
} else if (tablet_id_opt.has_value()) { // Migration task.
|
||||
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
|
||||
} else { // Repair task.
|
||||
return true;
|
||||
}
|
||||
});
|
||||
|
||||
if (!is_repair_task(task_type)) {
|
||||
break;
|
||||
}
|
||||
});
|
||||
|
||||
auto tmptr = _ss.get_token_metadata_ptr();
|
||||
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
|
||||
break;
|
||||
}
|
||||
auto& tmap = tmptr->tablets().get_tablet_map(table);
|
||||
bool repair_still_running = false;
|
||||
co_await tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& info) {
|
||||
repair_still_running = repair_still_running || (info.repair_task_info.is_valid() && info.repair_task_info.tablet_task_id.uuid() == id.uuid());
|
||||
return make_ready_future();
|
||||
});
|
||||
if (!repair_still_running) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
res->status.state = tasks::task_manager::task_state::done; // Failed repair task is retried.
|
||||
if (is_migration_task(task_type)) {
|
||||
@@ -169,9 +190,9 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
|
||||
} else if (is_resize_task(task_type)) {
|
||||
auto new_tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
|
||||
res->status.state = new_tablet_count == tablet_count ? tasks::task_manager::task_state::suspended : tasks::task_manager::task_state::done;
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
} else {
|
||||
res->status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
res->status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
}
|
||||
res->status.end_time = db_clock::now(); // FIXME: Get precise end time.
|
||||
co_return res->status;
|
||||
@@ -257,6 +278,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
auto& tmap = tmptr->tablets().get_tablet_map(table);
|
||||
bool repair_task_finished = false;
|
||||
bool repair_task_pending = false;
|
||||
bool no_tablets_processed = true;
|
||||
if (is_repair_task(task_type)) {
|
||||
auto progress = co_await _ss._repair.local().get_tablet_repair_task_progress(id);
|
||||
if (progress) {
|
||||
@@ -273,37 +295,37 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
|
||||
auto& task_info = info.repair_task_info;
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.tablets.push_back(tid);
|
||||
no_tablets_processed = false;
|
||||
}
|
||||
return make_ready_future();
|
||||
});
|
||||
res.status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
|
||||
res.status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
|
||||
} else if (is_migration_task(task_type)) { // Migration task.
|
||||
auto tablet_id = hint.get_tablet_id();
|
||||
res.pending_replica = tmap.get_tablet_transition_info(tablet_id)->pending_replica;
|
||||
auto& task_info = tmap.get_tablet_info(tablet_id).migration_task_info;
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.tablets.push_back(tablet_id);
|
||||
no_tablets_processed = false;
|
||||
}
|
||||
} else { // Resize task.
|
||||
auto& task_info = tmap.resize_task_info();
|
||||
if (task_info.tablet_task_id.uuid() == id.uuid()) {
|
||||
update_status(task_info, res.status, sched_nr);
|
||||
res.status.state = tasks::task_manager::task_state::running;
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
|
||||
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
|
||||
co_return res;
|
||||
}
|
||||
}
|
||||
|
||||
if (!res.tablets.empty()) {
|
||||
if (!no_tablets_processed) {
|
||||
res.status.state = sched_nr == 0 ? tasks::task_manager::task_state::created : tasks::task_manager::task_state::running;
|
||||
co_return res;
|
||||
}
|
||||
|
||||
if (repair_task_pending) {
|
||||
// When repair_task_pending is true, the res.tablets will be empty iff the request is aborted by user.
|
||||
res.status.state = res.tablets.empty() ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
|
||||
res.status.state = no_tablets_processed ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
|
||||
co_return res;
|
||||
}
|
||||
if (repair_task_finished) {
|
||||
|
||||
@@ -2193,6 +2193,19 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
_tablet_allocator.set_load_stats(reconciled_stats);
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for the background storage group merge to finish before releasing the state machine.
|
||||
// Background merge holds the old erm, so a successful barrier joins with it.
|
||||
// This guarantees that the background merge doesn't run concurrently with the next merge.
|
||||
// Replica-side storage group merge takes compaction locks on the tablet's main compaction group, released
|
||||
// by the background merge. If the next merge starts before the background merge finishes, it can cause a deadlock.
|
||||
// The background merge fiber will try to stop a compaction group which is locked, and the lock is held
|
||||
// by the background merge fiber.
|
||||
tm = nullptr;
|
||||
if (!guard) {
|
||||
guard = co_await start_operation();
|
||||
}
|
||||
co_await global_tablet_token_metadata_barrier(std::move(guard));
|
||||
}
|
||||
|
||||
future<> handle_truncate_table(group0_guard guard) {
|
||||
|
||||
@@ -201,95 +201,47 @@ public:
|
||||
virtual future<std::optional<entry_info>> next_entry() = 0;
|
||||
};
|
||||
|
||||
// Allocated inside LSA.
|
||||
class promoted_index {
|
||||
deletion_time _del_time;
|
||||
uint64_t _promoted_index_start;
|
||||
uint32_t _promoted_index_size;
|
||||
uint32_t _num_blocks;
|
||||
public:
|
||||
promoted_index(const schema& s,
|
||||
deletion_time del_time,
|
||||
uint64_t promoted_index_start,
|
||||
uint32_t promoted_index_size,
|
||||
uint32_t num_blocks)
|
||||
: _del_time{del_time}
|
||||
, _promoted_index_start(promoted_index_start)
|
||||
, _promoted_index_size(promoted_index_size)
|
||||
, _num_blocks(num_blocks)
|
||||
{ }
|
||||
|
||||
[[nodiscard]] deletion_time get_deletion_time() const { return _del_time; }
|
||||
[[nodiscard]] uint32_t get_promoted_index_size() const { return _promoted_index_size; }
|
||||
|
||||
// Call under allocating_section.
|
||||
// For sstable versions >= mc the returned cursor will be of type `bsearch_clustered_cursor`.
|
||||
std::unique_ptr<clustered_index_cursor> make_cursor(shared_sstable,
|
||||
reader_permit,
|
||||
tracing::trace_state_ptr,
|
||||
file_input_stream_options,
|
||||
use_caching);
|
||||
// Promoted index information produced by the parser.
|
||||
struct parsed_promoted_index_entry {
|
||||
deletion_time del_time;
|
||||
uint64_t promoted_index_start;
|
||||
uint32_t promoted_index_size;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
using promoted_index = parsed_promoted_index_entry;
|
||||
|
||||
// A partition index element.
|
||||
// Allocated inside LSA.
|
||||
class index_entry {
|
||||
private:
|
||||
managed_bytes _key;
|
||||
mutable std::optional<dht::token> _token;
|
||||
uint64_t _position;
|
||||
managed_ref<promoted_index> _index;
|
||||
struct [[gnu::packed]] index_entry {
|
||||
mutable int64_t raw_token;
|
||||
uint64_t data_file_offset;
|
||||
uint32_t key_offset;
|
||||
|
||||
public:
|
||||
|
||||
key_view get_key() const {
|
||||
return key_view{_key};
|
||||
}
|
||||
|
||||
// May allocate so must be called under allocating_section.
|
||||
decorated_key_view get_decorated_key(const schema& s) const {
|
||||
if (!_token) {
|
||||
_token.emplace(s.get_partitioner().get_token(get_key()));
|
||||
}
|
||||
return decorated_key_view(*_token, get_key());
|
||||
}
|
||||
|
||||
uint64_t position() const { return _position; };
|
||||
|
||||
std::optional<deletion_time> get_deletion_time() const {
|
||||
if (_index) {
|
||||
return _index->get_deletion_time();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
index_entry(managed_bytes&& key, uint64_t position, managed_ref<promoted_index>&& index)
|
||||
: _key(std::move(key))
|
||||
, _position(position)
|
||||
, _index(std::move(index))
|
||||
{}
|
||||
|
||||
index_entry(index_entry&&) = default;
|
||||
index_entry& operator=(index_entry&&) = default;
|
||||
|
||||
// Can be nullptr
|
||||
const managed_ref<promoted_index>& get_promoted_index() const { return _index; }
|
||||
managed_ref<promoted_index>& get_promoted_index() { return _index; }
|
||||
uint32_t get_promoted_index_size() const { return _index ? _index->get_promoted_index_size() : 0; }
|
||||
|
||||
size_t external_memory_usage() const {
|
||||
return _key.external_memory_usage() + _index.external_memory_usage();
|
||||
}
|
||||
uint64_t position() const { return data_file_offset; }
|
||||
dht::raw_token token() const { return dht::raw_token(raw_token); }
|
||||
};
|
||||
|
||||
// Required for optimized LSA migration of storage of managed_vector.
|
||||
static_assert(std::is_trivially_move_assignable_v<index_entry>);
|
||||
static_assert(std::is_trivially_move_assignable_v<parsed_promoted_index_entry>);
|
||||
|
||||
// A partition index page.
|
||||
//
|
||||
// Allocated in the standard allocator space but with an LSA allocator as the current allocator.
|
||||
// So the shallow part is in the standard allocator but all indirect objects are inside LSA.
|
||||
class partition_index_page {
|
||||
public:
|
||||
lsa::chunked_managed_vector<managed_ref<index_entry>> _entries;
|
||||
lsa::chunked_managed_vector<index_entry> _entries;
|
||||
managed_bytes _key_storage;
|
||||
|
||||
// Stores promoted index information of index entries.
|
||||
// The i-th element corresponds to the i-th entry in _entries.
|
||||
// Can be smaller than _entries. If _entries[i] doesn't have a matching element in _promoted_indexes then
|
||||
// that entry doesn't have a promoted index.
|
||||
// Kept separately to avoid paying for storage cost in pages where no entry has a promoted index,
|
||||
// which is typical in workloads with small partitions.
|
||||
lsa::chunked_managed_vector<promoted_index> _promoted_indexes;
|
||||
public:
|
||||
partition_index_page() = default;
|
||||
partition_index_page(partition_index_page&&) noexcept = default;
|
||||
@@ -298,15 +250,68 @@ public:
|
||||
bool empty() const { return _entries.empty(); }
|
||||
size_t size() const { return _entries.size(); }
|
||||
|
||||
stop_iteration clear_gently() {
|
||||
// Vectors have trivial storage, so are fast to destroy.
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
|
||||
void clear_one_entry() {
|
||||
_entries.pop_back();
|
||||
}
|
||||
|
||||
bool has_promoted_index(size_t i) const {
|
||||
return i < _promoted_indexes.size() && _promoted_indexes[i].promoted_index_size > 0;
|
||||
}
|
||||
|
||||
/// Get promoted index for the i-th entry.
|
||||
/// Call only when has_promoted_index(i) is true.
|
||||
const promoted_index& get_promoted_index(size_t i) const {
|
||||
return _promoted_indexes[i];
|
||||
}
|
||||
|
||||
/// Get promoted index for the i-th entry.
|
||||
/// Call only when has_promoted_index(i) is true.
|
||||
promoted_index& get_promoted_index(size_t i) {
|
||||
return _promoted_indexes[i];
|
||||
}
|
||||
|
||||
/// Get promoted index size for the i-th entry.
|
||||
uint32_t get_promoted_index_size(size_t i) const {
|
||||
return has_promoted_index(i) ? get_promoted_index(i).promoted_index_size : 0;
|
||||
}
|
||||
|
||||
/// Get deletion_time for partition represented by the i-th entry.
|
||||
/// Returns disengaged optional if the entry doesn't have a promoted index, so we don't know the deletion_time.
|
||||
/// It has to be read from the data file.
|
||||
std::optional<deletion_time> get_deletion_time(size_t i) const {
|
||||
if (has_promoted_index(i)) {
|
||||
return get_promoted_index(i).del_time;
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
key_view get_key(size_t i) const {
|
||||
auto start = _entries[i].key_offset;
|
||||
auto end = i + 1 < _entries.size() ? _entries[i + 1].key_offset : _key_storage.size();
|
||||
auto v = managed_bytes_view(_key_storage).prefix(end);
|
||||
v.remove_prefix(start);
|
||||
return key_view(v);
|
||||
}
|
||||
|
||||
decorated_key_view get_decorated_key(const schema& s, size_t i) const {
|
||||
auto key = get_key(i);
|
||||
auto t = _entries[i].token();
|
||||
if (!t) {
|
||||
t = dht::raw_token(s.get_partitioner().get_token(key));
|
||||
_entries[i].raw_token = t.value;
|
||||
}
|
||||
return decorated_key_view(dht::token(t), key);
|
||||
}
|
||||
|
||||
size_t external_memory_usage() const {
|
||||
size_t size = _entries.external_memory_usage();
|
||||
for (auto&& e : _entries) {
|
||||
size += sizeof(index_entry) + e->external_memory_usage();
|
||||
}
|
||||
size += _promoted_indexes.external_memory_usage();
|
||||
size += _key_storage.external_memory_usage();
|
||||
return size;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -25,14 +25,6 @@ namespace sstables {
|
||||
extern seastar::logger sstlog;
|
||||
extern thread_local mc::cached_promoted_index::metrics promoted_index_cache_metrics;
|
||||
|
||||
// Promoted index information produced by the parser.
|
||||
struct parsed_promoted_index_entry {
|
||||
deletion_time del_time;
|
||||
uint64_t promoted_index_start;
|
||||
uint32_t promoted_index_size;
|
||||
uint32_t num_blocks;
|
||||
};
|
||||
|
||||
// Partition index entry information produced by the parser.
|
||||
struct parsed_partition_index_entry {
|
||||
temporary_buffer<char> key;
|
||||
@@ -53,9 +45,10 @@ class index_consumer {
|
||||
schema_ptr _s;
|
||||
logalloc::allocating_section _alloc_section;
|
||||
logalloc::region& _region;
|
||||
utils::chunked_vector<parsed_partition_index_entry> _parsed_entries;
|
||||
size_t _max_promoted_index_entry_plus_one = 0; // Highest index +1 in _parsed_entries which has a promoted index.
|
||||
size_t _key_storage_size = 0;
|
||||
public:
|
||||
index_list indexes;
|
||||
|
||||
index_consumer(logalloc::region& r, schema_ptr s)
|
||||
: _s(s)
|
||||
, _alloc_section(abstract_formatter([s] (fmt::format_context& ctx) {
|
||||
@@ -64,36 +57,63 @@ public:
|
||||
, _region(r)
|
||||
{ }
|
||||
|
||||
~index_consumer() {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
indexes._entries.clear_and_release();
|
||||
});
|
||||
void consume_entry(parsed_partition_index_entry&& e) {
|
||||
_key_storage_size += e.key.size();
|
||||
_parsed_entries.emplace_back(std::move(e));
|
||||
if (e.promoted_index) {
|
||||
_max_promoted_index_entry_plus_one = std::max(_max_promoted_index_entry_plus_one, _parsed_entries.size());
|
||||
}
|
||||
}
|
||||
|
||||
void consume_entry(parsed_partition_index_entry&& e) {
|
||||
_alloc_section(_region, [&] {
|
||||
future<index_list> finalize() {
|
||||
index_list result;
|
||||
// In case of exception, need to deallocate under region allocator.
|
||||
auto delete_result = seastar::defer([&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
managed_ref<promoted_index> pi;
|
||||
if (e.promoted_index) {
|
||||
pi = make_managed<promoted_index>(*_s,
|
||||
e.promoted_index->del_time,
|
||||
e.promoted_index->promoted_index_start,
|
||||
e.promoted_index->promoted_index_size,
|
||||
e.promoted_index->num_blocks);
|
||||
}
|
||||
auto key = managed_bytes(reinterpret_cast<const bytes::value_type*>(e.key.get()), e.key.size());
|
||||
indexes._entries.emplace_back(make_managed<index_entry>(std::move(key), e.data_file_offset, std::move(pi)));
|
||||
result._entries = {};
|
||||
result._promoted_indexes = {};
|
||||
result._key_storage = {};
|
||||
});
|
||||
});
|
||||
auto i = _parsed_entries.begin();
|
||||
size_t key_offset = 0;
|
||||
while (i != _parsed_entries.end()) {
|
||||
_alloc_section(_region, [&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
result._entries.reserve(_parsed_entries.size());
|
||||
result._promoted_indexes.resize(_max_promoted_index_entry_plus_one);
|
||||
if (result._key_storage.empty()) {
|
||||
result._key_storage = managed_bytes(managed_bytes::initialized_later(), _key_storage_size);
|
||||
}
|
||||
managed_bytes_mutable_view key_out(result._key_storage);
|
||||
key_out.remove_prefix(key_offset);
|
||||
while (i != _parsed_entries.end()) {
|
||||
parsed_partition_index_entry& e = *i;
|
||||
if (e.promoted_index) {
|
||||
result._promoted_indexes[result._entries.size()] = *e.promoted_index;
|
||||
}
|
||||
write_fragmented(key_out, std::string_view(e.key.begin(), e.key.size()));
|
||||
result._entries.emplace_back(index_entry{dht::raw_token().value, e.data_file_offset, key_offset});
|
||||
++i;
|
||||
key_offset += e.key.size();
|
||||
if (need_preempt()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
delete_result.cancel();
|
||||
_parsed_entries.clear();
|
||||
co_return std::move(result);
|
||||
}
|
||||
|
||||
void prepare(uint64_t size) {
|
||||
_alloc_section = logalloc::allocating_section();
|
||||
_alloc_section(_region, [&] {
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
indexes._entries.reserve(size);
|
||||
});
|
||||
});
|
||||
_max_promoted_index_entry_plus_one = 0;
|
||||
_key_storage_size = 0;
|
||||
_parsed_entries.clear();
|
||||
_parsed_entries.reserve(size);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -198,10 +218,14 @@ public:
|
||||
|
||||
switch (_state) {
|
||||
// START comes first, to make the handling of the 0-quantity case simpler
|
||||
state_START:
|
||||
case state::START:
|
||||
sstlog.trace("{}: pos {} state {} - data.size()={}", fmt::ptr(this), current_pos(), state::START, data.size());
|
||||
_state = state::KEY_SIZE;
|
||||
break;
|
||||
if (data.size() == 0) {
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case state::KEY_SIZE:
|
||||
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::KEY_SIZE);
|
||||
_entry_offset = current_pos();
|
||||
@@ -227,7 +251,16 @@ public:
|
||||
case state::PROMOTED_SIZE:
|
||||
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::PROMOTED_SIZE);
|
||||
_position = this->_u64;
|
||||
if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
|
||||
if (is_mc_format() && data.size() && *data.begin() == 0) { // promoted_index_size == 0
|
||||
data.trim_front(1);
|
||||
_consumer.consume_entry(parsed_partition_index_entry{
|
||||
.key = std::move(_key),
|
||||
.data_file_offset = _position,
|
||||
.index_offset = _entry_offset,
|
||||
.promoted_index = std::nullopt
|
||||
});
|
||||
goto state_START;
|
||||
} else if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
|
||||
_state = state::PARTITION_HEADER_LENGTH_1;
|
||||
break;
|
||||
}
|
||||
@@ -339,33 +372,6 @@ inline file make_tracked_index_file(sstable& sst, reader_permit permit, tracing:
|
||||
return tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", sst.index_filename()));
|
||||
}
|
||||
|
||||
inline
|
||||
std::unique_ptr<clustered_index_cursor> promoted_index::make_cursor(shared_sstable sst,
|
||||
reader_permit permit,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
file_input_stream_options options,
|
||||
use_caching caching)
|
||||
{
|
||||
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
|
||||
seastar::shared_ptr<cached_file> cached_file_ptr = caching
|
||||
? sst->_cached_index_file
|
||||
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
|
||||
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
|
||||
sst->manager().get_cache_tracker().get_lru(),
|
||||
sst->manager().get_cache_tracker().region(),
|
||||
sst->_index_file_size);
|
||||
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
|
||||
_promoted_index_start, _promoted_index_size,
|
||||
promoted_index_cache_metrics, permit,
|
||||
sst->get_column_translation(), cached_file_ptr, _num_blocks, trace_state, sst->features());
|
||||
}
|
||||
|
||||
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
|
||||
auto promoted_index_stream = make_file_input_stream(std::move(file), _promoted_index_start, _promoted_index_size,options);
|
||||
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
|
||||
std::move(promoted_index_stream), _promoted_index_size, _num_blocks, std::nullopt);
|
||||
}
|
||||
|
||||
// Less-comparator for lookups in the partition index.
|
||||
class index_comparator {
|
||||
dht::ring_position_comparator_for_sstables _tri_cmp;
|
||||
@@ -376,27 +382,17 @@ public:
|
||||
return _tri_cmp(e.get_decorated_key(), rp) < 0;
|
||||
}
|
||||
|
||||
bool operator()(const index_entry& e, dht::ring_position_view rp) const {
|
||||
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) < 0;
|
||||
}
|
||||
|
||||
bool operator()(const managed_ref<index_entry>& e, dht::ring_position_view rp) const {
|
||||
return operator()(*e, rp);
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const managed_ref<index_entry>& e) const {
|
||||
return operator()(rp, *e);
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
|
||||
return _tri_cmp(e.get_decorated_key(), rp) > 0;
|
||||
}
|
||||
|
||||
bool operator()(dht::ring_position_view rp, const index_entry& e) const {
|
||||
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) > 0;
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
std::strong_ordering index_entry_tri_cmp(const schema& s, partition_index_page& page, size_t idx, dht::ring_position_view rp) {
|
||||
dht::ring_position_comparator_for_sstables tri_cmp(s);
|
||||
return tri_cmp(page.get_decorated_key(s, idx), rp);
|
||||
}
|
||||
|
||||
// Contains information about index_reader position in the index file
|
||||
struct index_bound {
|
||||
index_bound() = default;
|
||||
@@ -537,7 +533,7 @@ private:
|
||||
if (ex) {
|
||||
return make_exception_future<index_list>(std::move(ex));
|
||||
}
|
||||
return make_ready_future<index_list>(std::move(bound.consumer->indexes));
|
||||
return bound.consumer->finalize();
|
||||
});
|
||||
});
|
||||
};
|
||||
@@ -550,17 +546,18 @@ private:
|
||||
if (bound.current_list->empty()) {
|
||||
throw malformed_sstable_exception(format("missing index entry for summary index {} (bound {})", summary_idx, fmt::ptr(&bound)), _sstable->index_filename());
|
||||
}
|
||||
bound.data_file_position = bound.current_list->_entries[0]->position();
|
||||
bound.data_file_position = bound.current_list->_entries[0].position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
|
||||
if (sstlog.is_enabled(seastar::log_level::trace)) {
|
||||
sstlog.trace("index {} bound {}: page:", fmt::ptr(this), fmt::ptr(&bound));
|
||||
logalloc::reclaim_lock rl(_region);
|
||||
for (auto&& e : bound.current_list->_entries) {
|
||||
for (size_t i = 0; i < bound.current_list->_entries.size(); ++i) {
|
||||
auto& e = bound.current_list->_entries[i];
|
||||
auto dk = dht::decorate_key(*_sstable->_schema,
|
||||
e->get_key().to_partition_key(*_sstable->_schema));
|
||||
sstlog.trace(" {} -> {}", dk, e->position());
|
||||
bound.current_list->get_key(i).to_partition_key(*_sstable->_schema));
|
||||
sstlog.trace(" {} -> {}", dk, e.position());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -604,7 +601,13 @@ private:
|
||||
// Valid if partition_data_ready(bound)
|
||||
index_entry& current_partition_entry(index_bound& bound) {
|
||||
parse_assert(bool(bound.current_list), _sstable->index_filename());
|
||||
return *bound.current_list->_entries[bound.current_index_idx];
|
||||
return bound.current_list->_entries[bound.current_index_idx];
|
||||
}
|
||||
|
||||
// Valid if partition_data_ready(bound)
|
||||
partition_index_page& current_page(index_bound& bound) {
|
||||
parse_assert(bool(bound.current_list), _sstable->index_filename());
|
||||
return *bound.current_list;
|
||||
}
|
||||
|
||||
future<> advance_to_next_partition(index_bound& bound) {
|
||||
@@ -617,7 +620,7 @@ private:
|
||||
if (bound.current_index_idx + 1 < bound.current_list->size()) {
|
||||
++bound.current_index_idx;
|
||||
bound.current_pi_idx = 0;
|
||||
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx]->position();
|
||||
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx].position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
return reset_clustered_cursor(bound);
|
||||
@@ -680,9 +683,13 @@ private:
|
||||
return advance_to_page(bound, summary_idx).then([this, &bound, pos, summary_idx] {
|
||||
sstlog.trace("index {}: old page index = {}", fmt::ptr(this), bound.current_index_idx);
|
||||
auto i = _alloc_section(_region, [&] {
|
||||
auto& entries = bound.current_list->_entries;
|
||||
return std::lower_bound(std::begin(entries) + bound.current_index_idx, std::end(entries), pos,
|
||||
index_comparator(*_sstable->_schema));
|
||||
auto& page = *bound.current_list;
|
||||
auto& s = *_sstable->_schema;
|
||||
auto r = std::views::iota(bound.current_index_idx, page._entries.size());
|
||||
auto it = std::ranges::partition_point(r, [&] (int idx) {
|
||||
return index_entry_tri_cmp(s, page, idx, pos) < 0;
|
||||
});
|
||||
return page._entries.begin() + bound.current_index_idx + std::ranges::distance(r.begin(), it);
|
||||
});
|
||||
// i is valid until next allocation point
|
||||
auto& entries = bound.current_list->_entries;
|
||||
@@ -697,7 +704,7 @@ private:
|
||||
}
|
||||
bound.current_index_idx = std::distance(std::begin(entries), i);
|
||||
bound.current_pi_idx = 0;
|
||||
bound.data_file_position = (*i)->position();
|
||||
bound.data_file_position = (*i).position();
|
||||
bound.element = indexable_element::partition;
|
||||
bound.end_open_marker.reset();
|
||||
sstlog.trace("index {}: new page index = {}, pos={}", fmt::ptr(this), bound.current_index_idx, bound.data_file_position);
|
||||
@@ -800,6 +807,34 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
std::unique_ptr<clustered_index_cursor> make_cursor(const parsed_promoted_index_entry& pi,
|
||||
shared_sstable sst,
|
||||
reader_permit permit,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
file_input_stream_options options,
|
||||
use_caching caching)
|
||||
{
|
||||
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
|
||||
seastar::shared_ptr<cached_file> cached_file_ptr = caching
|
||||
? sst->_cached_index_file
|
||||
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
|
||||
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
|
||||
sst->manager().get_cache_tracker().get_lru(),
|
||||
sst->manager().get_cache_tracker().region(),
|
||||
sst->_index_file_size);
|
||||
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
|
||||
pi.promoted_index_start, pi.promoted_index_size,
|
||||
promoted_index_cache_metrics, permit,
|
||||
sst->get_column_translation(), cached_file_ptr, pi.num_blocks, trace_state, sst->features());
|
||||
}
|
||||
|
||||
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
|
||||
auto promoted_index_stream = make_file_input_stream(std::move(file), pi.promoted_index_start, pi.promoted_index_size,options);
|
||||
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
|
||||
std::move(promoted_index_stream), pi.promoted_index_size, pi.num_blocks, std::nullopt);
|
||||
}
|
||||
|
||||
// Ensures that partition_data_ready() returns true.
|
||||
// Can be called only when !eof()
|
||||
future<> read_partition_data() override {
|
||||
@@ -835,10 +870,10 @@ public:
|
||||
clustered_index_cursor* current_clustered_cursor(index_bound& bound) {
|
||||
if (!bound.clustered_cursor) {
|
||||
_alloc_section(_region, [&] {
|
||||
index_entry& e = current_partition_entry(bound);
|
||||
promoted_index* pi = e.get_promoted_index().get();
|
||||
if (pi) {
|
||||
bound.clustered_cursor = pi->make_cursor(_sstable, _permit, _trace_state,
|
||||
partition_index_page& page = current_page(bound);
|
||||
if (page.has_promoted_index(bound.current_index_idx)) {
|
||||
promoted_index& pi = page.get_promoted_index(bound.current_index_idx);
|
||||
bound.clustered_cursor = make_cursor(pi, _sstable, _permit, _trace_state,
|
||||
get_file_input_stream_options(), _use_caching);
|
||||
}
|
||||
});
|
||||
@@ -861,15 +896,15 @@ public:
|
||||
// It may be unavailable for old sstables for which this information was not generated.
|
||||
// Can be called only when partition_data_ready().
|
||||
std::optional<sstables::deletion_time> partition_tombstone() override {
|
||||
return current_partition_entry(_lower_bound).get_deletion_time();
|
||||
return current_page(_lower_bound).get_deletion_time(_lower_bound.current_index_idx);
|
||||
}
|
||||
|
||||
// Returns the key for current partition.
|
||||
// Can be called only when partition_data_ready().
|
||||
std::optional<partition_key> get_partition_key() override {
|
||||
return _alloc_section(_region, [this] {
|
||||
index_entry& e = current_partition_entry(_lower_bound);
|
||||
return e.get_key().to_partition_key(*_sstable->_schema);
|
||||
return current_page(_lower_bound).get_key(_lower_bound.current_index_idx)
|
||||
.to_partition_key(*_sstable->_schema);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -883,8 +918,8 @@ public:
|
||||
// Returns the number of promoted index entries for the current partition.
|
||||
// Can be called only when partition_data_ready().
|
||||
uint64_t get_promoted_index_size() {
|
||||
index_entry& e = current_partition_entry(_lower_bound);
|
||||
return e.get_promoted_index_size();
|
||||
partition_index_page& page = current_page(_lower_bound);
|
||||
return page.get_promoted_index_size(_lower_bound.current_index_idx);
|
||||
}
|
||||
|
||||
bool partition_data_ready() const override {
|
||||
@@ -975,9 +1010,9 @@ public:
|
||||
return make_ready_future<bool>(false);
|
||||
}
|
||||
return read_partition_data().then([this, key] {
|
||||
index_comparator cmp(*_sstable->_schema);
|
||||
bool found = _alloc_section(_region, [&] {
|
||||
return cmp(key, current_partition_entry(_lower_bound)) == 0;
|
||||
auto& page = current_page(_lower_bound);
|
||||
return index_entry_tri_cmp(*_sstable->_schema, page, _lower_bound.current_index_idx, key) == 0;
|
||||
});
|
||||
return make_ready_future<bool>(found);
|
||||
});
|
||||
|
||||
@@ -257,14 +257,11 @@ public:
|
||||
while (partial_page || i != _cache.end()) {
|
||||
if (partial_page) {
|
||||
auto preempted = with_allocator(_region.allocator(), [&] {
|
||||
while (!partial_page->empty()) {
|
||||
partial_page->clear_one_entry();
|
||||
if (need_preempt()) {
|
||||
return true;
|
||||
}
|
||||
while (partial_page->clear_gently() != stop_iteration::yes) {
|
||||
return true;
|
||||
}
|
||||
partial_page.reset();
|
||||
return false;
|
||||
return need_preempt();
|
||||
});
|
||||
if (preempted) {
|
||||
auto key = (i != _cache.end()) ? std::optional(i->key()) : std::nullopt;
|
||||
|
||||
@@ -1094,7 +1094,6 @@ public:
|
||||
|
||||
friend class mc::writer;
|
||||
friend class index_reader;
|
||||
friend class promoted_index;
|
||||
friend class sstables_manager;
|
||||
template <typename DataConsumeRowsContext>
|
||||
friend future<std::unique_ptr<DataConsumeRowsContext>>
|
||||
|
||||
@@ -436,7 +436,10 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
|
||||
stream_options.buffer_size = file_stream_buffer_size;
|
||||
stream_options.read_ahead = file_stream_read_ahead;
|
||||
|
||||
for (auto& info : sources) {
|
||||
for (auto&& source_info : sources) {
|
||||
// Keep stream_blob_info alive only at duration of streaming. Allowing the file descriptor
|
||||
// of the sstable component to be released right after it has been streamed.
|
||||
auto info = std::exchange(source_info, {});
|
||||
auto& filename = info.filename;
|
||||
std::optional<input_stream<char>> fstream;
|
||||
bool fstream_closed = false;
|
||||
@@ -617,6 +620,7 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
|
||||
ops_id, filename, targets, total_size, get_bw(total_size, start_time));
|
||||
}
|
||||
}
|
||||
co_await utils::get_local_injector().inject("tablet_stream_files_end_wait", utils::wait_for_message(std::chrono::seconds(60)));
|
||||
if (error) {
|
||||
blogger.warn("fstream[{}] Master failed sending files_nr={} files={} targets={} send_size={} bw={} error={}",
|
||||
ops_id, sources.size(), sources, targets, ops_total_size, get_bw(ops_total_size, ops_start_time), error);
|
||||
@@ -680,15 +684,20 @@ future<stream_files_response> tablet_stream_files_handler(replica::database& db,
|
||||
if (files.empty()) {
|
||||
co_return resp;
|
||||
}
|
||||
auto sstable_nr = sstables.size();
|
||||
// Release reference to sstables to be streamed here. Since one sstable is streamed at a time,
|
||||
// a sstable - that has been compacted - can have its space released from disk right after
|
||||
// that sstable's content has been fully streamed.
|
||||
sstables.clear();
|
||||
blogger.debug("stream_sstables[{}] Started sending sstable_nr={} files_nr={} files={} range={}",
|
||||
req.ops_id, sstables.size(), files.size(), files, req.range);
|
||||
req.ops_id, sstable_nr, files.size(), files, req.range);
|
||||
auto ops_start_time = std::chrono::steady_clock::now();
|
||||
auto files_nr = files.size();
|
||||
size_t stream_bytes = co_await tablet_stream_files(ms, std::move(files), req.targets, req.table, req.ops_id, req.topo_guard);
|
||||
resp.stream_bytes = stream_bytes;
|
||||
auto duration = std::chrono::steady_clock::now() - ops_start_time;
|
||||
blogger.info("stream_sstables[{}] Finished sending sstable_nr={} files_nr={} range={} stream_bytes={} stream_time={} stream_bw={}",
|
||||
req.ops_id, sstables.size(), files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
|
||||
req.ops_id, sstable_nr, files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
|
||||
co_return resp;
|
||||
}
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ future<bool> table_helper::try_prepare(bool fallback, cql3::query_processor& qp,
|
||||
auto& stmt = fallback ? _insert_cql_fallback.value() : _insert_cql;
|
||||
try {
|
||||
shared_ptr<cql_transport::messages::result_message::prepared> msg_ptr = co_await qp.prepare(stmt, qs.get_client_state(), dialect);
|
||||
_prepared_stmt = std::move(msg_ptr->get_prepared());
|
||||
_prepared_stmt = msg_ptr->get_prepared();
|
||||
shared_ptr<cql3::cql_statement> cql_stmt = _prepared_stmt->statement;
|
||||
_insert_stmt = dynamic_pointer_cast<cql3::statements::modification_statement>(cql_stmt);
|
||||
_is_fallback_stmt = fallback;
|
||||
|
||||
@@ -400,7 +400,7 @@ task_manager::virtual_task::impl::impl(module_ptr module) noexcept
|
||||
: _module(std::move(module))
|
||||
{}
|
||||
|
||||
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive) {
|
||||
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr) {
|
||||
auto ms = module->get_task_manager()._messaging;
|
||||
if (!ms) {
|
||||
auto ids = co_await module->get_task_manager().get_virtual_task_children(parent_id);
|
||||
@@ -417,19 +417,18 @@ future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::g
|
||||
tmlogger.info("tasks_vt_get_children: waiting");
|
||||
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{10});
|
||||
});
|
||||
co_return co_await map_reduce(nodes, [ms, parent_id, is_host_alive = std::move(is_host_alive)] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
|
||||
if (is_host_alive(host_id)) {
|
||||
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
|
||||
return resp | std::views::transform([host_id] (auto id) {
|
||||
return task_identity{
|
||||
.host_id = host_id,
|
||||
.task_id = id
|
||||
};
|
||||
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
|
||||
});
|
||||
} else {
|
||||
return make_ready_future<utils::chunked_vector<task_identity>>();
|
||||
}
|
||||
co_return co_await map_reduce(nodes, [ms, parent_id] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
|
||||
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
|
||||
return resp | std::views::transform([host_id] (auto id) {
|
||||
return task_identity{
|
||||
.host_id = host_id,
|
||||
.task_id = id
|
||||
};
|
||||
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
|
||||
}).handle_exception_type([host_id, parent_id] (const rpc::closed_error& ex) {
|
||||
tmlogger.warn("Failed to get children of virtual task with id={} from node {}: {}", parent_id, host_id, ex);
|
||||
return utils::chunked_vector<task_identity>{};
|
||||
});
|
||||
}, utils::chunked_vector<task_identity>{}, [] (auto a, auto&& b) {
|
||||
std::move(b.begin(), b.end(), std::back_inserter(a));
|
||||
return a;
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "db_clock.hh"
|
||||
#include "utils/log.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "locator/token_metadata_fwd.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "tasks/types.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
@@ -282,7 +283,7 @@ public:
|
||||
impl& operator=(impl&&) = delete;
|
||||
virtual ~impl() = default;
|
||||
protected:
|
||||
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive);
|
||||
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr);
|
||||
public:
|
||||
virtual task_group get_group() const noexcept = 0;
|
||||
// Returns std::nullopt if an operation with task_id isn't tracked by this virtual_task.
|
||||
|
||||
@@ -62,7 +62,11 @@ SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) {
|
||||
// cfg.db_config->index_cache_fraction.set(1.0);
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
// We disable compactions because they cause confusing cache mispopulations.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
|
||||
// We disable compression because the sstable writer targets a specific
|
||||
// (*compressed* data file size : summary file size) ratio,
|
||||
// so the number of keys per index page becomes hard to control,
|
||||
// and might be arbitrarily large.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
|
||||
auto insert_query = e.prepare("INSERT INTO ks.t(pk) VALUES (?)").get();
|
||||
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ?").get();
|
||||
|
||||
@@ -154,7 +158,11 @@ SEASTAR_TEST_CASE(test_index_is_cached_in_big_partition_workload) {
|
||||
// cfg.db_config->index_cache_fraction.set(0.0);
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
// We disable compactions because they cause confusing cache mispopulations.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
|
||||
// We disable compression because the sstable writer targets a specific
|
||||
// (*compressed* data file size : summary file size) ratio,
|
||||
// so the number of keys per index page becomes hard to control,
|
||||
// and might be arbitrarily large.
|
||||
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
|
||||
auto insert_query = e.prepare("INSERT INTO ks.t(pk, ck, v) VALUES (?, ?, ?)").get();
|
||||
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ? AND ck = ?").get();
|
||||
|
||||
|
||||
@@ -1111,6 +1111,30 @@ SEASTAR_TEST_CASE(test_snapshot_ctl_true_snapshots_size) {
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_snapshot_ctl_details_exception_handling) {
|
||||
#ifndef SCYLLA_ENABLE_ERROR_INJECTION
|
||||
testlog.debug("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev).\n");
|
||||
return make_ready_future();
|
||||
#endif
|
||||
return do_with_some_data_in_thread({"cf"}, [] (cql_test_env& e) {
|
||||
sharded<db::snapshot_ctl> sc;
|
||||
sc.start(std::ref(e.db()), std::ref(e.get_task_manager()), std::ref(e.get_sstorage_manager()), db::snapshot_ctl::config{}).get();
|
||||
auto stop_sc = deferred_stop(sc);
|
||||
|
||||
auto& cf = e.local_db().find_column_family("ks", "cf");
|
||||
take_snapshot(e).get();
|
||||
|
||||
utils::get_local_injector().enable("get_snapshot_details", true);
|
||||
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
|
||||
|
||||
utils::get_local_injector().enable("per-snapshot-get_snapshot_details", true);
|
||||
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
|
||||
|
||||
auto details = cf.get_snapshot_details().get();
|
||||
BOOST_REQUIRE_EQUAL(details.size(), 1);
|
||||
});
|
||||
}
|
||||
|
||||
// toppartitions_query caused a lw_shared_ptr to cross shards when moving results, #5104
|
||||
SEASTAR_TEST_CASE(toppartitions_cross_shard_schema_ptr) {
|
||||
return do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
|
||||
@@ -1857,7 +1881,7 @@ SEASTAR_THREAD_TEST_CASE(test_tombstone_gc_state_snapshot) {
|
||||
|
||||
schema_builder::register_schema_initializer([] (schema_builder& builder) {
|
||||
if (builder.ks_name() == "test" && builder.cf_name() == "table_gc_mode_group0") {
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
auto table_gc_mode_group0 = schema_builder("test", "table_gc_mode_group0")
|
||||
|
||||
@@ -252,7 +252,7 @@ SEASTAR_TEST_CASE(test_group0_batch) {
|
||||
// (group0 mutations are not allowed on non-group0 tables)
|
||||
schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if (builder.cf_name() == "test_group0_batch") {
|
||||
builder.set_is_group0_table(true);
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
|
||||
@@ -345,4 +345,29 @@ SEASTAR_TEST_CASE(test_group0_batch) {
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_group0_tables_use_schema_commitlog) {
|
||||
return do_with_cql_env([] (cql_test_env& e) {
|
||||
schema_builder::register_schema_initializer([](schema_builder& builder) {
|
||||
if (builder.cf_name() == "test_group0_tables_use_schema_commitlog1") {
|
||||
builder.set_is_group0_table();
|
||||
}
|
||||
});
|
||||
|
||||
auto test_group0_tables_use_schema_commitlog1 = schema_builder("test", "test_group0_tables_use_schema_commitlog1")
|
||||
.with_column("pk", utf8_type, column_kind::partition_key)
|
||||
.build();
|
||||
|
||||
auto test_group0_tables_use_schema_commitlog2 = schema_builder("test", "test_group0_tables_use_schema_commitlog2")
|
||||
.with_column("pk", utf8_type, column_kind::partition_key)
|
||||
.build();
|
||||
|
||||
BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().is_group0_table);
|
||||
BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().use_schema_commitlog);
|
||||
BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().is_group0_table);
|
||||
BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().use_schema_commitlog);
|
||||
|
||||
return make_ready_future();
|
||||
});
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
||||
@@ -1499,7 +1499,7 @@ SEASTAR_THREAD_TEST_CASE(tablets_simple_rack_aware_view_pairing_test) {
|
||||
base_host,
|
||||
base_erm,
|
||||
view_erm,
|
||||
*ars_ptr,
|
||||
true, // uses NTS
|
||||
base_token,
|
||||
view_token,
|
||||
use_tablets,
|
||||
|
||||
@@ -719,7 +719,7 @@ SEASTAR_THREAD_TEST_CASE(test_dht_subtract_ranges) {
|
||||
|
||||
auto get_random_ranges = [&] (size_t max_count) {
|
||||
auto count = tests::random::get_int<size_t>(1, max_count);
|
||||
dht::partition_range_vector ranges;
|
||||
utils::chunked_vector<dht::partition_range> ranges;
|
||||
ranges.reserve(count);
|
||||
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
|
||||
@@ -20,16 +20,24 @@ static void add_entry(logalloc::region& r,
|
||||
const schema& s,
|
||||
partition_index_page& page,
|
||||
const partition_key& key,
|
||||
uint64_t position)
|
||||
uint64_t position,
|
||||
std::optional<parsed_promoted_index_entry> promoted_index = std::nullopt)
|
||||
{
|
||||
logalloc::allocating_section as;
|
||||
as(r, [&] {
|
||||
with_allocator(r.allocator(), [&] {
|
||||
sstables::key sst_key = sstables::key::from_partition_key(s, key);
|
||||
page._entries.push_back(make_managed<index_entry>(
|
||||
managed_bytes(sst_key.get_bytes()),
|
||||
position,
|
||||
managed_ref<promoted_index>()));
|
||||
auto key_offset = page._key_storage.size();
|
||||
auto old_storage = std::move(page._key_storage);
|
||||
page._key_storage = managed_bytes(managed_bytes::initialized_later(), key_offset + sst_key.get_bytes().size());
|
||||
auto out = managed_bytes_mutable_view(page._key_storage);
|
||||
write_fragmented(out, managed_bytes_view(old_storage));
|
||||
write_fragmented(out, single_fragmented_view(bytes_view(sst_key)));
|
||||
page._entries.push_back(index_entry{dht::raw_token_opt()->value, position, key_offset});
|
||||
if (promoted_index) {
|
||||
page._promoted_indexes.resize(page._entries.size());
|
||||
page._promoted_indexes[page._entries.size() - 1] = *promoted_index;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -54,10 +62,10 @@ static partition_index_page make_page0(logalloc::region& r, simple_schema& s) {
|
||||
static void has_page0(partition_index_cache::entry_ptr ptr) {
|
||||
BOOST_REQUIRE(!ptr->empty());
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries.size(), 4);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[0]->position(), 0);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[1]->position(), 1);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[2]->position(), 2);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[3]->position(), 3);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[0].position(), 0);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[1].position(), 1);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[2].position(), 2);
|
||||
BOOST_REQUIRE_EQUAL(ptr->_entries[3].position(), 3);
|
||||
};
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_caching) {
|
||||
@@ -139,6 +147,59 @@ SEASTAR_THREAD_TEST_CASE(test_caching) {
|
||||
}
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_sparse_promoted_index) {
|
||||
::lru lru;
|
||||
simple_schema s;
|
||||
logalloc::region r;
|
||||
partition_index_cache_stats stats;
|
||||
partition_index_cache cache(lru, r, stats);
|
||||
|
||||
auto page0_loader = [&] (partition_index_cache::key_type k) -> future<partition_index_page> {
|
||||
partition_index_page page;
|
||||
auto destroy_page = defer([&] {
|
||||
with_allocator(r.allocator(), [&] {
|
||||
auto p = std::move(page);
|
||||
});
|
||||
});
|
||||
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(0).key(), 0);
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(1).key(), 1, parsed_promoted_index_entry{
|
||||
.promoted_index_start = 1,
|
||||
.promoted_index_size = 10,
|
||||
.num_blocks = 3
|
||||
});
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(2).key(), 2);
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(3).key(), 3, parsed_promoted_index_entry{
|
||||
.promoted_index_start = 2,
|
||||
.promoted_index_size = 13,
|
||||
.num_blocks = 1
|
||||
});
|
||||
add_entry(r, *s.schema(), page, s.make_pkey(4).key(), 4);
|
||||
destroy_page.cancel();
|
||||
co_return std::move(page);
|
||||
};
|
||||
|
||||
auto page = cache.get_or_load(0, page0_loader).get();
|
||||
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(0), false);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(1), true);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(2), false);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(3), true);
|
||||
BOOST_REQUIRE_EQUAL(page->has_promoted_index(4), false);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_start, 1);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_size, 10);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).num_blocks, 3);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_start, 2);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_size, 13);
|
||||
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).num_blocks, 1);
|
||||
|
||||
with_allocator(r.allocator(), [&] {
|
||||
lru.evict_all();
|
||||
});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static future<> ignore_result(future<T>&& f) {
|
||||
return f.then_wrapped([] (auto&& f) {
|
||||
|
||||
@@ -1607,6 +1607,29 @@ future<> apply_resize_plan(token_metadata& tm, const migration_plan& plan) {
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
future<group0_guard> save_token_metadata(cql_test_env& e, group0_guard guard) {
|
||||
auto& stm = e.local_db().get_shared_token_metadata();
|
||||
auto tm = stm.get();
|
||||
|
||||
e.get_topology_state_machine().local()._topology.version = tm->get_version();
|
||||
|
||||
co_await save_tablet_metadata(e.local_db(), tm->tablets(), guard.write_timestamp());
|
||||
utils::chunked_vector<frozen_mutation> muts;
|
||||
muts.push_back(freeze(topology_mutation_builder(guard.write_timestamp())
|
||||
.set_version(tm->get_version())
|
||||
.build().to_mutation(db::system_keyspace::topology())));
|
||||
co_await e.local_db().apply(muts, db::no_timeout);
|
||||
co_await e.get_storage_service().local().update_tablet_metadata({});
|
||||
|
||||
// Need a new guard to make sure later changes use later timestamp.
|
||||
// Also, so that the table layer processes the changes we persisted, which is important for splits.
|
||||
// Before we can finalize a split, the storage group needs to process the split by creating split-ready compaction groups.
|
||||
release_guard(std::move(guard));
|
||||
abort_source as;
|
||||
co_return co_await e.get_raft_group0_client().start_operation(as);
|
||||
}
|
||||
|
||||
static
|
||||
future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migration_plan& plan, shared_load_stats* load_stats) {
|
||||
auto& talloc = e.get_tablet_allocator().local();
|
||||
@@ -1626,19 +1649,14 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
|
||||
co_await stm.mutate_token_metadata([table_id, &new_tmap, &changed] (token_metadata& tm) {
|
||||
changed = true;
|
||||
tm.tablets().set_tablet_map(table_id, std::move(new_tmap));
|
||||
tm.set_version(tm.get_version() + 1);
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
|
||||
if (changed) {
|
||||
// Need to reload on each resize because table object expects tablet count to change by a factor of 2.
|
||||
co_await save_tablet_metadata(e.local_db(), stm.get()->tablets(), guard.write_timestamp());
|
||||
co_await e.get_storage_service().local().update_tablet_metadata({});
|
||||
|
||||
// Need a new guard to make sure later changes use later timestamp.
|
||||
release_guard(std::move(guard));
|
||||
abort_source as;
|
||||
guard = co_await e.get_raft_group0_client().start_operation(as);
|
||||
guard = co_await save_token_metadata(e, std::move(guard));
|
||||
|
||||
if (load_stats) {
|
||||
auto new_tm = stm.get();
|
||||
@@ -1647,6 +1665,11 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
|
||||
load_stats->stats = *reconciled_stats;
|
||||
}
|
||||
}
|
||||
|
||||
testlog.debug("Calling local_topology_barrier()");
|
||||
old_tm = nullptr;
|
||||
co_await e.get_storage_service().local().local_topology_barrier();
|
||||
testlog.debug("Finished local_topology_barrier()");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1750,13 +1773,22 @@ void do_rebalance_tablets(cql_test_env& e,
|
||||
}).get();
|
||||
|
||||
if (auto_split && load_stats) {
|
||||
bool reload = false;
|
||||
auto& tm = *stm.get();
|
||||
for (const auto& [table, tmap]: tm.tablets().all_tables_ungrouped()) {
|
||||
if (std::holds_alternative<resize_decision::split>(tmap->resize_decision().way)) {
|
||||
testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
|
||||
load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
|
||||
if (load_stats->stats.tables[table].split_ready_seq_number != tmap->resize_decision().sequence_number) {
|
||||
testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
|
||||
load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
|
||||
reload = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Need to order split-ack before split finalization, storage_group assumes that.
|
||||
if (reload) {
|
||||
guard = save_token_metadata(e, std::move(guard)).get();
|
||||
}
|
||||
}
|
||||
|
||||
handle_resize_finalize(e, guard, plan, load_stats).get();
|
||||
|
||||
@@ -331,4 +331,28 @@ SEASTAR_THREAD_TEST_CASE(test_stale_version_notification) {
|
||||
std::cerr.rdbuf(oldCerr);
|
||||
|
||||
BOOST_TEST(my_stream.str().find("topology version 0 held for") != std::string::npos);
|
||||
}
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_raw_token) {
|
||||
const auto t1 = dht::token::from_int64(1);
|
||||
const auto t2 = dht::token::from_int64(2);
|
||||
|
||||
dht::raw_token_opt rt_opt;
|
||||
BOOST_REQUIRE(!rt_opt);
|
||||
rt_opt = dht::raw_token(t1);
|
||||
BOOST_REQUIRE(*rt_opt == t1);
|
||||
|
||||
BOOST_REQUIRE(dht::raw_token() == dht::minimum_token());
|
||||
BOOST_REQUIRE(dht::raw_token() < dht::raw_token(dht::first_token()));
|
||||
BOOST_REQUIRE(dht::raw_token() < dht::first_token());
|
||||
BOOST_REQUIRE(dht::raw_token() < dht::maximum_token());
|
||||
|
||||
auto rt1 = dht::raw_token(t1);
|
||||
BOOST_REQUIRE(bool(rt1));
|
||||
BOOST_REQUIRE(rt1 > dht::raw_token());
|
||||
BOOST_REQUIRE(rt1 > dht::minimum_token());
|
||||
BOOST_REQUIRE_EQUAL(rt1, t1);
|
||||
BOOST_REQUIRE(rt1 == t1);
|
||||
BOOST_REQUIRE(rt1 < t2);
|
||||
BOOST_REQUIRE(rt1 < dht::maximum_token());
|
||||
}
|
||||
|
||||
@@ -3221,6 +3221,87 @@ SEASTAR_TEST_CASE(test_view_update_generating_writetime) {
|
||||
});
|
||||
}
|
||||
|
||||
// Usually if only an unselected column in the base table is modified, we expect an optimization that a view
|
||||
// update is not done, but we had an bug(https://scylladb.atlassian.net/browse/SCYLLADB-808) where the existence
|
||||
// of a collection selected in the view caused us to skip this optimization, even when it was not modified.
|
||||
// This test reproduces this bug.
|
||||
SEASTAR_TEST_CASE(test_view_update_unmodified_collection) {
|
||||
// In this test we verify that we correctly skip (or not) view updates to a view that selects
|
||||
// a collection column. We use two MVs, similarly as in the test above test.
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
|
||||
auto f1 = e.local_view_builder().wait_until_built("ks", "mv1");
|
||||
auto f2 = e.local_view_builder().wait_until_built("ks", "mv2");
|
||||
|
||||
e.execute_cql("CREATE TABLE t (k int, c int, a int, b list<int>, g int, primary key(k, c))").get();
|
||||
e.execute_cql("CREATE MATERIALIZED VIEW mv1 AS SELECT k,c,a,b FROM t "
|
||||
"WHERE k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, k)").get();
|
||||
e.execute_cql("CREATE MATERIALIZED VIEW mv2 AS SELECT k,c,a,b FROM t "
|
||||
"WHERE k IS NOT NULL AND c IS NOT NULL AND a IS NOT NULL PRIMARY KEY (c, k, a)").get();
|
||||
|
||||
f1.get();
|
||||
f2.get();
|
||||
|
||||
auto total_t_view_updates = [&] {
|
||||
return e.db().map_reduce0([] (replica::database& local_db) {
|
||||
const db::view::stats& local_stats = local_db.find_column_family("ks", "t").get_view_stats();
|
||||
return local_stats.view_updates_pushed_local + local_stats.view_updates_pushed_remote;
|
||||
}, 0, std::plus<int64_t>()).get();
|
||||
};
|
||||
|
||||
auto total_mv1_updates = [&] {
|
||||
return e.db().map_reduce0([] (replica::database& local_db) {
|
||||
return local_db.find_column_family("ks", "mv1").get_stats().writes.hist.count;
|
||||
}, 0, std::plus<int64_t>()).get();
|
||||
};
|
||||
|
||||
auto total_mv2_updates = [&] {
|
||||
return e.db().map_reduce0([] (replica::database& local_db) {
|
||||
return local_db.find_column_family("ks", "mv2").get_stats().writes.hist.count;
|
||||
}, 0, std::plus<int64_t>()).get();
|
||||
};
|
||||
|
||||
::shared_ptr<cql_transport::messages::result_message> msg;
|
||||
|
||||
e.execute_cql("INSERT INTO t (k, c, a) VALUES (1, 1, 1)").get();
|
||||
eventually([&] {
|
||||
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
|
||||
const update_counter expected{1, 1, 2};
|
||||
|
||||
BOOST_REQUIRE_EQUAL(results, expected);
|
||||
});
|
||||
|
||||
// We update an unselected column and the collection remains NULL, so we should generate an
|
||||
// update to the virtual column in mv1 but not to mv2.
|
||||
e.execute_cql("UPDATE t SET g=1 WHERE k=1 AND c=1;").get();
|
||||
eventually([&] {
|
||||
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
|
||||
const update_counter expected{2, 1, 3};
|
||||
|
||||
BOOST_REQUIRE_EQUAL(results, expected);
|
||||
});
|
||||
|
||||
// We update the collection with an initial value
|
||||
e.execute_cql("UPDATE t SET b=[1] WHERE k=1 AND c=1;").get();
|
||||
eventually([&] {
|
||||
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
|
||||
const update_counter expected{3, 2, 5};
|
||||
|
||||
BOOST_REQUIRE_EQUAL(results, expected);
|
||||
});
|
||||
|
||||
// We update an unselected column again with a non-NULL selected collection. Because the liveness of the updated column is unchanged
|
||||
// and no other selected column is updated (in particular, the collection column), we should generate no view updates.
|
||||
e.execute_cql("UPDATE t SET g=2 WHERE k=1 AND c=1;").get();
|
||||
eventually([&] {
|
||||
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
|
||||
const update_counter expected{3, 2, 5};
|
||||
|
||||
BOOST_REQUIRE_EQUAL(results, expected);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_conflicting_batch) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
|
||||
|
||||
@@ -254,27 +254,3 @@ async def test_node_ops_task_wait(manager: ManagerClient):
|
||||
|
||||
await decommission_task
|
||||
await waiting_task
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_children(manager: ManagerClient):
|
||||
module_name = "node_ops"
|
||||
tm = TaskManagerClient(manager.api)
|
||||
servers = [await manager.server_add(cmdline=cmdline) for _ in range(2)]
|
||||
|
||||
injection = "tasks_vt_get_children"
|
||||
handler = await inject_error_one_shot(manager.api, servers[0].ip_addr, injection)
|
||||
|
||||
log = await manager.server_open_log(servers[0].server_id)
|
||||
mark = await log.mark()
|
||||
|
||||
bootstrap_task = [task for task in await tm.list_tasks(servers[0].ip_addr, module_name) if task.kind == "cluster"][0]
|
||||
|
||||
async def _decommission():
|
||||
await log.wait_for('tasks_vt_get_children: waiting', from_mark=mark)
|
||||
await manager.decommission_node(servers[1].server_id)
|
||||
await handler.message()
|
||||
|
||||
async def _get_status():
|
||||
await tm.get_task_status(servers[0].ip_addr, bootstrap_task.task_id)
|
||||
|
||||
await asyncio.gather(*(_decommission(), _get_status()))
|
||||
|
||||
@@ -12,9 +12,11 @@ import pytest
|
||||
from test.pylib.internal_types import ServerInfo
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.repair import create_table_insert_data_for_repair, get_tablet_task_id
|
||||
from test.pylib.rest_client import read_barrier
|
||||
from test.pylib.tablets import get_all_tablet_replicas
|
||||
from test.cluster.conftest import skip_mode
|
||||
from test.cluster.util import create_new_test_keyspace, new_test_keyspace
|
||||
from test.cluster.util import create_new_test_keyspace, new_test_keyspace, get_topology_coordinator, find_server_by_host_id
|
||||
from test.cluster.test_incremental_repair import trigger_tablet_merge
|
||||
from test.cluster.test_tablets2 import inject_error_on
|
||||
from test.cluster.tasks.task_manager_client import TaskManagerClient
|
||||
from test.cluster.tasks.task_manager_types import TaskStatus, TaskStats
|
||||
@@ -151,6 +153,45 @@ async def test_tablet_repair_task_list(manager: ManagerClient):
|
||||
|
||||
await asyncio.gather(run_repair(0, "test"), run_repair(1, "test2"), run_repair(2, "test3"), check_repair_task_list(tm, servers, module_name, ks))
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_tablet_repair_wait(manager: ManagerClient):
|
||||
module_name = "tablets"
|
||||
tm = TaskManagerClient(manager.api)
|
||||
|
||||
stop_repair_injection = "repair_tablet_repair_task_impl_run"
|
||||
servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager)
|
||||
assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered"
|
||||
|
||||
await inject_error_on(manager, stop_repair_injection, servers)
|
||||
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", "all", await_completion=False)
|
||||
|
||||
repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", keyspace=ks)
|
||||
task = repair_tasks[0]
|
||||
|
||||
log = await manager.server_open_log(servers[0].server_id)
|
||||
mark = await log.mark()
|
||||
|
||||
async def wait_for_task():
|
||||
await enable_injection(manager, servers, "tablet_virtual_task_wait")
|
||||
status_wait = await tm.wait_for_task(servers[0].ip_addr, task.task_id)
|
||||
|
||||
async def merge_tablets():
|
||||
await log.wait_for('tablet_virtual_task: wait until tablet operation is finished', from_mark=mark)
|
||||
|
||||
# Resume repair.
|
||||
await message_injection(manager, servers, stop_repair_injection)
|
||||
|
||||
# Merge tablets.
|
||||
coord = await find_server_by_host_id(manager, servers, await get_topology_coordinator(manager))
|
||||
log2 = await manager.server_open_log(coord.server_id)
|
||||
await trigger_tablet_merge(manager, servers, [log2])
|
||||
|
||||
await read_barrier(manager.api, servers[0].ip_addr)
|
||||
await message_injection(manager, servers, "tablet_virtual_task_wait")
|
||||
|
||||
await asyncio.gather(wait_for_task(), merge_tablets())
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_tablet_repair_task_children(manager: ManagerClient):
|
||||
|
||||
70
test/cluster/test_bootstrap_with_quick_group0_join.py
Normal file
70
test/cluster/test_bootstrap_with_quick_group0_join.py
Normal file
@@ -0,0 +1,70 @@
|
||||
#
|
||||
# Copyright (C) 2026-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
#
|
||||
import logging
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
from test.cluster.util import get_current_group0_config
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.rest_client import read_barrier
|
||||
from test.pylib.util import wait_for
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_bootstrap_with_quick_group0_join(manager: ManagerClient):
|
||||
"""Regression test for https://scylladb.atlassian.net/browse/SCYLLADB-959.
|
||||
|
||||
The bug was that when the bootstrapping node joined group0 before reaching
|
||||
post_server_start, it skipped post_server_start and thus hung forever.
|
||||
|
||||
The test simulates the scenario by starting the second node with the
|
||||
join_group0_pause_before_config_check injection. Without the fix, the
|
||||
startup times out.
|
||||
"""
|
||||
logger.info("Adding first server")
|
||||
s1 = await manager.server_add()
|
||||
|
||||
logger.info("Adding second server with join_group0_pause_before_config_check enabled")
|
||||
s2 = await manager.server_add(start=False, config={
|
||||
'error_injections_at_startup': ['join_group0_pause_before_config_check']
|
||||
})
|
||||
|
||||
logger.info(f"Starting {s2}")
|
||||
start_task = asyncio.create_task(manager.server_start(s2.server_id))
|
||||
|
||||
s2_log = await manager.server_open_log(s2.server_id)
|
||||
|
||||
await s2_log.wait_for("join_group0_pause_before_config_check: waiting for message", timeout=60)
|
||||
|
||||
s1_host_id = await manager.get_host_id(s1.server_id)
|
||||
s2_host_id = await manager.get_host_id(s2.server_id)
|
||||
|
||||
async def s2_in_group0_config_on_s1():
|
||||
config = await get_current_group0_config(manager, s1)
|
||||
ids = {m[0] for m in config}
|
||||
assert s1_host_id in ids # sanity check
|
||||
return True if s2_host_id in ids else None
|
||||
|
||||
# Note: we would like to wait for s2 to see itself in the group0 config, but we can't execute
|
||||
# get_current_group0_config for s2, as s2 doesn't handle CQL requests at this point. As a workaround, we wait for s1
|
||||
# to see s2 and then perform a read barrier on s2.
|
||||
logger.info(f"Waiting for {s1} to see {s2} in the group0 config")
|
||||
await wait_for(s2_in_group0_config_on_s1, deadline=time.time() + 60, period=0.1)
|
||||
|
||||
logger.info(f"Performing read barrier on {s2} to make sure it sees itself in the group0 config")
|
||||
await read_barrier(manager.api, s2.ip_addr)
|
||||
|
||||
logger.info(f"Unblocking {s2}")
|
||||
await manager.api.message_injection(s2.ip_addr, 'join_group0_pause_before_config_check')
|
||||
|
||||
logger.info(f"Waiting for {s2} to complete bootstrap")
|
||||
await asyncio.wait_for(start_task, timeout=60)
|
||||
@@ -433,7 +433,8 @@ async def test_non_existant_table_master_key(manager: ManagerClient, tmpdir):
|
||||
|
||||
async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
|
||||
cfg = {"authenticator": "org.apache.cassandra.auth.PasswordAuthenticator",
|
||||
"authorizer": "org.apache.cassandra.auth.CassandraAuthorizer"}
|
||||
"authorizer": "org.apache.cassandra.auth.CassandraAuthorizer",
|
||||
"commitlog_sync": "batch" }
|
||||
|
||||
servers: list[ServerInfo] = await manager.servers_add(servers_num = 1, config=cfg,
|
||||
driver_connect_opts={'auth_provider': PlainTextAuthProvider(username='cassandra', password='cassandra')})
|
||||
@@ -450,11 +451,14 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
|
||||
file_paths = [f for f in file_paths if os.path.isfile(f) and not os.path.islink(f)]
|
||||
|
||||
for file_path in file_paths:
|
||||
with open(file_path, 'rb') as f:
|
||||
data = f.read()
|
||||
if pbytes in data:
|
||||
pattern_found_counter += 1
|
||||
logger.debug("Pattern '%s' found in %s", pattern, file_path)
|
||||
try:
|
||||
with open(file_path, 'rb') as f:
|
||||
data = f.read()
|
||||
if pbytes in data:
|
||||
pattern_found_counter += 1
|
||||
logger.debug("Pattern '%s' found in %s", pattern, file_path)
|
||||
except FileNotFoundError:
|
||||
pass # assume just compacted away
|
||||
|
||||
if expect:
|
||||
assert pattern_found_counter > 0
|
||||
@@ -462,15 +466,15 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
|
||||
assert pattern_found_counter == 0
|
||||
|
||||
async def verify_system_info(expect: bool):
|
||||
user = f"user_{str(uuid.uuid4())}"
|
||||
user = f"user_{str(uuid.uuid4())}".replace('-','_')
|
||||
pwd = f"pwd_{str(uuid.uuid4())}"
|
||||
cql.execute(f"CREATE USER {user} WITH PASSWORD '{pwd}' NOSUPERUSER")
|
||||
assert_one(cql, f"LIST ROLES of {user}", [user, False, True, {}])
|
||||
|
||||
logger.debug("Verify PART 1: check commitlogs -------------")
|
||||
|
||||
grep_database_files(pwd, "commitlog", "**/*.log", expect)
|
||||
grep_database_files(user, "commitlog", "**/*.log", True)
|
||||
await grep_database_files(pwd, "commitlog", "**/*.log", False)
|
||||
await grep_database_files(user, "commitlog", "**/*.log", expect)
|
||||
|
||||
salted_hash = None
|
||||
system_auth = None
|
||||
@@ -487,39 +491,38 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
|
||||
|
||||
assert salted_hash is not None
|
||||
assert system_auth is not None
|
||||
grep_database_files(salted_hash, "commitlog", "**/*.log", expect)
|
||||
await grep_database_files(salted_hash, "commitlog", "**/*.log", expect)
|
||||
|
||||
rand_comment = f"comment_{str(uuid.uuid4())}"
|
||||
|
||||
async with await create_ks(manager) as ks:
|
||||
async with await new_test_table(cql, ks, "key text PRIMARY KEY, c1 text, c2 text") as table:
|
||||
async with new_test_table(manager, ks, "key text PRIMARY KEY, c1 text, c2 text") as table:
|
||||
cql.execute(f"ALTER TABLE {table} WITH comment = '{rand_comment}'")
|
||||
grep_database_files(rand_comment, "commitlog/schema", "**/*.log", expect)
|
||||
nodetool.flush_all(cql)
|
||||
await grep_database_files(rand_comment, "commitlog/schema", "**/*.log", expect)
|
||||
# Note: original test did greping in sstables. This does no longer work
|
||||
# since all system tables are compressed, and thus binary greping will
|
||||
# not work. We could do scylla sstable dump-data and grep in the json,
|
||||
# but this is somewhat pointless as this would, if it handles it, just
|
||||
# decrypt the info from the sstable, thus we can't really verify anything.
|
||||
# We could maybe check that the expected system tables are in fact encrypted,
|
||||
# though this is more a promise than guarantee... Also, the only tables
|
||||
# encrypted are paxos and batchlog -> pointless
|
||||
|
||||
logger.debug("Verify PART 2: check sstable files -------------\n`system_info_encryption` won't encrypt sstable files on disk")
|
||||
logger.debug("GREP_DB_FILES: Check PM key user in sstable file ....")
|
||||
grep_database_files(user, f"data/{system_auth}/", "**/*-Data.db", expect=True)
|
||||
logger.debug("GREP_DB_FILES: Check original password in commitlogs .... Original password should never be saved")
|
||||
grep_database_files(pwd, f"data/{system_auth}/", "**/*-Data.db", expect=False)
|
||||
logger.debug("GREP_DB_FILES: Check salted_hash of password in sstable file ....")
|
||||
grep_database_files(salted_hash, f"data/{system_auth}/", "**/*-Data.db", expect=False)
|
||||
logger.debug("GREP_DB_FILES: Check table comment in sstable file ....")
|
||||
grep_database_files(rand_comment, "data/system_schema/", "**/*-Data.db", expect=True)
|
||||
|
||||
verify_system_info(True) # not encrypted
|
||||
await verify_system_info(True) # not encrypted
|
||||
|
||||
cfg = {"system_info_encryption": {
|
||||
"enabled": True,
|
||||
"key_provider": "LocalFileSystemKeyProviderFactory"}
|
||||
"key_provider": "LocalFileSystemKeyProviderFactory"},
|
||||
"system_key_directory": os.path.join(tmpdir, "resources/system_keys")
|
||||
}
|
||||
|
||||
for server in servers:
|
||||
manager.server_update_config(server.server_id, config_options=cfg)
|
||||
await manager.server_update_config(server.server_id, config_options=cfg)
|
||||
await manager.server_restart(server.server_id)
|
||||
|
||||
await manager.rolling_restart(servers)
|
||||
|
||||
verify_system_info(False) # should not see stuff now
|
||||
await verify_system_info(False) # should not see stuff now
|
||||
|
||||
|
||||
async def test_system_encryption_reboot(manager: ManagerClient, tmpdir):
|
||||
|
||||
@@ -609,14 +609,19 @@ async def do_test_tablet_incremental_repair_merge_error(manager, error):
|
||||
|
||||
scylla_path = get_scylla_path(cql)
|
||||
|
||||
coord = await get_topology_coordinator(manager)
|
||||
coord_serv = await find_server_by_host_id(manager, servers, coord)
|
||||
coord_log = await manager.server_open_log(coord_serv.server_id)
|
||||
|
||||
# Trigger merge and error in merge
|
||||
s1_mark = await logs[0].mark()
|
||||
await inject_error_on(manager, error, servers[:1])
|
||||
mark = await coord_log.mark()
|
||||
await inject_error_on(manager, error, [coord_serv])
|
||||
await inject_error_on(manager, "tablet_force_tablet_count_decrease", servers)
|
||||
await logs[0].wait_for(f'Got {error}', from_mark=s1_mark)
|
||||
await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
|
||||
await coord_log.wait_for(f'Got {error}', from_mark=mark)
|
||||
await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
|
||||
await manager.server_stop(servers[0].server_id)
|
||||
await manager.server_start(servers[0].server_id)
|
||||
await manager.server_stop(coord_serv.server_id)
|
||||
await manager.server_start(coord_serv.server_id)
|
||||
|
||||
for server in servers:
|
||||
await manager.server_stop_gracefully(server.server_id)
|
||||
@@ -862,50 +867,6 @@ async def test_repair_sigsegv_with_diff_shard_count(manager: ManagerClient, use_
|
||||
logger.info("Starting vnode repair")
|
||||
await manager.api.repair(servers[1].ip_addr, ks, "test")
|
||||
|
||||
# Reproducer for https://github.com/scylladb/scylladb/issues/27365
|
||||
# Incremental repair vs tablet merge
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_tablet_incremental_repair_tablet_merge_compaction_group_gone(manager: ManagerClient):
|
||||
cmdline = ['--logger-log-level', 'repair=debug']
|
||||
servers, cql, hosts, ks, table_id, logs, _, _, _, _ = await preapre_cluster_for_incremental_repair(manager, cmdline=cmdline)
|
||||
|
||||
coord = await get_topology_coordinator(manager)
|
||||
coord_serv = await find_server_by_host_id(manager, servers, coord)
|
||||
coord_log = await manager.server_open_log(coord_serv.server_id)
|
||||
|
||||
# Trigger merge and wait until the merge fiber starts
|
||||
s1_mark = await coord_log.mark()
|
||||
await inject_error_on(manager, "merge_completion_fiber", servers)
|
||||
await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
|
||||
await inject_error_on(manager, "tablet_force_tablet_count_decrease", servers)
|
||||
await coord_log.wait_for(f'Detected tablet merge for table', from_mark=s1_mark)
|
||||
await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
|
||||
await coord_log.wait_for(f'merge_completion_fiber: waiting for message', from_mark=s1_mark)
|
||||
|
||||
# Trigger repair and wait for the inc repair prepare preparation to start
|
||||
s1_mark = await coord_log.mark()
|
||||
await inject_error_on(manager, "wait_after_prepare_sstables_for_incremental_repair", servers)
|
||||
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token=-1, await_completion=False, incremental_mode='incremental')
|
||||
# Wait for preparation to start.
|
||||
await coord_log.wait_for('Disabling compaction for range', from_mark=s1_mark)
|
||||
# Without the serialization, sleep to increase chances of preparation finishing before merge fiber.
|
||||
# With the serialization, preparation will wait for merge fiber to finish.
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
# Continue to execute the merge fiber so that the compaction group is removed
|
||||
await inject_error_on(manager, "replica_merge_completion_wait", servers)
|
||||
for s in servers:
|
||||
await manager.api.message_injection(s.ip_addr, "merge_completion_fiber")
|
||||
|
||||
await coord_log.wait_for(f'Merge completion fiber finished', from_mark=s1_mark)
|
||||
|
||||
# Continue the repair to trigger use-after-free
|
||||
for s in servers:
|
||||
await manager.api.message_injection(s.ip_addr, "wait_after_prepare_sstables_for_incremental_repair")
|
||||
|
||||
await coord_log.wait_for(f'Finished tablet repair', from_mark=s1_mark)
|
||||
|
||||
# Reproducer for https://github.com/scylladb/scylladb/issues/27365
|
||||
# Incremental repair vs table drop
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@@ -162,7 +162,12 @@ async def do_test_internode_compression_between_datacenters(manager: ManagerClie
|
||||
|
||||
await asyncio.gather(*[manager.server_stop(s.server_id) for s,_ in servers])
|
||||
await asyncio.gather(*[p.stop() for p in proxies])
|
||||
|
||||
# these will all except, because we just stopped them above
|
||||
for coro in proxy_futs:
|
||||
try:
|
||||
await coro
|
||||
except:
|
||||
pass
|
||||
|
||||
async def test_internode_compression_compress_packets_between_nodes(request, manager: ManagerClient) -> None:
|
||||
def check_expected(msg_size, node1_proxy, node2_proxy, node3_proxy):
|
||||
|
||||
65
test/cluster/test_prepare_race.py
Normal file
65
test/cluster/test_prepare_race.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#
|
||||
# Copyright (C) 2026-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import pytest
|
||||
|
||||
from test.cluster.util import new_test_keyspace, new_test_table
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.rest_client import inject_error_one_shot
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
|
||||
async def test_prepare_fails_if_cached_statement_is_invalidated_mid_prepare(manager: ManagerClient):
|
||||
server = await manager.server_add()
|
||||
cql = manager.get_cql()
|
||||
log = await manager.server_open_log(server.server_id)
|
||||
|
||||
async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};") as ks:
|
||||
async with new_test_table(manager, ks, "pk int PRIMARY KEY") as table:
|
||||
query = f"SELECT * FROM {table} WHERE pk = ?"
|
||||
loop = asyncio.get_running_loop()
|
||||
await cql.run_async(f"INSERT INTO {table} (pk) VALUES (7)")
|
||||
await cql.run_async(f"INSERT INTO {table} (pk) VALUES (8)")
|
||||
|
||||
handler = await inject_error_one_shot(manager.api, server.ip_addr, "query_processor_prepare_wait_after_cache_get")
|
||||
mark = await log.mark()
|
||||
prepare_future = loop.run_in_executor(None, lambda: cql.prepare(query))
|
||||
await log.wait_for("query_processor_prepare_wait_after_cache_get: waiting for message", from_mark=mark, timeout=60)
|
||||
|
||||
# Trigger table schema update (metadata-only) to invalidate prepared statements while PREPARE is paused.
|
||||
await cql.run_async(f"ALTER TABLE {table} WITH comment = 'invalidate-prepared-race'")
|
||||
|
||||
await handler.message()
|
||||
done, _ = await asyncio.wait({prepare_future}, timeout=15)
|
||||
if not done:
|
||||
pytest.fail("Timed out waiting for PREPARE to complete after signaling injection")
|
||||
|
||||
result = done.pop().result()
|
||||
print(f"PREPARE succeeded as expected: {result!r}")
|
||||
|
||||
rows = cql.execute(result, [7])
|
||||
row = rows.one()
|
||||
assert row is not None and row.pk == 7
|
||||
|
||||
# Invalidate prepared statements again, then execute the same prepared object.
|
||||
# The driver should transparently re-prepare and re-request execution.
|
||||
await cql.run_async(f"ALTER TABLE {table} WITH comment = 'invalidate-prepared-race-again'")
|
||||
|
||||
reprepare_handler = await inject_error_one_shot(manager.api, server.ip_addr, "query_processor_prepare_wait_after_cache_get")
|
||||
reprepare_mark = await log.mark()
|
||||
execute_future = loop.run_in_executor(None, lambda: cql.execute(result, [8]))
|
||||
await log.wait_for("query_processor_prepare_wait_after_cache_get: waiting for message", from_mark=reprepare_mark, timeout=60)
|
||||
|
||||
await reprepare_handler.message()
|
||||
execute_done, _ = await asyncio.wait({execute_future}, timeout=15)
|
||||
if not execute_done:
|
||||
pytest.fail("Timed out waiting for driver execute to finish after re-prepare signaling")
|
||||
|
||||
retried_rows = execute_done.pop().result()
|
||||
retried_row = retried_rows.one()
|
||||
assert retried_row is not None and retried_row.pk == 8
|
||||
@@ -16,8 +16,10 @@ import pytest
|
||||
import socket
|
||||
import ssl
|
||||
import struct
|
||||
import time
|
||||
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.util import wait_for
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -269,6 +271,28 @@ async def send_cql_with_proxy_header_tls(
|
||||
sock.close()
|
||||
|
||||
|
||||
async def wait_for_results(cql, query: str, expected_count: int, timeout: float = 30.0, filter_fn=None):
|
||||
"""
|
||||
Polls `query` until at least `expected_count` rows satisfy `filter_fn` (all rows if no filter is given).
|
||||
On timeout, logs the full result set from the last poll to aid debugging.
|
||||
"""
|
||||
last_rows: list = []
|
||||
|
||||
async def check_resultset():
|
||||
nonlocal last_rows
|
||||
last_rows = list(await cql.run_async(query))
|
||||
matching = filter_fn(last_rows) if filter_fn is not None else last_rows
|
||||
if len(matching) >= expected_count:
|
||||
return matching
|
||||
return None
|
||||
|
||||
try:
|
||||
return await wait_for(check_resultset, time.time() + timeout, period=0.1)
|
||||
except Exception:
|
||||
logger.error('Timed out waiting for %d matching rows in system.clients. Last poll returned %d total rows:\n%s',
|
||||
expected_count, len(last_rows),'\n'.join(str(r) for r in last_rows))
|
||||
raise
|
||||
|
||||
# Shared server configuration for all tests
|
||||
# We configure explicit SSL ports to keep the standard ports unencrypted
|
||||
# so the Python driver can connect without TLS.
|
||||
@@ -368,9 +392,12 @@ async def test_proxy_protocol_shard_aware(proxy_server):
|
||||
await do_cql_handshake(reader, writer)
|
||||
|
||||
# Now query system.clients to verify shard assignments
|
||||
rows = list(cql.execute(
|
||||
f"SELECT address, port, shard_id FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
|
||||
))
|
||||
rows = await wait_for_results(
|
||||
cql,
|
||||
'SELECT address, port, shard_id FROM system.clients',
|
||||
expected_count=num_shards,
|
||||
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
|
||||
)
|
||||
|
||||
# Build a map of port -> shard_id from the results
|
||||
port_to_shard = {row.port: row.shard_id for row in rows}
|
||||
@@ -446,9 +473,12 @@ async def test_proxy_protocol_port_preserved_in_system_clients(proxy_server):
|
||||
|
||||
# Now query system.clients using the driver to see our connection
|
||||
cql = manager.get_cql()
|
||||
rows = list(cql.execute(
|
||||
f"SELECT address, port FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
|
||||
))
|
||||
rows = await wait_for_results(
|
||||
cql,
|
||||
'SELECT address, port FROM system.clients',
|
||||
expected_count=1,
|
||||
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
|
||||
)
|
||||
|
||||
# We should find our connection with the fake source address and port
|
||||
assert len(rows) > 0, f"Expected to find connection from {fake_src_addr} in system.clients"
|
||||
@@ -569,9 +599,12 @@ async def test_proxy_protocol_ssl_shard_aware(proxy_server):
|
||||
ssl_sock.recv(4096)
|
||||
|
||||
# Now query system.clients to verify shard assignments
|
||||
rows = list(cql.execute(
|
||||
f"SELECT address, port, shard_id, ssl_enabled FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
|
||||
))
|
||||
rows = await wait_for_results(
|
||||
cql,
|
||||
'SELECT address, port, shard_id, ssl_enabled FROM system.clients',
|
||||
expected_count=num_shards,
|
||||
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
|
||||
)
|
||||
|
||||
# Build a map of port -> (shard_id, ssl_enabled) from the results
|
||||
port_to_info = {row.port: (row.shard_id, row.ssl_enabled) for row in rows}
|
||||
@@ -656,9 +689,12 @@ async def test_proxy_protocol_ssl_port_preserved(proxy_server):
|
||||
|
||||
# Now query system.clients using the driver to see our connection
|
||||
cql = manager.get_cql()
|
||||
rows = list(cql.execute(
|
||||
f"SELECT address, port, ssl_enabled FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
|
||||
))
|
||||
rows = await wait_for_results(
|
||||
cql,
|
||||
'SELECT address, port, ssl_enabled FROM system.clients',
|
||||
expected_count=1,
|
||||
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
|
||||
)
|
||||
|
||||
# We should find our connection
|
||||
assert len(rows) > 0, f"Expected to find connection from {fake_src_addr} in system.clients"
|
||||
|
||||
@@ -7,6 +7,7 @@ import logging
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
from test.pylib.internal_types import ServerNum
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.conftest import skip_mode
|
||||
from test.pylib.rest_client import inject_error_one_shot, InjectionHandler, read_barrier
|
||||
@@ -20,6 +21,20 @@ def fixture_raft_op_timeout(build_mode):
|
||||
return 10000 if build_mode == 'debug' else 1000
|
||||
|
||||
|
||||
async def update_group0_raft_op_timeout(server_id: ServerNum, manager: ManagerClient, timeout: int) -> None:
|
||||
logger.info(f"Updating group0_raft_op_timeout_in_ms on server {server_id} to {timeout}")
|
||||
running_ids = [srv.server_id for srv in await manager.running_servers()]
|
||||
if server_id in running_ids:
|
||||
# If the node is alive, server_update_config only sends the SIGHUP signal to the Scylla process, so awaiting it
|
||||
# doesn't guarantee that the new config file is active. Work around this by looking at the logs.
|
||||
log_file = await manager.server_open_log(server_id)
|
||||
mark = await log_file.mark()
|
||||
await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
|
||||
await log_file.wait_for("completed re-reading configuration file", from_mark=mark, timeout=60)
|
||||
else:
|
||||
await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
@pytest.mark.skip_mode(mode='debug', reason='aarch64/debug is unpredictably slow', platform_key='aarch64')
|
||||
@@ -42,7 +57,6 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
|
||||
|
||||
config = {
|
||||
'direct_failure_detector_ping_timeout_in_ms': 300,
|
||||
'group0_raft_op_timeout_in_ms': raft_op_timeout,
|
||||
'error_injections_at_startup': [
|
||||
{
|
||||
'name': 'raft-group-registry-fd-threshold-in-ms',
|
||||
@@ -64,6 +78,10 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
|
||||
manager.server_stop_gracefully(servers[3].server_id),
|
||||
manager.server_stop_gracefully(servers[4].server_id))
|
||||
|
||||
# Do it here to prevent unexpected timeouts before quorum loss.
|
||||
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout)
|
||||
for srv in servers[:2]))
|
||||
|
||||
logger.info("starting a sixth node with no quorum")
|
||||
await manager.server_add(expected_error="raft operation \\[read_barrier\\] timed out, there is no raft quorum",
|
||||
timeout=60)
|
||||
@@ -76,7 +94,6 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
|
||||
@pytest.mark.skip_mode(mode='debug', reason='aarch64/debug is unpredictably slow', platform_key='aarch64')
|
||||
async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_timeout: int) -> None:
|
||||
config = {
|
||||
'group0_raft_op_timeout_in_ms': raft_op_timeout,
|
||||
'error_injections_at_startup': [
|
||||
{
|
||||
'name': 'raft-group-registry-fd-threshold-in-ms',
|
||||
@@ -107,6 +124,9 @@ async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_time
|
||||
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
|
||||
manager.server_stop_gracefully(servers[2].server_id))
|
||||
|
||||
# Do it here to prevent unexpected timeouts before quorum loss.
|
||||
await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
|
||||
|
||||
logger.info("release join-node-before-add-entry injection")
|
||||
await injection_handler.message()
|
||||
|
||||
@@ -126,7 +146,6 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
|
||||
|
||||
logger.info("adding a fourth node")
|
||||
servers += [await manager.server_add(config={
|
||||
'group0_raft_op_timeout_in_ms': raft_op_timeout,
|
||||
'error_injections_at_startup': [
|
||||
{
|
||||
'name': 'raft-group-registry-fd-threshold-in-ms',
|
||||
@@ -153,6 +172,9 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
|
||||
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
|
||||
manager.server_stop_gracefully(servers[2].server_id))
|
||||
|
||||
# Do it here to prevent unexpected timeouts before quorum loss.
|
||||
await update_group0_raft_op_timeout(servers[3].server_id, manager, raft_op_timeout)
|
||||
|
||||
logger.info("release join-node-response_handler-before-read-barrier injection")
|
||||
injection_handler = InjectionHandler(manager.api,
|
||||
'join-node-response_handler-before-read-barrier',
|
||||
@@ -169,7 +191,6 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
|
||||
async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: int) -> None:
|
||||
logger.info("starting a first node (the leader)")
|
||||
servers = [await manager.server_add(config={
|
||||
'group0_raft_op_timeout_in_ms': raft_op_timeout,
|
||||
'error_injections_at_startup': [
|
||||
{
|
||||
'name': 'raft-group-registry-fd-threshold-in-ms',
|
||||
@@ -189,6 +210,9 @@ async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: in
|
||||
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
|
||||
manager.server_stop_gracefully(servers[2].server_id))
|
||||
|
||||
# Do it here to prevent unexpected timeouts before quorum loss.
|
||||
await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
|
||||
|
||||
logger.info("attempting removenode for the second node")
|
||||
await manager.remove_node(servers[0].server_id, servers[1].server_id,
|
||||
expected_error="raft operation [read_barrier] timed out, there is no raft quorum",
|
||||
@@ -232,9 +256,7 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None
|
||||
await asyncio.gather(*(manager.server_stop(srv.server_id) for srv in servers))
|
||||
|
||||
# This ensures the read barriers below fail quickly without group 0 quorum.
|
||||
logger.info(f"Decreasing group0_raft_op_timeout_in_ms on {servers}")
|
||||
await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', raft_op_timeout)
|
||||
for srv in servers))
|
||||
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout) for srv in servers))
|
||||
|
||||
logger.info(f"Restarting {servers[:2]} with no group 0 quorum")
|
||||
for idx, srv in enumerate(servers[:2]):
|
||||
@@ -246,8 +268,7 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None
|
||||
|
||||
# Increase the timeout back to 300s to ensure the new group 0 leader is elected before the first read barrier below
|
||||
# times out.
|
||||
await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', 300000)
|
||||
for srv in servers))
|
||||
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, 300000) for srv in servers))
|
||||
|
||||
logger.info(f"Restarting {servers[2:]} with group 0 quorum")
|
||||
for srv in servers[2:]:
|
||||
|
||||
@@ -978,7 +978,7 @@ async def test_tablets_merge_waits_for_lwt(manager: ManagerClient):
|
||||
await wait_for_tablet_count(manager, s0, ks, 'test', lambda c: c == 1, 1, timeout_s=15)
|
||||
|
||||
logger.info("Ensure the guard decided to retain the erm")
|
||||
await log0.wait_for("tablet_metadata_guard::check: retain the erm and abort the guard",
|
||||
m, _ = await log0.wait_for("tablet_metadata_guard::check: retain the erm and abort the guard",
|
||||
from_mark=m, timeout=10)
|
||||
|
||||
tablets = await get_all_tablet_replicas(manager, s0, ks, 'test')
|
||||
@@ -986,7 +986,11 @@ async def test_tablets_merge_waits_for_lwt(manager: ManagerClient):
|
||||
tablet = tablets[0]
|
||||
assert tablet.replicas == [(s0_host_id, 0)]
|
||||
|
||||
m = await log0.mark()
|
||||
# Since merge now waits for erms before releasing the state machine,
|
||||
# the migration initiated below will not start until paxos released the erm.
|
||||
# The barrier which is blocked is the one in merge finalization.
|
||||
# I keep the tablet movement as a guard against regressions in case the behavior changes.
|
||||
|
||||
migration_task = asyncio.create_task(manager.api.move_tablet(s0.ip_addr, ks, "test",
|
||||
s0_host_id, 0,
|
||||
s0_host_id, 1,
|
||||
|
||||
@@ -441,84 +441,6 @@ async def test_tablet_split_merge_with_many_tables(build_mode: str, manager: Man
|
||||
|
||||
await check_logs("after merge completion")
|
||||
|
||||
# Reproduces use-after-free when migration right after merge, but concurrently to background
|
||||
# merge completion handler.
|
||||
# See: https://github.com/scylladb/scylladb/issues/24045
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_migration_running_concurrently_to_merge_completion_handling(manager: ManagerClient):
|
||||
cmdline = []
|
||||
# Size based balancing can attempt to migrate the merged tablet as soon as the merge is complete
|
||||
# because of a lower transient effective_capacity on the node with the merged tablet.
|
||||
# This migration will timeout on cleanup because the compaction group still has an active task,
|
||||
# which is held by the merge_completion_fiber injection, so the tablet's compaction group gate
|
||||
# can not be closed, resulting in cleanup getting stuck. We force capacity based balancing to
|
||||
# avoid this problem.
|
||||
cfg = {'force_capacity_based_balancing': True}
|
||||
servers = [await manager.server_add(cmdline=cmdline, config=cfg)]
|
||||
|
||||
await manager.disable_tablet_balancing()
|
||||
|
||||
cql = manager.get_cql()
|
||||
|
||||
async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") as ks:
|
||||
await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);")
|
||||
|
||||
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
|
||||
assert tablet_count == 2
|
||||
|
||||
old_tablet_count = tablet_count
|
||||
|
||||
keys = range(100)
|
||||
await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys])
|
||||
|
||||
await cql.run_async(f"ALTER KEYSPACE {ks} WITH tablets = {{'initial': 1}};")
|
||||
|
||||
s0_log = await manager.server_open_log(servers[0].server_id)
|
||||
s0_mark = await s0_log.mark()
|
||||
|
||||
await manager.api.enable_injection(servers[0].ip_addr, "merge_completion_fiber", one_shot=True)
|
||||
await manager.api.enable_injection(servers[0].ip_addr, "replica_merge_completion_wait", one_shot=True)
|
||||
await manager.enable_tablet_balancing()
|
||||
|
||||
servers.append(await manager.server_add(cmdline=cmdline, config=cfg))
|
||||
s1_host_id = await manager.get_host_id(servers[1].server_id)
|
||||
|
||||
async def finished_merging():
|
||||
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
|
||||
return tablet_count < old_tablet_count or None
|
||||
|
||||
await wait_for(finished_merging, time.time() + 120)
|
||||
|
||||
await manager.disable_tablet_balancing()
|
||||
await manager.api.enable_injection(servers[0].ip_addr, "take_storage_snapshot", one_shot=True)
|
||||
|
||||
await s0_log.wait_for(f"merge_completion_fiber: waiting", from_mark=s0_mark)
|
||||
|
||||
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
|
||||
assert tablet_count == 1
|
||||
|
||||
tablet_token = 0 # Doesn't matter since there is one tablet
|
||||
replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token)
|
||||
|
||||
s0_host_id = await manager.get_host_id(servers[0].server_id)
|
||||
src_shard = replica[1]
|
||||
dst_shard = src_shard
|
||||
|
||||
migration = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], src_shard, s1_host_id, dst_shard, tablet_token))
|
||||
|
||||
await s0_log.wait_for(f"take_storage_snapshot: waiting", from_mark=s0_mark)
|
||||
|
||||
await manager.api.message_injection(servers[0].ip_addr, "merge_completion_fiber")
|
||||
await s0_log.wait_for(f"Merge completion fiber finished", from_mark=s0_mark)
|
||||
|
||||
await manager.api.message_injection(servers[0].ip_addr, "take_storage_snapshot")
|
||||
|
||||
await migration
|
||||
|
||||
rows = await cql.run_async(f"SELECT * FROM {ks}.test;")
|
||||
assert len(rows) == len(keys)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_missing_data(manager: ManagerClient):
|
||||
@@ -655,3 +577,77 @@ async def test_merge_with_drop(manager: ManagerClient):
|
||||
await asyncio.sleep(0.1)
|
||||
await manager.api.message_injection(server.ip_addr, "compaction_group_stop_wait")
|
||||
await drop_table_fut
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_background_merge_deadlock(manager: ManagerClient):
|
||||
"""
|
||||
Reproducer for https://scylladb.atlassian.net/browse/SCYLLADB-928
|
||||
|
||||
Reproduces a deadlock in the background merge completion handler that can happen when multiple merges accumulate.
|
||||
If we accumulate more than 1 merge cycle for the fiber, deadlock occurs due to compaction lock taken
|
||||
on the main group (post-merge). The lock is held until compaction groups are precessed by the background merge
|
||||
fiber
|
||||
|
||||
Example:
|
||||
|
||||
Initial state:
|
||||
|
||||
cg0: main,
|
||||
cg1: main
|
||||
cg2: main
|
||||
cg3: main
|
||||
|
||||
After 1st merge:
|
||||
|
||||
cg0': main [locked], merging_groups=[cg0.main, cg1.main]
|
||||
cg1': main [locked], merging_groups=[cg2.main, cg3.main]
|
||||
|
||||
After 2nd merge:
|
||||
|
||||
cg0'': main [locked], merging_groups=[cg0'.main [locked], cg0.main, cg1.main, cg1'.main [locked], cg2.main, cg3.main]
|
||||
|
||||
The test reproduces this by doing a tablet merge from 8 tablets to 1 (8 -> 4 -> 2 -> 1). The background merge fiber
|
||||
is blocked until after the first merge (to 4), so that there is a higher chance of two merges queueing in the fiber.
|
||||
|
||||
If deadlock occurs, node shutdown will hang waiting for the background merge fiber. That's why the test
|
||||
tries to stop the node at the end.
|
||||
"""
|
||||
|
||||
cmdline = [
|
||||
'--logger-log-level', 'load_balancer=debug',
|
||||
'--logger-log-level', 'raft_topology=debug',
|
||||
]
|
||||
|
||||
servers = [await manager.server_add(cmdline=cmdline)]
|
||||
cql, _ = await manager.get_ready_cql(servers)
|
||||
|
||||
ks = await create_new_test_keyspace(cql, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}")
|
||||
|
||||
# Create a table which will go through 3 merge cycles.
|
||||
await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) with tablets = {{'min_tablet_count': 8}};")
|
||||
|
||||
await manager.api.enable_injection(servers[0].ip_addr, "merge_completion_fiber", one_shot=True)
|
||||
log = await manager.server_open_log(servers[0].server_id)
|
||||
mark = await log.mark()
|
||||
|
||||
# Trigger tablet merging
|
||||
await cql.run_async(f"ALTER TABLE {ks}.test WITH tablets = {{'min_tablet_count': 1}};")
|
||||
|
||||
async def produced_one_merge():
|
||||
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
|
||||
return tablet_count == 4 or None
|
||||
await wait_for(produced_one_merge, time.time() + 120)
|
||||
|
||||
mark, _ = await log.wait_for(f"merge_completion_fiber: waiting", from_mark=mark)
|
||||
await manager.api.message_injection(servers[0].ip_addr, "merge_completion_fiber")
|
||||
mark, _ = await log.wait_for(f"merge_completion_fiber: message received", from_mark=mark)
|
||||
|
||||
async def finished_merge():
|
||||
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
|
||||
return tablet_count == 1 or None
|
||||
|
||||
await wait_for(finished_merge, time.time() + 120)
|
||||
|
||||
await manager.server_stop(servers[0].server_id)
|
||||
|
||||
@@ -94,6 +94,8 @@ async def test_remove_garbage_group0_members(manager: ManagerClient):
|
||||
logging.info(f'stop {servers[1]}')
|
||||
await manager.server_stop_gracefully(servers[1].server_id)
|
||||
|
||||
await wait_for_token_ring_and_group0_consistency(manager, time.time() + 60)
|
||||
|
||||
logging.info(f'removenode {servers[1]} using {servers[2]}')
|
||||
await manager.remove_node(servers[2].server_id, servers[1].server_id)
|
||||
|
||||
|
||||
@@ -559,6 +559,9 @@ private:
|
||||
cfg->ring_delay_ms.set(500);
|
||||
cfg->shutdown_announce_in_ms.set(0);
|
||||
cfg->broadcast_to_all_shards().get();
|
||||
smp::invoke_on_all([&] {
|
||||
sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
|
||||
}).get();
|
||||
create_directories((data_dir_path + "/system").c_str());
|
||||
create_directories(cfg->commitlog_directory().c_str());
|
||||
create_directories(cfg->schema_commitlog_directory().c_str());
|
||||
|
||||
@@ -449,3 +449,68 @@ def test_repair_incremenatal_repair(nodetool, mode):
|
||||
Starting repair with task_id={id1} keyspace=ks table=table1
|
||||
Repair with task_id={id1} finished
|
||||
"""
|
||||
|
||||
def test_cluster_repair_table_dropped(nodetool):
|
||||
id1 = "ef1b7a61-66c8-494c-bb03-6f65724e6eee"
|
||||
res = nodetool("cluster", "repair", "ks", expected_requests=[
|
||||
expected_request("GET", "/storage_service/keyspaces", response=["ks"]),
|
||||
expected_request("GET", "/storage_service/keyspaces", params={"replication": "tablets"}, response=["ks"]),
|
||||
expected_request("GET", "/column_family", response=[{"ks": "ks", "cf": "table1"}, {"ks": "ks", "cf": "table2"}]),
|
||||
expected_request(
|
||||
"POST",
|
||||
"/storage_service/tablets/repair",
|
||||
params={
|
||||
"ks": "ks",
|
||||
"table": "table1",
|
||||
"tokens": "all"},
|
||||
response={"message": "Can't find a column family table1 in keyspace ks", "code": 400}, response_status=400),
|
||||
expected_request(
|
||||
"POST",
|
||||
"/storage_service/tablets/repair",
|
||||
params={
|
||||
"ks": "ks",
|
||||
"table": "table2",
|
||||
"tokens": "all"},
|
||||
response={"tablet_task_id": id1}),
|
||||
expected_request(
|
||||
"GET",
|
||||
f"/task_manager/wait_task/{id1}",
|
||||
response={"state": "done"}),
|
||||
])
|
||||
|
||||
assert _remove_log_timestamp(res.stdout) == f"""\
|
||||
Starting repair with task_id={id1} keyspace=ks table=table2
|
||||
Repair with task_id={id1} finished
|
||||
"""
|
||||
|
||||
def test_cluster_repair_specified_table_dropped(nodetool):
|
||||
id1 = "ef1b7a61-66c8-494c-bb03-6f65724e6eee"
|
||||
check_nodetool_fails_with_error_contains(
|
||||
nodetool,
|
||||
("cluster", "repair", "ks", "table1", "table2"),
|
||||
{"expected_requests": [
|
||||
expected_request("GET", "/storage_service/keyspaces", response=["ks"]),
|
||||
expected_request("GET", "/storage_service/keyspaces", params={"replication": "tablets"}, response=["ks"]),
|
||||
expected_request(
|
||||
"POST",
|
||||
"/storage_service/tablets/repair",
|
||||
params={
|
||||
"ks": "ks",
|
||||
"table": "table1",
|
||||
"tokens": "all"},
|
||||
response={"message": "Can't find a column family table1 in keyspace ks", "code": 400}, response_status=400),
|
||||
expected_request(
|
||||
"POST",
|
||||
"/storage_service/tablets/repair",
|
||||
params={
|
||||
"ks": "ks",
|
||||
"table": "table2",
|
||||
"tokens": "all"},
|
||||
response={"tablet_task_id": id1}),
|
||||
expected_request(
|
||||
"GET",
|
||||
f"/task_manager/wait_task/{id1}",
|
||||
response={"state": "done"}),
|
||||
]
|
||||
},
|
||||
[f"Can't find a column family table1 in keyspace ks"])
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <memory>
|
||||
#include <signal.h>
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/sleep.hh>
|
||||
#include <seastar/core/thread.hh>
|
||||
#include <seastar/core/app-template.hh>
|
||||
#include <seastar/http/client.hh>
|
||||
@@ -78,6 +79,23 @@ static future<> make_request(http::experimental::client& cli, sstring operation,
|
||||
});
|
||||
}
|
||||
|
||||
static void wait_for_alternator(const test_config& c) {
|
||||
for (int attempt = 0; attempt < 3000; ++attempt) {
|
||||
try {
|
||||
auto cli = get_client(c);
|
||||
auto close = defer([&] { cli.close().get(); });
|
||||
make_request(cli, "ListTables", "{}").get();
|
||||
return;
|
||||
} catch (...) {
|
||||
}
|
||||
seastar::sleep(std::chrono::milliseconds(100)).get();
|
||||
if (attempt >= 100 && attempt % 10 == 0) {
|
||||
std::cout << fmt::format("Retrying connect to alternator port (attempt {})", attempt + 1) << std::endl;
|
||||
}
|
||||
}
|
||||
throw std::runtime_error("Timed out waiting for alternator port to become ready");
|
||||
}
|
||||
|
||||
static void delete_alternator_table(http::experimental::client& cli) {
|
||||
try {
|
||||
make_request(cli, "DeleteTable", R"({"TableName": "workloads_test"})").get();
|
||||
@@ -373,6 +391,8 @@ auto make_client_pool(const test_config& c) {
|
||||
void workload_main(const test_config& c) {
|
||||
std::cout << "Running test with config: " << c << std::endl;
|
||||
|
||||
wait_for_alternator(c);
|
||||
|
||||
auto cli = get_client(c);
|
||||
auto finally = defer([&] {
|
||||
delete_alternator_table(cli);
|
||||
|
||||
@@ -330,10 +330,13 @@ int scylla_simple_query_main(int argc, char** argv) {
|
||||
("counters", "test counters")
|
||||
("tablets", "use tablets")
|
||||
("initial-tablets", bpo::value<unsigned>()->default_value(128), "initial number of tablets")
|
||||
("sstable-summary-ratio", bpo::value<double>(), "Generate summary entry, so that summary file size / data file size ~= this ratio")
|
||||
("sstable-format", bpo::value<std::string>(), "SSTable format name to use")
|
||||
("flush", "flush memtables before test")
|
||||
("memtable-partitions", bpo::value<unsigned>(), "apply this number of partitions to memtable, then flush")
|
||||
("json-result", bpo::value<std::string>(), "name of the json result file")
|
||||
("enable-cache", bpo::value<bool>()->default_value(true), "enable row cache")
|
||||
("enable-index-cache", bpo::value<bool>()->default_value(true), "enable partition index cache")
|
||||
("stop-on-error", bpo::value<bool>()->default_value(true), "stop after encountering the first error")
|
||||
("timeout", bpo::value<std::string>()->default_value(""), "use timeout")
|
||||
("bypass-cache", "use bypass cache when querying")
|
||||
@@ -357,8 +360,19 @@ int scylla_simple_query_main(int argc, char** argv) {
|
||||
auto db_cfg = ::make_shared<db::config>(ext);
|
||||
|
||||
const auto enable_cache = app.configuration()["enable-cache"].as<bool>();
|
||||
const auto enable_index_cache = app.configuration()["enable-index-cache"].as<bool>();
|
||||
std::cout << "enable-cache=" << enable_cache << '\n';
|
||||
std::cout << "enable-index-cache=" << enable_index_cache << '\n';
|
||||
db_cfg->enable_cache(enable_cache);
|
||||
db_cfg->cache_index_pages(enable_index_cache);
|
||||
if (app.configuration().contains("sstable-summary-ratio")) {
|
||||
db_cfg->sstable_summary_ratio(app.configuration()["sstable-summary-ratio"].as<double>());
|
||||
}
|
||||
std::cout << "sstable-summary-ratio=" << db_cfg->sstable_summary_ratio() << '\n';
|
||||
if (app.configuration().contains("sstable-format")) {
|
||||
db_cfg->sstable_format(app.configuration()["sstable-format"].as<std::string>());
|
||||
}
|
||||
std::cout << "sstable-format=" << db_cfg->sstable_format() << '\n';
|
||||
cql_test_config cfg(db_cfg);
|
||||
if (app.configuration().contains("tablets")) {
|
||||
cfg.db_config->tablets_mode_for_new_keyspaces.set(db::tablets_mode_t::mode::enabled);
|
||||
|
||||
@@ -15,9 +15,8 @@ from cassandra.cluster import ConsistencyLevel
|
||||
from cassandra.query import SimpleStatement
|
||||
from typing import Callable
|
||||
|
||||
from test.cluster.conftest import skip_mode
|
||||
from test.cluster.util import get_topology_coordinator, find_server_by_host_id, new_test_keyspace, new_test_table
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.util import get_topology_coordinator, find_server_by_host_id, new_test_keyspace, new_test_table, reconnect_driver
|
||||
from test.pylib.manager_client import ManagerClient, wait_for_cql_and_get_hosts
|
||||
from test.pylib.tablets import get_tablet_count
|
||||
from test.pylib.util import Host
|
||||
from test.storage.conftest import space_limited_servers
|
||||
@@ -81,6 +80,7 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca
|
||||
logger.info("Create a big file on the target node to reach critical disk utilization level")
|
||||
disk_info = psutil.disk_usage(workdir)
|
||||
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
|
||||
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("database - Set critical disk utilization mode: true", from_mark=mark)
|
||||
|
||||
@@ -91,8 +91,9 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca
|
||||
logger.info("Restart the node")
|
||||
mark = await log.mark()
|
||||
await manager.server_restart(servers[0].server_id)
|
||||
await manager.driver_connect()
|
||||
cql = manager.get_cql()
|
||||
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
|
||||
cql = await reconnect_driver(manager)
|
||||
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("database - Set critical disk utilization mode: true", from_mark=mark)
|
||||
|
||||
@@ -104,6 +105,7 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca
|
||||
await validate_data_existence(cql, hosts[1:], [hosts[0]], cf, 1)
|
||||
|
||||
logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
|
||||
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("database - Set critical disk utilization mode: false", from_mark=mark)
|
||||
|
||||
@@ -112,7 +114,7 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_autotoogle_compaction(manager: ManagerClient, volumes_factory: Callable) -> None:
|
||||
async def test_autotoggle_compaction(manager: ManagerClient, volumes_factory: Callable) -> None:
|
||||
cmdline = [*global_cmdline,
|
||||
"--logger-log-level", "compaction=debug"]
|
||||
async with space_limited_servers(manager, volumes_factory, ["100M"]*3, cmdline=cmdline) as servers:
|
||||
@@ -136,15 +138,20 @@ async def test_autotoogle_compaction(manager: ManagerClient, volumes_factory: Ca
|
||||
logger.info("Create a big file on the target node to reach critical disk utilization level")
|
||||
disk_info = psutil.disk_usage(workdir)
|
||||
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
|
||||
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)
|
||||
|
||||
logger.info("Restart the node")
|
||||
mark = await log.mark()
|
||||
await manager.server_restart(servers[0].server_id)
|
||||
await reconnect_driver(manager)
|
||||
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)
|
||||
|
||||
logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
|
||||
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("compaction_manager - Enabled", from_mark=mark)
|
||||
|
||||
@@ -235,7 +242,8 @@ async def test_reject_split_compaction(manager: ManagerClient, volumes_factory:
|
||||
logger.info("Create a big file on the target node to reach critical disk utilization level")
|
||||
disk_info = psutil.disk_usage(workdir)
|
||||
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
|
||||
await log.wait_for(f"Split task .* for table {cf} .* stopped, reason: Compaction for {cf} was stopped due to: drain")
|
||||
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
|
||||
await log.wait_for(f"Split task .* for table {cf} .* stopped, reason: Compaction for {cf} was stopped due to: drain", from_mark=mark)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -260,6 +268,7 @@ async def test_split_compaction_not_triggered(manager: ManagerClient, volumes_fa
|
||||
logger.info("Create a big file on the target node to reach critical disk utilization level")
|
||||
disk_info = psutil.disk_usage(workdir)
|
||||
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
|
||||
s1_mark, _ = await s1_log.wait_for("Reached the critical disk utilization level", from_mark=s1_mark)
|
||||
for _ in range(2):
|
||||
s1_mark, _ = await s1_log.wait_for("compaction_manager - Drained", from_mark=s1_mark)
|
||||
|
||||
@@ -294,10 +303,13 @@ async def test_tablet_repair(manager: ManagerClient, volumes_factory: Callable)
|
||||
await manager.server_stop_gracefully(servers[0].server_id)
|
||||
await manager.server_wipe_sstables(servers[0].server_id, ks, table)
|
||||
await manager.server_start(servers[0].server_id)
|
||||
cql = await reconnect_driver(manager)
|
||||
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
|
||||
|
||||
logger.info("Create a big file on the target node to reach critical disk utilization level")
|
||||
disk_info = psutil.disk_usage(workdir)
|
||||
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
|
||||
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("repair - Drained", from_mark=mark)
|
||||
|
||||
@@ -328,16 +340,18 @@ async def test_tablet_repair(manager: ManagerClient, volumes_factory: Callable)
|
||||
logger.info("Restart the node")
|
||||
mark = await log.mark()
|
||||
await manager.server_restart(servers[0].server_id, wait_others=2)
|
||||
await manager.driver_connect()
|
||||
await reconnect_driver(manager)
|
||||
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("repair - Drained", from_mark=mark)
|
||||
|
||||
logger.info("With blob file removed, wait for the tablet repair to succeed")
|
||||
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
|
||||
await manager.api.wait_task(servers[0].ip_addr, task_id)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_autotoogle_reject_incoming_migrations(manager: ManagerClient, volumes_factory: Callable) -> None:
|
||||
async def test_autotoggle_reject_incoming_migrations(manager: ManagerClient, volumes_factory: Callable) -> None:
|
||||
cfg = {
|
||||
'tablet_load_stats_refresh_interval_in_seconds': 1,
|
||||
}
|
||||
@@ -377,6 +391,7 @@ async def test_autotoogle_reject_incoming_migrations(manager: ManagerClient, vol
|
||||
|
||||
disk_info = psutil.disk_usage(workdir)
|
||||
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
|
||||
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("database - Set critical disk utilization mode: true", from_mark=mark)
|
||||
|
||||
@@ -387,6 +402,7 @@ async def test_autotoogle_reject_incoming_migrations(manager: ManagerClient, vol
|
||||
mark, _ = await log.wait_for("Streaming for tablet migration .* failed", from_mark=mark)
|
||||
|
||||
logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
|
||||
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("database - Set critical disk utilization mode: false", from_mark=mark)
|
||||
|
||||
@@ -435,6 +451,7 @@ async def test_node_restart_while_tablet_split(manager: ManagerClient, volumes_f
|
||||
logger.info("Create a big file on the target node to reach critical disk utilization level")
|
||||
disk_info = psutil.disk_usage(workdir)
|
||||
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
|
||||
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)
|
||||
|
||||
@@ -447,7 +464,11 @@ async def test_node_restart_while_tablet_split(manager: ManagerClient, volumes_f
|
||||
await cql.run_async(f"ALTER TABLE {cf} WITH tablets = {{'min_tablet_count': 2}};")
|
||||
await coord_log.wait_for(f"Generating resize decision for table {table_id} of type split")
|
||||
|
||||
mark = await log.mark()
|
||||
await manager.server_restart(servers[0].server_id, wait_others=2)
|
||||
cql = await reconnect_driver(manager)
|
||||
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
|
||||
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
|
||||
|
||||
logger.info("Check if tablet split happened")
|
||||
await assert_resize_task_info(table_id, lambda response: len(response) == 1 and response[0].resize_task_info is not None)
|
||||
@@ -456,6 +477,7 @@ async def test_node_restart_while_tablet_split(manager: ManagerClient, volumes_f
|
||||
assert await log.grep(f"compaction.*Split {cf}", from_mark=mark) == []
|
||||
|
||||
logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
|
||||
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("compaction_manager - Enabled", from_mark=mark)
|
||||
mark, _ = await log.wait_for(f"Detected tablet split for table {cf}, increasing from 1 to 2 tablets", from_mark=mark)
|
||||
@@ -521,6 +543,7 @@ async def test_repair_failure_on_split_rejection(manager: ManagerClient, volumes
|
||||
logger.info("Create a big file on the target node to reach critical disk utilization level")
|
||||
disk_info = psutil.disk_usage(workdir)
|
||||
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
|
||||
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)
|
||||
|
||||
@@ -533,9 +556,100 @@ async def test_repair_failure_on_split_rejection(manager: ManagerClient, volumes
|
||||
assert await log.grep(f"compaction.*Split {cf}", from_mark=mark) == []
|
||||
|
||||
logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
|
||||
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
|
||||
for _ in range(2):
|
||||
mark, _ = await log.wait_for("compaction_manager - Enabled", from_mark=mark)
|
||||
|
||||
await repair_task
|
||||
|
||||
mark, _ = await log.wait_for(f"Detected tablet split for table {cf}", from_mark=mark)
|
||||
|
||||
# Since we create 20M volumes, we need to reduce the commitlog segment size
|
||||
# otherwise we hit out of space.
|
||||
global_cmdline_with_disabled_monitor = [
|
||||
"--disk-space-monitor-normal-polling-interval-in-seconds", "1",
|
||||
"--critical-disk-utilization-level", "1.0",
|
||||
"--commitlog-segment-size-in-mb", "2",
|
||||
"--schema-commitlog-segment-size-in-mb", "4",
|
||||
"--tablet-load-stats-refresh-interval-in-seconds", "1",
|
||||
]
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_sstables_incrementally_released_during_streaming(manager: ManagerClient, volumes_factory: Callable) -> None:
|
||||
"""
|
||||
Test that source node will not run out of space if major compaction rewrites the sstables being streamed.
|
||||
Expects the file streaming and major will both release sstables incrementally, reducing chances of 2x
|
||||
space amplification.
|
||||
|
||||
Scenario:
|
||||
- Create a 2-node cluster with limited disk space.
|
||||
- Create a table with 2 tablets, one in each node
|
||||
- Write 20% of node capacity to each tablet.
|
||||
- Start decommissioning one node.
|
||||
- During streaming, create a large file on the source node to push it over 85%
|
||||
- Run major expecting the file streaming released the sstables incrementally. Had it not, source node runs out of space.
|
||||
- Unblock streaming
|
||||
- Verify that the decommission operation succeeds.
|
||||
"""
|
||||
cmdline = [*global_cmdline_with_disabled_monitor,
|
||||
"--logger-log-level", "load_balancer=debug",
|
||||
"--logger-log-level", "debug_error_injection=debug"
|
||||
]
|
||||
# the coordinator needs more space, so creating a 40M volume for it.
|
||||
async with space_limited_servers(manager, volumes_factory, ["40M", "20M"], cmdline=cmdline,
|
||||
property_file=[{"dc": "dc1", "rack": "r1"}]*2) as servers:
|
||||
cql, _ = await manager.get_ready_cql(servers)
|
||||
|
||||
workdir = await manager.server_get_workdir(servers[1].server_id)
|
||||
log = await manager.server_open_log(servers[1].server_id)
|
||||
|
||||
async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'dc1': ['{servers[1].rack}'] }}"
|
||||
" AND tablets = {'initial': 2}") as ks:
|
||||
await manager.disable_tablet_balancing()
|
||||
|
||||
# Needs 1mb fragments in order to stress incremental release in file streaming
|
||||
extra_table_param = "WITH compaction = {'class' : 'IncrementalCompactionStrategy', 'sstable_size_in_mb' : '1'} and compression = {}"
|
||||
async with new_test_table(manager, ks, "pk int PRIMARY KEY, t text", extra_table_param) as cf:
|
||||
before_disk_info = psutil.disk_usage(workdir)
|
||||
# About 4mb per tablet
|
||||
await asyncio.gather(*[cql.run_async(query) for query in write_generator(cf, 8000)])
|
||||
|
||||
# split data into 1mb fragments
|
||||
await manager.api.keyspace_flush(servers[1].ip_addr, ks)
|
||||
await manager.api.keyspace_compaction(servers[1].ip_addr, ks)
|
||||
|
||||
after_disk_info = psutil.disk_usage(workdir)
|
||||
percent_by_writes = after_disk_info.percent - before_disk_info.percent
|
||||
logger.info(f"Percent taken by writes {percent_by_writes}")
|
||||
|
||||
# assert sstable data content account for more than 20% of node's storage.
|
||||
assert percent_by_writes > 20
|
||||
|
||||
# We want to trap only migrations which happened during decommission
|
||||
await manager.api.quiesce_topology(servers[0].ip_addr)
|
||||
|
||||
await manager.api.enable_injection(servers[1].ip_addr, "tablet_stream_files_end_wait", one_shot=True)
|
||||
mark = await log.mark()
|
||||
|
||||
logger.info(f"Workdir {workdir}")
|
||||
|
||||
decomm_task = asyncio.create_task(manager.decommission_node(servers[1].server_id))
|
||||
await manager.enable_tablet_balancing()
|
||||
mark, _ = await log.wait_for("tablet_stream_files_end_wait: waiting", from_mark=mark)
|
||||
|
||||
disk_info = psutil.disk_usage(workdir)
|
||||
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
|
||||
disk_info = psutil.disk_usage(workdir)
|
||||
logger.info(f"Percent used before major {disk_info.percent}")
|
||||
|
||||
# Run major in order to try to reproduce 2x space amplification if files aren't released
|
||||
# incrementally by streamer.
|
||||
await manager.api.keyspace_compaction(servers[1].ip_addr, ks)
|
||||
await asyncio.gather(*[cql.run_async(query) for query in write_generator(cf, 100)])
|
||||
|
||||
disk_info = psutil.disk_usage(workdir)
|
||||
logger.info(f"Percent used after major {disk_info.percent}")
|
||||
|
||||
await manager.api.message_injection(servers[1].ip_addr, "tablet_stream_files_end_wait")
|
||||
|
||||
await decomm_task
|
||||
|
||||
@@ -1102,7 +1102,7 @@ SEASTAR_TEST_CASE(vector_store_client_https_wrong_hostname) {
|
||||
}));
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(vector_store_client_https_different_ca_cert_verification_error) {
|
||||
SEASTAR_TEST_CASE(vector_store_client_https_wrong_cacert_verification_error) {
|
||||
auto broken_cert = co_await seastar::make_tmp_file();
|
||||
certificates certs;
|
||||
auto server = co_await make_vs_mock_server(co_await make_server_credentials(certs));
|
||||
@@ -1129,6 +1129,33 @@ SEASTAR_TEST_CASE(vector_store_client_https_different_ca_cert_verification_error
|
||||
}));
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(vector_store_client_https_wrong_cacert_verification_error_host_is_ip) {
|
||||
auto broken_cert = co_await seastar::make_tmp_file();
|
||||
certificates certs;
|
||||
auto server = co_await make_vs_mock_server(co_await make_server_credentials(certs));
|
||||
auto cfg = make_config();
|
||||
cfg.db_config->vector_store_primary_uri.set(format("https://{}:{}", server->host(), server->port()));
|
||||
cfg.db_config->vector_store_encryption_options.set({{"truststore", broken_cert.get_path().string()}});
|
||||
co_await do_with_cql_env(
|
||||
[&](cql_test_env& env) -> future<> {
|
||||
auto as = abort_source_timeout();
|
||||
auto schema = co_await create_test_table(env, "ks", "idx");
|
||||
auto& vs = env.local_qp().vector_store_client();
|
||||
configure(vs).with_dns({{server->host(), std::vector<std::string>{server->host()}}});
|
||||
vs.start_background_tasks();
|
||||
|
||||
auto keys = co_await vs.ann("ks", "idx", schema, std::vector<float>{0.1, 0.2, 0.3}, 2, rjson::empty_object(), as.reset());
|
||||
|
||||
BOOST_REQUIRE(!keys);
|
||||
BOOST_CHECK(std::holds_alternative<vector_store_client::service_unavailable>(keys.error()));
|
||||
},
|
||||
cfg)
|
||||
.finally(seastar::coroutine::lambda([&] -> future<> {
|
||||
co_await server->stop();
|
||||
co_await remove(broken_cert);
|
||||
}));
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(vector_store_client_high_availability_unreachable) {
|
||||
auto server = co_await make_vs_mock_server();
|
||||
auto unreachable = co_await make_unreachable_socket();
|
||||
|
||||
@@ -690,6 +690,9 @@ void cluster_repair_operation(scylla_rest_client& client, const bpo::variables_m
|
||||
// will repair also their colocated tables.
|
||||
continue;
|
||||
}
|
||||
if (tables.empty() && std::string(ex.what()).contains("Can't find a column family")) {
|
||||
continue;
|
||||
}
|
||||
log("ERROR: Repair request for keyspace={} table={} failed with {}", keyspace, table, ex);
|
||||
exit_code = EXIT_FAILURE;
|
||||
}
|
||||
|
||||
@@ -67,14 +67,17 @@ void result_message::visitor_base::visit(const result_message::exception& ex) {
|
||||
ex.throw_me();
|
||||
}
|
||||
|
||||
result_message::prepared::prepared(cql3::statements::prepared_statement::checked_weak_ptr prepared, bool support_lwt_opt)
|
||||
: _prepared(std::move(prepared))
|
||||
result_message::prepared::prepared(cql3::prepared_statements_cache::pinned_value_type prepared_entry, bool support_lwt_opt)
|
||||
: _prepared_entry(std::move(prepared_entry))
|
||||
, _metadata(
|
||||
_prepared->bound_names,
|
||||
_prepared->partition_key_bind_indices,
|
||||
support_lwt_opt ? _prepared->statement->is_conditional() : false)
|
||||
, _result_metadata{extract_result_metadata(_prepared->statement)}
|
||||
(*_prepared_entry)->bound_names,
|
||||
(*_prepared_entry)->partition_key_bind_indices,
|
||||
support_lwt_opt ? (*_prepared_entry)->statement->is_conditional() : false)
|
||||
, _result_metadata{extract_result_metadata((*_prepared_entry)->statement)}
|
||||
{
|
||||
for (const auto& w : (*_prepared_entry)->warnings){
|
||||
add_warning(w);
|
||||
}
|
||||
}
|
||||
|
||||
::shared_ptr<const cql3::metadata> result_message::prepared::extract_result_metadata(::shared_ptr<cql3::cql_statement> statement) {
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <concepts>
|
||||
|
||||
#include "cql3/result_set.hh"
|
||||
#include "cql3/prepared_statements_cache.hh"
|
||||
#include "cql3/statements/prepared_statement.hh"
|
||||
#include "cql3/query_options.hh"
|
||||
|
||||
@@ -30,14 +31,14 @@ namespace messages {
|
||||
|
||||
class result_message::prepared : public result_message {
|
||||
private:
|
||||
cql3::statements::prepared_statement::checked_weak_ptr _prepared;
|
||||
cql3::prepared_statements_cache::pinned_value_type _prepared_entry;
|
||||
cql3::prepared_metadata _metadata;
|
||||
::shared_ptr<const cql3::metadata> _result_metadata;
|
||||
protected:
|
||||
prepared(cql3::statements::prepared_statement::checked_weak_ptr prepared, bool support_lwt_opt);
|
||||
prepared(cql3::prepared_statements_cache::pinned_value_type prepared_entry, bool support_lwt_opt);
|
||||
public:
|
||||
cql3::statements::prepared_statement::checked_weak_ptr& get_prepared() {
|
||||
return _prepared;
|
||||
cql3::statements::prepared_statement::checked_weak_ptr get_prepared() {
|
||||
return (*_prepared_entry)->checked_weak_from_this();
|
||||
}
|
||||
|
||||
const cql3::prepared_metadata& metadata() const {
|
||||
@@ -49,7 +50,7 @@ public:
|
||||
}
|
||||
|
||||
cql3::cql_metadata_id_type get_metadata_id() const {
|
||||
return _prepared->get_metadata_id();
|
||||
return (*_prepared_entry)->get_metadata_id();
|
||||
}
|
||||
|
||||
class cql;
|
||||
@@ -166,8 +167,8 @@ std::ostream& operator<<(std::ostream& os, const result_message::set_keyspace& m
|
||||
class result_message::prepared::cql : public result_message::prepared {
|
||||
bytes _id;
|
||||
public:
|
||||
cql(const bytes& id, cql3::statements::prepared_statement::checked_weak_ptr p, bool support_lwt_opt)
|
||||
: result_message::prepared(std::move(p), support_lwt_opt)
|
||||
cql(const bytes& id, cql3::prepared_statements_cache::pinned_value_type prepared_entry, bool support_lwt_opt)
|
||||
: result_message::prepared(std::move(prepared_entry), support_lwt_opt)
|
||||
, _id{id}
|
||||
{ }
|
||||
|
||||
|
||||
@@ -715,15 +715,6 @@ void write_collection_value(bytes_ostream& out, atomic_cell_value_view val) {
|
||||
}
|
||||
}
|
||||
|
||||
// Copies the bytes of `val` into the fragmented buffer `out`, one
// contiguous fragment at a time, consuming (advancing) both views.
// Assumes `out` has at least val.size() bytes remaining — TODO confirm
// this invariant with the callers.
void write_fragmented(managed_bytes_mutable_view& out, std::string_view val) {
    while (!val.empty()) {
        const size_t chunk = std::min(val.size(), out.current_fragment().size());
        std::copy_n(val.data(), chunk, out.current_fragment().data());
        val.remove_prefix(chunk);
        out.remove_prefix(chunk);
    }
}
|
||||
|
||||
template<std::integral T>
|
||||
void write_simple(managed_bytes_mutable_view& out, std::type_identity_t<T> val) {
|
||||
val = net::hton(val);
|
||||
|
||||
@@ -566,6 +566,16 @@ inline managed_bytes::managed_bytes(const managed_bytes& o) {
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
void write_fragmented(managed_bytes_mutable_view& out, std::string_view val) {
|
||||
while (val.size() > 0) {
|
||||
size_t current_n = std::min(val.size(), out.current_fragment().size());
|
||||
memcpy(out.current_fragment().data(), val.data(), current_n);
|
||||
val.remove_prefix(current_n);
|
||||
out.remove_prefix(current_n);
|
||||
}
|
||||
}
|
||||
|
||||
template<>
|
||||
struct appending_hash<managed_bytes_view> {
|
||||
template<Hasher Hasher>
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
#include <array>
|
||||
#include <type_traits>
|
||||
#include <algorithm>
|
||||
|
||||
#include "utils/allocation_strategy.hh"
|
||||
|
||||
@@ -27,10 +28,8 @@ private:
|
||||
T _data[0];
|
||||
|
||||
external(external&& other) noexcept : _backref(other._backref) {
|
||||
for (unsigned i = 0; i < _backref->size(); i++) {
|
||||
new (_data + i) T(std::move(other._data[i]));
|
||||
other._data[i].~T();
|
||||
}
|
||||
std::uninitialized_move(other._data, other._data + other._backref->_size, _data);
|
||||
std::destroy(other._data, other._data + other._backref->_size);
|
||||
_backref->_data = _data;
|
||||
}
|
||||
size_t storage_size() const noexcept {
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#include <chrono>
|
||||
#include <fmt/format.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <seastar/net/inet_address.hh>
|
||||
|
||||
using namespace seastar;
|
||||
using namespace std::chrono_literals;
|
||||
@@ -28,6 +29,10 @@ using namespace std::chrono_literals;
|
||||
namespace vector_search {
|
||||
namespace {
|
||||
|
||||
bool is_ip_address(const sstring& host) {
|
||||
return net::inet_address::parse_numerical(host).has_value();
|
||||
}
|
||||
|
||||
class client_connection_factory : public http::experimental::connection_factory {
|
||||
client::endpoint_type _endpoint;
|
||||
shared_ptr<tls::certificate_credentials> _creds;
|
||||
@@ -55,7 +60,11 @@ private:
|
||||
future<connected_socket> connect() {
|
||||
auto addr = socket_address(_endpoint.ip, _endpoint.port);
|
||||
if (_creds) {
|
||||
auto socket = co_await tls::connect(_creds, addr, tls::tls_options{.server_name = _endpoint.host});
|
||||
tls::tls_options opts;
|
||||
if (!is_ip_address(_endpoint.host)) {
|
||||
opts.server_name = _endpoint.host;
|
||||
}
|
||||
auto socket = co_await tls::connect(_creds, addr, std::move(opts));
|
||||
// tls::connect() only performs the TCP handshake — the TLS handshake is deferred until the first I/O operation.
|
||||
// Force the TLS handshake to happen here so that the connection timeout applies to it.
|
||||
co_await tls::check_session_is_resumed(socket);
|
||||
@@ -124,7 +133,7 @@ seastar::future<client::request_result> client::request(
|
||||
co_return std::unexpected{aborted_error{}};
|
||||
}
|
||||
if (is_server_problem(err)) {
|
||||
handle_server_unavailable();
|
||||
handle_server_unavailable(err);
|
||||
}
|
||||
co_return std::unexpected{co_await map_err(err)};
|
||||
}
|
||||
@@ -165,8 +174,9 @@ seastar::future<> client::close() {
|
||||
co_await _http_client.close();
|
||||
}
|
||||
|
||||
void client::handle_server_unavailable() {
|
||||
void client::handle_server_unavailable(std::exception_ptr err) {
|
||||
if (!is_checking_status_in_progress()) {
|
||||
_logger.warn("Request to vector store {} {}:{} failed: {}", _endpoint.host, _endpoint.ip, _endpoint.port, err);
|
||||
_checking_status_future = run_checking_status();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "utils/log.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
#include <chrono>
|
||||
#include <exception>
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <seastar/core/abort_source.hh>
|
||||
@@ -60,7 +61,7 @@ private:
|
||||
seastar::future<response> request_impl(seastar::httpd::operation_type method, seastar::sstring path, std::optional<seastar::sstring> content,
|
||||
std::optional<seastar::http::reply::status_type>&& expected, seastar::abort_source& as);
|
||||
seastar::future<bool> check_status();
|
||||
void handle_server_unavailable();
|
||||
void handle_server_unavailable(std::exception_ptr err);
|
||||
seastar::future<> run_checking_status();
|
||||
bool is_checking_status_in_progress() const;
|
||||
std::chrono::milliseconds backoff_retry_max() const;
|
||||
|
||||
@@ -18,15 +18,6 @@
|
||||
|
||||
static_assert(-1 == ~0, "Not a twos-complement architecture");
|
||||
|
||||
// Accounts for the case that all bits are zero.
|
||||
// Leading-zero count of `n`, well-defined for all inputs: returns 64 for
// n == 0 — presumably because the underlying count_leading_zeros() helper
// does not accept zero (TODO confirm against its definition).
static vint_size_type count_leading_zero_bits(uint64_t n) noexcept {
    return n == 0 ? vint_size_type(std::numeric_limits<uint64_t>::digits)
                  : vint_size_type(count_leading_zeros(n));
}
|
||||
|
||||
static constexpr uint64_t encode_zigzag(int64_t n) noexcept {
|
||||
// The right shift has to be arithmetic and not logical.
|
||||
return (static_cast<uint64_t>(n) << 1) ^ static_cast<uint64_t>(n >> 63);
|
||||
@@ -55,16 +46,9 @@ int64_t signed_vint::deserialize(bytes_view v) {
|
||||
return decode_zigzag(un);
|
||||
}
|
||||
|
||||
// Signed vints are zig-zag encoded into the unsigned wire format (see
// decode_zigzag() in deserialize()), so their serialized length is read
// from the first byte exactly as for unsigned vints.
vint_size_type signed_vint::serialized_size_from_first_byte(bytes::value_type first_byte) {
|
||||
return unsigned_vint::serialized_size_from_first_byte(first_byte);
|
||||
}
|
||||
|
||||
// The number of additional bytes that we need to read.
|
||||
static vint_size_type count_extra_bytes(int8_t first_byte) {
|
||||
// Sign extension.
|
||||
const int64_t v(first_byte);
|
||||
|
||||
return count_leading_zero_bits(static_cast<uint64_t>(~v)) - vint_size_type(64 - 8);
|
||||
return std::countl_zero(static_cast<uint8_t>(~first_byte));
|
||||
}
|
||||
|
||||
static void encode(uint64_t value, vint_size_type size, bytes::iterator out) {
|
||||
@@ -139,8 +123,3 @@ uint64_t unsigned_vint::deserialize(bytes_view v) {
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
// Total encoded length in bytes of a vint, derived from its first byte:
// a non-negative (high bit clear) first byte means a single-byte value,
// otherwise the leading one-bits encode how many extra bytes follow.
vint_size_type unsigned_vint::serialized_size_from_first_byte(bytes::value_type first_byte) {
    const int8_t leading = first_byte;
    if (leading >= 0) {
        return 1; // single-byte encoding
    }
    return 1 + count_extra_bytes(leading);
}
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
#include "bytes.hh"
|
||||
|
||||
#include <cstdint>
|
||||
#include <bit>
|
||||
|
||||
using vint_size_type = bytes::size_type;
|
||||
|
||||
@@ -49,7 +50,9 @@ struct unsigned_vint final {
|
||||
|
||||
static value_type deserialize(bytes_view v);
|
||||
|
||||
static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte);
|
||||
static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte) {
    // The run of leading one-bits in the first byte gives the number of
    // extra bytes that follow; countl_one(x) == countl_zero(~x) bit-for-bit
    // on the 8-bit value, so this is the same computation.
    return 1 + std::countl_one(static_cast<uint8_t>(first_byte));
}
|
||||
};
|
||||
|
||||
struct signed_vint final {
|
||||
@@ -61,5 +64,7 @@ struct signed_vint final {
|
||||
|
||||
static value_type deserialize(bytes_view v);
|
||||
|
||||
static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte);
|
||||
// Signed vints share the unsigned length prefix (the payload is zig-zag
// encoded, which does not change the framing), so delegate.
static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte) {
|
||||
return unsigned_vint::serialized_size_from_first_byte(first_byte);
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user