Compare commits

..

2 Commits

Author SHA1 Message Date
Alex
faba13d2b7 test/auth_cluster: cover empty legacy table in service level upgrade
Add a cluster test that upgrades to raft topology with an empty legacy
`system_distributed.service_levels` table and verifies that the
migration still marks `service_level_version` as `2`.
2026-04-05 19:46:15 +03:00
Alex
d00443f4b0 service_levels: mark v2 migration complete on empty legacy table
During raft-topology upgrade in 2026.1, service_level_controller::migrate_to_v2()
returns early when system_distributed.service_levels is empty.
This skips the service_level_version = 2 write, so the cluster is never marked
as upgraded to service levels v2 even though there is no data to migrate.
Subsequent upgrades may then fail the startup check which requires
service_level_version == 2.
Remove the early return and let the migration commit the version marker even
when there are no legacy service levels rows to copy.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1198

backport: only needed in 2026.1 because it's the critical upgrade before 2026.2,3,4
2026-04-05 18:00:12 +03:00
86 changed files with 822 additions and 2213 deletions

View File

@@ -15,19 +15,13 @@ jobs:
- name: Verify Org Membership
id: verify_author
env:
EVENT_NAME: ${{ github.event_name }}
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
PR_ASSOCIATION: ${{ github.event.pull_request.author_association }}
COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
COMMENT_ASSOCIATION: ${{ github.event.comment.author_association }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
shell: bash
run: |
if [[ "$EVENT_NAME" == "pull_request_target" ]]; then
AUTHOR="$PR_AUTHOR"
ASSOCIATION="$PR_ASSOCIATION"
if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then
AUTHOR="${{ github.event.pull_request.user.login }}"
else
AUTHOR="$COMMENT_AUTHOR"
ASSOCIATION="$COMMENT_ASSOCIATION"
AUTHOR="${{ github.event.comment.user.login }}"
fi
ORG="scylladb"
if gh api "/orgs/${ORG}/members/${AUTHOR}" --silent 2>/dev/null; then
@@ -40,11 +34,13 @@ jobs:
- name: Validate Comment Trigger
if: github.event_name == 'issue_comment'
id: verify_comment
env:
COMMENT_BODY: ${{ github.event.comment.body }}
shell: bash
run: |
CLEAN_BODY=$(echo "$COMMENT_BODY" | grep -v '^[[:space:]]*>')
BODY=$(cat << 'EOF'
${{ github.event.comment.body }}
EOF
)
CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
echo "trigger=true" >> $GITHUB_OUTPUT

View File

@@ -78,7 +78,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=2026.1.1
VERSION=2026.1.0
if test -f version
then

View File

@@ -48,7 +48,6 @@
#include "mutation/mutation_fragment_stream_validator.hh"
#include "utils/assert.hh"
#include "utils/error_injection.hh"
#include "utils/chunked_vector.hh"
#include "utils/pretty_printers.hh"
#include "readers/multi_range.hh"
#include "readers/compacting.hh"
@@ -612,23 +611,23 @@ private:
}
// Called in a seastar thread
utils::chunked_vector<dht::partition_range>
dht::partition_range_vector
get_ranges_for_invalidation(const std::vector<sstables::shared_sstable>& sstables) {
// If owned ranges is disengaged, it means no cleanup work was done and
// so nothing needs to be invalidated.
if (!_owned_ranges) {
return {};
return dht::partition_range_vector{};
}
auto owned_ranges = dht::to_partition_ranges_chunked(*_owned_ranges).get();
auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
auto non_owned_ranges = sstables
| std::views::transform([] (const sstables::shared_sstable& sst) {
seastar::thread::maybe_yield();
return dht::partition_range::make({sst->get_first_decorated_key(), true},
{sst->get_last_decorated_key(), true});
}) | std::ranges::to<utils::chunked_vector<dht::partition_range>>();
}) | std::ranges::to<dht::partition_range_vector>();
return dht::subtract_ranges(*_schema, std::move(non_owned_ranges), std::move(owned_ranges)).get();
return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
}
protected:
compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
@@ -719,8 +718,8 @@ protected:
compaction_completion_desc
get_compaction_completion_desc(std::vector<sstables::shared_sstable> input_sstables, std::vector<sstables::shared_sstable> output_sstables) {
auto ranges = get_ranges_for_invalidation(input_sstables);
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges)};
auto ranges_for_for_invalidation = get_ranges_for_invalidation(input_sstables);
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
}
// Tombstone expiration is enabled based on the presence of sstable set.

View File

@@ -16,7 +16,6 @@
#include "sstables/sstable_set.hh"
#include "compaction_fwd.hh"
#include "mutation_writer/token_group_based_splitting_writer.hh"
#include "utils/chunked_vector.hh"
namespace compaction {
@@ -39,7 +38,7 @@ struct compaction_completion_desc {
// New, fresh SSTables that should be added to SSTable set, replacing the old ones.
std::vector<sstables::shared_sstable> new_sstables;
// Set of compacted partition ranges that should be invalidated in the cache.
utils::chunked_vector<dht::partition_range> ranges_for_cache_invalidation;
dht::partition_range_vector ranges_for_cache_invalidation;
};
// creates a new SSTable for a given shard

View File

@@ -105,7 +105,6 @@ public:
static const std::chrono::minutes entry_expiry;
using key_type = prepared_cache_key_type;
using pinned_value_type = cache_value_ptr;
using value_type = checked_weak_ptr;
using statement_is_too_big = typename cache_type::entry_is_too_big;
@@ -117,14 +116,9 @@ public:
: _cache(size, entry_expiry, logger)
{}
template <typename LoadFunc>
future<pinned_value_type> get_pinned(const key_type& key, LoadFunc&& load) {
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); });
}
template <typename LoadFunc>
future<value_type> get(const key_type& key, LoadFunc&& load) {
return get_pinned(key, std::forward<LoadFunc>(load)).then([] (cache_value_ptr v_ptr) {
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
});
}

View File

@@ -697,7 +697,7 @@ future<::shared_ptr<cql_transport::messages::result_message::prepared>>
query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
try {
auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
auto prep_ptr = co_await _prepared_cache.get(key, [this, &query_string, &client_state, d] {
auto prepared = get_statement(query_string, client_state, d);
prepared->calculate_metadata_id();
auto bound_terms = prepared->statement->get_bound_terms();
@@ -711,13 +711,13 @@ query_processor::prepare(sstring query_string, const service::client_state& clie
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
});
co_await utils::get_local_injector().inject(
"query_processor_prepare_wait_after_cache_get",
utils::wait_for_message(std::chrono::seconds(60)));
auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
const auto& warnings = prep_ptr->warnings;
const auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_ptr),
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
co_return std::move(msg);
for (const auto& w : warnings) {
msg->add_warning(w);
}
co_return ::shared_ptr<cql_transport::messages::result_message::prepared>(std::move(msg));
} catch(typename prepared_statements_cache::statement_is_too_big&) {
throw prepared_statement_is_too_big(query_string);
}

View File

@@ -29,7 +29,6 @@
#include "utils/assert.hh"
#include "utils/updateable_value.hh"
#include "utils/labels.hh"
#include "utils/chunked_vector.hh"
namespace cache {
@@ -1216,10 +1215,10 @@ future<> row_cache::invalidate(external_updater eu, const dht::decorated_key& dk
}
future<> row_cache::invalidate(external_updater eu, const dht::partition_range& range, cache_invalidation_filter filter) {
return invalidate(std::move(eu), utils::chunked_vector<dht::partition_range>({range}), std::move(filter));
return invalidate(std::move(eu), dht::partition_range_vector({range}), std::move(filter));
}
future<> row_cache::invalidate(external_updater eu, utils::chunked_vector<dht::partition_range>&& ranges, cache_invalidation_filter filter) {
future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&& ranges, cache_invalidation_filter filter) {
return do_update(std::move(eu), [this, ranges = std::move(ranges), filter = std::move(filter)] mutable {
return seastar::async([this, ranges = std::move(ranges), filter = std::move(filter)] {
auto on_failure = defer([this] () noexcept {

View File

@@ -17,7 +17,6 @@
#include "utils/histogram.hh"
#include "mutation/partition_version.hh"
#include "utils/double-decker.hh"
#include "utils/chunked_vector.hh"
#include "db/cache_tracker.hh"
#include "readers/empty.hh"
#include "readers/mutation_source.hh"
@@ -458,7 +457,7 @@ public:
// mutation source made prior to the call to invalidate().
future<> invalidate(external_updater, const dht::decorated_key&);
future<> invalidate(external_updater, const dht::partition_range& = query::full_partition_range, cache_invalidation_filter filter = [] (const auto&) { return true; });
future<> invalidate(external_updater, utils::chunked_vector<dht::partition_range>&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
future<> invalidate(external_updater, dht::partition_range_vector&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
// Evicts entries from cache.
//

View File

@@ -105,7 +105,7 @@ namespace {
schema_builder::register_schema_initializer([](schema_builder& builder) {
if (builder.ks_name() == schema_tables::NAME) {
// all schema tables are group0 tables
builder.set_is_group0_table();
builder.set_is_group0_table(true);
}
});
}

View File

@@ -87,15 +87,31 @@ namespace {
static const std::unordered_set<sstring> tables = {
schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
system_keyspace::BROADCAST_KV_STORE,
system_keyspace::CDC_GENERATIONS_V3,
system_keyspace::RAFT,
system_keyspace::RAFT_SNAPSHOTS,
system_keyspace::RAFT_SNAPSHOT_CONFIG,
system_keyspace::GROUP0_HISTORY,
system_keyspace::DISCOVERY,
system_keyspace::TABLETS,
system_keyspace::TOPOLOGY,
system_keyspace::TOPOLOGY_REQUESTS,
system_keyspace::LOCAL,
system_keyspace::PEERS,
system_keyspace::SCYLLA_LOCAL,
system_keyspace::COMMITLOG_CLEANUPS,
system_keyspace::SERVICE_LEVELS_V2,
system_keyspace::VIEW_BUILD_STATUS_V2,
system_keyspace::CDC_STREAMS_STATE,
system_keyspace::CDC_STREAMS_HISTORY,
system_keyspace::ROLES,
system_keyspace::ROLE_MEMBERS,
system_keyspace::ROLE_ATTRIBUTES,
system_keyspace::ROLE_PERMISSIONS,
system_keyspace::CDC_LOCAL,
system_keyspace::DICTS,
system_keyspace::VIEW_BUILDING_TASKS,
system_keyspace::CLIENT_ROUTES,
};
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
builder.enable_schema_commitlog();
@@ -127,7 +143,7 @@ namespace {
system_keyspace::REPAIR_TASKS,
};
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
builder.set_is_group0_table();
builder.set_is_group0_table(true);
}
});
}

View File

@@ -930,7 +930,8 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
const row& existing_row = existing.cells();
const row& updated_row = update.cells();
return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row] (const column_definition& cdef) {
const bool base_has_nonexpiring_marker = update.marker().is_live() && !update.marker().is_expiring();
return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row, base_has_nonexpiring_marker] (const column_definition& cdef) {
const auto view_it = _view->columns_by_name().find(cdef.name());
const bool column_is_selected = view_it != _view->columns_by_name().end();
@@ -938,29 +939,49 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
// as part of its PK, there are NO virtual columns corresponding to the unselected columns in the view.
// Because of that, we don't generate view updates when the value in an unselected column is created
// or changes.
if (!column_is_selected) {
if (!column_is_selected && _base_info.has_base_non_pk_columns_in_view_pk) {
return true;
}
// We cannot skip if the value was created or deleted
//TODO(sarna): Optimize collections case - currently they do not go under optimization
if (!cdef.is_atomic()) {
return false;
}
// We cannot skip if the value was created or deleted, unless we have a non-expiring marker
const auto* existing_cell = existing_row.find_cell(cdef.id);
const auto* updated_cell = updated_row.find_cell(cdef.id);
if (existing_cell == nullptr || updated_cell == nullptr) {
return existing_cell == updated_cell;
return existing_cell == updated_cell || (!column_is_selected && base_has_nonexpiring_marker);
}
if (!cdef.is_atomic()) {
return existing_cell->as_collection_mutation().data == updated_cell->as_collection_mutation().data;
}
atomic_cell_view existing_cell_view = existing_cell->as_atomic_cell(cdef);
atomic_cell_view updated_cell_view = updated_cell->as_atomic_cell(cdef);
// We cannot skip when a selected column is changed
if (view_it->second->is_view_virtual()) {
return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
if (column_is_selected) {
if (view_it->second->is_view_virtual()) {
return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
}
return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
}
return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
// With non-expiring row marker, liveness checks below are not relevant
if (base_has_nonexpiring_marker) {
return true;
}
if (existing_cell_view.is_live() != updated_cell_view.is_live()) {
return false;
}
// We cannot skip if the change updates TTL
const bool existing_has_ttl = existing_cell_view.is_live_and_has_ttl();
const bool updated_has_ttl = updated_cell_view.is_live_and_has_ttl();
if (existing_has_ttl || updated_has_ttl) {
return existing_has_ttl == updated_has_ttl && existing_cell_view.expiry() == updated_cell_view.expiry();
}
return true;
});
}
@@ -1728,7 +1749,7 @@ static endpoints_to_update get_view_natural_endpoint_vnodes(
std::vector<std::reference_wrapper<const locator::node>> base_nodes,
std::vector<std::reference_wrapper<const locator::node>> view_nodes,
locator::endpoint_dc_rack my_location,
const bool network_topology,
const locator::network_topology_strategy* network_topology,
replica::cf_stats& cf_stats) {
using node_vector = std::vector<std::reference_wrapper<const locator::node>>;
node_vector base_endpoints, view_endpoints;
@@ -1881,7 +1902,7 @@ endpoints_to_update get_view_natural_endpoint(
locator::host_id me,
const locator::effective_replication_map_ptr& base_erm,
const locator::effective_replication_map_ptr& view_erm,
const bool network_topology,
const locator::abstract_replication_strategy& replication_strategy,
const dht::token& base_token,
const dht::token& view_token,
bool use_tablets,
@@ -1889,6 +1910,7 @@ endpoints_to_update get_view_natural_endpoint(
auto& topology = base_erm->get_token_metadata_ptr()->get_topology();
auto& view_topology = view_erm->get_token_metadata_ptr()->get_topology();
auto& my_location = topology.get_location(me);
auto* network_topology = dynamic_cast<const locator::network_topology_strategy*>(&replication_strategy);
auto resolve = [&] (const locator::topology& topology, const locator::host_id& ep, bool is_view) -> const locator::node& {
if (auto* np = topology.find_node(ep)) {
@@ -1922,7 +1944,7 @@ endpoints_to_update get_view_natural_endpoint(
// view pairing as the leaving base replica.
// note that the recursive call will not recurse again because leaving_base is in base_nodes.
auto leaving_base = it->get().host_id();
return get_view_natural_endpoint(leaving_base, base_erm, view_erm, network_topology, base_token,
return get_view_natural_endpoint(leaving_base, base_erm, view_erm, replication_strategy, base_token,
view_token, use_tablets, cf_stats);
}
}
@@ -2018,9 +2040,7 @@ future<> view_update_generator::mutate_MV(
wait_for_all_updates wait_for_all)
{
auto& ks = _db.find_keyspace(base->ks_name());
const bool uses_tablets = ks.uses_tablets();
const bool uses_nts = dynamic_cast<const locator::network_topology_strategy*>(&ks.get_replication_strategy()) != nullptr;
// The object pointed by `ks` may disappear after preeemption. It should not be touched again after this comment.
auto& replication = ks.get_replication_strategy();
std::unordered_map<table_id, locator::effective_replication_map_ptr> erms;
auto get_erm = [&] (table_id id) {
auto it = erms.find(id);
@@ -2039,8 +2059,8 @@ future<> view_update_generator::mutate_MV(
co_await max_concurrent_for_each(view_updates, max_concurrent_updates, [&] (frozen_mutation_and_schema mut) mutable -> future<> {
auto view_token = dht::get_token(*mut.s, mut.fm.key());
auto view_ermp = erms.at(mut.s->id());
auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, uses_nts, base_token, view_token,
uses_tablets, cf_stats);
auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, replication, base_token, view_token,
ks.uses_tablets(), cf_stats);
auto remote_endpoints = view_ermp->get_pending_replicas(view_token);
auto memory_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_update_memory_units.split(memory_usage_of(mut)));
if (no_pairing_endpoint) {

View File

@@ -303,7 +303,7 @@ endpoints_to_update get_view_natural_endpoint(
locator::host_id node,
const locator::effective_replication_map_ptr& base_erm,
const locator::effective_replication_map_ptr& view_erm,
const bool network_topology,
const locator::abstract_replication_strategy& replication_strategy,
const dht::token& base_token,
const dht::token& view_token,
bool use_tablets,

View File

@@ -352,16 +352,6 @@ dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& r
return prs;
}
future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges) {
utils::chunked_vector<dht::partition_range> prs;
prs.reserve(ranges.size());
for (auto& range : ranges) {
prs.push_back(dht::to_partition_range(range));
co_await coroutine::maybe_yield();
}
co_return prs;
}
std::map<unsigned, dht::partition_range_vector>
split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& raw_sharder) {
std::map<unsigned, dht::partition_range_vector> ret;
@@ -374,11 +364,11 @@ split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& r
return ret;
}
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> source_ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract) {
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& source_ranges, dht::partition_range_vector ranges_to_subtract) {
auto cmp = dht::ring_position_comparator(schema);
// optimize set of potentially overlapping ranges by deoverlapping them.
auto ranges = dht::partition_range::deoverlap(std::move(source_ranges), cmp);
utils::chunked_vector<dht::partition_range> res;
auto ranges = dht::partition_range::deoverlap(source_ranges, cmp);
dht::partition_range_vector res;
res.reserve(ranges.size() * 2);
auto range = ranges.begin();

View File

@@ -91,7 +91,6 @@ inline token get_token(const schema& s, partition_key_view key) {
dht::partition_range to_partition_range(dht::token_range);
dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& ranges, utils::can_yield can_yield = utils::can_yield::no);
future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges);
// Each shard gets a sorted, disjoint vector of ranges
std::map<unsigned, dht::partition_range_vector>
@@ -106,7 +105,7 @@ std::unique_ptr<dht::i_partitioner> make_partitioner(sstring name);
// Returns a sorted and deoverlapped list of ranges that are
// the result of subtracting all ranges from ranges_to_subtract.
// ranges_to_subtract must be sorted and deoverlapped.
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract);
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& ranges, dht::partition_range_vector ranges_to_subtract);
// Returns a token_range vector split based on the given number of most-significant bits
dht::token_range_vector split_token_range_msb(unsigned most_significant_bits);

View File

@@ -30,31 +30,6 @@ enum class token_kind {
after_all_keys,
};
// Represents a token for partition keys.
// Has a disengaged state, which sorts before all engaged states.
struct raw_token {
int64_t value;
/// Constructs a disengaged token.
raw_token() : value(std::numeric_limits<int64_t>::min()) {}
/// Constructs an engaged token.
/// The token must be of token_kind::key kind.
explicit raw_token(const token&);
explicit raw_token(int64_t v) : value(v) {};
std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
std::strong_ordering operator<=>(const token& o) const noexcept;
/// Returns true iff engaged.
explicit operator bool() const noexcept {
return value != std::numeric_limits<int64_t>::min();
}
};
using raw_token_opt = seastar::optimized_optional<raw_token>;
class token {
// INT64_MIN is not a legal token, but a special value used to represent
// infinity in token intervals.
@@ -77,10 +52,6 @@ public:
constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}
token(raw_token raw) noexcept
: token(raw ? kind::key : kind::before_all_keys, raw.value)
{ }
// This constructor seems redundant with the bytes_view constructor, but
// it's necessary for IDL, which passes a deserialized_bytes_proxy here.
// (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
@@ -252,29 +223,6 @@ public:
}
};
inline
raw_token::raw_token(const token& t)
: value(t.raw())
{
#ifdef DEBUG
assert(t._kind == token::kind::key);
#endif
}
inline
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
switch (o._kind) {
case token::kind::after_all_keys:
return std::strong_ordering::less;
case token::kind::before_all_keys:
// before_all_keys has a raw value set to the same raw value as a disengaged raw_token, and sorts before all keys.
// So we can order them by just comparing raw values.
[[fallthrough]];
case token::kind::key:
return value <=> o._data;
}
}
inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
if (l1 == l2) {
return std::strong_ordering::equal;
@@ -381,17 +329,6 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
}
};
template <>
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
template <typename FormatContext>
auto format(const dht::raw_token& t, FormatContext& ctx) const {
if (!t) {
return fmt::format_to(ctx.out(), "null");
}
return fmt::format_to(ctx.out(), "{}", t.value);
}
};
namespace std {
template<>

View File

@@ -281,8 +281,8 @@ For example::
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
See :ref:`WHERE <where-clause>`.
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
or columns provided in a definition of the index.
For example::
@@ -290,6 +290,10 @@ For example::
WHERE user_id = 'user123'
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
The supported operations are equal relations (``=`` and ``IN``) with restrictions as in regular ``WHERE`` clauses. See :ref:`WHERE <where-clause>`.
Other filtering scenarios are currently not supported.
.. note::
Vector indexes are supported in ScyllaDB Cloud only in clusters that have the Vector Search feature enabled.

View File

@@ -52,7 +52,7 @@ Install ScyllaDB
.. code-block:: console
:substitutions:
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
sudo wget -O /etc/apt/sources.list.d/scylla.list http://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
#. Install ScyllaDB packages.
@@ -125,7 +125,7 @@ Install ScyllaDB
.. code-block:: console
:substitutions:
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
sudo curl -o /etc/yum.repos.d/scylla.repo -L http://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
#. Install ScyllaDB packages.
@@ -133,19 +133,19 @@ Install ScyllaDB
sudo yum install scylla
Running the command installs the latest official version of ScyllaDB.
Alternatively, you can install a specific patch version:
Running the command installs the latest official version of ScyllaDB Open Source.
Alternatively, you can install a specific patch version:
.. code-block:: console
sudo yum install scylla-<your patch version>
Example: The following example shows installing ScyllaDB 2025.3.1.
Example: The following example shows the command to install ScyllaDB 5.2.3.
.. code-block:: console
:class: hide-copy-button
sudo yum install scylla-2025.3.1
sudo yum install scylla-5.2.3
.. include:: /getting-started/_common/setup-after-install.rst

View File

@@ -36,8 +36,11 @@ release versions, run:
curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
To install a non-default version, run the command with the ``--scylla-version``
option to specify the version you want to install.
Versions 2025.1 and Later
==============================
Run the command with the ``--scylla-version`` option to specify the version
you want to install.
**Example**
@@ -47,4 +50,20 @@ option to specify the version you want to install.
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|
Versions Earlier than 2025.1
================================
To install a supported version of *ScyllaDB Enterprise*, run the command with:
* ``--scylla-product scylla-enterprise`` to specify that you want to install
ScyllaDB Enterprise.
* ``--scylla-version`` to specify the version you want to install.
For example:
.. code:: console
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1
.. include:: /getting-started/_common/setup-after-install.rst

View File

@@ -1,492 +0,0 @@
=================================================
Cluster Platform Migration Using Node Cycling
=================================================
This procedure describes how to migrate a ScyllaDB cluster to new instance types
using the add-and-replace approach, which is commonly used for:
* Migrating from one CPU architecture to another (e.g., x86_64 to ARM/Graviton)
* Upgrading to newer instance types with better performance
* Changing instance families within the same cloud provider
The add-and-replace approach maintains data replication throughout the migration
and ensures zero downtime for client applications.
.. note::
This procedure does **not** change the ScyllaDB software version. All nodes
(both existing and new) must run the same ScyllaDB version. For software
version upgrades, see :doc:`Upgrade </upgrade/index>`.
Overview
--------
The add-and-replace migration follows these steps:
#. Add new nodes (on target instance type) to the existing cluster
#. Wait for data to stream to the new nodes
#. Decommission old nodes (on source instance type)
This approach keeps the cluster operational throughout the migration while
maintaining the configured replication factor.
Key characteristics
===================
* **Zero downtime**: Client applications continue to operate during migration
* **Data safety**: Replication factor is maintained throughout the process
* **Flexible**: Works with both vnodes and tablets-enabled clusters
* **Multi-DC support**: Can migrate nodes across multiple datacenters
.. warning::
Ensure your cluster has sufficient capacity during the migration. At the peak
of the process, your cluster will temporarily have double the number of nodes.
Prerequisites
-------------
Check cluster health
====================
Before starting the migration, verify that your cluster is healthy:
#. Check that all nodes are in Up Normal (UN) status:
.. code-block:: shell
nodetool status
All nodes should show ``UN`` status. Do not proceed if any nodes are down.
#. Ensure no streaming or repair operations are in progress:
.. code-block:: shell
nodetool netstats
nodetool compactionstats
Plan the migration
==================
Before provisioning new instances, plan the following:
**Instance type mapping**: Identify the source and target instance types.
If your cluster uses vnodes (not tablets), consider that mismatched shard
counts between source and target instance types can cause slower repairs.
With tablets enabled, shard count mismatch is fully supported.
**Rack assignment planning**: Each new node must be assigned to the same rack
as the node it will replace. This maintains rack-aware topology for:
* Rack-aware replication (NetworkTopologyStrategy)
* Proper data distribution across failure domains
* Minimizing data movement during decommission
Example mapping for a 3-node cluster:
.. code-block:: none
Source nodes (to be decommissioned): Target nodes (to be added):
192.168.1.10 - RACK0 → 192.168.2.10 - RACK0
192.168.1.11 - RACK1 → 192.168.2.11 - RACK1
192.168.1.12 - RACK2 → 192.168.2.12 - RACK2
Create a backup
===============
Back up the data before starting the migration. One of the following
methods can be used:
* **ScyllaDB Manager** (recommended): Use ScyllaDB Manager to perform a
cluster-wide backup. See the
`ScyllaDB Manager documentation <https://manager.docs.scylladb.com/stable/backup/>`_
for details.
* **Snapshots**: On each node in the cluster, create a snapshot:
.. code-block:: shell
nodetool snapshot -t pre_migration_backup
nodetool listsnapshots
.. note::
Snapshots are local to each node and do not protect against node or disk
failure. For full disaster recovery, use ScyllaDB Manager backup.
Procedure
---------
Adding new nodes
================
#. Provision new instances with the target instance type. Ensure:
* The same ScyllaDB version as existing nodes
* Same network configuration and security groups
* Appropriate storage configuration
#. On each new node, configure ``/etc/scylla/scylla.yaml`` to join the existing
cluster:
* **cluster_name**: Must match the existing cluster name
* **seeds**: IP address of an existing node in the cluster (used to discover cluster topology on join)
* **endpoint_snitch**: Must match the existing cluster configuration
* **listen_address**: IP address of the new node
* **rpc_address**: IP address of the new node
All other cluster-wide settings (tablets configuration, encryption settings,
experimental features, etc.) must match the existing nodes.
.. caution::
Make sure that the ScyllaDB version on the new node is identical to the
version on the other nodes in the cluster. Running nodes with different
versions is not supported.
#. If using ``GossipingPropertyFileSnitch``, configure
``/etc/scylla/cassandra-rackdc.properties`` with the correct datacenter
and rack assignment for this node:
.. code-block:: none
dc = <datacenter-name>
rack = <rack-name>
prefer_local = true
.. warning::
Each node must have the correct rack assignment. Using the same rack for
all new nodes breaks rack-aware replication topology.
#. Start ScyllaDB on the new node:
.. code-block:: shell
sudo systemctl start scylla-server
For Docker deployments:
.. code-block:: shell
docker exec -it <container-name> supervisorctl start scylla
#. Monitor the bootstrap process from an existing node:
.. code-block:: shell
nodetool status
The new node will appear with ``UJ`` (Up, Joining) status while streaming
data from existing nodes. Wait until it transitions to ``UN`` (Up, Normal).
**Example output during bootstrap:**
.. code-block:: shell
Datacenter: dc1
Status=Up/Down
State=Normal/Leaving/Joining/Moving
-- Address Load Tokens Owns Host ID Rack
UN 192.168.1.10 500 MB 256 33.3% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
UN 192.168.1.11 500 MB 256 33.3% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
UN 192.168.1.12 500 MB 256 33.3% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
UJ 192.168.2.10 250 MB 256 ? a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
**Example output after bootstrap completes:**
.. code-block:: shell
Datacenter: dc1
Status=Up/Down
State=Normal/Leaving/Joining/Moving
-- Address Load Tokens Owns Host ID Rack
UN 192.168.1.10 400 MB 256 25.0% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
UN 192.168.1.11 400 MB 256 25.0% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
UN 192.168.1.12 400 MB 256 25.0% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
UN 192.168.2.10 400 MB 256 25.0% a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
#. For tablets-enabled clusters, wait for tablet load balancing to complete.
After the node reaches ``UN`` status, verify no streaming is in progress:
.. code-block:: shell
nodetool netstats
Wait until output shows "Not sending any streams" and no active receiving streams.
#. Repeat steps 1-6 for each new node to be added.
.. note::
You can add multiple nodes in parallel if they are in different datacenters.
Within a single datacenter, add nodes one at a time for best results.
Updating seed node configuration
================================
If any of your original nodes are configured as seed nodes, you must update
the seed configuration before decommissioning them.
#. Check the current seed configuration on any node:
.. code-block:: shell
grep -A 4 "seed_provider" /etc/scylla/scylla.yaml
#. If the seeds include nodes you plan to decommission, update ``scylla.yaml``
on **all new nodes** to use the new node IPs as seeds:
.. code-block:: yaml
seed_provider:
- class_name: org.apache.cassandra.locator.SimpleSeedProvider
parameters:
- seeds: "192.168.2.10,192.168.2.11,192.168.2.12"
.. note::
Updating seed configuration on the **old nodes** (that will be
decommissioned) is optional. Seeds are only used during node startup
to discover the cluster. If you don't plan to restart the old nodes
before decommissioning them, their seed configuration doesn't matter.
However, updating all nodes is recommended for safety in case an old
node unexpectedly restarts during the migration.
#. Restart ScyllaDB on each new node (one at a time) to apply the new seed
configuration:
.. code-block:: shell
sudo systemctl restart scylla-server
Wait for the node to fully start before restarting the next node.
#. After restarting the new nodes, verify the cluster is healthy:
.. code-block:: shell
nodetool status
nodetool describecluster
.. warning::
Complete this seed list update on **all new nodes** before decommissioning
any old nodes. This ensures the new nodes can reform the cluster after
the old nodes are removed.
Decommissioning old nodes
=========================
After all new nodes are added and healthy, decommission the old nodes one
at a time.
#. Verify all nodes are healthy before starting decommission:
.. code-block:: shell
nodetool status
All nodes should show ``UN`` status.
#. On the node to be decommissioned, run:
.. code-block:: shell
nodetool decommission
This command blocks until the decommission is complete. The node will
stream its data to the remaining nodes.
#. Monitor the decommission progress from another node:
.. code-block:: shell
nodetool status
The decommissioning node will transition from ``UN`` → ``UL`` (Up, Leaving)
→ removed from the cluster.
You can also monitor streaming progress:
.. code-block:: shell
nodetool netstats
#. After decommission completes, verify the node is no longer in the cluster:
.. code-block:: shell
nodetool status
The decommissioned node should no longer appear in the output.
#. Run ``nodetool cleanup`` on the remaining nodes to remove data that
no longer belongs to them after the topology change:
.. code-block:: shell
nodetool cleanup
.. note::
``nodetool cleanup`` can be resource-intensive. Run it on one node at a
time during low-traffic periods.
#. Wait for the cluster to stabilize before decommissioning the next node.
Ensure no streaming operations are in progress.
#. Repeat steps 1-7 for each old node to be decommissioned.
Post-migration verification
---------------------------
After all old nodes are decommissioned, verify the migration was successful.
Verify cluster topology
=======================
.. code-block:: shell
nodetool status
Confirm:
* All nodes show ``UN`` (Up, Normal) status
* Only the new instance type nodes are present
* Nodes are balanced across racks
Verify schema agreement
=======================
.. code-block:: shell
nodetool describecluster
All nodes should report the same schema version.
Verify data connectivity
========================
Connect to the cluster and run a test query:
.. code-block:: shell
cqlsh <node-ip> -e "SELECT count(*) FROM system_schema.keyspaces;"
.. note::
If ScyllaDB is configured with ``listen_interface``, you must use the
node's interface IP address (not localhost) for cqlsh connections.
Verify ScyllaDB version
=======================
Confirm all nodes are running the same ScyllaDB version:
.. code-block:: shell
scylla --version
Verify data integrity (optional)
================================
Run data validation on each keyspace to verify sstable integrity:
.. code-block:: shell
nodetool scrub --mode=VALIDATE <keyspace_name>
Rollback
--------
If issues occur during the migration, you can roll back by reversing the
procedure.
During add phase
================
If a new node fails to bootstrap:
#. Stop ScyllaDB on the new node:
.. code-block:: shell
sudo systemctl stop scylla-server
#. From an existing node, remove the failed node:
.. code-block:: shell
nodetool removenode <host-id-of-failed-node>
During decommission phase
=========================
If a decommission operation gets stuck:
#. If the node is still reachable, try stopping and restarting ScyllaDB
#. If the node is unresponsive, from another node:
.. code-block:: shell
nodetool removenode <host-id>
See :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
for more details.
Full rollback
=============
To roll back after the migration is complete (all nodes on new instance type),
apply the same add-and-replace procedure in reverse:
#. Add new nodes on the original instance type
#. Wait for data streaming to complete
#. Decommission the nodes on the new instance type
Troubleshooting
---------------
Node stuck in Joining (UJ) state
================================
If a new node remains in ``UJ`` state for an extended period:
* Check ScyllaDB logs for streaming errors: ``journalctl -u scylla-server``
* Verify network connectivity between nodes
* Ensure sufficient disk space on all nodes
* Check for any ongoing operations that may be blocking
Decommission taking too long
============================
Decommission duration depends on data size. If it appears stuck:
* Check streaming progress: ``nodetool netstats``
* Look for errors in ScyllaDB logs
* Verify network bandwidth between nodes
Schema disagreement
===================
If nodes report different schema versions:
* Wait a few minutes for schema to propagate
* If disagreement persists, restart the nodes one by one
* Run ``nodetool describecluster`` to verify agreement
Additional resources
--------------------
* :doc:`Adding a New Node Into an Existing ScyllaDB Cluster </operating-scylla/procedures/cluster-management/add-node-to-cluster>`
* :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
* :doc:`Replace a Running Node in a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/replace-running-node>`
* :doc:`Upgrade </upgrade/index>`

View File

@@ -26,7 +26,6 @@ Cluster Management Procedures
Safely Restart Your Cluster <safe-start>
repair-based-node-operation
Prevent Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>
Cluster Platform Migration <cluster-platform-migration>
.. panel-box::
@@ -86,8 +85,6 @@ Cluster Management Procedures
* :doc:`Preventing Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>`
* :doc:`Cluster Platform Migration Using Node Cycling </operating-scylla/procedures/cluster-management/cluster-platform-migration>`
.. panel-box::
:title: Topology Changes
:id: "getting-started"

View File

@@ -57,11 +57,12 @@ To enable shared dictionaries:
internode_compression_enable_advanced: true
rpc_dict_training_when: when_leader
.. note::
.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
Some dictionary training data may be encrypted using storage-level encryption
(if enabled) instead of database-level encryption, meaning protection is
applied at the storage layer rather than within the database itself.
Trained dictionaries contain randomly chosen samples of data transferred between
nodes. The data samples are persisted in the Raft log, which is not encrypted.
As a result, some data from otherwise encrypted tables might be stored on disk
unencrypted.
Reference

View File

@@ -42,14 +42,7 @@ void everywhere_replication_strategy::validate_options(const gms::feature_servic
sstring everywhere_replication_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
const auto replication_factor = erm.get_replication_factor();
if (const auto& topo_info = erm.get_token_metadata().get_topology_change_info(); topo_info && topo_info->read_new) {
if (read_replicas.size() > replication_factor + 1) {
return seastar::format(
"everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, "
"cannot be higher than replication factor {} + 1 during the 'read from new replicas' stage of a topology change",
read_replicas.size(), replication_factor);
}
} else if (read_replicas.size() > replication_factor) {
if (read_replicas.size() > replication_factor) {
return seastar::format("everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, cannot be higher than replication factor {}", read_replicas.size(), replication_factor);
}
return {};

View File

@@ -261,7 +261,7 @@ static collection_mutation serialize_collection_mutation(
writev(v.serialize());
}
return collection_mutation(type, std::move(ret));
return collection_mutation(type, ret);
}
collection_mutation collection_mutation_description::serialize(const abstract_type& type) const {

View File

@@ -103,7 +103,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
.entity = stats.entity,
.progress_units = "",
.progress = tasks::task_manager::task::progress{},
.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr())
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
};
}

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:088a9d7e165d33436eb3029ab092582cbae61f0e17486c226d8947ff44658c78
size 6535832
oid sha256:52c9772c9ac334650d8b179b591c47769ee38d34fad784b61c682e11c03f2506
size 6530196

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5f0c0709f9724cd3a545ebcc50ed587f28b2424d55e2334ac2db5d917903bcaf
size 6536892
oid sha256:d1a869ebfe4e90d9681499246eb86bb032ae402c350357e19d97b989037a5bd3
size 6528308

View File

@@ -1021,8 +1021,8 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
on_internal_error_noexcept(rcslog,
format("reader_concurrency_semaphore::signal(): semaphore {} detected resource leak, available {} exceeds initial {}", _name,
_resources, _initial_resources));
_resources.count = std::min(_resources.count, _initial_resources.count);
_resources.memory = std::min(_resources.memory, _initial_resources.memory);
_resources.count = std::max(_resources.count, _initial_resources.count);
_resources.memory = std::max(_resources.memory, _initial_resources.memory);
}
maybe_wake_execution_loop();
}

View File

@@ -432,9 +432,7 @@ public:
// refresh_mutation_source must be called when there are changes to data source
// structures but logical state of data is not changed (e.g. when state for a
// new tablet replica is allocated).
virtual void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source) = 0;
virtual void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;
virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;
@@ -444,7 +442,7 @@ public:
virtual storage_group& storage_group_for_token(dht::token) const = 0;
virtual utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const = 0;
virtual locator::combined_load_stats table_load_stats() const = 0;
virtual locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const = 0;
virtual bool all_storage_groups_split() = 0;
virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;

View File

@@ -1697,7 +1697,7 @@ static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit(
if (!range.is_singular()) {
continue;
}
auto token = dht::token::to_int64(range.start()->value().token());
auto token = dht::token::to_int64(ranges.front().start()->value().token());
if (limiter.account_operation(read_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) {
// Don't return immediately - account all ranges first
ret = can_proceed::no;

View File

@@ -1129,7 +1129,9 @@ public:
return _stats;
}
locator::combined_load_stats table_load_stats() const;
// The tablet filter is used to not double account migrating tablets, so it's important that
// only one of pending or leaving replica is accounted based on current migration stage.
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const;
const db::view::stats& get_view_stats() const {
return _view_stats;

View File

@@ -711,9 +711,7 @@ public:
return make_ready_future<>();
}
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source) override {}
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override {}
compaction_group& compaction_group_for_token(dht::token token) const override {
return get_compaction_group();
@@ -736,7 +734,7 @@ public:
return *_single_sg;
}
locator::combined_load_stats table_load_stats() const override {
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)>) const override {
return locator::combined_load_stats{
.table_ls = locator::table_load_stats{
.size_in_bytes = _single_sg->live_disk_space_used(),
@@ -759,11 +757,6 @@ public:
}
};
struct background_merge_guard {
compaction::compaction_reenabler compaction_guard;
locator::effective_replication_map_ptr erm_guard;
};
class tablet_storage_group_manager final : public storage_group_manager {
replica::table& _t;
locator::host_id _my_host_id;
@@ -784,7 +777,7 @@ class tablet_storage_group_manager final : public storage_group_manager {
utils::phased_barrier _merge_fiber_barrier;
std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
// Holds compaction reenabler which disables compaction temporarily during tablet merge
std::vector<background_merge_guard> _compaction_reenablers_for_merging;
std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
private:
const schema_ptr& schema() const {
return _t.schema();
@@ -808,8 +801,7 @@ private:
// Called when coordinator executes tablet merge. Tablet ids X and X+1 are merged into
// the new tablet id (X >> 1). In practice, that means storage groups for X and X+1
// are merged into a new storage group with id (X >> 1).
void handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
void handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
// When merge completes, compaction groups of sibling tablets are added to same storage
// group, but they're not merged yet into one, since the merge completion handler happens
@@ -903,9 +895,7 @@ public:
std::exchange(_stop_fut, make_ready_future())).discard_result();
}
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source) override;
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;
compaction_group& compaction_group_for_token(dht::token token) const override;
utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const override;
@@ -919,7 +909,7 @@ public:
return storage_group_for_id(storage_group_of(token).first);
}
locator::combined_load_stats table_load_stats() const override;
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const override;
bool all_storage_groups_split() override;
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
future<> maybe_split_compaction_group_of(size_t idx) override;
@@ -2943,108 +2933,17 @@ void table::on_flush_timer() {
});
}
// The following functions return true if we should return the tablet size of a tablet in
// migration depending on its transition stage and whether it is a leaving or pending replica
// Returns true when the leaving replica of a migrating tablet should still
// report the tablet's size for the given transition stage. The leaving
// replica keeps reporting a size until the cleanup stage, at which point its
// copy of the data is removed and the size would no longer be meaningful.
bool has_size_on_leaving (locator::tablet_transition_stage stage) {
    switch (stage) {
    // In all of these stages the leaving replica still holds the tablet's
    // data, so its reported size is valid.
    case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
    case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
    case locator::tablet_transition_stage::streaming: [[fallthrough]];
    case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
    case locator::tablet_transition_stage::use_new: [[fallthrough]];
    case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
    case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
    case locator::tablet_transition_stage::rebuild_repair: [[fallthrough]];
    case locator::tablet_transition_stage::repair: [[fallthrough]];
    case locator::tablet_transition_stage::end_repair:
        return true;
    // From cleanup onwards the leaving replica's data is being (or has been)
    // removed, so its size must not be reported.
    case locator::tablet_transition_stage::cleanup: [[fallthrough]];
    case locator::tablet_transition_stage::end_migration:
        return false;
    }
}
// Returns true when the pending replica of a migrating tablet should report
// the tablet's size for the given transition stage. The pending replica only
// has a meaningful size once streaming has populated it with the tablet's
// data, i.e. from write_both_read_new onwards.
bool has_size_on_pending (locator::tablet_transition_stage stage) {
    switch (stage) {
    // Before (and during) streaming the pending replica does not yet hold the
    // full data, so its size would under-report the tablet and is skipped.
    case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
    case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
    case locator::tablet_transition_stage::streaming: [[fallthrough]];
    case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
    case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
    case locator::tablet_transition_stage::rebuild_repair:
        return false;
    // After streaming completes the pending replica holds the tablet's data
    // and its size is valid to report.
    case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
    case locator::tablet_transition_stage::use_new: [[fallthrough]];
    case locator::tablet_transition_stage::cleanup: [[fallthrough]];
    case locator::tablet_transition_stage::end_migration: [[fallthrough]];
    case locator::tablet_transition_stage::repair: [[fallthrough]];
    case locator::tablet_transition_stage::end_repair:
        return true;
    }
}
locator::combined_load_stats tablet_storage_group_manager::table_load_stats() const {
locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
locator::table_load_stats table_stats;
table_stats.split_ready_seq_number = _split_ready_seq_number;
locator::tablet_load_stats tablet_stats;
for_each_storage_group([&] (size_t id, storage_group& sg) {
auto tid = locator::tablet_id(id);
locator::global_tablet_id gid { _t.schema()->id(), tid };
locator::tablet_replica me { _my_host_id, this_shard_id() };
const uint64_t tablet_size = sg.live_disk_space_used();
auto transition = _tablet_map->get_tablet_transition_info(tid);
auto& info = _tablet_map->get_tablet_info(tid);
bool is_pending = transition && transition->pending_replica == me;
bool is_leaving = transition && locator::get_leaving_replica(info, *transition) == me;
// It's important to tackle the anomaly in reported size, since both leaving and
// pending replicas could otherwise be accounted during tablet migration.
// If transition hasn't reached write_both_read_new stage, then leaving replicas are accounted.
// Otherwise, pending replicas are accounted.
// This helps to reduce the discrepancy window.
auto table_size_filter = [&] () {
// if tablet is not in transit, it's filtered in.
if (!transition) {
return true;
}
auto s = transition->reads; // read selector
return (!is_pending && !is_leaving)
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|| (is_pending && s == locator::read_replica_set_selector::next);
};
// When a tablet is in migration, we want to send its size during any migration stage when
// we still know the tablet's size. This way the balancer will have better information about
// tablet sizes, and we reduce the chance that the node will be ignored during balancing
// due to missing tablet size. On the leaving replica we include tablets until the use_new
// stage (inclusive), and on the pending we include tablets after the streaming stage.
// There is an overlap in tablet sizes (we report sizes on both the leaving and pending
// replicas for some stages), but that should not be a problem.
auto tablet_size_filter = [&] () {
// if tablet is not in transit, it's filtered in.
if (!transition) {
return true;
}
if (is_leaving) {
return has_size_on_leaving(transition->stage);
} else if (is_pending) {
return has_size_on_pending(transition->stage);
}
return true;
};
if (table_size_filter()) {
locator::global_tablet_id gid { _t.schema()->id(), locator::tablet_id(id) };
if (tablet_filter(*_tablet_map, gid)) {
const uint64_t tablet_size = sg.live_disk_space_used();
table_stats.size_in_bytes += tablet_size;
}
if (tablet_size_filter()) {
const dht::token_range trange = _tablet_map->get_token_range(gid.tablet);
// Make sure the token range is in the form (a, b]
SCYLLA_ASSERT(!trange.start()->is_inclusive() && trange.end()->is_inclusive());
@@ -3057,8 +2956,8 @@ locator::combined_load_stats tablet_storage_group_manager::table_load_stats() co
};
}
locator::combined_load_stats table::table_load_stats() const {
return _sg_manager->table_load_stats();
locator::combined_load_stats table::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
return _sg_manager->table_load_stats(std::move(tablet_filter));
}
void tablet_storage_group_manager::handle_tablet_split_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
@@ -3170,9 +3069,7 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
}
}
void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
const locator::tablet_map& old_tmap,
const locator::tablet_map& new_tmap) {
void tablet_storage_group_manager::handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
auto table_id = schema()->id();
size_t old_tablet_count = old_tmap.tablet_count();
size_t new_tablet_count = new_tmap.tablet_count();
@@ -3196,7 +3093,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effec
auto new_cg = make_lw_shared<compaction_group>(_t, new_tid, new_range, make_repair_sstable_classifier_func());
for (auto& view : new_cg->all_views()) {
auto cre = _t.get_compaction_manager().stop_and_disable_compaction_no_wait(*view, "tablet merging");
_compaction_reenablers_for_merging.push_back(background_merge_guard{std::move(cre), old_erm});
_compaction_reenablers_for_merging.push_back(std::move(cre));
}
auto new_sg = make_lw_shared<storage_group>(std::move(new_cg));
@@ -3229,11 +3126,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effec
_merge_completion_event.signal();
}
void tablet_storage_group_manager::update_effective_replication_map(
const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source)
{
void tablet_storage_group_manager::update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) {
auto* new_tablet_map = &erm.get_token_metadata().tablets().get_tablet_map(schema()->id());
auto* old_tablet_map = std::exchange(_tablet_map, new_tablet_map);
@@ -3249,7 +3142,7 @@ void tablet_storage_group_manager::update_effective_replication_map(
if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
}
handle_tablet_merge_completion(old_erm, *old_tablet_map, *new_tablet_map);
handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
}
// Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
@@ -3335,7 +3228,7 @@ void table::update_effective_replication_map(locator::effective_replication_map_
};
if (uses_tablets()) {
_sg_manager->update_effective_replication_map(old_erm, *_erm, refresh_mutation_source);
_sg_manager->update_effective_replication_map(*_erm, refresh_mutation_source);
}
if (old_erm) {
old_erm->invalidate();
@@ -3797,6 +3690,7 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
tlogger.debug("Taking snapshot of {}.{}: name={}", s->ks_name(), s->cf_name(), name);
std::vector<snapshot_sstable_set> sstable_sets(smp::count);
std::vector<int64_t> tablet_counts(smp::count);
co_await writer->init();
co_await smp::invoke_on_all([&] -> future<> {
@@ -3804,6 +3698,7 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
auto [tables, permit] = co_await t.snapshot_sstables();
auto sstables_metadata = co_await t.get_sstables_manager().take_snapshot(std::move(tables), name);
sstable_sets[this_shard_id()] = make_foreign(std::make_unique<utils::chunked_vector<sstables::sstable_snapshot_metadata>>(std::move(sstables_metadata)));
tablet_counts[this_shard_id()] = t.calculate_tablet_count();
});
co_await writer->sync();
@@ -3817,13 +3712,12 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
});
tlogger.debug("snapshot {}: seal_snapshot", name);
const auto& topology = sharded_db.local().get_token_metadata().get_topology();
std::optional<int64_t> tablet_count;
std::optional<int64_t> min_tablet_count;
if (t.uses_tablets()) {
auto erm = t.get_effective_replication_map();
auto& tm = erm->get_token_metadata().tablets().get_tablet_map(s->id());
tablet_count = tm.tablet_count();
SCYLLA_ASSERT(!tablet_counts.empty());
min_tablet_count = *std::ranges::min_element(tablet_counts);
}
co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, tablet_count).handle_exception([&] (std::exception_ptr ptr) {
co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, min_tablet_count).handle_exception([&] (std::exception_ptr ptr) {
tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr);
ex = std::move(ptr);
});
@@ -3881,7 +3775,6 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
}
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
auto close_lister = deferred_close(lister);
while (auto de = lister.get().get()) {
auto snapshot_name = de->name;
all_snapshots.emplace(snapshot_name, snapshot_details());
@@ -3889,9 +3782,6 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
auto& sd = all_snapshots.at(snapshot_name);
sd.total += details.total;
sd.live += details.live;
utils::get_local_injector().inject("get_snapshot_details", [&] (auto& handler) -> future<> {
throw std::runtime_error("Injected exception in get_snapshot_details");
}).get();
}
}
return all_snapshots;
@@ -3911,66 +3801,53 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
}
auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
std::exception_ptr ex;
try {
while (auto de = co_await lister.get()) {
const auto& name = de->name;
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
auto size = sd.allocated_size;
while (auto de = co_await lister.get()) {
const auto& name = de->name;
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
auto size = sd.allocated_size;
utils::get_local_injector().inject("per-snapshot-get_snapshot_details", [&] (auto& handler) -> future<> {
throw std::runtime_error("Injected exception in per-snapshot-get_snapshot_details");
}).get();
// The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
//
// All the others should just generate an exception: there is something wrong, so don't blindly
// add it to the size.
if (name != "manifest.json" && name != "schema.cql") {
details.total += size;
if (sd.number_of_links == 1) {
// File exists only in the snapshot directory.
details.live += size;
continue;
}
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
// So check the datadir for the file too.
} else {
// The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
//
// All the others should just generate an exception: there is something wrong, so don't blindly
// add it to the size.
if (name != "manifest.json" && name != "schema.cql") {
details.total += size;
if (sd.number_of_links == 1) {
// File exists only in the snapshot directory.
details.live += size;
continue;
}
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
try {
// File exists in the main SSTable directory. Snapshots are not contributing to size
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
co_return false;
}
co_return true;
} catch (std::system_error& e) {
if (e.code() != std::error_code(ENOENT, std::system_category())) {
throw;
}
co_return false;
}
};
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
!co_await exists_in_dir(data_directory, datadir, name)) {
details.live += size;
}
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
// So check the datadir for the file too.
} else {
continue;
}
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
try {
// File exists in the main SSTable directory. Snapshots are not contributing to size
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
co_return false;
}
co_return true;
} catch (std::system_error& e) {
if (e.code() != std::error_code(ENOENT, std::system_category())) {
throw;
}
co_return false;
}
};
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
!co_await exists_in_dir(data_directory, datadir, name)) {
details.live += size;
}
} catch (...) {
ex = std::current_exception();
}
co_await lister.close();
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
co_return details;

View File

@@ -263,9 +263,8 @@ public:
// Enables the dedicated schema commitlog for this table by forwarding to
// the builder's static properties (see _static_props).
void enable_schema_commitlog() {
_static_props.enable_schema_commitlog();
}
void set_is_group0_table() {
_static_props.is_group0_table = true;
enable_schema_commitlog();
void set_is_group0_table(bool enabled = true) {
_static_props.is_group0_table = enabled;
}
class default_names {

View File

@@ -454,7 +454,7 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
auto ps_ptr = qp.get_prepared(cache_key);
if (!ps_ptr) {
const auto msg_ptr = co_await qp.prepare(req, qs, cql3::internal_dialect());
ps_ptr = msg_ptr->get_prepared();
ps_ptr = std::move(msg_ptr->get_prepared());
if (!ps_ptr) {
on_internal_error(paxos_state::logger, "prepared statement is null");
}

View File

@@ -948,10 +948,6 @@ future<> service_level_controller::migrate_to_v2(size_t nodes_count, db::system_
qs,
{},
cql3::query_processor::cache_internal::no);
if (rows->empty()) {
co_return;
}
auto col_names = schema->all_columns() | std::views::transform([] (const auto& col) {return col.name_as_cql_string(); }) | std::ranges::to<std::vector<sstring>>();
auto col_names_str = fmt::to_string(fmt::join(col_names, ", "));

View File

@@ -350,10 +350,6 @@ static void ensure_group0_schema(const group0_command& cmd, const replica::datab
if (!schema->static_props().is_group0_table) {
on_internal_error(slogger, fmt::format("ensure_group0_schema: schema is not group0: {}", schema->cf_name()));
}
if (!schema->static_props().use_schema_commitlog) {
on_internal_error(slogger, fmt::format("ensure_group0_schema: group0 table {} does not use schema commitlog", schema->cf_name()));
}
}
};

View File

@@ -559,7 +559,6 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
group0_id = g0_info.group0_id;
raft::server_address my_addr{my_id, {}};
bool starting_server_as_follower = false;
if (server == nullptr) {
// This is the first time discovery is run. Create and start a Raft server for group 0 on this node.
raft::configuration initial_configuration;
@@ -587,7 +586,6 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
// trigger an empty snapshot transfer.
nontrivial_snapshot = true;
} else {
starting_server_as_follower = true;
co_await handshaker->pre_server_start(g0_info);
}
@@ -616,9 +614,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
}
SCYLLA_ASSERT(server);
co_await utils::get_local_injector().inject("join_group0_pause_before_config_check",
utils::wait_for_message(std::chrono::minutes{5}));
if (!starting_server_as_follower && server->get_configuration().contains(my_id)) {
if (server->get_configuration().contains(my_id)) {
// True if we started a new group or completed a configuration change initiated earlier.
group0_log.info("server {} already in group 0 (id {}) as {}", my_id, group0_id,
server->get_configuration().can_vote(my_id)? "voter" : "non-voter");

View File

@@ -6156,57 +6156,6 @@ future<> storage_service::snitch_reconfigured() {
}
}
// Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
// Bounces to shard 0 first (the topology state machine version is read there), then on
// every shard waits for stale (non-latest) token metadata versions to be released and
// for closing topology sessions to drain.
future<> storage_service::local_topology_barrier() {
// The topology version must be sampled on shard 0, so redirect and recurse.
if (this_shard_id() != 0) {
co_await container().invoke_on(0, [] (storage_service& ss) {
return ss.local_topology_barrier();
});
co_return;
}
auto version = _topology_state_machine._topology.version;
// Test-only error-injection hooks.
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
});
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
// Sanity check: while in the double-write state, every transitioning node's IP
// should be resolvable; log (but do not fail) if one is missing.
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
for (auto& n : _topology_state_machine._topology.transition_nodes) {
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
break;
}
}
}
// Run the actual barrier on all shards against the version captured above.
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
const auto current_version = ss._shared_token_metadata.get()->get_version();
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
version, current_version);
// This shouldn't happen under normal operation, it's only plausible
// if the topology change coordinator has
// moved to another node and managed to update the topology
// parallel to this method. The previous coordinator
// should be inactive now, so it won't observe this
// exception. By returning exception we aim
// to reveal any other conditions where this may arise.
if (current_version != version) {
co_await coroutine::return_exception(std::runtime_error(
::format("raft topology: command::barrier_and_drain, the version has changed, "
"version {}, current_version {}, the topology change coordinator "
" had probably migrated to another node",
version, current_version)));
}
co_await ss._shared_token_metadata.stale_versions_in_use();
co_await get_topology_session_manager().drain_closing_sessions();
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
});
}
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, const raft_topology_cmd& cmd) {
raft_topology_cmd_result result;
rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
@@ -6234,6 +6183,12 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
state.last_index = cmd_index;
}
// We capture the topology version right after the checks
// above, before any yields. This is crucial since _topology_state_machine._topology
// might be altered concurrently while this method is running,
// which can cause the fence command to apply an invalid fence version.
const auto version = _topology_state_machine._topology.version;
switch (cmd.cmd) {
case raft_topology_cmd::command::barrier: {
utils::get_local_injector().inject("raft_topology_barrier_fail",
@@ -6272,7 +6227,43 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
}
break;
case raft_topology_cmd::command::barrier_and_drain: {
co_await local_topology_barrier();
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
});
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
for (auto& n : _topology_state_machine._topology.transition_nodes) {
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
break;
}
}
}
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
const auto current_version = ss._shared_token_metadata.get()->get_version();
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
version, current_version);
// This shouldn't happen under normal operation, it's only plausible
// if the topology change coordinator has
// moved to another node and managed to update the topology
// parallel to this method. The previous coordinator
// should be inactive now, so it won't observe this
// exception. By returning exception we aim
// to reveal any other conditions where this may arise.
if (current_version != version) {
co_await coroutine::return_exception(std::runtime_error(
::format("raft topology: command::barrier_and_drain, the version has changed, "
"version {}, current_version {}, the topology change coordinator "
" had probably migrated to another node",
version, current_version)));
}
co_await ss._shared_token_metadata.stale_versions_in_use();
co_await get_topology_session_manager().drain_closing_sessions();
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
});
co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
auto ks = handler.get("keyspace");
@@ -7368,8 +7359,34 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
if (!table) {
continue;
}
auto erm = table->get_effective_replication_map();
auto& token_metadata = erm->get_token_metadata();
auto me = locator::tablet_replica { token_metadata.get_my_id(), this_shard_id() };
locator::combined_load_stats combined_ls { table->table_load_stats() };
// It's important to tackle the anomaly in reported size, since both leaving and
// pending replicas could otherwise be accounted during tablet migration.
// If transition hasn't reached cleanup stage, then leaving replicas are accounted.
// If transition is past cleanup stage, then pending replicas are accounted.
// This helps to reduce the discrepancy window.
auto tablet_filter = [&me] (const locator::tablet_map& tmap, locator::global_tablet_id id) {
auto transition = tmap.get_tablet_transition_info(id.tablet);
auto& info = tmap.get_tablet_info(id.tablet);
// if tablet is not in transit, it's filtered in.
if (!transition) {
return true;
}
bool is_pending = transition->pending_replica == me;
bool is_leaving = locator::get_leaving_replica(info, *transition) == me;
auto s = transition->reads; // read selector
return (!is_pending && !is_leaving)
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|| (is_pending && s == locator::read_replica_set_selector::next);
};
locator::combined_load_stats combined_ls { table->table_load_stats(tablet_filter) };
load_stats.tables.emplace(id, std::move(combined_ls.table_ls));
tablet_sizes_per_shard[this_shard_id()].size += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);

View File

@@ -944,9 +944,6 @@ public:
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
future<> initialize_done_topology_upgrade_state();
// Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
// In particular, waits for non-latest local effective replication maps (erms) to be released.
future<> storage_service::local_topology_barrier();
private:
// State machine that is responsible for topology change
topology_state_machine& _topology_state_machine;

View File

@@ -21,6 +21,7 @@ namespace service {
// Aggregated result of a tablet virtual-task status query.
struct status_helper {
// Status to report back to the task manager.
tasks::task_status status;
// Tablets belonging to the queried task (populated for repair and migration tasks).
utils::chunked_vector<locator::tablet_id> tablets;
// Pending replica of the tablet; set for migration tasks, disengaged otherwise.
std::optional<locator::tablet_replica> pending_replica;
};
@@ -147,40 +148,18 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
}
tasks::tmlogger.info("tablet_virtual_task: wait until tablet operation is finished");
co_await utils::get_local_injector().inject("tablet_virtual_task_wait", utils::wait_for_message(60s));
while (true) {
co_await _ss._topology_state_machine.event.wait([&] {
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
return true;
}
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
if (is_resize_task(task_type)) { // Resize task.
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
} else if (tablet_id_opt.has_value()) { // Migration task.
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
} else { // Repair task.
return true;
}
});
if (!is_repair_task(task_type)) {
break;
co_await _ss._topology_state_machine.event.wait([&] {
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
if (is_resize_task(task_type)) { // Resize task.
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
} else if (tablet_id_opt.has_value()) { // Migration task.
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
} else { // Repair task.
return std::all_of(res->tablets.begin(), res->tablets.end(), [&] (const locator::tablet_id& tablet) {
return tmap.get_tablet_info(tablet).repair_task_info.tablet_task_id.uuid() != id.uuid();
});
}
auto tmptr = _ss.get_token_metadata_ptr();
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
break;
}
auto& tmap = tmptr->tablets().get_tablet_map(table);
bool repair_still_running = false;
co_await tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& info) {
repair_still_running = repair_still_running || (info.repair_task_info.is_valid() && info.repair_task_info.tablet_task_id.uuid() == id.uuid());
return make_ready_future();
});
if (!repair_still_running) {
break;
}
}
});
res->status.state = tasks::task_manager::task_state::done; // Failed repair task is retried.
if (is_migration_task(task_type)) {
@@ -190,9 +169,9 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
} else if (is_resize_task(task_type)) {
auto new_tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
res->status.state = new_tablet_count == tablet_count ? tasks::task_manager::task_state::suspended : tasks::task_manager::task_state::done;
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
} else {
res->status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
res->status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
}
res->status.end_time = db_clock::now(); // FIXME: Get precise end time.
co_return res->status;
@@ -278,7 +257,6 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
auto& tmap = tmptr->tablets().get_tablet_map(table);
bool repair_task_finished = false;
bool repair_task_pending = false;
bool no_tablets_processed = true;
if (is_repair_task(task_type)) {
auto progress = co_await _ss._repair.local().get_tablet_repair_task_progress(id);
if (progress) {
@@ -295,37 +273,37 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
auto& task_info = info.repair_task_info;
if (task_info.tablet_task_id.uuid() == id.uuid()) {
update_status(task_info, res.status, sched_nr);
no_tablets_processed = false;
res.tablets.push_back(tid);
}
return make_ready_future();
});
res.status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
res.status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
} else if (is_migration_task(task_type)) { // Migration task.
auto tablet_id = hint.get_tablet_id();
res.pending_replica = tmap.get_tablet_transition_info(tablet_id)->pending_replica;
auto& task_info = tmap.get_tablet_info(tablet_id).migration_task_info;
if (task_info.tablet_task_id.uuid() == id.uuid()) {
update_status(task_info, res.status, sched_nr);
no_tablets_processed = false;
res.tablets.push_back(tablet_id);
}
} else { // Resize task.
auto& task_info = tmap.resize_task_info();
if (task_info.tablet_task_id.uuid() == id.uuid()) {
update_status(task_info, res.status, sched_nr);
res.status.state = tasks::task_manager::task_state::running;
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
co_return res;
}
}
if (!no_tablets_processed) {
if (!res.tablets.empty()) {
res.status.state = sched_nr == 0 ? tasks::task_manager::task_state::created : tasks::task_manager::task_state::running;
co_return res;
}
if (repair_task_pending) {
// When repair_task_pending is true, the res.tablets will be empty iff the request is aborted by user.
res.status.state = no_tablets_processed ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
res.status.state = res.tablets.empty() ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
co_return res;
}
if (repair_task_finished) {

View File

@@ -2193,19 +2193,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
_tablet_allocator.set_load_stats(reconciled_stats);
}
}
// Wait for the background storage group merge to finish before releasing the state machine.
// Background merge holds the old erm, so a successful barrier joins with it.
// This guarantees that the background merge doesn't run concurrently with the next merge.
// Replica-side storage group merge takes compaction locks on the tablet's main compaction group, released
// by the background merge. If the next merge starts before the background merge finishes, it can cause a deadlock.
// The background merge fiber will try to stop a compaction group which is locked, and the lock is held
// by the background merge fiber.
tm = nullptr;
if (!guard) {
guard = co_await start_operation();
}
co_await global_tablet_token_metadata_barrier(std::move(guard));
}
future<> handle_truncate_table(group0_guard guard) {

View File

@@ -201,47 +201,95 @@ public:
virtual future<std::optional<entry_info>> next_entry() = 0;
};
// Promoted index information produced by the parser.
// A plain trivially-movable aggregate so it can be stored in LSA-managed vectors.
struct parsed_promoted_index_entry {
// Deletion time of the partition this index entry covers.
deletion_time del_time;
// Start offset of the promoted index data — presumably within the index file; TODO confirm.
uint64_t promoted_index_start;
// Size in bytes of the promoted index data.
uint32_t promoted_index_size;
// Number of promoted index blocks.
uint32_t num_blocks;
};
// Allocated inside LSA.
class promoted_index {
deletion_time _del_time;
uint64_t _promoted_index_start;
uint32_t _promoted_index_size;
uint32_t _num_blocks;
public:
promoted_index(const schema& s,
deletion_time del_time,
uint64_t promoted_index_start,
uint32_t promoted_index_size,
uint32_t num_blocks)
: _del_time{del_time}
, _promoted_index_start(promoted_index_start)
, _promoted_index_size(promoted_index_size)
, _num_blocks(num_blocks)
{ }
using promoted_index = parsed_promoted_index_entry;
[[nodiscard]] deletion_time get_deletion_time() const { return _del_time; }
[[nodiscard]] uint32_t get_promoted_index_size() const { return _promoted_index_size; }
// Call under allocating_section.
// For sstable versions >= mc the returned cursor will be of type `bsearch_clustered_cursor`.
std::unique_ptr<clustered_index_cursor> make_cursor(shared_sstable,
reader_permit,
tracing::trace_state_ptr,
file_input_stream_options,
use_caching);
};
// A partition index element.
// Allocated inside LSA.
struct [[gnu::packed]] index_entry {
mutable int64_t raw_token;
uint64_t data_file_offset;
uint32_t key_offset;
class index_entry {
private:
managed_bytes _key;
mutable std::optional<dht::token> _token;
uint64_t _position;
managed_ref<promoted_index> _index;
uint64_t position() const { return data_file_offset; }
dht::raw_token token() const { return dht::raw_token(raw_token); }
public:
key_view get_key() const {
return key_view{_key};
}
// May allocate so must be called under allocating_section.
decorated_key_view get_decorated_key(const schema& s) const {
if (!_token) {
_token.emplace(s.get_partitioner().get_token(get_key()));
}
return decorated_key_view(*_token, get_key());
}
uint64_t position() const { return _position; };
std::optional<deletion_time> get_deletion_time() const {
if (_index) {
return _index->get_deletion_time();
}
return {};
}
index_entry(managed_bytes&& key, uint64_t position, managed_ref<promoted_index>&& index)
: _key(std::move(key))
, _position(position)
, _index(std::move(index))
{}
index_entry(index_entry&&) = default;
index_entry& operator=(index_entry&&) = default;
// Can be nullptr
const managed_ref<promoted_index>& get_promoted_index() const { return _index; }
managed_ref<promoted_index>& get_promoted_index() { return _index; }
uint32_t get_promoted_index_size() const { return _index ? _index->get_promoted_index_size() : 0; }
size_t external_memory_usage() const {
return _key.external_memory_usage() + _index.external_memory_usage();
}
};
// Required for optimized LSA migration of storage of managed_vector.
static_assert(std::is_trivially_move_assignable_v<index_entry>);
static_assert(std::is_trivially_move_assignable_v<parsed_promoted_index_entry>);
// A partition index page.
//
// Allocated in the standard allocator space but with an LSA allocator as the current allocator.
// So the shallow part is in the standard allocator but all indirect objects are inside LSA.
class partition_index_page {
public:
lsa::chunked_managed_vector<index_entry> _entries;
managed_bytes _key_storage;
// Stores promoted index information of index entries.
// The i-th element corresponds to the i-th entry in _entries.
// Can be smaller than _entries. If _entries[i] doesn't have a matching element in _promoted_indexes then
// that entry doesn't have a promoted index.
// Kept separately to avoid paying for storage cost in pages where no entry has a promoted index,
// which is typical in workloads with small partitions.
lsa::chunked_managed_vector<promoted_index> _promoted_indexes;
lsa::chunked_managed_vector<managed_ref<index_entry>> _entries;
public:
partition_index_page() = default;
partition_index_page(partition_index_page&&) noexcept = default;
@@ -250,68 +298,15 @@ public:
bool empty() const { return _entries.empty(); }
size_t size() const { return _entries.size(); }
stop_iteration clear_gently() {
// Vectors have trivial storage, so are fast to destroy.
return stop_iteration::yes;
}
void clear_one_entry() {
_entries.pop_back();
}
bool has_promoted_index(size_t i) const {
return i < _promoted_indexes.size() && _promoted_indexes[i].promoted_index_size > 0;
}
/// Get promoted index for the i-th entry.
/// Call only when has_promoted_index(i) is true.
const promoted_index& get_promoted_index(size_t i) const {
return _promoted_indexes[i];
}
/// Get promoted index for the i-th entry.
/// Call only when has_promoted_index(i) is true.
promoted_index& get_promoted_index(size_t i) {
return _promoted_indexes[i];
}
/// Get promoted index size for the i-th entry.
uint32_t get_promoted_index_size(size_t i) const {
return has_promoted_index(i) ? get_promoted_index(i).promoted_index_size : 0;
}
/// Get deletion_time for partition represented by the i-th entry.
/// Returns disengaged optional if the entry doesn't have a promoted index, so we don't know the deletion_time.
/// It has to be read from the data file.
std::optional<deletion_time> get_deletion_time(size_t i) const {
if (has_promoted_index(i)) {
return get_promoted_index(i).del_time;
}
return {};
}
key_view get_key(size_t i) const {
auto start = _entries[i].key_offset;
auto end = i + 1 < _entries.size() ? _entries[i + 1].key_offset : _key_storage.size();
auto v = managed_bytes_view(_key_storage).prefix(end);
v.remove_prefix(start);
return key_view(v);
}
decorated_key_view get_decorated_key(const schema& s, size_t i) const {
auto key = get_key(i);
auto t = _entries[i].token();
if (!t) {
t = dht::raw_token(s.get_partitioner().get_token(key));
_entries[i].raw_token = t.value;
}
return decorated_key_view(dht::token(t), key);
}
size_t external_memory_usage() const {
size_t size = _entries.external_memory_usage();
size += _promoted_indexes.external_memory_usage();
size += _key_storage.external_memory_usage();
for (auto&& e : _entries) {
size += sizeof(index_entry) + e->external_memory_usage();
}
return size;
}
};

View File

@@ -25,6 +25,14 @@ namespace sstables {
extern seastar::logger sstlog;
extern thread_local mc::cached_promoted_index::metrics promoted_index_cache_metrics;
// Promoted index information produced by the parser.
// A plain trivially-movable aggregate so it can be stored in LSA-managed vectors.
struct parsed_promoted_index_entry {
// Deletion time of the partition this index entry covers.
deletion_time del_time;
// Start offset of the promoted index data — presumably within the index file; TODO confirm.
uint64_t promoted_index_start;
// Size in bytes of the promoted index data.
uint32_t promoted_index_size;
// Number of promoted index blocks.
uint32_t num_blocks;
};
// Partition index entry information produced by the parser.
struct parsed_partition_index_entry {
temporary_buffer<char> key;
@@ -45,10 +53,9 @@ class index_consumer {
schema_ptr _s;
logalloc::allocating_section _alloc_section;
logalloc::region& _region;
utils::chunked_vector<parsed_partition_index_entry> _parsed_entries;
size_t _max_promoted_index_entry_plus_one = 0; // Highest index +1 in _parsed_entries which has a promoted index.
size_t _key_storage_size = 0;
public:
index_list indexes;
index_consumer(logalloc::region& r, schema_ptr s)
: _s(s)
, _alloc_section(abstract_formatter([s] (fmt::format_context& ctx) {
@@ -57,63 +64,36 @@ public:
, _region(r)
{ }
void consume_entry(parsed_partition_index_entry&& e) {
_key_storage_size += e.key.size();
_parsed_entries.emplace_back(std::move(e));
if (e.promoted_index) {
_max_promoted_index_entry_plus_one = std::max(_max_promoted_index_entry_plus_one, _parsed_entries.size());
}
~index_consumer() {
with_allocator(_region.allocator(), [&] {
indexes._entries.clear_and_release();
});
}
future<index_list> finalize() {
index_list result;
// In case of exception, need to deallocate under region allocator.
auto delete_result = seastar::defer([&] {
void consume_entry(parsed_partition_index_entry&& e) {
_alloc_section(_region, [&] {
with_allocator(_region.allocator(), [&] {
result._entries = {};
result._promoted_indexes = {};
result._key_storage = {};
managed_ref<promoted_index> pi;
if (e.promoted_index) {
pi = make_managed<promoted_index>(*_s,
e.promoted_index->del_time,
e.promoted_index->promoted_index_start,
e.promoted_index->promoted_index_size,
e.promoted_index->num_blocks);
}
auto key = managed_bytes(reinterpret_cast<const bytes::value_type*>(e.key.get()), e.key.size());
indexes._entries.emplace_back(make_managed<index_entry>(std::move(key), e.data_file_offset, std::move(pi)));
});
});
auto i = _parsed_entries.begin();
size_t key_offset = 0;
while (i != _parsed_entries.end()) {
_alloc_section(_region, [&] {
with_allocator(_region.allocator(), [&] {
result._entries.reserve(_parsed_entries.size());
result._promoted_indexes.resize(_max_promoted_index_entry_plus_one);
if (result._key_storage.empty()) {
result._key_storage = managed_bytes(managed_bytes::initialized_later(), _key_storage_size);
}
managed_bytes_mutable_view key_out(result._key_storage);
key_out.remove_prefix(key_offset);
while (i != _parsed_entries.end()) {
parsed_partition_index_entry& e = *i;
if (e.promoted_index) {
result._promoted_indexes[result._entries.size()] = *e.promoted_index;
}
write_fragmented(key_out, std::string_view(e.key.begin(), e.key.size()));
result._entries.emplace_back(index_entry{dht::raw_token().value, e.data_file_offset, key_offset});
++i;
key_offset += e.key.size();
if (need_preempt()) {
break;
}
}
});
});
co_await coroutine::maybe_yield();
}
delete_result.cancel();
_parsed_entries.clear();
co_return std::move(result);
}
void prepare(uint64_t size) {
_max_promoted_index_entry_plus_one = 0;
_key_storage_size = 0;
_parsed_entries.clear();
_parsed_entries.reserve(size);
_alloc_section = logalloc::allocating_section();
_alloc_section(_region, [&] {
with_allocator(_region.allocator(), [&] {
indexes._entries.reserve(size);
});
});
}
};
@@ -218,14 +198,10 @@ public:
switch (_state) {
// START comes first, to make the handling of the 0-quantity case simpler
state_START:
case state::START:
sstlog.trace("{}: pos {} state {} - data.size()={}", fmt::ptr(this), current_pos(), state::START, data.size());
_state = state::KEY_SIZE;
if (data.size() == 0) {
break;
}
[[fallthrough]];
break;
case state::KEY_SIZE:
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::KEY_SIZE);
_entry_offset = current_pos();
@@ -251,16 +227,7 @@ public:
case state::PROMOTED_SIZE:
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::PROMOTED_SIZE);
_position = this->_u64;
if (is_mc_format() && data.size() && *data.begin() == 0) { // promoted_index_size == 0
data.trim_front(1);
_consumer.consume_entry(parsed_partition_index_entry{
.key = std::move(_key),
.data_file_offset = _position,
.index_offset = _entry_offset,
.promoted_index = std::nullopt
});
goto state_START;
} else if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
_state = state::PARTITION_HEADER_LENGTH_1;
break;
}
@@ -372,6 +339,33 @@ inline file make_tracked_index_file(sstable& sst, reader_permit permit, tracing:
return tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", sst.index_filename()));
}
inline
std::unique_ptr<clustered_index_cursor> promoted_index::make_cursor(shared_sstable sst,
reader_permit permit,
tracing::trace_state_ptr trace_state,
file_input_stream_options options,
use_caching caching)
{
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
seastar::shared_ptr<cached_file> cached_file_ptr = caching
? sst->_cached_index_file
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
sst->manager().get_cache_tracker().get_lru(),
sst->manager().get_cache_tracker().region(),
sst->_index_file_size);
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
_promoted_index_start, _promoted_index_size,
promoted_index_cache_metrics, permit,
sst->get_column_translation(), cached_file_ptr, _num_blocks, trace_state, sst->features());
}
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
auto promoted_index_stream = make_file_input_stream(std::move(file), _promoted_index_start, _promoted_index_size,options);
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
std::move(promoted_index_stream), _promoted_index_size, _num_blocks, std::nullopt);
}
// Less-comparator for lookups in the partition index.
class index_comparator {
dht::ring_position_comparator_for_sstables _tri_cmp;
@@ -382,16 +376,26 @@ public:
return _tri_cmp(e.get_decorated_key(), rp) < 0;
}
bool operator()(const index_entry& e, dht::ring_position_view rp) const {
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) < 0;
}
bool operator()(const managed_ref<index_entry>& e, dht::ring_position_view rp) const {
return operator()(*e, rp);
}
bool operator()(dht::ring_position_view rp, const managed_ref<index_entry>& e) const {
return operator()(rp, *e);
}
bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
return _tri_cmp(e.get_decorated_key(), rp) > 0;
}
};
inline
std::strong_ordering index_entry_tri_cmp(const schema& s, partition_index_page& page, size_t idx, dht::ring_position_view rp) {
dht::ring_position_comparator_for_sstables tri_cmp(s);
return tri_cmp(page.get_decorated_key(s, idx), rp);
}
bool operator()(dht::ring_position_view rp, const index_entry& e) const {
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) > 0;
}
};
// Contains information about index_reader position in the index file
struct index_bound {
@@ -533,7 +537,7 @@ private:
if (ex) {
return make_exception_future<index_list>(std::move(ex));
}
return bound.consumer->finalize();
return make_ready_future<index_list>(std::move(bound.consumer->indexes));
});
});
};
@@ -546,18 +550,17 @@ private:
if (bound.current_list->empty()) {
throw malformed_sstable_exception(format("missing index entry for summary index {} (bound {})", summary_idx, fmt::ptr(&bound)), _sstable->index_filename());
}
bound.data_file_position = bound.current_list->_entries[0].position();
bound.data_file_position = bound.current_list->_entries[0]->position();
bound.element = indexable_element::partition;
bound.end_open_marker.reset();
if (sstlog.is_enabled(seastar::log_level::trace)) {
sstlog.trace("index {} bound {}: page:", fmt::ptr(this), fmt::ptr(&bound));
logalloc::reclaim_lock rl(_region);
for (size_t i = 0; i < bound.current_list->_entries.size(); ++i) {
auto& e = bound.current_list->_entries[i];
for (auto&& e : bound.current_list->_entries) {
auto dk = dht::decorate_key(*_sstable->_schema,
bound.current_list->get_key(i).to_partition_key(*_sstable->_schema));
sstlog.trace(" {} -> {}", dk, e.position());
e->get_key().to_partition_key(*_sstable->_schema));
sstlog.trace(" {} -> {}", dk, e->position());
}
}
@@ -601,13 +604,7 @@ private:
// Valid if partition_data_ready(bound)
index_entry& current_partition_entry(index_bound& bound) {
parse_assert(bool(bound.current_list), _sstable->index_filename());
return bound.current_list->_entries[bound.current_index_idx];
}
// Valid if partition_data_ready(bound)
partition_index_page& current_page(index_bound& bound) {
parse_assert(bool(bound.current_list), _sstable->index_filename());
return *bound.current_list;
return *bound.current_list->_entries[bound.current_index_idx];
}
future<> advance_to_next_partition(index_bound& bound) {
@@ -620,7 +617,7 @@ private:
if (bound.current_index_idx + 1 < bound.current_list->size()) {
++bound.current_index_idx;
bound.current_pi_idx = 0;
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx].position();
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx]->position();
bound.element = indexable_element::partition;
bound.end_open_marker.reset();
return reset_clustered_cursor(bound);
@@ -683,13 +680,9 @@ private:
return advance_to_page(bound, summary_idx).then([this, &bound, pos, summary_idx] {
sstlog.trace("index {}: old page index = {}", fmt::ptr(this), bound.current_index_idx);
auto i = _alloc_section(_region, [&] {
auto& page = *bound.current_list;
auto& s = *_sstable->_schema;
auto r = std::views::iota(bound.current_index_idx, page._entries.size());
auto it = std::ranges::partition_point(r, [&] (int idx) {
return index_entry_tri_cmp(s, page, idx, pos) < 0;
});
return page._entries.begin() + bound.current_index_idx + std::ranges::distance(r.begin(), it);
auto& entries = bound.current_list->_entries;
return std::lower_bound(std::begin(entries) + bound.current_index_idx, std::end(entries), pos,
index_comparator(*_sstable->_schema));
});
// i is valid until next allocation point
auto& entries = bound.current_list->_entries;
@@ -704,7 +697,7 @@ private:
}
bound.current_index_idx = std::distance(std::begin(entries), i);
bound.current_pi_idx = 0;
bound.data_file_position = (*i).position();
bound.data_file_position = (*i)->position();
bound.element = indexable_element::partition;
bound.end_open_marker.reset();
sstlog.trace("index {}: new page index = {}, pos={}", fmt::ptr(this), bound.current_index_idx, bound.data_file_position);
@@ -807,34 +800,6 @@ public:
}
}
static
std::unique_ptr<clustered_index_cursor> make_cursor(const parsed_promoted_index_entry& pi,
shared_sstable sst,
reader_permit permit,
tracing::trace_state_ptr trace_state,
file_input_stream_options options,
use_caching caching)
{
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
seastar::shared_ptr<cached_file> cached_file_ptr = caching
? sst->_cached_index_file
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
sst->manager().get_cache_tracker().get_lru(),
sst->manager().get_cache_tracker().region(),
sst->_index_file_size);
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
pi.promoted_index_start, pi.promoted_index_size,
promoted_index_cache_metrics, permit,
sst->get_column_translation(), cached_file_ptr, pi.num_blocks, trace_state, sst->features());
}
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
auto promoted_index_stream = make_file_input_stream(std::move(file), pi.promoted_index_start, pi.promoted_index_size,options);
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
std::move(promoted_index_stream), pi.promoted_index_size, pi.num_blocks, std::nullopt);
}
// Ensures that partition_data_ready() returns true.
// Can be called only when !eof()
future<> read_partition_data() override {
@@ -870,10 +835,10 @@ public:
clustered_index_cursor* current_clustered_cursor(index_bound& bound) {
if (!bound.clustered_cursor) {
_alloc_section(_region, [&] {
partition_index_page& page = current_page(bound);
if (page.has_promoted_index(bound.current_index_idx)) {
promoted_index& pi = page.get_promoted_index(bound.current_index_idx);
bound.clustered_cursor = make_cursor(pi, _sstable, _permit, _trace_state,
index_entry& e = current_partition_entry(bound);
promoted_index* pi = e.get_promoted_index().get();
if (pi) {
bound.clustered_cursor = pi->make_cursor(_sstable, _permit, _trace_state,
get_file_input_stream_options(), _use_caching);
}
});
@@ -896,15 +861,15 @@ public:
// It may be unavailable for old sstables for which this information was not generated.
// Can be called only when partition_data_ready().
std::optional<sstables::deletion_time> partition_tombstone() override {
return current_page(_lower_bound).get_deletion_time(_lower_bound.current_index_idx);
return current_partition_entry(_lower_bound).get_deletion_time();
}
// Returns the key for current partition.
// Can be called only when partition_data_ready().
std::optional<partition_key> get_partition_key() override {
return _alloc_section(_region, [this] {
return current_page(_lower_bound).get_key(_lower_bound.current_index_idx)
.to_partition_key(*_sstable->_schema);
index_entry& e = current_partition_entry(_lower_bound);
return e.get_key().to_partition_key(*_sstable->_schema);
});
}
@@ -918,8 +883,8 @@ public:
// Returns the number of promoted index entries for the current partition.
// Can be called only when partition_data_ready().
uint64_t get_promoted_index_size() {
partition_index_page& page = current_page(_lower_bound);
return page.get_promoted_index_size(_lower_bound.current_index_idx);
index_entry& e = current_partition_entry(_lower_bound);
return e.get_promoted_index_size();
}
bool partition_data_ready() const override {
@@ -1010,9 +975,9 @@ public:
return make_ready_future<bool>(false);
}
return read_partition_data().then([this, key] {
index_comparator cmp(*_sstable->_schema);
bool found = _alloc_section(_region, [&] {
auto& page = current_page(_lower_bound);
return index_entry_tri_cmp(*_sstable->_schema, page, _lower_bound.current_index_idx, key) == 0;
return cmp(key, current_partition_entry(_lower_bound)) == 0;
});
return make_ready_future<bool>(found);
});

View File

@@ -257,11 +257,14 @@ public:
while (partial_page || i != _cache.end()) {
if (partial_page) {
auto preempted = with_allocator(_region.allocator(), [&] {
while (partial_page->clear_gently() != stop_iteration::yes) {
return true;
while (!partial_page->empty()) {
partial_page->clear_one_entry();
if (need_preempt()) {
return true;
}
}
partial_page.reset();
return need_preempt();
return false;
});
if (preempted) {
auto key = (i != _cache.end()) ? std::optional(i->key()) : std::nullopt;

View File

@@ -1094,6 +1094,7 @@ public:
friend class mc::writer;
friend class index_reader;
friend class promoted_index;
friend class sstables_manager;
template <typename DataConsumeRowsContext>
friend future<std::unique_ptr<DataConsumeRowsContext>>

View File

@@ -436,10 +436,7 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
stream_options.buffer_size = file_stream_buffer_size;
stream_options.read_ahead = file_stream_read_ahead;
for (auto&& source_info : sources) {
// Keep stream_blob_info alive only at duration of streaming. Allowing the file descriptor
// of the sstable component to be released right after it has been streamed.
auto info = std::exchange(source_info, {});
for (auto& info : sources) {
auto& filename = info.filename;
std::optional<input_stream<char>> fstream;
bool fstream_closed = false;
@@ -620,7 +617,6 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
ops_id, filename, targets, total_size, get_bw(total_size, start_time));
}
}
co_await utils::get_local_injector().inject("tablet_stream_files_end_wait", utils::wait_for_message(std::chrono::seconds(60)));
if (error) {
blogger.warn("fstream[{}] Master failed sending files_nr={} files={} targets={} send_size={} bw={} error={}",
ops_id, sources.size(), sources, targets, ops_total_size, get_bw(ops_total_size, ops_start_time), error);
@@ -684,20 +680,15 @@ future<stream_files_response> tablet_stream_files_handler(replica::database& db,
if (files.empty()) {
co_return resp;
}
auto sstable_nr = sstables.size();
// Release reference to sstables to be streamed here. Since one sstable is streamed at a time,
// a sstable - that has been compacted - can have its space released from disk right after
// that sstable's content has been fully streamed.
sstables.clear();
blogger.debug("stream_sstables[{}] Started sending sstable_nr={} files_nr={} files={} range={}",
req.ops_id, sstable_nr, files.size(), files, req.range);
req.ops_id, sstables.size(), files.size(), files, req.range);
auto ops_start_time = std::chrono::steady_clock::now();
auto files_nr = files.size();
size_t stream_bytes = co_await tablet_stream_files(ms, std::move(files), req.targets, req.table, req.ops_id, req.topo_guard);
resp.stream_bytes = stream_bytes;
auto duration = std::chrono::steady_clock::now() - ops_start_time;
blogger.info("stream_sstables[{}] Finished sending sstable_nr={} files_nr={} range={} stream_bytes={} stream_time={} stream_bw={}",
req.ops_id, sstable_nr, files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
req.ops_id, sstables.size(), files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
co_return resp;
}

View File

@@ -75,7 +75,7 @@ future<bool> table_helper::try_prepare(bool fallback, cql3::query_processor& qp,
auto& stmt = fallback ? _insert_cql_fallback.value() : _insert_cql;
try {
shared_ptr<cql_transport::messages::result_message::prepared> msg_ptr = co_await qp.prepare(stmt, qs.get_client_state(), dialect);
_prepared_stmt = msg_ptr->get_prepared();
_prepared_stmt = std::move(msg_ptr->get_prepared());
shared_ptr<cql3::cql_statement> cql_stmt = _prepared_stmt->statement;
_insert_stmt = dynamic_pointer_cast<cql3::statements::modification_statement>(cql_stmt);
_is_fallback_stmt = fallback;

View File

@@ -400,7 +400,7 @@ task_manager::virtual_task::impl::impl(module_ptr module) noexcept
: _module(std::move(module))
{}
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr) {
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive) {
auto ms = module->get_task_manager()._messaging;
if (!ms) {
auto ids = co_await module->get_task_manager().get_virtual_task_children(parent_id);
@@ -417,18 +417,19 @@ future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::g
tmlogger.info("tasks_vt_get_children: waiting");
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{10});
});
co_return co_await map_reduce(nodes, [ms, parent_id] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
return resp | std::views::transform([host_id] (auto id) {
return task_identity{
.host_id = host_id,
.task_id = id
};
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
}).handle_exception_type([host_id, parent_id] (const rpc::closed_error& ex) {
tmlogger.warn("Failed to get children of virtual task with id={} from node {}: {}", parent_id, host_id, ex);
return utils::chunked_vector<task_identity>{};
});
co_return co_await map_reduce(nodes, [ms, parent_id, is_host_alive = std::move(is_host_alive)] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
if (is_host_alive(host_id)) {
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
return resp | std::views::transform([host_id] (auto id) {
return task_identity{
.host_id = host_id,
.task_id = id
};
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
});
} else {
return make_ready_future<utils::chunked_vector<task_identity>>();
}
}, utils::chunked_vector<task_identity>{}, [] (auto a, auto&& b) {
std::move(b.begin(), b.end(), std::back_inserter(a));
return a;

View File

@@ -19,7 +19,6 @@
#include "db_clock.hh"
#include "utils/log.hh"
#include "locator/host_id.hh"
#include "locator/token_metadata_fwd.hh"
#include "schema/schema_fwd.hh"
#include "tasks/types.hh"
#include "utils/chunked_vector.hh"
@@ -283,7 +282,7 @@ public:
impl& operator=(impl&&) = delete;
virtual ~impl() = default;
protected:
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr);
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive);
public:
virtual task_group get_group() const noexcept = 0;
// Returns std::nullopt if an operation with task_id isn't tracked by this virtual_task.

View File

@@ -62,11 +62,7 @@ SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) {
// cfg.db_config->index_cache_fraction.set(1.0);
return do_with_cql_env_thread([] (cql_test_env& e) {
// We disable compactions because they cause confusing cache mispopulations.
// We disable compression because the sstable writer targets a specific
// (*compressed* data file size : summary file size) ratio,
// so the number of keys per index page becomes hard to control,
// and might be arbitrarily large.
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
auto insert_query = e.prepare("INSERT INTO ks.t(pk) VALUES (?)").get();
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ?").get();
@@ -158,11 +154,7 @@ SEASTAR_TEST_CASE(test_index_is_cached_in_big_partition_workload) {
// cfg.db_config->index_cache_fraction.set(0.0);
return do_with_cql_env_thread([] (cql_test_env& e) {
// We disable compactions because they cause confusing cache mispopulations.
// We disable compression because the sstable writer targets a specific
// (*compressed* data file size : summary file size) ratio,
// so the number of keys per index page becomes hard to control,
// and might be arbitrarily large.
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
auto insert_query = e.prepare("INSERT INTO ks.t(pk, ck, v) VALUES (?, ?, ?)").get();
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ? AND ck = ?").get();

View File

@@ -1111,30 +1111,6 @@ SEASTAR_TEST_CASE(test_snapshot_ctl_true_snapshots_size) {
});
}
SEASTAR_TEST_CASE(test_snapshot_ctl_details_exception_handling) {
#ifndef SCYLLA_ENABLE_ERROR_INJECTION
testlog.debug("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev).\n");
return make_ready_future();
#endif
return do_with_some_data_in_thread({"cf"}, [] (cql_test_env& e) {
sharded<db::snapshot_ctl> sc;
sc.start(std::ref(e.db()), std::ref(e.get_task_manager()), std::ref(e.get_sstorage_manager()), db::snapshot_ctl::config{}).get();
auto stop_sc = deferred_stop(sc);
auto& cf = e.local_db().find_column_family("ks", "cf");
take_snapshot(e).get();
utils::get_local_injector().enable("get_snapshot_details", true);
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
utils::get_local_injector().enable("per-snapshot-get_snapshot_details", true);
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
auto details = cf.get_snapshot_details().get();
BOOST_REQUIRE_EQUAL(details.size(), 1);
});
}
// toppartitions_query caused a lw_shared_ptr to cross shards when moving results, #5104
SEASTAR_TEST_CASE(toppartitions_cross_shard_schema_ptr) {
return do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
@@ -1881,7 +1857,7 @@ SEASTAR_THREAD_TEST_CASE(test_tombstone_gc_state_snapshot) {
schema_builder::register_schema_initializer([] (schema_builder& builder) {
if (builder.ks_name() == "test" && builder.cf_name() == "table_gc_mode_group0") {
builder.set_is_group0_table();
builder.set_is_group0_table(true);
}
});
auto table_gc_mode_group0 = schema_builder("test", "table_gc_mode_group0")

View File

@@ -252,7 +252,7 @@ SEASTAR_TEST_CASE(test_group0_batch) {
// (group0 mutations are not allowed on non-group0 tables)
schema_builder::register_schema_initializer([](schema_builder& builder) {
if (builder.cf_name() == "test_group0_batch") {
builder.set_is_group0_table();
builder.set_is_group0_table(true);
}
});
@@ -345,29 +345,4 @@ SEASTAR_TEST_CASE(test_group0_batch) {
});
}
SEASTAR_TEST_CASE(test_group0_tables_use_schema_commitlog) {
return do_with_cql_env([] (cql_test_env& e) {
schema_builder::register_schema_initializer([](schema_builder& builder) {
if (builder.cf_name() == "test_group0_tables_use_schema_commitlog1") {
builder.set_is_group0_table();
}
});
auto test_group0_tables_use_schema_commitlog1 = schema_builder("test", "test_group0_tables_use_schema_commitlog1")
.with_column("pk", utf8_type, column_kind::partition_key)
.build();
auto test_group0_tables_use_schema_commitlog2 = schema_builder("test", "test_group0_tables_use_schema_commitlog2")
.with_column("pk", utf8_type, column_kind::partition_key)
.build();
BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().is_group0_table);
BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().use_schema_commitlog);
BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().is_group0_table);
BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().use_schema_commitlog);
return make_ready_future();
});
}
BOOST_AUTO_TEST_SUITE_END()

View File

@@ -1499,7 +1499,7 @@ SEASTAR_THREAD_TEST_CASE(tablets_simple_rack_aware_view_pairing_test) {
base_host,
base_erm,
view_erm,
true, // uses NTS
*ars_ptr,
base_token,
view_token,
use_tablets,

View File

@@ -719,7 +719,7 @@ SEASTAR_THREAD_TEST_CASE(test_dht_subtract_ranges) {
auto get_random_ranges = [&] (size_t max_count) {
auto count = tests::random::get_int<size_t>(1, max_count);
utils::chunked_vector<dht::partition_range> ranges;
dht::partition_range_vector ranges;
ranges.reserve(count);
for (size_t i = 0; i < count; i++) {

View File

@@ -20,24 +20,16 @@ static void add_entry(logalloc::region& r,
const schema& s,
partition_index_page& page,
const partition_key& key,
uint64_t position,
std::optional<parsed_promoted_index_entry> promoted_index = std::nullopt)
uint64_t position)
{
logalloc::allocating_section as;
as(r, [&] {
with_allocator(r.allocator(), [&] {
sstables::key sst_key = sstables::key::from_partition_key(s, key);
auto key_offset = page._key_storage.size();
auto old_storage = std::move(page._key_storage);
page._key_storage = managed_bytes(managed_bytes::initialized_later(), key_offset + sst_key.get_bytes().size());
auto out = managed_bytes_mutable_view(page._key_storage);
write_fragmented(out, managed_bytes_view(old_storage));
write_fragmented(out, single_fragmented_view(bytes_view(sst_key)));
page._entries.push_back(index_entry{dht::raw_token_opt()->value, position, key_offset});
if (promoted_index) {
page._promoted_indexes.resize(page._entries.size());
page._promoted_indexes[page._entries.size() - 1] = *promoted_index;
}
page._entries.push_back(make_managed<index_entry>(
managed_bytes(sst_key.get_bytes()),
position,
managed_ref<promoted_index>()));
});
});
}
@@ -62,10 +54,10 @@ static partition_index_page make_page0(logalloc::region& r, simple_schema& s) {
static void has_page0(partition_index_cache::entry_ptr ptr) {
BOOST_REQUIRE(!ptr->empty());
BOOST_REQUIRE_EQUAL(ptr->_entries.size(), 4);
BOOST_REQUIRE_EQUAL(ptr->_entries[0].position(), 0);
BOOST_REQUIRE_EQUAL(ptr->_entries[1].position(), 1);
BOOST_REQUIRE_EQUAL(ptr->_entries[2].position(), 2);
BOOST_REQUIRE_EQUAL(ptr->_entries[3].position(), 3);
BOOST_REQUIRE_EQUAL(ptr->_entries[0]->position(), 0);
BOOST_REQUIRE_EQUAL(ptr->_entries[1]->position(), 1);
BOOST_REQUIRE_EQUAL(ptr->_entries[2]->position(), 2);
BOOST_REQUIRE_EQUAL(ptr->_entries[3]->position(), 3);
};
SEASTAR_THREAD_TEST_CASE(test_caching) {
@@ -147,59 +139,6 @@ SEASTAR_THREAD_TEST_CASE(test_caching) {
}
}
SEASTAR_THREAD_TEST_CASE(test_sparse_promoted_index) {
::lru lru;
simple_schema s;
logalloc::region r;
partition_index_cache_stats stats;
partition_index_cache cache(lru, r, stats);
auto page0_loader = [&] (partition_index_cache::key_type k) -> future<partition_index_page> {
partition_index_page page;
auto destroy_page = defer([&] {
with_allocator(r.allocator(), [&] {
auto p = std::move(page);
});
});
add_entry(r, *s.schema(), page, s.make_pkey(0).key(), 0);
add_entry(r, *s.schema(), page, s.make_pkey(1).key(), 1, parsed_promoted_index_entry{
.promoted_index_start = 1,
.promoted_index_size = 10,
.num_blocks = 3
});
add_entry(r, *s.schema(), page, s.make_pkey(2).key(), 2);
add_entry(r, *s.schema(), page, s.make_pkey(3).key(), 3, parsed_promoted_index_entry{
.promoted_index_start = 2,
.promoted_index_size = 13,
.num_blocks = 1
});
add_entry(r, *s.schema(), page, s.make_pkey(4).key(), 4);
destroy_page.cancel();
co_return std::move(page);
};
auto page = cache.get_or_load(0, page0_loader).get();
BOOST_REQUIRE_EQUAL(page->has_promoted_index(0), false);
BOOST_REQUIRE_EQUAL(page->has_promoted_index(1), true);
BOOST_REQUIRE_EQUAL(page->has_promoted_index(2), false);
BOOST_REQUIRE_EQUAL(page->has_promoted_index(3), true);
BOOST_REQUIRE_EQUAL(page->has_promoted_index(4), false);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_start, 1);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_size, 10);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).num_blocks, 3);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_start, 2);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_size, 13);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).num_blocks, 1);
with_allocator(r.allocator(), [&] {
lru.evict_all();
});
}
template <typename T>
static future<> ignore_result(future<T>&& f) {
return f.then_wrapped([] (auto&& f) {

View File

@@ -1607,29 +1607,6 @@ future<> apply_resize_plan(token_metadata& tm, const migration_plan& plan) {
}
}
static
future<group0_guard> save_token_metadata(cql_test_env& e, group0_guard guard) {
auto& stm = e.local_db().get_shared_token_metadata();
auto tm = stm.get();
e.get_topology_state_machine().local()._topology.version = tm->get_version();
co_await save_tablet_metadata(e.local_db(), tm->tablets(), guard.write_timestamp());
utils::chunked_vector<frozen_mutation> muts;
muts.push_back(freeze(topology_mutation_builder(guard.write_timestamp())
.set_version(tm->get_version())
.build().to_mutation(db::system_keyspace::topology())));
co_await e.local_db().apply(muts, db::no_timeout);
co_await e.get_storage_service().local().update_tablet_metadata({});
// Need a new guard to make sure later changes use later timestamp.
// Also, so that the table layer processes the changes we persisted, which is important for splits.
// Before we can finalize a split, the storage group needs to process the split by creating split-ready compaction groups.
release_guard(std::move(guard));
abort_source as;
co_return co_await e.get_raft_group0_client().start_operation(as);
}
static
future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migration_plan& plan, shared_load_stats* load_stats) {
auto& talloc = e.get_tablet_allocator().local();
@@ -1649,14 +1626,19 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
co_await stm.mutate_token_metadata([table_id, &new_tmap, &changed] (token_metadata& tm) {
changed = true;
tm.tablets().set_tablet_map(table_id, std::move(new_tmap));
tm.set_version(tm.get_version() + 1);
return make_ready_future<>();
});
}
if (changed) {
// Need to reload on each resize because table object expects tablet count to change by a factor of 2.
guard = co_await save_token_metadata(e, std::move(guard));
co_await save_tablet_metadata(e.local_db(), stm.get()->tablets(), guard.write_timestamp());
co_await e.get_storage_service().local().update_tablet_metadata({});
// Need a new guard to make sure later changes use later timestamp.
release_guard(std::move(guard));
abort_source as;
guard = co_await e.get_raft_group0_client().start_operation(as);
if (load_stats) {
auto new_tm = stm.get();
@@ -1665,11 +1647,6 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
load_stats->stats = *reconciled_stats;
}
}
testlog.debug("Calling local_topology_barrier()");
old_tm = nullptr;
co_await e.get_storage_service().local().local_topology_barrier();
testlog.debug("Finished local_topology_barrier()");
}
}
@@ -1773,22 +1750,13 @@ void do_rebalance_tablets(cql_test_env& e,
}).get();
if (auto_split && load_stats) {
bool reload = false;
auto& tm = *stm.get();
for (const auto& [table, tmap]: tm.tablets().all_tables_ungrouped()) {
if (std::holds_alternative<resize_decision::split>(tmap->resize_decision().way)) {
if (load_stats->stats.tables[table].split_ready_seq_number != tmap->resize_decision().sequence_number) {
testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
reload = true;
}
testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
}
}
// Need to order split-ack before split finalization, storage_group assumes that.
if (reload) {
guard = save_token_metadata(e, std::move(guard)).get();
}
}
handle_resize_finalize(e, guard, plan, load_stats).get();

View File

@@ -331,28 +331,4 @@ SEASTAR_THREAD_TEST_CASE(test_stale_version_notification) {
std::cerr.rdbuf(oldCerr);
BOOST_TEST(my_stream.str().find("topology version 0 held for") != std::string::npos);
}
SEASTAR_THREAD_TEST_CASE(test_raw_token) {
    // Exercises dht::raw_token construction, truthiness and ordering,
    // both against other raw_token values and against dht::token.
    const auto tok_one = dht::token::from_int64(1);
    const auto tok_two = dht::token::from_int64(2);

    // An empty raw_token_opt is falsy until a value is assigned.
    dht::raw_token_opt maybe_raw;
    BOOST_REQUIRE(!maybe_raw);
    maybe_raw = dht::raw_token(tok_one);
    BOOST_REQUIRE(*maybe_raw == tok_one);

    // A default-constructed raw_token equals the minimum token and sorts
    // before every other kind of token.
    BOOST_REQUIRE(dht::raw_token() == dht::minimum_token());
    BOOST_REQUIRE(dht::raw_token() < dht::raw_token(dht::first_token()));
    BOOST_REQUIRE(dht::raw_token() < dht::first_token());
    BOOST_REQUIRE(dht::raw_token() < dht::maximum_token());

    // A raw_token wrapping a real token is truthy and compares consistently
    // with the token it wraps.
    const auto raw_one = dht::raw_token(tok_one);
    BOOST_REQUIRE(bool(raw_one));
    BOOST_REQUIRE(raw_one > dht::raw_token());
    BOOST_REQUIRE(raw_one > dht::minimum_token());
    BOOST_REQUIRE_EQUAL(raw_one, tok_one);
    BOOST_REQUIRE(raw_one == tok_one);
    BOOST_REQUIRE(raw_one < tok_two);
    BOOST_REQUIRE(raw_one < dht::maximum_token());
}
}

View File

@@ -3221,87 +3221,6 @@ SEASTAR_TEST_CASE(test_view_update_generating_writetime) {
});
}
// Usually if only an unselected column in the base table is modified, we expect an optimization that a view
// update is not done, but we had a bug (https://scylladb.atlassian.net/browse/SCYLLADB-808) where the existence
// of a collection selected in the view caused us to skip this optimization, even when it was not modified.
// This test reproduces this bug.
SEASTAR_TEST_CASE(test_view_update_unmodified_collection) {
    // In this test we verify that we correctly skip (or not) view updates to a view that selects
    // a collection column. We use two MVs, similarly to the test above.
    return do_with_cql_env_thread([] (cql_test_env& e) {
        auto f1 = e.local_view_builder().wait_until_built("ks", "mv1");
        auto f2 = e.local_view_builder().wait_until_built("ks", "mv2");
        e.execute_cql("CREATE TABLE t (k int, c int, a int, b list<int>, g int, primary key(k, c))").get();
        e.execute_cql("CREATE MATERIALIZED VIEW mv1 AS SELECT k,c,a,b FROM t "
                      "WHERE k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, k)").get();
        e.execute_cql("CREATE MATERIALIZED VIEW mv2 AS SELECT k,c,a,b FROM t "
                      "WHERE k IS NOT NULL AND c IS NOT NULL AND a IS NOT NULL PRIMARY KEY (c, k, a)").get();
        f1.get();
        f2.get();
        // Number of view updates pushed by the base table "t", summed over all shards.
        auto total_t_view_updates = [&] {
            return e.db().map_reduce0([] (replica::database& local_db) {
                const db::view::stats& local_stats = local_db.find_column_family("ks", "t").get_view_stats();
                return local_stats.view_updates_pushed_local + local_stats.view_updates_pushed_remote;
            }, 0, std::plus<int64_t>()).get();
        };
        // Number of writes applied to mv1, summed over all shards.
        auto total_mv1_updates = [&] {
            return e.db().map_reduce0([] (replica::database& local_db) {
                return local_db.find_column_family("ks", "mv1").get_stats().writes.hist.count;
            }, 0, std::plus<int64_t>()).get();
        };
        // Number of writes applied to mv2, summed over all shards.
        auto total_mv2_updates = [&] {
            return e.db().map_reduce0([] (replica::database& local_db) {
                return local_db.find_column_family("ks", "mv2").get_stats().writes.hist.count;
            }, 0, std::plus<int64_t>()).get();
        };
        // Initial insert: both views get a new row (mv2 because 'a' is set),
        // so the base table pushes 2 view updates.
        e.execute_cql("INSERT INTO t (k, c, a) VALUES (1, 1, 1)").get();
        eventually([&] {
            const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
            const update_counter expected{1, 1, 2};
            BOOST_REQUIRE_EQUAL(results, expected);
        });
        // We update an unselected column and the collection remains NULL, so we should generate an
        // update to the virtual column in mv1 but not to mv2.
        e.execute_cql("UPDATE t SET g=1 WHERE k=1 AND c=1;").get();
        eventually([&] {
            const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
            const update_counter expected{2, 1, 3};
            BOOST_REQUIRE_EQUAL(results, expected);
        });
        // We update the collection with an initial value
        e.execute_cql("UPDATE t SET b=[1] WHERE k=1 AND c=1;").get();
        eventually([&] {
            const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
            const update_counter expected{3, 2, 5};
            BOOST_REQUIRE_EQUAL(results, expected);
        });
        // We update an unselected column again with a non-NULL selected collection. Because the liveness of the updated column is unchanged
        // and no other selected column is updated (in particular, the collection column), we should generate no view updates.
        e.execute_cql("UPDATE t SET g=2 WHERE k=1 AND c=1;").get();
        eventually([&] {
            const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
            const update_counter expected{3, 2, 5};
            BOOST_REQUIRE_EQUAL(results, expected);
        });
    });
}
SEASTAR_TEST_CASE(test_conflicting_batch) {
return do_with_cql_env_thread([] (cql_test_env& e) {

View File

@@ -114,6 +114,27 @@ async def test_service_levels_upgrade(request, manager: ManagerClient, build_mod
result_with_sl_v2 = await cql.run_async(f"SELECT service_level FROM system.service_levels_v2")
assert set([sl.service_level for sl in result_with_sl_v2]) == set(sls + [DRIVER_SL_NAME] + [sl_v2])
@pytest.mark.asyncio
async def test_service_levels_upgrade_with_empty_legacy_table(manager: ManagerClient):
    """Upgrade to raft topology with an empty legacy
    system_distributed.service_levels table and verify that the migration
    still marks service_level_version as "2".
    """
    config = {**auth_config, "force_gossip_topology_changes": True, "tablets_mode_for_new_keyspaces": "disabled"}
    # First node boots in gossip-topology mode; the rest join normally.
    servers = [await manager.server_add(config=config)]
    del config["force_gossip_topology_changes"]
    for _ in range(2):
        servers.append(await manager.server_add(config=config))

    cql = manager.get_cql()
    assert cql
    hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

    # Sanity check: the legacy service levels table must be empty.
    legacy_rows = await cql.run_async("SELECT service_level FROM system_distributed.service_levels")
    assert list(legacy_rows) == []

    await manager.api.upgrade_to_raft_topology(hosts[0].address)
    await asyncio.gather(*(wait_until_topology_upgrade_finishes(manager, h.address, time.time() + 60) for h in hosts))

    # Even with nothing to migrate, the version marker must be written.
    version_rows = await cql.run_async("SELECT value FROM system.scylla_local WHERE key = 'service_level_version'")
    assert version_rows[0].value == "2"
@pytest.mark.asyncio
async def test_service_levels_work_during_recovery(manager: ManagerClient):
# FIXME: move this test to the Raft-based recovery procedure or remove it if unneeded.

View File

@@ -254,3 +254,27 @@ async def test_node_ops_task_wait(manager: ManagerClient):
await decommission_task
await waiting_task
@pytest.mark.asyncio
async def test_get_children(manager: ManagerClient):
    """Query a cluster task's status while a child node operation
    (decommission) completes concurrently under an injection pause."""
    module_name = "node_ops"
    task_client = TaskManagerClient(manager.api)
    servers = [await manager.server_add(cmdline=cmdline) for _ in range(2)]

    # Pause the tasks virtual-table get_children path on the first node.
    injection = "tasks_vt_get_children"
    injection_handler = await inject_error_one_shot(manager.api, servers[0].ip_addr, injection)

    log = await manager.server_open_log(servers[0].server_id)
    mark = await log.mark()

    # Pick the cluster-wide task whose status (and children) we will query.
    cluster_tasks = [task for task in await task_client.list_tasks(servers[0].ip_addr, module_name) if task.kind == "cluster"]
    bootstrap_task = cluster_tasks[0]

    async def _decommission():
        # Once the status query is parked on the injection, run a
        # decommission and then release the injection.
        await log.wait_for('tasks_vt_get_children: waiting', from_mark=mark)
        await manager.decommission_node(servers[1].server_id)
        await injection_handler.message()

    async def _get_status():
        await task_client.get_task_status(servers[0].ip_addr, bootstrap_task.task_id)

    await asyncio.gather(_decommission(), _get_status())

View File

@@ -12,11 +12,9 @@ import pytest
from test.pylib.internal_types import ServerInfo
from test.pylib.manager_client import ManagerClient
from test.pylib.repair import create_table_insert_data_for_repair, get_tablet_task_id
from test.pylib.rest_client import read_barrier
from test.pylib.tablets import get_all_tablet_replicas
from test.cluster.conftest import skip_mode
from test.cluster.util import create_new_test_keyspace, new_test_keyspace, get_topology_coordinator, find_server_by_host_id
from test.cluster.test_incremental_repair import trigger_tablet_merge
from test.cluster.util import create_new_test_keyspace, new_test_keyspace
from test.cluster.test_tablets2 import inject_error_on
from test.cluster.tasks.task_manager_client import TaskManagerClient
from test.cluster.tasks.task_manager_types import TaskStatus, TaskStats
@@ -153,45 +151,6 @@ async def test_tablet_repair_task_list(manager: ManagerClient):
await asyncio.gather(run_repair(0, "test"), run_repair(1, "test2"), run_repair(2, "test3"), check_repair_task_list(tm, servers, module_name, ks))
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_tablet_repair_wait(manager: ManagerClient):
    """Verify that waiting on a tablet repair task completes even when the
    tablets of the repaired table are merged while the wait is in flight.

    The repair is paused via repair_tablet_repair_task_impl_run, the waiter
    is parked on tablet_virtual_task_wait, and then the repair is resumed
    and a tablet merge is triggered before releasing the waiter.
    """
    module_name = "tablets"
    tm = TaskManagerClient(manager.api)
    stop_repair_injection = "repair_tablet_repair_task_impl_run"
    servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager)
    assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered"

    # Pause the repair task right after it starts running, then kick off a
    # non-blocking repair and grab its task handle.
    await inject_error_on(manager, stop_repair_injection, servers)
    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", "all", await_completion=False)
    repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", keyspace=ks)
    task = repair_tasks[0]

    log = await manager.server_open_log(servers[0].server_id)
    mark = await log.mark()

    async def wait_for_task():
        # Park the virtual-task wait on an injection, then block until the
        # repair task reaches a final state.
        await enable_injection(manager, servers, "tablet_virtual_task_wait")
        # NOTE: the return value was previously bound to an unused local
        # (status_wait); we only care that the wait completes.
        await tm.wait_for_task(servers[0].ip_addr, task.task_id)

    async def merge_tablets():
        await log.wait_for('tablet_virtual_task: wait until tablet operation is finished', from_mark=mark)
        # Resume repair.
        await message_injection(manager, servers, stop_repair_injection)
        # Merge tablets.
        coord = await find_server_by_host_id(manager, servers, await get_topology_coordinator(manager))
        log2 = await manager.server_open_log(coord.server_id)
        await trigger_tablet_merge(manager, servers, [log2])
        await read_barrier(manager.api, servers[0].ip_addr)
        await message_injection(manager, servers, "tablet_virtual_task_wait")

    await asyncio.gather(wait_for_task(), merge_tablets())
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_tablet_repair_task_children(manager: ManagerClient):

View File

@@ -1,70 +0,0 @@
#
# Copyright (C) 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import logging
import asyncio
import time
import pytest
from test.cluster.util import get_current_group0_config
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import read_barrier
from test.pylib.util import wait_for
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_bootstrap_with_quick_group0_join(manager: ManagerClient):
    """Regression test for https://scylladb.atlassian.net/browse/SCYLLADB-959.
    The bug was that when the bootstrapping node joined group0 before reaching
    post_server_start, it skipped post_server_start and thus hung forever.
    The test simulates the scenario by starting the second node with the
    join_group0_pause_before_config_check injection. Without the fix, the
    startup times out.
    """
    logger.info("Adding first server")
    s1 = await manager.server_add()
    logger.info("Adding second server with join_group0_pause_before_config_check enabled")
    # Create the node without starting it so we can attach to its log before
    # the injection fires.
    s2 = await manager.server_add(start=False, config={
        'error_injections_at_startup': ['join_group0_pause_before_config_check']
    })
    logger.info(f"Starting {s2}")
    # Start s2 in the background; the injection pauses it mid-join.
    start_task = asyncio.create_task(manager.server_start(s2.server_id))
    s2_log = await manager.server_open_log(s2.server_id)
    await s2_log.wait_for("join_group0_pause_before_config_check: waiting for message", timeout=60)
    s1_host_id = await manager.get_host_id(s1.server_id)
    s2_host_id = await manager.get_host_id(s2.server_id)

    async def s2_in_group0_config_on_s1():
        # Returns True once s1's view of the group0 config contains s2;
        # returns None otherwise so wait_for() keeps polling.
        config = await get_current_group0_config(manager, s1)
        ids = {m[0] for m in config}
        assert s1_host_id in ids  # sanity check
        return True if s2_host_id in ids else None

    # Note: we would like to wait for s2 to see itself in the group0 config, but we can't execute
    # get_current_group0_config for s2, as s2 doesn't handle CQL requests at this point. As a workaround, we wait for s1
    # to see s2 and then perform a read barrier on s2.
    logger.info(f"Waiting for {s1} to see {s2} in the group0 config")
    await wait_for(s2_in_group0_config_on_s1, deadline=time.time() + 60, period=0.1)
    logger.info(f"Performing read barrier on {s2} to make sure it sees itself in the group0 config")
    await read_barrier(manager.api, s2.ip_addr)
    logger.info(f"Unblocking {s2}")
    await manager.api.message_injection(s2.ip_addr, 'join_group0_pause_before_config_check')
    logger.info(f"Waiting for {s2} to complete bootstrap")
    await asyncio.wait_for(start_task, timeout=60)

View File

@@ -433,8 +433,7 @@ async def test_non_existant_table_master_key(manager: ManagerClient, tmpdir):
async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
cfg = {"authenticator": "org.apache.cassandra.auth.PasswordAuthenticator",
"authorizer": "org.apache.cassandra.auth.CassandraAuthorizer",
"commitlog_sync": "batch" }
"authorizer": "org.apache.cassandra.auth.CassandraAuthorizer"}
servers: list[ServerInfo] = await manager.servers_add(servers_num = 1, config=cfg,
driver_connect_opts={'auth_provider': PlainTextAuthProvider(username='cassandra', password='cassandra')})
@@ -451,14 +450,11 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
file_paths = [f for f in file_paths if os.path.isfile(f) and not os.path.islink(f)]
for file_path in file_paths:
try:
with open(file_path, 'rb') as f:
data = f.read()
if pbytes in data:
pattern_found_counter += 1
logger.debug("Pattern '%s' found in %s", pattern, file_path)
except FileNotFoundError:
pass # assume just compacted away
with open(file_path, 'rb') as f:
data = f.read()
if pbytes in data:
pattern_found_counter += 1
logger.debug("Pattern '%s' found in %s", pattern, file_path)
if expect:
assert pattern_found_counter > 0
@@ -466,15 +462,15 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
assert pattern_found_counter == 0
async def verify_system_info(expect: bool):
user = f"user_{str(uuid.uuid4())}".replace('-','_')
user = f"user_{str(uuid.uuid4())}"
pwd = f"pwd_{str(uuid.uuid4())}"
cql.execute(f"CREATE USER {user} WITH PASSWORD '{pwd}' NOSUPERUSER")
assert_one(cql, f"LIST ROLES of {user}", [user, False, True, {}])
logger.debug("Verify PART 1: check commitlogs -------------")
await grep_database_files(pwd, "commitlog", "**/*.log", False)
await grep_database_files(user, "commitlog", "**/*.log", expect)
grep_database_files(pwd, "commitlog", "**/*.log", expect)
grep_database_files(user, "commitlog", "**/*.log", True)
salted_hash = None
system_auth = None
@@ -491,38 +487,39 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
assert salted_hash is not None
assert system_auth is not None
await grep_database_files(salted_hash, "commitlog", "**/*.log", expect)
grep_database_files(salted_hash, "commitlog", "**/*.log", expect)
rand_comment = f"comment_{str(uuid.uuid4())}"
async with await create_ks(manager) as ks:
async with new_test_table(manager, ks, "key text PRIMARY KEY, c1 text, c2 text") as table:
async with await new_test_table(cql, ks, "key text PRIMARY KEY, c1 text, c2 text") as table:
cql.execute(f"ALTER TABLE {table} WITH comment = '{rand_comment}'")
await grep_database_files(rand_comment, "commitlog/schema", "**/*.log", expect)
# Note: original test did greping in sstables. This does no longer work
# since all system tables are compressed, and thus binary greping will
# not work. We could do scylla sstable dump-data and grep in the json,
# but this is somewhat pointless as this would, if it handles it, just
# decrypt the info from the sstable, thus we can't really verify anything.
# We could maybe check that the expected system tables are in fact encrypted,
# though this is more a promise than guarantee... Also, the only tables
# encrypted are paxos and batchlog -> pointless
grep_database_files(rand_comment, "commitlog/schema", "**/*.log", expect)
nodetool.flush_all(cql)
await verify_system_info(True) # not encrypted
logger.debug("Verify PART 2: check sstable files -------------\n`system_info_encryption` won't encrypt sstable files on disk")
logger.debug("GREP_DB_FILES: Check PM key user in sstable file ....")
grep_database_files(user, f"data/{system_auth}/", "**/*-Data.db", expect=True)
logger.debug("GREP_DB_FILES: Check original password in commitlogs .... Original password should never be saved")
grep_database_files(pwd, f"data/{system_auth}/", "**/*-Data.db", expect=False)
logger.debug("GREP_DB_FILES: Check salted_hash of password in sstable file ....")
grep_database_files(salted_hash, f"data/{system_auth}/", "**/*-Data.db", expect=False)
logger.debug("GREP_DB_FILES: Check table comment in sstable file ....")
grep_database_files(rand_comment, "data/system_schema/", "**/*-Data.db", expect=True)
verify_system_info(True) # not encrypted
cfg = {"system_info_encryption": {
"enabled": True,
"key_provider": "LocalFileSystemKeyProviderFactory"},
"system_key_directory": os.path.join(tmpdir, "resources/system_keys")
"key_provider": "LocalFileSystemKeyProviderFactory"}
}
for server in servers:
await manager.server_update_config(server.server_id, config_options=cfg)
await manager.server_restart(server.server_id)
manager.server_update_config(server.server_id, config_options=cfg)
await manager.rolling_restart(servers)
await verify_system_info(False) # should not see stuff now
verify_system_info(False) # should not see stuff now
async def test_system_encryption_reboot(manager: ManagerClient, tmpdir):

View File

@@ -609,19 +609,14 @@ async def do_test_tablet_incremental_repair_merge_error(manager, error):
scylla_path = get_scylla_path(cql)
coord = await get_topology_coordinator(manager)
coord_serv = await find_server_by_host_id(manager, servers, coord)
coord_log = await manager.server_open_log(coord_serv.server_id)
# Trigger merge and error in merge
mark = await coord_log.mark()
await inject_error_on(manager, error, [coord_serv])
s1_mark = await logs[0].mark()
await inject_error_on(manager, error, servers[:1])
await inject_error_on(manager, "tablet_force_tablet_count_decrease", servers)
await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
await coord_log.wait_for(f'Got {error}', from_mark=mark)
await logs[0].wait_for(f'Got {error}', from_mark=s1_mark)
await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
await manager.server_stop(coord_serv.server_id)
await manager.server_start(coord_serv.server_id)
await manager.server_stop(servers[0].server_id)
await manager.server_start(servers[0].server_id)
for server in servers:
await manager.server_stop_gracefully(server.server_id)
@@ -867,6 +862,50 @@ async def test_repair_sigsegv_with_diff_shard_count(manager: ManagerClient, use_
logger.info("Starting vnode repair")
await manager.api.repair(servers[1].ip_addr, ks, "test")
# Reproducer for https://github.com/scylladb/scylladb/issues/27365
# Incremental repair vs tablet merge
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_tablet_incremental_repair_tablet_merge_compaction_group_gone(manager: ManagerClient):
    """Race an incremental repair's sstable preparation against a tablet
    merge that removes the compaction group, to catch a use-after-free.

    The merge fiber is paused, repair preparation is started and paused,
    the merge is then allowed to remove the compaction group, and finally
    the repair is resumed; it must still finish cleanly.
    """
    cmdline = ['--logger-log-level', 'repair=debug']
    servers, cql, hosts, ks, table_id, logs, _, _, _, _ = await preapre_cluster_for_incremental_repair(manager, cmdline=cmdline)
    coord = await get_topology_coordinator(manager)
    coord_serv = await find_server_by_host_id(manager, servers, coord)
    coord_log = await manager.server_open_log(coord_serv.server_id)

    # Trigger merge and wait until the merge fiber starts
    s1_mark = await coord_log.mark()
    await inject_error_on(manager, "merge_completion_fiber", servers)
    await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
    await inject_error_on(manager, "tablet_force_tablet_count_decrease", servers)
    await coord_log.wait_for('Detected tablet merge for table', from_mark=s1_mark)
    await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
    await coord_log.wait_for('merge_completion_fiber: waiting for message', from_mark=s1_mark)

    # Trigger repair and wait for the incremental repair preparation to start
    s1_mark = await coord_log.mark()
    await inject_error_on(manager, "wait_after_prepare_sstables_for_incremental_repair", servers)
    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token=-1, await_completion=False, incremental_mode='incremental')
    # Wait for preparation to start.
    await coord_log.wait_for('Disabling compaction for range', from_mark=s1_mark)
    # Without the serialization, sleep to increase chances of preparation finishing before merge fiber.
    # With the serialization, preparation will wait for merge fiber to finish.
    await asyncio.sleep(0.1)

    # Continue to execute the merge fiber so that the compaction group is removed
    await inject_error_on(manager, "replica_merge_completion_wait", servers)
    for s in servers:
        await manager.api.message_injection(s.ip_addr, "merge_completion_fiber")
    await coord_log.wait_for('Merge completion fiber finished', from_mark=s1_mark)

    # Continue the repair to trigger use-after-free
    for s in servers:
        await manager.api.message_injection(s.ip_addr, "wait_after_prepare_sstables_for_incremental_repair")
    await coord_log.wait_for('Finished tablet repair', from_mark=s1_mark)
# Reproducer for https://github.com/scylladb/scylladb/issues/27365
# Incremental repair vs table drop
@pytest.mark.asyncio

View File

@@ -162,12 +162,7 @@ async def do_test_internode_compression_between_datacenters(manager: ManagerClie
await asyncio.gather(*[manager.server_stop(s.server_id) for s,_ in servers])
await asyncio.gather(*[p.stop() for p in proxies])
# these will all except, because we just stopped them above
for coro in proxy_futs:
try:
await coro
except:
pass
async def test_internode_compression_compress_packets_between_nodes(request, manager: ManagerClient) -> None:
def check_expected(msg_size, node1_proxy, node2_proxy, node3_proxy):

View File

@@ -1,65 +0,0 @@
#
# Copyright (C) 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import asyncio
import pytest
from test.cluster.util import new_test_keyspace, new_test_table
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import inject_error_one_shot
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
async def test_prepare_fails_if_cached_statement_is_invalidated_mid_prepare(manager: ManagerClient):
server = await manager.server_add()
cql = manager.get_cql()
log = await manager.server_open_log(server.server_id)
async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};") as ks:
async with new_test_table(manager, ks, "pk int PRIMARY KEY") as table:
query = f"SELECT * FROM {table} WHERE pk = ?"
loop = asyncio.get_running_loop()
await cql.run_async(f"INSERT INTO {table} (pk) VALUES (7)")
await cql.run_async(f"INSERT INTO {table} (pk) VALUES (8)")
handler = await inject_error_one_shot(manager.api, server.ip_addr, "query_processor_prepare_wait_after_cache_get")
mark = await log.mark()
prepare_future = loop.run_in_executor(None, lambda: cql.prepare(query))
await log.wait_for("query_processor_prepare_wait_after_cache_get: waiting for message", from_mark=mark, timeout=60)
# Trigger table schema update (metadata-only) to invalidate prepared statements while PREPARE is paused.
await cql.run_async(f"ALTER TABLE {table} WITH comment = 'invalidate-prepared-race'")
await handler.message()
done, _ = await asyncio.wait({prepare_future}, timeout=15)
if not done:
pytest.fail("Timed out waiting for PREPARE to complete after signaling injection")
result = done.pop().result()
print(f"PREPARE succeeded as expected: {result!r}")
rows = cql.execute(result, [7])
row = rows.one()
assert row is not None and row.pk == 7
# Invalidate prepared statements again, then execute the same prepared object.
# The driver should transparently re-prepare and re-request execution.
await cql.run_async(f"ALTER TABLE {table} WITH comment = 'invalidate-prepared-race-again'")
reprepare_handler = await inject_error_one_shot(manager.api, server.ip_addr, "query_processor_prepare_wait_after_cache_get")
reprepare_mark = await log.mark()
execute_future = loop.run_in_executor(None, lambda: cql.execute(result, [8]))
await log.wait_for("query_processor_prepare_wait_after_cache_get: waiting for message", from_mark=reprepare_mark, timeout=60)
await reprepare_handler.message()
execute_done, _ = await asyncio.wait({execute_future}, timeout=15)
if not execute_done:
pytest.fail("Timed out waiting for driver execute to finish after re-prepare signaling")
retried_rows = execute_done.pop().result()
retried_row = retried_rows.one()
assert retried_row is not None and retried_row.pk == 8

View File

@@ -16,10 +16,8 @@ import pytest
import socket
import ssl
import struct
import time
from test.pylib.manager_client import ManagerClient
from test.pylib.util import wait_for
logger = logging.getLogger(__name__)
@@ -271,28 +269,6 @@ async def send_cql_with_proxy_header_tls(
sock.close()
async def wait_for_results(cql, query: str, expected_count: int, timeout: float = 30.0, filter_fn=None):
    """
    Polls `query` until at least `expected_count` rows satisfy `filter_fn` (all rows if no filter is given).
    On timeout, logs the full result set from the last poll to aid debugging.
    """
    last_rows: list = []

    async def poll_once():
        # Re-run the query and remember the full result set for diagnostics.
        nonlocal last_rows
        last_rows = list(await cql.run_async(query))
        if filter_fn is None:
            matching = last_rows
        else:
            matching = filter_fn(last_rows)
        return matching if len(matching) >= expected_count else None

    try:
        return await wait_for(poll_once, time.time() + timeout, period=0.1)
    except Exception:
        # Dump the rows from the last poll so a timeout is debuggable.
        logger.error('Timed out waiting for %d matching rows in system.clients. Last poll returned %d total rows:\n%s',
                     expected_count, len(last_rows), '\n'.join(str(r) for r in last_rows))
        raise
# Shared server configuration for all tests
# We configure explicit SSL ports to keep the standard ports unencrypted
# so the Python driver can connect without TLS.
@@ -392,12 +368,9 @@ async def test_proxy_protocol_shard_aware(proxy_server):
await do_cql_handshake(reader, writer)
# Now query system.clients to verify shard assignments
rows = await wait_for_results(
cql,
'SELECT address, port, shard_id FROM system.clients',
expected_count=num_shards,
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
)
rows = list(cql.execute(
f"SELECT address, port, shard_id FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
))
# Build a map of port -> shard_id from the results
port_to_shard = {row.port: row.shard_id for row in rows}
@@ -473,12 +446,9 @@ async def test_proxy_protocol_port_preserved_in_system_clients(proxy_server):
# Now query system.clients using the driver to see our connection
cql = manager.get_cql()
rows = await wait_for_results(
cql,
'SELECT address, port FROM system.clients',
expected_count=1,
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
)
rows = list(cql.execute(
f"SELECT address, port FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
))
# We should find our connection with the fake source address and port
assert len(rows) > 0, f"Expected to find connection from {fake_src_addr} in system.clients"
@@ -599,12 +569,9 @@ async def test_proxy_protocol_ssl_shard_aware(proxy_server):
ssl_sock.recv(4096)
# Now query system.clients to verify shard assignments
rows = await wait_for_results(
cql,
'SELECT address, port, shard_id, ssl_enabled FROM system.clients',
expected_count=num_shards,
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
)
rows = list(cql.execute(
f"SELECT address, port, shard_id, ssl_enabled FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
))
# Build a map of port -> (shard_id, ssl_enabled) from the results
port_to_info = {row.port: (row.shard_id, row.ssl_enabled) for row in rows}
@@ -689,12 +656,9 @@ async def test_proxy_protocol_ssl_port_preserved(proxy_server):
# Now query system.clients using the driver to see our connection
cql = manager.get_cql()
rows = await wait_for_results(
cql,
'SELECT address, port, ssl_enabled FROM system.clients',
expected_count=1,
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
)
rows = list(cql.execute(
f"SELECT address, port, ssl_enabled FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
))
# We should find our connection
assert len(rows) > 0, f"Expected to find connection from {fake_src_addr} in system.clients"

View File

@@ -7,7 +7,6 @@ import logging
import pytest
import asyncio
from test.pylib.internal_types import ServerNum
from test.pylib.manager_client import ManagerClient
from test.cluster.conftest import skip_mode
from test.pylib.rest_client import inject_error_one_shot, InjectionHandler, read_barrier
@@ -21,20 +20,6 @@ def fixture_raft_op_timeout(build_mode):
return 10000 if build_mode == 'debug' else 1000
async def update_group0_raft_op_timeout(server_id: ServerNum, manager: ManagerClient, timeout: int) -> None:
    """Set group0_raft_op_timeout_in_ms on a server and, when it is running,
    wait until the updated configuration has actually been re-read."""
    logger.info(f"Updating group0_raft_op_timeout_in_ms on server {server_id} to {timeout}")
    running_ids = {srv.server_id for srv in await manager.running_servers()}
    if server_id not in running_ids:
        # A stopped node will pick the value up on its next start.
        await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
        return
    # If the node is alive, server_update_config only sends the SIGHUP signal to the Scylla process, so awaiting it
    # doesn't guarantee that the new config file is active. Work around this by looking at the logs.
    log_file = await manager.server_open_log(server_id)
    mark = await log_file.mark()
    await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
    await log_file.wait_for("completed re-reading configuration file", from_mark=mark, timeout=60)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
@pytest.mark.skip_mode(mode='debug', reason='aarch64/debug is unpredictably slow', platform_key='aarch64')
@@ -57,6 +42,7 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
config = {
'direct_failure_detector_ping_timeout_in_ms': 300,
'group0_raft_op_timeout_in_ms': raft_op_timeout,
'error_injections_at_startup': [
{
'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -78,10 +64,6 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
manager.server_stop_gracefully(servers[3].server_id),
manager.server_stop_gracefully(servers[4].server_id))
# Do it here to prevent unexpected timeouts before quorum loss.
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout)
for srv in servers[:2]))
logger.info("starting a sixth node with no quorum")
await manager.server_add(expected_error="raft operation \\[read_barrier\\] timed out, there is no raft quorum",
timeout=60)
@@ -94,6 +76,7 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
@pytest.mark.skip_mode(mode='debug', reason='aarch64/debug is unpredictably slow', platform_key='aarch64')
async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_timeout: int) -> None:
config = {
'group0_raft_op_timeout_in_ms': raft_op_timeout,
'error_injections_at_startup': [
{
'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -124,9 +107,6 @@ async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_time
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
manager.server_stop_gracefully(servers[2].server_id))
# Do it here to prevent unexpected timeouts before quorum loss.
await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
logger.info("release join-node-before-add-entry injection")
await injection_handler.message()
@@ -146,6 +126,7 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
logger.info("adding a fourth node")
servers += [await manager.server_add(config={
'group0_raft_op_timeout_in_ms': raft_op_timeout,
'error_injections_at_startup': [
{
'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -172,9 +153,6 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
manager.server_stop_gracefully(servers[2].server_id))
# Do it here to prevent unexpected timeouts before quorum loss.
await update_group0_raft_op_timeout(servers[3].server_id, manager, raft_op_timeout)
logger.info("release join-node-response_handler-before-read-barrier injection")
injection_handler = InjectionHandler(manager.api,
'join-node-response_handler-before-read-barrier',
@@ -191,6 +169,7 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: int) -> None:
logger.info("starting a first node (the leader)")
servers = [await manager.server_add(config={
'group0_raft_op_timeout_in_ms': raft_op_timeout,
'error_injections_at_startup': [
{
'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -210,9 +189,6 @@ async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: in
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
manager.server_stop_gracefully(servers[2].server_id))
# Do it here to prevent unexpected timeouts before quorum loss.
await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
logger.info("attempting removenode for the second node")
await manager.remove_node(servers[0].server_id, servers[1].server_id,
expected_error="raft operation [read_barrier] timed out, there is no raft quorum",
@@ -256,7 +232,9 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None
await asyncio.gather(*(manager.server_stop(srv.server_id) for srv in servers))
# This ensures the read barriers below fail quickly without group 0 quorum.
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout) for srv in servers))
logger.info(f"Decreasing group0_raft_op_timeout_in_ms on {servers}")
await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', raft_op_timeout)
for srv in servers))
logger.info(f"Restarting {servers[:2]} with no group 0 quorum")
for idx, srv in enumerate(servers[:2]):
@@ -268,7 +246,8 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None
# Increase the timeout back to 300s to ensure the new group 0 leader is elected before the first read barrier below
# times out.
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, 300000) for srv in servers))
await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', 300000)
for srv in servers))
logger.info(f"Restarting {servers[2:]} with group 0 quorum")
for srv in servers[2:]:

View File

@@ -978,7 +978,7 @@ async def test_tablets_merge_waits_for_lwt(manager: ManagerClient):
await wait_for_tablet_count(manager, s0, ks, 'test', lambda c: c == 1, 1, timeout_s=15)
logger.info("Ensure the guard decided to retain the erm")
m, _ = await log0.wait_for("tablet_metadata_guard::check: retain the erm and abort the guard",
await log0.wait_for("tablet_metadata_guard::check: retain the erm and abort the guard",
from_mark=m, timeout=10)
tablets = await get_all_tablet_replicas(manager, s0, ks, 'test')
@@ -986,11 +986,7 @@ async def test_tablets_merge_waits_for_lwt(manager: ManagerClient):
tablet = tablets[0]
assert tablet.replicas == [(s0_host_id, 0)]
# Since merge now waits for erms before releasing the state machine,
# the migration initiated below will not start until paxos released the erm.
# The barrier which is blocked is the one in merge finalization.
# I keep the tablet movement as a guard against regressions in case the behavior changes.
m = await log0.mark()
migration_task = asyncio.create_task(manager.api.move_tablet(s0.ip_addr, ks, "test",
s0_host_id, 0,
s0_host_id, 1,

View File

@@ -441,6 +441,84 @@ async def test_tablet_split_merge_with_many_tables(build_mode: str, manager: Man
await check_logs("after merge completion")
# Reproduces use-after-free when migration right after merge, but concurrently to background
# merge completion handler.
# See: https://github.com/scylladb/scylladb/issues/24045
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_migration_running_concurrently_to_merge_completion_handling(manager: ManagerClient):
cmdline = []
# Size based balancing can attempt to migrate the merged tablet as soon as the merge is complete
# because of a lower transient effective_capacity on the node with the merged tablet.
# This migration will timeout on cleanup because the compaction group still has an active task,
# which is held by the merge_completion_fiber injection, so the tablet's compaction group gate
# can not be closed, resulting in cleanup getting stuck. We force capacity based balancing to
# avoid this problem.
cfg = {'force_capacity_based_balancing': True}
servers = [await manager.server_add(cmdline=cmdline, config=cfg)]
await manager.disable_tablet_balancing()
cql = manager.get_cql()
# Start from 2 tablets so a single merge cycle brings the table down to 1.
async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") as ks:
await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);")
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
assert tablet_count == 2
old_tablet_count = tablet_count
keys = range(100)
await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys])
# Request the merge (2 -> 1 tablets).
await cql.run_async(f"ALTER KEYSPACE {ks} WITH tablets = {{'initial': 1}};")
s0_log = await manager.server_open_log(servers[0].server_id)
s0_mark = await s0_log.mark()
# Pause the background merge completion handling so a migration can race with it.
await manager.api.enable_injection(servers[0].ip_addr, "merge_completion_fiber", one_shot=True)
await manager.api.enable_injection(servers[0].ip_addr, "replica_merge_completion_wait", one_shot=True)
await manager.enable_tablet_balancing()
# Second node is the migration target.
servers.append(await manager.server_add(cmdline=cmdline, config=cfg))
s1_host_id = await manager.get_host_id(servers[1].server_id)
async def finished_merging():
# wait_for() helper: truthy when the tablet count dropped, None to keep polling.
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
return tablet_count < old_tablet_count or None
await wait_for(finished_merging, time.time() + 120)
await manager.disable_tablet_balancing()
await manager.api.enable_injection(servers[0].ip_addr, "take_storage_snapshot", one_shot=True)
await s0_log.wait_for(f"merge_completion_fiber: waiting", from_mark=s0_mark)
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
assert tablet_count == 1
tablet_token = 0 # Doesn't matter since there is one tablet
replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token)
s0_host_id = await manager.get_host_id(servers[0].server_id)
src_shard = replica[1]
dst_shard = src_shard
# Kick off migration of the merged tablet while the completion fiber is still parked.
migration = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], src_shard, s1_host_id, dst_shard, tablet_token))
await s0_log.wait_for(f"take_storage_snapshot: waiting", from_mark=s0_mark)
# Release the completion fiber so it runs concurrently with the in-flight migration.
await manager.api.message_injection(servers[0].ip_addr, "merge_completion_fiber")
await s0_log.wait_for(f"Merge completion fiber finished", from_mark=s0_mark)
await manager.api.message_injection(servers[0].ip_addr, "take_storage_snapshot")
await migration
# All rows written before the merge must survive merge + migration.
rows = await cql.run_async(f"SELECT * FROM {ks}.test;")
assert len(rows) == len(keys)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_missing_data(manager: ManagerClient):
@@ -577,77 +655,3 @@ async def test_merge_with_drop(manager: ManagerClient):
await asyncio.sleep(0.1)
await manager.api.message_injection(server.ip_addr, "compaction_group_stop_wait")
await drop_table_fut
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_background_merge_deadlock(manager: ManagerClient):
"""
Reproducer for https://scylladb.atlassian.net/browse/SCYLLADB-928
Reproduces a deadlock in the background merge completion handler that can happen when multiple merges accumulate.
If we accumulate more than 1 merge cycle for the fiber, deadlock occurs due to compaction lock taken
on the main group (post-merge). The lock is held until compaction groups are processed by the background merge
fiber.
Example:
Initial state:
cg0: main,
cg1: main
cg2: main
cg3: main
After 1st merge:
cg0': main [locked], merging_groups=[cg0.main, cg1.main]
cg1': main [locked], merging_groups=[cg2.main, cg3.main]
After 2nd merge:
cg0'': main [locked], merging_groups=[cg0'.main [locked], cg0.main, cg1.main, cg1'.main [locked], cg2.main, cg3.main]
The test reproduces this by doing a tablet merge from 8 tablets to 1 (8 -> 4 -> 2 -> 1). The background merge fiber
is blocked until after the first merge (to 4), so that there is a higher chance of two merges queueing in the fiber.
If deadlock occurs, node shutdown will hang waiting for the background merge fiber. That's why the test
tries to stop the node at the end.
"""
cmdline = [
'--logger-log-level', 'load_balancer=debug',
'--logger-log-level', 'raft_topology=debug',
]
servers = [await manager.server_add(cmdline=cmdline)]
cql, _ = await manager.get_ready_cql(servers)
ks = await create_new_test_keyspace(cql, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}")
# Create a table which will go through 3 merge cycles.
await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) with tablets = {{'min_tablet_count': 8}};")
# Park the background merge completion fiber so merges can queue up behind it.
await manager.api.enable_injection(servers[0].ip_addr, "merge_completion_fiber", one_shot=True)
log = await manager.server_open_log(servers[0].server_id)
mark = await log.mark()
# Trigger tablet merging
await cql.run_async(f"ALTER TABLE {ks}.test WITH tablets = {{'min_tablet_count': 1}};")
async def produced_one_merge():
# wait_for() helper: truthy once the first merge cycle (8 -> 4) completed.
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
return tablet_count == 4 or None
await wait_for(produced_one_merge, time.time() + 120)
mark, _ = await log.wait_for(f"merge_completion_fiber: waiting", from_mark=mark)
# Release the fiber only after the first merge, maximizing the chance that the
# remaining merge cycles queue together behind it.
await manager.api.message_injection(servers[0].ip_addr, "merge_completion_fiber")
mark, _ = await log.wait_for(f"merge_completion_fiber: message received", from_mark=mark)
async def finished_merge():
# wait_for() helper: truthy once all merges finished (single tablet left).
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
return tablet_count == 1 or None
await wait_for(finished_merge, time.time() + 120)
# If the deadlock reproduced, this shutdown hangs on the background merge fiber.
await manager.server_stop(servers[0].server_id)

View File

@@ -94,8 +94,6 @@ async def test_remove_garbage_group0_members(manager: ManagerClient):
logging.info(f'stop {servers[1]}')
await manager.server_stop_gracefully(servers[1].server_id)
await wait_for_token_ring_and_group0_consistency(manager, time.time() + 60)
logging.info(f'removenode {servers[1]} using {servers[2]}')
await manager.remove_node(servers[2].server_id, servers[1].server_id)

View File

@@ -559,9 +559,6 @@ private:
cfg->ring_delay_ms.set(500);
cfg->shutdown_announce_in_ms.set(0);
cfg->broadcast_to_all_shards().get();
smp::invoke_on_all([&] {
sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
}).get();
create_directories((data_dir_path + "/system").c_str());
create_directories(cfg->commitlog_directory().c_str());
create_directories(cfg->schema_commitlog_directory().c_str());

View File

@@ -449,68 +449,3 @@ def test_repair_incremenatal_repair(nodetool, mode):
Starting repair with task_id={id1} keyspace=ks table=table1
Repair with task_id={id1} finished
"""
def test_cluster_repair_table_dropped(nodetool):
# Whole-keyspace repair: when a table was dropped mid-operation (server answers
# 400 "Can't find a column family"), the repair skips that table, continues with
# the remaining ones, and only reports the tables it actually repaired.
id1 = "ef1b7a61-66c8-494c-bb03-6f65724e6eee"
res = nodetool("cluster", "repair", "ks", expected_requests=[
expected_request("GET", "/storage_service/keyspaces", response=["ks"]),
expected_request("GET", "/storage_service/keyspaces", params={"replication": "tablets"}, response=["ks"]),
expected_request("GET", "/column_family", response=[{"ks": "ks", "cf": "table1"}, {"ks": "ks", "cf": "table2"}]),
expected_request(
"POST",
"/storage_service/tablets/repair",
params={
"ks": "ks",
"table": "table1",
"tokens": "all"},
response={"message": "Can't find a column family table1 in keyspace ks", "code": 400}, response_status=400),
expected_request(
"POST",
"/storage_service/tablets/repair",
params={
"ks": "ks",
"table": "table2",
"tokens": "all"},
response={"tablet_task_id": id1}),
expected_request(
"GET",
f"/task_manager/wait_task/{id1}",
response={"state": "done"}),
])
# Only table2 shows up in the output; the dropped table1 is silently skipped.
assert _remove_log_timestamp(res.stdout) == f"""\
Starting repair with task_id={id1} keyspace=ks table=table2
Repair with task_id={id1} finished
"""
def test_cluster_repair_specified_table_dropped(nodetool):
# Unlike the whole-keyspace case, when the user names the tables explicitly a
# dropped table (400 "Can't find a column family") makes the command fail with
# an error, after the remaining requested tables are still attempted.
id1 = "ef1b7a61-66c8-494c-bb03-6f65724e6eee"
check_nodetool_fails_with_error_contains(
nodetool,
("cluster", "repair", "ks", "table1", "table2"),
{"expected_requests": [
expected_request("GET", "/storage_service/keyspaces", response=["ks"]),
expected_request("GET", "/storage_service/keyspaces", params={"replication": "tablets"}, response=["ks"]),
expected_request(
"POST",
"/storage_service/tablets/repair",
params={
"ks": "ks",
"table": "table1",
"tokens": "all"},
response={"message": "Can't find a column family table1 in keyspace ks", "code": 400}, response_status=400),
expected_request(
"POST",
"/storage_service/tablets/repair",
params={
"ks": "ks",
"table": "table2",
"tokens": "all"},
response={"tablet_task_id": id1}),
expected_request(
"GET",
f"/task_manager/wait_task/{id1}",
response={"state": "done"}),
]
},
[f"Can't find a column family table1 in keyspace ks"])

View File

@@ -10,7 +10,6 @@
#include <memory>
#include <signal.h>
#include <seastar/core/future.hh>
#include <seastar/core/sleep.hh>
#include <seastar/core/thread.hh>
#include <seastar/core/app-template.hh>
#include <seastar/http/client.hh>
@@ -79,23 +78,6 @@ static future<> make_request(http::experimental::client& cli, sstring operation,
});
}
static void wait_for_alternator(const test_config& c) {
for (int attempt = 0; attempt < 3000; ++attempt) {
try {
auto cli = get_client(c);
auto close = defer([&] { cli.close().get(); });
make_request(cli, "ListTables", "{}").get();
return;
} catch (...) {
}
seastar::sleep(std::chrono::milliseconds(100)).get();
if (attempt >= 100 && attempt % 10 == 0) {
std::cout << fmt::format("Retrying connect to alternator port (attempt {})", attempt + 1) << std::endl;
}
}
throw std::runtime_error("Timed out waiting for alternator port to become ready");
}
static void delete_alternator_table(http::experimental::client& cli) {
try {
make_request(cli, "DeleteTable", R"({"TableName": "workloads_test"})").get();
@@ -391,8 +373,6 @@ auto make_client_pool(const test_config& c) {
void workload_main(const test_config& c) {
std::cout << "Running test with config: " << c << std::endl;
wait_for_alternator(c);
auto cli = get_client(c);
auto finally = defer([&] {
delete_alternator_table(cli);

View File

@@ -330,13 +330,10 @@ int scylla_simple_query_main(int argc, char** argv) {
("counters", "test counters")
("tablets", "use tablets")
("initial-tablets", bpo::value<unsigned>()->default_value(128), "initial number of tablets")
("sstable-summary-ratio", bpo::value<double>(), "Generate summary entry, so that summary file size / data file size ~= this ratio")
("sstable-format", bpo::value<std::string>(), "SSTable format name to use")
("flush", "flush memtables before test")
("memtable-partitions", bpo::value<unsigned>(), "apply this number of partitions to memtable, then flush")
("json-result", bpo::value<std::string>(), "name of the json result file")
("enable-cache", bpo::value<bool>()->default_value(true), "enable row cache")
("enable-index-cache", bpo::value<bool>()->default_value(true), "enable partition index cache")
("stop-on-error", bpo::value<bool>()->default_value(true), "stop after encountering the first error")
("timeout", bpo::value<std::string>()->default_value(""), "use timeout")
("bypass-cache", "use bypass cache when querying")
@@ -360,19 +357,8 @@ int scylla_simple_query_main(int argc, char** argv) {
auto db_cfg = ::make_shared<db::config>(ext);
const auto enable_cache = app.configuration()["enable-cache"].as<bool>();
const auto enable_index_cache = app.configuration()["enable-index-cache"].as<bool>();
std::cout << "enable-cache=" << enable_cache << '\n';
std::cout << "enable-index-cache=" << enable_index_cache << '\n';
db_cfg->enable_cache(enable_cache);
db_cfg->cache_index_pages(enable_index_cache);
if (app.configuration().contains("sstable-summary-ratio")) {
db_cfg->sstable_summary_ratio(app.configuration()["sstable-summary-ratio"].as<double>());
}
std::cout << "sstable-summary-ratio=" << db_cfg->sstable_summary_ratio() << '\n';
if (app.configuration().contains("sstable-format")) {
db_cfg->sstable_format(app.configuration()["sstable-format"].as<std::string>());
}
std::cout << "sstable-format=" << db_cfg->sstable_format() << '\n';
cql_test_config cfg(db_cfg);
if (app.configuration().contains("tablets")) {
cfg.db_config->tablets_mode_for_new_keyspaces.set(db::tablets_mode_t::mode::enabled);

View File

@@ -15,8 +15,9 @@ from cassandra.cluster import ConsistencyLevel
from cassandra.query import SimpleStatement
from typing import Callable
from test.cluster.util import get_topology_coordinator, find_server_by_host_id, new_test_keyspace, new_test_table, reconnect_driver
from test.pylib.manager_client import ManagerClient, wait_for_cql_and_get_hosts
from test.cluster.conftest import skip_mode
from test.cluster.util import get_topology_coordinator, find_server_by_host_id, new_test_keyspace, new_test_table
from test.pylib.manager_client import ManagerClient
from test.pylib.tablets import get_tablet_count
from test.pylib.util import Host
from test.storage.conftest import space_limited_servers
@@ -80,7 +81,6 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca
logger.info("Create a big file on the target node to reach critical disk utilization level")
disk_info = psutil.disk_usage(workdir)
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("database - Set critical disk utilization mode: true", from_mark=mark)
@@ -91,9 +91,8 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca
logger.info("Restart the node")
mark = await log.mark()
await manager.server_restart(servers[0].server_id)
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
cql = await reconnect_driver(manager)
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
await manager.driver_connect()
cql = manager.get_cql()
for _ in range(2):
mark, _ = await log.wait_for("database - Set critical disk utilization mode: true", from_mark=mark)
@@ -105,7 +104,6 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca
await validate_data_existence(cql, hosts[1:], [hosts[0]], cf, 1)
logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("database - Set critical disk utilization mode: false", from_mark=mark)
@@ -114,7 +112,7 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca
@pytest.mark.asyncio
async def test_autotoggle_compaction(manager: ManagerClient, volumes_factory: Callable) -> None:
async def test_autotoogle_compaction(manager: ManagerClient, volumes_factory: Callable) -> None:
cmdline = [*global_cmdline,
"--logger-log-level", "compaction=debug"]
async with space_limited_servers(manager, volumes_factory, ["100M"]*3, cmdline=cmdline) as servers:
@@ -138,20 +136,15 @@ async def test_autotoggle_compaction(manager: ManagerClient, volumes_factory: Ca
logger.info("Create a big file on the target node to reach critical disk utilization level")
disk_info = psutil.disk_usage(workdir)
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)
logger.info("Restart the node")
mark = await log.mark()
await manager.server_restart(servers[0].server_id)
await reconnect_driver(manager)
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)
logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("compaction_manager - Enabled", from_mark=mark)
@@ -242,8 +235,7 @@ async def test_reject_split_compaction(manager: ManagerClient, volumes_factory:
logger.info("Create a big file on the target node to reach critical disk utilization level")
disk_info = psutil.disk_usage(workdir)
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
await log.wait_for(f"Split task .* for table {cf} .* stopped, reason: Compaction for {cf} was stopped due to: drain", from_mark=mark)
await log.wait_for(f"Split task .* for table {cf} .* stopped, reason: Compaction for {cf} was stopped due to: drain")
@pytest.mark.asyncio
@@ -268,7 +260,6 @@ async def test_split_compaction_not_triggered(manager: ManagerClient, volumes_fa
logger.info("Create a big file on the target node to reach critical disk utilization level")
disk_info = psutil.disk_usage(workdir)
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
s1_mark, _ = await s1_log.wait_for("Reached the critical disk utilization level", from_mark=s1_mark)
for _ in range(2):
s1_mark, _ = await s1_log.wait_for("compaction_manager - Drained", from_mark=s1_mark)
@@ -303,13 +294,10 @@ async def test_tablet_repair(manager: ManagerClient, volumes_factory: Callable)
await manager.server_stop_gracefully(servers[0].server_id)
await manager.server_wipe_sstables(servers[0].server_id, ks, table)
await manager.server_start(servers[0].server_id)
cql = await reconnect_driver(manager)
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
logger.info("Create a big file on the target node to reach critical disk utilization level")
disk_info = psutil.disk_usage(workdir)
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("repair - Drained", from_mark=mark)
@@ -340,18 +328,16 @@ async def test_tablet_repair(manager: ManagerClient, volumes_factory: Callable)
logger.info("Restart the node")
mark = await log.mark()
await manager.server_restart(servers[0].server_id, wait_others=2)
await reconnect_driver(manager)
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
await manager.driver_connect()
for _ in range(2):
mark, _ = await log.wait_for("repair - Drained", from_mark=mark)
logger.info("With blob file removed, wait for the tablet repair to succeed")
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
await manager.api.wait_task(servers[0].ip_addr, task_id)
@pytest.mark.asyncio
async def test_autotoggle_reject_incoming_migrations(manager: ManagerClient, volumes_factory: Callable) -> None:
async def test_autotoogle_reject_incoming_migrations(manager: ManagerClient, volumes_factory: Callable) -> None:
cfg = {
'tablet_load_stats_refresh_interval_in_seconds': 1,
}
@@ -391,7 +377,6 @@ async def test_autotoggle_reject_incoming_migrations(manager: ManagerClient, vol
disk_info = psutil.disk_usage(workdir)
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("database - Set critical disk utilization mode: true", from_mark=mark)
@@ -402,7 +387,6 @@ async def test_autotoggle_reject_incoming_migrations(manager: ManagerClient, vol
mark, _ = await log.wait_for("Streaming for tablet migration .* failed", from_mark=mark)
logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("database - Set critical disk utilization mode: false", from_mark=mark)
@@ -451,7 +435,6 @@ async def test_node_restart_while_tablet_split(manager: ManagerClient, volumes_f
logger.info("Create a big file on the target node to reach critical disk utilization level")
disk_info = psutil.disk_usage(workdir)
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)
@@ -464,11 +447,7 @@ async def test_node_restart_while_tablet_split(manager: ManagerClient, volumes_f
await cql.run_async(f"ALTER TABLE {cf} WITH tablets = {{'min_tablet_count': 2}};")
await coord_log.wait_for(f"Generating resize decision for table {table_id} of type split")
mark = await log.mark()
await manager.server_restart(servers[0].server_id, wait_others=2)
cql = await reconnect_driver(manager)
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
logger.info("Check if tablet split happened")
await assert_resize_task_info(table_id, lambda response: len(response) == 1 and response[0].resize_task_info is not None)
@@ -477,7 +456,6 @@ async def test_node_restart_while_tablet_split(manager: ManagerClient, volumes_f
assert await log.grep(f"compaction.*Split {cf}", from_mark=mark) == []
logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("compaction_manager - Enabled", from_mark=mark)
mark, _ = await log.wait_for(f"Detected tablet split for table {cf}, increasing from 1 to 2 tablets", from_mark=mark)
@@ -543,7 +521,6 @@ async def test_repair_failure_on_split_rejection(manager: ManagerClient, volumes
logger.info("Create a big file on the target node to reach critical disk utilization level")
disk_info = psutil.disk_usage(workdir)
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)
@@ -556,100 +533,9 @@ async def test_repair_failure_on_split_rejection(manager: ManagerClient, volumes
assert await log.grep(f"compaction.*Split {cf}", from_mark=mark) == []
logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
for _ in range(2):
mark, _ = await log.wait_for("compaction_manager - Enabled", from_mark=mark)
await repair_task
mark, _ = await log.wait_for(f"Detected tablet split for table {cf}", from_mark=mark)
# Since we create 20M volumes, we need to reduce the commitlog segment size
# otherwise we hit out of space.
# NOTE(review): critical-disk-utilization-level=1.0 presumably neutralizes the
# disk-space monitor's rejection behavior (matching the variable name) — confirm.
global_cmdline_with_disabled_monitor = [
"--disk-space-monitor-normal-polling-interval-in-seconds", "1",
"--critical-disk-utilization-level", "1.0",
"--commitlog-segment-size-in-mb", "2",
"--schema-commitlog-segment-size-in-mb", "4",
"--tablet-load-stats-refresh-interval-in-seconds", "1",
]
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_sstables_incrementally_released_during_streaming(manager: ManagerClient, volumes_factory: Callable) -> None:
"""
Test that source node will not run out of space if major compaction rewrites the sstables being streamed.
Expects the file streaming and major will both release sstables incrementally, reducing chances of 2x
space amplification.
Scenario:
- Create a 2-node cluster with limited disk space.
- Create a table with 2 tablets, one in each node
- Write 20% of node capacity to each tablet.
- Start decommissioning one node.
- During streaming, create a large file on the source node to push it over 85%
- Run major expecting the file streaming released the sstables incrementally. Had it not, source node runs out of space.
- Unblock streaming
- Verify that the decommission operation succeeds.
"""
cmdline = [*global_cmdline_with_disabled_monitor,
"--logger-log-level", "load_balancer=debug",
"--logger-log-level", "debug_error_injection=debug"
]
# the coordinator needs more space, so creating a 40M volume for it.
async with space_limited_servers(manager, volumes_factory, ["40M", "20M"], cmdline=cmdline,
property_file=[{"dc": "dc1", "rack": "r1"}]*2) as servers:
cql, _ = await manager.get_ready_cql(servers)
# servers[1] is the space-constrained node we will decommission.
workdir = await manager.server_get_workdir(servers[1].server_id)
log = await manager.server_open_log(servers[1].server_id)
async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'dc1': ['{servers[1].rack}'] }}"
" AND tablets = {'initial': 2}") as ks:
await manager.disable_tablet_balancing()
# Needs 1mb fragments in order to stress incremental release in file streaming
extra_table_param = "WITH compaction = {'class' : 'IncrementalCompactionStrategy', 'sstable_size_in_mb' : '1'} and compression = {}"
async with new_test_table(manager, ks, "pk int PRIMARY KEY, t text", extra_table_param) as cf:
before_disk_info = psutil.disk_usage(workdir)
# About 4mb per tablet
await asyncio.gather(*[cql.run_async(query) for query in write_generator(cf, 8000)])
# split data into 1mb fragments
await manager.api.keyspace_flush(servers[1].ip_addr, ks)
await manager.api.keyspace_compaction(servers[1].ip_addr, ks)
after_disk_info = psutil.disk_usage(workdir)
percent_by_writes = after_disk_info.percent - before_disk_info.percent
logger.info(f"Percent taken by writes {percent_by_writes}")
# assert sstable data content account for more than 20% of node's storage.
assert percent_by_writes > 20
# We want to trap only migrations which happened during decommission
await manager.api.quiesce_topology(servers[0].ip_addr)
# Block tablet streaming right before it finishes, keeping it in-flight.
await manager.api.enable_injection(servers[1].ip_addr, "tablet_stream_files_end_wait", one_shot=True)
mark = await log.mark()
logger.info(f"Workdir {workdir}")
decomm_task = asyncio.create_task(manager.decommission_node(servers[1].server_id))
await manager.enable_tablet_balancing()
mark, _ = await log.wait_for("tablet_stream_files_end_wait: waiting", from_mark=mark)
disk_info = psutil.disk_usage(workdir)
# Fill the disk to ~85% while streaming is still paused.
with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
disk_info = psutil.disk_usage(workdir)
logger.info(f"Percent used before major {disk_info.percent}")
# Run major in order to try to reproduce 2x space amplification if files aren't released
# incrementally by streamer.
await manager.api.keyspace_compaction(servers[1].ip_addr, ks)
await asyncio.gather(*[cql.run_async(query) for query in write_generator(cf, 100)])
disk_info = psutil.disk_usage(workdir)
logger.info(f"Percent used after major {disk_info.percent}")
# Unblock streaming; decommission must now complete successfully.
await manager.api.message_injection(servers[1].ip_addr, "tablet_stream_files_end_wait")
await decomm_task

View File

@@ -1102,7 +1102,7 @@ SEASTAR_TEST_CASE(vector_store_client_https_wrong_hostname) {
}));
}
SEASTAR_TEST_CASE(vector_store_client_https_wrong_cacert_verification_error) {
SEASTAR_TEST_CASE(vector_store_client_https_different_ca_cert_verification_error) {
auto broken_cert = co_await seastar::make_tmp_file();
certificates certs;
auto server = co_await make_vs_mock_server(co_await make_server_credentials(certs));
@@ -1129,33 +1129,6 @@ SEASTAR_TEST_CASE(vector_store_client_https_wrong_cacert_verification_error) {
}));
}
// Verifies that an HTTPS request to the vector store fails cleanly when the
// configured truststore does not contain the server's CA. The endpoint host is
// used verbatim as its own DNS entry below — presumably an IP literal (per the
// test name); confirm against make_vs_mock_server(). The failure must surface
// from ann() as vector_store_client::service_unavailable, not as a crash/hang.
SEASTAR_TEST_CASE(vector_store_client_https_wrong_cacert_verification_error_host_is_ip) {
    // An empty temporary file is used as the truststore, so certificate
    // verification against the mock server's self-signed CA cannot succeed.
    auto broken_cert = co_await seastar::make_tmp_file();
    certificates certs;
    auto server = co_await make_vs_mock_server(co_await make_server_credentials(certs));
    auto cfg = make_config();
    cfg.db_config->vector_store_primary_uri.set(format("https://{}:{}", server->host(), server->port()));
    cfg.db_config->vector_store_encryption_options.set({{"truststore", broken_cert.get_path().string()}});
    co_await do_with_cql_env(
            [&](cql_test_env& env) -> future<> {
                auto as = abort_source_timeout();
                auto schema = co_await create_test_table(env, "ks", "idx");
                auto& vs = env.local_qp().vector_store_client();
                // Map the host to itself so name resolution succeeds and the
                // failure can only come from TLS verification.
                configure(vs).with_dns({{server->host(), std::vector<std::string>{server->host()}}});
                vs.start_background_tasks();
                auto keys = co_await vs.ann("ks", "idx", schema, std::vector<float>{0.1, 0.2, 0.3}, 2, rjson::empty_object(), as.reset());
                // The TLS failure is reported as an unavailable service, not an exception.
                BOOST_REQUIRE(!keys);
                BOOST_CHECK(std::holds_alternative<vector_store_client::service_unavailable>(keys.error()));
            },
            cfg)
            .finally(seastar::coroutine::lambda([&] -> future<> {
                co_await server->stop();
                co_await remove(broken_cert);
            }));
}
SEASTAR_TEST_CASE(vector_store_client_high_availability_unreachable) {
auto server = co_await make_vs_mock_server();
auto unreachable = co_await make_unreachable_socket();

View File

@@ -690,9 +690,6 @@ void cluster_repair_operation(scylla_rest_client& client, const bpo::variables_m
// will repair also their colocated tables.
continue;
}
if (tables.empty() && std::string(ex.what()).contains("Can't find a column family")) {
continue;
}
log("ERROR: Repair request for keyspace={} table={} failed with {}", keyspace, table, ex);
exit_code = EXIT_FAILURE;
}

View File

@@ -67,17 +67,14 @@ void result_message::visitor_base::visit(const result_message::exception& ex) {
ex.throw_me();
}
result_message::prepared::prepared(cql3::prepared_statements_cache::pinned_value_type prepared_entry, bool support_lwt_opt)
: _prepared_entry(std::move(prepared_entry))
result_message::prepared::prepared(cql3::statements::prepared_statement::checked_weak_ptr prepared, bool support_lwt_opt)
: _prepared(std::move(prepared))
, _metadata(
(*_prepared_entry)->bound_names,
(*_prepared_entry)->partition_key_bind_indices,
support_lwt_opt ? (*_prepared_entry)->statement->is_conditional() : false)
, _result_metadata{extract_result_metadata((*_prepared_entry)->statement)}
_prepared->bound_names,
_prepared->partition_key_bind_indices,
support_lwt_opt ? _prepared->statement->is_conditional() : false)
, _result_metadata{extract_result_metadata(_prepared->statement)}
{
for (const auto& w : (*_prepared_entry)->warnings){
add_warning(w);
}
}
::shared_ptr<const cql3::metadata> result_message::prepared::extract_result_metadata(::shared_ptr<cql3::cql_statement> statement) {

View File

@@ -13,7 +13,6 @@
#include <concepts>
#include "cql3/result_set.hh"
#include "cql3/prepared_statements_cache.hh"
#include "cql3/statements/prepared_statement.hh"
#include "cql3/query_options.hh"
@@ -31,14 +30,14 @@ namespace messages {
class result_message::prepared : public result_message {
private:
cql3::prepared_statements_cache::pinned_value_type _prepared_entry;
cql3::statements::prepared_statement::checked_weak_ptr _prepared;
cql3::prepared_metadata _metadata;
::shared_ptr<const cql3::metadata> _result_metadata;
protected:
prepared(cql3::prepared_statements_cache::pinned_value_type prepared_entry, bool support_lwt_opt);
prepared(cql3::statements::prepared_statement::checked_weak_ptr prepared, bool support_lwt_opt);
public:
cql3::statements::prepared_statement::checked_weak_ptr get_prepared() {
return (*_prepared_entry)->checked_weak_from_this();
cql3::statements::prepared_statement::checked_weak_ptr& get_prepared() {
return _prepared;
}
const cql3::prepared_metadata& metadata() const {
@@ -50,7 +49,7 @@ public:
}
cql3::cql_metadata_id_type get_metadata_id() const {
return (*_prepared_entry)->get_metadata_id();
return _prepared->get_metadata_id();
}
class cql;
@@ -167,8 +166,8 @@ std::ostream& operator<<(std::ostream& os, const result_message::set_keyspace& m
class result_message::prepared::cql : public result_message::prepared {
bytes _id;
public:
cql(const bytes& id, cql3::prepared_statements_cache::pinned_value_type prepared_entry, bool support_lwt_opt)
: result_message::prepared(std::move(prepared_entry), support_lwt_opt)
cql(const bytes& id, cql3::statements::prepared_statement::checked_weak_ptr p, bool support_lwt_opt)
: result_message::prepared(std::move(p), support_lwt_opt)
, _id{id}
{ }

View File

@@ -715,6 +715,15 @@ void write_collection_value(bytes_ostream& out, atomic_cell_value_view val) {
}
}
// Copies the bytes of `val` into `out`, spanning however many destination
// fragments are needed, and advances both views past the copied data.
// NOTE(review): assumes `out` has at least val.size() bytes remaining — if it
// runs out, current_fragment() would be empty and the loop would spin; confirm
// callers guarantee capacity.
void write_fragmented(managed_bytes_mutable_view& out, std::string_view val) {
    while (!val.empty()) {
        auto dst = out.current_fragment();
        const size_t chunk = std::min(val.size(), dst.size());
        memcpy(dst.data(), val.data(), chunk);
        val.remove_prefix(chunk);
        out.remove_prefix(chunk);
    }
}
template<std::integral T>
void write_simple(managed_bytes_mutable_view& out, std::type_identity_t<T> val) {
val = net::hton(val);

View File

@@ -566,16 +566,6 @@ inline managed_bytes::managed_bytes(const managed_bytes& o) {
}
}
// Copies `val` into the fragmented destination `out`, writing at most one
// destination fragment's worth per iteration; both views are advanced past
// the copied data.
// NOTE(review): assumes `out` has at least val.size() bytes remaining;
// otherwise current_fragment() would be empty and this would loop forever —
// confirm callers guarantee capacity.
inline
void write_fragmented(managed_bytes_mutable_view& out, std::string_view val) {
    while (val.size() > 0) {
        size_t current_n = std::min(val.size(), out.current_fragment().size());
        memcpy(out.current_fragment().data(), val.data(), current_n);
        val.remove_prefix(current_n);
        out.remove_prefix(current_n);
    }
}
template<>
struct appending_hash<managed_bytes_view> {
template<Hasher Hasher>

View File

@@ -10,7 +10,6 @@
#include <array>
#include <type_traits>
#include <algorithm>
#include "utils/allocation_strategy.hh"
@@ -28,8 +27,10 @@ private:
T _data[0];
external(external&& other) noexcept : _backref(other._backref) {
std::uninitialized_move(other._data, other._data + other._backref->_size, _data);
std::destroy(other._data, other._data + other._backref->_size);
for (unsigned i = 0; i < _backref->size(); i++) {
new (_data + i) T(std::move(other._data[i]));
other._data[i].~T();
}
_backref->_data = _data;
}
size_t storage_size() const noexcept {

View File

@@ -21,7 +21,6 @@
#include <chrono>
#include <fmt/format.h>
#include <netinet/tcp.h>
#include <seastar/net/inet_address.hh>
using namespace seastar;
using namespace std::chrono_literals;
@@ -29,10 +28,6 @@ using namespace std::chrono_literals;
namespace vector_search {
namespace {
bool is_ip_address(const sstring& host) {
return net::inet_address::parse_numerical(host).has_value();
}
class client_connection_factory : public http::experimental::connection_factory {
client::endpoint_type _endpoint;
shared_ptr<tls::certificate_credentials> _creds;
@@ -60,11 +55,7 @@ private:
future<connected_socket> connect() {
auto addr = socket_address(_endpoint.ip, _endpoint.port);
if (_creds) {
tls::tls_options opts;
if (!is_ip_address(_endpoint.host)) {
opts.server_name = _endpoint.host;
}
auto socket = co_await tls::connect(_creds, addr, std::move(opts));
auto socket = co_await tls::connect(_creds, addr, tls::tls_options{.server_name = _endpoint.host});
// tls::connect() only performs the TCP handshake — the TLS handshake is deferred until the first I/O operation.
// Force the TLS handshake to happen here so that the connection timeout applies to it.
co_await tls::check_session_is_resumed(socket);
@@ -133,7 +124,7 @@ seastar::future<client::request_result> client::request(
co_return std::unexpected{aborted_error{}};
}
if (is_server_problem(err)) {
handle_server_unavailable(err);
handle_server_unavailable();
}
co_return std::unexpected{co_await map_err(err)};
}
@@ -174,9 +165,8 @@ seastar::future<> client::close() {
co_await _http_client.close();
}
void client::handle_server_unavailable(std::exception_ptr err) {
void client::handle_server_unavailable() {
if (!is_checking_status_in_progress()) {
_logger.warn("Request to vector store {} {}:{} failed: {}", _endpoint.host, _endpoint.ip, _endpoint.port, err);
_checking_status_future = run_checking_status();
}
}

View File

@@ -12,7 +12,6 @@
#include "utils/log.hh"
#include "utils/updateable_value.hh"
#include <chrono>
#include <exception>
#include <seastar/core/future.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/abort_source.hh>
@@ -61,7 +60,7 @@ private:
seastar::future<response> request_impl(seastar::httpd::operation_type method, seastar::sstring path, std::optional<seastar::sstring> content,
std::optional<seastar::http::reply::status_type>&& expected, seastar::abort_source& as);
seastar::future<bool> check_status();
void handle_server_unavailable(std::exception_ptr err);
void handle_server_unavailable();
seastar::future<> run_checking_status();
bool is_checking_status_in_progress() const;
std::chrono::milliseconds backoff_retry_max() const;

View File

@@ -18,6 +18,15 @@
static_assert(-1 == ~0, "Not a twos-complement architecture");
// Number of leading zero bits in `n`. Unlike count_leading_zeros(), this also
// accepts n == 0, for which it returns the full bit width of uint64_t (64).
static vint_size_type count_leading_zero_bits(uint64_t n) noexcept {
    constexpr auto full_width = std::numeric_limits<uint64_t>::digits;
    return n == 0 ? vint_size_type(full_width) : vint_size_type(count_leading_zeros(n));
}
static constexpr uint64_t encode_zigzag(int64_t n) noexcept {
// The right shift has to be arithmetic and not logical.
return (static_cast<uint64_t>(n) << 1) ^ static_cast<uint64_t>(n >> 63);
@@ -46,9 +55,16 @@ int64_t signed_vint::deserialize(bytes_view v) {
return decode_zigzag(un);
}
vint_size_type signed_vint::serialized_size_from_first_byte(bytes::value_type first_byte) {
return unsigned_vint::serialized_size_from_first_byte(first_byte);
}
// The number of additional bytes that we need to read.
static vint_size_type count_extra_bytes(int8_t first_byte) {
return std::countl_zero(static_cast<uint8_t>(~first_byte));
// Sign extension.
const int64_t v(first_byte);
return count_leading_zero_bits(static_cast<uint64_t>(~v)) - vint_size_type(64 - 8);
}
static void encode(uint64_t value, vint_size_type size, bytes::iterator out) {
@@ -123,3 +139,8 @@ uint64_t unsigned_vint::deserialize(bytes_view v) {
#endif
return result;
}
// Total encoded size (in bytes) of an unsigned vint, derived from its first
// byte alone: a clear high bit means a single-byte encoding; otherwise the
// leading one-bits encode how many extra bytes follow (see count_extra_bytes).
vint_size_type unsigned_vint::serialized_size_from_first_byte(bytes::value_type first_byte) {
    const auto signed_byte = static_cast<int8_t>(first_byte);
    if (signed_byte >= 0) {
        return 1;
    }
    return 1 + count_extra_bytes(signed_byte);
}

View File

@@ -35,7 +35,6 @@
#include "bytes.hh"
#include <cstdint>
#include <bit>
using vint_size_type = bytes::size_type;
@@ -50,9 +49,7 @@ struct unsigned_vint final {
static value_type deserialize(bytes_view v);
static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte) {
return 1 + std::countl_zero(static_cast<uint8_t>(~first_byte));
}
static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte);
};
struct signed_vint final {
@@ -64,7 +61,5 @@ struct signed_vint final {
static value_type deserialize(bytes_view v);
static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte) {
return unsigned_vint::serialized_size_from_first_byte(first_byte);
}
static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte);
};