Currently, the tablet repair scheduler repairs all replicas of a tablet; it does not support selecting specific hosts or DCs. This is sufficient for most cases, but in production users might still want to limit a repair to certain hosts or DCs. #21985 added the preparation work for the selection config options. This patch adds support for host and DC selection.

Fixes #22417
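A minimal sketch of the resulting data flow (the names below are taken from this patch; the textual wire format of the filters is an internal detail of locator::tablet_task_info and is treated as opaque here). The filters are persisted as text inside the tablet_task_info UDT and parsed back on load:

    // Persisting (see tablet_task_info_to_data_value() below):
    auto hosts_text = locator::tablet_task_info::serialize_repair_hosts_filter(info.repair_hosts_filter);
    auto dcs_text = locator::tablet_task_info::serialize_repair_dcs_filter(info.repair_dcs_filter);

    // Loading (see tablet_task_info_from_cell() below):
    info.repair_hosts_filter = locator::tablet_task_info::deserialize_repair_hosts_filter(hosts_text);
    info.repair_dcs_filter = locator::tablet_task_info::deserialize_repair_dcs_filter(dcs_text);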
/*
 * Copyright (C) 2023-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

#include <fmt/ranges.h>
#include <seastar/coroutine/maybe_yield.hh>

#include "types/types.hh"
#include "types/tuple.hh"
#include "types/list.hh"
#include "db/system_keyspace.hh"
#include "schema/schema_builder.hh"
#include "cql3/query_processor.hh"
#include "cql3/untyped_result_set.hh"
#include "cql3/stats.hh"
#include "gms/feature_service.hh"
#include "replica/database.hh"
#include "replica/tablets.hh"
#include "replica/tablet_mutation_builder.hh"
#include "sstables/sstable_set.hh"
#include "dht/token.hh"
#include "mutation/async_utils.hh"
#include "compaction/compaction_manager.hh"

namespace replica {

using namespace locator;

static thread_local auto repair_scheduler_config_type = user_type_impl::get_instance(
        "system", "repair_scheduler_config", {"auto_repair_enabled", "auto_repair_threshold"},
        {boolean_type, long_type}, false);
static thread_local auto tablet_task_info_type = user_type_impl::get_instance(
        "system", "tablet_task_info", {"request_type", "tablet_task_id", "request_time", "sched_nr", "sched_time", "repair_hosts_filter", "repair_dcs_filter"},
        {utf8_type, uuid_type, timestamp_type, long_type, timestamp_type, utf8_type, utf8_type}, false);
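// Note: repair_hosts_filter and repair_dcs_filter, the two trailing fields
// above, are persisted as text; they are converted to and from the in-memory
// filter representation by
// locator::tablet_task_info::{serialize,deserialize}_repair_{hosts,dcs}_filter.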
static thread_local auto replica_type = tuple_type_impl::get_instance({uuid_type, int32_type});
static thread_local auto replica_set_type = list_type_impl::get_instance(replica_type, false);
static thread_local auto tablet_info_type = tuple_type_impl::get_instance({long_type, long_type, replica_set_type});

data_type get_replica_set_type() {
    return replica_set_type;
}

data_type get_tablet_info_type() {
    return tablet_info_type;
}

void tablet_add_repair_scheduler_user_types(const sstring& ks, replica::database& db) {
    db.find_keyspace(ks).add_user_type(repair_scheduler_config_type);
    db.find_keyspace(ks).add_user_type(tablet_task_info_type);
}

schema_ptr make_tablets_schema() {
    // FIXME: Allow UDTs in system keyspace:
    // CREATE TYPE tablet_replica (replica_id uuid, shard int);
    // replica_set_type = frozen<list<tablet_replica>>
    auto id = generate_legacy_id(db::system_keyspace::NAME, db::system_keyspace::TABLETS);
    // Bump the schema version offset for tablet repair scheduler columns
    return schema_builder(db::system_keyspace::NAME, db::system_keyspace::TABLETS, id)
            .with_column("table_id", uuid_type, column_kind::partition_key)
            .with_column("tablet_count", int32_type, column_kind::static_column)
            .with_column("keyspace_name", utf8_type, column_kind::static_column)
            .with_column("table_name", utf8_type, column_kind::static_column)
            .with_column("last_token", long_type, column_kind::clustering_key)
            .with_column("replicas", replica_set_type)
            .with_column("new_replicas", replica_set_type)
            .with_column("stage", utf8_type)
            .with_column("transition", utf8_type)
            .with_column("session", uuid_type)
            .with_column("resize_type", utf8_type, column_kind::static_column)
            .with_column("resize_seq_number", long_type, column_kind::static_column)
            .with_column("repair_time", timestamp_type)
            .with_column("repair_task_info", tablet_task_info_type)
            .with_column("repair_scheduler_config", repair_scheduler_config_type, column_kind::static_column)
            .with_column("migration_task_info", tablet_task_info_type)
            .with_column("resize_task_info", tablet_task_info_type, column_kind::static_column)
            .with_hash_version()
            .build();
}
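
// For reference, the schema built above corresponds approximately to the
// following CQL (UDT columns shown by their type names):
//
//   CREATE TABLE system.tablets (
//       table_id uuid,
//       last_token bigint,
//       tablet_count int static,
//       keyspace_name text static,
//       table_name text static,
//       resize_type text static,
//       resize_seq_number bigint static,
//       resize_task_info tablet_task_info static,
//       repair_scheduler_config repair_scheduler_config static,
//       replicas frozen<list<tuple<uuid, int>>>,
//       new_replicas frozen<list<tuple<uuid, int>>>,
//       stage text,
//       transition text,
//       session uuid,
//       repair_time timestamp,
//       repair_task_info tablet_task_info,
//       migration_task_info tablet_task_info,
//       PRIMARY KEY (table_id, last_token)
//   );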

std::vector<data_value> replicas_to_data_value(const tablet_replica_set& replicas) {
    std::vector<data_value> result;
    result.reserve(replicas.size());
    for (auto&& replica : replicas) {
        result.emplace_back(make_tuple_value(replica_type, {
            data_value(utils::UUID(replica.host.uuid())),
            data_value(int(replica.shard))
        }));
    }
    return result;
}

data_value tablet_task_info_to_data_value(const locator::tablet_task_info& info) {
    data_value result = make_user_value(tablet_task_info_type, {
        data_value(locator::tablet_task_type_to_string(info.request_type)),
        data_value(info.tablet_task_id.uuid()),
        data_value(info.request_time),
        data_value(info.sched_nr),
        data_value(info.sched_time),
        data_value(locator::tablet_task_info::serialize_repair_hosts_filter(info.repair_hosts_filter)),
        data_value(locator::tablet_task_info::serialize_repair_dcs_filter(info.repair_dcs_filter)),
    });
    return result;
}

data_value repair_scheduler_config_to_data_value(const locator::repair_scheduler_config& config) {
    data_value result = make_user_value(repair_scheduler_config_type, {
        data_value(config.auto_repair_enabled),
        data_value(int64_t(config.auto_repair_threshold.count())),
    });
    return result;
}

future<mutation>
tablet_map_to_mutation(const tablet_map& tablets, table_id id, const sstring& keyspace_name, const sstring& table_name,
                       api::timestamp_type ts, const gms::feature_service& features) {
    auto s = db::system_keyspace::tablets();
    auto gc_now = gc_clock::now();
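    // The partition tombstone is written at ts - 1 so it erases any stale rows
    // from a previous version of the tablet map, while the cells written below
    // at ts survive it.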
    auto tombstone_ts = ts - 1;

    mutation m(s, partition_key::from_single_value(*s,
        data_value(id.uuid()).serialize_nonnull()
    ));
    m.partition().apply(tombstone(tombstone_ts, gc_now));
    m.set_static_cell("tablet_count", data_value(int(tablets.tablet_count())), ts);
    m.set_static_cell("keyspace_name", data_value(keyspace_name), ts);
    m.set_static_cell("table_name", data_value(table_name), ts);
    m.set_static_cell("resize_type", data_value(tablets.resize_decision().type_name()), ts);
    m.set_static_cell("resize_seq_number", data_value(int64_t(tablets.resize_decision().sequence_number)), ts);
    if (features.tablet_resize_virtual_task && tablets.resize_task_info().is_valid()) {
        m.set_static_cell("resize_task_info", tablet_task_info_to_data_value(tablets.resize_task_info()), ts);
    }
    if (features.tablet_repair_scheduler) {
        m.set_static_cell("repair_scheduler_config", repair_scheduler_config_to_data_value(tablets.repair_scheduler_config()), ts);
    }

    tablet_id tid = tablets.first_tablet();
    for (auto&& tablet : tablets.tablets()) {
        auto last_token = tablets.get_last_token(tid);
        auto ck = clustering_key::from_single_value(*s, data_value(dht::token::to_int64(last_token)).serialize_nonnull());
        m.set_clustered_cell(ck, "replicas", make_list_value(replica_set_type, replicas_to_data_value(tablet.replicas)), ts);
        if (features.tablet_migration_virtual_task && tablet.migration_task_info.is_valid()) {
            m.set_clustered_cell(ck, "migration_task_info", tablet_task_info_to_data_value(tablet.migration_task_info), ts);
        }
        if (features.tablet_repair_scheduler) {
            if (tablet.repair_task_info.is_valid()) {
                m.set_clustered_cell(ck, "repair_task_info", tablet_task_info_to_data_value(tablet.repair_task_info), ts);
            }
            if (tablet.repair_time != db_clock::time_point{}) {
                m.set_clustered_cell(ck, "repair_time", data_value(tablet.repair_time), ts);
            }
        }
        if (auto tr_info = tablets.get_tablet_transition_info(tid)) {
            m.set_clustered_cell(ck, "stage", tablet_transition_stage_to_string(tr_info->stage), ts);
            m.set_clustered_cell(ck, "transition", tablet_transition_kind_to_string(tr_info->transition), ts);
            m.set_clustered_cell(ck, "new_replicas", make_list_value(replica_set_type, replicas_to_data_value(tr_info->next)), ts);
            if (tr_info->session_id) {
                m.set_clustered_cell(ck, "session", data_value(tr_info->session_id.uuid()), ts);
            }
        }
        tid = *tablets.next_tablet(tid);
        co_await coroutine::maybe_yield();
    }
    co_return std::move(m);
}

tablet_mutation_builder&
tablet_mutation_builder::set_new_replicas(dht::token last_token, locator::tablet_replica_set replicas) {
    _m.set_clustered_cell(get_ck(last_token), "new_replicas", make_list_value(replica_set_type, replicas_to_data_value(replicas)), _ts);
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::set_replicas(dht::token last_token, locator::tablet_replica_set replicas) {
    _m.set_clustered_cell(get_ck(last_token), "replicas", make_list_value(replica_set_type, replicas_to_data_value(replicas)), _ts);
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::set_stage(dht::token last_token, locator::tablet_transition_stage stage) {
    _m.set_clustered_cell(get_ck(last_token), "stage", data_value(tablet_transition_stage_to_string(stage)), _ts);
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::set_transition(dht::token last_token, locator::tablet_transition_kind kind) {
    _m.set_clustered_cell(get_ck(last_token), "transition", data_value(tablet_transition_kind_to_string(kind)), _ts);
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::set_session(dht::token last_token, service::session_id session_id) {
    _m.set_clustered_cell(get_ck(last_token), "session", data_value(session_id.uuid()), _ts);
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::del_session(dht::token last_token) {
    auto session_col = _s->get_column_definition("session");
    _m.set_clustered_cell(get_ck(last_token), *session_col, atomic_cell::make_dead(_ts, gc_clock::now()));
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::del_transition(dht::token last_token) {
    auto ck = get_ck(last_token);
    auto stage_col = _s->get_column_definition("stage");
    _m.set_clustered_cell(ck, *stage_col, atomic_cell::make_dead(_ts, gc_clock::now()));
    auto transition_col = _s->get_column_definition("transition");
    _m.set_clustered_cell(ck, *transition_col, atomic_cell::make_dead(_ts, gc_clock::now()));
    auto new_replicas_col = _s->get_column_definition("new_replicas");
    _m.set_clustered_cell(ck, *new_replicas_col, atomic_cell::make_dead(_ts, gc_clock::now()));
    auto session_col = _s->get_column_definition("session");
    _m.set_clustered_cell(ck, *session_col, atomic_cell::make_dead(_ts, gc_clock::now()));
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::set_resize_decision(locator::resize_decision resize_decision, const gms::feature_service& features) {
    _m.set_static_cell("resize_type", data_value(resize_decision.type_name()), _ts);
    _m.set_static_cell("resize_seq_number", data_value(int64_t(resize_decision.sequence_number)), _ts);
    if (resize_decision.split_or_merge()) {
        auto resize_task_info = std::holds_alternative<resize_decision::split>(resize_decision.way)
                ? locator::tablet_task_info::make_split_request()
                : locator::tablet_task_info::make_merge_request();
        resize_task_info.sched_nr++;
        resize_task_info.sched_time = db_clock::now();
        return set_resize_task_info(std::move(resize_task_info), features);
    } else {
        return del_resize_task_info(features);
    }
}

tablet_mutation_builder&
tablet_mutation_builder::set_repair_scheduler_config(locator::repair_scheduler_config config) {
    _m.set_static_cell("repair_scheduler_config", repair_scheduler_config_to_data_value(config), _ts);
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::set_repair_time(dht::token last_token, db_clock::time_point repair_time) {
    _m.set_clustered_cell(get_ck(last_token), "repair_time", data_value(repair_time), _ts);
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::set_repair_task_info(dht::token last_token, locator::tablet_task_info repair_task_info) {
    _m.set_clustered_cell(get_ck(last_token), "repair_task_info", tablet_task_info_to_data_value(repair_task_info), _ts);
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::del_repair_task_info(dht::token last_token) {
    auto col = _s->get_column_definition("repair_task_info");
    _m.set_clustered_cell(get_ck(last_token), *col, atomic_cell::make_dead(_ts, gc_clock::now()));
    return *this;
}
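
// A minimal usage sketch (hypothetical: the builder's constructor shape and
// the accessor that yields the final mutation are assumed here, not taken
// from this file): marking a tablet as repaired and clearing its repair
// request in one chained mutation.
//
//     tablet_mutation_builder b(ts, s);          // assumed constructor shape
//     b.set_repair_time(last_token, db_clock::now())
//      .del_repair_task_info(last_token);
//     mutation m = b.build();                    // assumed accessor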

tablet_mutation_builder&
tablet_mutation_builder::set_migration_task_info(dht::token last_token, locator::tablet_task_info migration_task_info, const gms::feature_service& features) {
    if (features.tablet_migration_virtual_task) {
        _m.set_clustered_cell(get_ck(last_token), "migration_task_info", tablet_task_info_to_data_value(migration_task_info), _ts);
    }
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::del_migration_task_info(dht::token last_token, const gms::feature_service& features) {
    if (features.tablet_migration_virtual_task) {
        auto col = _s->get_column_definition("migration_task_info");
        _m.set_clustered_cell(get_ck(last_token), *col, atomic_cell::make_dead(_ts, gc_clock::now()));
    }
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::set_resize_task_info(locator::tablet_task_info resize_task_info, const gms::feature_service& features) {
    if (features.tablet_resize_virtual_task) {
        _m.set_static_cell("resize_task_info", tablet_task_info_to_data_value(resize_task_info), _ts);
    }
    return *this;
}

tablet_mutation_builder&
tablet_mutation_builder::del_resize_task_info(const gms::feature_service& features) {
    if (features.tablet_resize_virtual_task) {
        auto col = _s->get_column_definition("resize_task_info");
        _m.set_static_cell(*col, atomic_cell::make_dead(_ts, gc_clock::now()));
    }
    return *this;
}

mutation make_drop_tablet_map_mutation(table_id id, api::timestamp_type ts) {
    auto s = db::system_keyspace::tablets();
    mutation m(s, partition_key::from_single_value(*s,
        data_value(id.uuid()).serialize_nonnull()
    ));
    m.partition().apply(tombstone(ts, gc_clock::now()));
    return m;
}

tablet_replica_set tablet_replica_set_from_cell(const data_value& v) {
    tablet_replica_set result;
    auto list_v = value_cast<list_type_impl::native_type>(v);
    result.reserve(list_v.size());
    for (const data_value& replica_v : list_v) {
        std::vector<data_value> replica_dv = value_cast<tuple_type_impl::native_type>(replica_v);
        result.emplace_back(
            host_id(value_cast<utils::UUID>(replica_dv[0])),
            shard_id(value_cast<int>(replica_dv[1]))
        );
    }
    return result;
}

static
tablet_replica_set deserialize_replica_set(cql3::untyped_result_set_row::view_type raw_value) {
    return tablet_replica_set_from_cell(
            replica_set_type->deserialize_value(raw_value));
}

locator::tablet_task_info tablet_task_info_from_cell(const data_value& v) {
    std::vector<data_value> dv = value_cast<user_type_impl::native_type>(v);
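    // The element order matches the field order declared in
    // tablet_task_info_type above: request_type, tablet_task_id, request_time,
    // sched_nr, sched_time, repair_hosts_filter, repair_dcs_filter.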
    auto result = locator::tablet_task_info{
        locator::tablet_task_type_from_string(value_cast<sstring>(dv[0])),
        locator::tablet_task_id(value_cast<utils::UUID>(dv[1])),
        value_cast<db_clock::time_point>(dv[2]),
        value_cast<int64_t>(dv[3]),
        value_cast<db_clock::time_point>(dv[4]),
        locator::tablet_task_info::deserialize_repair_hosts_filter(value_cast<sstring>(dv[5])),
        locator::tablet_task_info::deserialize_repair_dcs_filter(value_cast<sstring>(dv[6])),
    };
    return result;
}

static
locator::tablet_task_info deserialize_tablet_task_info(cql3::untyped_result_set_row::view_type raw_value) {
    return tablet_task_info_from_cell(
            tablet_task_info_type->deserialize_value(raw_value));
}

locator::repair_scheduler_config repair_scheduler_config_from_cell(const data_value& v) {
    std::vector<data_value> dv = value_cast<user_type_impl::native_type>(v);
    auto result = locator::repair_scheduler_config{
        value_cast<bool>(dv[0]),
        std::chrono::seconds(value_cast<int64_t>(dv[1])),
    };
    return result;
}

static
locator::repair_scheduler_config deserialize_repair_scheduler_config(cql3::untyped_result_set_row::view_type raw_value) {
    return repair_scheduler_config_from_cell(
            repair_scheduler_config_type->deserialize_value(raw_value));
}

future<> save_tablet_metadata(replica::database& db, const tablet_metadata& tm, api::timestamp_type ts) {
    tablet_logger.trace("Saving tablet metadata: {}", tm);
    std::vector<mutation> muts;
    muts.reserve(tm.all_tables().size());
    for (auto&& [id, tablets] : tm.all_tables()) {
        // FIXME: Should we ignore missing tables? Currently doesn't matter because this is only used in tests.
        auto s = db.find_schema(id);
        muts.emplace_back(
                co_await tablet_map_to_mutation(*tablets, id, s->ks_name(), s->cf_name(), ts, db.features()));
    }
    co_await db.apply(freeze(muts), db::no_timeout);
}

static table_id to_tablet_metadata_key(const schema& s, const partition_key& key) {
    const auto elements = key.explode(s);
    return ::table_id(value_cast<utils::UUID>(uuid_type->deserialize_value(elements.front())));
}

static dht::token to_tablet_metadata_row_key(const schema& s, const clustering_key& key) {
    const auto elements = key.explode(s);
    return dht::token::from_int64(value_cast<int64_t>(long_type->deserialize_value(elements[0])));
}
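
// Record which parts of the tablet metadata a mutation touches.
// Partition-wide changes (partition tombstone, range tombstones, static row
// updates or row deletions) clear the token list, which forces a full re-read
// of the partition; otherwise only the touched rows' tokens are recorded.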
static void do_update_tablet_metadata_change_hint(locator::tablet_metadata_change_hint& hint, const schema& s, const mutation& m) {
    const auto table_id = to_tablet_metadata_key(s, m.key());
    auto it = hint.tables.try_emplace(table_id, locator::tablet_metadata_change_hint::table_hint{table_id, {}}).first;

    const auto& mp = m.partition();
    auto& tokens = it->second.tokens;

    if (mp.partition_tombstone() || !mp.row_tombstones().empty() || !mp.static_row().empty()) {
        // If there is a partition tombstone, range tombstone or static row,
        // update the entire partition. Also clear any row hints that might be
        // present to force a full read of the partition.
        tokens.clear();
        return;
    }

    for (const auto& row : mp.clustered_rows()) {
        // TODO: we do not handle deletions yet, will revisit when tablet count
        // reduction is worked out.
        if (row.row().deleted_at()) {
            tokens.clear();
            return;
        }
        tokens.push_back(to_tablet_metadata_row_key(s, row.key()));
    }
}

static std::optional<tablet_replica_set> maybe_deserialize_replica_set(const rows_entry& row, const column_definition& cdef) {
    const auto* cell = row.row().cells().find_cell(cdef.id);
    if (!cell) {
        return std::nullopt;
    }
    auto dv = cdef.type->deserialize_value(cell->as_atomic_cell(cdef).value());
    return tablet_replica_set_from_cell(dv);
}

static void do_validate_tablet_metadata_change(const locator::tablet_metadata& tm, const schema& s, const mutation& m) {
    const auto table_id = to_tablet_metadata_key(s, m.key());
    const auto& mp = m.partition();

    if (mp.partition_tombstone() || !mp.row_tombstones().empty() || !mp.static_row().empty()) {
        return;
    }

    auto& r_cdef = *s.get_column_definition("replicas");
    auto& nr_cdef = *s.get_column_definition("new_replicas");

    for (const auto& row : mp.clustered_rows()) {
        if (row.row().deleted_at()) {
            return;
        }

        auto new_replicas = maybe_deserialize_replica_set(row, nr_cdef);
        if (!new_replicas) {
            continue;
        }

        auto token = to_tablet_metadata_row_key(s, row.key());
        auto replicas = maybe_deserialize_replica_set(row, r_cdef);
        if (!replicas) {
            replicas = tm.get_tablet_map(table_id).get_tablet_info(token).replicas;
        }

        std::unordered_set<tablet_replica> pending = substract_sets(*new_replicas, *replicas);
        if (pending.size() > 1) {
            throw std::runtime_error(fmt::format("Too many pending replicas for table {} last_token {}: {}",
                    table_id, token, pending));
        }
    }
}
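
// Compute a change hint from a batch of mutations. Returns std::nullopt when
// none of the mutations touch system.tablets.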
std::optional<locator::tablet_metadata_change_hint> get_tablet_metadata_change_hint(const std::vector<canonical_mutation>& mutations) {
    tablet_logger.trace("tablet_metadata_change_hint({})", mutations.size());
    auto s = db::system_keyspace::tablets();

    std::optional<locator::tablet_metadata_change_hint> hint;

    for (const auto& cm : mutations) {
        tablet_logger.trace("tablet_metadata_change_hint() {} == {}", cm.column_family_id(), s->id());
        if (cm.column_family_id() != s->id()) {
            continue;
        }
        if (!hint) {
            hint.emplace();
            hint->tables.reserve(mutations.size());
        }
        do_update_tablet_metadata_change_hint(*hint, *s, cm.to_mutation(s));
    }

    return hint;
}

void validate_tablet_metadata_change(const locator::tablet_metadata& tm, const std::vector<canonical_mutation>& mutations) {
    auto s = db::system_keyspace::tablets();

    for (const auto& cm : mutations) {
        if (cm.column_family_id() != s->id()) {
            continue;
        }

        do_validate_tablet_metadata_change(tm, *s, cm.to_mutation(s));
    }
}

void update_tablet_metadata_change_hint(locator::tablet_metadata_change_hint& hint, const mutation& m) {
    auto s = db::system_keyspace::tablets();
    if (m.column_family_id() != s->id()) {
        return;
    }
    do_update_tablet_metadata_change_hint(hint, *s, m);
}

namespace {
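
// Apply a single system.tablets row to `map`, filling in replicas, transition
// state and repair/migration task info for tablet `tid`, and return the id of
// the next tablet to process. `db` may be null; when set, a changed
// repair_time also schedules a tombstone-gc repair time update on the local
// replica.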
tablet_id process_one_row(replica::database* db, table_id table, tablet_map& map, tablet_id tid, const cql3::untyped_result_set_row& row) {
    tablet_replica_set tablet_replicas;
    if (row.has("replicas")) {
        tablet_replicas = deserialize_replica_set(row.get_view("replicas"));
    }

    tablet_replica_set new_tablet_replicas;
    if (row.has("new_replicas")) {
        new_tablet_replicas = deserialize_replica_set(row.get_view("new_replicas"));
    }

    db_clock::time_point repair_time;
    bool update_repair_time = false;
    if (row.has("repair_time")) {
        repair_time = row.get_as<db_clock::time_point>("repair_time");
        update_repair_time = true;
    }

    locator::tablet_task_info repair_task_info;
    if (row.has("repair_task_info")) {
        repair_task_info = deserialize_tablet_task_info(row.get_view("repair_task_info"));
    }

    locator::tablet_task_info migration_task_info;
    if (row.has("migration_task_info")) {
        migration_task_info = deserialize_tablet_task_info(row.get_view("migration_task_info"));
    }

    if (row.has("stage")) {
        auto stage = tablet_transition_stage_from_string(row.get_as<sstring>("stage"));
        auto transition = tablet_transition_kind_from_string(row.get_as<sstring>("transition"));

        std::unordered_set<tablet_replica> pending = substract_sets(new_tablet_replicas, tablet_replicas);
        if (pending.size() > 1) {
            throw std::runtime_error(fmt::format("Too many pending replicas for table {} tablet {}: {}",
                    table, tid, pending));
        }
        std::optional<tablet_replica> pending_replica;
        if (pending.size() != 0) {
            pending_replica = *pending.begin();
        }
        service::session_id session_id;
        if (row.has("session")) {
            session_id = service::session_id(row.get_as<utils::UUID>("session"));
        }
        map.set_tablet_transition_info(tid, tablet_transition_info{stage, transition,
                std::move(new_tablet_replicas), pending_replica, session_id});
    }

    map.set_tablet(tid, tablet_info{std::move(tablet_replicas), repair_time, repair_task_info, migration_task_info});

    if (update_repair_time && db) {
        auto myid = db->get_token_metadata().get_my_id();
        auto range = map.get_token_range(tid);
        auto& info = map.get_tablet_info(tid);
        for (auto r : info.replicas) {
            if (r.host == myid) {
                auto& gc_state = db->get_compaction_manager().get_tombstone_gc_state();
                gc_state.insert_pending_repair_time_update(table, range, to_gc_clock(repair_time), r.shard);
                tablet_logger.debug("Insert pending repair time for tombstone gc: table={} tablet={} range={} repair_time={}",
                        table, tid, range, repair_time);
                break;
            }
        }
    }

    auto persisted_last_token = dht::token::from_int64(row.get_as<int64_t>("last_token"));
    auto current_last_token = map.get_last_token(tid);
    if (current_last_token != persisted_last_token) {
        tablet_logger.debug("current tablet_map: {}", map);
        throw std::runtime_error(format("last_token mismatch between on-disk ({}) and in-memory ({}) tablet map for table {} tablet {}",
                persisted_last_token, current_last_token, table, tid));
    }

    return *map.next_tablet(tid);
}
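
// Streams rows of system.tablets (which arrive grouped by table_id) into
// tablet_map objects: rows for one table accumulate in `current`, and the
// finished map is flushed into `tm` when the table changes and again at end
// of stream.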
struct tablet_metadata_builder {
    tablet_metadata& tm;
    struct active_tablet_map {
        table_id table;
        tablet_map map;
        tablet_id tid;
    };
    std::optional<active_tablet_map> current;

    void process_row(const cql3::untyped_result_set_row& row, replica::database* db) {
        auto table = table_id(row.get_as<utils::UUID>("table_id"));

        if (!current || current->table != table) {
            if (current) {
                tm.set_tablet_map(current->table, std::move(current->map));
            }
            auto tablet_count = row.get_as<int>("tablet_count");
            auto tmap = tablet_map(tablet_count);
            current = active_tablet_map{table, tmap, tmap.first_tablet()};

            // Resize decision fields are static columns, so set them only once per table.
            if (row.has("resize_type") && row.has("resize_seq_number")) {
                auto resize_type_name = row.get_as<sstring>("resize_type");
                int64_t resize_seq_number = row.get_as<int64_t>("resize_seq_number");

                locator::resize_decision resize_decision(std::move(resize_type_name), resize_seq_number);
                current->map.set_resize_decision(std::move(resize_decision));
            }
            if (row.has("resize_task_info")) {
                current->map.set_resize_task_info(deserialize_tablet_task_info(row.get_view("resize_task_info")));
            }

            if (row.has("repair_scheduler_config")) {
                auto config = deserialize_repair_scheduler_config(row.get_view("repair_scheduler_config"));
                current->map.set_repair_scheduler_config(std::move(config));
            }
        }

        current->tid = process_one_row(db, current->table, current->map, current->tid, row);
    }

    void on_end_of_stream() {
        if (current) {
            tm.set_tablet_map(current->table, std::move(current->map));
        }
    }
};

} // anonymous namespace

future<tablet_metadata> read_tablet_metadata(cql3::query_processor& qp) {
    tablet_metadata tm;
    tablet_metadata_builder builder{tm};
    tablet_logger.trace("Start reading tablet metadata");
    try {
        co_await qp.query_internal("select * from system.tablets",
                [&] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
            builder.process_row(row, qp.db().real_database_ptr());
            return make_ready_future<stop_iteration>(stop_iteration::no);
        });
    } catch (...) {
        if (builder.current) {
            std::throw_with_nested(std::runtime_error(format("Failed to read tablet metadata for table {}", builder.current->table)));
        } else {
            std::throw_with_nested(std::runtime_error("Failed to read tablet metadata"));
        }
    }
    builder.on_end_of_stream();
    tablet_logger.trace("Read tablet metadata: {}", tm);
    co_return std::move(tm);
}

future<std::unordered_set<locator::host_id>> read_required_hosts(cql3::query_processor& qp) {
    std::unordered_set<locator::host_id> hosts;

    auto process_row = [&] (const cql3::untyped_result_set_row& row) {
        tablet_replica_set tablet_replicas;
        if (row.has("replicas")) {
            tablet_replicas = deserialize_replica_set(row.get_view("replicas"));
        }

        for (auto&& r : tablet_replicas) {
            hosts.insert(r.host);
        }

        if (row.has("new_replicas")) {
            tablet_replica_set new_tablet_replicas;
            new_tablet_replicas = deserialize_replica_set(row.get_view("new_replicas"));
            for (auto&& r : new_tablet_replicas) {
                hosts.insert(r.host);
            }
        }
    };

    try {
        co_await qp.query_internal("select replicas, new_replicas from system.tablets",
                [&] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
            process_row(row);
            return make_ready_future<stop_iteration>(stop_iteration::no);
        });
    } catch (...) {
        std::throw_with_nested(std::runtime_error("Failed to read tablet required hosts"));
    }

    co_return std::move(hosts);
}

static future<>
do_update_tablet_metadata_partition(cql3::query_processor& qp, tablet_metadata& tm, const tablet_metadata_change_hint::table_hint& hint) {
    tablet_metadata_builder builder{tm};
    co_await qp.query_internal(
            "select * from system.tablets where table_id = ?",
            db::consistency_level::ONE,
            {data_value(hint.table_id.uuid())},
            1000,
            [&] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
        builder.process_row(row, qp.db().real_database_ptr());
        return make_ready_future<stop_iteration>(stop_iteration::no);
    });
    if (builder.current) {
        tm.set_tablet_map(builder.current->table, std::move(builder.current->map));
    } else {
        tm.drop_tablet_map(hint.table_id);
    }
}

static future<>
do_update_tablet_metadata_rows(replica::database& db, cql3::query_processor& qp, tablet_map& tmap, const tablet_metadata_change_hint::table_hint& hint) {
    for (const auto token : hint.tokens) {
        auto res = co_await qp.execute_internal(
                "select * from system.tablets where table_id = ? and last_token = ?",
                db::consistency_level::ONE,
                {data_value(hint.table_id.uuid()), data_value(dht::token::to_int64(token))},
                cql3::query_processor::cache_internal::yes);
        const auto tid = tmap.get_tablet_id(token);
        if (res->empty()) {
            throw std::runtime_error("Failed to update tablet metadata: updated row is empty");
        } else {
            tmap.clear_tablet_transition_info(tid);
            process_one_row(&db, hint.table_id, tmap, tid, res->one());
        }
    }
}

future<> update_tablet_metadata(replica::database& db, cql3::query_processor& qp, tablet_metadata& tm, const locator::tablet_metadata_change_hint& hint) {
    try {
        for (const auto& [_, table_hint] : hint.tables) {
            if (table_hint.tokens.empty()) {
                co_await do_update_tablet_metadata_partition(qp, tm, table_hint);
            } else {
                co_await tm.mutate_tablet_map_async(table_hint.table_id, [&] (tablet_map& tmap) -> future<> {
                    co_await do_update_tablet_metadata_rows(db, qp, tmap, table_hint);
                });
            }
        }
    } catch (...) {
        std::throw_with_nested(std::runtime_error("Failed to update tablet metadata"));
    }
    tablet_logger.trace("Updated tablet metadata: {}", tm);
}

future<std::vector<canonical_mutation>> read_tablet_mutations(seastar::sharded<replica::database>& db) {
    auto s = db::system_keyspace::tablets();
    auto rs = co_await db::system_keyspace::query_mutations(db, db::system_keyspace::NAME, db::system_keyspace::TABLETS);
    std::vector<canonical_mutation> result;
    result.reserve(rs->partitions().size());
    for (auto& p: rs->partitions()) {
        result.emplace_back(co_await make_canonical_mutation_gently(co_await unfreeze_gently(p.mut(), s)));
    }
    co_return std::move(result);
}

// This sstable set provides access to all the sstables in the table, using a
// snapshot of all its tablets'/storage_groups' compound_sstable_set:s.
// The managed sets cannot be modified through tablet_sstable_set, only jointly
// read from, so insert() and erase() are disabled.
class tablet_sstable_set : public sstables::sstable_set_impl {
    schema_ptr _schema;
    locator::tablet_map _tablet_map;
    // Keep a single (compound) sstable_set per tablet/storage_group.
    absl::flat_hash_map<size_t, lw_shared_ptr<const sstables::sstable_set>, absl::Hash<size_t>> _sstable_sets;
    // Used when ordering is required for correctness; hot paths use the
    // flat_hash_map above, which provides faster lookups.
    std::set<size_t> _sstable_set_ids;
    size_t _size = 0;
    uint64_t _bytes_on_disk = 0;

public:
    tablet_sstable_set(schema_ptr s, const storage_group_manager& sgm, const locator::tablet_map& tmap)
        : _schema(std::move(s))
        , _tablet_map(tmap.tablet_count())
    {
        sgm.for_each_storage_group([this] (size_t id, storage_group& sg) {
            auto set = sg.make_sstable_set();
            _size += set->size();
            _bytes_on_disk += set->bytes_on_disk();
            _sstable_sets[id] = std::move(set);
            _sstable_set_ids.insert(id);
        });
    }

    static lw_shared_ptr<sstables::sstable_set> make(schema_ptr s, const storage_group_manager& sgm, const locator::tablet_map& tmap) {
        return make_lw_shared<sstables::sstable_set>(std::make_unique<tablet_sstable_set>(std::move(s), sgm, tmap));
    }

    const schema_ptr& schema() const noexcept {
        return _schema;
    }

    virtual std::unique_ptr<sstable_set_impl> clone() const override {
        return std::make_unique<tablet_sstable_set>(*this);
    }

    virtual std::vector<sstables::shared_sstable> select(const dht::partition_range& range = query::full_partition_range) const override;
    virtual lw_shared_ptr<const sstable_list> all() const override;
    virtual stop_iteration for_each_sstable_until(std::function<stop_iteration(const sstables::shared_sstable&)> func) const override;
    virtual future<stop_iteration> for_each_sstable_gently_until(std::function<future<stop_iteration>(const sstables::shared_sstable&)> func) const override;
    virtual bool insert(sstables::shared_sstable sst) override;
    virtual bool erase(sstables::shared_sstable sst) override;
    virtual size_t size() const noexcept override {
        return _size;
    }
    virtual uint64_t bytes_on_disk() const noexcept override {
        return _bytes_on_disk;
    }
    virtual selector_and_schema_t make_incremental_selector() const override;

    virtual mutation_reader create_single_key_sstable_reader(
            replica::column_family*,
            schema_ptr,
            reader_permit,
            utils::estimated_histogram&,
            const dht::partition_range&,
            const query::partition_slice&,
            tracing::trace_state_ptr,
            streamed_mutation::forwarding,
            mutation_reader::forwarding,
            const sstables::sstable_predicate&) const override;

    // Will always return an engaged sstable set ptr.
    const lw_shared_ptr<const sstables::sstable_set>& find_sstable_set(size_t i) const {
        auto it = _sstable_sets.find(i);
        if (it == _sstable_sets.end() || !it->second) [[unlikely]] {
            on_internal_error(tablet_logger, format("SSTable set wasn't found for tablet {} of table {}.{}", i, schema()->ks_name(), schema()->cf_name()));
        }
        return it->second;
    }

private:
    size_t group_of(const dht::token& t) const noexcept {
        return _tablet_map.get_tablet_id(t).id;
    }
    dht::token first_token_of(size_t idx) const noexcept {
#ifndef SCYLLA_BUILD_MODE_RELEASE
        if (idx >= _tablet_map.tablet_count()) {
            on_fatal_internal_error(tablet_logger, format("first_token_of: idx={} out of range", idx));
        }
#endif
        return _tablet_map.get_first_token(tablet_id(idx));
    }
    dht::token last_token_of(size_t idx) const noexcept {
#ifndef SCYLLA_BUILD_MODE_RELEASE
        if (idx >= _tablet_map.tablet_count()) {
            on_fatal_internal_error(tablet_logger, format("last_token_of: idx={} out of range", idx));
        }
#endif
        return _tablet_map.get_last_token(tablet_id(idx));
    }
    stop_iteration for_each_sstable_set_until(const dht::partition_range&, std::function<stop_iteration(lw_shared_ptr<const sstables::sstable_set>)>) const;
    future<stop_iteration> for_each_sstable_set_gently_until(const dht::partition_range&, std::function<future<stop_iteration>(lw_shared_ptr<const sstables::sstable_set>)>) const;
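
    // Return the ids of the engaged sstable sets whose tablets may intersect
    // the given partition range. Relies on _sstable_set_ids being ordered.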
    auto subrange(const dht::partition_range& pr) const {
        size_t candidate_start = pr.start() ? group_of(pr.start()->value().token()) : size_t(0);
        size_t candidate_end = pr.end() ? group_of(pr.end()->value().token()) : (_tablet_map.tablet_count() - 1);
        return std::ranges::subrange(_sstable_set_ids.lower_bound(candidate_start), _sstable_set_ids.upper_bound(candidate_end));
    }

    friend class tablet_incremental_selector;
};

lw_shared_ptr<sstables::sstable_set> make_tablet_sstable_set(schema_ptr s, const storage_group_manager& sgm, const locator::tablet_map& tmap) {
    return tablet_sstable_set::make(std::move(s), sgm, tmap);
}

future<std::optional<tablet_transition_stage>> read_tablet_transition_stage(cql3::query_processor& qp, table_id tid, dht::token last_token) {
    auto rs = co_await qp.execute_internal("select stage from system.tablets where table_id = ? and last_token = ?",
            {tid.uuid(), dht::token::to_int64(last_token)}, cql3::query_processor::cache_internal::no);
    if (rs->empty() || !rs->one().has("stage")) {
        co_return std::nullopt;
    }

    co_return tablet_transition_stage_from_string(rs->one().get_as<sstring>("stage"));
}

stop_iteration tablet_sstable_set::for_each_sstable_set_until(const dht::partition_range& pr, std::function<stop_iteration(lw_shared_ptr<const sstables::sstable_set>)> func) const {
    for (const auto& i : subrange(pr)) {
        const auto& set = find_sstable_set(i);
        if (func(set) == stop_iteration::yes) {
            return stop_iteration::yes;
        }
    }
    return stop_iteration::no;
}

future<stop_iteration> tablet_sstable_set::for_each_sstable_set_gently_until(const dht::partition_range& pr, std::function<future<stop_iteration>(lw_shared_ptr<const sstables::sstable_set>)> func) const {
    for (const auto& i : subrange(pr)) {
        const auto& set = find_sstable_set(i);
        if (co_await func(set) == stop_iteration::yes) {
            co_return stop_iteration::yes;
        }
    }
    co_return stop_iteration::no;
}

std::vector<sstables::shared_sstable> tablet_sstable_set::select(const dht::partition_range& range) const {
    std::vector<sstables::shared_sstable> ret;
    ret.reserve(size());
    for_each_sstable_set_until(range, [&] (lw_shared_ptr<const sstables::sstable_set> set) {
        auto ssts = set->select(range);
        if (ret.empty()) {
            ret = std::move(ssts);
        } else {
            std::move(ssts.begin(), ssts.end(), std::back_inserter(ret));
        }
        return stop_iteration::no;
    });
    tablet_logger.debug("tablet_sstable_set::select: range={} ret={}", range, ret.size());
    return ret;
}

lw_shared_ptr<const sstable_list> tablet_sstable_set::all() const {
    auto ret = make_lw_shared<sstable_list>();
    ret->reserve(size());
    for_each_sstable_set_until(query::full_partition_range, [&] (lw_shared_ptr<const sstables::sstable_set> set) {
        set->for_each_sstable([&] (const sstables::shared_sstable& sst) {
            ret->insert(sst);
        });
        return stop_iteration::no;
    });
    return ret;
}

stop_iteration tablet_sstable_set::for_each_sstable_until(std::function<stop_iteration(const sstables::shared_sstable&)> func) const {
    return for_each_sstable_set_until(query::full_partition_range, [func = std::move(func)] (lw_shared_ptr<const sstables::sstable_set> set) {
        return set->for_each_sstable_until(func);
    });
}

future<stop_iteration> tablet_sstable_set::for_each_sstable_gently_until(std::function<future<stop_iteration>(const sstables::shared_sstable&)> func) const {
    return for_each_sstable_set_gently_until(query::full_partition_range, [func = std::move(func)] (lw_shared_ptr<const sstables::sstable_set> set) {
        return set->for_each_sstable_gently_until(func);
    });
}

bool tablet_sstable_set::insert(sstables::shared_sstable sst) {
    throw_with_backtrace<std::bad_function_call>();
}

bool tablet_sstable_set::erase(sstables::shared_sstable sst) {
    throw_with_backtrace<std::bad_function_call>();
}

class tablet_incremental_selector : public sstables::incremental_selector_impl {
    const tablet_sstable_set& _tset;

    // _cur_set and _cur_selector contain a snapshot
    // for the currently selected compaction_group.
    lw_shared_ptr<const sstables::sstable_set> _cur_set;
    std::optional<sstables::sstable_set::incremental_selector> _cur_selector;
    dht::token _lowest_next_token = dht::maximum_token();

public:
    tablet_incremental_selector(const tablet_sstable_set& tset)
        : _tset(tset)
    {}

    virtual std::tuple<dht::partition_range, std::vector<sstables::shared_sstable>, dht::ring_position_ext> select(const selector_pos& s) override {
        // Always return the minimum singular range, so that incremental_selector::select() will always call this function,
        // which in turn will find the next sstable set to select sstables from.
        const dht::partition_range current_range = dht::partition_range::make_singular(dht::ring_position::min());

        // pos must be monotonically increasing in the weak sense,
        // but the caller can skip to a position outside the current set.
        const dht::ring_position_view& pos = s.pos;
        auto token = pos.token();
        if (!_cur_set || pos.token() >= _lowest_next_token) {
            auto idx = _tset.group_of(token);
            auto pr_end = s.range ? dht::ring_position_view::for_range_end(*s.range) : dht::ring_position_view::max();
            // End of stream is reached when pos is past the end of the read range (i.e. exclude tablets
            // that don't intersect with the range).
            if (dht::ring_position_tri_compare(*_tset.schema(), pos, pr_end) <= 0 && _tset._sstable_set_ids.contains(idx)) {
                _cur_set = _tset.find_sstable_set(idx);
            }
            // Set the next token to point to the next engaged storage group.
            // It will be considered later on when the _cur_set is exhausted.
            _lowest_next_token = find_lowest_next_token(idx);
        }

        if (!_cur_set) {
            auto lowest_next_position = _lowest_next_token.is_maximum()
                    ? dht::ring_position_ext::max()
                    : dht::ring_position_ext::starting_at(_lowest_next_token);
            tablet_logger.debug("tablet_incremental_selector {}.{}: select pos={}: returning 0 sstables, next_pos={}",
                    _tset.schema()->ks_name(), _tset.schema()->cf_name(), pos, lowest_next_position);
            return std::make_tuple(std::move(current_range), std::vector<sstables::shared_sstable>{}, lowest_next_position);
        }

        _cur_selector.emplace(_cur_set->make_incremental_selector());

        auto res = _cur_selector->select(s);
        // Return all sstables selected on the requested position from the first matching sstable set.
        // This assumes that the underlying sstable sets are disjoint in their token ranges, so
        // only one of them contains any given token.
        auto sstables = std::move(res.sstables);
        // Return the lowest next position, such that this function will be called again to select the
        // lowest next position from the selector which previously returned it,
        // until the current selector is exhausted. In that case,
        // jump to the next compaction_group sstable set.
        dht::ring_position_ext next_position = res.next_position;
        if (next_position.is_max()) {
            // _cur_selector is exhausted.
            // Return a position starting at `_lowest_next_token`
            // that was calculated for the _cur_set
            // (unless it's already maximum_token, in which case we just return next_position == ring_position::max()).
            _cur_set = {};
            _cur_selector.reset();
            if (!_lowest_next_token.is_maximum()) {
                next_position = dht::ring_position_ext::starting_at(_lowest_next_token);
            }
        }

        tablet_logger.debug("tablet_incremental_selector {}.{}: select pos={}: returning {} sstables, next_pos={}",
                _tset.schema()->ks_name(), _tset.schema()->cf_name(), pos, sstables.size(), next_position);
        return std::make_tuple(std::move(current_range), std::move(sstables), std::move(next_position));
    }

private:
    // Find the start token of the first engaged sstable_set,
    // starting the search from `current_idx` (exclusive).
    dht::token find_lowest_next_token(size_t current_idx) {
        auto it = _tset._sstable_set_ids.upper_bound(current_idx);
        if (it != _tset._sstable_set_ids.end()) {
            return _tset.first_token_of(*it);
        }
        return dht::maximum_token();
    }
};

mutation_reader
tablet_sstable_set::create_single_key_sstable_reader(
        replica::column_family* cf,
        schema_ptr schema,
        reader_permit permit,
        utils::estimated_histogram& sstable_histogram,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        tracing::trace_state_ptr trace_state,
        streamed_mutation::forwarding fwd,
        mutation_reader::forwarding fwd_mr,
        const sstables::sstable_predicate& predicate) const {
    // The singular partition_range start bound must be engaged.
    auto idx = group_of(pr.start()->value().token());
    const auto& set = find_sstable_set(idx);
    return set->create_single_key_sstable_reader(cf, std::move(schema), std::move(permit), sstable_histogram, pr, slice, trace_state, fwd, fwd_mr, predicate);
}

sstables::sstable_set_impl::selector_and_schema_t tablet_sstable_set::make_incremental_selector() const {
    return std::make_tuple(std::make_unique<tablet_incremental_selector>(*this), *_schema);
}

} // namespace replica