mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-19 16:15:07 +00:00
tablets, database: Advertise 'arbitrary' layout in snapshot manifest
Currently, the manifest advertises "powof2", which is wrong for arbitrary count and boundaries. Introduce a new kind of layout called "arbitrary", and produce it if the tablet map doesn't conform to "powof2" layout. We should also produce tablet boundaries in this case, but that's worked on in a different PR: https://github.com/scylladb/scylladb/pull/28525
This commit is contained in:
@@ -290,7 +290,8 @@ The `table` member contains metadata about the table being snapshot.
|
||||
- `table_id` - a universally unique identifier (UUID) of the table set when the table is created.
|
||||
- `tablets_type`:
|
||||
- `none` - if the keyspace uses vnodes replication
|
||||
- `powof2` - if the keyspace uses tables replication, and the tablet token ranges are based on powers of 2.
|
||||
- `powof2` - if the keyspace uses tablets replication, and the tablet token ranges are based on powers of 2.
|
||||
- `arbitrary` - if the keyspace uses tablets replication, and the tablet token ranges and count can be arbitrary.
|
||||
- `tablet_count` - Optional. If `tablets_type` is not `none`, contains the number of tablets allocated in the table. If `tablets_type` is `powof2`, tablet_count will be a power of 2.
|
||||
|
||||
The `sstables` member is a list containing metadata about the SSTables in the snapshot.
|
||||
|
||||
@@ -514,6 +514,28 @@ tablet_map::tablet_map(size_t tablet_count, bool with_raft_info, tablet_map::ini
|
||||
}
|
||||
}
|
||||
|
||||
/// Classifies a set of tablet boundary tokens as either the canonical
/// power-of-2 layout or an arbitrary one.
///
/// The layout is "pow_of_2" only when the tablet count is an exact power of
/// two AND every last-token matches the canonical evenly-spaced split point
/// produced by dht::last_token_of_compaction_group(). Any deviation makes
/// the layout "arbitrary".
tablet_layout tablet_id_map::get_layout(const utils::chunked_vector<dht::raw_token>& last_tokens) {
    const auto count = last_tokens.size();
    const auto log2count = log2ceil(count);

    // A non-power-of-2 count can never be the canonical layout.
    if (count != (1ull << log2count)) {
        return tablet_layout::arbitrary;
    }

    // Count is a power of 2; verify each boundary sits exactly on the
    // canonical split point for that index.
    size_t idx = 0;
    for (const auto& boundary : last_tokens) {
        if (boundary != dht::last_token_of_compaction_group(log2count, idx)) {
            return tablet_layout::arbitrary;
        }
        ++idx;
    }
    return tablet_layout::pow_of_2;
}
|
||||
|
||||
/// Classifies this map's own tablet boundaries (see the static overload).
tablet_layout tablet_id_map::get_layout() const {
    const auto& boundaries = _last_tokens;
    return get_layout(boundaries);
}
|
||||
|
||||
/// Returns the tablet_layout of this tablet_map. The layout is fully
/// determined by the boundary tokens held in the tablet-id map.
tablet_layout tablet_map::get_layout() const {
    return _tablet_ids.get_layout();
}
|
||||
|
||||
tablet_map tablet_map::clone() const {
|
||||
return tablet_map(_tablet_ids, _tablets, _transitions, _resize_decision, _resize_task_info,
|
||||
_repair_scheduler_config, _raft_info);
|
||||
|
||||
@@ -57,6 +57,33 @@ struct tablet_id {
|
||||
auto operator<=>(const tablet_id&) const = default;
|
||||
};
|
||||
|
||||
/// Determines properties of tablet layout in a given table (tablet_map).
/// Used e.g. to decide the "tablets_type" value advertised in snapshot manifests.
enum class tablet_layout {
    // Tablet count is a power of 2.
    // Tablet boundaries start and end at whole tokens.
    // Every token is owned by exactly one tablet.
    //
    // Tablet token ranges are adjacent, with subsequent tablets owning higher tokens.
    // Tablet of index i covers the range (a, b], where:
    //
    //   a = i == 0 ? dht::minimum_token() : last_token[i-1]
    //   b = last_token[i]
    //
    // Boundary tokens are evenly distributed in token space:
    //
    //   last_token[i] = i < tablet_count - 1 ? dht::bias((uint64_t(i + 1) << (64 - log2ceil(tablet_count))) - 1)
    //                                        : dht::maximum_token()
    pow_of_2,

    // Tablet count can be any integer greater than 0.
    // Tablet boundaries end and start at whole tokens.
    // Each tablet owns at least one token.
    // Every token is owned by exactly one tablet.
    // Tablet token ranges are adjacent, with subsequent tablets owning higher tokens.
    // As long as the above holds, boundary tokens can be arbitrary
    // (i.e. need not be the evenly-spaced pow_of_2 split points).
    arbitrary
};
|
||||
|
||||
/// Identifies tablet (not to be confused with tablet replica) in the scope of the whole cluster.
|
||||
struct global_tablet_id {
|
||||
table_id table;
|
||||
@@ -604,6 +631,9 @@ public:
|
||||
[[nodiscard]] size_t tablet_count() const { return _last_tokens.size(); }
|
||||
[[nodiscard]] dht::raw_token get_last_token(tablet_id id) const { return _last_tokens[id.value()]; }
|
||||
[[nodiscard]] const utils::chunked_vector<dht::raw_token>& last_tokens() const { return _last_tokens; }
|
||||
[[nodiscard]] tablet_layout get_layout() const;
|
||||
|
||||
static tablet_layout get_layout(const utils::chunked_vector<dht::raw_token>& last_tokens);
|
||||
|
||||
/// Adds a new mapping for the next tablet whose range will be recorded to start
|
||||
/// after the last token of the previously added entry and end at last_token (inclusive).
|
||||
@@ -837,6 +867,9 @@ public:
|
||||
/// The higher sibling will own (get_split_token(id), get_last_token(id)].
|
||||
dht::token get_split_token(tablet_id id) const;
|
||||
|
||||
/// Returns tablet_layout of this tablet_map.
|
||||
tablet_layout get_layout() const;
|
||||
|
||||
const locator::resize_decision& resize_decision() const;
|
||||
const tablet_task_info& resize_task_info() const;
|
||||
const std::optional<locator::repair_scheduler_config> get_repair_scheduler_config() const;
|
||||
|
||||
@@ -4066,7 +4066,22 @@ public:
|
||||
|
||||
using snapshot_sstable_set = foreign_ptr<std::unique_ptr<utils::chunked_vector<sstables::sstable_snapshot_metadata>>>;
|
||||
|
||||
static future<> write_manifest(const locator::topology& topology, snapshot_writer& writer, std::vector<snapshot_sstable_set> sstable_sets, std::vector<snapshot_tablet_info> tablets, sstring name, db::snapshot_options opts, schema_ptr schema, std::optional<int64_t> tablet_count) {
|
||||
// Translates a tablet_layout value into the string advertised in the snapshot
// manifest's "tablets_type" field ("none" is handled by the caller for
// vnode-based keyspaces).
static std::string get_tablets_type(locator::tablet_layout layout) {
    switch (layout) {
    case locator::tablet_layout::pow_of_2:
        return "powof2";
    case locator::tablet_layout::arbitrary:
        return "arbitrary";
    }
    // All enumerators are handled above; reaching this point means the value
    // is corrupted or a new enumerator was added without updating this switch.
    on_internal_error(tlogger, format("Unknown tablet layout: {}", static_cast<int>(layout)));
}
|
||||
|
||||
static future<> write_manifest(const locator::topology& topology,
|
||||
snapshot_writer& writer,
|
||||
std::vector<snapshot_sstable_set> sstable_sets,
|
||||
std::vector<snapshot_tablet_info> tablets,
|
||||
sstring name, db::snapshot_options opts,
|
||||
schema_ptr schema,
|
||||
std::optional<int64_t> tablet_count,
|
||||
std::optional<locator::tablet_layout> tablet_layout) {
|
||||
manifest_json manifest;
|
||||
|
||||
manifest_json::info info;
|
||||
@@ -4093,7 +4108,7 @@ static future<> write_manifest(const locator::topology& topology, snapshot_write
|
||||
table.keyspace_name = schema->ks_name();
|
||||
table.table_name = schema->cf_name();
|
||||
table.table_id = to_sstring(schema->id());
|
||||
table.tablets_type = tablet_count ? "powof2" : "none";
|
||||
table.tablets_type = tablet_layout ? get_tablets_type(*tablet_layout) : "none";
|
||||
table.tablet_count = tablet_count.value_or(0);
|
||||
manifest.table = std::move(table);
|
||||
|
||||
@@ -4229,12 +4244,14 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
|
||||
tlogger.debug("snapshot {}: seal_snapshot", name);
|
||||
const auto& topology = sharded_db.local().get_token_metadata().get_topology();
|
||||
std::optional<int64_t> tablet_count;
|
||||
std::optional<locator::tablet_layout> tablet_layout;
|
||||
std::vector<snapshot_tablet_info> tablets;
|
||||
std::unordered_set<size_t> tids;
|
||||
if (t.uses_tablets()) {
|
||||
auto erm = t.get_effective_replication_map();
|
||||
auto& tm = erm->get_token_metadata().tablets().get_tablet_map(s->id());
|
||||
tablet_count = tm.tablet_count();
|
||||
tablet_layout = tm.get_layout();
|
||||
for (auto& ssts : sstable_sets) {
|
||||
for (auto& sst : *ssts) {
|
||||
auto tok = sst.first_token;
|
||||
@@ -4253,7 +4270,8 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await write_manifest(topology, *writer, std::move(sstable_sets), std::move(tablets), name, std::move(opts), s, tablet_count).handle_exception([&] (std::exception_ptr ptr) {
|
||||
co_await write_manifest(topology, *writer, std::move(sstable_sets), std::move(tablets), name, std::move(opts), s,
|
||||
tablet_count, tablet_layout).handle_exception([&] (std::exception_ptr ptr) {
|
||||
tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr);
|
||||
ex = std::move(ptr);
|
||||
});
|
||||
|
||||
@@ -690,12 +690,17 @@ static future<> validate_manifest(const locator::topology& topology, const fs::p
|
||||
}
|
||||
if (tablets_enabled) {
|
||||
BOOST_REQUIRE(tablets_type.has_value());
|
||||
BOOST_REQUIRE_EQUAL(*tablets_type, "powof2");
|
||||
BOOST_REQUIRE(manifest_table.HasMember("tablet_count"));
|
||||
auto& tablet_count_json = manifest_table["tablet_count"];
|
||||
BOOST_REQUIRE(tablet_count_json.IsNumber());
|
||||
uint64_t tablet_count = tablet_count_json.GetInt64();
|
||||
BOOST_REQUIRE_EQUAL(tablet_count, 1 << log2ceil(tablet_count));
|
||||
if (*tablets_type == "powof2") {
|
||||
BOOST_REQUIRE_EQUAL(tablet_count, 1 << log2ceil(tablet_count));
|
||||
} else if (*tablets_type == "arbitrary") {
|
||||
BOOST_REQUIRE_GE(tablet_count, 1);
|
||||
} else {
|
||||
BOOST_FAIL(format("Unknown tablets_type in manifest: {}", *tablets_type));
|
||||
}
|
||||
} else {
|
||||
if (tablets_type) {
|
||||
BOOST_REQUIRE_EQUAL(*tablets_type, "none");
|
||||
|
||||
@@ -2527,6 +2527,44 @@ SEASTAR_THREAD_TEST_CASE(test_rack_list_conversion) {
|
||||
}).get();
|
||||
}
|
||||
|
||||
// Verifies tablet_map::get_layout(): a power-of-2 tablet count with canonical
// boundaries reports pow_of_2, while a non-power-of-2 count — or a power-of-2
// count reached via merges that misalign the boundaries — reports arbitrary.
SEASTAR_THREAD_TEST_CASE(test_tablet_map_layout) {
auto cfg = cql_test_config{};
cfg.db_config->tablets_per_shard_goal.set(10000); // Inhibit scaling-down of the count.
do_with_cql_env_thread([](auto& e) {
auto& stm = e.shared_token_metadata().local();

topology_builder topo(e);
topo.add_node(node_state::normal, 1);

auto ks_name = add_keyspace(e, {{topo.dc(), 1}}, 1);
// pow2_count=false lets the balancer keep non-power-of-2 counts.
auto min_10 = std::map<sstring, sstring>({{"min_tablet_count", "10"}, {"pow2_count", "false"}}); // 10 -> 5 -> 3 -> 2
auto min_8 = std::map<sstring, sstring>({{"min_tablet_count", "8"}, {"pow2_count", "false"}});
auto min_2 = std::map<sstring, sstring>({{"min_tablet_count", "2"}, {"pow2_count", "false"}});
auto table1 = add_table(e, ks_name, min_10).get();
auto table2 = add_table(e, ks_name, min_8).get();
auto table3 = add_table(e, ks_name, min_8).get();

// Sanity: requested minimum counts were honored.
BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table1).tablet_count() == 10);
BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table2).tablet_count() == 8);
BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table3).tablet_count() == 8);

// 10 tablets -> not a power of 2 -> arbitrary; 8 tablets with untouched
// (canonical) boundaries -> pow_of_2.
BOOST_REQUIRE(tablet_layout::arbitrary == stm.get()->tablets().get_tablet_map(table1).get_layout());
BOOST_REQUIRE(tablet_layout::pow_of_2 == stm.get()->tablets().get_tablet_map(table2).get_layout());
BOOST_REQUIRE(tablet_layout::pow_of_2 == stm.get()->tablets().get_tablet_map(table3).get_layout());

// Make table1's tablets tiny so the balancer is willing to merge them down.
topo.get_shared_load_stats().set_tablet_sizes(stm.get(), table1, 1);

set_tablet_opts(e, table1, min_2);
rebalance_tablets(e, &topo.get_shared_load_stats());

// Even though the count is a power-of-two, boundaries are not aligned due to odd-count merge,
// so the layout should remain arbitrary.
BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table1).tablet_count() == 2);
BOOST_REQUIRE(tablet_layout::arbitrary == stm.get()->tablets().get_tablet_map(table1).get_layout());
return make_ready_future<>();
}, std::move(cfg)).get();
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_colocation_skipped_on_excluded_nodes) {
|
||||
do_with_cql_env_thread([] (auto& e) {
|
||||
topology_builder topo(e);
|
||||
|
||||
Reference in New Issue
Block a user