From 7af9f5366dd8d4f563d7e4d63ac8caafc67caf2a Mon Sep 17 00:00:00 2001 From: Tomasz Grabiec Date: Wed, 4 Feb 2026 00:54:40 +0100 Subject: [PATCH] tablets, database: Advertise 'arbitrary' layout in snapshot manifest Currently, the manifest advertises "powof2", which is wrong for arbitrary count and boundaries. Introduce a new kind of layout called "arbitrary", and produce it if the tablet map doesn't conform to "powof2" layout. We should also produce tablet boundaries in this case, but that's worked on in a different PR: https://github.com/scylladb/scylladb/pull/28525 --- docs/dev/object_storage.md | 3 ++- locator/tablets.cc | 22 +++++++++++++++++++++ locator/tablets.hh | 33 ++++++++++++++++++++++++++++++++ replica/table.cc | 24 ++++++++++++++++++++--- test/boost/database_test.cc | 9 +++++++-- test/boost/tablets_test.cc | 38 +++++++++++++++++++++++++++++++++++++ 6 files changed, 123 insertions(+), 6 deletions(-) diff --git a/docs/dev/object_storage.md b/docs/dev/object_storage.md index 441bc88b54..03547e9a29 100644 --- a/docs/dev/object_storage.md +++ b/docs/dev/object_storage.md @@ -290,7 +290,8 @@ The `table` member contains metadata about the table being snapshot. - `table_id` - a universally unique identifier (UUID) of the table set when the table is created. - `tablets_type`: - `none` - if the keyspace uses vnodes replication - - `powof2` - if the keyspace uses tables replication, and the tablet token ranges are based on powers of 2. + - `powof2` - if the keyspace uses tablets replication, and the tablet token ranges are based on powers of 2. + - `arbitrary` - if the keyspace uses tablets replication, and the tablet token ranges and count can be arbitrary. - `tablet_count` - Optional. If `tablets_type` is not `none`, contains the number of tablets allcated in the table. If `tablets_type` is `powof2`, tablet_count would be a power of 2. The `sstables` member is a list containing metadata about the SSTables in the snapshot. diff --git a/locator/tablets.cc b/locator/tablets.cc index b5c3a6285b..5a76311191 100644 --- a/locator/tablets.cc +++ b/locator/tablets.cc @@ -514,6 +514,28 @@ tablet_map::tablet_map(size_t tablet_count, bool with_raft_info, tablet_map::ini } } +tablet_layout tablet_id_map::get_layout(const utils::chunked_vector& last_tokens) { + auto log2count = log2ceil(last_tokens.size()); + if (last_tokens.size() != 1ull << log2count) { + return tablet_layout::arbitrary; + } + for (size_t i = 0; i < last_tokens.size(); ++i) { + auto pow2_last_token = dht::last_token_of_compaction_group(log2count, i); + if (last_tokens[i] != pow2_last_token) { + return tablet_layout::arbitrary; + } + } + return tablet_layout::pow_of_2; +} + +tablet_layout tablet_id_map::get_layout() const { + return get_layout(_last_tokens); +} + +tablet_layout tablet_map::get_layout() const { + return _tablet_ids.get_layout(); +} + tablet_map tablet_map::clone() const { return tablet_map(_tablet_ids, _tablets, _transitions, _resize_decision, _resize_task_info, _repair_scheduler_config, _raft_info); diff --git a/locator/tablets.hh b/locator/tablets.hh index d20e004453..333d746c44 100644 --- a/locator/tablets.hh +++ b/locator/tablets.hh @@ -57,6 +57,33 @@ struct tablet_id { auto operator<=>(const tablet_id&) const = default; }; +/// Determines properties of tablet layout in a given table (tablet_map). +enum class tablet_layout { + // Tablet count is a power of 2. + // Tablet boundaries start and end at whole tokens. + // Every token is owned by exactly one tablet. + // + // Tablet token ranges are adjacent, with subsequent tablets owning higher tokens. + // Tablet of index i covers the range (a, b], where: + // + // a = i == 0 ? dht::minimum_token() : last_token[i-1] + // b = last_token[i] + // + // Boundary tokens are evenly distributed in token space: + // + // last_token[i] = i < tablet_count - 1 ? dht::bias((uint64_t(i + 1) << (64 - log2ceil(tablet_count))) - 1) + // : dht::maximum_token() + pow_of_2, + + // Tablet count can be any integer greater than 0. + // Tablet boundaries end and start at whole tokens. + // Each tablet owns at least one token. + // Every token is owned by exactly one tablet. + // Tablet token ranges are adjacent, with subsequent tablets owning higher tokens. + // As long as the above holds, boundary tokens can be arbitrary. + arbitrary +}; + /// Identifies tablet (not be confused with tablet replica) in the scope of the whole cluster. struct global_tablet_id { table_id table; @@ -604,6 +631,9 @@ public: [[nodiscard]] size_t tablet_count() const { return _last_tokens.size(); } [[nodiscard]] dht::raw_token get_last_token(tablet_id id) const { return _last_tokens[id.value()]; } [[nodiscard]] const utils::chunked_vector& last_tokens() const { return _last_tokens; } + [[nodiscard]] tablet_layout get_layout() const; + + static tablet_layout get_layout(const utils::chunked_vector& last_tokens); /// Adds a new mapping for the next tablet whose range will be recorded to start /// after the last token of the previously added entry and end at last_token (inclusive). @@ -837,6 +867,9 @@ public: /// The higher sibling will own (get_split_token(id), get_last_token(id)]. dht::token get_split_token(tablet_id id) const; + /// Returns tablet_layout of this tablet_map. + tablet_layout get_layout() const; + const locator::resize_decision& resize_decision() const; const tablet_task_info& resize_task_info() const; const std::optional get_repair_scheduler_config() const; diff --git a/replica/table.cc b/replica/table.cc index 4871e74af2..7699935a08 100644 --- a/replica/table.cc +++ b/replica/table.cc @@ -4066,7 +4066,22 @@ public: using snapshot_sstable_set = foreign_ptr>>; -static future<> write_manifest(const locator::topology& topology, snapshot_writer& writer, std::vector sstable_sets, std::vector tablets, sstring name, db::snapshot_options opts, schema_ptr schema, std::optional tablet_count) { +static std::string get_tablets_type(locator::tablet_layout layout) { + switch (layout) { + case locator::tablet_layout::pow_of_2: return "powof2"; + case locator::tablet_layout::arbitrary: return "arbitrary"; + } + on_internal_error(tlogger, format("Unknown tablet layout: {}", static_cast(layout))); +} + +static future<> write_manifest(const locator::topology& topology, + snapshot_writer& writer, + std::vector sstable_sets, + std::vector tablets, + sstring name, db::snapshot_options opts, + schema_ptr schema, + std::optional tablet_count, + std::optional tablet_layout) { manifest_json manifest; manifest_json::info info; @@ -4093,7 +4108,7 @@ static future<> write_manifest(const locator::topology& topology, snapshot_write table.keyspace_name = schema->ks_name(); table.table_name = schema->cf_name(); table.table_id = to_sstring(schema->id()); - table.tablets_type = tablet_count ? "powof2" : "none"; + table.tablets_type = tablet_layout ? get_tablets_type(*tablet_layout) : "none"; table.tablet_count = tablet_count.value_or(0); manifest.table = std::move(table); @@ -4229,12 +4244,14 @@ future<> database::snapshot_table_on_all_shards(sharded& sharded_db, c tlogger.debug("snapshot {}: seal_snapshot", name); const auto& topology = sharded_db.local().get_token_metadata().get_topology(); std::optional tablet_count; + std::optional tablet_layout; std::vector tablets; std::unordered_set tids; if (t.uses_tablets()) { auto erm = t.get_effective_replication_map(); auto& tm = erm->get_token_metadata().tablets().get_tablet_map(s->id()); tablet_count = tm.tablet_count(); + tablet_layout = tm.get_layout(); for (auto& ssts : sstable_sets) { for (auto& sst : *ssts) { auto tok = sst.first_token; @@ -4253,7 +4270,8 @@ future<> database::snapshot_table_on_all_shards(sharded& sharded_db, c } } } - co_await write_manifest(topology, *writer, std::move(sstable_sets), std::move(tablets), name, std::move(opts), s, tablet_count).handle_exception([&] (std::exception_ptr ptr) { + co_await write_manifest(topology, *writer, std::move(sstable_sets), std::move(tablets), name, std::move(opts), s, + tablet_count, tablet_layout).handle_exception([&] (std::exception_ptr ptr) { tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr); ex = std::move(ptr); }); diff --git a/test/boost/database_test.cc b/test/boost/database_test.cc index 51a1827d18..b14d4330a3 100644 --- a/test/boost/database_test.cc +++ b/test/boost/database_test.cc @@ -690,12 +690,17 @@ static future<> validate_manifest(const locator::topology& topology, const fs::p } if (tablets_enabled) { BOOST_REQUIRE(tablets_type.has_value()); - BOOST_REQUIRE_EQUAL(*tablets_type, "powof2"); BOOST_REQUIRE(manifest_table.HasMember("tablet_count")); auto& tablet_count_json = manifest_table["tablet_count"]; BOOST_REQUIRE(tablet_count_json.IsNumber()); uint64_t tablet_count = tablet_count_json.GetInt64(); - BOOST_REQUIRE_EQUAL(tablet_count, 1 << log2ceil(tablet_count)); + if (*tablets_type == "powof2") { + BOOST_REQUIRE_EQUAL(tablet_count, 1 << log2ceil(tablet_count)); + } else if (*tablets_type == "arbitrary") { + BOOST_REQUIRE_GE(tablet_count, 1); + } else { + BOOST_FAIL(format("Unknown tablets_type in manifest: {}", *tablets_type)); + } } else { if (tablets_type) { BOOST_REQUIRE_EQUAL(*tablets_type, "none"); diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc index d57639e6d1..2acf3e1b8d 100644 --- a/test/boost/tablets_test.cc +++ b/test/boost/tablets_test.cc @@ -2527,6 +2527,44 @@ SEASTAR_THREAD_TEST_CASE(test_rack_list_conversion) { }).get(); } +SEASTAR_THREAD_TEST_CASE(test_tablet_map_layout) { + auto cfg = cql_test_config{}; + cfg.db_config->tablets_per_shard_goal.set(10000); // Inhibit scaling-down of the count. + do_with_cql_env_thread([](auto& e) { + auto& stm = e.shared_token_metadata().local(); + + topology_builder topo(e); + topo.add_node(node_state::normal, 1); + + auto ks_name = add_keyspace(e, {{topo.dc(), 1}}, 1); + auto min_10 = std::map({{"min_tablet_count", "10"}, {"pow2_count", "false"}}); // 10 -> 5 -> 3 -> 2 + auto min_8 = std::map({{"min_tablet_count", "8"}, {"pow2_count", "false"}}); + auto min_2 = std::map({{"min_tablet_count", "2"}, {"pow2_count", "false"}}); + auto table1 = add_table(e, ks_name, min_10).get(); + auto table2 = add_table(e, ks_name, min_8).get(); + auto table3 = add_table(e, ks_name, min_8).get(); + + BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table1).tablet_count() == 10); + BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table2).tablet_count() == 8); + BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table3).tablet_count() == 8); + + BOOST_REQUIRE(tablet_layout::arbitrary == stm.get()->tablets().get_tablet_map(table1).get_layout()); + BOOST_REQUIRE(tablet_layout::pow_of_2 == stm.get()->tablets().get_tablet_map(table2).get_layout()); + BOOST_REQUIRE(tablet_layout::pow_of_2 == stm.get()->tablets().get_tablet_map(table3).get_layout()); + + topo.get_shared_load_stats().set_tablet_sizes(stm.get(), table1, 1); + + set_tablet_opts(e, table1, min_2); + rebalance_tablets(e, &topo.get_shared_load_stats()); + + // Even though the count is a power-of-two, boundaries are not aligned due to odd-count merge, + // so the layout should remain arbitrary. + BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table1).tablet_count() == 2); + BOOST_REQUIRE(tablet_layout::arbitrary == stm.get()->tablets().get_tablet_map(table1).get_layout()); + return make_ready_future<>(); + }, std::move(cfg)).get(); +} + SEASTAR_THREAD_TEST_CASE(test_colocation_skipped_on_excluded_nodes) { do_with_cql_env_thread([] (auto& e) { topology_builder topo(e);