diff --git a/docs/dev/object_storage.md b/docs/dev/object_storage.md index 441bc88b54..03547e9a29 100644 --- a/docs/dev/object_storage.md +++ b/docs/dev/object_storage.md @@ -290,7 +290,8 @@ The `table` member contains metadata about the table being snapshot. - `table_id` - a universally unique identifier (UUID) of the table set when the table is created. - `tablets_type`: - `none` - if the keyspace uses vnodes replication - - `powof2` - if the keyspace uses tables replication, and the tablet token ranges are based on powers of 2. + - `powof2` - if the keyspace uses tablets replication, and the tablet token ranges are based on powers of 2. + - `arbitrary` - if the keyspace uses tablets replication, and the tablet token ranges and count can be arbitrary. - `tablet_count` - Optional. If `tablets_type` is not `none`, contains the number of tablets allcated in the table. If `tablets_type` is `powof2`, tablet_count would be a power of 2. The `sstables` member is a list containing metadata about the SSTables in the snapshot. diff --git a/locator/tablets.cc b/locator/tablets.cc index b5c3a6285b..5a76311191 100644 --- a/locator/tablets.cc +++ b/locator/tablets.cc @@ -514,6 +514,28 @@ tablet_map::tablet_map(size_t tablet_count, bool with_raft_info, tablet_map::ini } } +tablet_layout tablet_id_map::get_layout(const utils::chunked_vector& last_tokens) { + auto log2count = log2ceil(last_tokens.size()); + if (last_tokens.size() != 1ull << log2count) { + return tablet_layout::arbitrary; + } + for (size_t i = 0; i < last_tokens.size(); ++i) { + auto pow2_last_token = dht::last_token_of_compaction_group(log2count, i); + if (last_tokens[i] != pow2_last_token) { + return tablet_layout::arbitrary; + } + } + return tablet_layout::pow_of_2; +} + +tablet_layout tablet_id_map::get_layout() const { + return get_layout(_last_tokens); +} + +tablet_layout tablet_map::get_layout() const { + return _tablet_ids.get_layout(); +} + tablet_map tablet_map::clone() const { return tablet_map(_tablet_ids, _tablets, _transitions, _resize_decision, _resize_task_info, _repair_scheduler_config, _raft_info); diff --git a/locator/tablets.hh b/locator/tablets.hh index d20e004453..333d746c44 100644 --- a/locator/tablets.hh +++ b/locator/tablets.hh @@ -57,6 +57,33 @@ struct tablet_id { auto operator<=>(const tablet_id&) const = default; }; +/// Determines properties of tablet layout in a given table (tablet_map). +enum class tablet_layout { + // Tablet count is a power of 2. + // Tablet boundaries start and end at whole tokens. + // Every token is owned by exactly one tablet. + // + // Tablet token ranges are adjacent, with subsequent tablets owning higher tokens. + // Tablet of index i covers the range (a, b], where: + // + // a = i == 0 ? dht::minimum_token() : last_token[i-1] + // b = last_token[i] + // + // Boundary tokens are evenly distributed in token space: + // + // last_token[i] = i < tablet_count - 1 ? dht::bias((uint64_t(i + 1) << (64 - log2ceil(tablet_count))) - 1) + // : dht::maximum_token() + pow_of_2, + + // Tablet count can be any integer greater than 0. + // Tablet boundaries end and start at whole tokens. + // Each tablet owns at least one token. + // Every token is owned by exactly one tablet. + // Tablet token ranges are adjacent, with subsequent tablets owning higher tokens. + // As long as the above holds, boundary tokens can be arbitrary. + arbitrary +}; + /// Identifies tablet (not be confused with tablet replica) in the scope of the whole cluster. struct global_tablet_id { table_id table; @@ -604,6 +631,9 @@ public: [[nodiscard]] size_t tablet_count() const { return _last_tokens.size(); } [[nodiscard]] dht::raw_token get_last_token(tablet_id id) const { return _last_tokens[id.value()]; } [[nodiscard]] const utils::chunked_vector& last_tokens() const { return _last_tokens; } + [[nodiscard]] tablet_layout get_layout() const; + + static tablet_layout get_layout(const utils::chunked_vector& last_tokens); /// Adds a new mapping for the next tablet whose range will be recorded to start /// after the last token of the previously added entry and end at last_token (inclusive). @@ -837,6 +867,9 @@ public: /// The higher sibling will own (get_split_token(id), get_last_token(id)]. dht::token get_split_token(tablet_id id) const; + /// Returns tablet_layout of this tablet_map. + tablet_layout get_layout() const; + const locator::resize_decision& resize_decision() const; const tablet_task_info& resize_task_info() const; const std::optional get_repair_scheduler_config() const; diff --git a/replica/table.cc b/replica/table.cc index 4871e74af2..7699935a08 100644 --- a/replica/table.cc +++ b/replica/table.cc @@ -4066,7 +4066,22 @@ public: using snapshot_sstable_set = foreign_ptr>>; -static future<> write_manifest(const locator::topology& topology, snapshot_writer& writer, std::vector sstable_sets, std::vector tablets, sstring name, db::snapshot_options opts, schema_ptr schema, std::optional tablet_count) { +static std::string get_tablets_type(locator::tablet_layout layout) { + switch (layout) { + case locator::tablet_layout::pow_of_2: return "powof2"; + case locator::tablet_layout::arbitrary: return "arbitrary"; + } + on_internal_error(tlogger, format("Unknown tablet layout: {}", static_cast(layout))); +} + +static future<> write_manifest(const locator::topology& topology, + snapshot_writer& writer, + std::vector sstable_sets, + std::vector tablets, + sstring name, db::snapshot_options opts, + schema_ptr schema, + std::optional tablet_count, + std::optional tablet_layout) { manifest_json manifest; manifest_json::info info; @@ -4093,7 +4108,7 @@ static future<> write_manifest(const locator::topology& topology, snapshot_write table.keyspace_name = schema->ks_name(); table.table_name = schema->cf_name(); table.table_id = to_sstring(schema->id()); - table.tablets_type = tablet_count ? "powof2" : "none"; + table.tablets_type = tablet_layout ? get_tablets_type(*tablet_layout) : "none"; table.tablet_count = tablet_count.value_or(0); manifest.table = std::move(table); @@ -4229,12 +4244,14 @@ future<> database::snapshot_table_on_all_shards(sharded& sharded_db, c tlogger.debug("snapshot {}: seal_snapshot", name); const auto& topology = sharded_db.local().get_token_metadata().get_topology(); std::optional tablet_count; + std::optional tablet_layout; std::vector tablets; std::unordered_set tids; if (t.uses_tablets()) { auto erm = t.get_effective_replication_map(); auto& tm = erm->get_token_metadata().tablets().get_tablet_map(s->id()); tablet_count = tm.tablet_count(); + tablet_layout = tm.get_layout(); for (auto& ssts : sstable_sets) { for (auto& sst : *ssts) { auto tok = sst.first_token; @@ -4253,7 +4270,8 @@ future<> database::snapshot_table_on_all_shards(sharded& sharded_db, c } } } - co_await write_manifest(topology, *writer, std::move(sstable_sets), std::move(tablets), name, std::move(opts), s, tablet_count).handle_exception([&] (std::exception_ptr ptr) { + co_await write_manifest(topology, *writer, std::move(sstable_sets), std::move(tablets), name, std::move(opts), s, + tablet_count, tablet_layout).handle_exception([&] (std::exception_ptr ptr) { tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr); ex = std::move(ptr); }); diff --git a/test/boost/database_test.cc b/test/boost/database_test.cc index 51a1827d18..b14d4330a3 100644 --- a/test/boost/database_test.cc +++ b/test/boost/database_test.cc @@ -690,12 +690,17 @@ static future<> validate_manifest(const locator::topology& topology, const fs::p } if (tablets_enabled) { BOOST_REQUIRE(tablets_type.has_value()); - BOOST_REQUIRE_EQUAL(*tablets_type, "powof2"); BOOST_REQUIRE(manifest_table.HasMember("tablet_count")); auto& tablet_count_json = manifest_table["tablet_count"]; BOOST_REQUIRE(tablet_count_json.IsNumber()); uint64_t tablet_count = tablet_count_json.GetInt64(); - BOOST_REQUIRE_EQUAL(tablet_count, 1 << log2ceil(tablet_count)); + if (*tablets_type == "powof2") { + BOOST_REQUIRE_EQUAL(tablet_count, 1 << log2ceil(tablet_count)); + } else if (*tablets_type == "arbitrary") { + BOOST_REQUIRE_GE(tablet_count, 1); + } else { + BOOST_FAIL(format("Unknown tablets_type in manifest: {}", *tablets_type)); + } } else { if (tablets_type) { BOOST_REQUIRE_EQUAL(*tablets_type, "none"); diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc index d57639e6d1..2acf3e1b8d 100644 --- a/test/boost/tablets_test.cc +++ b/test/boost/tablets_test.cc @@ -2527,6 +2527,44 @@ SEASTAR_THREAD_TEST_CASE(test_rack_list_conversion) { }).get(); } +SEASTAR_THREAD_TEST_CASE(test_tablet_map_layout) { + auto cfg = cql_test_config{}; + cfg.db_config->tablets_per_shard_goal.set(10000); // Inhibit scaling-down of the count. + do_with_cql_env_thread([](auto& e) { + auto& stm = e.shared_token_metadata().local(); + + topology_builder topo(e); + topo.add_node(node_state::normal, 1); + + auto ks_name = add_keyspace(e, {{topo.dc(), 1}}, 1); + auto min_10 = std::map({{"min_tablet_count", "10"}, {"pow2_count", "false"}}); // 10 -> 5 -> 3 -> 2 + auto min_8 = std::map({{"min_tablet_count", "8"}, {"pow2_count", "false"}}); + auto min_2 = std::map({{"min_tablet_count", "2"}, {"pow2_count", "false"}}); + auto table1 = add_table(e, ks_name, min_10).get(); + auto table2 = add_table(e, ks_name, min_8).get(); + auto table3 = add_table(e, ks_name, min_8).get(); + + BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table1).tablet_count() == 10); + BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table2).tablet_count() == 8); + BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table3).tablet_count() == 8); + + BOOST_REQUIRE(tablet_layout::arbitrary == stm.get()->tablets().get_tablet_map(table1).get_layout()); + BOOST_REQUIRE(tablet_layout::pow_of_2 == stm.get()->tablets().get_tablet_map(table2).get_layout()); + BOOST_REQUIRE(tablet_layout::pow_of_2 == stm.get()->tablets().get_tablet_map(table3).get_layout()); + + topo.get_shared_load_stats().set_tablet_sizes(stm.get(), table1, 1); + + set_tablet_opts(e, table1, min_2); + rebalance_tablets(e, &topo.get_shared_load_stats()); + + // Even though the count is a power-of-two, boundaries are not aligned due to odd-count merge, + // so the layout should remain arbitrary. + BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table1).tablet_count() == 2); + BOOST_REQUIRE(tablet_layout::arbitrary == stm.get()->tablets().get_tablet_map(table1).get_layout()); + return make_ready_future<>(); + }, std::move(cfg)).get(); +} + SEASTAR_THREAD_TEST_CASE(test_colocation_skipped_on_excluded_nodes) { do_with_cql_env_thread([] (auto& e) { topology_builder topo(e);