From 50fbac6ea62063096975404f0ead4a29cfccbc97 Mon Sep 17 00:00:00 2001 From: Tomasz Grabiec Date: Wed, 18 Mar 2026 00:19:46 +0100 Subject: [PATCH] tablets: Introduce pow2_count per-table tablet option By default it's true, in which case tablet count of the table is rounded up to a power of two. This option allows lifting this, in which case the count can be arbitrary. This will allow testing the logic of arbitrary tablet count. --- cql3/statements/cf_prop_defs.cc | 2 +- db/tablet_options.cc | 41 +++++++++++++++++++++++++++++---- db/tablet_options.hh | 11 +++++++-- docs/architecture/tablets.rst | 8 +++++-- docs/cql/ddl.rst | 3 +++ service/tablet_allocator.cc | 26 +++++++++++++++++---- 6 files changed, 77 insertions(+), 14 deletions(-) diff --git a/cql3/statements/cf_prop_defs.cc b/cql3/statements/cf_prop_defs.cc index 33da04a7de..997e9435b6 100644 --- a/cql3/statements/cf_prop_defs.cc +++ b/cql3/statements/cf_prop_defs.cc @@ -197,7 +197,7 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name, if (!db.features().tablet_options) { throw exceptions::configuration_exception("tablet options cannot be used until all nodes in the cluster enable this feature"); } - db::tablet_options::validate(*tablet_options_map); + db::tablet_options::validate(*tablet_options_map, db.features()); } if (has_property(KW_STORAGE_ENGINE)) { diff --git a/db/tablet_options.cc b/db/tablet_options.cc index a057d5b993..5662bcb3ec 100644 --- a/db/tablet_options.cc +++ b/db/tablet_options.cc @@ -9,6 +9,7 @@ #include "exceptions/exceptions.hh" #include "db/tablet_options.hh" +#include "gms/feature_service.hh" #include #include "utils/log.hh" @@ -16,6 +17,17 @@ extern logging::logger dblog; namespace db { +static +bool parse_bool_option(const sstring& value) { + if (strcasecmp(value.c_str(), "true") == 0 || strcasecmp(value.c_str(), "yes") == 0 || value == "1") { + return true; + } + if (strcasecmp(value.c_str(), "false") == 0 || strcasecmp(value.c_str(), "no") == 0 || 
value == "0") { + return false; + } + throw std::invalid_argument(format("Invalid boolean value: {}", value)); +} + tablet_options::tablet_options(const map_type& map) { for (auto& [key, value_str] : map) { switch (tablet_options::from_string(key)) { @@ -39,6 +51,9 @@ tablet_options::tablet_options(const map_type& map) { expected_data_size_in_gb.emplace(value); } break; + case tablet_option_type::pow2_count: + pow2_count = parse_bool_option(value_str); + break; } } } @@ -49,6 +64,7 @@ sstring tablet_options::to_string(tablet_option_type hint) { case tablet_option_type::max_tablet_count: return "max_tablet_count"; case tablet_option_type::min_per_shard_tablet_count: return "min_per_shard_tablet_count"; case tablet_option_type::expected_data_size_in_gb: return "expected_data_size_in_gb"; + case tablet_option_type::pow2_count: return "pow2_count"; } } @@ -61,6 +77,8 @@ tablet_option_type tablet_options::from_string(sstring hint_desc) { return tablet_option_type::min_per_shard_tablet_count; } else if (hint_desc == "expected_data_size_in_gb") { return tablet_option_type::expected_data_size_in_gb; + } else if (hint_desc == "pow2_count") { + return tablet_option_type::pow2_count; } else { throw exceptions::syntax_exception(fmt::format("Unknown tablet hint '{}'", hint_desc)); } @@ -80,13 +98,17 @@ std::map tablet_options::to_map() const { if (expected_data_size_in_gb) { res[to_string(tablet_option_type::expected_data_size_in_gb)] = fmt::to_string(*expected_data_size_in_gb); } + if (pow2_count) { + res[to_string(tablet_option_type::pow2_count)] = fmt::to_string(*pow2_count); + } return res; } -void tablet_options::validate(const map_type& map) { +void tablet_options::validate(const map_type& map, const gms::feature_service& features) { std::optional min_tablets; std::optional max_tablets; - + bool pow2_count = features.arbitrary_tablet_boundaries ? 
default_pow2_count : true; + for (auto& [key, value_str] : map) { switch (tablet_options::from_string(key)) { case tablet_option_type::min_tablet_count: @@ -113,12 +135,23 @@ void tablet_options::validate(const map_type& map) { throw exceptions::configuration_exception(format("Invalid value '{}' for expected_data_size_in_gb", value)); } break; + case tablet_option_type::pow2_count: + try { + pow2_count = parse_bool_option(value_str); + } catch (const std::invalid_argument& e) { + throw exceptions::configuration_exception(format("Invalid value '{}' for pow2_count", value_str)); + } + if (!pow2_count && !features.arbitrary_tablet_boundaries) { + throw exceptions::configuration_exception( + "pow2_count cannot be set to false until the arbitrary_tablet_boundaries feature is enabled"); + } + break; } } if (min_tablets && max_tablets) { - auto effective_min = 1u << log2ceil(static_cast(*min_tablets)); - auto effective_max = 1u << log2floor(static_cast(*max_tablets)); + auto effective_min = pow2_count ? 1u << log2ceil(static_cast(*min_tablets)) : static_cast(*min_tablets); + auto effective_max = pow2_count ? 1u << log2floor(static_cast(*max_tablets)) : static_cast(*max_tablets); if (effective_min > effective_max) { throw exceptions::configuration_exception( diff --git a/db/tablet_options.hh b/db/tablet_options.hh index 1431d682d7..be5a808bfa 100644 --- a/db/tablet_options.hh +++ b/db/tablet_options.hh @@ -13,6 +13,8 @@ using namespace seastar; +namespace gms { class feature_service; } + namespace db { // Per-table tablet options @@ -21,28 +23,33 @@ enum class tablet_option_type { max_tablet_count, min_per_shard_tablet_count, expected_data_size_in_gb, + pow2_count, }; struct tablet_options { + // System-wide default for pow2_count if the option is not set. 
+ static const bool default_pow2_count = true; + using map_type = std::map; std::optional min_tablet_count; std::optional max_tablet_count; std::optional min_per_shard_tablet_count; std::optional expected_data_size_in_gb; + std::optional pow2_count; tablet_options() = default; explicit tablet_options(const map_type& map); operator bool() const noexcept { - return min_tablet_count || max_tablet_count || min_per_shard_tablet_count || expected_data_size_in_gb; + return min_tablet_count || max_tablet_count || min_per_shard_tablet_count || expected_data_size_in_gb || pow2_count; } map_type to_map() const; static sstring to_string(tablet_option_type hint); static tablet_option_type from_string(sstring hint_desc); - static void validate(const map_type& map); + static void validate(const map_type& map, const gms::feature_service& features); }; } // namespace db diff --git a/docs/architecture/tablets.rst b/docs/architecture/tablets.rst index b356dbe078..62da18b026 100644 --- a/docs/architecture/tablets.rst +++ b/docs/architecture/tablets.rst @@ -108,6 +108,8 @@ The computed number of tablets a table will have is based on several parameters See :ref:`Per-table tablet options ` for details. * Table-level option ``'max_tablet_count'``. This option sets the maximum number of tablets for the given table See :ref:`Per-table tablet options ` for details. +* Table-level option ``pow2_count``. This option, when set to true, forces the number of tablets for a given table to be a power of 2. + See :ref:`Per-table tablet options ` for details. * Config option ``'tablets_initial_scale_factor'``. This option sets the minimal number of tablets per shard per table globally. This option can be overridden by the table-level option: ``'min_per_shard_tablet_count'``. ``'tablets_initial_scale_factor'`` is ignored if either the keyspace option ``'initial'`` or table-level @@ -126,8 +128,10 @@ will be used as the number of tablets for the given table. 
When both ``'min_tablet_count'`` and ``'max_tablet_count'`` are set together, ScyllaDB validates the combination by computing **effective** bounds: - * The **effective minimum** is the smallest power of 2 that is greater than or equal to ``min_tablet_count``. - * The **effective maximum** is the largest power of 2 that is less than or equal to ``max_tablet_count``. + * The **effective minimum** is the smallest power of 2 that is greater than or equal to ``min_tablet_count`` if ``pow2_count`` is true, + or simply ``min_tablet_count`` otherwise. + * The **effective maximum** is the largest power of 2 that is less than or equal to ``max_tablet_count`` if ``pow2_count`` is true, + or simply ``max_tablet_count`` otherwise. ScyllaDB validates that the effective minimum does not exceed the effective maximum. If it does, the ``CREATE TABLE`` statement will be rejected with an error. To avoid ambiguity, it is recommended diff --git a/docs/cql/ddl.rst b/docs/cql/ddl.rst index 4a36eedf0d..6e55fa5923 100644 --- a/docs/cql/ddl.rst +++ b/docs/cql/ddl.rst @@ -500,6 +500,7 @@ Creating a new table uses the ``CREATE TABLE`` statement: tablet_option: 'expected_data_size_in_gb' ':' : | 'min_per_shard_tablet_count' ':' : | 'min_tablet_count' ':' + : | 'pow2_count' ':' ( 'true' | 'false' ) For instance:: @@ -1138,6 +1139,8 @@ if its data size, or performance requirements are known in advance. This enables efficient file-based streaming during restore. Setting both ``min_tablet_count`` and ``max_tablet_count`` to the same value fixes the tablet count for the table. + ``pow2_count`` "true" When set to ``true``, the tablet count of a table is always a power of 2. The + count computed from all other factors is rounded up to a power of 2 if needed. 
=============================== =============== =================================================================================== When allocating tablets for a new table, ScyllaDB uses the maximum of the ``initial`` tablets configured for the keyspace diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc index b74f2de9bb..b71da816ba 100644 --- a/service/tablet_allocator.cc +++ b/service/tablet_allocator.cc @@ -146,6 +146,15 @@ db::tablet_options combine_tablet_options(R&& opts) { combined_opts.max_tablet_count = std::min(*combined_opts.max_tablet_count, *opt.max_tablet_count); } } + if (opt.pow2_count) { + // The combined options may disagree, so we need a rule to resolve conflicts. + // pow2_count will be true if any of the options wants pow2_count, because + // we want to treat pow2_count == true as a requirement (for backwards compatibility) + // and pow2_count == false as a preference, not a hard requirement. + if (!combined_opts.pow2_count || *opt.pow2_count) { + combined_opts.pow2_count = *opt.pow2_count; + } + } } if (total_expected_data_size_in_gb_count) { @@ -1715,9 +1724,10 @@ public: size_t target_tablet_count; // Tablet count wanted by scheduler. sstring target_tablet_count_reason; // Winning rule for target_tablet_count value. std::optional avg_tablet_size; // nullopt when stats not yet available. + bool pow2_count; // Whether tablet count for the table should be a power of two. // Final tablet count. - // It's target_tablet_count aligned to power of 2 if arbitrary_tablet_boundaries feature is not enabled. + // It's target_tablet_count aligned to power of 2 if pow2_count == true. size_t target_tablet_count_aligned; resize_decision::way_type resize_decision; // Decision which should be emitted to achieve target_tablet_count_aligned. 
@@ -1867,6 +1877,9 @@ public: auto process_table = [&] (table_id table, const locator::table_group_set& tables, schema_ptr s, db::tablet_options tablet_options, const tablet_aware_replication_strategy* rs, size_t tablet_count) { table_sizing& table_plan = plan.tables[table]; table_plan.current_tablet_count = tablet_count; + table_plan.pow2_count = tablet_options.pow2_count.value_or( + _db.features().arbitrary_tablet_boundaries ? db::tablet_options::default_pow2_count : true); + rs_by_table[table] = rs; // for a group of co-located tablets of size g with average tablet size t, the migration unit @@ -1963,8 +1976,8 @@ public: table_plan.target_tablet_count = target_tablet_count.tablet_count; table_plan.target_tablet_count_reason = target_tablet_count.reason; - lblogger.debug("Table {} ({}.{}) target_tablet_count: {} ({})", table, s->ks_name(), s->cf_name(), - table_plan.target_tablet_count, table_plan.target_tablet_count_reason); + lblogger.debug("Table {} ({}.{}) target_tablet_count: {} ({}), pow2_count: {}, opt: {}", table, s->ks_name(), s->cf_name(), + table_plan.target_tablet_count, table_plan.target_tablet_count_reason, table_plan.pow2_count, tablet_options.to_map()); }; for (const auto& [table, tables] : _tm->tablets().all_table_groups()) { @@ -1973,7 +1986,6 @@ public: } const auto& tmap = _tm->tablets().get_tablet_map(table); auto [s, rs] = get_schema_and_rs(table); - auto tablet_options = combine_tablet_options( tables | std::views::transform([&] (table_id table) { return _db.get_tables_metadata().get_table_if_exists(table); }) | std::views::filter([] (auto t) { return t != nullptr; }) @@ -2095,7 +2107,11 @@ public: // table_plan.resize_decision for (auto&& [table, table_plan] : plan.tables) { - table_plan.target_tablet_count_aligned = 1u << log2ceil(table_plan.target_tablet_count); + if (!table_plan.pow2_count) { + table_plan.target_tablet_count_aligned = table_plan.target_tablet_count; + } else { + table_plan.target_tablet_count_aligned = 1u << 
log2ceil(table_plan.target_tablet_count); + } if (table_plan.target_tablet_count_aligned > table_plan.current_tablet_count) { table_plan.resize_decision = locator::resize_decision::split();