schema: add per-table tablet options
Unlike with vnodes, each tablet is served only by a single shard, and it is associated with a memtable that, when flushed, it creates sstables which token-range is confined to the tablet owning them. On one hand, this allows for far better agility and elasticity since migration of tablets between nodes or shards does not require rewriting most if not all of the sstables, as required with vnodes (at the cleanup phase). Having too few tablets might limit performance due not being served by all shards or by imbalance between shards caused by quantization. The number of tabelts per table has to be a power of 2 with the current design, and when divided by the number of shards, some shards will serve N tablets, while others may serve N+1, and when N is small N+1/N may be significantly larger than 1. For example, with N=1, some shards will serve 2 tablet replicas and some will serve only 1, causing an imbalance of 100%. Now, simply allocating a lot more tablets for each table may theoretically address this problem, but practically: a. Each tablet has memory overhead and having too many tablets in the system with many tables and many tablets for each of them may overwhelm the system's and cause out-of-memory errors. b. Too-small tablets cause a proliferation of small sstables that are less efficient to acces, have higher metadata overhead (due to per-sstable overhead), and might exhaust the system's open file-descriptors limitations. The options introduced in this change can help the user tune the system in two ways: 1. Sizing the table to prevent unnecessary tablet splits and migrations. This can be done when the table is created, or later on, using ALTER TABLE. 2. Controlling min_per_shard_tablet_count to improve tablet balancing, for hot tables. Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
This commit is contained in:
@@ -1012,6 +1012,7 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'db/view/view_update_generator.cc',
|
||||
'db/virtual_table.cc',
|
||||
'db/virtual_tables.cc',
|
||||
'db/tablet_options.cc',
|
||||
'index/secondary_index_manager.cc',
|
||||
'index/secondary_index.cc',
|
||||
'utils/UUID_gen.cc',
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
#include "cql3/statements/cf_prop_defs.hh"
|
||||
#include "cql3/statements/request_validations.hh"
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
#include "db/extensions.hh"
|
||||
#include "db/tags/extension.hh"
|
||||
@@ -20,6 +21,7 @@
|
||||
#include "tombstone_gc.hh"
|
||||
#include "db/per_partition_rate_limit_extension.hh"
|
||||
#include "db/per_partition_rate_limit_options.hh"
|
||||
#include "db/tablet_options.hh"
|
||||
#include "utils/bloom_calculations.hh"
|
||||
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
@@ -52,6 +54,8 @@ const sstring cf_prop_defs::COMPACTION_STRATEGY_CLASS_KEY = "class";
|
||||
|
||||
const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
|
||||
|
||||
const sstring cf_prop_defs::KW_TABLETS = "tablets";
|
||||
|
||||
schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions& exts) const {
|
||||
schema::extensions_map er;
|
||||
for (auto& p : exts.schema_extensions()) {
|
||||
@@ -68,6 +72,14 @@ schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions
|
||||
return er;
|
||||
}
|
||||
|
||||
data_dictionary::keyspace cf_prop_defs::find_keyspace(const data_dictionary::database db, std::string_view ks_name) {
|
||||
try {
|
||||
return db.find_keyspace(ks_name);
|
||||
} catch (const data_dictionary::no_such_keyspace& e) {
|
||||
throw request_validations::invalid_request("{}", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name, const schema::extensions_map& schema_extensions) const {
|
||||
// Skip validation if the comapction strategy class is already set as it means we've already
|
||||
// prepared (and redoing it would set strategyClass back to null, which we don't want)
|
||||
@@ -75,13 +87,15 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
|
||||
return;
|
||||
}
|
||||
|
||||
const auto& ks = find_keyspace(db, ks_name);
|
||||
|
||||
static std::set<sstring> keywords({
|
||||
KW_COMMENT,
|
||||
KW_GCGRACESECONDS, KW_CACHING, KW_DEFAULT_TIME_TO_LIVE,
|
||||
KW_MIN_INDEX_INTERVAL, KW_MAX_INDEX_INTERVAL, KW_SPECULATIVE_RETRY,
|
||||
KW_BF_FP_CHANCE, KW_MEMTABLE_FLUSH_PERIOD, KW_COMPACTION,
|
||||
KW_COMPRESSION, KW_CRC_CHECK_CHANCE, KW_ID, KW_PAXOSGRACESECONDS,
|
||||
KW_SYNCHRONOUS_UPDATES
|
||||
KW_SYNCHRONOUS_UPDATES, KW_TABLETS,
|
||||
});
|
||||
static std::set<sstring> obsolete_keywords({
|
||||
sstring("index_interval"),
|
||||
@@ -162,6 +176,16 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
|
||||
}
|
||||
|
||||
speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring()));
|
||||
|
||||
if (auto tablet_options_map = get_tablet_options()) {
|
||||
if (!ks.uses_tablets()) {
|
||||
throw exceptions::configuration_exception("tablet options cannot be used when tablets are disabled for the keyspace");
|
||||
}
|
||||
if (!db.features().tablet_options) {
|
||||
throw exceptions::configuration_exception("tablet options cannot be used until all nodes in the cluster enable this feature");
|
||||
}
|
||||
db::tablet_options::validate(*tablet_options_map);
|
||||
}
|
||||
}
|
||||
|
||||
std::map<sstring, sstring> cf_prop_defs::get_compaction_type_options() const {
|
||||
@@ -252,6 +276,13 @@ const db::per_partition_rate_limit_options* cf_prop_defs::get_per_partition_rate
|
||||
return &ext->get_options();
|
||||
}
|
||||
|
||||
std::optional<db::tablet_options::map_type> cf_prop_defs::get_tablet_options() const {
|
||||
if (auto tablet_options = get_map(KW_TABLETS)) {
|
||||
return tablet_options.value();
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name) const {
|
||||
if (has_property(KW_COMMENT)) {
|
||||
builder.set_comment(get_string(KW_COMMENT, ""));
|
||||
@@ -351,6 +382,10 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
|
||||
|
||||
builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));
|
||||
}
|
||||
|
||||
if (auto tablet_options_opt = get_map(KW_TABLETS)) {
|
||||
builder.set_tablet_options(std::move(*tablet_options_opt));
|
||||
}
|
||||
}
|
||||
|
||||
void cf_prop_defs::validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const
|
||||
|
||||
@@ -18,12 +18,14 @@
|
||||
|
||||
namespace data_dictionary {
|
||||
class database;
|
||||
class keyspace;
|
||||
}
|
||||
|
||||
class tombstone_gc_options;
|
||||
|
||||
namespace db {
|
||||
class extensions;
|
||||
class tablet_options;
|
||||
}
|
||||
namespace cdc {
|
||||
class options;
|
||||
@@ -60,6 +62,8 @@ public:
|
||||
static const sstring COMPACTION_STRATEGY_CLASS_KEY;
|
||||
static const sstring COMPACTION_ENABLED_KEY;
|
||||
|
||||
static const sstring KW_TABLETS;
|
||||
|
||||
// FIXME: In origin the following consts are in CFMetaData.
|
||||
static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
|
||||
static constexpr int32_t DEFAULT_MIN_INDEX_INTERVAL = 128;
|
||||
@@ -70,6 +74,7 @@ public:
|
||||
|
||||
private:
|
||||
mutable std::optional<sstables::compaction_strategy_type> _compaction_strategy_class;
|
||||
static data_dictionary::keyspace find_keyspace(const data_dictionary::database db, std::string_view ks_name);
|
||||
public:
|
||||
std::optional<sstables::compaction_strategy_type> get_compaction_strategy_class() const;
|
||||
|
||||
@@ -103,6 +108,7 @@ public:
|
||||
int32_t get_paxos_grace_seconds() const;
|
||||
std::optional<table_id> get_id() const;
|
||||
bool get_synchronous_updates_flag() const;
|
||||
std::optional<db::tablet_options::map_type> get_tablet_options() const;
|
||||
|
||||
void apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name) const;
|
||||
void validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const;
|
||||
|
||||
@@ -55,6 +55,11 @@ keyspace::is_internal() const {
|
||||
return _ops->is_internal(*this);
|
||||
}
|
||||
|
||||
bool
|
||||
keyspace::uses_tablets() const {
|
||||
return metadata()->uses_tablets();
|
||||
}
|
||||
|
||||
const locator::abstract_replication_strategy&
|
||||
keyspace::get_replication_strategy() const {
|
||||
return _ops->get_replication_strategy(*this);
|
||||
|
||||
@@ -87,6 +87,7 @@ private:
|
||||
keyspace(const impl* ops, const void* keyspace);
|
||||
public:
|
||||
bool is_internal() const;
|
||||
bool uses_tablets() const;
|
||||
lw_shared_ptr<keyspace_metadata> metadata() const;
|
||||
const user_types_metadata& user_types() const;
|
||||
const locator::abstract_replication_strategy& get_replication_strategy() const;
|
||||
|
||||
@@ -65,6 +65,9 @@ public:
|
||||
std::optional<unsigned> initial_tablets() const {
|
||||
return _initial_tablets;
|
||||
}
|
||||
bool uses_tablets() const noexcept {
|
||||
return _initial_tablets.has_value();
|
||||
}
|
||||
const std::unordered_map<sstring, schema_ptr>& cf_meta_data() const {
|
||||
return _cf_meta_data;
|
||||
}
|
||||
|
||||
@@ -38,7 +38,8 @@ target_sources(db
|
||||
snapshot/backup_task.cc
|
||||
rate_limiter.cc
|
||||
per_partition_rate_limit_options.cc
|
||||
row_cache.cc)
|
||||
row_cache.cc,
|
||||
tablet_options.cc)
|
||||
target_include_directories(db
|
||||
PUBLIC
|
||||
${CMAKE_SOURCE_DIR})
|
||||
|
||||
@@ -336,6 +336,11 @@ schema_ptr scylla_tables(schema_features features) {
|
||||
// In this case, for non-system tables, `version` is null and `schema::version()` will be a hash.
|
||||
sb.with_column("committed_by_group0", boolean_type);
|
||||
}
|
||||
|
||||
// It is safe to add the `tablets` column unconditionally,
|
||||
// since it is written to only after the cluster feature is enabled.
|
||||
sb.with_column("tablets", map_type_impl::get_instance(utf8_type, utf8_type, false));
|
||||
|
||||
sb.with_hash_version();
|
||||
s = sb.build();
|
||||
}
|
||||
@@ -1733,6 +1738,19 @@ mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type times
|
||||
auto& cdef = *scylla_tables()->get_column_definition("partitioner");
|
||||
m.set_clustered_cell(ckey, cdef, atomic_cell::make_dead(timestamp, gc_clock::now()));
|
||||
}
|
||||
// A table will have engaged tablet options
|
||||
// only after they were set by CREATE TABLE or ALTER TABLE,
|
||||
// Meaning the cluster feature is enabled, so it is safe to write
|
||||
// to this columns.
|
||||
if (table->has_tablet_options()) {
|
||||
auto& map = table->raw_tablet_options();
|
||||
auto& cdef = *scylla_tables()->get_column_definition("tablets");
|
||||
if (map.empty()) {
|
||||
m.set_clustered_cell(ckey, cdef, atomic_cell::make_dead(timestamp, gc_clock::now()));
|
||||
} else {
|
||||
m.set_clustered_cell(ckey, cdef, make_map_mutation(map, cdef, timestamp));
|
||||
}
|
||||
}
|
||||
// In-memory tables are deprecated since scylla-2024.1.0
|
||||
// FIXME: delete the column when there's no live version supporting it anymore.
|
||||
// Writing it here breaks upgrade rollback to versions that do not support the in_memory schema_feature
|
||||
@@ -2154,6 +2172,19 @@ static void prepare_builder_from_table_row(const schema_ctxt& ctxt, schema_build
|
||||
}
|
||||
}
|
||||
|
||||
static void prepare_builder_from_scylla_tables_row(const schema_ctxt& ctxt, schema_builder& builder, const query::result_set_row& table_row) {
|
||||
auto in_mem = table_row.get<bool>("in_memory");
|
||||
auto in_mem_enabled = in_mem.value_or(false);
|
||||
if (in_mem_enabled) {
|
||||
slogger.warn("Support for in_memory tables has been deprecated.");
|
||||
}
|
||||
builder.set_in_memory(in_mem_enabled);
|
||||
if (auto opt_map = get_map<sstring, sstring>(table_row, "tablets")) {
|
||||
auto tablet_options = db::tablet_options(*opt_map);
|
||||
builder.set_tablet_options(tablet_options.to_map());
|
||||
}
|
||||
}
|
||||
|
||||
schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, std::optional<table_schema_version> version)
|
||||
{
|
||||
slogger.trace("create_table_from_mutations: version={}, {}", version, sm);
|
||||
@@ -2208,13 +2239,7 @@ schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations
|
||||
if (sm.scylla_tables()) {
|
||||
table_rs = query::result_set(*sm.scylla_tables());
|
||||
if (!table_rs.empty()) {
|
||||
query::result_set_row table_row = table_rs.row(0);
|
||||
auto in_mem = table_row.get<bool>("in_memory");
|
||||
auto in_mem_enabled = in_mem.value_or(false);
|
||||
if (in_mem_enabled) {
|
||||
slogger.warn("Support for in_memory tables has been deprecated.");
|
||||
}
|
||||
builder.set_in_memory(in_mem_enabled);
|
||||
prepare_builder_from_scylla_tables_row(ctxt, builder, table_rs.row(0));
|
||||
}
|
||||
}
|
||||
v3_columns columns(std::move(column_defs), is_dense, is_compound);
|
||||
@@ -2445,6 +2470,13 @@ view_ptr create_view_from_mutations(const schema_ctxt& ctxt, schema_mutations sm
|
||||
schema_builder builder{ks_name, cf_name, id};
|
||||
prepare_builder_from_table_row(ctxt, builder, row);
|
||||
|
||||
if (sm.scylla_tables()) {
|
||||
table_rs = query::result_set(*sm.scylla_tables());
|
||||
if (!table_rs.empty()) {
|
||||
prepare_builder_from_scylla_tables_row(ctxt, builder, table_rs.row(0));
|
||||
}
|
||||
}
|
||||
|
||||
auto computed_columns = get_computed_columns(sm);
|
||||
auto column_defs = create_columns_from_column_rows(ctxt, query::result_set(sm.columns_mutation()), ks_name, cf_name, false, column_view_virtual::no, computed_columns);
|
||||
for (auto&& cdef : column_defs) {
|
||||
|
||||
96
db/tablet_options.cc
Normal file
96
db/tablet_options.cc
Normal file
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
* Copyright 2025-present ScyllaDB
|
||||
*/
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "db/tablet_options.hh"
|
||||
#include "utils/log.hh"
|
||||
|
||||
extern logging::logger dblog;
|
||||
|
||||
namespace db {
|
||||
|
||||
tablet_options::tablet_options(const map_type& map) {
|
||||
for (auto& [key, value_str] : map) {
|
||||
switch (tablet_options::from_string(key)) {
|
||||
case tablet_option_type::min_tablet_count:
|
||||
if (auto value = std::atol(value_str.c_str())) {
|
||||
min_tablet_count.emplace(value);
|
||||
}
|
||||
break;
|
||||
case tablet_option_type::min_per_shard_tablet_count:
|
||||
if (auto value = std::atof(value_str.c_str())) {
|
||||
min_per_shard_tablet_count.emplace(value);
|
||||
}
|
||||
break;
|
||||
case tablet_option_type::expected_data_size_in_gb:
|
||||
if (auto value = std::atol(value_str.c_str())) {
|
||||
expected_data_size_in_gb.emplace(value);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sstring tablet_options::to_string(tablet_option_type hint) {
|
||||
switch (hint) {
|
||||
case tablet_option_type::min_tablet_count: return "min_tablet_count";
|
||||
case tablet_option_type::min_per_shard_tablet_count: return "min_per_shard_tablet_count";
|
||||
case tablet_option_type::expected_data_size_in_gb: return "expected_data_size_in_gb";
|
||||
}
|
||||
}
|
||||
|
||||
tablet_option_type tablet_options::from_string(sstring hint_desc) {
|
||||
if (hint_desc == "min_tablet_count") {
|
||||
return tablet_option_type::min_tablet_count;
|
||||
} else if (hint_desc == "min_per_shard_tablet_count") {
|
||||
return tablet_option_type::min_per_shard_tablet_count;
|
||||
} else if (hint_desc == "expected_data_size_in_gb") {
|
||||
return tablet_option_type::expected_data_size_in_gb;
|
||||
} else {
|
||||
throw exceptions::syntax_exception(fmt::format("Unknown tablet hint '{}'", hint_desc));
|
||||
}
|
||||
}
|
||||
|
||||
std::map<sstring, sstring> tablet_options::to_map() const {
|
||||
std::map<sstring, sstring> res;
|
||||
if (min_tablet_count) {
|
||||
res[to_string(tablet_option_type::min_tablet_count)] = fmt::to_string(*min_tablet_count);
|
||||
}
|
||||
if (min_per_shard_tablet_count) {
|
||||
res[to_string(tablet_option_type::min_per_shard_tablet_count)] = fmt::to_string(*min_per_shard_tablet_count);
|
||||
}
|
||||
if (expected_data_size_in_gb) {
|
||||
res[to_string(tablet_option_type::expected_data_size_in_gb)] = fmt::to_string(*expected_data_size_in_gb);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
void tablet_options::validate(const map_type& map) {
|
||||
for (auto& [key, value_str] : map) {
|
||||
switch (tablet_options::from_string(key)) {
|
||||
case tablet_option_type::min_tablet_count:
|
||||
if (auto value = std::atol(value_str.c_str()); value < 0) {
|
||||
throw exceptions::configuration_exception(format("Invalid value '{}' for min_tablet_count", value));
|
||||
}
|
||||
break;
|
||||
case tablet_option_type::min_per_shard_tablet_count:
|
||||
if (auto value = std::atof(value_str.c_str()); value < 0) {
|
||||
throw exceptions::configuration_exception(format("Invalid value '{}' for min_per_shard_tablet_count", value));
|
||||
}
|
||||
break;
|
||||
case tablet_option_type::expected_data_size_in_gb:
|
||||
if (auto value = std::atol(value_str.c_str()); value < 0) {
|
||||
throw exceptions::configuration_exception(format("Invalid value '{}' for expected_data_size_in_gb", value));
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace db
|
||||
46
db/tablet_options.hh
Normal file
46
db/tablet_options.hh
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright 2025-present ScyllaDB
|
||||
*/
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <map>
|
||||
|
||||
#include <seastar/core/sstring.hh>
|
||||
|
||||
using namespace seastar;
|
||||
|
||||
namespace db {
|
||||
|
||||
// Per-table tablet options
|
||||
enum class tablet_option_type {
|
||||
min_tablet_count,
|
||||
min_per_shard_tablet_count,
|
||||
expected_data_size_in_gb,
|
||||
};
|
||||
|
||||
struct tablet_options {
|
||||
using map_type = std::map<sstring, sstring>;
|
||||
|
||||
std::optional<ssize_t> min_tablet_count;
|
||||
std::optional<double> min_per_shard_tablet_count;
|
||||
std::optional<ssize_t> expected_data_size_in_gb;
|
||||
|
||||
tablet_options() = default;
|
||||
explicit tablet_options(const map_type& map);
|
||||
|
||||
operator bool() const noexcept {
|
||||
return min_tablet_count || min_per_shard_tablet_count || expected_data_size_in_gb;
|
||||
}
|
||||
|
||||
map_type to_map() const;
|
||||
|
||||
static sstring to_string(tablet_option_type hint);
|
||||
static tablet_option_type from_string(sstring hint_desc);
|
||||
static void validate(const map_type& map);
|
||||
};
|
||||
|
||||
} // namespace db
|
||||
@@ -344,9 +344,17 @@ Creating a new table uses the ``CREATE TABLE`` statement:
|
||||
: | CLUSTERING ORDER BY '(' `clustering_order` ')' [ AND `table_options` ]
|
||||
: | scylla_encryption_options: '=' '{'[`cipher_algorithm` : <hash>]','[`secret_key_strength` : <len>]','[`key_provider`: <provider>]'}'
|
||||
: | caching '=' ' {'caching_options'}'
|
||||
: | tablets '=' '{' `tablet_options` '}'
|
||||
: | `options`
|
||||
|
||||
clustering_order: `column_name` (ASC | DESC) ( ',' `column_name` (ASC | DESC) )*
|
||||
|
||||
tablet_options: `tablet_option` [',' `tablet_option`]
|
||||
: |
|
||||
|
||||
tablet_option: 'expected_data_size_in_gb' ':' <int>
|
||||
: | 'min_per_shard_tablet_count' ':' <float>
|
||||
: | 'min_tablet_count' ':' <int>
|
||||
|
||||
For instance::
|
||||
|
||||
@@ -713,6 +721,10 @@ A table supports the following options:
|
||||
- map
|
||||
- see below
|
||||
- :ref:`CDC Options <cdc-options>`
|
||||
* - ``tablet_options``
|
||||
- map
|
||||
- see below
|
||||
- :ref:`Per-table tablet options <cql-per-table-tablet-options>`
|
||||
|
||||
|
||||
.. _speculative-retry-options:
|
||||
@@ -898,6 +910,65 @@ The following modes are available:
|
||||
* - ``immediate``
|
||||
- Tombstone GC is immediately performed. There is no wait time or repair requirement. This mode is useful for a table that uses the TWCS compaction strategy with no user deletes. After data is expired after TTL, ScyllaDB can perform compaction to drop the expired data immediately.
|
||||
|
||||
.. _cql-per-table-tablet-options:
|
||||
|
||||
Per-table tablet options
|
||||
########################
|
||||
|
||||
By default, ScyllaDB will allocate the initial number of tablets automatically.
|
||||
Then on, tables may be automatically split if their average size is greater than twice
|
||||
the ``target_tablet_size_in_bytes``, or merged if the average tablet size is less than
|
||||
half the ``target_tablet_size_in_bytes``.
|
||||
|
||||
Other considerations, like the total number of tablet replicas per-shard, may also affect the tablet count.
|
||||
Since each tablet replica has a constant memory overhead, ScyllaDB may limit the number of tablets to prevent
|
||||
shards from running out-of-memory, in the presence of many tables.
|
||||
|
||||
The following per-table ``tablets`` options can be used to tune the tablet allocation logic for the table
|
||||
if its data size, or performance requirements are known in advance.
|
||||
|
||||
=============================== =============== ===================================================================================
|
||||
Option Default Description
|
||||
=============================== =============== ===================================================================================
|
||||
``expected_data_size_in_gb`` 0 This option provides a hint for the anticipated table size, before replication.
|
||||
ScyllaDB will generate a tablets topology that matches that expectation (see details below).
|
||||
It can be set when the table is created to allocate more tablets for it,
|
||||
as if it already occupies that size. This will prevent unnecessary tablet splits
|
||||
and tablet migrations during data ingestion.
|
||||
It can also be changed later in the table life cycle to induce tablet splits or merges
|
||||
to fit the new expected size.
|
||||
The minimum tablet count is calculated by dividing the expected data
|
||||
size by the ``target_tablet_size_in_bytes`` config option.
|
||||
``min_per_shard_tablet_count`` 0 Used for ensuring that the table workload is well balanced in the whole cluster in a
|
||||
topology-independent way. A higher number of tablet replicas per shard may help balance
|
||||
the table workload more evenly across shards and across nodes in the cluster.
|
||||
For example, setting this to 10 means that shard overcommit is limited to 10%, regardless
|
||||
of cluster size.
|
||||
Note that ``min_per_shard_tablet_count`` supports floating point values and can be set to
|
||||
a value less than 1. This is useful for clusters with large number of shards where the
|
||||
average number of tablet replicas owned by each shard is less than 1.
|
||||
``min_tablet_count`` 0 Determines the minimum number of tablets to allocate for the table.
|
||||
The hint is based on the deprecated keyspace ``initial`` tablets option.
|
||||
Note that the actual number of tablet replicas that are owned by each shard is a
|
||||
function of the tablet count, the replication factor in the datacenter, and the number
|
||||
of nodes and shards in the datacenter. It is recommended to use higher-level options
|
||||
such as ``expected_data_size_in_gb`` or ``min_per_shard_tablet_count`` instead.
|
||||
=============================== =============== ===================================================================================
|
||||
|
||||
When allocating tablets for a new table, ScyllaDB uses the maximum of the ``initial`` tablets configured for the keyspace
|
||||
and the minimum tablet count calculated from the table's ``tablets`` options, if any.
|
||||
If multiple tablet options are provided, ScyllaDB uses the maximum tablet count derived by each option individually.
|
||||
If the keyspace ``initial`` tablets is set to zero and no ``tablets`` options are provided,
|
||||
ScyllaDB automatically calculates the number of tablets so that each shard would own at least one tablet replica,
|
||||
scaled up by the ``tablets_initial_scale_factor`` configuration option.
|
||||
|
||||
Unlike the ``initial`` tablet count configured for the keyspace, ScyllaDB will not merge tablets when their
|
||||
average size drops below half the ``target_tablet_size_in_bytes`` if that would cause the table's tablet count
|
||||
to go below the minimum tablet count, or the per-shard tablet-count as per the above options.
|
||||
This is useful for tables that go through rapid growth and shrink cycles.
|
||||
If the table is shrunk for the long term and there are no special performance needs for the tablet, it is recommended
|
||||
to drop the tablet options or to adjust them respectively, to fit the new requirements.
|
||||
|
||||
Other considerations:
|
||||
#####################
|
||||
|
||||
|
||||
@@ -147,6 +147,7 @@ CREATE TABLE ks.t_scylla_cdc_log (
|
||||
AND comment = 'CDC log for ks.t'
|
||||
AND compaction = {'class': 'TimeWindowCompactionStrategy', 'compaction_window_size': '60', 'compaction_window_unit': 'MINUTES', 'expired_sstable_check_frequency_seconds': '1800'}
|
||||
AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
|
||||
AND tablets = {'expected_data_size_in_gb': '250', 'min_per_shard_tablet_count': '0.8', 'min_tablet_count': '1'}
|
||||
AND crc_check_chance = 1
|
||||
AND default_time_to_live = 0
|
||||
AND gc_grace_seconds = 0
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <seastar/core/on_internal_error.hh>
|
||||
#include <map>
|
||||
#include "cql3/description.hh"
|
||||
#include "db/tablet_options.hh"
|
||||
#include "db/view/view.hh"
|
||||
#include "timestamp.hh"
|
||||
#include "utils/assert.hh"
|
||||
@@ -588,6 +589,7 @@ bool operator==(const schema& x, const schema& y)
|
||||
&& x._raw._indices_by_name == y._raw._indices_by_name
|
||||
&& x._raw._is_counter == y._raw._is_counter
|
||||
&& x._raw._in_memory == y._raw._in_memory
|
||||
&& x._raw._tablet_options == y._raw._tablet_options
|
||||
;
|
||||
}
|
||||
|
||||
@@ -676,6 +678,8 @@ table_schema_version schema::calculate_digest(const schema::raw_schema& r) {
|
||||
feed_hash(h, ext->options_to_string());
|
||||
}
|
||||
|
||||
feed_hash(h, r._tablet_options);
|
||||
|
||||
return table_schema_version(utils::UUID_gen::get_name_UUID(h.finalize()));
|
||||
}
|
||||
|
||||
@@ -844,6 +848,17 @@ auto fmt::formatter<schema>::format(const schema& s, fmt::format_context& ctx) c
|
||||
out = fmt::format_to(out, ",minIndexInterval={}", s._raw._min_index_interval);
|
||||
out = fmt::format_to(out, ",maxIndexInterval={}", s._raw._max_index_interval);
|
||||
out = fmt::format_to(out, ",speculativeRetry={}", s._raw._speculative_retry.to_sstring());
|
||||
out = fmt::format_to(out, ",tablets={{");
|
||||
if (s._raw._tablet_options) {
|
||||
n = 0;
|
||||
for (auto& [k, v] : *s._raw._tablet_options) {
|
||||
if (n++) {
|
||||
out = fmt::format_to(out, ", ");
|
||||
}
|
||||
out = fmt::format_to(out, "{}={}", k, v);
|
||||
}
|
||||
}
|
||||
out = fmt::format_to(out, "}}");
|
||||
out = fmt::format_to(out, ",triggers=[]");
|
||||
out = fmt::format_to(out, ",isDense={}", s._raw._is_dense);
|
||||
out = fmt::format_to(out, ",in_memory={}", s._raw._in_memory);
|
||||
@@ -1137,7 +1152,13 @@ std::ostream& schema::schema_properties(const schema_describe_helper& helper, st
|
||||
os << "\n AND memtable_flush_period_in_ms = " << memtable_flush_period();
|
||||
os << "\n AND min_index_interval = " << min_index_interval();
|
||||
os << "\n AND speculative_retry = '" << speculative_retry().to_sstring() << "'";
|
||||
|
||||
|
||||
if (has_tablet_options()) {
|
||||
os << "\n AND tablets = {";
|
||||
map_as_cql_param(os, tablet_options().to_map());
|
||||
os << "}";
|
||||
}
|
||||
|
||||
for (auto& [type, ext] : extensions()) {
|
||||
os << "\n AND " << type << " = " << ext->options_to_string();
|
||||
}
|
||||
@@ -1622,6 +1643,22 @@ const ::tombstone_gc_options& schema::tombstone_gc_options() const {
|
||||
return default_tombstone_gc_options;
|
||||
}
|
||||
|
||||
bool schema::has_tablet_options() const noexcept {
|
||||
return _raw._tablet_options.has_value();
|
||||
}
|
||||
|
||||
db::tablet_options schema::tablet_options() const {
|
||||
return db::tablet_options(raw_tablet_options());
|
||||
}
|
||||
|
||||
const db::tablet_options::map_type& schema::raw_tablet_options() const noexcept {
|
||||
if (!_raw._tablet_options) {
|
||||
static db::tablet_options::map_type no_options;
|
||||
return no_options;
|
||||
}
|
||||
return *_raw._tablet_options;
|
||||
}
|
||||
|
||||
schema_builder& schema_builder::with_cdc_options(const cdc::options& opts) {
|
||||
add_extension(cdc::cdc_extension::NAME, ::make_shared<cdc::cdc_extension>(opts));
|
||||
return *this;
|
||||
@@ -1642,6 +1679,11 @@ schema_builder& schema_builder::set_paxos_grace_seconds(int32_t seconds) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
schema_builder& schema_builder::set_tablet_options(std::map<sstring, sstring>&& hints) {
|
||||
_raw._tablet_options = std::move(hints);
|
||||
return *this;
|
||||
}
|
||||
|
||||
gc_clock::duration schema::paxos_grace_seconds() const {
|
||||
return std::chrono::duration_cast<gc_clock::duration>(
|
||||
std::chrono::seconds(
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "timestamp.hh"
|
||||
#include "tombstone_gc_options.hh"
|
||||
#include "db/per_partition_rate_limit_options.hh"
|
||||
#include "db/tablet_options.hh"
|
||||
#include "schema_fwd.hh"
|
||||
|
||||
namespace dht {
|
||||
@@ -567,6 +568,7 @@ private:
|
||||
// schema digest. It is also not set locally on a schema tables.
|
||||
std::reference_wrapper<const dht::static_sharder> _sharder;
|
||||
bool _in_memory = false;
|
||||
std::optional<std::map<sstring, sstring>> _tablet_options;
|
||||
std::optional<raw_view_info> _view_info;
|
||||
};
|
||||
raw_schema _raw;
|
||||
@@ -742,6 +744,12 @@ public:
|
||||
return _raw._caching_options;
|
||||
}
|
||||
|
||||
// Returns true iff the _tablet_options are initialized.
|
||||
// They may still be empty, e.g. after ALTER TABLE.
|
||||
bool has_tablet_options() const noexcept;
|
||||
db::tablet_options tablet_options() const;
|
||||
const db::tablet_options::map_type& raw_tablet_options() const noexcept;
|
||||
|
||||
static void set_default_partitioner(const sstring& class_name, unsigned ignore_msb = 0);
|
||||
const dht::i_partitioner& get_partitioner() const;
|
||||
|
||||
|
||||
@@ -227,6 +227,8 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
schema_builder& set_tablet_options(std::map<sstring, sstring>&& hints);
|
||||
|
||||
class default_names {
|
||||
public:
|
||||
default_names(const schema_builder&);
|
||||
@@ -283,7 +285,7 @@ public:
|
||||
schema_builder& with_cdc_options(const cdc::options&);
|
||||
schema_builder& with_tombstone_gc_options(const tombstone_gc_options& opts);
|
||||
schema_builder& with_per_partition_rate_limit_options(const db::per_partition_rate_limit_options&);
|
||||
|
||||
|
||||
default_names get_default_names() const {
|
||||
return default_names(_raw);
|
||||
}
|
||||
|
||||
@@ -1158,7 +1158,7 @@ SEASTAR_TEST_CASE(test_system_schema_version_is_stable) {
|
||||
|
||||
// If you changed the schema of system.batchlog then this is expected to fail.
|
||||
// Just replace expected version with the new version.
|
||||
BOOST_REQUIRE_EQUAL(s->version(), table_schema_version(utils::UUID("3febbbce-8841-304a-abb9-170078ac173d")));
|
||||
BOOST_REQUIRE_EQUAL(s->version(), table_schema_version(utils::UUID("1f504ac7-350f-37aa-8a9e-105b1325d8e3")));
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -234,6 +234,31 @@ def test_desc_table(cql, test_keyspace, random_seed, has_tablets):
|
||||
finally:
|
||||
cql.execute(f"DROP TABLE {new_tbl}")
|
||||
|
||||
# This test compares the content of `system_schema.tables` and `system_schema.columns` tables
|
||||
# when providing tablet options to CREATE TABLE.
|
||||
def test_desc_table_with_tablet_options(cql, test_keyspace, random_seed, has_tablets):
|
||||
if has_tablets: # issue #18180
|
||||
global counter_table_chance
|
||||
counter_table_chance = 0
|
||||
tablet_options = {
|
||||
'min_tablet_count': '100',
|
||||
'min_per_shard_tablet_count': '0.8', # Verify that a floating point value works for this hint
|
||||
'expected_data_size_in_gb': '50',
|
||||
}
|
||||
with new_random_table(cql, test_keyspace, tablet_options=tablet_options) as tbl:
|
||||
desc = cql.execute(f"DESC TABLE {tbl}")
|
||||
desc_create_stmt = desc.one().create_statement
|
||||
|
||||
try:
|
||||
new_tbl = f"{test_keyspace}.{unique_name()}"
|
||||
new_create_stmt = desc_create_stmt.replace(tbl, new_tbl)
|
||||
cql.execute(new_create_stmt)
|
||||
new_desc_stmt = cql.execute(f"DESC TABLE {new_tbl}")
|
||||
new_desc_create_stmt = new_desc_stmt.one().create_statement
|
||||
assert new_desc_create_stmt == new_create_stmt
|
||||
finally:
|
||||
cql.execute(f"DROP TABLE {new_tbl}")
|
||||
|
||||
# Test that `DESC TABLE {tbl}` contains appropriate create statement for table
|
||||
# This test compares the content of `system_schema.scylla_tables` tables, thus the test
|
||||
# is `scylla_only`.
|
||||
@@ -1280,7 +1305,7 @@ def new_random_keyspace(cql):
|
||||
# UDTs that can be used to create the table. The function uses `new_test_table`
|
||||
# from util.py, so it can be used in a "with", as:
|
||||
# with new_random_table(cql, test_keyspace) as table:
|
||||
def new_random_table(cql, keyspace, udts=[]):
|
||||
def new_random_table(cql, keyspace, udts=[], tablet_options={}):
|
||||
pk_n = random.randrange(1, max_pk)
|
||||
ck_n = random.randrange(max_ck)
|
||||
regular_n = random.randrange(1, max_regular)
|
||||
@@ -1348,6 +1373,8 @@ def new_random_table(cql, keyspace, udts=[]):
|
||||
# Extra properties which ScyllaDB supports but Cassandra doesn't
|
||||
extras["paxos_grace_seconds"] = random.randrange(1000, 100000)
|
||||
extras["tombstone_gc"] = f"{{'mode': 'timeout', 'propagation_delay_in_seconds': '{random.randrange(100, 100000)}'}}"
|
||||
if tablet_options:
|
||||
extras["tablets"] = str(tablet_options)
|
||||
|
||||
extra_options = [f"{k} = {v}" for (k, v) in extras.items()]
|
||||
extra_str = " AND ".join(extra_options)
|
||||
|
||||
Reference in New Issue
Block a user