Instead of dht::partition_ranges_vector, which is an std::vector<> and
has been seen to cause large allocations when calculating ranges to be
invalidated after compaction:
seastar_memory - oversized allocation: 147456 bytes. This is non-fatal, but could lead to latency and/or fragmentation issues. Please report: at
[Backtrace #0]
void seastar::backtrace<seastar::current_backtrace_tasklocal()::$_0>(seastar::current_backtrace_tasklocal()::$_0&&, bool) at ./build/release/seastar/./seastar/include/seastar/util/backtrace.hh:89
(inlined by) seastar::current_backtrace_tasklocal() at ./build/release/seastar/./seastar/src/util/backtrace.cc:99
seastar::current_tasktrace() at ./build/release/seastar/./seastar/src/util/backtrace.cc:136
seastar::current_backtrace() at ./build/release/seastar/./seastar/src/util/backtrace.cc:169
seastar::memory::cpu_pages::warn_large_allocation(unsigned long) at ./build/release/seastar/./seastar/src/core/memory.cc:840
seastar::memory::cpu_pages::check_large_allocation(unsigned long) at ./build/release/seastar/./seastar/src/core/memory.cc:903
(inlined by) seastar::memory::cpu_pages::allocate_large(unsigned int, bool) at ./build/release/seastar/./seastar/src/core/memory.cc:910
(inlined by) seastar::memory::allocate_large(unsigned long, bool) at ./build/release/seastar/./seastar/src/core/memory.cc:1533
(inlined by) seastar::memory::allocate_slowpath(unsigned long) at ./build/release/seastar/./seastar/src/core/memory.cc:1679
seastar::memory::allocate(unsigned long) at ././seastar/src/core/memory.cc:1698
(inlined by) operator new(unsigned long) at ././seastar/src/core/memory.cc:2440
(inlined by) std::__new_allocator<interval<dht::ring_position>>::allocate(unsigned long, void const*) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/new_allocator.h:151
(inlined by) std::allocator<interval<dht::ring_position>>::allocate(unsigned long) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/allocator.h:203
(inlined by) std::allocator_traits<std::allocator<interval<dht::ring_position>>>::allocate(std::allocator<interval<dht::ring_position>>&, unsigned long) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/alloc_traits.h:614
(inlined by) std::_Vector_base<interval<dht::ring_position>, std::allocator<interval<dht::ring_position>>>::_M_allocate(unsigned long) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/stl_vector.h:387
(inlined by) std::vector<interval<dht::ring_position>, std::allocator<interval<dht::ring_position>>>::reserve(unsigned long) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/vector.tcc:79
dht::to_partition_ranges(utils::chunked_vector<interval<dht::token>, 131072ul> const&, seastar::bool_class<utils::can_yield_tag>) at ./dht/i_partitioner.cc:347
compaction::compaction::get_ranges_for_invalidation(std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable>>> const&) at ./compaction/compaction.cc:619
(inlined by) compaction::compaction::get_compaction_completion_desc(std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable>>>, std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable>>>) at ./compaction/compaction.cc:719
(inlined by) compaction::regular_compaction::replace_remaining_exhausted_sstables() at ./compaction/compaction.cc:1362
compaction::compaction::finish(std::chrono::time_point<db_clock, std::chrono::duration<long, std::ratio<1l, 1000l>>>, std::chrono::time_point<db_clock, std::chrono::duration<long, std::ratio<1l, 1000l>>>) at ./compaction/compaction.cc:1021
compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0::operator()() at ./compaction/compaction.cc:1960
(inlined by) compaction::compaction_result std::__invoke_impl<compaction::compaction_result, compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>(std::__invoke_other, compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/invoke.h:63
(inlined by) std::__invoke_result<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>::type std::__invoke<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>(compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/invoke.h:98
(inlined by) decltype(auto) std::__apply_impl<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0, std::tuple<>>(compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&, std::tuple<>&&, std::integer_sequence<unsigned long, ...>) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/tuple:2920
(inlined by) decltype(auto) std::apply<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0, std::tuple<>>(compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&, std::tuple<>&&) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/tuple:2935
(inlined by) seastar::future<compaction::compaction_result> seastar::futurize<compaction::compaction_result>::apply<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>(compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&, std::tuple<>&&) at ././seastar/include/seastar/core/future.hh:1930
(inlined by) seastar::futurize<std::invoke_result<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>::type>::type seastar::async<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>(seastar::thread_attributes, compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&)::'lambda'()::operator()() const at ././seastar/include/seastar/core/thread.hh:267
(inlined by) seastar::noncopyable_function<void ()>::direct_vtable_for<seastar::futurize<std::invoke_result<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>::type>::type seastar::async<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>(seastar::thread_attributes, compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&)::'lambda'()>::call(seastar::noncopyable_function<void ()> const*) at ././seastar/include/seastar/util/noncopyable_function.hh:138
seastar::noncopyable_function<void ()>::operator()() const at ./build/release/seastar/./seastar/include/seastar/util/noncopyable_function.hh:224
(inlined by) seastar::thread_context::main() at ./build/release/seastar/./seastar/src/core/thread.cc:318
dht::partition_ranges_vector is used on the hot path, so just convert
the problematic user -- cache invalidation -- to use
utils::chunked_vector<dht::partition_range> instead.
Fixes: SCYLLADB-121
Closes scylladb/scylladb#28855
Currently, the view_update_generator::mutate_MV function acquires a
reference to the keyspace relevant to the operation, then it calls
max_concurrent_for_each and uses that reference inside the lambda passed
to that function. max_concurrent_for_each can preempt and there is no
mechanism that makes sure that the keyspace is alive until the view
updates are generated, so it is possible that the keyspace is freed by
the time the reference is used.
Fix the issue by precomputing the necessary information based on the
keyspace reference right away, and then passing that information by
value to the other parts of the code. It turns out that we only need to
know whether the keyspace uses tablets and whether it uses a network
topology strategy.
Fixes: scylladb/scylladb#28925
Closes scylladb/scylladb#28928
The motivations for this patch are as follows:
- Guardrails should follow similar conventions, e.g. for config names,
metrics names, testing. Keeping guardrails together makes it easier
to find and compare existing guardrails when new guardrails are
implemented.
- The configuration is used to auto-generate the documentation
(particularly, the `configuration-parameters` page). Currently,
the order of parameters in the documentation is inconsistent (e.g.
`minimum_replication_factor_fail_threshold` before
`minimum_replication_factor_warn_threshold` but
`maximum_replication_factor_fail_threshold` after
`maximum_replication_factor_warn_threshold`), which can be confusing
to customers.
Fixes: SCYLLADB-256
Closes scylladb/scylladb#28932
This patch fixes 2 issues within strong consistency state machine:
- it might happen that apply is called before the schema is delivered to the node
- on the other hand, the apply may be called after the schema was changed and purged from the schema registry
The first problem is fixed by doing `group0.read_barrier()` before applying the mutations.
The second one is solved by upgrading the mutations using column mappings in case the version of the mutations' schema is older.
Fixes SCYLLADB-428
Strong consistency is in experimental phase, no need to backport.
Closes scylladb/scylladb#28546
* https://github.com/scylladb/scylladb:
test/cluster/test_strong_consistency: add reproducer for old schema during apply
test/cluster/test_strong_consistency: add reproducer for missing schema during apply
test/cluster/test_strong_consistency: extract common function
raft_group_registry: allow to drop append entries requests for specific raft group
strong_consistency/state_machine: find and hold schemas of applying mutations
strong_consistency/state_machine: pull necessary dependencies
db/schema_tables: add `get_column_mapping_if_exists()`
When we generate view updates, we check whether we can skip the
entire view update if all columns selected by the view are unmodified.
However, for collection columns, we only check if they were unset
before and after the update.
In this patch we add a check for the actual collection contents.
We perform this check for both virtual and non-virtual selections.
When the column is only a virtual column in the view, it would be
enough to check the liveness of each collection cell, however for
that we'd need to deserialize the entire collection anyway, which
should be effectively as expensive as comparing all of its bytes.
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-808
Currently, when we generate view updates, we skip the view update if
all columns selected by the view are unchanged in the base table update.
However, this does not apply for collection columns - if the base table
has a collection regular column, we never allow skipping generating
view updates and the reason for that is missing implementation.
We can easily relax this for the case where the collection was missing
before and after the update - in this commit we move the check for
collections after the check for missing cells.
Set enable_schema_commitlog for each group0 tables.
Assert that group0 tables use schema commitlog in ensure_group0_schema
(per each command).
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-914.
Needs backport to all live releases as all are vulnerable
Closes scylladb/scylladb#28876
* github.com:scylladb/scylladb:
test: add test_group0_tables_use_schema_commitlog
db: service: remove group0 tables from schema commitlog schema initializer
service: ensure that tables updated via group0 use schema commitlog
db: schema: remove set_is_group0_table param
Consider this:
- repair takes the lock holder
- tablet merge fiber destroys the compaction group and the compaction state
- repair fails
- repair destroys the lock holder
This is observed in the test:
```
repair - repair[5d73d094-72ee-4570-a3cc-1cd479b2a036] Repair 1 out of 1 tablets: table=sec_index.users range=(432345564227567615,504403158265495551] replicas=[0e9d51a5-9c99-4d6e-b9db-ad36a148b0ea:15, 498e354c-1254-4d8d-a565-2f5c6523845a:9, 5208598c-84f0-4526-bb7f-573728592172:28]
...
repair - repair[5d73d094-72ee-4570-a3cc-1cd479b2a036]: Started to repair 1 out of 1 tables in keyspace=sec_index, table=users, table_id=ea2072d0-ccd9-11f0-8dba-c5ab01bffb77, repair_reason=repair
repair - Enable incremental repair for table=sec_index.users range=(432345564227567615,504403158265495551]
table - Disabled compaction for range=(432345564227567615,504403158265495551] session_id=a13a72cc-cd2d-11f0-8e9b-76d54580ab09 for incremental repair
table - Got unrepaired compaction and repair lock for range=(432345564227567615,504403158265495551] session_id=a13a72cc-cd2d-11f0-8e9b-76d54580ab09 for incremental repair
table - Disabled compaction for range=(432345564227567615,504403158265495551] session_id=a13a72cc-cd2d-11f0-8e9b-76d54580ab09 for incremental repair
table - Got unrepaired compaction and repair lock for range=(432345564227567615,504403158265495551] session_id=a13a72cc-cd2d-11f0-8e9b-76d54580ab09 for incremental repair
repair - repair[5d73d094-72ee-4570-a3cc-1cd479b2a036]: get_sync_boundary: got error from node=0e9d51a5-9c99-4d6e-b9db-ad36a148b0ea, keyspace=sec_index, table=users, range=(432345564227567615,504403158265495551], error=seastar::rpc::remote_verb_error (Compaction state for table [0x60f008fa34c0] not found)
compaction_manager - Stopping 1 tasks for 1 ongoing compactions for table sec_index.users compaction_group=238 due to tablet merge
compaction_manager - Stopping 1 tasks for 1 ongoing compactions for table sec_index.users compaction_group=238 due to tablet merge
....
scylla[10793] Segmentation fault on shard 28, in scheduling group streaming
```
The rwlock in compaction_state could be destroyed before the lock holder
of the rwlock is destroyed. This causes a use-after-free when the lock
holder is destroyed.
To fix it, users of the repair lock will now be waited for when a compaction
group is being stopped.
That way, compaction group - which controls the lifetime of rwlock -
cannot be destroyed while the lock is held.
Additionally, the merge completion fiber - that might remove groups -
is properly serialized with incremental repair.
The issue can be reproduced using sanitize build consistently and can not
be reproduced after the fix.
Fixes #27365
Closes scylladb/scylladb#28823
* github.com:scylladb/scylladb:
repair: Fix rwlock in compaction_state and lock holder lifecycle
repair: Prevent repair lock holder leakage after table drop
In scenarios where we want to first check if a column mapping exists
and if we don't want to do flow control with exceptions, it is very wasteful
to do
```
if (column_mapping_exists()) {
get_column_mapping();
}
```
especially in a hot path like `state_machine::apply()` because this will
execute 2 internal queries.
This commit introduces `get_column_mapping_if_exists()` function,
which simply wraps the result of `get_column_mapping()` in an optional and
doesn't throw an exception if the mapping doesn't exist.
Currently, repair-mode tombstone-gc cannot be used on tables with RF=1. We want to make repair-mode the default for all tablet tables (and more, see https://github.com/scylladb/scylladb/issues/22814), but currently a keyspace created with RF=1 and later altered to RF>1 will end up using timeout-mode tombstone gc. This is because the repair-mode tombstone-gc code relies on repair history to determine the gc-before time for keys/ranges. RF=1 tables cannot run repairs so they will have empty repair history and consequently won't be able to purge tombstones.
This PR solves this by keeping a registry of RF=1 tables and consulting this registry when creating `tombstone_gc_state` objects. If the table is RF=1, tombstone-gc will work as if the table used immediate-mode tombstone-gc. The registry is updated on each replication update. As soon as the table is not RF=1 anymore, the tombstone-gc reverts to the natural repair-mode behaviour.
After this PR, tombstone-gc defaults to repair-mode for all tables, regardless of RF and tablets/vnodes.
Fixes: SCYLLADB-106.
New feature, no backport required.
Closes scylladb/scylladb#22945
* github.com:scylladb/scylladb:
test/{boost,cluster}: add test for tombstone gc mode=repair with RF=1
tombstone_gc: allow use of repair-mode for RF=1 tables
replica/table: update rf=1 table registry in shared tombstone-gc state
tombstone_gc: tombstone_gc_before_getter: consider RF when getting gc before time
tombstone_gc: unpack per_table_history_maps
tombstone_gc: extract _group0_gc_time from per_table_history_map
tombstone_gc: drop tombstone_gc_state(nullptr) ctor and operator bool()
test/lib/random_schema: use timeout-mode tombstone_gc
tombstone_gc_options: add C++ friendly constructor
test: move away from tombstone_gc_state(nullptr) ctor
treewide: move away from tombstone_gc_state(nullptr) ctor
sstable: move away from tombstone_gc_mode::operator bool()
replica/table: add get_tombstone_gc_state()
compaction: use tombstone_gc_state with value semantics
db/row_cache: use tombstone_gc_state with value semantics
tombstone_gc: introduce tombstone_gc_state::for_tests()
This patch series implements `write_consistency_levels_warned` and `write_consistency_levels_disallowed` guardrails, allowing the configuration of which consistency levels are unwanted for writes. The motivation for these guardrails is to forbid writing with consistency levels that don't provide high durability guarantees (like CL=ANY, ONE, or LOCAL_ONE).
Neither guardrail is enabled by default, so as not to disrupt clusters that are currently using any of the CLs for writes. The warning guardrail may seem harmless, as it only adds a warning to the CQL response; however, enabling it can significantly increase network traffic (as a warning message is added to each response) and also decrease throughput due to additional allocations required to prepare the warning. Therefore, both guardrails should be enabled with care. The newly added `writes_per_consistency_level` metric, which is incremented unconditionally, can help decide whether a guardrail can be safely enabled in an existing cluster.
This commit adds additional `if` instructions on the critical path. However, based on the `perf_simple_query` benchmark for writes, the difference is marginal (~40 additional instructions, which is a relative difference smaller than 0.001).
BEFORE:
```
291443.35 tps ( 53.3 allocs/op, 16.0 logallocs/op, 14.2 tasks/op, 48067 insns/op, 18885 cycles/op, 0 errors)
throughput:
mean= 289743.07 standard-deviation=6075.60
median= 291424.69 median-absolute-deviation=1702.56
maximum=292498.27 minimum=261920.06
instructions_per_op:
mean= 48072.30 standard-deviation=21.15
median= 48074.49 median-absolute-deviation=12.07
maximum=48119.87 minimum=48019.89
cpu_cycles_per_op:
mean= 18884.09 standard-deviation=56.43
median= 18877.33 median-absolute-deviation=14.71
maximum=19155.48 minimum=18821.57
```
AFTER:
```
290108.83 tps ( 53.3 allocs/op, 16.0 logallocs/op, 14.2 tasks/op, 48121 insns/op, 18988 cycles/op, 0 errors)
throughput:
mean= 289105.08 standard-deviation=3626.58
median= 290018.90 median-absolute-deviation=1072.25
maximum=291110.44 minimum=274669.98
instructions_per_op:
mean= 48117.57 standard-deviation=18.58
median= 48114.51 median-absolute-deviation=12.08
maximum=48162.18 minimum=48087.18
cpu_cycles_per_op:
mean= 18953.43 standard-deviation=28.76
median= 18945.82 median-absolute-deviation=20.84
maximum=19023.93 minimum=18916.46
```
Fixes: SCYLLADB-259
Refs: SCYLLADB-739
No backport, it's a new feature
Closes scylladb/scylladb#28570
* github.com:scylladb/scylladb:
scylla.yaml: add write CL guardrails to scylla.yaml
scylla.yaml: reorganize guardrails config to be in one place
test: add cluster tests for write CL guardrails
test: implement test_guardrail_write_consistency_level
cql3: start using write CL guardrails
cql3/query_processor: implement metrics to track CL of writes
db: cql3/query_processor: add write_consistency_levels enum_sets
config: add write_consistency_levels_* guardrails configuration
set_is_group0_table takes an enabled flag, based on which it decides
whether it's a group0 table. The method is called only with enabled = true.
Drop the param. For not group0 tables nothing should be set.
The comparator used to sort per-IP client rows was not a strict-weak-ordering (it could return true in both directions for some pairs), which makes `std::ranges::sort` behavior undefined. A concrete pair that breaks it (and is realistic in system.clients):
a = (port=9042, client_type="cql")
b = (port=10000, client_type="alternator")
With the current comparator:
cmp(a,b) = (9042 < 10000) || ("cql" < "alternator") = true || false = true
cmp(b,a) = (10000 < 9042) || ("alternator" < "cql") = false || true = true
So both directions are true, meaning there is no valid ordering that sort can achieve.
The fix is to sort lexicographically by (port, client_type) to match the table's clustering key and ensure deterministic ordering.
Closes scylladb/scylladb#28844
This patch series removes creation of default 'cassandra:cassandra' superuser on system start.
Disable creation of a superuser with default 'cassandra:cassandra' credentials to improve security. The current flow requires clients to create another superuser and then drop the default `cassandra:cassandra` role. For those who do, there is a time window where the default credentials exist. For those who do not, that role stays. We want to improve security by forcing the client to either use config to specify default values for default superuser name and password or use cqlsh over maintenance socket connection to explicitly create/alter a superuser role.
The patch series:
- Enable role modification over the maintenance socket
- Stop using default 'cassandra' value for default superuser, skipping creation instead
Design document: https://scylladb.atlassian.net/wiki/spaces/RND/pages/165773327/Drop+default+cassandra+superuser
Fixes scylladb/scylla-enterprise#5657
This is an improvement. It does not need a backport.
Closes scylladb/scylladb#27215
* github.com:scylladb/scylladb:
config: enable maintenance socket in workdir by default
docs: auth: do not specify password with -p option
docs: update documentation related to default superuser
test: maintenance socket role management
test: cluster: add logs to test_maintenance_socket.py
test: pylib: fix connect_driver handling when adding and starting server
auth: do not create default 'cassandra:cassandra' superuser
auth: remove redundant DEFAULT_USER_NAME from password authenticator
auth: enable role management operations via maintenance socket
client_state: add has_superuser method
client_state: add _bypass_auth_checks flag
auth: let maintenance_socket_role_manager know if node is in maintenance mode
auth: remove class registrator usage
auth: instantiate auth service with factory functors
auth: add service constructor with factory functors
auth: add transitional.hh file
service: qos: handle special scheduling group case for maintenance socket
service: qos: use _auth_integration as condition for using _auth_integration
The method is called from storage_proxy::mutate_hint() which is in turn called from hint_mutation::apply_locally(). The latter is either called directly by the hint sender, which already runs in the streaming group, or via the RPC HINT_MUTATION handler, which uses index 1 that negotiates the streaming group as well.
To be sure, add a debugging check for current group being the expected one.
Code cleanup, not backporting
Closes scylladb/scylladb#28545
* github.com:scylladb/scylladb:
hint: Don't switch group in database::apply_hint()
hint_sender: Switch to sender group on stop either
In this series we introduce new system tables and use them for storing the raft metadata
for strongly consistent tables. In contrast to the previously used raft group0 tables, the
new tables can store data on any shard. The tables also allow specifying the shard where
each partition should reside, which enables the tablets of strongly consistent tables to have
their raft group metadata co-located on the same shard as the tablet replica.
The new tables have almost the same schemas as the raft group0 tables. However, they
have an additional column in their partition keys. The additional column is the shard
that specifies where the data should be located. While a tablet and its corresponding
raft group server resides on some shard, it now writes and reads all requests to the
metadata tables using its shard in addition to the group_id.
The extra partition key column is used by the new partitioner and sharder which allow
this special shard routing. The partitioner encodes the shard in the token and the
sharder decodes the shard from the token. This approach for routing avoids any
additional lookups (for the tablet mapping) during operations on the new tables
and it also doesn't require keeping any state. It also doesn't interact negatively
with resharding - as long as tablets (and their corresponding raft metadata) occupy
some shard, we do not allow starting the node with a shard count lower than the
id of this shard. When increasing the shard count, the routing does not change,
similarly to how tablet allocation doesn't change.
To use the new tables, a new implementation of `raft::persistence` is added. Currently,
it's almost an exact copy of the `raft_sys_table_storage` which just uses the new tables,
but in the future we can modify it with changes specific to metadata (or mutation)
storage for strongly consistent tables. The new storage is used in the `groups_manager`,
which combined with the removal of some `this_shard_id() == 0` checks, allows strongly
consistent tables to be used on all shards.
This approach for making sure that the reads/writes to the new tables end up on the correct shards
won in the balance of complexity/usability/performance against a few other approaches we've considered.
They include:
1. Making the Raft server read/write directly to the database, skipping the sharder, on its shard, while using
the default partitioner/sharder. This approach could let us avoid changing the schema and there should be
no problems for reads and writes performed by the Raft server. However, in this approach we would input
data in tables conflicting with the placement determined by the sharder. As a result, any read going through
the sharder could miss the rows it was supposed to read. Even when reading all shards to find a specific value,
there is a risk of polluting the cache - the rows loaded on incorrect shards may persist in the cache for an unknown
amount of time. The cache may also mistakenly remember that a row is missing, even though it's actually present,
just on an incorrect shard.
Some of the issues with this approach could be worked around using another sharder which always returns
this_shard_id() when asked about a shard. It's not clear how such a sharder would implement a method like
`token_for_next_shard`, and how much simpler it would be compared to the current "identity" sharder.
2. Using a sharder depending on the current allocation of tablets on the node. This approach relies on the
knowledge of group_id -> shard mapping at any point in time in the cluster. For this approach we'd also
need to either add a custom partitioner which encodes the group_id in the token, or we'd need to track the
token(group_id) -> shard mapping. This approach has the benefit over the one used in the series of keeping
the partition key as just group_id. However, it requires more logic, and the access to the live state of the node
in the sharder, and it's not static - the same token may be sharded differently depending on the state of the
node - it shouldn't occur in practice, but if we changed the state of the node before adjusting the table data,
we would be unable to access/fix the stale data without artificially also changing the state of the node.
3. Using metadata tables co-located to the strongly consistent tables. This approach could simplify the
metadata migrations in the future, however it would require additional schema management of all co-located
metadata tables, and it's not even obvious what could be used as the partition key in these tables - some
metadata is per-raft-group, so we couldn't reuse the partition key of the strongly consistent table for it. And
finding and remembering a partition key that is routed to a specific shard is not a simple task. Finally, splits
and merges will most likely need special handling for metadata anyway, so we wouldn't even make use of
co-located table's splits and merges.
Fixes [SCYLLADB-361](https://scylladb.atlassian.net/browse/SCYLLADB-361)
[SCYLLADB-361]: https://scylladb.atlassian.net/browse/SCYLLADB-361?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
Closes scylladb/scylladb#28509
* github.com:scylladb/scylladb:
docs: add strong consistency doc
test/cluster: add tests for strongly-consistent tables' metadata persistence
raft: enable multi-shard raft groups for strongly consistent tablets
test/raft: add unit tests for raft_groups_storage
raft: add raft_groups_storage persistence class
db: add system tables for strongly consistent tables' raft groups
dht: add fixed_shard_partitioner and fixed_shard_sharder
raft: add group_id -> shard mapping to raft_group_registry
schema: add with_sharder overload accepting static_sharder reference
When we create a materialized view, we consider 2 cases:
1. the view's primary key contains a column that is not
in the primary key of the base table
2. the view's primary key doesn't contain such a column
In the 2nd case, we add all columns from the base table
to the schema of the view (as virtual columns). As a result,
all of these columns are effectively "selected" in
view_updates::can_skip_view_updates. Same thing happens when
we add new columns to the base table using ALTER.
Because of this, we can never have !column_is_selected and
!has_base_non_pk_columns_in_view_pk at the same time. And
thus, the check (!column_is_selected
&& _base_info.has_base_non_pk_columns_in_view_pk) is always
the same as (!column_is_selected).
Because we immediately return after this check, the tail of
this function is also never reached - all checks after the
(column_is_selected) are affected by this. Also, the condition
(!column_is_selected && base_has_nonexpiring_marker) is always
false at the point it is called. And this in turn makes the
`base_has_nonexpiring_marker` unused, so we delete it as well.
It's worth considering, why did we even have
`base_has_nonexpiring_marker` if it's effectively unused. We
initially introduced it in bd52e05ae2 and we (incorrectly)
used it to allow skipping view updates even if the liveness of
virtual columns changed. Soon after, in 5f85a7a821, we
started categorizing virtual columns as column_is_selected == true
and we moved the liveness checks for virtual columns to the
`if (column_is_selected)` clause, before the `base_has_nonexpiring_marker`
check. We changed this because even if we have a nonexpiring marker
right now, it may be changed in the future, in which case the liveness
of the view row will depend on liveness of the virtual columns and
we'll need to have the view updates from the time the row marker was
nonexpiring.
Closes scylladb/scylladb#28838
Prevent repair lock holder from being leaked in repair_service when table
is dropped midway.
The leakage might result in use-after-free later, since the repair lock
itself will be gone after table drop.
The RPC verb that removes the lock on success path will not be called
by coordinator after table was dropped.
Refs #27365.
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-896.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
We want to enable maintenance socket by default.
This will prevent users from having to reboot a server to enable it.
Also, there is little point in having maintenance socket that is turned off,
and we want users to use it. After this patch series, they will have
to use it. Note that while config seeding exists, we do not encourage it
for production deployments.
This patch changes default maintenance_socket value from ignore to workdir.
This enables maintenance socket without specifying an explicit path.
Refs SCYLLADB-409
It is ambiguous; use the appropriate no-gc or gc-all factories instead,
as appropriate.
A special note for mutation::compacted(): according to the comment above
it, it doesn't drop expired tombstones but as it is currently, it
actually does. Change the tombstone gc param for the underlying call to
compact_for_compaction() to uphold the comment. This is used in tests
mostly, so no fallout expected.
Tests are handled in the next commit, to reduce noise.
Two tests in mutation_test.cc have to be updated:
* test_compactor_range_tombstone_spanning_many_pages
has to be updated in this commit, as it uses
mutation_partition::compact_for_query() as well as
compact_for_query(). The test passes default constructed
tombstone_gc() to the latter while the former now uses no-gc
creating a mismatch in tombstone gc behaviour, resulting in test
failure. Update the test to also pass no-gc to compact_for_query().
* test_query_digest similarly uses mutation_partition::query_mutation()
and another compaction method, having to match the no-gc now used in
query_mutation().
Instead of keeping a pointer to it. Replace nullptr with
tombstone_gc_state::no_gc().
This object is now designed to be used as a value-type, after recent
refactoring.
Introduced a new max_tablet_count tablet option that caps the maximum number of tablets a table can have. This feature is designed primarily for backup and restore workflows.
During backup, when load balancing is disabled for snapshot consistency, the current tablet count is recorded in the backup manifest.
During restore, max_tablet_count is set to this recorded value, ensuring the restored table's tablet count never exceeds the original snapshot's tablet distribution.
This guarantee enables efficient file-based SSTable streaming during restore, as each SSTable remains fully contained within a single tablet boundary.
Closes scylladb/scylladb#28450
3f7ee3ce5d introduced system.batchlog_v2, with a schema designed to speed up batchlog replays and make post-replay cleanups much more effective.
It did not introduce a cluster feature for the new table, because it is a node-local table, so the cluster can switch to the new table gradually, one node at a time.
However, https://github.com/scylladb/scylladb/issues/27886 showed that the switching causes timeouts during upgrades, in mixed clusters. Furthermore, switching to the new table unconditionally on upgraded nodes means that on rollback, the batches saved into the v2 table are lost.
This PR re-introduces v1 (`system.batchlog`) support and guards the use of the v2 table with a cluster feature, so mixed clusters keep using v1 and thus remain rollback-compatible.
The re-introduced v1 support doesn't support post-replay cleanups for simplicity. The cleanup in v1 was never particularly effective anyway and we ended up disabling it for heavy batchlog users, so I don't think the lack of support for cleanup is a problem.
Fixes: https://github.com/scylladb/scylladb/issues/27886
Needs backport to 2026.1, to fix upgrades for clusters using batches
Closes scylladb/scylladb#28736
* github.com:scylladb/scylladb:
test/boost/batchlog_manager_test: add tests for v1 batchlog
test/boost/batchlog_manager_test: make prepare_batches() work with both v1 and v2
test/boost/batchlog_manager_test: fix indentation
test/boost/batchlog_manager_test: extract prepare_batches() method
test/lib/cql_assertions: is_rows(): add dump parameter
tools/scylla-sstable: extract query result printers
tools/scylla-sstable: add std::ostream& arg to query result printers
repair/row_level: repair_flush_hints_batchlog_handler(): add all_replayed to finish log
db/batchlog_manager: re-add v1 support
db/batchlog_manager: return all_replayed from process_batch()
db/batchlog_manager: process_batch() fix indentation
db/batchlog_manager: make batch() a standalone function
db/batchlog_manager: make structs stats public
db/batchlog_manager: allocate limiter on the stack
db/batchlog_manager: add feature_service dependency
gms/feature_service: add batchlog_v2 feature
The PR removes most of the code that assumes that group0 and raft topology are not enabled. It also makes sure that joining a cluster in no raft mode or upgrading a node in a cluster that does not yet use raft topology to this version will fail.
Refs #15422
No backport needed since this removes functionality.
Closes scylladb/scylladb#28514
* https://github.com/scylladb/scylladb:
group0: fix indentation after previous patch
raft_group0: simplify get_group0_upgrade_state function since no upgrade can happen any more
raft_group0: move service::group0_upgrade_state to use fmt::formatter instead of iostream
raft_group0: remove unused code from raft_group0
node_ops: remove topology over node ops code
topology: fix indentation after the previous patch
topology: drop topology_change_enabled parameter from raft_group0 code
storage_service: remove unused handle_state_* functions
gossiper: drop wait_for_gossip_to_settle and deprecate corresponding option
storage_service: fix indentation after the last patch
storage_service: remove gossiper bootstrapping code
storage_service: drop get_group_server_if_raft_topolgy_enabled
storage_service: drop is_topology_coordinator_enabled and its uses
storage_service: drop run_with_api_lock_in_gossiper_mode_only
topology: remove code that assumes raft_topology_change_enabled() may return false
test: schema_change_test: make test_schema_digest_does_not_change_with_disabled_features tests run in raft mode
test: schema_change_test: drop schema tests relevant for no raft mode only
topology: remove upgrade to raft topology code
group0: remove upgrade to group0 code
group0: refuse to boot if a cluster is still not in a raft topology mode
storage_service: refuse to join a cluster in legacy mode
Previously, rewriting an sstable component (e.g., via rewrite_statistics) created a temporary file that was renamed
to the final name after sealing. This allows crash recovery by simply removing the temporary file on startup.
However, this approach won't work once component digests are stored in scylla_metadata,
as replacing a component like Statistics will require atomically updating both the component
and scylla_metadata with the new digest—impossible with POSIX rename.
The new mechanism creates a clone sstable with a fresh generation:
- Hard-links all components from the source except the component being rewritten and scylla metadata if update_sstable_id is true
- Copies original sstable components pointer and recognized components from the source
- Invokes a modifier callback to adjust the new sstable before rewriting
- Writes the modified component. If update_sstable_id is true, reads scylla metadata, generates new sstable_id and rewrites it.
- Seals the new sstable with a temporary TOC
- Replaces the old sstable atomically, the same way as it is done in compaction
This is built on the rewrite_sstables compaction framework to support batch operations (e.g., following incremental repair).
In case of any failure during the whole process, sstable will be automatically deleted on the node startup due to
temporary toc persistence.
This prepares the infrastructure for component digests. Once digests are introduced in scylla_metadata
this mechanism will be extended to also rewrite scylla metadata with the updated digest alongside the modified component, ensuring atomic updates of both.
Switch vector dimension handling to fixed-width `uint32_t` type,
update parsing/validation, and add boundary tests.
The dimension is parsed as `unsigned long` at first which is guaranteed
to be **at least** 32-bit long, which is safe to downcast to `uint32_t`.
Move `MAX_VECTOR_DIMENSION` from `cql3_type::raw_vector` to `cql3_type`
to ensure public visibility for checks outside the class.
Add tests to verify the type boundaries.
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-223
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
Co-authored-by: Dawid Pawlik <dawid.pawlik@scylladb.com>
Closes scylladb/scylladb#28762
Add three new system tables for storing raft state for strongly
consistent tablets, corresponding to the tables for group0:
- system.raft_groups: Stores the raft log, term/vote, snapshot_id,
and commit_idx for each tablet's raft group.
- system.raft_groups_snapshots: Stores snapshot descriptors
(index, term) for each group.
- system.raft_groups_snapshot_config: Stores the raft configuration
(current and previous voters) for each snapshot.
These tables use a (shard, group_id) composite partition key with
the newly added raft_groups_partitioner and raft_groups_sharder, ensuring
data is co-located with the tablet replica that owns the raft group.
The tables are only created when the STRONGLY_CONSISTENT_TABLES experimental
feature is enabled.
Refs: SCYLLADB-193
Adds a "snapshot_table" topology operation and associated data structure/table columns to support dispatching a snapshot operation as a topo coordinator op.
Logic is similar, and thus broken out and semi-shared with, truncation.
Also adds optional tablet metadata to manifest, listing all tablets present in a given snapshot, as well as
tablet sstable ownership, repair status, and token ranges.
As per description in SCYLLADB-193, the alternative snapshot mechanism is in
a separate namespace under 'tablets', which while dubious is the desired destination.
The API is accessed via `nodetool cluster snapshot`, which more or less mirrors `nodetool snapshot`, but using topo op.
TTL is added to message propagation as a separate patch here, since it is not (yet) used from API (or nodetool).
Requires a syntax for both API and command line.
Closes scylladb/scylladb#28525
* github.com:scylladb/scylladb:
topology::snapshot: Add expiry (ttl) to RPC/topo op
test_snapshot_with_tablets: Extend test to check manifest content
table::manifest: Add tablet info to manifest.json
test::test_snapshot_with_tablets: Add small test for topo coordinated snapshot
scylla-nodetool: Add "cluster snapshot" command
api::storage_service: Add tablets/snapshots command for cluster level snapshot
db::snapshot-ctl: Add method to do snapshot using topo coordinator
storage_proxy: Add snapshot_keyspace method
topology_coordinator: Add handler for snapshot_tables
storage_proxy: Add handler for SNAPSHOT_WITH_TABLETS
messaging_service: Add SNAPSHOT_WITH_TABLETS verb
feature_service: Add SNAPSHOT_AS_TOPOLOGY_OPERATION feature
topology_mutation: Add setter for snapshot part of row
system_keyspace::topology_requests_entry: Add snapshot info to table
topology_state_machine: Add snapshot_tables operation
topology_coordinator: Break out logic from handle_truncate_table
storage_proxy: Break out logic from request_truncate_with_tablets
test/object_store: Remove create_ks_and_cf() helper
test/object_store: Replace create_ks_and_cf() usage with standard methods
test/object_store: Shift indentation right for test cases
The patch removes the code protected by !raft_topology_change_enabled()
since it is no longer reachable. Drop test_lwt_for_tablets_is_not_supported_without_raft
since non-raft mode is no longer supported.
This series hardens MV shutdown behavior by fixing lifecycle tracking for detached view-builder callbacks and aligning update handling with the same async dispatch style used by create/drop.
Patch 1 refactors on_update_view to use a dedicated coroutine dispatcher (dispatch_update_view), keeping update logic serialized under the existing view-builder lock and consistent with the callback architecture already used for create/drop paths.
Patch 2 adds explicit callback lifetime coordination in view_builder:
- introduce a seastar::gate member
- acquire _ops_gate.hold() when launching detached create/update/drop dispatch futures
- keep the hold alive until each detached future resolves
- close the gate during view_builder::drain() so shutdown waits for in-flight callback work before final teardown
Together, these changes reduce shutdown race exposure in MV event handling while preserving existing behavior for normal operation.
Testing:
- pytest --test-py-init test/cluster/mv (47 passed, 7 skipped)
backport: not required, started happening in master
fixes: SCYLLADB-687
Closes scylladb/scylladb#28648
* github.com:scylladb/scylladb:
db/view: gate detached view-builder callbacks during shutdown
db/view: refactor on_update_view to use coroutine dispatcher
system.batchlog will still have to be used while the cluster is
upgrading from an older version, which doesn't know v2 yet.
Re-add support for replaying v1 batchlogs. The switch to v2 will happen
after the BATCHLOG_V2 cluster feature is enabled.
The only external user -- storage_proxy -- only needs a minor
adjustment: switch between the table names. The rest is handled
transparently by the db/batchlog.hh interface and the batchlog_manager.
process_batch() currently returns stop_iteration::no from all control
paths. This is not useful. Return the all_replayed output param instead.
This requires making the batch() lambda a coroutine, but considering the
amount of work process_batch() does (send multiple writes), this should
be inconsequential.
Some assertions in the Raft-based topology are likely to cause crashes of
multiple nodes due to the consistent nature of the Raft-based code. If the
failing assertion is executed in the code run by each follower (e.g., the code
reloading the in-memory topology state machine), then all nodes can crash. If
the failing assertion is executed only by the leader (e.g., the topology
coordinator fiber), then multiple consecutive group0 leaders will chain-crash
until there is no group0 majority.
Crashing multiple nodes is much more severe than necessary. It's enough to
prevent the topology state machine from making more progress. This will
naturally happen after throwing a runtime error. The problematic fiber will be
killed or will keep failing in a loop. Note that it should be safe to block
the topology state machine, but not the whole group0, as the topology state
machine is mostly isolated from the rest of group0.
We replace some occurrences of `on_fatal_internal_error` and `SCYLLA_ASSERT`
with `on_internal_error`. These are not all occurrences, as some fatal
assertions make sense, for example, in the bootstrap procedure.
We also raise an internal error to prevent a segmentation fault in a few places.
Fixes #27987
Backporting this PR is not required, but we can consider it at least for 2026.1
because:
- it is LTS,
- the changes are low-risk,
- there shouldn't be many conflicts.
Closes scylladb/scylladb#28558
* github.com:scylladb/scylladb:
raft topology: prevent accessing nullptr returned by topology::find
raft topology: make some assertions non-crashing
In https://github.com/scylladb/scylladb/pull/27262 table audit has been
re-enabled by default in `scylla.yaml`, logging certain categories to a table,
which should make new Scylla deployments have audit enabled.
Now, in the next release, we also want to enable audit in `db/config.cc`,
which should enable audit for all deployments, which don't explicitly configure
audit otherwise in `scylla.yaml` (or via cmd line).
BTW. Because this commit aligns audit's default config values in `db/config.cc`
to those of `scylla.yaml`, `docs/reference/configuration-parameters.rst`, which
is based on `db/config.cc` will start showing that table audit is the default.
Refs: https://github.com/scylladb/scylladb/issues/28355
Refs: https://scylladb.atlassian.net/browse/SCYLLADB-222
No backport: table audit has been enabled in 2026.1 in `scylla.yaml`,
and should be always on starting from the next release,
which is the release we're currently merging to (2026.2).
Closes scylladb/scylladb#28376
* github.com:scylladb/scylladb:
docs: decommission: note audit ks may require ALTERing
docs: mention table audit enabled by default
audit: disable DDL by default
db/config: enable table audit by default
test/cluster: fix `test_table_desc_read_barrier` assertion
test/cluster: adjust audit in tests involving decommissioning its ks
audit_test: fix incorrect config in `test_audit_type_none`
Tablet migration keeps sstable snapshot during streaming, which may
cause temporary increase in disk utilization if compaction is running
concurrently. SSTables compacted away are kept on disk until streaming
is done with them. The more tablets we allow to migrate concurrently,
the higher disk space can rise. When the target tablet size is
configured correctly, every tablet should own about 1% of disk
space. So concurrency of 4 shouldn't put us at risk. But target tablet
size is not chosen dynamically yet, and it may not be aligned with
disk capacity.
Also, tablet sizes can temporarily grow above the target, up to 2x
before the split starts, and some more because splits take a while to
complete.
To reduce the impact from this, reduce concurrency of
migration. Concurrency of 2 should still be enough to saturate
resources on the leaving shard.
Also, reducing concurrency means that load balancing is more
responsive to preemption. There will be less bandwidth sharing, so
scheduled migrations complete faster. This is important for scale-out,
where we bootstrap a node and want to start migrations to that new
node as soon as possible.
Refs scylladb/siren#15317
Closes scylladb/scylladb#28563
* github.com:scylladb/scylladb:
tablets, config: Reduce migration concurrency to 2
tablets: load_balancer: Always accept migration if the load is 0
config, tablets: Make tablet migration concurrency configurable
Remove bootstrap and decommission from allowed_repair_based_node_ops.
Using RBNO over streaming for these operations has no benefits, as they
are not exposed to the out-of-date replica problem that replace,
removenode and rebuild are.
On top of that, RBNO is known to have problems with empty user tables.
Using streaming for bootstrap and decommission is safe and faster
than RBNO in all conditions, especially when the table is small.
One test needs adjustment as it relies on RBNO being used for all node
ops.
Fixes: SCYLLADB-105
Closes scylladb/scylladb#28080
DDL audit category doesn't make sense if it's enabled by default on its
own, as no DDL statements are going to be audited if audit_keyspaces/audit_tables
setting is empty. This may be counter-intuitive to our users, who may
expect to actually see these statements logged if we're enabling this by
default. Also, it doesn't make sense to enable a setting by default if
it has no effect.
Additionally, listed all possible audit categories for user's
convenience.