Compare commits


139 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
39b6ff982c topology_coordinator: suppress cancel warning in should_preempt_balancing
Agent-Logs-Url: https://github.com/scylladb/scylladb/sessions/ff8e4ba3-e470-4446-8a15-9f173b22c277

Co-authored-by: tgrabiec <283695+tgrabiec@users.noreply.github.com>
2026-04-10 19:25:21 +00:00
Michał Hudobski
7d648961ed vector_search: forward non-primary key restrictions to Vector Store service
Include non-primary key restrictions (e.g. regular column filters) in
the filter JSON sent to the Vector Store service. Previously only
partition key and clustering column restrictions were forwarded, so
filtering on regular columns was silently ignored.
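A minimal sketch of the fix described above, assuming a simplified string-based representation (this is not the actual ScyllaDB serialization code): the filter JSON sent to the Vector Store now includes regular-column restrictions alongside the partition key and clustering column ones.

```cpp
#include <map>
#include <string>

// Hypothetical helper modeling the fix: non-primary key restrictions are
// appended to the filter JSON instead of being silently dropped.
inline std::string build_filter_json(
        const std::map<std::string, std::string>& key_restrictions,
        const std::map<std::string, std::string>& nonprimary_key_restrictions) {
    std::string json = "{";
    bool first = true;
    auto append = [&](const std::map<std::string, std::string>& restrictions) {
        for (const auto& [column, value] : restrictions) {
            if (!first) json += ",";
            json += "\"" + column + "\":\"" + value + "\"";
            first = false;
        }
    };
    append(key_restrictions);
    append(nonprimary_key_restrictions); // previously silently ignored
    json += "}";
    return json;
}
```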

Add get_nonprimary_key_restrictions() getter to statement_restrictions.

Add unit tests for non-primary key equality, range, and bind marker
restrictions in filter_test.

Fixes: SCYLLADB-970

Closes scylladb/scylladb#29019
2026-04-10 17:16:29 +02:00
Piotr Dulikowski
3bd770d4d9 Merge 'counters: reuse counter IDs by rack' from Michael Litvak
For counter updates, use a counter ID that is constructed from the
node's rack instead of the node's host ID.

A rack can have at most two active tablet replicas at a time: a single
normal tablet replica, and during tablet migration there are two active
replicas, the normal and pending replica. Therefore we can have two
unique counter IDs per rack that are reused by all replicas in the rack.

We construct the counter ID from the rack UUID, which is constructed
from the name "dc:rack". The pending replica uses a deterministic
variation of the rack's counter ID by negating it.

This improves the performance and size of counter cells by having fewer
unique counter IDs and fewer counter shards in a counter cell.

Previously the number of counter shards was the number of different
host_ids that updated the counter, which is typically the number of
nodes in the cluster and continues growing indefinitely when nodes are
replaced. With the rack-based counter ID the number of counter shards
will be at most twice the number of different racks (including removed
racks, which should not be significant).
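The scheme above can be sketched as follows. This is an illustrative model, not the actual ScyllaDB code: the real implementation derives a name-based UUID from "dc:rack", while this sketch stands in a 128-bit struct built from `std::hash`, and models the pending replica's "deterministic variation" as bitwise negation.

```cpp
#include <cstdint>
#include <functional>
#include <string>

// Hypothetical 128-bit counter ID (the real one is a UUID).
struct counter_id {
    uint64_t msb;
    uint64_t lsb;
    bool operator==(const counter_id& o) const { return msb == o.msb && lsb == o.lsb; }
};

// One deterministic ID per rack, shared by every replica in that rack.
inline counter_id rack_counter_id(const std::string& dc, const std::string& rack) {
    std::string name = dc + ":" + rack;
    uint64_t h1 = std::hash<std::string>{}(name);
    uint64_t h2 = std::hash<std::string>{}(name + "#");
    return counter_id{h1, h2};
}

// The pending replica (during tablet migration) uses a deterministic
// variation of the rack's ID, modeled here as negation.
inline counter_id pending_counter_id(const counter_id& id) {
    return counter_id{~id.msb, ~id.lsb};
}
```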

Fixes SCYLLADB-356

backport not needed - an enhancement

Closes scylladb/scylladb#28901

* github.com:scylladb/scylladb:
  docs/dev: add counters doc
  counters: reuse counter IDs by rack
2026-04-10 12:24:18 +02:00
Wojciech Mitros
163c6f71d6 transport: refactor result_message bounce interface
Replace move_to_shard()/move_to_host() with as_bounce()/target_shard()/
target_host() to clarify the interface after bounce was extended to
support cross-node bouncing.

- Add virtual as_bounce() returning const bounce* to the base class
  (nullptr by default, overridden in bounce to return this), replacing
  the virtual move_to_shard() which conflated bounce detection with
  shard access
- Rename move_to_shard() -> target_shard() (now non-virtual, returns
  unsigned directly) and move_to_host() -> target_host() on bounce
- Replace dynamic_pointer_cast with static_pointer_cast at call sites
  that already checked as_bounce()
- Move forward declarations of message types before the virtual
  methods so as_bounce() can reference bounce
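The refactored interface shape can be sketched like this. It is a simplified model, not the actual ScyllaDB definitions: class names mirror the commit message, but members and the dispatch helper are illustrative.

```cpp
// Simplified sketch: the base class exposes a virtual as_bounce() for
// detection only; shard access is a non-virtual accessor on bounce.
struct bounce;

struct result_message {
    virtual ~result_message() = default;
    // Replaces the old virtual move_to_shard(), which conflated
    // bounce detection with shard access.
    virtual const bounce* as_bounce() const { return nullptr; }
};

struct bounce : result_message {
    unsigned shard;
    explicit bounce(unsigned s) : shard(s) {}
    const bounce* as_bounce() const override { return this; }
    // Non-virtual, valid once as_bounce() has confirmed the type.
    unsigned target_shard() const { return shard; }
};

// Call-site pattern: check once via as_bounce(), then use the typed
// pointer directly (mirroring static_pointer_cast in the real code).
inline unsigned dispatch_shard(const result_message& msg, unsigned current) {
    if (const bounce* b = msg.as_bounce()) {
        return b->target_shard();
    }
    return current;
}
```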

Fixes: SCYLLADB-1066

Closes scylladb/scylladb#29367
2026-04-10 12:17:43 +02:00
Piotr Dulikowski
32e3a01718 Merge 'service: strong_consistency: Allow for aborting operations' from Dawid Mędrek
Motivation
----------

Since strongly consistent tables are based on the concept of Raft
groups, operations on them can get stuck for indefinite amounts of
time. That may be problematic, and so we'd like to implement a way
to cancel those operations at suitable times.

Description of solution
-----------------------

The situations we focus on are the following:

* Timed-out queries
* Leader changes
* Tablet migrations
* Table drops
* Node shutdowns

We handle each of them and provide validation tests.

Implementation strategy
-----------------------

1. Auxiliary commits.
2. Abort operations on timeout.
3. Abort operations on tablet removal.
4. Extend `client_state`.
5. Abort operation on shutdown.
6. Help `state_machine` be aborted as soon as possible.

Tests
-----

We provide tests that validate the correctness of the solution.

The total time spent on `test_strong_consistency.py`
(measured on my local machine, dev mode):

Before:
```
real    0m31.809s
user    1m3.048s
sys     0m21.812s
```

After:
```
real    0m34.523s
user    1m10.307s
sys     0m27.223s
```

The incremental differences in time can be found in the commit messages.

Fixes SCYLLADB-429

Backport: not needed. This is an enhancement to an experimental feature.

Closes scylladb/scylladb#28526

* github.com:scylladb/scylladb:
  service: strong_consistency: Abort state_machine::apply when aborting server
  service: strong_consistency: Abort ongoing operations when shutting down
  service: client_state: Extend with abort_source
  service: strong_consistency: Handle abort when removing Raft group
  service: strong_consistency: Abort Raft operations on timeout
  service: strong_consistency: Use timeout when mutating
  service: strong_consistency: Fix indentation
  service: strong_consistency: Enclose coordinator methods with try-catch
  service: strong_consistency: Crash at unexpected exception
  test: cluster: Extract default config & cmdline in test_strong_consistency.py
2026-04-10 11:11:21 +02:00
Pavel Emelyanov
0b336da89d Revert "cmake: add missing rolling_max_tracker_test and symmetric_key_test"
This reverts commit 8b4a91982b.

Two commits independently added rolling_max_tracker_test to test/boost/CMakeLists.txt:
8b4a919 cmake: add missing rolling_max_tracker_test and symmetric_key_test
f3a91df test/cmake: add missing tests to boost test suite

The second was merged two days after the first. They didn't conflict at
the code level and applied cleanly, resulting in duplicate
add_scylla_test() entries that break the CMake build:

    CMake Error: add_executable cannot create target
    "test_boost_rolling_max_tracker_test" because another target
    with the same name already exists.

Remove the duplicate.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Reported-by: Łukasz Paszkowski <lukasz.paszkowski@scylladb.com>
2026-04-10 11:19:43 +03:00
Patryk Jędrzejczak
751bf31273 Merge 'More gossiper cleanups' from Gleb Natapov
The PR contains more code cleanups, mostly in gossiper. It drops more gossiper states, leaving only NORMAL and SHUTDOWN; all other states are checked against the raft topology state. These two are kept because the SHUTDOWN state is propagated only through gossiper, and when a node is not in SHUTDOWN it should be in some other state.

No need to backport. Cleanups.

Closes scylladb/scylladb#29129

* https://github.com/scylladb/scylladb:
  storage_service: cleanup unused code
  storage_service: simplify get_peer_info_for_update
  gossiper: send shutdown notifications in parallel
  gms: remove unused code
  virtual_tables: no need to call gossiper if we already know that the node is in shutdown
  gossiper: print node state from raft topology in the logs
  gossiper: use is_shutdown instead of code it manually
  gossiper: mark endpoint_state(inet_address ip) constructor as explicit
  gossiper: remove unused code
  gossiper: drop last use of LEFT state and drop the state
  gossiper: drop unused STATUS_BOOTSTRAPPING state
  gossiper: rename is_dead_state to is_left since this is all that the function checks now.
  gossiper: use raft topology state instead of gossiper one when checking node's state
  storage_service: drop check_for_endpoint_collision function
  storage_service: drop is_first_node function
  gossiper: remove unused REMOVED_TOKEN state
  gossiper: remove unused advertise_token_removed function
2026-04-10 09:56:20 +02:00
Nadav Har'El
6674aa29ca Merge 'Add Cassandra SAI (StorageAttachedIndex) compatibility' from Szymon Wasik
Cassandra's native vector index type is StorageAttachedIndex (SAI). Libraries such as CassIO, LangChain, and LlamaIndex generate `CREATE CUSTOM INDEX` statements using the SAI class name. Previously, ScyllaDB rejected these with "Non-supported custom class".

This PR adds compatibility so that SAI-style CQL statements work on ScyllaDB without modification.

1. **test: enable SAI_VECTOR_ALLOW_CUSTOM_PARAMETERS for Cassandra tests**
   Enables the `SAI_VECTOR_ALLOW_CUSTOM_PARAMETERS` Cassandra system property so that `search_beam_width` tests pass against Cassandra 5.0.7.

2. **test: modernize vector index test comments and fix xfail**
   Updates test comments from "Reproduces" to "Validates fix for" for clarity, and converts the `test_ann_query_with_pk_restriction` xfail into a stripped-down CREATE INDEX syntax test (removing unused INSERT/SELECT lines). Removes the redundant `test_ann_query_with_non_pk_restriction` test.

3. **cql: add Cassandra SAI (StorageAttachedIndex) compatibility**
   Core implementation: the SAI class name is detected and translated to ScyllaDB's native `vector_index`. The fully-qualified class name (`org.apache.cassandra.index.sai.StorageAttachedIndex`) requires exact case; short names (`StorageAttachedIndex`, `sai`) are matched case-insensitively — matching Cassandra's behavior. Non-vector and multi-column SAI targets are rejected with clear errors. Adds `skip_on_scylla_vnodes` fixture, SAI compatibility docs, and the Cassandra compatibility table entry (split into "SAI general" vs "SAI for vector search").

4. **cql: accept source_model option for Cassandra SAI compatibility**
   The `source_model` option is a Cassandra SAI property used by Cassandra libraries (e.g., CassIO) to tag vector indexes with the name of the embedding model. ScyllaDB accepts it for compatibility but does not use it — the validator is a no-op lambda. The option is preserved in index metadata and returned in DESCRIBE INDEX output.

- `cql3/statements/create_index_statement.cc`: SAI class detection and rewriting logic
- `index/secondary_index_manager.cc`: case-insensitive class name lookup (lowercasing restored before `classes.find()`)
- `index/vector_index.cc`: `source_model` accepted as a valid option with no-op validator
- `docs/cql/secondary-indexes.rst`: SAI compatibility documentation with `source_model` table row
- `docs/using-scylla/cassandra-compatibility.rst`: SAI entry split into general (not supported) and vector search (supported)
- `test/cqlpy/conftest.py`: `scylla_with_tablets` renamed to `skip_on_scylla_vnodes`
- `test/cqlpy/test_vector_index.py`: SAI tests inlined (no constants), `check_bad_option()` helper for numeric validation, uppercase class name test, merged `source_model` tests with DESCRIBE check

| Backend            | Passed | Skipped | Failed |
|--------------------|--------|---------|--------|
| ScyllaDB (dev)     | 42     | 0       | 0      |
| Cassandra 5.0.7    | 16     | 26      | 0      |

Backport: none, new feature.

Fixes: SCYLLADB-239

Closes scylladb/scylladb#28645

* github.com:scylladb/scylladb:
  cql: accept source_model option and show options in DESCRIBE
  cql: add Cassandra SAI (StorageAttachedIndex) compatibility
  test: modernize vector index test comments and fix xfail
  test: enable SAI_VECTOR_ALLOW_CUSTOM_PARAMETERS for Cassandra tests
2026-04-10 10:21:20 +03:00
Avi Kivity
f67d0739d0 test: user_function_test: adjust Lua error message tests
Lua 5.5 changed the error message slightly ("?:-1" -> "?:?"). Relax
the error message tests so they do not depend on this unimportant fragment.

Closes scylladb/scylladb#29414
2026-04-10 01:09:35 +03:00
Piotr Szymaniak
98d6edaa88 alternator: add comment explaining delta_mode::keys in add_stream_options()
Clarify that cdc::delta_mode is ignored by Alternator, so we use the
least expensive mode (keys) to reduce overhead.

Fixes scylladb/scylladb#24812

Closes scylladb/scylladb#29408
2026-04-10 01:07:21 +03:00
Michał Hudobski
c8b9fde828 auth: allow VECTOR_SEARCH_INDEXING permission to access system.tablets
Add system.tablets to the set of system resources that can be
accessed with the VECTOR_SEARCH_INDEXING permission.

Fixes: VECTOR-605

Closes scylladb/scylladb#29397
2026-04-09 21:53:07 +03:00
Szymon Wasik
573def7cd8 cql: accept source_model option and show options in DESCRIBE
Accept the Cassandra SAI 'source_model' option for vector indexes.
This option is used by Cassandra libraries (e.g., CassIO, LangChain)
to tag vector indexes with the name of the embedding model that
produced the vectors.

ScyllaDB does not use the source_model value but stores it and
includes it in the DESCRIBE INDEX output for Cassandra compatibility.

Additionally, extend vector_index::describe() to emit a
WITH OPTIONS = {...} clause containing all user-provided index options
(filtering out system keys: target, class_name, index_version).
This makes options like similarity_function, source_model, etc.
visible in DESCRIBE output.
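The option-filtering step can be sketched as below. This is illustrative only, not the actual `vector_index::describe()` code; the system-key set is the one named in this commit, but the formatting helper is a hypothetical stand-in.

```cpp
#include <map>
#include <set>
#include <string>

// Hypothetical sketch: emit a WITH OPTIONS = {...} map from the stored
// index options, filtering out the system keys that are not user-facing.
inline std::string describe_options(const std::map<std::string, std::string>& options) {
    static const std::set<std::string> system_keys{"target", "class_name", "index_version"};
    std::string out = "{";
    bool first = true;
    for (const auto& [key, value] : options) {
        if (system_keys.count(key)) {
            continue; // system keys stay out of DESCRIBE output
        }
        if (!first) out += ", ";
        out += "'" + key + "': '" + value + "'";
        first = false;
    }
    out += "}";
    return out;
}
```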
2026-04-09 17:20:03 +02:00
Szymon Wasik
80a2e4a0ab cql: add Cassandra SAI (StorageAttachedIndex) compatibility
Libraries such as CassIO, LangChain, and LlamaIndex create vector
indexes using Cassandra's StorageAttachedIndex (SAI) class name.
This commit lets ScyllaDB accept these statements without modification.

When a CREATE CUSTOM INDEX statement specifies an SAI class name on a
vector column, ScyllaDB automatically rewrites it to the native
vector_index implementation. Accepted class names (case-insensitive):
  - org.apache.cassandra.index.sai.StorageAttachedIndex
  - StorageAttachedIndex
  - sai

SAI on non-vector columns is rejected with a clear error directing
users to a secondary index instead.
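The class-name matching can be sketched as follows. This is not the actual `maybe_rewrite_sai_to_vector_index` implementation; it follows the rule spelled out in the merge description, where the fully-qualified name requires exact case while the short names are matched case-insensitively.

```cpp
#include <algorithm>
#include <cctype>
#include <string>

// Illustrative sketch of SAI class-name detection.
inline bool is_sai_class(const std::string& name) {
    if (name == "org.apache.cassandra.index.sai.StorageAttachedIndex") {
        return true; // fully-qualified form: exact case required
    }
    std::string lower = name;
    std::transform(lower.begin(), lower.end(), lower.begin(),
                   [](unsigned char c) { return std::tolower(c); });
    // Short forms are matched case-insensitively, as in Cassandra.
    return lower == "storageattachedindex" || lower == "sai";
}
```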

The SAI detection and rewriting logic is extracted into a dedicated
static function (maybe_rewrite_sai_to_vector_index) to keep the
already-long validate_while_executing method manageable.

Multi-column (local index) targets and nonexistent columns are
skipped with continue — the former are treated as filtering columns
by vector_index::check_target(), and the latter are caught later by
vector_index::validate().

Tests that exercise features common to both backends (basic creation,
similarity_function, IF NOT EXISTS, bad options, etc.) now use the
SAI class name with the skip_on_scylla_vnodes fixture so they run
against both ScyllaDB and Cassandra. ScyllaDB-specific tests continue
to use USING 'vector_index' with scylla_only.
2026-04-09 17:20:03 +02:00
Szymon Wasik
fa7edc627c test: modernize vector index test comments and fix xfail
- Change 'Reproduces' to 'Validates fix for' in test comments to
  reflect that the referenced issues are already fixed.
- Condense the VECTOR-179 comment to two lines.
- Replace the xfailed test_ann_query_with_restriction_works_only_on_pk
  with a focused test (test_ann_query_with_pk_restriction) that creates
  a vector index on a table with a PK column restriction, validating
  the VECTOR-374 fix.
2026-04-09 17:20:02 +02:00
Szymon Wasik
4eab050be4 test: enable SAI_VECTOR_ALLOW_CUSTOM_PARAMETERS for Cassandra tests 2026-04-09 17:20:02 +02:00
Andrzej Jackowski
23c386a27f test: perf: add audit-unix-socket-path to perf-simple-query
To allow performance benchmarking with custom syslog sinks.

Example use case:

-- Audit + default syslog: ~100k tps
taskset -c 0,2,4 ./build/release/scylla perf-simple-query --smp 3 --write --duration 30 --audit "syslog" --audit-keyspace "ks" --audit-categories "DCL,DDL,AUTH,DML,QUERY"

```
110263.72 tps ( 66.1 allocs/op,  16.0 logallocs/op,  25.7 tasks/op,  254900 insns/op,  144796 cycles/op,        0 errors)
throughput:
	mean=   107137.48 standard-deviation=3142.98
	median= 106665.00 median-absolute-deviation=1786.03
	maximum=111435.19 minimum=97620.79
instructions_per_op:
	mean=   256311.36 standard-deviation=5037.13
	median= 256288.09 median-absolute-deviation=2223.08
	maximum=274220.89 minimum=248141.40
cpu_cycles_per_op:
	mean=   146443.47 standard-deviation=2844.19
	median= 146001.85 median-absolute-deviation=1514.82
	maximum=157177.54 minimum=142981.03
```

-- Audit + custom syslog: ~400k tps
socat -u UNIX-RECV:/tmp/audit-null.sock,type=2 OPEN:/dev/null
taskset -c 0,2,4 ./build/release/scylla perf-simple-query --smp 3 --write --duration 30 --audit "syslog" --audit-keyspace "ks" --audit-categories "DCL,DDL,AUTH,DML,QUERY" --audit-unix-socket-path /tmp/audit-null.sock

```
404929.62 tps ( 65.9 allocs/op,  16.0 logallocs/op,  25.5 tasks/op,   77406 insns/op,   35559 cycles/op,        0 errors)
throughput:
	mean=   399868.39 standard-deviation=6232.88
	median= 401770.65 median-absolute-deviation=3859.09
	maximum=406126.79 minimum=383434.84
instructions_per_op:
	mean=   77481.26 standard-deviation=168.31
	median= 77405.54 median-absolute-deviation=84.33
	maximum=78081.46 minimum=77332.84
cpu_cycles_per_op:
	mean=   35871.32 standard-deviation=516.83
	median= 35699.70 median-absolute-deviation=251.15
	maximum=37454.86 minimum=35432.60
```

-- No audit: ~800k tps
taskset -c 0,2,4 ./build/release/scylla perf-simple-query --smp 3 --write --duration 30

```
808970.95 tps ( 53.3 allocs/op,  16.0 logallocs/op,  14.9 tasks/op,   49904 insns/op,   20471 cycles/op,        0 errors)
throughput:
	mean=   809065.31 standard-deviation=6222.39
	median= 810507.10 median-absolute-deviation=1827.99
	maximum=815213.41 minimum=782104.84
instructions_per_op:
	mean=   49905.50 standard-deviation=21.81
	median= 49900.12 median-absolute-deviation=7.72
	maximum=50010.97 minimum=49892.57
cpu_cycles_per_op:
	mean=   20429.00 standard-deviation=41.40
	median= 20425.18 median-absolute-deviation=29.11
	maximum=20530.74 minimum=20355.42
```

Closes scylladb/scylladb#29396
2026-04-09 16:00:41 +03:00
Anna Stuchlik
c6587c6a70 doc: Fix malformed markdown link in alternator network docs
Fixes https://github.com/scylladb/scylladb/issues/29400

Closes scylladb/scylladb#29402
2026-04-09 15:54:43 +03:00
Botond Dénes
5886d1841a Merge 'cmake: align CMake build system with configure.py and add comparison script' from Ernest Zaslavsky
Every time someone modifies the build system — adding a source file, changing a compilation flag, or wiring a new test — the change tends to land in only one of our two build systems (configure.py or CMake). Over time this causes three classes of problems:

1. **CMake stops compiling entirely.** Missing defines, wrong sanitizer flags, or misplaced subdirectory ordering cause hard build failures that are only discovered when someone tries to use CMake (e.g. for IDE integration).

2. **Missing build targets.** Tests or binaries present in configure.py are never added to CMake, so `cmake --build` silently skips them. This PR fixes several such cases (e.g. `symmetric_key_test`, `auth_cache_test`, `sstable_tablet_streaming`).

3. **Missing compilation units in targets.** A `.cc` file is added to a test binary in one system but not the other, causing link errors or silently omitted test coverage.

To fix the existing drift and prevent future divergence, this series:

**Adds a build-system comparison script**
(`scripts/compare_build_systems.py`) that configures both systems into a temporary directory, parses their generated `build.ninja` files, and compares per-file compilation flags, link target sets, and per-target libraries. configure.py is treated as the baseline; CMake must match it. The script supports a `--ci` mode suitable for gating PRs that touch
build files.

**Fixes all current mismatches** found by the script:
- Mode flag alignment in `mode.common.cmake` and `mode.Coverage.cmake`
  (sanitizer flags, `-fno-lto`, stack-usage warnings, coverage defines).
- Global define alignment (`SEASTAR_NO_EXCEPTION_HACK`, `XXH_PRIVATE_API`,
  `BOOST_ALL_DYN_LINK`, `SEASTAR_TESTING_MAIN` placement).
- Seastar build configuration (shared vs static per mode, coverage
  sanitizer link options).
- Abseil sanitizer flags (`-fno-sanitize=vptr`).
- Missing test targets in `test/boost/CMakeLists.txt`.
- Redundant per-test flags now covered by global settings.
- Lua library resolution via a custom `cmake/FindLua.cmake` using
  pkg-config, matching configure.py's approach.

**Adds documentation** (`docs/dev/compare-build-systems.md`) describing how to run the script and interpret its output.

No backport needed — this is build infrastructure improvement only.

Closes scylladb/scylladb#29273

* github.com:scylladb/scylladb:
  scripts: remove lua library rename workaround from comparison script
  cmake: add custom FindLua using pkg-config to match configure.py
  test/cmake: add missing tests to boost test suite
  test/cmake: remove per-test LTO disable
  cmake: add BOOST_ALL_DYN_LINK and strip per-component defines
  cmake: move SEASTAR_TESTING_MAIN after seastar and abseil subdirs
  cmake: add -fno-sanitize=vptr for abseil sanitizer flags
  cmake: align Seastar build configuration with configure.py
  cmake: align global compile defines and options with configure.py
  cmake: fix Coverage mode in mode.Coverage.cmake
  cmake: align mode.common.cmake flags with configure.py
  configure.py: add sstable_tablet_streaming to combined_tests
  docs: add compare-build-systems.md
  scripts: add compare_build_systems.py to compare ninja build files
2026-04-09 15:46:09 +03:00
Yaniv Michael Kaul
13879b023f tracing: set_skip_when_empty() for error-path metrics
Add .set_skip_when_empty() to all error-path metrics in the tracing
module. Tracing itself is not a commonly used feature, making all of
these metrics almost always zero:

Tier 1 (very rare - corruption/schema issues):
- tracing_keyspace_helper::bad_column_family_errors: tracing schema
  missing or incompatible, should never happen post-bootstrap
- tracing::trace_errors: internal error building trace parameters

Tier 2 (overload - tracing backend saturated):
- tracing::dropped_sessions: too many pending sessions
- tracing::dropped_records: too many pending records

Tier 3 (general tracing write errors):
- tracing_keyspace_helper::tracing_errors: errors during writes to
  system_traces keyspace

Since tracing is an opt-in feature that most deployments rarely use,
all five metrics are almost always zero and create unnecessary
reporting overhead.
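The effect of the flag can be modeled as below. This is a toy sketch, not the Seastar metrics API: a counter flagged skip-when-empty is simply omitted from the exported report while its value is zero.

```cpp
#include <cstdint>
#include <map>
#include <string>

// Toy model of a metric with the skip-when-empty flag.
struct counter {
    uint64_t value = 0;
    bool skip_when_empty = false;
};

// Hypothetical exporter: zero-valued flagged counters are suppressed
// until they become non-zero, avoiding reporting overhead.
inline std::map<std::string, uint64_t>
export_report(const std::map<std::string, counter>& metrics) {
    std::map<std::string, uint64_t> out;
    for (const auto& [name, c] : metrics) {
        if (c.skip_when_empty && c.value == 0) {
            continue; // suppressed while empty
        }
        out[name] = c.value;
    }
    return out;
}
```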

AI-Assisted: yes
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>

Closes scylladb/scylladb#29346
2026-04-09 14:28:16 +03:00
Michael Litvak
3964040008 docs/dev: add counters doc
Add a documentation of the counters feature implementation in
docs/dev/counters.md.

The documentation is taken from the wiki and updated according to the
current state of the code - legacy details are removed, and a section
about the counter id is added.
2026-04-09 13:08:02 +02:00
Michael Litvak
b71762d5da counters: reuse counter IDs by rack
For counter updates, use a counter ID that is constructed from the
node's rack instead of the node's host ID.

A rack can have at most two active tablet replicas at a time: a single
normal tablet replica, and during tablet migration there are two active
replicas, the normal and pending replica. Therefore we can have two
unique counter IDs per rack that are reused by all replicas in the rack.

We construct the counter ID from the rack UUID, which is constructed
from the name "dc:rack". The pending replica uses a deterministic
variation of the rack's counter ID by negating it.

This improves the performance and size of counter cells by having fewer
unique counter IDs and fewer counter shards in a counter cell.

Previously the number of counter shards was the number of different
host_ids that updated the counter, which is typically the number of
nodes in the cluster and continues growing indefinitely when nodes are
replaced. With the rack-based counter ID the number of counter shards
will be at most twice the number of different racks (including removed
racks, which should not be significant).

Fixes SCYLLADB-356
2026-04-09 13:08:02 +02:00
Yaniv Michael Kaul
2c0076d3ef replica: set_skip_when_empty() for rare error-path metrics
Add .set_skip_when_empty() to four metrics in replica/database.cc that
are only incremented on very rare error paths and are almost always zero:

- database::dropped_view_updates: view updates dropped due to overload.
  NOTE: this metric appears to never be incremented in the current
  codebase and may be a candidate for removal.
- database::multishard_query_failed_reader_stops: documented as a 'hard
  badness counter' that should always be zero. NOTE: no increment site
  was found in the current codebase; may be a candidate for removal.
- database::multishard_query_failed_reader_saves: documented as a 'hard
  badness counter' that should always be zero.
- database::total_writes_rejected_due_to_out_of_space_prevention: only
  fires when disk utilization is critical and user table writes are
  disabled, a very rare operational state.

These metrics create unnecessary reporting overhead when they are
perpetually zero. set_skip_when_empty() suppresses them from metrics
output until they become non-zero.

AI-Assisted: yes
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>

Closes scylladb/scylladb#29345
2026-04-09 14:07:28 +03:00
Botond Dénes
86417d49de Merge 'transport: improve memory accounting for big responses and slow network' from Marcin Maliszkiewicz
After obtaining the CQL response, check if its actual size exceeds the initially acquired memory permit. If so, acquire additional semaphore units and adopt them into the permit, ensuring accurate memory accounting for large responses.

Additionally, move the permit into a .then() continuation so that the semaphore units are kept alive until write_message finishes, preventing premature release of memory permit. This is especially important with slow networks and big responses when buffers can accumulate and deplete a node's memory.
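The accounting fix can be modeled as below. This is a simplified sketch, not Seastar's semaphore API: a permit first covers an estimated size, then, once the real response size is known, the difference is acquired and adopted into the same permit, and the units are released only when the permit is destroyed (standing in for "after write_message finishes").

```cpp
#include <cstddef>
#include <stdexcept>

// Toy memory semaphore; the real code waits instead of throwing.
struct memory_semaphore {
    size_t available;
    size_t take(size_t n) {
        if (n > available) throw std::runtime_error("would block");
        available -= n;
        return n;
    }
    void give(size_t n) { available += n; }
};

// Hypothetical permit: holds units for the lifetime of the response write.
struct permit {
    memory_semaphore* sem;
    size_t held;
    permit(memory_semaphore& s, size_t estimate) : sem(&s), held(s.take(estimate)) {}
    // Adopt extra units when the actual response exceeds the estimate.
    void adopt_up_to(size_t actual_size) {
        if (actual_size > held) {
            held += sem->take(actual_size - held);
        }
    }
    ~permit() { sem->give(held); } // released only after the write completes
};
```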

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1306
Related https://scylladb.atlassian.net/browse/SCYLLADB-740

Backport: all supported versions

Closes scylladb/scylladb#29288

* github.com:scylladb/scylladb:
  transport: add per-service-level pending response memory metric
  transport: hold memory permit until response write completes
  transport: account for response size exceeding initial memory estimate
2026-04-09 13:36:31 +03:00
Yaniv Michael Kaul
5c8b4a003e db: set_skip_when_empty() for rare error-path metrics
Add .set_skip_when_empty() to four metrics in the db module that are
only incremented on very rare error paths and are almost always zero:

- cache::pinned_dirty_memory_overload: described as 'should sit
  constantly at 0, nonzero is indicative of a bug'
- corrupt_data::entries_reported: only fires on actual data corruption
- hints::corrupted_files: only fires on on-disk hint file corruption
- rate_limiter::failed_allocations: only fires when the rate limiter
  hash table is completely full and gives up allocating, requiring
  extreme cardinality pressure

These metrics create unnecessary reporting overhead when they are
perpetually zero. set_skip_when_empty() suppresses them from metrics
output until they become non-zero.

AI-Assisted: yes
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>

Closes scylladb/scylladb#29344
2026-04-09 13:32:09 +03:00
Gleb Natapov
dbaba7ab8a storage_service: cleanup unused code
Remove unused definition and double includes.
2026-04-09 13:31:41 +03:00
Gleb Natapov
b050b593b3 storage_service: simplify get_peer_info_for_update
It does nothing for fields managed in raft, so drop their processing.
2026-04-09 13:31:41 +03:00
Gleb Natapov
d0576c109f gossiper: send shutdown notifications in parallel 2026-04-09 13:31:40 +03:00
Gleb Natapov
1586fa65af gms: remove unused code
Also move version_string(...) and make_token_string(...) under private:, since they are internal helpers used only by normal(), not part of the public API.
2026-04-09 13:31:40 +03:00
Gleb Natapov
b2e35c538f virtual_tables: no need to call gossiper if we already know that the node is in shutdown 2026-04-09 13:31:40 +03:00
Gleb Natapov
e17fc180a0 gossiper: print node state from raft topology in the logs
Raft topology now holds the node's real state; the gossiper state is
now set to NORMAL and SHUTDOWN only.
2026-04-09 13:31:40 +03:00
Gleb Natapov
8439154851 gossiper: use is_shutdown instead of code it manually 2026-04-09 13:31:39 +03:00
Gleb Natapov
7d700d0377 gossiper: mark endpoint_state(inet_address ip) constructor as explicit
The get_live_members function called is_shutdown with an inet_address
argument, which caused a temporary endpoint_state to be created. Fix
it by prohibiting the implicit conversion and calling the correct
is_shutdown overload instead.
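The bug class this prevents can be shown in miniature. The types below are stand-ins, not the real gms definitions: with a converting constructor, an inet_address can silently materialize a temporary endpoint_state; marking the constructor explicit turns the accidental conversion into a compile error.

```cpp
#include <string>
#include <type_traits>
#include <utility>

// Hypothetical stand-in types illustrating the explicit-constructor fix.
struct inet_address {
    std::string addr;
};

struct endpoint_state {
    explicit endpoint_state(inet_address ip) : ip_(std::move(ip)) {}
    bool is_shutdown() const { return false; } // illustrative only
    inet_address ip_;
};

// With explicit, this no longer compiles:
//   endpoint_state es = inet_address{"10.0.0.1"};   // error
// so a call expecting an endpoint_state can no longer silently
// construct a temporary from an inet_address.
static_assert(!std::is_convertible_v<inet_address, endpoint_state>);
static_assert(std::is_constructible_v<endpoint_state, inet_address>);
```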
2026-04-09 13:31:39 +03:00
Gleb Natapov
6df4f572d5 gossiper: remove unused code 2026-04-09 13:31:39 +03:00
Gleb Natapov
67102496c8 gossiper: drop last use of LEFT state and drop the state
Decommission sets the LEFT gossiper state only to prevent a shutdown
notification from being issued by the node during shutdown. Since the
notification code now checks the state in the raft topology, this is no
longer needed.
2026-04-09 13:31:39 +03:00
Gleb Natapov
54d2c95094 gossiper: drop unused STATUS_BOOTSTRAPPING state 2026-04-09 13:31:38 +03:00
Gleb Natapov
7c895ced19 gossiper: rename is_dead_state to is_left since this is all that the function checks now. 2026-04-09 13:31:38 +03:00
Gleb Natapov
7dfb0577b8 gossiper: use raft topology state instead of gossiper one when checking node's state
The raft topology state is the source of truth for a node's state, so
use it instead of the gossiper one.
2026-04-09 13:31:38 +03:00
Gleb Natapov
c17c4806a1 storage_service: drop check_for_endpoint_collision function
All the checks that it does are also done by the join coordinator,
which uses the more reliable raft state instead of the gossiper one.
2026-04-09 13:31:37 +03:00
Gleb Natapov
1ac8edb22b storage_service: drop is_first_node function
It makes no sense now, since the first node to bootstrap is determined
by the discover_group0 algorithm.
2026-04-09 13:31:37 +03:00
Gleb Natapov
681aa9ebe1 gossiper: remove unused REMOVED_TOKEN state 2026-04-09 13:31:37 +03:00
Gleb Natapov
5af17aa578 gossiper: remove unused advertise_token_removed function 2026-04-09 13:31:36 +03:00
Dawid Mędrek
f0dfe29d88 service: strong_consistency: Abort state_machine::apply when aborting server
The state machine used by strongly consistent tablets may block on a
read barrier if the local schema is insufficient to resolve pending
mutations [1]. To deal with that, we perform a read barrier that may
block for a long time.

When a strongly consistent tablet is being removed, we'd like to cancel
all ongoing executions of `state_machine::apply`: the shard is no
longer responsible for the tablet, so it doesn't matter what the outcome
is.

---

In the implementation, we abort the operations by simply throwing
an exception from `state_machine::apply` and not doing anything.
That's a red flag considering that it may lead to the instance
being killed on the spot [2].

Fortunately for us, strongly consistent tables use the default Raft
server implementation, i.e. `raft::server_impl`, which actually
handles one type of an exception thrown by the method: namely,
`abort_requested_exception`, which is the default exception thrown
by `seastar::abort_source` [3]. We leverage this property.

---

Unfortunately, `raft::server_impl::abort` isn't perfectly suited for
us. If we look into its code, we'll see that the relevant portion of
the procedure boils down to three steps:

1. Prevent scheduling adding new entries.
2. Wait for the applier fiber.
3. Abort the state machine.

Since aborting the state machine happens only after the applier fiber
has already finished, there will no longer be anything to abort. Either
all executions of `state_machine::apply` have already finished, or they
are hanging and we cannot do anything.

That's a pre-existing problem that we won't be solving here (even
though it's possible). We hope the problem will be solved, and it seems
likely: the code suggests that the behavior is not intended. For more
details, see e.g. [4].

---

We provide two validation tests. They simulate aborting
`state_machine::apply` in two different scenarios:

* when the table is dropped (which should also cover the case of tablet
  migration),
* when the node is shutting down.

The value of the tests isn't high since they don't ensure that the
state of the group is still valid (though it should be), nor do they
perform any other check. Instead, we rely on the testing framework to
spot any anomalies or errors. That's probably the best we can do at
the moment.

Unfortunately, both tests are marked as skipped because of the current
limitations of `raft::server_impl::abort` described above and in [4].

References:
[1] 4c8dba1
[2] See the description of `raft::state_machine` in `raft/raft.hh`.
[3] See `server_impl::applier_fiber` in `raft/server.cc`.
[4] SCYLLADB-1056
2026-04-09 11:36:51 +02:00
Dawid Mędrek
ad8a263683 service: strong_consistency: Abort ongoing operations when shutting down
These changes are complementary to those from a recent commit where we
handled aborting ongoing operations during tablet events, such as
tablet migration. In this commit, we consider the case of shutting down
a node.

When a node is shutting down, we eventually close the connections. When
the client can no longer get a response from the server, it makes no
sense to continue with the queries. We'd like to cancel them at that
point.

We leverage the abort source passed via `client_state` down to
the strongly consistent coordinator. This way, the transport layer
can communicate with it and signal that the queries should be
canceled.
The abort source is triggered by the CQL server (cf.
`generic_server::server::{stop,shutdown}`).

---

Note that this is not an optional change. In fact, if we don't abort
those requests, we might hang for an indefinite amount of time when
executing the following code in `main.cc`:

```
// Register at_exit last, so that storage_service::drain_on_shutdown will be called first
auto do_drain = defer_verbose_shutdown("local storage", [&ss] {
    ss.local().drain_on_shutdown().get();
});
```

The problem boils down to the fact that `generic_server::server::stop`
will wait for all connections to be closed, but that won't happen until
all ongoing operations (at least those to strongly consistent tables)
are finished.

It's important to highlight that even though we would hang here, the
client can no longer get any response. Thus, it's crucial that at
that point we simply abort the ongoing operations so that the rest
of the shutdown can proceed.

---

Two tests are added to verify that the implementation is correct:
one focusing on local operations, the other -- on a forwarded write.

Difference in time spent on the whole test file
`test_strong_consistency.py` on my local machine, in dev mode:

Before:
```
real    0m31.775s
user    1m4.475s
sys     0m22.615s
```

After:
```
real    0m32.024s
user    1m10.751s
sys     0m23.871s
```

Individual runs of the added tests:

test_queries_when_shutting_down:
```
real    0m12.818s
user    0m36.726s
sys     0m4.577s
```

test_abort_forwarded_write_upon_shutdown:
```
real    0m12.930s
user    0m36.622s
sys     0m4.752s
```
2026-04-09 11:36:17 +02:00
Dawid Mędrek
4a87bdc778 service: client_state: Extend with abort_source
We make `client_state` store a pointer to an `abort_source`. This will
be useful in the following commit that will implement aborting ongoing
requests to strongly consistent tables upon connection shutdowns.
It might also be useful in some other places in the code in the future.

We set the abort source for client states in relevant places.
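A minimal sketch of the idea, with hypothetical stand-in types (the
real `client_state` and `seastar::abort_source` are richer):

```cpp
#include <atomic>
#include <cassert>

// Stand-in for seastar::abort_source: just an abort flag here.
struct abort_source_stub {
    std::atomic<bool> aborted{false};
    void request_abort() { aborted.store(true); }
    bool abort_requested() const { return aborted.load(); }
};

// Hypothetical slice of client_state: it stores a *pointer* so that
// callers without a CQL connection behind them can leave it unset.
struct client_state_stub {
    abort_source_stub* as = nullptr;  // set by the transport layer
};

// Coordinator-side check: honor the abort source if one was provided.
bool should_cancel(const client_state_stub& cs) {
    return cs.as != nullptr && cs.as->abort_requested();
}
```

Storing a pointer (rather than a reference) keeps the abort source
optional for code paths that have no connection to shut down.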
2026-04-09 11:35:35 +02:00
Dawid Mędrek
89c049b889 service: strong_consistency: Handle abort when removing Raft group
When a strongly consistent Raft group is being removed, it means one of
the following cases:

(A) The node is shutting down and it's simply part of the shutdown
    procedure.

(B) The tablet is somehow leaving the replica. For example, due to:
    - Tablet migration
    - Tablet split/merge
    - Tablet removal (e.g. because the table is dropped)

In this commit, we focus on case (B). Case (A) will be handled in the
following one.

---

There are almost no changes in the code, and there's a reason for it.

First, let's note that we've already implemented aborting timed-out
requests. There is a limit to how long a query can run, and sooner
or later it will finish, regardless of what we do.

Second, we need to ask ourselves whether the case we're considering
in this commit (i.e. case (B)) is a situation where we'd like to
speed up the process. The answer is no.

Tablet migrations are effectively internal operations that are invisible
to the users. User requests are, quite obviously, the opposite of that.
Because of that, we want to patiently wait for the queries to finish
or time out, even though it's technically possible to trigger an
abort earlier.

Lastly, the changes in the code that actually appear in this commit are
not completely irrelevant either. We consider the important case of
the `leader_info_updater` fiber and argue that it's safe to not pass
any abort source to the Raft methods used by it.

---

Unfortunately, we don't have tablet migrations implemented yet [1],
so our testing capabilities are limited. Still, we provide a new test
that corresponds to case (B) described above. We simulate a tablet
migration by dropping a table and observe how reads and writes behave
in such a situation. There's no extremely careful validation involved,
but that's the best we can do for the time being.

Difference in time spent on the whole test file
`test_strong_consistency.py` on my local machine, in dev mode:

Before:
```
real  0m30.841s
user  1m3.294s
sys   0m21.091s
```

After:
```
real    0m31.775s
user    1m4.475s
sys     0m22.615s
```

The time spent on the new test only:
```
real    0m5.264s
user    0m34.646s
sys     0m3.374s
```

References:
[1] SCYLLADB-868
2026-04-09 11:35:31 +02:00
Dawid Mędrek
7dcc3e85b9 service: strong_consistency: Abort Raft operations on timeout
If a query, either a write, or a read to a strongly consistent table,
times out, we immediately abort the operation and throw an exception.

Unfortunately, due to the inconsistency in the exception types
thrown on timeout by the many methods we use in the code, this
results in pretty messy `try-catch` clauses. Perhaps there's a
better alternative, but it's beyond the scope of this work, so we
leave it as-is.

We provide a validation test that consists of three cases corresponding
to reads, writes, and waiting for the leader. They verify that the code
works as expected in all affected places.

A comparison of time spent on the whole `test_strong_consistency.py` on
my local machine, in dev mode:

Before:
```
real    0m32.185s
user    0m55.391s
sys     0m15.745s
```

After:
```
real  0m30.841s
user  1m3.294s
sys   0m21.091s
```

The time spent on the new test only:
```
real  0m7.077s
user  0m35.359s
sys   0m3.717s
```
2026-04-09 11:35:04 +02:00
Piotr Szymaniak
65a1bdd368 docs: document Alternator auditing in the operator-facing auditing guide
- Document Alternator (DynamoDB-compatible API) auditing support in
  the operator-facing auditing guide (docs/operating-scylla/security/auditing.rst)
- Cover operation-to-category mapping, operation field format,
  keyspace/table filtering, and audit log examples
- Document the audit_tables=alternator.<table> shorthand format
- Minor wording improvements throughout (Scylla -> ScyllaDB,
  clarify default audit backend)

Closes scylladb/scylladb#29231
2026-04-09 12:26:57 +03:00
Dawid Mędrek
2243e0ffea service: strong_consistency: Use timeout when mutating
We remove the inconsistency between reads and writes to strongly
consistent tables. Before the commit, only reads used a timeout.
Now, writes do as well.

Although the parameter isn't used yet, that will change in the following
commit. This is a prerequisite for it.
2026-04-09 11:25:57 +02:00
Dawid Mędrek
fd9c907be1 service: strong_consistency: Fix indentation 2026-04-09 11:25:57 +02:00
Dawid Mędrek
ca7f24516e service: strong_consistency: Enclose coordinator methods with try-catch
We enclose `coordinator::{mutate,query}` with `try-catch` clauses. They
do nothing at the moment, but we'll use them later. We do this now to
avoid noise in the upcoming commits.

We'll fix the indentation in the following commit.
2026-04-09 11:25:57 +02:00
Dawid Mędrek
e9ea9e7259 service: strong_consistency: Crash at unexpected exception
The loop shouldn't throw any exception other than the ones already
covered by the `catch` clauses. Crash, at least when
`abort_on_internal_error` is set, if we catch any other type, since
that may be a sign of a bug.
2026-04-09 11:25:57 +02:00
Dawid Mędrek
f499a629ab test: cluster: Extract default config & cmdline in test_strong_consistency.py
All used configs and cmdlines share the same values. Let's extract them
to avoid repeating them every time a new test is written. Those options
should be enabled for all tests in the file anyway.
2026-04-09 11:25:57 +02:00
Geoff Montee
7d7ec7025e docs: Document system keyspaces for developers / internal usage
Fixes #29043 with the following docs changes:

- docs/dev/system-keyspaces.md: Added a new file that documents all keyspaces created internally

Closes scylladb/scylladb#29044
2026-04-09 11:49:58 +03:00
Guy Shtub
40a861016a docs/faq.rst: Fixing small spelling mistake
Closes scylladb/scylladb#29131
2026-04-09 11:48:46 +03:00
Pavel Emelyanov
78f5bab7cf table: Add formatter for group_id argument in tablet merge exception message
Fixes: SCYLLADB-1432

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29143
2026-04-09 11:45:57 +03:00
Botond Dénes
fbbe2bdce8 Merge 'Introduce repair_service::config and cut dependency from db::config' from Pavel Emelyanov
Spreading db::config around and making all services depend on it is not nice. Most other services that need configuration provide their own config that's populated from db::config in main.cc/cql_test_env.cc and use it instead of the global config.

This PR does the same for repair_service.

This only improves component dependencies, so it is not being backported.

Closes scylladb/scylladb#29153

* github.com:scylladb/scylladb:
  repair: Remove db/config.hh from repair/*.cc files
  repair: Move repair_multishard_reader options onto repair_service::config
  repair: Move critical_disk_utilization_level onto repair_service::config
  repair: Move repair_partition_count_estimation_ratio onto repair_service::config
  repair: Move repair_hints_batchlog_flush_cache_time_in_ms onto repair_service::config
  repair: Move enable_small_table_optimization_for_rbno onto repair_service::config
  repair: Introduce repair_service::config
2026-04-09 11:44:25 +03:00
Botond Dénes
76c8794f4f Merge 'Strong consistency: allow taking snapshots (but not transfer) and make them less likely' from Piotr Dulikowski
While working on benchmarks for strong consistency we noticed that the raft logic attempted to take snapshots during the benchmark. Snapshot transfer is not implemented for strong consistency yet and the methods that take or transfer snapshots throw exceptions. This causes the raft groups to stop working completely.

While implementing snapshot transfers is out of scope, we can implement some mitigations now to stop the tests from breaking:

- The first commit adjusts the configuration options. First, it disables periodic snapshotting (i.e. creating a snapshot every X log entries). Second, it increases the memory threshold for the raft log before which a snapshot is created from 2MB to 10MB.
- The second commit relaxes the take snapshot / drop snapshot methods and makes it possible to actually use them - they are no-ops. It is still forbidden to transfer snapshots.

I am including both commits because applying only the first one didn't completely prevent the issue from occurring when testing locally.

Refs: SCYLLADB-1115

Strong consistency is experimental, no need for backport.

Closes scylladb/scylladb#29189

* github.com:scylladb/scylladb:
  strong_consistency: fake taking and dropping snapshots
  strong_consistency: adjust limits for snapshots
2026-04-09 11:44:03 +03:00
Anna Stuchlik
dd34d2afb4 doc: remove references to old versions from Docker Hub docs
This commit removes references to ScyllaDB versions ("Since x.y")
from the ScyllaDB documentation on Docker Hub, as they are
redundant and confusing (some versions are ancient).

Fixes SCYLLADB-1212

Closes scylladb/scylladb#29204
2026-04-09 11:43:40 +03:00
Botond Dénes
c162277b28 Merge 'Perform full connection set-up for CertificateAuthorization in process_startup()' from Pavel Emelyanov
The code responds early with the READY message, but lacks some necessary set-up, namely:

* update_scheduling_group(): without it, the connection runs under the default scheduling group instead of the one mapped to the user's service level.

* on_connection_ready(): without it, the connection never releases its slot in the uninitialized-connections concurrency semaphore (acquired at connection creation), leaking one unit per cert-authenticated connection for the lifetime of the connection.

* _authenticating = false / _ready = true: without them, system.clients reports connection_stage = AUTHENTICATING forever instead of READY (not critical, but not nice either)

The PR fixes it and adds a regression test that (for sanity) also covers the AllowAll and Password authenticators.

Fixes SCYLLADB-1226

Present since 2025.1, probably worth backporting

Closes scylladb/scylladb#29220

* github.com:scylladb/scylladb:
  transport: fix process_startup cert-auth path missing connection-ready setup
  transport: test that connection_stage is READY after auth via all process_startup paths
2026-04-09 11:43:02 +03:00
Raphael S. Carvalho
16e387d5f9 repair/replica: Fix race window where post-repair data is wrongly promoted to repaired
During incremental repair, each tablet replica holds three SSTable views:
UNREPAIRED, REPAIRING, and REPAIRED.  The repair lifecycle is:

  1. Replicas snapshot unrepaired SSTables and mark them REPAIRING.
  2. Row-level repair streams missing rows between replicas.
  3. mark_sstable_as_repaired() runs on all replicas, rewriting the
     SSTables with repaired_at = sstables_repaired_at + 1 (e.g. N+1).
  4. The coordinator atomically commits sstables_repaired_at=N+1 and
     the end_repair stage to Raft, then broadcasts
     repair_update_compaction_ctrl which calls clear_being_repaired().
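The repaired_at bookkeeping these steps rely on can be sketched as
follows (hypothetical helpers; the real classifier is more involved):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <initializer_list>

// Compaction preserves the maximum repaired_at of its inputs
// (repaired_at == 0 means "never repaired").
uint64_t compaction_output_repaired_at(std::initializer_list<uint64_t> inputs) {
    return std::max(inputs);
}

// Simplified classifier: an SSTable counts as REPAIRED once its
// repaired_at is covered by the Raft-committed sstables_repaired_at.
bool is_repaired(uint64_t sstables_repaired_at, uint64_t sst_repaired_at) {
    return sst_repaired_at != 0 && sst_repaired_at <= sstables_repaired_at;
}
```

During the window where Raft still holds N, a rewritten SSTable with
repaired_at = N+1 is classified UNREPAIRED; if minor compaction merges
it with a fresh flush (repaired_at = 0), the output keeps N+1 and is
promoted the moment step 4 commits N+1.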

The bug lives in the window between steps 3 and 4.  After step 3, each
replica has on-disk SSTables with repaired_at=N+1, but sstables_repaired_at
in Raft is still N.  The classifier therefore sees:

  is_repaired(N, sst{repaired_at=N+1}) == false
  sst->being_repaired == null   (lost on restart, or not yet set)

and puts them in the UNREPAIRED view.  If a new write arrives and is
flushed (repaired_at=0), STCS minor compaction can fire immediately and
merge the two SSTables.  The output gets repaired_at = max(N+1, 0) = N+1
because compaction preserves the maximum repaired_at of its inputs.

Once step 4 commits sstables_repaired_at=N+1, the compacted output is
classified REPAIRED on the affected replica even though it contains data
that was never part of the repair scan.  Other replicas, which did not
experience this compaction, classify the same rows as UNREPAIRED.  This
divergence is never healed by future repairs because the repaired set is
considered authoritative.  The result is data resurrection: deleted rows
can reappear after the next compaction that merges unrepaired data with the
wrongly-promoted repaired SSTable.

The fix has two layers:

Layer 1 (in-memory, fast path): mark_sstable_as_repaired() now also calls
mark_as_being_repaired(session) on the new SSTables it writes.  This keeps
them in the REPAIRING view from the moment they are created until
repair_update_compaction_ctrl clears the flag after step 4, covering the
race window in the normal (no-restart) case.

Layer 2 (durable, restart-safe): a new is_being_repaired() helper on
tablet_storage_group_manager detects the race window even after a node
restart, when being_repaired has been lost from memory.  It checks:

  sst.repaired_at == sstables_repaired_at + 1
  AND tablet transition kind == tablet_transition_kind::repair

Both conditions survive restarts: repaired_at is on-disk in SSTable
metadata, and the tablet transition is persisted in Raft.  Once the
coordinator commits sstables_repaired_at=N+1 (step 4), is_repaired()
returns true and the SSTable naturally moves to the REPAIRED view.
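The durable check can be sketched as follows (a simplified stand-in,
not the actual signature of the helper):

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-in for the tablet transition state kept in Raft.
enum class tablet_transition_kind { none, migration, repair };

// Durable race-window detector: both inputs survive a restart, since
// repaired_at lives in SSTable metadata on disk and the tablet
// transition is persisted in Raft.
bool is_being_repaired(uint64_t sst_repaired_at,
                       uint64_t sstables_repaired_at,
                       tablet_transition_kind kind) {
    return sst_repaired_at == sstables_repaired_at + 1
        && kind == tablet_transition_kind::repair;
}
```

Once the coordinator commits the new sstables_repaired_at, the first
condition stops holding and the SSTable falls through to the normal
is_repaired() classification.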

The classifier in make_repair_sstable_classifier_func() is updated to call
is_being_repaired(sst, sstables_repaired_at) in place of the previous
sst->being_repaired.uuid().is_null() check.

A new test, test_incremental_repair_race_window_promotes_unrepaired_data,
reproduces the bug by:
  - Running repair round 1 to establish sstables_repaired_at=1.
  - Injecting delay_end_repair_update to hold the race window open.
  - Running repair round 2 so all replicas complete mark_sstable_as_repaired
    (repaired_at=2) but the coordinator has not yet committed step 4.
  - Writing post-repair keys to all replicas and flushing servers[1] to
    create an SSTable with repaired_at=0 on disk.
  - Restarting servers[1] so being_repaired is lost from memory.
  - Waiting for autocompaction to merge the two SSTables on servers[1].
  - Asserting that the merged SSTable contains post-repair keys (the bug)
    and that servers[0] and servers[2] do not see those keys as repaired.

NOTE FOR MAINTAINER: Copilot initially only implemented Layer 1 (the
in-memory being_repaired guard), missing the restart scenario entirely.
I pointed out that being_repaired is lost on restart and guided Copilot
to add the durable Layer 2 check.  I also polished the implementation:
moving is_being_repaired into tablet_storage_group_manager so it can
reuse the already-held _tablet_map (avoiding an ERM lookup and try/catch),
passing sstables_repaired_at in from the classifier to avoid re-reading it,
and using compaction_group_for_sstable inside the function rather than
threading a tablet_id parameter through the classifier.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1239.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29244
2026-04-09 11:42:28 +03:00
Dawid Mędrek
a8bc90a375 Merge 'cql3: fix DESCRIBE INDEX WITH INTERNALS name' from Piotr Smaron
This series fixes two related inconsistencies around secondary-index
names.
1. `DESCRIBE INDEX ... WITH INTERNALS` returned the backing
   materialized-view name in the `name` column instead of the logical
   index name.
2. The snapshot REST API accepted backing table names for MV-backed
   secondary indexes, but not the logical index names exposed to users.

The snapshot side now resolves logical secondary-index names to backing
table names where applicable, reports logical index names in snapshot
details, rejects vector index names with HTTP 400, and keeps multi-keyspace
DELETE atomic by resolving all keyspaces before deleting anything.
The tests were also extended accordingly, and the snapshot test helper
was fixed to clean up multi-table snapshots using one DELETE per table.

Fixes: SCYLLADB-1122

Minor bugfix, no need to backport.

Closes scylladb/scylladb#29083

* github.com:scylladb/scylladb:
  cql3: fix DESCRIBE INDEX WITH INTERNALS name
  test: add snapshot REST API tests for logical index names
  test: fix snapshot cleanup helper
  api: clarify snapshot REST parameter descriptions
  api: surface no_such_column_family as HTTP 400
  db: fix clear_snapshot() atomicity and use C++23 lambda form
  db: normalize index names in get_snapshot_details()
  db: add resolve_table_name() to snapshot_ctl
2026-04-09 08:37:51 +03:00
Piotr Dulikowski
ec0231c36c Merge 'db/view/view_building_worker: lock staging sstables mutex for all necessary shards when creating tasks' from Michał Jadwiszczak
To create `process_staging` view building tasks, we first need to collect information about them on shard0, create the necessary mutations, commit them to group0, and move the staging sstable objects to their original shards.

But there is a possible race after committing the group0 command and before moving the staging sstables to their shards. Between those two events, the coordinator may schedule the freshly created tasks and dispatch them to the worker, but the worker won't have the sstable objects because they haven't been moved yet.

This patch fixes the race by holding the `_staging_sstables_mutex` locks of all necessary shards while executing `create_staging_sstable_tasks()`. With this, even if a task is scheduled and dispatched quickly, the worker will wait to execute it until the sstable objects are moved and the locks are released.

Fixes SCYLLADB-816

This PR should be backported to all versions containing view building coordinator (2025.4 and newer).

Closes scylladb/scylladb#29174

* github.com:scylladb/scylladb:
  db/view/view_building_worker: fix indentation
  db/view/view_building_worker: lock staging sstables mutex for necessary shards when creating tasks
2026-04-09 08:37:51 +03:00
Piotr Smaron
d458ff50b0 cql3: fix DESCRIBE INDEX WITH INTERNALS name
DESCRIBE INDEX ... WITH INTERNALS returned the name of
the backing materialized view in the name column instead
of the logical index name.

Return the logical index name from schema::describe()
for index schemas so all callers observe the
user-facing name consistently.

Fixes: SCYLLADB-1122
2026-04-08 13:38:17 +02:00
Piotr Smaron
04837ba20f test: add snapshot REST API tests for logical index names
Add focused REST coverage for logical secondary-index
names in snapshot creation, deletion, and details
output.

Also cover vector-index rejection and verify
multi-keyspace delete resolves all keyspaces before
deleting anything so mixed index kinds cannot cause
partial removal.
2026-04-08 13:38:17 +02:00
Piotr Smaron
6b85da3ce3 test: fix snapshot cleanup helper
The snapshot REST helper cleaned up multi-table
snapshots with a single DELETE request that passed a
comma-separated cf filter, but the API accepts only one
table name there.

Delete each table snapshot separately so existing tests
that snapshot multiple tables use the API as
documented.
2026-04-08 13:36:27 +02:00
Piotr Smaron
3090684dad api: clarify snapshot REST parameter descriptions
Document the current /storage_service/snapshots behavior
more accurately.

For DELETE, cf is a table filter applied independently
in each keyspace listed in kn. If cf is omitted or
empty, snapshots for all tables are eligible, and
secondary indexes can be addressed by their logical
index name.
2026-04-08 13:36:27 +02:00
Piotr Smaron
6ee75c74bd api: surface no_such_column_family as HTTP 400
Snapshot requests that name a non-existent table or a
non-snapshotable logical index currently surface an
internal server error.

Translate no_such_column_family into a bad request so
callers get a client-facing error that matches the
invalid input.
2026-04-08 13:36:27 +02:00
Piotr Smaron
7d83a264ac db: fix clear_snapshot() atomicity and use C++23 lambda form
clear_snapshot() applies a table filter independently in
each keyspace, so logical index names must be resolved
per keyspace on the delete path as well.

Resolve all keyspaces before deleting anything so a later
failure cannot partially remove a snapshot, and use the
explicit-object-parameter coroutine lambda form for the
asynchronous implementation.
2026-04-08 13:36:27 +02:00
Piotr Smaron
39baa1870e db: normalize index names in get_snapshot_details()
Snapshot details exposed backing secondary-index view
names instead of logical index names.

Normalize index entries in get_snapshot_details() so the
REST API reports the user-facing name, and update the
existing REST test to assert that behavior directly.
2026-04-08 13:36:27 +02:00
Piotr Smaron
9c37f1def2 db: add resolve_table_name() to snapshot_ctl
The snapshot REST API accepted backing secondary-index
table names, but not logical index names.

Introduce resolve_table_name() so snapshot creation can
translate a logical index name to the backing table when
the index is materialized as a view.
2026-04-08 13:36:27 +02:00
Petr Gusev
7750d5737c strong consistency: replace local consistency with global
Currently we don't support 'local' consistency, which would
imply maintaining a separate raft group for each DC. What we
support is actually 'global' consistency -- one raft group
per tablet replica set. We don't plan to support local
consistency for the first GA.

Closes scylladb/scylladb#29221
2026-04-08 12:52:32 +02:00
Patryk Jędrzejczak
850db950f8 Merge 'raft: include demoted voters in read barrier during joint config' from Qian Cheng
Hi, thanks for Scylla!

We found a small issue in tracker::set_configuration() during joint consensus and put together a fix.

When a server is demoted from voter to non-voter, set_configuration processes the current config first (can_vote=false), then the previous config. But when it finds the server already in the progress map (tracker.cc:118), it hits `continue` without updating can_vote. So the server's follower_progress::can_vote stays false even though it's still a voter in the previous config.

This causes broadcast_read_quorum (fsm.cc:1055) to skip the demoted server, reducing the pool of responders. Since committed() correctly includes the server in _previous_voters for quorum calculation, read barriers can stall if other servers are slow.

The fix is to use configuration::can_vote() in tracker::set_configuration.
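A simplified model of the fix (hypothetical types; the real tracker
carries much more per-follower state):

```cpp
#include <cassert>
#include <map>
#include <string>

using server_id = std::string;

struct follower_progress { bool can_vote = false; };

// A joint configuration: for each half, server id -> is it a voter.
// (Stand-in for raft::configuration.)
struct joint_config {
    std::map<server_id, bool> current;
    std::map<server_id, bool> previous;
    // Mirrors configuration::can_vote(): a server can vote if it is a
    // voter in either half of the joint configuration.
    bool can_vote(const server_id& id) const {
        auto c = current.find(id);
        auto p = previous.find(id);
        return (c != current.end() && c->second)
            || (p != previous.end() && p->second);
    }
};

// Sketch of the fixed tracker::set_configuration: can_vote is derived
// from the joint config up front, so the `continue` that skips servers
// already seen while processing the current config can no longer leave
// a demoted voter with can_vote == false.
std::map<server_id, follower_progress> set_configuration(const joint_config& cfg) {
    std::map<server_id, follower_progress> tracker;
    for (const auto& e : cfg.current) {
        tracker[e.first].can_vote = cfg.can_vote(e.first);
    }
    for (const auto& e : cfg.previous) {
        if (tracker.count(e.first)) {
            continue;  // already present; can_vote is already correct
        }
        tracker[e.first].can_vote = cfg.can_vote(e.first);
    }
    return tracker;
}
```

With the original per-config flag, a server demoted in the current
config would be entered with can_vote == false and the `continue`
would never correct it.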

We included a reproduction unit test (test_tracker_voter_demotion_joint_config) that extracts the set_configuration algorithm and demonstrates the mismatch. We weren't able to build the full Scylla test suite to add an in-tree test, so we kept it as a standalone file for reference.

No backport: the bug is non-critical and the change needs some soak time in master.

Closes scylladb/scylladb#29226

* https://github.com/scylladb/scylladb:
  fix: use is_voter::yes instead of true in test assertions
  test: add tracker voter demotion test to fsm_test.cc
  fix: use configuration::can_vote() in tracker::set_configuration
2026-04-08 12:37:27 +02:00
Qian-Cheng-nju
a416238155 test: add tracker voter demotion test to fsm_test.cc 2026-04-08 12:37:19 +02:00
Qian-Cheng-nju
f72528c759 raft: use configuration::can_vote() in tracker::set_configuration 2026-04-08 12:37:16 +02:00
Michał Jadwiszczak
568f20396a test: fix flaky test_create_index_synchronous_updates trace event race
The test_create_index_synchronous_updates test in test_secondary_index_properties.py
was intermittently failing with 'assert found_wanted_trace' because the expected
trace event 'Forcing ... view update to be synchronous' was missing from the
trace events returned by get_query_trace().

Root cause: trace events are written asynchronously to system_traces.events.
The Python driver's populate() method considers a trace complete once the
session row in system_traces.sessions has duration IS NOT NULL, then reads
events exactly once. Since the session row and event rows are written as
separate mutations with no transactional guarantee, the driver can read an
incomplete set of events.

Evidence from the failed CI run logs:
- The entire test (CREATE TABLE through DROP TABLE) completed in ~300ms
  (01:38:54,859 - 01:38:55,157)
- The INSERT with tracing happened in a ~50ms window between the second
  CREATE INDEX completing (01:38:55,108) and DROP TABLE starting
  (01:38:55,157)
- The 'Forcing ... synchronous' trace message is generated during the
  INSERT write path (db/view/view.cc:2061), so it was produced, but
  not yet flushed to system_traces.events when the driver read them
- This matches the known limitation documented in test/alternator/
  test_tracing.py: 'we have no way to know whether the tracing events
  returned is the entire trace'

Fix: replace the single-shot trace.events read with a retry loop that
directly queries system_traces.events until the expected event appears
(with a 30s timeout). Use ConsistencyLevel.ONE since system_traces has
RF=2 and cqlpy tests run on a single-node cluster.
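The retry pattern, abstracted away from CQL (a sketch; the real loop
polls system_traces.events and sleeps between attempts up to the 30s
deadline, while attempts here are abstract to stay deterministic):

```cpp
#include <cassert>
#include <functional>

// Poll `found` until it succeeds or we run out of attempts.
bool retry_until(const std::function<bool()>& found, int max_attempts) {
    for (int i = 0; i < max_attempts; ++i) {
        if (found()) {
            return true;   // the expected trace event finally appeared
        }
    }
    return false;          // "timed out": the caller should fail loudly
}
```

The point is that a single-shot read of an eventually consistent table
becomes a bounded poll, which tolerates the asynchronous event writes.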

The same race condition pattern exists in test_mv_synchronous_updates in
test_materialized_view.py (which this test was modeled after), so the
same fix is proactively applied there as well.

Fixes SCYLLADB-1314

Closes scylladb/scylladb#29374
2026-04-08 12:35:10 +02:00
Raphael S. Carvalho
f941a77867 scripts/base36-uuid: dump date in UTC
Previously, the timestamp decoded from a timeuuid was printed using the
local timezone via datetime.fromtimestamp(), which produces different
output depending on the machine's locale settings.

ScyllaDB logs are emitted in UTC by default. Printing the decoded date
in UTC makes it straightforward to correlate SSTable identifiers with
log entries without having to mentally convert timezones.

Also fix the embedded pytest assertion, which was accidentally correct
only on machines in UTC+8 — it now uses an explicit UTC-aware datetime.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29253
2026-04-08 12:19:55 +03:00
Yaniv Michael Kaul
c385c0bdf9 .github/workflows/call_validate_pr_author_email.yml: add missing workflow permissions
Add explicit permissions block (contents: read, pull-requests: write,
statuses: write) matching the requirements of the called reusable
workflow which checks out code, posts PR comments, and sets commit
statuses. Fixes code scanning alert #172.

Closes scylladb/scylladb#29183
2026-04-08 12:19:55 +03:00
Pavel Emelyanov
788ecaa682 api: Fix enable_injection to accept case-insensitive bool parameter
Replace strict case-sensitive '== "True"' check with strcasecmp(..., "true")
so that Python's str(True) -> "True" is properly recognized. Accepts any
case variation of "true" ("True", "TRUE", etc.), with empty string
defaulting to false.
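A portable sketch of the accepted parsing behavior (hypothetical
helper name; handling of strings other than case variations of "true"
is simplified to false here):

```cpp
#include <cassert>
#include <cctype>
#include <string>

// Parse an HTTP query parameter into a bool: any case variation of
// "true" is true, an empty string defaults to false.
bool param_to_bool(const std::string& param) {
    if (param.empty()) {
        return false;                 // empty string defaults to false
    }
    static const std::string t = "true";
    if (param.size() != t.size()) {
        return false;
    }
    for (size_t i = 0; i < t.size(); ++i) {
        // cast to unsigned char before tolower to avoid UB on negative chars
        if (std::tolower(static_cast<unsigned char>(param[i])) != t[i]) {
            return false;
        }
    }
    return true;                      // "true", "True", "TRUE", ...
}
```

This accepts Python's str(True) -> "True" while remaining
case-insensitive, matching the strcasecmp-based check described above.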

Maintains backward compatibility with out-of-tree tests that rely on
Python's bool stringification.

The goal is to reduce the number of distinct ways API handlers use to
convert string HTTP query parameters into bool variables. This was the
only place that simply compared the parameter to "True".

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29236
2026-04-08 12:19:55 +03:00
Avi Kivity
0fd9ea9701 abseil: update to lts_2026_01_07
Switch to branch lts_2026_01_07, which is exactly equal to
upstream now.

There were no notable changes in the release notes, but the
new versions are more friendly to newer compilers (specifically,
in include hygiene).

configure.py needs a few library updates; cmake works without
change.

scylla-gdb.py updated for new hash table layout (by Claude Opus 4.6).

* abseil d7aaad83...255c84da (1179):
  > Abseil LTS branch, Jan 2026, Patch 1 (#2007)
  > Cherry-picks for LTS 20260107 (#1990)
  > Apply LTS transformations for 20260107 LTS branch (#1989)
  > Mark legacy Mutex methods and MutexLock pointer constructors as deprecated
  > `cleanup`: specify that it's safe to use the class in a signal handler.
  > Suppress bugprone-use-after-move in benign cases
  > StrFormat: format scientific notation without heap allocation
  > Introduce a legacy copy of GetDebugStackTraceHook API.
  > Report 1ns instead of 0ns for probe_benchmarks. Some tools incorrectly assume that benchmark was not run if 0ns reported.
  > Add absl::chunked_queue
  > `CRC32` version of `CombineContiguous` for length <= 32.
  > Add `absl::down_cast`
  > Fix FixedArray iterator constructor, which should require input_iterator, not forward_iterator
  > Add a latency benchmark for hashing a pair of integers.
  > Delete absl::strings_internal::STLStringReserveAmortized()
  > As IsAtLeastInputIterator helper
  > Use StringAppendAndOverwrite() in CEscapeAndAppendInternal()
  > Add support for absl::(u)int128 in FastIntToBuffer()
  > absl/strings: Prepare helper for printing objects to string representations.
  > Use SimpleAtob() for parsing bool flags
  > No-op changes to relative timeout support code.
  > Adjust visibility of heterogeneous_lookup_testing.h
  > Remove -DUNORDERED_SET_CXX17 since the macro no longer exists
  > [log] Prepare helper for streaming container contents to strings.
  > Restrict the visibility of some internal testing utilities
  > Add absl::linked_hash_set and absl::linked_hash_map
  > [meta] Add constexpr testing helper.
  > BUILD file reformatting.
  > `absl/meta`: Add C++17 port of C++20 `requires` expression for internal use
  > Remove the implementation of `absl::string_view`, which was only needed prior to C++17. `absl::string_view` is now an alias for `std::string_view`. It is recommended that clients simply use `std::string_view`.
  > No public description
  > absl:flags: Stop echoing file content in flagfile parsing errors Modified ArgsList::ReadFromFlagfile to redact the content of unexpected lines from error messages.
  > Refactor the declaration of `raw_hash_set`/`btree` to omit default template parameters from the subclasses.
  > Import of CCTZ from GitHub.
  > Add ABSL_ATTRIBUTE_LIFETIME_BOUND to Flag help generator
  > Correct `Mix4x16Vectors` comment.
  > Special implementation for string hash with sizes greater than 64.
  > Reorder function parameters so that hash state is the first argument.
  > Search more aggressively for open slots in absl::internal_stacktrace::BorrowedFixupBuffer
  > Implement SpinLockHolder in terms of std::lock_guard.
  > No public description
  > Avoid discarding test matchers.
  > Import of CCTZ from GitHub.
  > Automated rollback of commit 9f40d6d6f3cfc1fb0325dd8637eb65f8299a4b00.
  > Enable clang-specific warnings on the clang-cl build instead of just trying to be MSVC
  > Enable clang-specific warnings on the clang-cl build instead of just trying to be MSVC
  > Make AnyInvocable remember more information
  > Add further diagnostics under clang for string_view(nullptr)
  > Import of CCTZ from GitHub.
  > Document the differing trimming behavior of absl::Span::subspan() and std::span::subspan()
  > Special implementation for string hash with sizes in range [33, 64].
  > Add the deleted string_view(std::nullptr_t) constructor from C++23
  > CI: Use a cached copy of GoogleTest in CMake builds if possible to minimize the possibility of errors downloading from GitHub
  > CI: Enable libc++ hardening in the ASAN build for even more checks https://libcxx.llvm.org/Hardening.html
  > Call the common case of AllocateBackingArray directly instead of through the function pointer.
  > Change AlignedType to have a void* array member so that swisstable backing arrays end up in the pointer-containing partition for heap partitioning.
  > base: Discourage use of ABSL_ATTRIBUTE_PACKED
  > Revert: Add an attribute to HashtablezInfo which performs a bitwise XOR on all hashes. The purposes of this attribute is to identify if identical hash tables are being created. If we see a large number of identical tables, it's likely the code can be improved by using a common table as opposed to keep rebuilding the same one.
  > Import of CCTZ from GitHub.
  > Record insert misses in hashtable profiling.
  > Add absl::StatusCodeToStringView.
  > Add a missing dependency on str_format that was being pulled in transitively
  > Pico-optimize `SkipWhitespace` to use `StripLeadingAsciiWhitespace`.
  > absl::string_view: Upgrade the debug assert on the single argument char* constructor to ABSL_HARDENING_ASSERT
  > Use non-stack storage for stack trace buffers
  > Fixed incorrect include for ABSL_NAMESPACE_BEGIN
  > Add ABSL_REFACTOR_INLINE to separate the inliner directive from the deprecated directive so that we can give users a custom deprecation message.
  > Reduce stack usage when unwinding without fixups
  > Reduce stack usage when unwinding from 170 to 128 on x64
  > Rename RecordInsert -> RecordInsertMiss.
  > PR #1968: Use std::move_backward within InlinedVector's Storage::Insert
  > Use the new absl::StringResizeAndOverwrite() in CUnescape()
  > Explicitly instantiate common `raw_hash_set` backing array functions.
  > Rollback reduction of maximum load factor. Now it is back to 28/32.
  > Export Mutex::Dtor from shared libraries in NDEBUG mode
  > Allow `IsOkAndHolds` to rely on duck typing for matching `StatusOr` like types instead of uniquely `absl::StatusOr`, e.g. `google::cloud::StatusOr`.
  > Fix typo in macro and add missing static_cast for WASM builds.
  > windows(cmake): add abseil_test_dll to target link libraries when required
  > Handle empty strings in `SimpleAtof` after stripping whitespace
  > Avoid using a thread_local in an inline function since this causes issues on some platforms.
  > (Roll forward) Change Abseil's SpinLock adaptive_spin_count to a class static variable that can be set by tcmalloc friend classes.
  > Change Abseil's SpinLock adaptive_spin_count to a class static variable that can be set by tcmalloc friend classes.
  > Change Abseil's SpinLock adaptive_spin_count to a class static variable that can be set by tcmalloc friend classes.
  > Fixes for String{Resize|Append}AndOverwrite   - StringAppendAndOverwrite() should always call StringResizeAndOverwrite()     with at least capacity() in case the standard library decides to shrink     the buffer (Fixes #1965)   - Small refactor to make the minimum growth an addition for clarity and     to make it easier to test 1.5x growth in the future   - Turn an ABSL_HARDENING_ASSERT into a ThrowStdLengthError   - Add a missing std::move
  > Correct the supported features of Status Matchers
  > absl/time: Use "memory order acquire" for loads, which would allow for the safe removal of the data memory barrier.
  > Use the new absl::StringResizeAndOverwrite() in string escaping utilities
  > Add an internal-only helper StringAppendAndOverwrite() similar to StringResizeAndOverwrite() but optimized for repeated appends, using exponential growth to ensure amortized complexity of increasing a string size by a small amount is O(1).
  > Release `ABSL_EXPECT_OK` and `ABSL_ASSERT_OK`.
  > Fix the CHECK_XX family of macros to not print `char*` arguments as C-strings if the comparison happened as pointers. Printing as pointers is more relevant to the result of the comparison.
  > Rollback StringAppendAndOverwrite() - the problem is that StringResizeAndOverwrite has MSAN testing of the entire string. This causes quadratic MSAN verification on small appends.
  > Add an internal-only helper StringAppendAndOverwrite() similar to StringResizeAndOverwrite() but optimized for repeated appends, using exponential growth to ensure amortized complexity of increasing a string size by a small amount is O(1).
  > PR #1961: Fix Clang warnings on powerpc
  > Use the new absl::StringResizeAndOverwrite() in string escaping utilities
  > Use the new absl::StringResizeAndOverwrite() in string escaping utilities
  > macOS CI: Move the Bazel vendor_dir to ${HOME} to workaround a Bazel issue where it does not work when it is in ${TMP} and also fix the quoting which was causing it to incorrectly receive the argument
  > Use __msan_check_mem_is_initialized for detailed MSan report
  > Optimize stack unwinding by reducing `AddressIsReadable` calls.
  > Add internal API to allow bypassing stack trace fixups when needed
  > absl::StrFormat: improve test coverage with scientific exponent test cases
  > Add throughput and latency benchmarks for `absl::ToDoubleXYZ` functions.
  > CordzInfo: Use absl::NoDestructor to remove a global destructor. Chromium requires no global destructors.
  > string_view: Enable std::view and std::borrowed_range
  > cleanup: s/logging_internal/log_internal/ig for consistency
  > Use the new absl::StringResizeAndOverwrite() in string escaping utilities
  > Use the new absl::StringResizeAndOverwrite() in string escaping utilities
  > Use the new absl::StringResizeAndOverwrite() in absl::AsciiStrTo{Lower|Upper}
  > Use the new absl::StringResizeAndOverwrite() in absl::StrJoin()
  > Use the new absl::StringResizeAndOverwrite() in absl::StrCat()
  > string_view: Fix include order
  > Don't pass nullptr as the 1st arg of `from_chars`
  > absl/types: format code with clang-format.
  > Validate absl::StringResizeAndOverwrite op has written bytes as expected.
  > Skip the ShortStringCollision test on WASM.
  > Rollback `absl/types`: format code with clang-format.
  > Remove usage of the WasmOffsetConverter for Wasm / Emscripten stack-traces.
  > Use the new absl::StringResizeAndOverwrite() in absl::CordCopyToString()
  > Remove an undocumented behavior of --vmodule and absl::SetVLogLevel that could set a module_pattern to defer to the global vlog threshold.
  > Update to rules_cc 0.2.9
  > Avoid redefine warnings with ntstatus constants
  > PR #1944: Use same element-width for non-temporal loads and stores on Arm
  > absl::StringResizeAndOverwrite(): Add the requirement that the only value that can be written to buf[size] is the terminator character.
  > absl/types: format code with clang-format.
  > Minor formatting changes.
  > Remove `IntIdentity` and `PtrIdentity` from `raw_hash_set_probe_benchmark`.
  > Automated rollback of commit cad60580dba861d36ed813564026d9774d9e4e2b.
  > FlagStateInterface implementors need only support being restored once.
  > Clarify the post-condition of `reserve()` in Abseil hash containers.
  > Clarify the post-condition of `reserve()` in Abseil hash containers.
  > Represent dropped samples in hashtable profile.
  > Add lifetimebound to absl::implicit_cast and make it work for rvalue references as it already does with lvalue references
  > Clean up a doc example where we had `absl_nonnull` and `= nullptr;`
  > Change Cordz to synchronize tracked cords with Snapshots / DeleteQueue
  > Minor refactor to `num_threads` in deadlock test
  > Rename VLOG macro parameter to match other uses of this pseudo type.
  > `time`: Fix indentation
  > Automated Code Change
  > Adds `absl::StringResizeAndOverwrite` as a polyfill for C++23's `std::basic_string<CharT,Traits,Allocator>::resize_and_overwrite`
  > Internal-only change
  > absl/time: format code with clang-format.
  > No public description
  > Expose typed releasers of externally appended memory.
  > Fix __declspec support for ABSL_DECLARE_FLAG()
  > Annotate absl::AnyInvocable as an owner type via [[gsl::Owner]] and absl_internal_is_view = std::false_type
  > Annotate absl::FunctionRef as a view type via [[gsl::Pointer]] and absl_internal_is_view
  > Remove unnecessary dep on `core_headers` from the `nullability` cc_library
  > type_traits: Add type_identity and type_traits_t backfills
  > Refactor raw_hash_set range insertion to call private insert_range function.
  > Fix bug in absl::FunctionRef conversions from non-const to const
  > PR #1937: Simplify ConvertSpecialToEmptyAndFullToDeleted
  > Improve absl::FunctionRef compatibility with C++26
  > Add a workaround for unused variable warnings inside of not-taken if-constexpr codepaths in older versions of GCC
  > Annotate ABSL_DIE_IF_NULL's return type with `absl_nonnull`
  > Move insert index computation into `PrepareInsertLarge` in order to reduce inlined part of insert/emplace operations.
  > Automated Code Change
  > PR #1939: Add missing rules_cc loads
  > Expose (internally) a LogMessage constructor taking file as a string_view for (internal, upcoming) FFI integration.
  > Fixed up some #includes in mutex.h
  > Make absl::FunctionRef support non-const callables, aligning it with std::function_ref from C++26
  > Move capacity update in `Grow1To3AndPrepareInsert` after accessing `common.infoz()` to prevent assertion failure in `control()`.
  > Fix check_op(s) compilation failures on gcc 8 which eagerly tries to instantiate std::underlying_type for non-num types.
  > Use `ABSL_ATTRIBUTE_ALWAYS_INLINE` for lambda in `find_or_prepare_insert_large`.
  > Mark the implicit floating operators as constexpr for `absl::int128` and `absl::uint128`
  > PR #1931: raw_hash_set: fix instantiation for recursive types on MSVC with /Zc:__cplusplus
  > Add std::pair specializations for IsOwner and IsView
  > Cast ABSL_MIN_LOG_LEVEL to absl::LogSeverityAtLeast instead of absl::LogSeverity.
  > Fix a corner case in the aarch64 unwinder
  > Fix inconsistent nullability annotation in ReleasableMutexLock
  > Remove support for Native Client
  > Rollback f040e96b93dba46e8ed3ca59c0444cbd6c0a0955
  > When printing CHECK_XX failures and both types are unprintable, don't bother printing " (UNPRINTABLE vs. UNPRINTABLE)".
  > PR #1929: Fix shorten-64-to-32 warning in stacktrace_riscv-inl.inc
  > Refactor `find_or_prepare_insert_large` to use a single return statement using a lambda.
  > Use possible CPUs to identify NumCPUs() on Linux.
  > Fix incorrect nullability annotation of `absl::Cord::InlineRep::set_data()`.
  > Move SetCtrl* family of functions to cc file.
  > Change absl::InlinedVector::clear() so that it does not deallocate any allocated space. This allows allocations to be reused and matches the behavior specification of std::vector::clear().
  > Mark Abseil container algorithms as `constexpr` for C++20.
  > Fix `CHECK_<OP>` ambiguous overload for `operator<<` in older versions of GCC when C-style strings are compared
  > stacktrace_test: avoid spoiling errno in the test signal handler.
  > Optimize `CRC32AcceleratedX86ARMCombinedMultipleStreams::Extend` by interleaving the `CRC32_u64` calls at a lower level.
  > stacktrace_test: avoid spoiling errno in the test signal handler.
  > stacktrace_test: avoid spoiling errno in the test signal handler.
  > std::multimap::find() is not guaranteed to return the first entry with the requested key. Any may be returned if many exist.
  > Mark `/`, `%`, and `*` operators as constexpr when intrinsics are available.
  > Add the C++20 string_view constructor that uses iterators
  > Implement absl::erase_if for absl::InlinedVector
  > Adjust software prefetch to fetch 5 cachelines ahead, as benchmarking suggests this should perform better.
  > Reduce maximum load factor to 27/32 (from 28/32).
  > Remove unused include
  > Remove unused include statement
  > PR #1921: Fix ABSL_BUILD_DLL mode (absl_make_dll) with mingw
  > PR #1922: Enable mmap for WASI if it supports the mman header
  > Rollback C++20 string_view constructor that uses iterators due to broken builds
  > Add the C++20 string_view constructor that uses iterators
  > Bump versions of dependencies in MODULE.bazel
  > Automated Code Change
  > PR #1918: base: add musl + ppc64le fallback for UnscaledCycleClock::Frequency
  > Optimize crc32 Extend by removing obsolete length alignment.
  > Fix typo in comment of `ABSL_ATTRIBUTE_UNUSED`.
  > Mark AnyInvocable as being nullability compatible.
  > Ensure stack usage remains low when unwinding the stack, to prevent stack overflows
  > Shrink #if ABSL_HAVE_ATTRIBUTE_WEAK region sizes in stacktrace_test.cc
  > <filesystem> is not supported for XTENSA. Disable it in //absl/hash/internal/hash.h.
  > Use signal-safe dynamic memory allocation for stack traces when necessary
  > PR #1915: Fix SYCL Build Compatibility with Intel LLVM Compiler on Windows for abseil
  > Import of CCTZ from GitHub.
  > Tag tests that currently fail on ios_sim_arm64 with "no_test_ios_sim_arm64"
  > Automated Code Change
  > Automated Code Change
  > Import of CCTZ from GitHub.
  > Move comment specific to pointer-taking MutexLock variant to its definition.
  > Add lifetime annotations to MutexLock, SpinLockHolder, etc.
  > Add lifetimebound annotations to absl::MakeSpan and absl::MakeConstSpan to detect dangling references
  > Remove comment mentioning dereferenceability.
  > Add referenceful MutexLock with Condition overload.
  > Mark SpinLock camel-cased methods as ready for inlining.
  > Whitespace change
  > In logging tests that write expectations against `ScopedMockLog::Send`, suppress the default behavior that forwards to `ScopedMockLog::Log` so that unexpected logs are printed with full metadata.  Many of these tests are poking at those metadata, and a failure message that doesn't include them is unhelpful.
  > Add ABSL_ATTRIBUTE_LIFETIME_BOUND to absl::ClippedSubstr
  > Inline internal usages of Mutex::Lock, etc. in favor of lock.
  > Inline internal usages of pointerful SpinLockHolder/MutexLock.
  > Remove wrong comment in Cord::Unref
  > Update the crc32 dynamic dispatch table with newer platforms.
  > PR #1914: absl/base/internal/poison.cc: Minor build fix
  > Accept references on SpinLockHolder/MutexLock
  > Import of CCTZ from GitHub.
  > Fix typos in comments.
  > Inline SpinLock Lock->lock, Unlock->unlock internal to Abseil.
  > Rename Mutex methods to use the typical C++ lower case names.
  > Rename SpinLock methods to use the typical C++ lower case names.
  > Add an assert that absl::StrSplit is not called with a null char* argument.
  > Fix sign conversion warning
  > PR #1911: Fix absl_demangle_test on ppc64
  > Disallow using a hash function whose return type is smaller than size_t.
  > Optimize CRC-32C extension by zeroes
  > Deduplicate stack trace implementations in stacktrace.cc
  > Align types of location_table_ and mapping_table_ keys (-Wshorten-64-to-32).
  > Move SigSafeArena() out to absl/base/internal/low_level_alloc.h
  > Allow CHECK_<OP> variants to be used with unprintable types.
  > Import of CCTZ from GitHub.
  > Adds required load statements for C++ rules to BUILD and bzl files.
  > Disable sanitizer bounds checking in ComputeZeroConstant.
  > Roll back NDK weak symbol mode for backtrace() due to internal test breakage
  > Add converter for extracting SwissMap profile information into a https://github.com/google/pprof suitable format for inspection.
  > Allocate memory for frames and sizes during stack trace fix-up when no memory is provided
  > Support NDK weak symbol mode for backtrace() on Android.
  > Change skip_empty_or_deleted to not use groups.
  > Fix bug of dereferencing invalidated iterator in test case.
  > Refactor: split erase_meta_only into large and small versions.
  > Fix a TODO to use std::is_nothrow_swappable when it became available.
  > Clean up the testing of alternate options that were removed in previous changes
  > Only use generic stacktrace when ABSL_HAVE_THREAD_LOCAL.
  > Automated Code Change
  > Add triviality tests for absl::Span
  > Loosen the PointerAlignment test to allow up to 5 stuck bits to avoid flakiness.
  > Prevent conversion constructions from absl::Span to itself
  > Skip flaky expectations in waiter_test for MSVC.
  > Refactor: call AssertIsFull from iterator::assert_is_full to avoid passing the same arguments repeatedly.
  > In AssertSameContainer, remove the logic checking for whether the iterators are from SOO tables or not since we don't use it to generate a more informative debug message.
  > Remove unused NonIterableBitMask::HighestBitSet function.
  > Refactor: move iterator unchecked_* members before data members to comply with Google C++ style guide.
  > Mix pointers once instead of twice now that we've improved mixing on 32-bit platforms and improved the kMul constant.
  > Remove unused utility functions/constants.
  > Revert a change for breaking downstream third party libs
  > Remove unneeded include from cord_rep_btree_navigator.h
  > Refactor: move find_first_non_full into raw_hash_set.cc.
  > Perform stronger mixing on 32-bit platforms and enable the LowEntropyStrings test.
  > Include deallocated caller-provided size in delete hooks.
  > Roll back one more time: In debug mode, assert that the probe sequence isn't excessively long.
  > Allow a `std::move` of `delimiter_` to happen in `ByString::ByString(ByString&&)`. Right now the move ctor is making a copy because the source object is `const`.
  > Assume that control bytes don't alias CommonFields.
  > Consistently use [[maybe_unused]] in raw_hash_set.h for better compiler warning compatibility.
  > Roll forward: In debug mode, assert that the probe sequence isn't excessively long.
  > Add a new test for hash collisions for short strings when PrecombineLengthMix has low quality.
  > Refactor: define CombineRawImpl for repeated `Mix(state ^ value, kMul)` operations.
  > Automated Code Change
  > Mark hash_test as large so that the timeout is increased.
  > Change the value of kMul to have higher entropy and prevent collisions when keys are aligned integers or pointers.
  > Fix LIFETIME annotations for op*/op->/value operators for reference types.
  > Update StatusOr to support lvalue reference value types.
  > Rollback debug assertion that the probe sequence isn't excessively long.
  > AnyInvocable: Fix operator==/!= comments
  > In debug mode, assert that the probe sequence isn't excessively long.
  > Improve NaN handling in absl::Duration arithmetic.
  > Change PrecombineLengthMix to sample data from kStaticRandomData.
  > Fix includes and fuse constructors of SpinLock.
  > Enable `operator==` for `StatusOr` only if the contained type is equality-comparable
  > Enable SIMD memcpy-crc on ARM cores.
  > Improve mixing on 32-bit platforms.
  > Change DurationFromDouble to return -InfiniteDuration() for all NaNs.
  > Change return type of hash internal `Seed` to `size_t` from `uint64_t`
  > CMake: Add a fatal error when the compiler defaults to or is set to a C++ language standard prior to C++17.
  > Make bool true hash be ~size_t{} instead of 1 so that all bits are different between true/false instead of only one.
  > Automated Code Change
  > Pass swisstable seed as seed to absl::Hash so we can save an XOR in H1.
  > Add support for scoped enumerations in CHECK_XX().
  > Revert no-inline on Voidify::operator&&() -- caused unexpected binary size growth
  > Mark Voidify::operator&&() as no-inline. This improves stack trace for `LOG(FATAL)` with optimization on.
  > Refactor long strings hash computations and move `len <= PiecewiseChunkSize()` out of the line to keep only one function call in the inlined hash code.
  > rotr/rotl: Fix undefined behavior when passing INT_MIN as the number of positions to rotate by
  > Reorder members of MixingHashState to comply with Google C++ style guide ordering of type declarations, static constants, ctors, non-ctor functions.
  > Delete unused function ShouldSampleHashtablezInfoOnResize.
  > Remove redundant comments that just name the following symbol without providing additional information.
  > Remove unnecessary modification of growth info in small table case.
  > Suppress CFI violation on VDSO call.
  > Replace WeakMix usage with Mix and change H2 to use the most significant 7 bits - saving 1 cycle in H1.
  > Fix -Wundef warning
  > Fix conditional constexpr in ToInt64{Nano|Micro|Milli}seconds under GCC7 and GCC8 using an else clause as a workaround
  > Enable CompressedTupleTest.NestedEbo test case.
  > Lift restriction on using EBCO[1] for nested CompressedTuples. The current implementation of CompressedTuple explicitly disallows EBCO for cases where CompressedTuples are nested. This is because the implementation for a tuple with EBCO-compatible element T inherits from Storage<T, I>, where I is the index of T in the tuple, and
  > absl::string_view: assert against (data() == nullptr && size() != 0)
  > Fix a false nullability warning in [Q]CHECK_OK by replacing nullptr with an empty char*
  > Make `combine_contiguous` to mix length in a weak way by adding `size << 24`, so that we can avoid a separate mixing of size later. The empty range is mixing 0x57 byte.
  > Add a test case that -1.0 and 1.0 have different hashes.
  > Update CI to a more recent Clang on Linux x86-64
  > `absl::string_view`: Add a debug assert to the single-argument constructor that the argument is not `nullptr`.
  > Fix CI on macOS Sequoia
  > Use Xcode 16.3 for testing
  > Use a proper fix instead of a workaround for a parameter annotated absl_nonnull since the latest Clang can see through the workaround
  > Assert that SetCtrl isn't called on small tables - there are no control bytes in such cases.
  > Use `MaskFullOrSentinel` in `skip_empty_or_deleted`.
  > Reduce flakiness in MockDistributions.Examples test case.
  > Rename PrepareInsertNonSoo to PrepareInsertLarge now that it's no longer used in all non-SOO cases.
  > PR #1895: use c++17 in podspec
  > Avoid hashing the key in prefetch() for small tables.
  > Remove template alias nullability annotations.
  > Add `Group::MaskFullOrSentinel` implementation without usage.
  > Move `hashtable_control_bytes` tests into their own file.
  > Simplify calls to `EqualElement` by introducing `equal_to` helper function.
  > Do `common.increment_size()` directly in SmallNonSooPrepareInsert if inserting to reserved 1 element table.
  > Import of CCTZ from GitHub.
  > Small cleanup of `infoz` processing to get the logic out of the line or removed.
  > Extract the entire PrepareInsert to Small non SOO table out of the line.
  > Take `get_hash` implementation out of the SwissTable class to minimize number of instantiations.
  > Change kEmptyGroup to kDefaultIterControl now that it's only used for default-constructed iterators.
  > [bits] Add tests for return types
  > Avoid allocating control bytes in capacity==1 swisstables.
  > PR #1888: Adjust Table.GrowExtremelyLargeTable to avoid OOM on i386
  > Avoid mixing after `Hash64` calls for long strings by passing `state` instead of `Seed` to low level hash.
  > Indent absl container examples consistently
  > Revert- Doesn't actually work because SWIG doesn't use the full preprocessor
  > Add tags to skip some tests under UBSAN.
  > Avoid subtracting `it.control()` and `table.control()` in single element table during erase.
  > Remove the `salt` parameter from low level hash and use a global constant. That may potentially remove some loads.
  > In SwissTable, don't hash the key when capacity<=1 on insertions.
  > Remove the "small" size designation for thread_identity_test, which causes the test to timeout after 60s.
  > Add comment explaining math behind expressions.
  > Exclude SWIG from ABSL_DEPRECATED and ABSL_DEPRECATE_AND_INLINE
  > stacktrace_x86: Handle nested signals on altstack
  > Import of CCTZ from GitHub.
  > Simplify MixingHashState::Read9To16 to not depend on endianness.
  > Delete deprecated `absl::Cord::Get` and its remaining call sites.
  > PR #1884: Remove duplicate dependency
  > Remove relocatability test that is no longer useful
  > Import of CCTZ from GitHub.
  > Fix a bug of casting sizeof(slot_type) to uint16_t instead of uint32_t.
  > Rewrite `WideToUtf8` for improved readability.
  > Avoid requiring default-constructability of iterator type in algorithms that use ContainerIterPairType
  > Added test cases for invalid surrogates sequences.
  > Use __builtin_is_cpp_trivially_relocatable to implement absl::is_trivially_relocatable in a way that is compatible with PR2786 in the upcoming C++26.
  > Remove dependency on `wcsnlen` for string length calculation.
  > Stop being strict about validating the "clone" part of mangled names
  > Add support for logging wide strings in `absl::log`.
  > Deprecate `ABSL_HAVE_STD_STRING_VIEW`.
  > Change some nullability annotations in absl::Span to absl_nullability_unknown to workaround a bug that makes nullability checks trigger in foreach loops, while still fixing the -Wnullability-completeness warnings.
  > Linux CI update
  > Fix new -Wnullability-completeness warnings found after upgrading the Clang version used in the Linux ARM CI to Clang 19.
  > Add __restrict for uses of PolicyFunctions.
  > Use Bazel vendor mode to cache external dependencies on Windows and macOS
  > Move PrepareInsertCommon from header file to cc file.
  > Remove the explicit from the constructor to a test allocator in hash_policy_testing.h. This is rejected by Clang when using the libstdc++ that ships with GCC15
  > Extract `WideToUtf8` helper to `utf8.h`.
  > Updates the documentation for `CHECK` to make it more explicit that it is used to require that a condition is true.
  > Add PolicyFunctions::soo_capacity() so that the compiler knows that soo_capacity() is always 0 or 1.
  > Expect different representations of pointers from the Windows toolchain.
  > Add set_no_seed_for_testing for use in GrowExtremelyLargeTable test.
  > Update GoogleTest dependency to 1.17.0 to support GCC15
  > Assume that frame pointers inside known stack bounds are readable.
  > Remove fallback code in absl/algorithm/container.h
  > Fix GCC15 warning that <ciso646> is deprecated in C++17
  > Fix misplaced closing brace
  > Remove unused include.
  > Automated Code Change
  > Type erase copy constructor.
  > Refactor to use hash_of(key) instead of hash_ref()(key).
  > Create Table.Prefetch test to make sure that it works.
  > Remove NOINLINE on the constructor with buckets.
  > In SwissTable, don't hash the key in find when capacity<=1.
  > Use 0x57 instead of Seed() for weakly mixing of size.
  > Use absl::InsecureBitGen in place of std::random_device in Abseil tests.
  > Remove unused include.
  > Use large 64 bits kMul for 32 bits platforms as well.
  > Import of CCTZ from GitHub.
  > Define `combine_weakly_mixed_integer` in HashSelect::State in order to allow `friend auto AbslHashValue` instead of `friend H AbslHashValue`.
  > PR #1878: Fix typos in comments
  > Update Abseil dependencies in preparation for release
  > Use weaker mixing for absl::Hash for types that mix their sizes.
  > Update comments on UnscaledCycleClock::Now.
  > Use alignas instead of the manual alignment for the Randen entropy pool.
  > Document nullability annotation syntax for array declarations (not many people may know the syntax).
  > Import of CCTZ from GitHub.
  > Release tests for ABSL_RAW_DCHECK and ABSL_RAW_DLOG.
  > Adjust threshold for stuck bits to avoid flaky failures.
  > Deprecate template type alias nullability annotations.
  > Add more probe benchmarks
  > PR #1874: Simplify detection of the powerpc64 ELFv1 ABI
  > Make `absl::FunctionRef` copy-assignable. This brings it more in line with `std::function_ref`.
  > Remove unused #includes from absl/base/internal/nullability_impl.h
  > PR #1870: Retry SymInitialize on STATUS_INFO_LENGTH_MISMATCH
  > Prefetch from slots in parallel with reading from control.
  > Migrate template alias nullability annotations to macros.
  > Improve dependency graph in `TryFindNewIndexWithoutProbing` hot path evaluation.
  > Add latency benchmarks for Hash for strings with size 3, 5 and 17.
  > Exclude UnwindImpl etc. from thread sanitizer due to false positives.
  > Use `GroupFullEmptyOrDeleted` inside of `transfer_unprobed_elements_to_next_capacity_fn`.
  > PR #1863: [minor] Avoid variable shadowing for absl btree
  > Extend stack-frame walking functionality to allow dynamic fixup
  > Fix "unsafe narrowing" in absl for Emscripten
  > Roll back change to address breakage
  > Extend stack-frame walking functionality to allow dynamic fixup
  > Introduce `absl::Cord::Distance()`
  > Avoid aliasing issues in growth information initialization.
  > Make `GrowSooTableToNextCapacityAndPrepareInsert` in order to initialize control bytes all at once and avoid two function calls on growth right after SOO.
  > Simplify `SingleGroupTableH1` since we do not need to mix all bits anymore. Per table seed has a good last bit distribution.
  > Use `NextSeed` instead of `NextSeedBaseNumber` and make the result type to be `uint16_t`. That avoids unnecessary bit twiddling and simplify the code.
  > Optimize `GrowthToLowerBoundCapacity` in order to avoid division.
  > [base] Make :endian internal to absl
  > Fully qualify absl names in check macros to avoid invalid name resolution when the user scope has those names defined.
  > Fix memory sanitization in `GrowToNextCapacityAndPrepareInsert`.
  > Define and use `ABSL_SWISSTABLE_ASSERT` in cc file since a lot of logic moved there.
  > Remove `ShouldInsertBackwards` functionality. It was used for additional order randomness in debug mode. It is not necessary anymore with introduction of separate per table `seed`.
  > Fast growing to the next capacity based on carbon hash table ideas.
  > Automated Code Change
  > Refactor CombinePiecewiseBuffer test case to (a) call PiecewiseChunkSize() to get the chunk size and (b) use ASSERT for expectation in a loop.
  > PR #1867: Remove global static in stacktrace_win32-inl.inc
  > Mark Abseil hardening assert in AssertIsValidForComparison as slow.
  > Roll back a problematic change.
  > Add absl::FastTypeId<T>()
  > Automated Code Change
  > Update TestIntrinsicInt128 test to print the indices with the conflicting hashes.
  > Code simplification: we don't need XOR and kMul when mixing large string hashes into hash state.
  > Refactor absl::CUnescape() to use direct string output instead of pointer/size.
  > Rename `policy.transfer` to `policy.transfer_n`.
  > Optimize `ResetCtrl` for small tables with `capacity < Group::kWidth * 2` (<32 if SSE enabled and <16 if not).
  > Use 16 bits of per-table-seed so that we can save an `and` instruction in H1.
  > Fully annotate nullability in headers where it is partially annotated.
  > Add note about sparse containers to (flat|node)_hash_(set|map).
  > Make low_level_alloc compatible with -Wthread-safety-pointer
  > Add missing direct includes to enable the removal of unused includes from absl/base/internal/nullability_impl.h.
  > Add tests for macro nullability annotations analogous to existing tests for type alias annotations.
  > Adds functionality to return stack frame pointers during stack walking, in addition to code addresses
  > Use even faster reduction algorithm in FinalizePclmulStream()
  > Add nullability annotations to some very-commonly-used APIs.
  > PR #1860: Add `unsigned` to character buffers to ensure they can provide storage (https://eel.is/c++draft/intro.object#3)
  > Release benchmarks for absl::Status and absl::StatusOr
  > Use more efficient reduction algorithm in FinalizePclmulStream()
  > Add a test case to make it clear that `--vmodule=foo/*=1` does match any children and grandchildren and so on under `foo/`.
  > Gate use of clang nullability qualifiers through absl nullability macros on `nullability_on_classes`.
  > Mark `absl::StatusOr::status()` as ABSL_MUST_USE_RESULT
  > Cleanups related to benchmarks: fix many benchmarks to be cc_binary instead of cc_test; add a few benchmarks for StrFormat; add benchmarks for Substitute; add benchmarks for the Damerau-Levenshtein distance used in flags.
  > Add a log severity alias `DO_NOT_$UBMIT` intended for logging during development
  > Avoid relying on true and false tokens in the preprocessor macros used in any_invocable.h
  > Avoid relying on true and false tokens in the preprocessor macros used in absl/container
  > Refactor to make it clear that H2 computation is not repeated in each iteration of the probe loop.
  > Turn on C++23 testing for GCC and Clang on Linux
  > Fix overflow of kSeedMask on 32 bits platform in `generate_new_seed`.
  > Add a workaround for std::pair not being trivially copyable in C++23 in some standard library versions
  > Refactor WeakMix to include the XOR of the state with the input value.
  > Migrate ClearPacBits() to a more generic implementation and location
  > Annotate more Abseil container methods with [[clang::lifetime_capture_by(...)]] and make them all forward to the non-captured overload
  > Make PolicyFunctions always be the second argument (after CommonFields) for type-erased functions.
  > Move GrowFullSooTableToNextCapacity implementation with some dependencies to cc file.
  > Optimize btree_iterator increment/decrement to avoid aliasing issues by using local variables instead of repeatedly writing to `this`.
  > Add constexpr conversions from absl::Duration to int64_t
  > PR #1853: Add support for QCC compiler
  > Fix documentation for key requirements of flat_hash_set
  > Use `extern template` for `GrowFullSooTableToNextCapacity` since we know the most common set of parameters.
  > C++23: Fix log_format_test to match the stream format for volatile pointers
  > C++23: Fix compressed_tuple_test.
  > Implement `btree::iterator::+=` and `-=`.
  > Stop calling `ABSL_ANNOTATE_MEMORY_IS_INITIALIZED` for threadlocal counter.
  > Automated Code Change
  > Introduce seed stored in the hash table inside of the size.
  > Replace ABSL_ATTRIBUTE_UNUSED with [[maybe_unused]]
  > Minor consistency cleanups to absl::BitGen mocking.
  > Restore the empty CMake targets for bad_any_cast, bad_optional_access, and bad_variant_access to allow clients to migrate.
  > bits.h: Add absl::endian and absl::byteswap polyfills
  > Use absl::NoDestructor for an absl::Mutex instance in the flags library to prevent some exit-time destructor warnings
  > Add thread GetEntropyFromRandenPool test
  > Update nullability annotation documentation to focus on macro annotations.
  > Simplify some random/internal types; expose one function to acquire entropy.
  > Remove pre-C++17 workarounds for lack of std::launder
  > UBSAN: Use -fno-sanitize-recover
  > int128_test: Avoid testing signed integer overflow
  > Remove leading commas in `Describe*` methods of `StatusIs` matcher.
  > absl::StrFormat: Avoid passing null to memcpy
  > str_cat_test: Avoid using invalid enum values
  > hash_generator_testing: Avoid using invalid enum values
  > absl::Cord: Avoid passing null to memcpy and memset
  > graphcycles_test: Avoid applying a non-zero offset to a null pointer
  > Make warning about wrapping empty std::function in AnyInvocable stronger.
  > absl/random: Convert absl::BitGen / absl::InsecureBitGen to classes from aliases.
  > Fix buffer overflow in the internal demangling function
  > Avoid calling `ShouldRehashForBugDetection` on the first two inserts to the table.
  > Remove the polyfill implementations for many type traits and alias them to their std equivalents. It is recommended that clients now simply use the std equivalents.
  > ROLLBACK: Limit slot_size to 2^16-1 and maximum table size to 2^43-1.
  > Limit `slot_size` to `2^16-1` and maximum table size to `2^43-1`.
  > Use C++17 [[nodiscard]] instead of the deprecated ABSL_MUST_USE_RESULT
  > Remove the polyfills for absl::apply and absl::make_from_tuple, which were only needed prior to C++17. It is recommended that clients simply use std::apply and std::make_from_tuple.
  > PR #1846: Fix build on big endian
  > Bazel: Move environment variables to --action_env
  > Remove the implementation of `absl::variant`, which was only needed prior to C++17. `absl::variant` is now an alias for `std::variant`. It is recommended that clients simply use `std::variant`.
  > MSVC: Fix warnings c4244 and c4267 in the main library code
  > Update LowLevelHashLenGt16 to be LowLevelHashLenGt32 now that the input is guaranteed to be >32 in length.
  > Xtensa does not support thread_local. Disable it in absl/base/config.h.
  > Add support for 8-bit and 16-bit integers to absl::SimpleAtoi
  > CI: Update Linux ARM latest container
  > Add time hash tests
  > `any_invocable`: Update comment that refer to C++17 and C++11
  > `check_test_impl.inc`: Use C++17 features unconditionally
  > Remove the implementation of `absl::optional`, which was only needed prior to C++17. `absl::optional` is now an alias for `std::optional`. It is recommended that clients simply use `std::optional`.
  > Move hashtable control bytes manipulation to a separate file.
  > Fix a use-after-free bug in which the string passed to `AtLocation` may be referenced after it is destroyed. While the string does live until the end of the full statement, logging previously occurred in the destructor of the `LogMessage`, which may be constructed before the temporary string (and thus destroyed after it).
  > `internal/layout`: Delete pre-C++17 out of line definition of constexpr class member
  > Extract slow path for PrepareInsertNonSoo to a separate function `PrepareInsertNonSooSlow`.
  > Minor code cleanups
  > `internal/log_message`: Use `if constexpr` instead of SFINAE for `operator<<`
  > [absl] Use `std::min` in `constexpr` contexts in `absl::string_view`
  > Remove the implementation of `absl::any`, which was only needed prior to C++17. `absl::any` is now an alias for `std::any`. It is recommended that clients simply use `std::any`.
  > Remove ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL, which is no longer needed with the C++17 floor
  > Make `OptimalMemcpySizeForSooSlotTransfer` ready to work with MaxSooSlotSize up to `3*sizeof(size_t)`.
  > `internal/layout`: Replace SFINAE with `if constexpr`
  > PR #1830: C++17 improvement: use if constexpr in internal/hash.h
  > `absl`: Deprecate `ABSL_HAVE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION`
  > Add a verification for access to a table that is being destroyed. Also enable the access-after-destroy check in optimized ASAN mode.
  > Store `CharAlloc` in SwissTable in order to simplify type erasure of functions accepting allocator as `void*`.
  > Introduce and use `SetCtrlInLargeTable`, when we know that table is at least one group. Similarly to `SetCtrlInSingleGroupTable`, we can save some operations.
  > Make raw_hash_set::slot_type private.
  > Delete absl/utility/internal/if_constexpr.h
  > `internal/any_invocable`: Use `if constexpr` instead of SFINAE when initializing storage accessor
  > Depend on string_view directly
  > Optimize and slightly simplify `PrepareInsertNonSoo`.
  > PR #1833: Make ABSL_INTERNAL_STEP_n macros consistent in crc code
  > `internal/any_invocable`: Use alias `RawT` consistently in `InitializeStorage`
  > Move the implementation of absl::ComputeCrc32c to the header file, to facilitate inlining.
  > Delete absl/base/internal/inline_variable.h
  > Add lifetimebound to absl::StripAsciiWhitespace
  > Revert: Random: Use target attribute instead of -march
  > Add a return for opt mode in AssertNotDebugCapacity to make sure the remaining code is not evaluated in opt mode.
  > `internal/any_invocable`: Delete TODO, improve comment and simplify pragma in constructor
  > Split resizing routines and type erase similar instructions.
  > Random: Use target attribute instead of -march
  > `internal/any_invocable`: Use `std::launder` unconditionally
  > `internal/any_invocable`: Remove suppression of false positive -Wmaybe-uninitialized on GCC 12
  > Fix feature test for ABSL_HAVE_STD_OPTIONAL
  > Support C++20 iterators in raw_hash_map's random-access iterator detection
  > Fix mis-located test dependency
  > Disable the DestroyedCallsFail test on GCC due to flakiness.
  > `internal/any_invocable`: Implement invocation using `if constexpr` instead of SFINAE
  > PR #1835: Bump deployment_target version and add visionos to podspec
  > PR #1828: Fix spelling of pseudorandom in README.md
  > Make raw_hash_map::key_arg private.
  > `overload`: Delete obsolete macros for undefining `absl::Overload` when C++ < 17
  > `absl/base`: Delete `internal/invoke.h` and `invoke_test.cc`
  > Remove `WORKSPACE.bazel`
  > `absl`: Replace `base_internal::{invoke,invoke_result_t,is_invocable_r}` with `std` equivalents
  > Allow C++20 forward iterators to use fast paths
  > Factor out some iterator traits detection code
  > Type erase IterateOverFullSlots to decrease code size.
  > `any_invocable`: Delete pre-C++17 workarounds for `noexcept` and guaranteed copy elision
  > Make raw_hash_set::key_arg private.
  > Rename nullability macros to use new lowercase spelling.
  > Fix bug where ABSL_REQUIRE_EXPLICIT_INIT did not actually result in a linker error
  > Make Randen benchmark program use runtime CPU detection.
  > Add CI for the C++20/Clang/libstdc++ combination
  > Move Abseil to GoogleTest 1.16.0
  > `internal/any_invocable`: Use `if constexpr` instead of SFINAE in `InitializeStorage`
  > More type-erasing of InitializeSlots by removing the Alloc and AlignOfSlot template parameters.
  > Actually use the hint space instruction to strip PAC bits for return addresses in stack traces as the comment says
  > `log/internal`: Replace `..._ATTRIBUTE_UNUSED_IF_STRIP_LOG` with C++17 `[[maybe_unused]]`
  > `attributes`: Document `ABSL_ATTRIBUTE_UNUSED` as deprecated
  > `internal/any_invocable`: Initialize using `if constexpr` instead of ternary operator, enum, and templates
  > Fix flaky tests due to sampling by introducing utility to refresh sampling counters for the current thread.
  > Minor reformatting in raw_hash_set: add a clear_backing_array member to declutter calls to ClearBackingArray; remove some unnecessary `inline` keywords on functions; make PoisonSingleGroupEmptySlots static.
  > Update CI for linux_gcc-floor to use GCC9, Bazel 7.5, and CMake 3.31.5.
  > `internal/any_invocable`: Rewrite `IsStoredLocally` type trait into a simpler constexpr function
  > Add ABSL_REQUIRE_EXPLICIT_INIT to Abseil to enable enforcing explicit field initializations
  > Require C++17
  > Minimize number of `InitializeSlots` with respect to SizeOfSlot.
  > Leave the call to `SampleSlow` only in type erased InitializeSlots.
  > Update comments for Read4To8 and Read1To3.
  > PR #1819: fix compilation with AppleClang
  > Move SOO processing inside of InitializeSlots and move it once.
  > PR #1816: Random: use getauxval() via <sys/auxv.h>
  > Optimize `InitControlBytesAfterSoo` to have less writes and make them with compile time known size.
  > Remove stray plus operator in cleanup_internal::Storage
  > Include <cerrno> to fix compilation error in chromium build.
  > Adjust internal logging namespacing for consistency s/ABSL_LOGGING_INTERNAL_/ABSL_LOG_INTERNAL_/
  > Rewrite LOG_EVERY_N (et al) docs to clarify that the first instance is logged.  Also, deliberately avoid giving exact numbers or examples since IRL behavior is not so exact.
  > ABSL_ASSUME: Use a ternary operator instead of do-while in the implementations that use a branch marked unreachable so that it is usable in more contexts.
  > Simplify the comment for raw_hash_set::erase.
  > Remove preprocessors for now unsupported compilers.
  > `absl::ScopedMockLog`: Explicitly document that it captures logs emitted by all threads
  > Fix potential integer overflow in hash container create/resize
  > Add lifetimebound to StripPrefix/StripSuffix.
  > Random: Rollforward support runtime dispatch on AArch64 macOS
  > Crc: Only test non_temporal_store_memcpy_avx on AVX targets
  > Provide information about types of all flags.
  > Deprecate the precomputed hash find() API in swisstable.
  > Import of CCTZ from GitHub.
  > Adjust whitespace
  > Expand documentation for absl::raw_hash_set::erase to include idiom example of iterator post-increment.
  > Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower()
  > Crc: Remove the __builtin_cpu_supports path for SupportsArmCRC32PMULL
  > Use absl::NoDestructor for some absl::Mutex instances in the flags library to prevent some exit-time destructor warnings
  > Update the WORKSPACE dependency of rules_cc to 0.1.0
  > Rollback support runtime dispatch on AArch64 macOS for breaking some builds
  > Downgrade to rules_cc 0.0.17 because 0.1.0 was yanked
  > Use unused set in testing.
  > Random: Support runtime dispatch on AArch64 macOS
  > crc: Use absl::nullopt when returning absl::optional
  > Annotate absl::FixedArray to warn when unused.
  > PR #1806: Fix undefined symbol: __android_log_write
  > Move ABSL_HAVE_PTHREAD_CPU_NUMBER_NP to the file where it is needed
  > Use rbit instruction on ARM rather than rev.
  > Debugging: Report the CPU we are running on under Darwin
  > Add a microbenchmark for very long int/string tuples.
  > Crc: Detect support for pmull and crc instructions on Apple AArch64 With a newer clang, we can use __builtin_cpu_supports which caches all the feature bits.
  > Add special handling for hashing integral types so that we can optimize Read1To3 and Read4To8 for the strings case.
  > Use unused FixedArray instances.
  > Minor reformatting
  > Avoid flaky expectation in WaitDurationWoken test case in MSVC.
  > Use Bazel rules_cc for many compiler-specific rules instead of our custom ones from before the Bazel rules existed.
  > Mix pointers twice in absl::Hash.
  > New internal-use-only classes `AsStructuredLiteralImpl` and `AsStructuredValueImpl`
  > Annotate some Abseil container methods with [[clang::lifetime_capture_by(...)]]
  > Faster copy from inline Cords to inline Strings
  > Add new benchmark cases for hashing string lengths 1,2,4,8.
  > Move the Arm implementation of UnscaledCycleClock::Now() into the header file, like the x86 implementation, so it can be more easily inlined.
  > Minor include cleanup in absl/random/internal
  > Import of CCTZ from GitHub.
  > Use Bazel Platforms to support AES-NI compile options for Randen
  > In HashState::Create, require that T is a subclass of HashStateBase in order to discourage users from defining their own HashState types.
  > PR #1801: Remove unnecessary <iostream> includes
  > New class StructuredProtoField
  > Mix pointers twice in TSan and MSVC to avoid flakes in the PointerAlignment test.
  > Add a test case that type-erased absl::HashState is consistent with absl::HashOf.
  > Mix pointers twice in build modes in which the PointerAlignment test is flaky if we mix once.
  > Increase threshold for stuck bits in PointerAlignment test on android.
  > Use hashing ideas from Carbon's hashtable in absl hashing: use byte swap instead of mixing pointers twice; change the order of branches to check for len<=8 first; in the len<=16 case, do one multiply to mix the data instead of using the logic from go/absl-hash-rl (reinforcement learning was used to optimize the instruction sequence); add special handling for len<=32 cases on 64-bit architectures.
  > Test that using a table that was moved-to from a moved-from table fails in sanitizer mode.
  > Remove a trailing comma causing an issue for an OSS user
  > Add missing includes in hash.h.
  > Use the public implementation rule for "@bazel_tools//tools/cpp:clang-cl"
  > Import of CCTZ from GitHub.
  > Change the definition of is_trivially_relocatable to be a bit less conservative.
  > Updates to CI to support newer versions of tools
  > Check if ABSL_HAVE_INTRINSIC_INT128 is defined
  > Print hash expansions in the hash_testing error messages.
  > Avoid flakiness in notification_test on MSVC.
  > Roll back: Add more debug capacity validation checks on moves.
  > Add more debug capacity validation checks on moves.
  > Add macro versions of nullability annotations.
  > Improve fork-safety by opening files with `O_CLOEXEC`.
  > Move ABSL_HARDENING_ASSERTs in constexpr methods to their own lines.
  > Add test cases for absl::Hash: - That hashes are consistent for the same int value across different int types. - That hashes of vectors of strings are unequal even when their concatenations are equal. - That FragmentedCord hashes works as intended for small Cords.
  > Skip the IterationOrderChangesOnRehash test case in ASan mode because it's flaky.
  > Add missing includes in absl hash.
  > Try to use file descriptors in the 2000+ range to avoid mis-behaving client interference.
  > Add weak implementation of the __lsan_is_turned_off in Leak Checker
  > Fix a bug where EOF resulted in infinite loop.
  > static_assert that absl::Time and absl::Duration are trivially destructible.
  > Move Duration ToInt64<unit> functions to be inline.
  > string_view: Add defaulted copy constructor and assignment
  > Use `#ifdef` to avoid errors when `-Wundef` is used.
  > Strip PAC bits for return addresses in stack traces
  > PR #1794: Update cpu_detect.cc fix hw crc32 and AES capability check, fix undefined
  > PR #1790: Respect the allocator's .destroy method in ~InlinedVector
  > Cast away nullability in the guts of CHECK_EQ (et al) where Clang doesn't see that the nullable string returned by Check_EQImpl is statically nonnull inside the loop.
  > string_view: Correct string_view(const char*, size_type) docs
  > Add support for std::string_view in StrCat even when absl::string_view != std::string_view.
  > Misc. adjustments to unit tests for logging.
  > Use local_config_cc from rules_cc and make it a dev dependency
  > Add additional iteration order tests with reservation. Reserved tables have a different way of iteration randomization compared to gradually resized tables (at least for small tables).
  > Use all the bits (`popcount`) in `FindFirstNonFullAfterResize` and `PrepareInsertAfterSoo`.
  > Mark ConsumePrefix, ConsumeSuffix, StripPrefix, and StripSuffix as constexpr since they are all pure functions.
  > PR #1789: Add missing #ifdef pp directive to the TypeName() function in the layout.h
  > PR #1788: Fix warning for sign-conversion on riscv
  > Make StartsWith and EndsWith constexpr.
  > Simplify logic for growing single group table.
  > Document that absl::Time and absl::Duration are trivially destructible.
  > Change some C-arrays to std::array as this enables bounds checking in some hardened standard library builds
  > Replace outdated select() on --cpu with platform API equivalent.
  > Take failure_message as const char* instead of string_view in LogMessageFatal and friends.
  > Mention `c_any_of` in the function comment of `absl::c_linear_search`.
  > Import of CCTZ from GitHub.
  > Rewrite some string_view methods to avoid a -Wunreachable-code warning
  > IWYU: Update includes and fix minor spelling mistakes.
  > Add comment on how to get next element after using erase.
  > Add ABSL_ATTRIBUTE_LIFETIME_BOUND and a doc note about absl::LogAsLiteral to clarify its intended use.
  > Import of CCTZ from GitHub.
  > Reduce memory consumption of structured logging proto encoding by passing tag value
  > Remove usage of _LIBCPP_HAS_NO_FILESYSTEM_LIBRARY.
  > Make Span's relational operators constexpr since C++20.
  > distributions: support a zero max value in Zipf.
  > PR #1786: Fix typo in test case.
  > absl/random: run clang-format.
  > Add some nullability annotations in logging and tidy up some NOLINTs and comments.
  > CMake: Change the default for ABSL_PROPAGATE_CXX_STD to ON
  > Delete UnvalidatedMockingBitGen
  > PR #1783: [riscv][debugging] Fix a few warnings in RISC-V inlines
  > Add conversion operator to std::array for StrSplit.
  > Add a comment explaining the extra comparison in raw_hash_set::operator==. Also add a small optimization to avoid the extra comparison in sets that use hash_default_eq as the key_equal functor.
  > Add benchmark for absl::HexStringToBytes
  > Avoid installing options.h with the other headers
  > Add ABSL_ATTRIBUTE_LIFETIME_BOUND to absl::Span constructors.
  > Annotate absl::InlinedVector to warn when unused.
  > Make `c_find_first_of`'s `options` parameter a const reference to allow temporaries.
  > Disable Elf symbols for Xtensa
  > PR #1775: Support symbolize only on WINAPI_PARTITION_DESKTOP
  > Require through an internal presubmit that .h|.cc|.inc files contain either the string ABSL_NAMESPACE_BEGIN or SKIP_ABSL_INLINE_NAMESPACE_CHECK
  > Xtensa supports mmap, enable it in absl/base/config.h
  > PR #1777: Avoid std::ldexp in `operator double(int128)`.
  > Marks absl::Span as view and borrowed_range, like std::span.
  > Mark inline functions with only a simple comparison in strings/ascii.h as constexpr.
  > Add missing Abseil inline namespace and fix includes
  > Fix bug where the high bits of `__int128_t`/`__uint128_t` might go unused in the hash function. This fix increases the hash quality of these types.
  > Add a test to verify bit casting between signed and unsigned int128 works as expected
  > Add suggestions to enable sanitizers for asserts when doing so may be helpful.
  > Add nullability attributes to nullability type aliases.
  > Refactor swisstable moves.
  > Improve ABSL_ASSERT performance by guaranteeing it is optimized away under NDEBUG in C++20
  > Mark Abseil hardening assert in AssertSameContainer as slow.
  > Add workaround for q++ 8.3.0 (QNX 7.1) compiler by making sure MaskedPointer is trivially copyable and copy constructible.
  > Small Mutex::Unlock optimization
  > Optimize `CEscape` and `CEscapeAndAppend` by up to 40%.
  > Fix the conditional compilation of non_temporal_store_memcpy_avx to verify that AVX can be forced via `gnu::target`.
  > Delete TODOs to move functors when moving hashtables and add a test that fails when we do so.
  > Fix benchmarks in `escaping_benchmark.cc` by properly calling `benchmark::DoNotOptimize` on both inputs and outputs and by removing the unnecessary and wrong `ABSL_RAW_CHECK` condition (`check != 0`) of `BM_ByteStringFromAscii_Fail` benchmark.
  > It seems like commit abc9b916a94ebbf251f0934048295a07ecdbf32a did not work as intended.
  > Fix a bug in `absl::SetVLogLevel` where a less generic pattern incorrectly removed a more generic one.
  > Remove the side effects between tests in vlog_is_on_test.cc
  > Attempt to fix flaky Abseil waiter/sleep tests
  > Add an explicit tag for non-SOO CommonFields (removing default ctor) and add a small optimization for early return in AssertNotDebugCapacity.
  > Make moved-from swisstables behave the same as empty tables. Note that we may change this in the future.
  > Tag tests that currently fail on darwin_arm64 with "no_test_darwin_arm64"
  > add gmock to cmake defs for no_destructor_test
  > Optimize raw_hash_set moves by allowing some members of CommonFields to be uninitialized when moved-from.
  > Add more debug capacity validation checks on iteration/size.
  > Add more debug capacity validation checks on copies.
  > constinit -> constexpr for DisplayUnits
  > LSC: Fix null safety issues diagnosed by Clang’s `-Wnonnull` and `-Wnullability`.
  > Remove the extraneous variable creation in Match().
  > Import of CCTZ from GitHub.
  > Add more debug capacity validation checks on merge/swap.
  > Add `absl::` namespace to c_linear_search implementation in order to avoid ADL
  > Distinguish the debug message for the case of self-move-assigned swiss tables.
  > Update LowLevelHash comment regarding number of hash state variables.
  > Add an example for the `--vmodule` flag.
  > Remove first prefetch.
  > Add moved-from validation for the case of self-move-assignment.
  > Allow slow and fast abseil hardening checks to be enabled independently.
  > Update `ABSL_RETIRED_FLAG` comment to reflect `default_value` is no longer used.
  > Add validation against use of moved-from hash tables.
  > Provide file-scoped pragma behind macro ABSL_POINTERS_DEFAULT_NONNULL to indicate the default nullability. This is a no-op for now (not understood by checkers), but does communicate intention to human readers.
  > Add stacktrace config for android using the generic implementation
  > Fix nullability annotations in ABSL code.
  > Replace CHECKs with ASSERTs and EXPECTs -- no reason to crash on failure.
  > Remove ABSL_INTERNAL_ATTRIBUTE_OWNER and ABSL_INTERNAL_ATTRIBUTE_VIEW
  > Migrate ABSL_INTERNAL_ATTRIBUTE_OWNER and ABSL_INTERNAL_ATTRIBUTE_VIEW to ABSL_ATTRIBUTE_OWNER and ABSL_ATTRIBUTE_VIEW
  > Disable ABSL_ATTRIBUTE_OWNER and ABSL_ATTRIBUTE_VIEW prior to Clang-13 due to false positives.
  > Make ABSL_ATTRIBUTE_VIEW and ABSL_ATTRIBUTE_OWNER public
  > Optimize raw_hash_set::AssertHashEqConsistent a bit to reduce its runtime overhead.
  > PR #1728: Workaround broken compilation against NDK r25
  > Add validation against use of destroyed hash tables.
  > Do not truncate `ABSL_RAW_LOG` output at null bytes
  > Use several unused cord instances in tests and benchmarks.
  > Add comments about ThreadIdentity struct allocation behavior.
  > Refactoring followup for reentrancy validation in swisstable.
  > Add debug mode checks that element constructors/destructors don't make reentrant calls to raw_hash_set member functions.
  > Add tagging for cc_tests that are incompatible with Fuchsia
  > Add GetTID() implementation for Fuchsia
  > PR #1738: Fix shell option group handling in pkgconfig files
  > Disable weak attribute when absl compiled as windows DLL
  > Remove `CharIterator::operator->`.
  > Mark non-modifying container algorithms as constexpr for C++20.
  > PR #1739: container/internal: Explicitly include <cstdint>
  > Don't match -Wnon-virtual-dtor in the "flags are needed to suppress warnings in headers" case. It should fall through to the "don't impose our warnings on others" case. Do this by matching on "-Wno-*" instead of "-Wno*".
  > PR #1732: Fix build on NVIDIA Jetson board. Fix #1665
  > Update GoogleTest dependency to 1.15.2
  > Enable AsciiStrToLower and AsciiStrToUpper overloads for rvalue references.
  > PR #1735: Avoid `int` to `bool` conversion warning
  > Add `absl::swap` functions for `*_hash_*` to avoid calling `std::swap`
  > Change internal visibility
  > Remove resolved issue.
  > Increase test timeouts to support running on Fuchsia emulators
  > Add tracing annotations to absl::Notification
  > Suppress compiler optimizations which may break container poisoning.
  > Disable ABSL_INTERNAL_HAVE_DEBUGGING_STACK_CONSUMPTION for Fuchsia
  > Add tracing annotations to absl::BlockingCounter
  > Add absl_vlog_is_on and vlog_is_on to ABSL_INTERNAL_DLL_TARGETS
  > Update swisstable swap API comments to no longer guarantee that we don't move/swap individual elements.
  > PR #1726: cmake: Fix RUNPATH when using BUILD_WITH_INSTALL_RPATH=True
  > Avoid unnecessary copying when upper-casing or lower-casing ASCII string_view
  > Add weak internal tracing API
  > Fix LINT.IfChange syntax
  > PR #1720: Fix spelling mistake: occurrance -> occurrence
  > Add missing include for Windows ASAN configuration in poison.cc
  > Delete absl/strings/internal/has_absl_stringify.h now that the GoogleTest version we depend on uses the public file
  > Update versions of dependencies in preparation for release
  > PR #1699: Add option to build with MSVC static runtime
  > Remove unneeded 'be' from comment.
  > PR #1715: Generate options.h using CMake only once
  > Small typo fix in absl/log/internal/log_impl.h
  > PR #1709: Handle RPATH CMake configuration
  > PR #1710: fixup! PR #1707: Fixup absl_random compile breakage in Apple ARM64 targets
  > PR #1695: Fix time library build for Apple platforms
  > Remove cyclic cmake dependency that breaks in cmake 3.30.0
  > Roll forward poisoned pointer API and fix portability issues.
  > Use GetStatus in IsOkAndHoldsMatcher
  > PR #1707: Fixup absl_random compile breakage in Apple ARM64 targets
  > PR #1706: Require CMake version 3.16
  > Add an MSVC implementation of ABSL_ATTRIBUTE_LIFETIME_BOUND
  > Mark c_min_element, c_max_element, and c_minmax_element as constexpr in C++17.
  > Optimize the absl::GetFlag cost for most non built-in flag types (including string).
  > Encode some additional metadata when writing protobuf-encoded logs.
  > Replace signed integer overflow, since that's undefined behavior, with unsigned integer overflow.
  > Make mutable CompressedTuple::get() constexpr.
  > vdso_support: support DT_GNU_HASH
  > Make c_begin, c_end, and c_distance conditionally constexpr.
  > Add operator<=> comparison to absl::Time and absl::Duration.
  > Deprecate `ABSL_ATTRIBUTE_NORETURN` in favor of the `[[noreturn]]` standardized in C++11
  > Rollback new poisoned pointer API
  > Use static_cast instead of reinterpret_cast for raw hash set slots, as casting from void* to T* is well defined
  > Fix absl::NoDestructor documentation about its use as a global
  > Declare Rust demangling feature-complete.
  > Split demangle_internal into a tree of smaller libraries.
  > Decode Rust Punycode when it's not too long.
  > Add assertions to detect reentrance in `IterateOverFullSlots` and `absl::erase_if`.
  > Decoder for Rust-style Punycode encodings of bounded length.
  > Add `c_contains()` and `c_contains_subrange()` to `absl/algorithm/container.h`.
  > Three-way comparison spaceship <=> operators for Cord.
  > internal-only change
  > Remove erroneous preprocessor branch on SGX_SIM.
  > Add an internal API to get a poisoned pointer.
  > optimization.h: Add missing <utility> header for C++
  > Add a compile test for headers that require C compatibility
  > Fix comment typo
  > Expand documentation for SetGlobalVLogLevel and SetVLogLevel.
  > Roll back 6f972e239f668fa29cab43d7968692cd285997a9
  > PR #1692: Add missing `<utility>` include
  > Remove NOLINT for `#include <new>` for __cpp_lib_launder
  > Remove not used after all kAllowRemoveReentrance parameter from IterateOverFullSlots.
  > Create `absl::container_internal::c_for_each_fast` for SwissTable.
  > Disable flaky test cases in kernel_timeout_internal_test.
  > Document that swisstable and b-tree containers are not exception-safe.
  > Add `ABSL_NULLABILITY_COMPATIBLE` attribute.
  > LSC: Move expensive variables on their last use to avoid copies.
  > Add ABSL_INTERNAL_ATTRIBUTE_VIEW and ABSL_INTERNAL_ATTRIBUTE_OWNER attributes to more types in Abseil
  > Drop std:: qualification from integer types like uint64_t.
  > Increase slop time on MSVC in PerThreadSemTest.Timeouts again due to continued flakiness.
  > Turn on validation for out of bounds MockUniform in MockingBitGen
  > Use ABSL_UNREACHABLE() instead of equivalent
  > If so configured, report which part of a C++ mangled name didn't parse.
  > Sequence of 1-to-4 values with prefix sum to support Punycode decoding.
  > Add the missing inline namespace to the nullability files
  > Add ABSL_INTERNAL_ATTRIBUTE_VIEW and ABSL_INTERNAL_ATTRIBUTE_OWNER attributes to types in Abseil
  > Disallow reentrance removal in `absl::erase_if`.
  > Fix implicit conversion of temporary bitgen to BitGenRef
  > Use `IterateOverFullSlots` in `absl::erase_if` for hash table.
  > UTF-8 encoding library to support Rust Punycode decoding.
  > Disable negative NaN float ostream format checking on RISC-V
  > PR #1689: Minor: Add missing quotes in CMake string view library definition
  > Demangle template parameter object names, TA <template-arg>.
  > Demangle sr St <simple-id> <simple-id>, a dubious encoding found in the wild.
  > Try not to lose easy type combinators in S::operator const int*() and the like.
  > Demangle fixed-width floating-point types, DF....
  > Demangle _BitInt types DB..., DU....
  > Demangle complex floating-point literals.
  > Demangle <extended-qualifier> in types, e.g., U5AS128 for address_space(128).
  > Demangle operator co_await (aw).
  > Demangle fully general vendor extended types (any <template-args>).
  > Demangle transaction-safety notations GTt and Dx.
  > Demangle C++11 user-defined literal operator functions.
  > Demangle C++20 constrained friend names, F (<source-name> | <operator-name>).
  > Demangle dependent GNU vector extension types, Dv <expression> _ <type>.
  > Demangle elaborated type names, (Ts | Tu | Te) <name>.
  > Add validation that hash/eq functors are consistent, meaning that `eq(k1, k2) -> hash(k1) == hash(k2)`.
  > Demangle delete-expressions with the global-scope operator, gs (dl | da) ....
  > Demangle new-expressions with braced-init-lists.
  > Demangle array new-expressions, [gs] na ....
  > Demangle object new-expressions, [gs] nw ....
  > Demangle preincrement and predecrement, pp_... and mm_....
  > Demangle throw and rethrow (tw... and tr).
  > Remove redundant check of is_soo() while prefetching heap blocks.
  > Demangle ti... and te... expressions (typeid).
  > Demangle nx... syntax for noexcept(e) as an expression in a dependent signature.
  > Demangle alignof expressions, at... and az....
  > Demangle C++17 structured bindings, DC...E.
  > Demangle modern _ZGR..._ symbols.
  > Remove redundant check of is_soo() while prefetching heap blocks.
  > Demangle sizeof...(pack captured from an alias template), sP ... E.
  > Demangle types nested under vendor extended types.
  > Demangle il ... E syntax (braced list other than direct-list-initialization).
  > Avoid signed overflow for Ed <number> _ manglings with large <number>s.
  > Remove redundant check of is_soo() while prefetching heap blocks.
  > Remove obsolete TODO
  > Clarify function comment for `erase` by stating that this idiom only works for "some" standard containers.
  > Move SOVERSION to global CMakeLists, apply SOVERSION to DLL
  > Set ABSL_HAVE_THREAD_LOCAL to 1 on all platforms
  > Demangle constrained auto types (Dk <type-constraint>).
  > Parse <discriminator> more accurately.
  > Demangle lambdas in class member functions' default arguments.
  > Demangle unofficial <unresolved-qualifier-level> encodings like S0_IT_E.
  > Do not make std::filesystem::path hash available for macOS <10.15
  > Include flags in DLL build (non-Windows only)
  > Enable building monolithic shared library on macOS and Linux.
  > Demangle Clang's last-resort notation _SUBSTPACK_.
  > Demangle C++ requires-expressions with parameters (rQ ... E).
  > Demangle Clang's encoding of __attribute__((enable_if(condition, "message"))).
  > Demangle static_cast and friends.
  > Demangle decltype(expr)::nested_type (NDT...E).
  > Optimize GrowIntoSingleGroupShuffleControlBytes.
  > Demangle C++17 fold-expressions.
  > Demangle thread_local helper functions.
  > Demangle lambdas with explicit template arguments (UlTy and similar forms).
  > Demangle &-qualified function types.
  > Demangle valueless literals LDnE (nullptr) and LA<number>_<type>E ("foo").
  > Correctly demangle the <unresolved-name> at the end of dt and pt (x.y, x->y).
  > Add missing targets to ABSL_INTERNAL_DLL_TARGETS
  > Build abseil_test_dll with ABSL_BUILD_TESTING
  > Demangle C++ requires-expressions without parameters (rq ... E).
  > overload: make the constructor constexpr
  > Update Abseil CI Docker image to use Clang 19, GCC 14, and CMake 3.29.3
  > Workaround symbol resolution bug in Clang 19
  > Workaround bogus GCC14 -Wmaybe-uninitialized warning
  > Silence a bogus GCC14 -Warray-bounds warning
  > Forbid absl::Uniform<absl::int128>(gen)
  > Use IN_LIST to replace list(FIND) + > -1
  > Recognize C++ vendor extended expressions (e.g., u9__is_same...E).
  > `overload_test`: Remove a few unnecessary trailing return types
  > Demangle the C++ this pointer (fpT).
  > Stop eating an extra E in ParseTemplateArg for some L<type><value>E literals.
  > Add ABSL_INTERNAL_ATTRIBUTE_VIEW and ABSL_INTERNAL_ATTRIBUTE_OWNER attributes to Abseil.
  > Demangle C++ direct-list-initialization (T{1, 2, 3}, tl ... E).
  > Demangle the C++ spaceship operator (ss, operator<=>).
  > Demangle C++ sZ encodings (sizeof...(pack)).
  > Demangle C++ so ... E encodings (typically array-to-pointer decay).
  > Recognize dyn-trait-type in Rust demangling.
  > Rework casting in raw_hash_set's IsFull().
  > Remove test references to absl::SharedBitGen, which was never part of the open source release. This was only used in tests that never ran as part of the open source release.
  > Recognize fn-type and lifetimes in Rust demangling.
  > Support int128/uint128 in validated MockingBitGen
  > Recognize inherent-impl and trait-impl in Rust demangling.
  > Recognize const and array-type in Rust mangled names.
  > Remove Asylo from absl.
  > Recognize generic arguments containing only types in Rust mangled names.
  > Fix missing #include <random> for std::uniform_int_distribution
  > Move `prepare_insert` out of the line as type erased `PrepareInsertNonSoo`.
  > Revert: Add -Wdead-code-aggressive to ABSL_LLVM_FLAGS
  > Add (unused) validation to absl::MockingBitGen
  > Support `AbslStringify` with `DCHECK_EQ`.
  > PR #1672: Optimize StrJoin with tuple without user defined formatter
  > Give ReturnAddresses and N<uppercase> namespaces separate stacks for clarity.
  > Demangle Rust backrefs.
  > Use Nt for struct and trait names in Rust demangler test inputs.
  > Allow __cxa_demangle on MIPS
  > Add a `string_view` overload to `absl::StrJoin`
  > Demangle Rust's Y<type><path> production for passably simple <type>s.
  > `convert_test`: Delete obsolete condition around ASSERT_EQ in TestWithMultipleFormatsHelper
  > `any_invocable`: Clean up #includes
  > Resynchronize absl/functional/CMakeLists.txt with BUILD.bazel
  > `any_invocable`: Add public documentation for undefined behavior when invoking an empty AnyInvocable
  > `any_invocable`: Delete obsolete reference to proposed standard type
  > PR #1662: Replace shift with addition in crc multiply
  > Doc fix.
  > `convert_test`: Extract loop over tested floats from helper function
  > Recognize some simple Rust mangled names in Demangle.
  > Use __builtin_ctzg and __builtin_clzg in the implementations of CountTrailingZeroesNonzero16 and CountLeadingZeroes16 when they are available.
  > Remove the forked absl::Status matchers implementation in statusor_test
  > Add comment hack to fix copybara reversibility
  > Add GoogleTest matchers for absl::Status
  > [random] LogUniform: Document as a discrete distribution
  > Enable Cord tests with Crc.
  > Fix order of qualifiers in `absl::AnyInvocable` documentation.
  > Guard against null pointer dereference in DumpNode.
  > Apply ABSL_MUST_USE_RESULT to try lock functions.
  > Add public aliases for default hash/eq types in hash-based containers
  > Import of CCTZ from GitHub.
  > Remove the hand-rolled CordLeaker and replace with absl::NoDestructor to test the after-exit behavior
  > `convert_test`: Delete obsolete `skip_verify` parameter in test helper
  > overload: allow using the underlying type with CTAD directly.
  > PR #1653: Remove unnecessary casts when calling CRC32_u64
  > PR #1652: Avoid C++23 deprecation warnings from float_denorm_style
  > Minor cleanup for `absl::Cord`
  > PR #1651: Implement ABSL_INTERNAL_DISABLE_DEPRECATED_DECLARATION_WARNING for MSVC compiler
  > Add `operator<=>` support to `absl::int128` and `absl::uint128`
  > [absl] Re-use the existing `std::type_identity` backfill instead of redefining it again
  > Add `absl::AppendCordToString`
  > `str_format/convert_test`: Delete workaround for [glibc bug](https://sourceware.org/bugzilla/show_bug.cgi?id=22142)
  > `absl/log/internal`: Document conditional ABSL_ATTRIBUTE_UNUSED, add C++17 TODO
  > `log/internal/check_op`: Add ABSL_ATTRIBUTE_UNUSED to CHECK macros when STRIP_LOG is enabled
  > log_benchmark: Add VLOG_IS_ON benchmark
  > Restore string_view detection check
  > Remove an unnecessary ABSL_ATTRIBUTE_UNUSED from a logging macro
  < Abseil LTS Branch, Jan 2024, Patch 2 (#1650)
  > In example code, add missing template parameter.
  > Optimize crc32 V128_From2x64 on Arm
  > Annotate that Mutex should warn when unused.
  > Add ABSL_ATTRIBUTE_LIFETIME_BOUND to Cord::Flatten/TryFlat
  > Deprecate `absl::exchange`, `absl::forward` and `absl::move`, which were only useful before C++14.
  > Temporarily revert dangling std::string_view detection until dependent is fixed
  > Use _decimal_ literals for the CivilDay example.
  > Fix bug in BM_EraseIf.
  > Add internal traits to absl::string_view for lifetimebound detection
  > Add internal traits to absl::StatusOr for lifetimebound detection
  > Add internal traits to absl::Span for lifetimebound detection
  > Add missing dependency for log test build target
  > Add internal traits for lifetimebound detection
  > Use local decoding buffer in HexStringToBytes
  > Only check if the frame pointer is inside a signal stack with known bounds
  > Roll forward: enable small object optimization in swisstable.
  > Optimize LowLevelHash by breaking dependency between final loads and previous len/ptr updates.
  > Fix the wrong link.
  > Optimize InsertMiss for tables without kDeleted slots.
  > Use GrowthInfo without applying any optimizations based on it.
  > Disable small object optimization while debugging some failing tests.
  > Adjust conditional compilation in non_temporal_memcpy.h
  > Reformat log/internal/BUILD
  > Remove deprecated errno constants from the absl::Status mapping
  > Introduce GrowthInfo with tests, but without usage.
  > Enable small object optimization in swisstable.
  > Refactor the GCC uninitialized memory warning suppression in raw_hash_set.h.
  > Respect `NDEBUG_SANITIZER`
  > Revert integer-to-string conversion optimizations pending more thorough analysis
  > Fix a bug in `Cord::{Append,Prepend}(CordBuffer)`: call `MaybeRemoveEmptyCrcNode()`. Otherwise appending a `CordBuffer` to an empty Cord with a CRC node crashes (`RemoveCrcNode()` increases the refcount of a nullptr child).
  > Add `BM_EraseIf` benchmark.
  > Record sizeof(key_type), sizeof(value_type) in hashtable profiles.
  > Fix ClangTidy warnings in btree.h.
  > LSC: Move expensive variables on their last use to avoid copies.
  > PR #1644: unscaledcycleclock: remove RISC-V support
  > Reland: Make DLOG(FATAL) not understood as [[noreturn]]
  > Separate out absl::StatusOr constraints into statusor_internal.h
  > Use Layout::WithStaticSizes in btree.
  > `layout`: Delete outdated comments about ElementType alias not being used because of MSVC
  > Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower()
  > `layout_benchmark`: Replace leftover comment with intended call to MyAlign
  > Remove absl::aligned_storage_t
  > Delete ABSL_ANNOTATE_MEMORY_IS_INITIALIZED under Thread Sanitizer
  > Remove vestigial variables in the DumpNode() helper in absl::Cord
  > Do hashtablez sampling on the first insertion into an empty SOO hashtable.
  > Add explicit #include directives for <tuple>, "absl/base/config.h", and "absl/strings/string_view.h".
  > Add a note about the cost of `VLOG` in non-debug builds.
  > Fix flaky test failures on MSVC.
  > Add template keyword to example comment for Layout::WithStaticSizes.
  > PR #1643: add xcprivacy to all subspecs
  > Record sampling stride in cord profiling to facilitate unsampling.
  > Fix a typo in a comment.
  > [log] Correct SetVLOGLevel to SetVLogLevel in comments
  > Add a feature to container_internal::Layout that lets you specify some array sizes at compile-time as template parameters. This can make offset and size calculations faster.
  > `layout`: Mark parameter of Slices with ABSL_ATTRIBUTE_UNUSED, remove old workaround
  > `layout`: Use auto return type for functions that explicitly instantiate std::tuple in return statements
  > Remove redundant semicolons introduced by macros
  > [log] Make :vlog_is_on/:absl_vlog_is_on public in BUILD.bazel
  > Add additional checks for size_t overflows
  > Replace //visibility:private with :__pkg__ for certain targets
  > PR #1603: Disable -Wnon-virtual-dtor warning for CommandLineFlag implementations
  > Add several missing includes in crc/internal
  > Roll back extern template instantiations in swisstable due to binary size increases in shared libraries.
  > Add nodiscard to SpinLockHolder.
  > Test that rehash(0) reduces capacity to minimum.
  > Add extern templates for common swisstable types.
  > Disable ubsan for benign unaligned access in crc_memcpy
  > Make swisstable SOO support GDB pretty printing and still compile in OSS.
  > Fix OSX support with CocoaPods and Xcode 15
  > Fix GCC7 C++17 build
  > Use UnixEpoch and ZeroDuration
  > Make flaky failures much less likely in BasicMocking.MocksNotTriggeredForIncorrectTypes test.
  > Delete a stray comment
  > Move GCC uninitialized memory warning suppression into MaybeInitializedPtr.
  > Replace usages of absl::move, absl::forward, and absl::exchange with their std:: equivalents
  > Fix the move to itself
  > Work around an implicit conversion signedness compiler warning
  > Avoid MSan: use-of-uninitialized-value error in find_non_soo.
  > Fix flaky MSVC test failures by using longer slop time.
  > Add ABSL_ATTRIBUTE_UNUSED to variables used in an ABSL_ASSUME.
  > Implement small object optimization in swisstable - disabled for now.
  > Document and test ability to use absl::Overload with generic lambdas.
  > Extract `InsertPosition` function to be able to reuse it.
  > Increase GraphCycles::PointerMap size
  > PR #1632: inlined_vector: Use trivial relocation for `erase`
  > Create `BM_GroupPortable_Match`.
  > [absl] Mark `absl::NoDestructor` methods with `absl::Nonnull` as appropriate
  > Automated Code Change
  > Rework casting in raw_hash_set's `IsFull()`.
  > Adds ABSL_ATTRIBUTE_LIFETIME_BOUND to absl::BitGenRef
  > Workaround for NVIDIA C++ compiler being unable to parse variadic expansions in range of range-based for loop
  > Rollback: Make DLOG(FATAL) not understood as [[noreturn]]
  > Make DLOG(FATAL) not understood as [[noreturn]]
  > Optimize `absl::Duration` division and modulo: Avoid repeated redundant comparisons in `IDivFastPath`.
  > Optimize `absl::Duration` division and modulo: Allow the compiler to inline `time_internal::IDivDuration`, by splitting the slow path to a separate function.
  > Fix typo in example code snippet.
  > Automated Code Change
  > Add braces for conditional statements in raw_hash_map functions.
  > Optimize `prepare_insert`, when resize happens. It removes single unnecessary probing before resize that is beneficial for small tables the most.
  > Add noexcept to move assignment operator and swap function
  > Import of CCTZ from GitHub.
  > Minor documentation updates.
  > Change find_or_prepare_insert to return std::pair<iterator, bool> to match return type of insert.
  > PR #1618: inlined_vector: Use trivial relocation for `SwapInlinedElements`
  > Improve raw_hash_set tests.
  > Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower()
  > Use const_cast to avoid duplicating the implementation of raw_hash_set::find(key).
  > Import of CCTZ from GitHub.
  > Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower()
  > Annotate that SpinLock should warn when unused.
  > PR #1625: absl::is_trivially_relocatable now respects assignment operators
  > Introduce `Group::MaskNonFull` without usage.
  > `demangle`: Parse template template and C++20 lambda template param substitutions
  > PR #1617: fix MSVC 32-bit build with -arch:AVX
  > Minor documentation fix for `absl::StrSplit()`
  > Prevent overflow in `absl::CEscape()`
  > `demangle`: Parse optional single template argument for built-in types
  > PR #1412: Filter out `-Xarch_` flags from pkg-config files
  > `demangle`: Add complexity guard to `ParseQRequiresExpr`
  < Prepare 20240116.1 patch for Apple Privacy Manifest (#1623)
  > Remove deprecated symbol absl::kuint128max
  > Add ABSL_ATTRIBUTE_WARN_UNUSED.
  > `demangle`: Parse `requires` clauses on template params, before function return type
  > On Apple, implement absl::is_trivially_relocatable with the fallback.
  > `demangle`: Parse `requires` clauses on functions
  > Make `begin()` to return `end()` on empty tables.
  > `demangle`: Parse C++20-compatible template param declarations, except those with `requires` expressions
  > Add the ABSL_DEPRECATE_AND_INLINE() macro
  > Span: Fixed comment referencing std::span as_writable_bytes() as as_mutable_bytes().
  > Switch rank structs to be consistent with written guidance in go/ranked-overloads
  > Avoid hash computation and `Group::Match` in small tables copy and use `IterateOverFullSlots` for iterating for all tables.
  > Optimize `absl::Hash` by making `LowLevelHash` faster.
  > Add -Wdead-code-aggressive to ABSL_LLVM_FLAGS
  < Backport Apple Privacy Manifest (#1613)
  > Stop using `std::basic_string<uint8_t>` which relies on a non-standard generic `char_traits<>` implementation, recently removed from `libc++`.
  > Add absl_container_hash-based HashEq specialization
  > `demangle`: Implement parsing for simplest constrained template arguments
  > Roll forward 9d8588bfc4566531c4053b5001e2952308255f44 (which was rolled back in 146169f9ad357635b9cd988f976b38bcf83476e3) with fix.
  > Add a version of absl::HexStringToBytes() that returns a bool to validate that the input was actually valid hexadecimal data.
  > Enable StringLikeTest in hash_function_defaults_test
  > Fix a typo.
  > Minor changes to the BUILD file for absl/synchronization
  > Avoid static initializers in case of ABSL_FLAGS_STRIP_NAMES=1
  > Rollback 9d8588bfc4566531c4053b5001e2952308255f44 for breaking the build
  > No public description
  > Decrease the precision of absl::Now in x86-64 debug builds
  > Optimize raw_hash_set destructor.
  > Add ABSL_ATTRIBUTE_UNINITIALIZED macros for use with clang and GCC's `uninitialized`
  > Optimize `Cord::Swap()` for missed compiler optimization in clang.
  > Type erased hash_slot_fn that depends only on key types (and hash function).
  > Replace `testonly = 1` with `testonly = True` in abseil BUILD files.
  > Avoid extra `& msbs` on every iteration over the mask for GroupPortableImpl.
  > Missing parenthesis.
  > Early return from destroy_slots for trivially destructible types in flat_hash_{*}.
  > Avoid export of testonly target absl::test_allocator in CMake builds
  > Use absl::NoDestructor for cordz global queue.
  > Add empty WORKSPACE.bzlmod
  > Introduce `RawHashSetLayout` helper class.
  > Fix a corner case in SpyHashState for exact boundaries.
  > Add nullability annotations
  > Use absl::NoDestructor for global HashtablezSampler.
  > Always check if the new frame pointer is readable.
  > PR #1604: Add privacy manifest
  < Disable ABSL_ATTRIBUTE_TRIVIAL_ABI in open-source builds  (#1606)
  > Remove code pieces for no longer supported GCC versions.
  > Disable ABSL_ATTRIBUTE_TRIVIAL_ABI in open-source builds
  > Prevent brace initialization of AlphaNum
  > Remove code pieces for no longer supported MSVC versions.
  > Added benchmarks for smaller size copy constructors.
  > Migrate empty CrcCordState to absl::NoDestructor.
  > Add protected copy ctor+assign to absl::LogSink, and clarify thread-safety requirements to apply to the interface methods.
  < Apply LTS transformations for 20240116 LTS branch (#1599)

Closes scylladb/scylladb#28756
2026-04-08 12:19:54 +03:00
Liapkovich
4f17cc6d83 docs: add missing rack value for internode_compression parameter
The rack option was fully implemented in the code but omitted from
both docs/operating-scylla/admin.rst and conf/scylla.yaml comments.

Closes scylladb/scylladb#29239
2026-04-08 12:19:54 +03:00
Pavel Emelyanov
0ea76a468f schema: Avoid copies in column_mapping::operator==
In a multi-declarator declaration, the & ref-qualifier is part of each
individual declarator, not the shared type specifier. So:

    const auto& a = x(), b = y();

declares 'a' as a reference but 'b' as a value, silently copying y().
The same applies to:

    const T& a = v[i], b = v[j];

Both operator== lines had this pattern, causing an unnecessary copy of
the column vector and an unnecessary copy of each entry on every call.

Fix by repeating & on the second declarator in both lines.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29213
2026-04-08 12:19:54 +03:00
Pavel Emelyanov
b7c14c6d29 token_metadata: Clear _topology_change_info gently
clear_gently() (introduced in 322aa2f8b5) clears all token_metadata_impl
members using co_await to avoid reactor stalls on large data structures.

_topology_change_info (introduced in 10bf8c7901) was added later and not
included in clear_gently().

update_topology_change_info() already uses utils::clear_gently() when
replacing the value, so it looks reasonable to apply the same pattern
in clear_gently().

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29210
2026-04-08 12:19:54 +03:00
Pavel Emelyanov
54fbbf0410 locator/tablets: Fix missing selector value in error messages
Some on_internal_error() calls pass the selector argument to a format
string that has no placeholder for it.

While at it, disambiguate the selector type in the message text.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29208
2026-04-08 12:19:54 +03:00
Botond Dénes
418141ec08 Merge 'Drop create_dataset() helper from object_store tests' from Pavel Emelyanov
There's only one test left that uses it, and it can be patched to use standard ks/cf creation helpers from pylib. This patch does so and drops the lengthy create_dataset() helper

Tests improvements, no need to backport

Closes scylladb/scylladb#29176

* github.com:scylladb/scylladb:
  test/backup: drop create_dataset helper
  test/backup: use new_test_keyspace in test_restore_primary_replica
2026-04-08 12:19:54 +03:00
Petr Gusev
1e3c8c5a87 test_mutation_schema_change: use tablets
The enable_tablets(false) attribute was added when LWT wasn't supported for tablets. Now that it is, the attribute is no longer needed.

The test covers behavior that should work the same way for both vnodes and tablets, so it doesn't seem it would benefit much from running in both enable_tablets(true) and enable_tablets(false) modes.

Closes scylladb/scylladb#29167
2026-04-08 12:19:54 +03:00
Pavel Emelyanov
7f854c0255 hints: Use shorter fault-injection overload
To apply a fault-injected delay, there's the inject(duration)
overload. Using it results in shorter code.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29168
2026-04-08 10:51:37 +03:00
Botond Dénes
aeefbda304 Merge 'Simplify and improve API describe_ring code flow' from Pavel Emelyanov
The endpoint in question has some places worth fixing, in particular

- the keyspace parameter is not validated
- the validated table name is resolved into table_id, but the id is unused
- two ugly static helpers to stream obtained token ranges into json

Improving the API code flow, not backporting

Closes scylladb/scylladb#29154

* github.com:scylladb/scylladb:
  api: Inline describe_ring JSON handling
  storage_service: Make describe_ring_for_table() take table_id
2026-04-08 10:50:07 +03:00
Artsiom Mishuta
b1e9c0b867 test/pylib: add typed skip markers plugin
Add skip_reason_plugin.py — a framework-agnostic pytest plugin that
provides typed skip markers (skip_bug, skip_not_implemented, skip_slow,
skip_env) so that the reason a test is skipped is machine-readable in
JUnit XML and Allure reports.  Bare untyped pytest.mark.skip now
triggers a warning (to become an error after full migration).  Runtime
skips via skip() are also enriched by parsing the [type] prefix from
the skip message.

The plugin is a class (SkipReasonPlugin) that receives the concrete
SkipType enum and an optional report_callback from conftest.py, keeping
it decoupled from allure and project-specific types.

Extract SkipType enum and convenience runtime skip wrappers (skip_bug,
skip_env, etc.) into test/pylib/skip_types.py so callers only need a
single import instead of importing both SkipType and skip() separately.
conftest.py imports SkipType from the new module and registers the
plugin instance unconditionally (for all test runners).

New files:
- test/pylib/skip_reason_plugin.py: core plugin — typed marker
  processing, bare-skip warnings, JUnit/Allure report enrichment
  (including runtime skip() parsing via _parse_skip_type helper)
- test/pylib/skip_types.py: SkipType enum and convenience wrappers
  (skip_bug, skip_not_implemented, skip_slow, skip_env)
- test/pylib_test/test_skip_reason_plugin.py: 17 pytester-based
  test functions (51 cases across 3 build modes) covering markers,
  warnings, reports, callbacks, and skip_mode interaction

Infrastructure changes:
- test/conftest.py: import SkipType from skip_types, register
  SkipReasonPlugin with allure report callback
- test/pylib/runner.py: set SKIP_TYPE_KEY/SKIP_REASON_KEY stash keys
  for skip_mode so the report hook can enrich JUnit/Allure with
  skip_type=mode without longrepr parsing
- test/pytest.ini: register typed marker definitions (required for
  --strict-markers even when plugin is not loaded)

Migrated test files (representative samples):
- test/cluster/test_tablet_repair_scheduler.py:
  skip -> skip_bug (#26844), skip -> skip_not_implemented
- test/cqlpy/.../timestamp_test.py: skip -> skip_slow
- test/cluster/dtest/schema_management_test.py: skip -> skip_not_implemented
- test/cluster/test_change_replication_factor_1_to_0.py: skip -> skip_bug (#20282)
- test/alternator/conftest.py: skip -> skip_env
- test/alternator/test_https.py: use skip_env() wrapper

Fixes SCYLLADB-79

Closes scylladb/scylladb#29235
2026-04-08 10:38:56 +03:00
Pavel Emelyanov
e0fa9ee332 Merge 'storage: implement sstable clone for object storage' from Ernest Zaslavsky
This patch series implements `object_storage_base::clone`, which was previously a stub that aborted at runtime. Clone creates a copy of an sstable under a new generation and is used during compaction.

The implementation uses server-side object copies (S3 CopyObject / GCS Objects: rewrite) and mirrors the filesystem clone semantics: TemporaryTOC is written first to mark the operation as in-progress, component objects are copied, and TemporaryTOC is removed to commit (unless the caller requested the destination be left unsealed).

The first two patches fix pre-existing bugs in the underlying storage clients that were exposed by the new clone code path:
- GCS `copy_object` used the wrong HTTP method (PUT instead of POST) and sent an invalid empty request body.
- S3 `copy_object` silently ignored the abort_source parameter.

1. **gcp_client: fix copy_object request method and body** — Fix two bugs in the GCS rewrite API call.
2. **s3_client: pass through abort_source in copy_object** — Stop ignoring the abort_source parameter.
3. **object_storage: add copy_object to object_storage_client** — New interface method with S3 and GCS implementations.
4. **storage: add make_object_name overload with generation** — Helper for building destination object names with a different generation.
5. **storage: make delete_object const** — Needed by the const clone method.
6. **storage: implement object_storage_base::clone** — The actual clone implementation plus a copy_object wrapper.
7. **test/boost: enable sstable clone tests for S3 and GCS** — Re-enable the previously skipped tests.

A test similar to `sstable_clone_leaving_unsealed_dest_sstable` was added to properly test the sealed/unsealed states for object storage. Works for both S3 and GCS.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1045
Prerequisite: https://github.com/scylladb/scylladb/pull/28790
No need to backport since this code targets future feature

Closes scylladb/scylladb#29166

* github.com:scylladb/scylladb:
  compaction_test: enable sstable clone tests for S3 and GCS
  storage: implement object_storage_base::clone
  storage: make delete_object const in object_storage_base
  storage: add make_object_name overload with generation
  sstables: add get_format() accessor to sstable
  object_storage: add copy_object to object_storage_client
  s3_client: pass through abort_source in copy_object
  gcp_client: fix copy_object request method and body
2026-04-08 09:35:10 +03:00
Nadav Har'El
4eeb9f4120 lwt, vector: write to CDC when vector index is enabled.
The vector-search feature introduced the somewhat confusing feature of
enabling CDC without explicitly enabling CDC: When a vector index is
enabled on a table, CDC is "enabled" for it even if the user didn't
ask to enable CDC.

For this, write-path code began to use a new cdc_enabled() function
instead of checking schema.cdc_options.enabled() directly.  This
cdc_enabled() function checks if either this enabled() is true, or
has_vector_index() is true.

Unfortunately, LWT writes continued to use cdc_options.enabled() instead
of the new cdc_enabled(). This means that if a vector index is used and
a vector is written using an LWT write, the new value is not indexed.

This patch fixes this bug. It also adds a regression test that fails
before this patch and passes afterwards - the new test verifies that
when a table has a vector index (but no explicit CDC enabled), the CDC
log is updated both after regular writes and after successful LWT writes.

This patch was also tested in the context of the upcoming vector-search-
for-Alternator pull request, which has a test reproducing this bug
(Alternator uses LWT frequently, so this is very important there).
It will also be tested by the vector-store test suite ("validator").

Fixes SCYLLADB-1342

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#29300
2026-04-08 07:55:05 +03:00
Marcin Maliszkiewicz
1bf3110adb Merge 'test: add test_upgrade_preserves_ddl_audit_for_tables' from Andrzej Jackowski
Verify that upgrading from 2025.1 to master does not silently drop DDL
auditing for table-scoped audit configurations ([SCYLLADB-1155](https://scylladb.atlassian.net/browse/SCYLLADB-1155)).

Test time in dev: 4s

Refs: SCYLLADB-1155
Fixes: SCYLLADB-1305
No backport, test for bug on master

[SCYLLADB-1155]: https://scylladb.atlassian.net/browse/SCYLLADB-1155

Closes scylladb/scylladb#29223

* github.com:scylladb/scylladb:
  test: add test_upgrade_preserves_ddl_audit_for_tables
  test: audit: split validate helper so callers need not pass audit_settings
  test: audit: declare manager attribute in AuditTester base class
2026-04-07 17:29:11 +02:00
Marcin Maliszkiewicz
895fdb6d29 Merge 'ldap: fix double-free of LDAPMessage in poll_results()' from Andrzej Jackowski
In the unregistered-ID branch, ldap_msgfree() was called on a result
already owned by an RAII ldap_msg_ptr, causing a double-free on scope
exit. Remove the redundant manual free.

Fixes: SCYLLADB-1344

Backport: 2026.1, 2025.4, 2025.1 - it's a memory corruption, with a one-line fix, so better backport it everywhere.

Closes scylladb/scylladb#29302

* github.com:scylladb/scylladb:
  test: ldap: add regression test for double-free on unregistered message ID
  ldap: fix double-free of LDAPMessage in poll_results()
2026-04-07 17:27:43 +02:00
Ernest Zaslavsky
422f107122 compaction_test: enable sstable clone tests for S3 and GCS
Now that object_storage_base::clone is implemented,
remove the early-return skips and re-enable the
sstable_clone_leaving_unsealed_dest_sstable tests for
both S3 and GCS storage backends.
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
7cd9bbb010 storage: implement object_storage_base::clone
Implement the clone method for object_storage_base, which creates
a copy of an sstable with a new generation using server-side object
copies. Also add a const copy_object convenience wrapper, similar
to the existing put_object and delete_object wrappers.

A dedicated test for the new object storage clone path will be
added in the following commit. The preexisting local-filesystem
clone is already covered by the sstable_clone_leaving_unsealed_dest_sstable
test.
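The clone flow can be sketched roughly as below (helper names and the component list are hypothetical; the real implementation works on sstable component objects in the store):

```python
# Illustrative sketch of cloning an sstable to a new generation using
# server-side object copies; no data passes through the node.
def clone_sstable(copy_object, components, src_gen, dst_gen):
    """Copy every component object of generation src_gen to dst_gen names."""
    for comp in components:
        copy_object(f"{src_gen}-{comp}", f"{dst_gen}-{comp}")

copied = []
clone_sstable(lambda s, d: copied.append((s, d)), ["Data.db", "Index.db"], 1, 2)
assert copied == [("1-Data.db", "2-Data.db"), ("1-Index.db", "2-Index.db")]
```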
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
8fa82e6b6f storage: make delete_object const in object_storage_base
The method doesn't modify any member state. Making it
const is needed for calling it from the const clone
method.
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
47387341bb storage: add make_object_name overload with generation
Add a make_object_name overload that accepts a target
generation parameter for constructing object names with
a generation different from the source sstable's own.

Refactor the original make_object_name to delegate to
the new overload, eliminating code duplication.

This is needed by clone to build destination object
names for the new generation.
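The delegation pattern described above can be sketched like this (signatures are hypothetical; the real code takes sstable/generation types, not dicts):

```python
# Sketch of the overload delegation: the original entry point now forwards
# to the generation-taking variant, using the sstable's own generation.
def make_object_name_with_gen(prefix, component, generation):
    return f"{prefix}/{generation}-{component}"

def make_object_name(sstable, component):
    return make_object_name_with_gen(
        sstable["prefix"], component, sstable["generation"])

sst = {"prefix": "bucket/ks/tbl", "generation": 7}
assert make_object_name(sst, "Data.db") == "bucket/ks/tbl/7-Data.db"
# clone can now build destination names for a different generation:
assert make_object_name_with_gen(sst["prefix"], "Data.db", 8) == "bucket/ks/tbl/8-Data.db"
```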
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
8bd891c6ed sstables: add get_format() accessor to sstable
Add a public get_format() accessor for the _format member, following
the same pattern as the existing get_version(). This allows storage
implementations to access the sstable format without reaching into
private members, and is needed by the upcoming object_storage_base::clone
to construct entry_descriptor for the sstables registry.
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
3d23490615 object_storage: add copy_object to object_storage_client
Add a copy_object method to the object_storage_client
interface for server-side object copies, with
implementations for both S3 and GCS wrappers.

The S3 wrapper delegates to s3::client::copy_object.
The GCS wrapper delegates to gcp::storage::client's
cross-bucket copy_object overload.

This is a prerequisite for implementing sstable clone
on object storage.
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
1702d6e6d4 s3_client: pass through abort_source in copy_object
The abort_source parameter in s3::client::copy_object
was ignored — the function accepted it but always passed
nullptr to the underlying copy_s3_object. Forward it
properly so callers can cancel in-progress copies.
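The shape of the bug and the fix can be modelled as follows (function names mirror the commit; the token dict stands in for seastar's abort_source):

```python
# Model of the fix: forward the caller's cancel token instead of dropping it.
def copy_s3_object(src, dst, abort_source):
    if abort_source and abort_source["aborted"]:
        raise RuntimeError("copy aborted")
    return (src, dst)

def copy_object_buggy(src, dst, abort_source):
    return copy_s3_object(src, dst, None)          # parameter silently ignored

def copy_object_fixed(src, dst, abort_source):
    return copy_s3_object(src, dst, abort_source)  # forwarded properly

abort = {"aborted": True}
copy_object_buggy("a", "b", abort)   # bug: copy cannot be cancelled
try:
    copy_object_fixed("a", "b", abort)
    assert False
except RuntimeError:
    pass                             # fixed path honours the abort
```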
2026-04-07 18:16:52 +03:00
Ernest Zaslavsky
bfdc1e5267 gcp_client: fix copy_object request method and body
The GCP copy_object (rewrite API) had two bugs:

1. The request body was an empty string, but the GCP
   rewrite endpoint always parses it as JSON metadata.
   An empty string is not valid JSON, resulting in
   400 "Metadata in the request couldn't decode".
   Fix: send "{}" (empty JSON object) as the body.

2. The HTTP method was PUT, but the GCP Objects: rewrite
   API requires POST per the documentation.
   Fix: use POST.

Test coverage is added in a follow-up patch.
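The corrected request shape can be sketched as below (illustrative; the real call is built by the GCS client wrapper against the JSON API):

```python
# Shape of the corrected Objects: rewrite request.
def make_rewrite_request(src_bucket, src_obj, dst_bucket, dst_obj):
    return {
        "method": "POST",   # Objects: rewrite requires POST, not PUT
        "path": (f"/storage/v1/b/{src_bucket}/o/{src_obj}"
                 f"/rewriteTo/b/{dst_bucket}/o/{dst_obj}"),
        "body": "{}",       # empty JSON object; "" is rejected as invalid JSON
    }

req = make_rewrite_request("src-bucket", "a.db", "dst-bucket", "a.db")
assert req["method"] == "POST" and req["body"] == "{}"
```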
2026-04-07 18:16:52 +03:00
Michał Jadwiszczak
9cf94116c2 db/view/view_building_worker: fix indentation 2026-04-07 16:12:04 +02:00
Michał Jadwiszczak
c9aa5bb09c db/view/view_building_worker: lock staging sstables mutex for necessary shards when creating tasks
To create `process_staging` view building tasks, we first need to
collect information about them on shard 0, create the necessary
mutations, commit them to group0, and move the staging sstable objects
to their original shards.

But there is a possible race after committing the group0 command
and before moving the staging sstables to their shards.
Between those two events, the coordinator may schedule the freshly
created tasks and dispatch them to the worker, but the worker won't
have the sstable objects because they haven't been moved yet.

This patch fixes the race by holding the `_staging_sstables_mutex`
locks of the necessary shards while executing
`create_staging_sstable_tasks()`. With this, even if a task is
scheduled and dispatched quickly, the worker will wait to execute it
until the sstable objects have been moved and the locks are released.
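The ordering guarantee can be modelled with plain locks (a conceptual sketch; names are illustrative and the real code uses per-shard seastar mutexes, not threads):

```python
import threading

# The creation path holds the per-shard mutexes across commit + move, so a
# worker dispatched early blocks until the sstable objects have been moved.
shard_mutexes = [threading.Lock() for _ in range(2)]
moved = [False, False]

def create_staging_sstable_tasks(shards):
    for s in shards:
        shard_mutexes[s].acquire()      # taken before committing to group0
    try:
        # ... commit group0 command, then move sstables to their shards ...
        for s in shards:
            moved[s] = True
    finally:
        for s in shards:
            shard_mutexes[s].release()

def worker_execute(shard):
    with shard_mutexes[shard]:          # worker waits here if move not done
        assert moved[shard]

create_staging_sstable_tasks([0, 1])
worker_execute(0)
```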

Fixes SCYLLADB-816
2026-04-07 16:11:45 +02:00
Andrzej Jackowski
ec274cf7b6 test: add test_upgrade_preserves_ddl_audit_for_tables
Verify that upgrading from 2025.1 to master does not silently drop DDL
auditing for table-scoped audit configurations (SCYLLADB-1155).

Test time in dev: 4s

Refs: SCYLLADB-1155
Fixes: SCYLLADB-1305
2026-04-03 13:53:28 +02:00
Andrzej Jackowski
9c7b7ac3e3 test: audit: split validate helper so callers need not pass audit_settings
The old execute_and_validate_audit_entry required every caller to
pass audit_settings so it could decide internally whether to expect
an entry. A test added later in this series needs to simply assert
an entry was produced, without specifying audit_settings at all.

Split into two methods:
- execute_and_validate_new_audit_entry: unconditionally expects an
  audit entry.
- execute_and_validate_if_category_enabled: checks audit_settings
  to decide whether to expect an entry or assert absence.

Local wrapper functions and **kwargs forwarding are removed in favor
of explicit arguments at each call site, and expected-error cases are
handled inline with assert_invalid + assert_entries_were_added.
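The split can be sketched as two helpers with the shapes described above (signatures are hypothetical simplifications of the test utilities):

```python
# Sketch of the two helpers after the split.
def execute_and_validate_new_audit_entry(execute, count_new_entries):
    execute()
    assert count_new_entries() > 0          # unconditionally expect an entry

def execute_and_validate_if_category_enabled(execute, count_new_entries,
                                             audit_settings, category):
    execute()
    if category in audit_settings:
        assert count_new_entries() > 0      # entry expected
    else:
        assert count_new_entries() == 0     # assert absence

# A caller that just wants "an entry was produced" no longer passes settings:
execute_and_validate_new_audit_entry(lambda: None, lambda: 1)
execute_and_validate_if_category_enabled(lambda: None, lambda: 0,
                                         audit_settings={"DML"}, category="DDL")
```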
2026-04-03 13:52:47 +02:00
Andrzej Jackowski
189bff1d5c test: audit: declare manager attribute in AuditTester base class
AuditTester uses self.manager throughout but never declares it.
The attribute is only assigned in the CQLAuditTester subclass
__init__, so the type checker reports 'Attribute "manager" is
unknown' on every self.manager reference in the base class.

Add an __init__ to AuditTester that accepts and stores the manager
instance, and update CQLAuditTester to forward it via super().__init__
instead of assigning self.manager directly.
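The pattern is the standard base-class-declares-the-attribute idiom (class names mirror the commit; the extra subclass state is illustrative):

```python
# Declare the attribute in the base class so type checkers see it on
# every self.manager use.
class AuditTester:
    def __init__(self, manager):
        self.manager = manager          # now known to the type checker

class CQLAuditTester(AuditTester):
    def __init__(self, manager, session=None):
        super().__init__(manager)       # forward instead of assigning directly
        self.session = session          # illustrative subclass-only state

t = CQLAuditTester(manager="mgr")
assert t.manager == "mgr"
```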
2026-04-03 13:52:47 +02:00
Marcin Maliszkiewicz
a74665b300 transport: add per-service-level pending response memory metric
Track the total memory consumed by responses waiting to be
written to the socket, exposed as a per-scheduling-group gauge
(cql_pending_response_memory). This complements the response
memory accounting added in the previous commits by giving
visibility into how much memory each service level is holding
in unsent response buffers.
2026-04-01 17:15:28 +02:00
Andrzej Jackowski
cccb014747 test: ldap: add regression test for double-free on unregistered message ID
Sends a search via the raw LDAP handle (bypassing _msgid_to_promise
registration), then triggers poll_results() through the public API
to exercise the unregistered-ID branch.

Refs: SCYLLADB-1344
2026-04-01 12:57:50 +02:00
Andrzej Jackowski
f0028c06dc ldap: fix double-free of LDAPMessage in poll_results()
In the unregistered-ID branch, ldap_msgfree() was called on a result
already owned by an RAII ldap_msg_ptr, causing a double-free on scope
exit. Remove the redundant manual free.
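The ownership bug can be modelled like this (a conceptual Python stand-in for the C++ RAII owner; names mirror the commit):

```python
# An RAII owner frees on scope exit, so a manual free of the same
# message inside the scope would free it twice.
class LdapMsgPtr:
    """Toy stand-in for the RAII ldap_msg_ptr."""
    def __init__(self, msg, free):
        self.msg, self.free = msg, free
    def __enter__(self):
        return self.msg
    def __exit__(self, *exc):
        self.free(self.msg)             # frees once, on scope exit

freed = []
def ldap_msgfree(msg):
    assert msg not in freed, "double free!"
    freed.append(msg)

with LdapMsgPtr("result", ldap_msgfree) as msg:
    pass    # fixed code: no manual ldap_msgfree(msg) inside the scope
assert freed == ["result"]
```

Had the buggy version called `ldap_msgfree(msg)` inside the scope, the second free on scope exit would trip the double-free check.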

Fixes: SCYLLADB-1344
2026-04-01 10:35:13 +02:00
Marcin Maliszkiewicz
a26ca0f5f7 transport: hold memory permit until response write completes
Capture the memory permit in the leave lambda's .finally()
continuation so that the semaphore units are kept alive until
write_response finishes, preventing premature release of
memory accounting.

This is especially important with slow networks and big responses,
when buffers can accumulate and deplete the node's memory.
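A toy model of the lifetime fix (illustrative; the real code captures seastar semaphore units in a `.finally()` continuation):

```python
# The permit (semaphore units) must stay alive until the response write
# finishes, not until the request handler returns.
class Semaphore:
    def __init__(self, units): self.units = units
    def take(self, n): self.units -= n; return n
    def give(self, n): self.units += n

sem = Semaphore(100)

def handle_request():
    permit = sem.take(10)
    def write_response_then_release():
        # ... slow socket write happens here ...
        sem.give(permit)        # released only after the write completes
    return write_response_then_release

finish_write = handle_request()
assert sem.units == 90          # still accounted while the buffer is queued
finish_write()
assert sem.units == 100
```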
2026-03-31 14:05:00 +02:00
Marcin Maliszkiewicz
2645b95888 transport: account for response size exceeding initial memory estimate
After obtaining the CQL response, check if its actual size exceeds
the initially acquired memory permit. If so, take semaphore units
and adopt them into the permit (non-blocking).

This doesn't fully prevent allocating too much memory, since the size
is only known once the buffer is already allocated, but it improves
memory accounting for big responses.
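The accounting adjustment can be sketched as (hypothetical names; the real code adopts seastar semaphore units into an existing permit):

```python
# If the real response is bigger than the initial estimate, take the
# difference non-blockingly and fold it into the existing permit.
class Permit:
    def __init__(self, units): self.units = units
    def adopt(self, extra): self.units += extra

def account_actual_size(permit, actual_size, try_take_units):
    if actual_size > permit.units:
        extra = try_take_units(actual_size - permit.units)  # non-blocking
        permit.adopt(extra)

permit = Permit(units=1024)
account_actual_size(permit, 4096, lambda n: n)  # pretend units were available
assert permit.units == 4096
```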
2026-03-31 11:57:41 +02:00
Ernest Zaslavsky
1d779804a0 scripts: remove lua library rename workaround from comparison script
Now that cmake/FindLua.cmake uses pkg-config (matching configure.py),
both build systems resolve to the same 'lua' library name.  Remove the
lua/lua-5.4 entries from _KNOWN_LIB_ASYMMETRIES and add 'm' (math
library) as a known transitive dependency that configure.py gets via
pkg-config for lua.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
c32851b102 cmake: add custom FindLua using pkg-config to match configure.py
CMake's built-in FindLua resolves to the versioned library file
(e.g. liblua-5.4.so) instead of the unversioned symlink (liblua.so),
causing a library name mismatch between the two build systems.
Add a custom cmake/FindLua.cmake that uses pkg-config — matching
configure.py's approach — and find_library(NAMES lua) to find the
unversioned symlink.  This also mirrors the pattern used by other
Find modules in cmake/ (FindxxHash, Findlz4, etc.).
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
f3a91df0b4 test/cmake: add missing tests to boost test suite
Add symmetric_key_test (standalone, links encryption library) and
auth_cache_test to the combined_tests binary. These tests already
exist in configure.py; this aligns the CMake build.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
de606cc17a test/cmake: remove per-test LTO disable
The per-test -fno-lto link option is now redundant since -fno-lto
was added globally in mode.common.cmake. LTO-enabled targets
(the scylla binary in RelWithDebInfo) override it via enable_lto().
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
38ba58567a cmake: add BOOST_ALL_DYN_LINK and strip per-component defines
Match configure.py's Boost handling:
- Add BOOST_ALL_DYN_LINK when using shared Boost libraries.
- Strip per-component defines (BOOST_UNIT_TEST_FRAMEWORK_DYN_LINK,
  BOOST_REGEX_DYN_LINK, etc.) that CMake's Boost package config
  adds on imported targets. configure.py only uses the umbrella
  BOOST_ALL_DYN_LINK define.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
7e72898150 cmake: move SEASTAR_TESTING_MAIN after seastar and abseil subdirs
Place add_compile_definitions(SEASTAR_TESTING_MAIN) after both
add_subdirectory(seastar) and add_subdirectory(abseil) are processed.
This matches configure.py's global define without leaking into
seastar's subdirectory build (which would cause a duplicate main
symbol in seastar_testing).
Remove the now-redundant per-test SEASTAR_TESTING_MAIN compile
definition from test/CMakeLists.txt.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
b0837ead3e cmake: add -fno-sanitize=vptr for abseil sanitizer flags
Match configure.py line 2192: abseil gets sanitizer flags with
-fno-sanitize=vptr to exclude vptr checks which are incompatible
with abseil's usage of type-punning patterns.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
dd829fa69c cmake: align Seastar build configuration with configure.py
- Set BUILD_SHARED_LIBS based on build type to match configure.py's
  build_seastar_shared_libs: Debug and Dev build Seastar as a shared
  library, all other modes build it static.
- Add sanitizer link options on the seastar target for Coverage
  mode. Seastar's CMake only activates sanitizer targets for
  Debug/Sanitize configs, but Coverage mode needs them too since
  configure.py's seastar_libs_coverage carries -fsanitize flags.
2026-03-29 16:17:45 +03:00
Ernest Zaslavsky
52e4d44a75 cmake: align global compile defines and options with configure.py
- Disable CMake's automatic -fcolor-diagnostics injection for
  Clang+Ninja (CMake 3.24+), matching configure.py which does not
  add any color diagnostics flags.
- Add SEASTAR_NO_EXCEPTION_HACK and XXH_PRIVATE_API as global
  defines (previously SEASTAR_NO_EXCEPTION_HACK was only on the
  seastar target as PRIVATE; it needs to be project-wide).
- Add -fpch-validate-input-files-content to check precompiled
  header content when timestamps don't match.
2026-03-29 16:17:44 +03:00
Ernest Zaslavsky
6f2fe3c2fc cmake: fix Coverage mode in mode.Coverage.cmake
Fix multiple deviations from configure.py's coverage mode:
- Remove -fprofile-list from CMAKE_CXX_FLAGS_COVERAGE. That flag
  belongs in COVERAGE_INST_FLAGS applied to other modes, not to
  coverage mode itself.
- Replace incorrect defines (DEBUG, SANITIZE, DEBUG_LSA_SANITIZER,
  SCYLLA_ENABLE_ERROR_INJECTION) with the correct Seastar debug
  defines (SEASTAR_DEBUG, SEASTAR_DEFAULT_ALLOCATOR, etc.) that
  configure.py's pkg-config query produces for coverage mode.
- Add sanitizer and stack-clash-protection compile flags for
  Coverage config, matching the flags that Seastar's pkg-config
  --cflags output includes for debug builds.
- Change CMAKE_STATIC_LINKER_FLAGS_COVERAGE to
  CMAKE_EXE_LINKER_FLAGS_COVERAGE. Coverage flags need to reach
  the executable linker, not the static archiver.
2026-03-29 16:17:44 +03:00
Ernest Zaslavsky
7d23ba7dc8 cmake: align mode.common.cmake flags with configure.py
Add three flag-alignment changes:
- -Wno-error=stack-usage= alongside the stack-usage threshold flag,
  preventing hard errors from stack-usage warnings (matching
  configure.py behavior).
- -fno-lto global link option. configure.py adds -fno-lto to all
  binaries; LTO-enabled targets override it via enable_lto().
- Sanitizer link flags (-fsanitize=address, -fsanitize=undefined) for
  Debug/Sanitize configs, matching configure.py's cxx_ld_flags.
2026-03-29 16:17:44 +03:00
Ernest Zaslavsky
38088a8a94 configure.py: add sstable_tablet_streaming to combined_tests 2026-03-29 16:17:44 +03:00
Ernest Zaslavsky
33bca2428a docs: add compare-build-systems.md
Document the purpose, usage, and examples for
scripts/compare_build_systems.py which compares the configure.py
and CMake build systems by parsing their ninja build files.
2026-03-29 16:17:44 +03:00
Ernest Zaslavsky
d3972369a0 scripts: add compare_build_systems.py to compare ninja build files
Add a script that compares configure.py and CMake build systems by
parsing their generated build.ninja files. The script checks:
  - Per-file compilation flags (defines, warnings, optimization)
  - Link target sets (detect missing/extra targets)
  - Per-target linker flags and libraries

configure.py is treated as the baseline. CMake should match it.
Both systems are always configured into a temporary directory so the
user's build tree is never touched.

Usage:
  scripts/compare_build_systems.py -m dev   # single mode
  scripts/compare_build_systems.py          # all modes
  scripts/compare_build_systems.py --ci     # CI mode (strict)
2026-03-29 16:17:44 +03:00
Pavel Emelyanov
2d8540f1ee transport: fix process_startup cert-auth path missing connection-ready setup
When authenticate() returns a user directly (certificate-based auth,
introduced in 20e9619bb1), process_startup was missing the same
post-authentication bookkeeping that the no-auth and SASL paths perform:

  - update_scheduling_group(): without it, the connection runs under the
    default scheduling group instead of the one mapped to the user's
    service level.

  - _authenticating = false / _ready = true: without them,
    system.clients reports connection_stage = AUTHENTICATING forever
    instead of READY.

  - on_connection_ready(): without it, the connection never releases its
    slot in the uninitialized-connections concurrency semaphore (acquired
    at connection creation), leaking one unit per cert-authenticated
    connection for the lifetime of the connection.

The omission was introduced when on_connection_ready() was added to the
else and SASL branches in 474e84199c but the cert-auth branch was missed.
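The post-authentication bookkeeping that every branch must run can be modelled as a checklist (the class is illustrative; method names mirror the commit):

```python
# Conceptual model of the steps each process_startup branch must perform
# once a user is established.
class Connection:
    def __init__(self, release_uninitialized_slot):
        self._authenticating = True
        self._ready = False
        self.scheduling_group = "default"
        self._release = release_uninitialized_slot

    def finish_auth(self, user_scheduling_group):
        self.scheduling_group = user_scheduling_group  # update_scheduling_group()
        self._authenticating = False                   # system.clients: READY
        self._ready = True
        self._release()                                # on_connection_ready()

released = []
conn = Connection(lambda: released.append(True))
conn.finish_auth("sl:myuser")
assert conn._ready and not conn._authenticating and released
```

Skipping `finish_auth` on one branch reproduces all three symptoms: wrong scheduling group, stuck AUTHENTICATING stage, and a leaked semaphore slot.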

Fixes: 20e9619bb1 ("auth: support certificate-based authentication")

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-24 18:02:46 +03:00
Pavel Emelyanov
da6fe14035 transport: test that connection_stage is READY after auth via all process_startup paths
The cert-auth path in process_startup (introduced in 20e9619bb1) was
missing _ready = true, _authenticating = false, update_scheduling_group()
and on_connection_ready(). The result is that connections authenticated
via certificate show connection_stage = AUTHENTICATING in system.clients
forever, run under the wrong service-level scheduling group, and hold
the uninitialized-connections semaphore slot for the lifetime of the
connection.

Add a parametrized cluster test that verifies all three process_startup
branches result in connection_stage = READY:
  - allow_all: AllowAllAuthenticator (no-auth path)
  - password:  PasswordAuthenticator (SASL/process_auth_response path)
  - cert_bypass: CertificateAuthenticator with transport_early_auth_bypass
                 error injection (cert-auth path -- the buggy one)

The injection is added to certificate_authenticator::authenticate() so
tests can bypass actual TLS certificate parsing while still exercising
the cert-auth code path in process_startup.

The cert_bypass case is marked xfail until the bug is fixed.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-24 18:01:28 +03:00
Piotr Dulikowski
63067f594d strong_consistency: fake taking and dropping snapshots
Snapshots are not implemented yet for strong consistency - attempting to
take, transfer or drop a snapshot results in an exception. However, the
logic of our state machine forces snapshot transfer even if there are no
lagging replicas - every raft::server::configuration::snapshot_threshold
log entries. We have actually encountered an issue in our benchmarks
where snapshots were being taken even though the cluster was not under
any disruption, and this is one of the possible causes.

It turns out that we can safely allow taking snapshots right now -
we can just implement it as a no-op and return a random UUID.
Likewise, dropping a snapshot can also be a no-op. This is safe
because snapshot transfer still throws an exception - as long as the
taken/recovered snapshots are never actually transferred.
2026-03-23 17:03:36 +01:00
Piotr Dulikowski
dd1d3dd1ee strong_consistency: adjust limits for snapshots
Raft snapshots are not implemented yet for strong consistency. Adjust
the current raft group config to make them much less likely to occur:

- snapshot_threshold config option decides how many log entries need to
  be applied after the last snapshot before a new one is taken. Set it
  to the maximum value for size_t in order to effectively disable it.
- snapshot_threshold_log_size defines a threshold for the log memory
  usage over which a snapshot is created. Increase it from the default
  2MB to 10MB.
- max_log_size defines the threshold for the log memory usage over which
  requests stop being admitted until the log is shrunk back by a
  snapshot. Set it to 20MB, as this option is recommended to be at least
  twice snapshot_threshold_log_size.
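The adjusted values and their invariant, written out (illustrative constants matching the description):

```python
# Adjusted raft group limits as described above.
SIZE_T_MAX = 2**64 - 1                           # effectively disables the trigger
snapshot_threshold = SIZE_T_MAX                  # entry-count trigger: off
snapshot_threshold_log_size = 10 * 1024 * 1024   # 10MB, up from the 2MB default
max_log_size = 20 * 1024 * 1024                  # 20MB

# Recommended invariant: max_log_size >= 2 * snapshot_threshold_log_size
assert max_log_size >= 2 * snapshot_threshold_log_size
```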

Refs: SCYLLADB-1115
2026-03-23 17:03:36 +01:00
Pavel Emelyanov
57ef712243 test/backup: drop create_dataset helper
It has no more callers after the previous patch.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-23 17:01:20 +03:00
Pavel Emelyanov
2353091cbd test/backup: use new_test_keyspace in test_restore_primary_replica
Replace create_dataset + manual DROP/CREATE KEYSPACE with two sequential
new_test_keyspace context manager blocks, matching the pattern used by
do_test_streaming_scopes. The first block covers backup, the second
covers restore. Keyspace lifecycle is now automatic.

The streaming directions validation loop is moved outside of the second
context block, since it only parses logs and has no dependency on the
keyspace being alive.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-23 16:59:47 +03:00
Pavel Emelyanov
c114d1b82c api: Inline describe_ring JSON handling
There are two helpers for describe_ring endpoint. Both can be squashed
together for code brevity.

Also, while at it, validate the "keyspace" parameter, which the
endpoint previously did not validate properly.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:51:32 +03:00
Pavel Emelyanov
9a2e583f29 storage_service: Make describe_ring_for_table() take table_id
All callers already have it. It makes no difference for the method
itself with which table identifier to work, but will help to simplify
the flow in API handler (next patch)

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:49:24 +03:00
Pavel Emelyanov
4bc8ec174c repair: Remove db/config.hh from repair/*.cc files
Now all the code uses repair_service::config and no longer needs global
config description.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-20 19:36:50 +03:00
Pavel Emelyanov
35f625e5c7 repair: Move repair_multishard_reader options onto repair_service::config
This actually uses two interconnected options:
repair_multishard_reader_buffer_hint_size and
repair_multishard_reader_enable_read_ahead.

Both are propagated through repair_service::config and pass their
values to repair_reader/make_reader at construction time.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:36:50 +03:00
Pavel Emelyanov
9bc0d27aae repair: Move critical_disk_utilization_level onto repair_service::config
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:23:47 +03:00
Pavel Emelyanov
80aa0fcdc2 repair: Move repair_partition_count_estimation_ratio onto repair_service::config
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:23:47 +03:00
Pavel Emelyanov
585cb0c718 repair: Move repair_hints_batchlog_flush_cache_time_in_ms onto repair_service::config
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:23:47 +03:00
Pavel Emelyanov
d8f7f86e10 repair: Move enable_small_table_optimization_for_rbno onto repair_service::config
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-20 19:23:47 +03:00
Pavel Emelyanov
38a23ff927 repair: Introduce repair_service::config
Most other services have their own configs; repair still uses the
global db::config.

Add an empty config struct to repair_service to carry db::config options
the repair service needs.

Subsequent patches will populate the struct with options.

The config is created in main.cc as sharded_parameter because all future
options are live-updateable and should capture their source from
db::config on the correct shard.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2026-03-20 19:23:47 +03:00
136 changed files with 5588 additions and 1072 deletions


@@ -7,6 +7,11 @@ on:
- synchronize
- reopened
permissions:
contents: read
pull-requests: write
statuses: write
jobs:
validate_pr_author_email:
uses: scylladb/github-automation/.github/workflows/validate_pr_author_email.yml@main


@@ -2,6 +2,12 @@ cmake_minimum_required(VERSION 3.27)
project(scylla)
# Disable CMake's automatic -fcolor-diagnostics injection (CMake 3.24+ adds
# it for Clang+Ninja). configure.py does not add any color diagnostics flags,
# so we clear the internal CMake variable to prevent injection.
set(CMAKE_CXX_COMPILE_OPTIONS_COLOR_DIAGNOSTICS "")
set(CMAKE_C_COMPILE_OPTIONS_COLOR_DIAGNOSTICS "")
list(APPEND CMAKE_MODULE_PATH
${CMAKE_CURRENT_SOURCE_DIR}/cmake
${CMAKE_CURRENT_SOURCE_DIR}/seastar/cmake)
@@ -51,6 +57,16 @@ set(CMAKE_CXX_EXTENSIONS ON CACHE INTERNAL "")
set(CMAKE_CXX_SCAN_FOR_MODULES OFF CACHE INTERNAL "")
set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
# Global defines matching configure.py
# Since gcc 13, libgcc doesn't need the exception workaround
add_compile_definitions(SEASTAR_NO_EXCEPTION_HACK)
# Hacks needed to expose internal APIs for xxhash dependencies
add_compile_definitions(XXH_PRIVATE_API)
# SEASTAR_TESTING_MAIN is added later (after add_subdirectory(seastar) and
# add_subdirectory(abseil)) to avoid leaking into the seastar subdirectory.
# If SEASTAR_TESTING_MAIN is defined globally before seastar, it causes a
# duplicate 'main' symbol in seastar_testing.
if(is_multi_config)
find_package(Seastar)
# this is atypical compared to standard ExternalProject usage:
@@ -98,10 +114,31 @@ else()
set(Seastar_IO_URING ON CACHE BOOL "" FORCE)
set(Seastar_SCHEDULING_GROUPS_COUNT 21 CACHE STRING "" FORCE)
set(Seastar_UNUSED_RESULT_ERROR ON CACHE BOOL "" FORCE)
# Match configure.py's build_seastar_shared_libs: Debug and Dev
# build Seastar as a shared library, others build it static.
if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "Dev")
set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)
else()
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
endif()
add_subdirectory(seastar)
target_compile_definitions (seastar
PRIVATE
SEASTAR_NO_EXCEPTION_HACK)
# Coverage mode sets cmake_build_type='Debug' for Seastar
# (configure.py:515), so Seastar's pkg-config output includes sanitizer
# link flags in seastar_libs_coverage (configure.py:2514,2649).
# Seastar's own CMake only activates sanitizer targets for Debug/Sanitize
# configs, so we inject link options on the seastar target for Coverage.
# Using PUBLIC ensures they propagate to all targets linking Seastar
# (but not standalone tools like patchelf), matching configure.py's
# behavior. Compile-time flags and defines are handled globally in
# cmake/mode.Coverage.cmake.
if(CMAKE_BUILD_TYPE STREQUAL "Coverage")
target_link_options(seastar
PUBLIC
-fsanitize=address
-fsanitize=undefined
-fsanitize=vptr)
endif()
endif()
set(ABSL_PROPAGATE_CXX_STD ON CACHE BOOL "" FORCE)
@@ -111,8 +148,10 @@ if(Scylla_ENABLE_LTO)
endif()
find_package(Sanitizers QUIET)
# Match configure.py:2192 — abseil gets sanitizer flags with -fno-sanitize=vptr
# to exclude vptr checks which are incompatible with abseil's usage.
list(APPEND absl_cxx_flags
$<$<CONFIG:Debug,Sanitize>:$<TARGET_PROPERTY:Sanitizers::address,INTERFACE_COMPILE_OPTIONS>;$<TARGET_PROPERTY:Sanitizers::undefined_behavior,INTERFACE_COMPILE_OPTIONS>>)
$<$<CONFIG:Debug,Sanitize>:$<TARGET_PROPERTY:Sanitizers::address,INTERFACE_COMPILE_OPTIONS>;$<TARGET_PROPERTY:Sanitizers::undefined_behavior,INTERFACE_COMPILE_OPTIONS>;-fno-sanitize=vptr>)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
list(APPEND ABSL_GCC_FLAGS ${absl_cxx_flags})
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
@@ -137,9 +176,38 @@ add_library(absl::headers ALIAS absl-headers)
# unfortunately.
set_target_properties(absl_strerror PROPERTIES EXCLUDE_FROM_ALL TRUE)
# Now that seastar and abseil subdirectories are fully processed, add
# SEASTAR_TESTING_MAIN globally. This matches configure.py's global define
# without leaking into seastar (which would cause duplicate main symbols).
add_compile_definitions(SEASTAR_TESTING_MAIN)
# System libraries dependencies
find_package(Boost REQUIRED
COMPONENTS filesystem program_options system thread regex unit_test_framework)
# When using shared Boost libraries, define BOOST_ALL_DYN_LINK (matching configure.py)
if(NOT Boost_USE_STATIC_LIBS)
add_compile_definitions(BOOST_ALL_DYN_LINK)
endif()
# CMake's Boost package config adds per-component defines like
# BOOST_UNIT_TEST_FRAMEWORK_DYN_LINK, BOOST_REGEX_DYN_LINK, etc. on the
# imported targets. configure.py only uses BOOST_ALL_DYN_LINK (which covers
# all components), so strip the per-component defines to align the two build
# systems.
foreach(_boost_target
Boost::unit_test_framework
Boost::regex
Boost::filesystem
Boost::program_options
Boost::system
Boost::thread)
if(TARGET ${_boost_target})
# Completely remove all INTERFACE_COMPILE_DEFINITIONS from the Boost target.
# This prevents per-component *_DYN_LINK and *_NO_LIB defines from
# propagating. BOOST_ALL_DYN_LINK (set globally) covers all components.
set_property(TARGET ${_boost_target} PROPERTY INTERFACE_COMPILE_DEFINITIONS)
endif()
endforeach()
target_link_libraries(Boost::regex
INTERFACE
ICU::i18n
@@ -196,6 +264,10 @@ if (Scylla_USE_PRECOMPILED_HEADER)
message(STATUS "Using precompiled header for Scylla - remember to add `sloppiness = pch_defines,time_macros` to ccache.conf, if you're using ccache.")
target_precompile_headers(scylla-precompiled-header PRIVATE "stdafx.hh")
target_compile_definitions(scylla-precompiled-header PRIVATE SCYLLA_USE_PRECOMPILED_HEADER)
# Match configure.py: -fpch-validate-input-files-content tells the compiler
# to check content of stdafx.hh if timestamps don't match (important for
# ccache/git workflows where timestamps may not be preserved).
add_compile_options(-fpch-validate-input-files-content)
endif()
else()
set(Scylla_USE_PRECOMPILED_HEADER_USE OFF)

abseil

Submodule abseil updated: d7aaad83b4...255c84dadd


@@ -1122,6 +1122,7 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
cdc::options opts;
opts.enabled(true);
// cdc::delta_mode is ignored by Alternator, so aim for the least overhead.
opts.set_delta_mode(cdc::delta_mode::keys);
opts.ttl(std::chrono::duration_cast<std::chrono::seconds>(dynamodb_streams_max_window).count());


@@ -743,7 +743,7 @@
"parameters":[
{
"name":"tag",
"description":"the tag given to the snapshot",
"description":"The snapshot tag to delete. If omitted, all snapshots are removed.",
"required":false,
"allowMultiple":false,
"type":"string",
@@ -751,7 +751,7 @@
},
{
"name":"kn",
"description":"Comma-separated keyspaces name that their snapshot will be deleted",
"description":"Comma-separated list of keyspace names to delete snapshots from. If omitted, snapshots are deleted from all keyspaces.",
"required":false,
"allowMultiple":false,
"type":"string",
@@ -759,7 +759,7 @@
},
{
"name":"cf",
"description":"an optional table name that its snapshot will be deleted",
"description":"A table name used to filter which table's snapshots are deleted. If omitted or empty, snapshots for all tables are eligible. When provided together with 'kn', the table is looked up in each listed keyspace independently. For secondary indexes, the logical index name (e.g. 'myindex') can be used and is resolved automatically.",
"required":false,
"allowMultiple":false,
"type":"string",


@@ -23,7 +23,7 @@ void set_error_injection(http_context& ctx, routes& r) {
hf::enable_injection.set(r, [](std::unique_ptr<request> req) -> future<json::json_return_type> {
sstring injection = req->get_path_param("injection");
bool one_shot = req->get_query_param("one_shot") == "True";
bool one_shot = strcasecmp(req->get_query_param("one_shot").c_str(), "true") == 0;
auto params = co_await util::read_entire_stream_contiguous(*req->content_stream);
const size_t max_params_size = 1024 * 1024;


@@ -573,14 +573,6 @@ void unset_view_builder(http_context& ctx, routes& r) {
cf::get_built_indexes.unset(r);
}
static future<json::json_return_type> describe_ring_as_json(sharded<service::storage_service>& ss, sstring keyspace) {
co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring(keyspace), token_range_endpoints_to_json));
}
static future<json::json_return_type> describe_ring_as_json_for_table(const sharded<service::storage_service>& ss, sstring keyspace, sstring table) {
co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring_for_table(keyspace, table), token_range_endpoints_to_json));
}
namespace {
template <typename Key, typename Value>
storage_service_json::mapper map_to_json(const std::pair<Key, Value>& i) {
@@ -678,13 +670,16 @@ rest_describe_ring(http_context& ctx, sharded<service::storage_service>& ss, std
if (!req->param.exists("keyspace")) {
throw bad_param_exception("The keyspace param is not provided");
}
auto keyspace = req->get_path_param("keyspace");
auto keyspace = validate_keyspace(ctx, req);
auto table = req->get_query_param("table");
utils::chunked_vector<dht::token_range_endpoints> ranges;
if (!table.empty()) {
validate_table(ctx.db.local(), keyspace, table);
return describe_ring_as_json_for_table(ss, keyspace, table);
auto table_id = validate_table(ctx.db.local(), keyspace, table);
ranges = co_await ss.local().describe_ring_for_table(table_id);
} else {
ranges = co_await ss.local().describe_ring(keyspace);
}
return describe_ring_as_json(ss, validate_keyspace(ctx, req));
co_return json::json_return_type(stream_range_as_array(std::move(ranges), token_range_endpoints_to_json));
}
static
@@ -2118,6 +2113,8 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, opts);
}
co_return json_void();
} catch (const data_dictionary::no_such_column_family& e) {
throw httpd::bad_param_exception(e.what());
} catch (...) {
apilog.error("take_snapshot failed: {}", std::current_exception());
throw;
@@ -2154,6 +2151,8 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
try {
co_await snap_ctl.local().clear_snapshot(tag, keynames, column_family);
co_return json_void();
} catch (const data_dictionary::no_such_column_family& e) {
throw httpd::bad_param_exception(e.what());
} catch (...) {
apilog.error("del_snapshot failed: {}", std::current_exception());
throw;


@@ -14,6 +14,7 @@
#include <fmt/ranges.h>
#include "utils/to_string.hh"
#include "utils/error_injection.hh"
#include "data_dictionary/data_dictionary.hh"
#include "cql3/query_processor.hh"
#include "db/config.hh"
@@ -105,6 +106,9 @@ auth::authentication_option_set auth::certificate_authenticator::alterable_optio
}
future<std::optional<auth::authenticated_user>> auth::certificate_authenticator::authenticate(session_dn_func f) const {
if (auto user = utils::get_local_injector().inject_parameter("transport_early_auth_bypass")) {
co_return auth::authenticated_user{sstring(*user)};
}
if (!f) {
co_return std::nullopt;
}

cmake/FindLua.cmake Normal file

@@ -0,0 +1,47 @@
#
# Copyright 2025-present ScyllaDB
#
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
# Custom FindLua module that uses pkg-config, matching configure.py's
# approach. CMake's built-in FindLua resolves to the versioned library
# (e.g. liblua-5.4.so) instead of the unversioned symlink (liblua.so),
# causing a name mismatch between the two build systems.
find_package(PkgConfig REQUIRED)
# configure.py: lua53 on Debian-like, lua on others
pkg_search_module(PC_lua QUIET lua53 lua)
find_library(Lua_LIBRARY
NAMES lua lua5.3 lua53
HINTS
${PC_lua_LIBDIR}
${PC_lua_LIBRARY_DIRS})
find_path(Lua_INCLUDE_DIR
NAMES lua.h
HINTS
${PC_lua_INCLUDEDIR}
${PC_lua_INCLUDE_DIRS})
mark_as_advanced(
Lua_LIBRARY
Lua_INCLUDE_DIR)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Lua
REQUIRED_VARS
Lua_LIBRARY
Lua_INCLUDE_DIR
VERSION_VAR PC_lua_VERSION)
if(Lua_FOUND)
set(LUA_LIBRARIES ${Lua_LIBRARY})
set(LUA_INCLUDE_DIR ${Lua_INCLUDE_DIR})
endif()


@@ -1,5 +1,5 @@
set(CMAKE_CXX_FLAGS_COVERAGE
"-fprofile-instr-generate -fcoverage-mapping -fprofile-list=${CMAKE_SOURCE_DIR}/coverage_sources.list"
"-fprofile-instr-generate -fcoverage-mapping"
CACHE
INTERNAL
"")
@@ -8,18 +8,33 @@ update_build_flags(Coverage
OPTIMIZATION_LEVEL "g")
set(scylla_build_mode_Coverage "coverage")
# Coverage mode sets cmake_build_type='Debug' for Seastar
# (configure.py:515), so Seastar's pkg-config --cflags output
# (configure.py:2252-2267, queried at configure.py:3039) includes debug
# defines, sanitizer compile flags, and -fstack-clash-protection.
# Seastar's CMake generator expressions only activate these for
# Debug/Sanitize configs, so we add them explicitly for Coverage.
set(Seastar_DEFINITIONS_COVERAGE
SCYLLA_BUILD_MODE=${scylla_build_mode_Coverage}
DEBUG
SANITIZE
DEBUG_LSA_SANITIZER
SCYLLA_ENABLE_ERROR_INJECTION)
SEASTAR_DEBUG
SEASTAR_DEFAULT_ALLOCATOR
SEASTAR_SHUFFLE_TASK_QUEUE
SEASTAR_DEBUG_SHARED_PTR
SEASTAR_DEBUG_PROMISE
SEASTAR_TYPE_ERASE_MORE)
foreach(definition ${Seastar_DEFINITIONS_COVERAGE})
add_compile_definitions(
$<$<CONFIG:Coverage>:${definition}>)
endforeach()
set(CMAKE_STATIC_LINKER_FLAGS_COVERAGE
add_compile_options(
$<$<CONFIG:Coverage>:-fsanitize=address>
$<$<CONFIG:Coverage>:-fsanitize=undefined>
$<$<CONFIG:Coverage>:-fsanitize=vptr>
$<$<CONFIG:Coverage>:-fstack-clash-protection>)
set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
"-fprofile-instr-generate -fcoverage-mapping")
maybe_limit_stack_usage_in_KB(40 Coverage)


@@ -131,6 +131,7 @@ function(maybe_limit_stack_usage_in_KB stack_usage_threshold_in_KB config)
check_cxx_compiler_flag(${_stack_usage_threshold_flag} _stack_usage_flag_supported)
if(_stack_usage_flag_supported)
add_compile_options($<$<CONFIG:${config}>:${_stack_usage_threshold_flag}>)
add_compile_options($<$<CONFIG:${config}>:-Wno-error=stack-usage=>)
endif()
endfunction()
@@ -260,6 +261,23 @@ endif()
# Force SHA1 build-id generation
add_link_options("LINKER:--build-id=sha1")
# Match configure.py: add -fno-lto globally. configure.py adds -fno-lto to
# all binaries (except standalone cpp_apps like patchelf) via the per-binary
# $libs variable. LTO-enabled targets (scylla binary in RelWithDebInfo) will
# override with -flto=thin -ffat-lto-objects via enable_lto().
add_link_options(-fno-lto)
# Match configure.py:2633-2636 — sanitizer link flags for standalone binaries
# (e.g. patchelf) that don't link Seastar. Seastar-linked targets get these
# via seastar_libs (configure.py:2649).
# Coverage mode gets sanitizer link flags via the seastar target instead
# (see CMakeLists.txt), matching configure.py where only seastar_libs_coverage
# carries -fsanitize (not cxx_ld_flags).
add_link_options(
$<$<CONFIG:Debug,Sanitize>:-fsanitize=address>
$<$<CONFIG:Debug,Sanitize>:-fsanitize=undefined>)
include(CheckLinkerFlag)
set(Scylla_USE_LINKER
""


@@ -488,6 +488,7 @@ sstable_format: ms
# compressed.
# can be: all - all traffic is compressed
# dc - traffic between different datacenters is compressed
# rack - traffic between different racks is compressed
# none - nothing is compressed.
# internode_compression: none


@@ -1708,6 +1708,7 @@ deps['test/boost/combined_tests'] += [
'test/boost/sstable_compression_config_test.cc',
'test/boost/sstable_directory_test.cc',
'test/boost/sstable_set_test.cc',
'test/boost/sstable_tablet_streaming.cc',
'test/boost/statement_restrictions_test.cc',
'test/boost/storage_proxy_test.cc',
'test/boost/tablets_test.cc',
@@ -2232,16 +2233,20 @@ abseil_libs = ['absl/' + lib for lib in [
'container/libabsl_raw_hash_set.a',
'synchronization/libabsl_synchronization.a',
'synchronization/libabsl_graphcycles_internal.a',
'synchronization/libabsl_kernel_timeout_internal.a',
'debugging/libabsl_stacktrace.a',
'debugging/libabsl_symbolize.a',
'debugging/libabsl_debugging_internal.a',
'debugging/libabsl_demangle_internal.a',
'debugging/libabsl_demangle_rust.a',
'debugging/libabsl_decode_rust_punycode.a',
'debugging/libabsl_utf8_for_code_point.a',
'debugging/libabsl_borrowed_fixup_buffer.a',
'time/libabsl_time.a',
'time/libabsl_time_zone.a',
'numeric/libabsl_int128.a',
'hash/libabsl_hash.a',
'hash/libabsl_city.a',
'hash/libabsl_low_level_hash.a',
'base/libabsl_malloc_internal.a',
'base/libabsl_spinlock_wait.a',
'base/libabsl_base.a',


@@ -201,6 +201,10 @@ public:
return _clustering_columns_restrictions;
}
const expr::expression& get_nonprimary_key_restrictions() const {
return _nonprimary_key_restrictions;
}
// Get a set of columns restricted by the IS NOT NULL restriction.
// IS NOT NULL is a special case that is handled separately from other restrictions.
const std::unordered_set<const column_definition*> get_not_null_columns() const;


@@ -36,6 +36,7 @@
#include "db/schema_tables.hh"
#include "index/secondary_index_manager.hh"
#include "types/concrete_types.hh"
#include "types/vector.hh"
#include "db/tags/extension.hh"
#include "tombstone_gc_extension.hh"
#include "index/secondary_index.hh"
@@ -118,8 +119,51 @@ static data_type type_for_computed_column(cql3::statements::index_target::target
}
}
// Cassandra SAI compatibility: detect the StorageAttachedIndex class name
// used by Cassandra to create vector and metadata indexes.
static bool is_sai_class_name(const sstring& class_name) {
return class_name == "org.apache.cassandra.index.sai.StorageAttachedIndex"
|| boost::iequals(class_name, "storageattachedindex")
|| boost::iequals(class_name, "sai");
}
// Returns true if the custom class name refers to a vector-capable index
// (either ScyllaDB's native vector_index or Cassandra's SAI).
static bool is_vector_capable_class(const sstring& class_name) {
return boost::iequals(class_name, "vector_index");
return class_name == "vector_index" || is_sai_class_name(class_name);
}
// When the custom class is SAI, verify that at least one target is a
// vector column and rewrite the class to ScyllaDB's native "vector_index".
// Non-vector single-column targets and multi-column (local-index partition
// key) targets are skipped — they are treated as filtering columns by
// vector_index::check_target().
static void maybe_rewrite_sai_to_vector_index(
const schema& schema,
const std::vector<::shared_ptr<index_target>>& targets,
index_specific_prop_defs& props) {
if (!props.custom_class || !is_sai_class_name(*props.custom_class)) {
return;
}
for (const auto& target : targets) {
auto* ident = std::get_if<::shared_ptr<column_identifier>>(&target->value);
if (!ident) {
// Multi-column target (local-index partition key) — skip.
continue;
}
auto cd = schema.get_column_definition((*ident)->name());
if (!cd) {
// Nonexistent column — skip; vector_index::validate() will catch it.
continue;
}
if (dynamic_cast<const vector_type_impl*>(cd->type.get())) {
props.custom_class = "vector_index";
return;
}
}
throw exceptions::invalid_request_exception(
"StorageAttachedIndex (SAI) is only supported on vector columns; "
"use a secondary index for non-vector columns");
}
static bool is_vector_index(const index_options_map& options) {
@@ -276,7 +320,7 @@ create_index_statement::validate(query_processor& qp, const service::client_stat
_idx_properties->validate();
// FIXME: This is ugly and can be improved.
const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class);
const bool uses_view_properties = _view_properties.properties()->count() > 0
|| _view_properties.use_compact_storage()
@@ -363,6 +407,8 @@ create_index_statement::validate_while_executing(data_dictionary::database db, l
targets.emplace_back(raw_target->prepare(*schema));
}
maybe_rewrite_sai_to_vector_index(*schema, targets, *_idx_properties);
if (_idx_properties && _idx_properties->custom_class) {
auto custom_index_factory = secondary_index::secondary_index_manager::get_custom_class_factory(*_idx_properties->custom_class);
if (!custom_index_factory) {


@@ -52,6 +52,7 @@ future<shared_ptr<result_message>> modification_statement::execute_without_check
}
auto [coordinator, holder] = qp.acquire_strongly_consistent_coordinator();
const auto mutate_result = co_await coordinator.get().mutate(_statement->s,
keys[0].start()->value().token(),
[&](api::timestamp_type ts) {
@@ -65,7 +66,7 @@ future<shared_ptr<result_message>> modification_statement::execute_without_check
raw_cql_statement, muts.size()));
}
return std::move(*muts.begin());
});
}, timeout, qs.get_client_state().get_abort_source());
using namespace service::strong_consistency;
if (const auto* redirect = get_if<need_redirect>(&mutate_result)) {


@@ -42,7 +42,7 @@ future<::shared_ptr<result_message>> select_statement::do_execute(query_processo
const auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
auto [coordinator, holder] = qp.acquire_strongly_consistent_coordinator();
auto query_result = co_await coordinator.get().query(_query_schema, *read_command,
key_ranges, state.get_trace_state(), timeout);
key_ranges, state.get_trace_state(), timeout, state.get_client_state().get_abort_source());
using namespace service::strong_consistency;
if (const auto* redirect = get_if<need_redirect>(&query_result)) {
@@ -54,4 +54,4 @@ future<::shared_ptr<result_message>> select_statement::do_execute(query_processo
read_command, options, now);
}
}
}


@@ -250,8 +250,8 @@ void keyspace_metadata::validate(const gms::feature_service& fs, const locator::
if (params.consistency && !fs.strongly_consistent_tables) {
throw exceptions::configuration_exception("The strongly_consistent_tables feature must be enabled to use a consistency option");
}
if (params.consistency && *params.consistency == data_dictionary::consistency_config_option::global) {
throw exceptions::configuration_exception("Global consistency is not supported yet");
if (params.consistency && *params.consistency == data_dictionary::consistency_config_option::local) {
throw exceptions::configuration_exception("Local consistency is not supported yet");
}
}


@@ -22,7 +22,7 @@ corrupt_data_handler::corrupt_data_handler(register_metrics rm) {
_metrics.add_group("corrupt_data", {
sm::make_counter("entries_reported", _stats.corrupt_data_reported,
sm::description("Counts the number of corrupt data instances reported to the corrupt data handler. "
"A non-zero value indicates that the database suffered data corruption."))
"A non-zero value indicates that the database suffered data corruption.")).set_skip_when_empty()
});
}
}


@@ -50,9 +50,7 @@ future<> hint_endpoint_manager::do_store_hint(schema_ptr s, lw_shared_ptr<const
size_t mut_size = fm->representation().size();
shard_stats().size_of_hints_in_progress += mut_size;
if (utils::get_local_injector().enter("slow_down_writing_hints")) {
co_await seastar::sleep(std::chrono::seconds(10));
}
co_await utils::get_local_injector().inject("slow_down_writing_hints", std::chrono::seconds(10));
try {
const auto shared_lock = co_await get_shared_lock(file_update_mutex());


@@ -186,7 +186,7 @@ void manager::register_metrics(const sstring& group_name) {
sm::description("Number of unexpected errors during sending, sending will be retried later")),
sm::make_counter("corrupted_files", _stats.corrupted_files,
sm::description("Number of hints files that were discarded during sending because the file was corrupted.")),
sm::description("Number of hints files that were discarded during sending because the file was corrupted.")).set_skip_when_empty(),
sm::make_gauge("pending_drains",
sm::description("Number of tasks waiting in the queue for draining hints"),


@@ -206,7 +206,7 @@ void rate_limiter_base::register_metrics() {
sm::description("Number of times a lookup returned an already allocated entry.")),
sm::make_counter("failed_allocations", _metrics.failed_allocations,
sm::description("Number of times the rate limiter gave up trying to allocate.")),
sm::description("Number of times the rate limiter gave up trying to allocate.")).set_skip_when_empty(),
sm::make_counter("probe_count", _metrics.probe_count,
sm::description("Number of probes made during lookups.")),


@@ -174,7 +174,7 @@ cache_tracker::setup_metrics() {
sm::make_counter("sstable_reader_recreations", sm::description("number of times sstable reader was recreated due to memtable flush"), _stats.underlying_recreations),
sm::make_counter("sstable_partition_skips", sm::description("number of times sstable reader was fast forwarded across partitions"), _stats.underlying_partition_skips),
sm::make_counter("sstable_row_skips", sm::description("number of times sstable reader was fast forwarded within a partition"), _stats.underlying_row_skips),
sm::make_counter("pinned_dirty_memory_overload", sm::description("amount of pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, and any number different than 0 is indicative of a bug"), _stats.pinned_dirty_memory_overload),
sm::make_counter("pinned_dirty_memory_overload", sm::description("amount of pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, and any number different than 0 is indicative of a bug"), _stats.pinned_dirty_memory_overload).set_skip_when_empty(),
sm::make_counter("rows_processed_from_memtable", _stats.rows_processed_from_memtable,
sm::description("total number of rows in memtables which were processed during cache update on memtable flush")),
sm::make_counter("rows_dropped_from_memtable", _stats.rows_dropped_from_memtable,


@@ -18,8 +18,11 @@
#include <seastar/coroutine/parallel_for_each.hh>
#include "db/snapshot-ctl.hh"
#include "db/snapshot/backup_task.hh"
#include "db/schema_tables.hh"
#include "index/secondary_index_manager.hh"
#include "replica/database.hh"
#include "replica/global_table_ptr.hh"
#include "replica/schema_describe_helper.hh"
#include "sstables/sstables_manager.hh"
#include "service/storage_proxy.hh"
@@ -154,14 +157,56 @@ future<> snapshot_ctl::do_take_cluster_column_family_snapshot(std::vector<sstrin
);
}
sstring snapshot_ctl::resolve_table_name(const sstring& ks_name, const sstring& name) const {
try {
_db.local().find_uuid(ks_name, name);
return name;
} catch (const data_dictionary::no_such_column_family&) {
// The name may be a logical index name (e.g. "myindex").
// Only indexes with a backing view have a separate backing table
// that can be snapshotted. Custom indexes such as vector indexes
// do not, so keep rejecting them here rather than mapping them to
// a synthetic name.
auto schema = _db.local().find_indexed_table(ks_name, name);
if (schema) {
const auto& im = schema->all_indices().at(name);
if (db::schema_tables::view_should_exist(im)) {
return secondary_index::index_table_name(name);
}
}
throw;
}
}
future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts) {
for (auto& t : tables) {
t = resolve_table_name(ks_name, t);
}
co_await check_snapshot_not_exist(ks_name, tag, tables);
co_await replica::database::snapshot_tables_on_all_shards(_db, ks_name, std::move(tables), std::move(tag), opts);
}
future<> snapshot_ctl::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names, sstring cf_name) {
return run_snapshot_modify_operation([this, tag = std::move(tag), keyspace_names = std::move(keyspace_names), cf_name = std::move(cf_name)] {
return _db.local().clear_snapshot(tag, keyspace_names, cf_name);
co_return co_await run_snapshot_modify_operation([this, tag = std::move(tag), keyspace_names = std::move(keyspace_names), cf_name = std::move(cf_name)] (this auto) -> future<> {
// clear_snapshot enumerates keyspace_names and uses cf_name as a
// filter in each. When cf_name needs resolution (e.g. logical index
// name -> backing table name), the result may differ per keyspace,
// so resolve and clear individually.
if (!cf_name.empty() && !keyspace_names.empty()) {
std::vector<std::pair<sstring, sstring>> resolved_targets;
resolved_targets.reserve(keyspace_names.size());
// Resolve every keyspace first so a later failure doesn't delete
// snapshots that were already matched in earlier keyspaces.
for (const auto& ks_name : keyspace_names) {
resolved_targets.emplace_back(ks_name, resolve_table_name(ks_name, cf_name));
}
for (auto& [ks_name, resolved_cf_name] : resolved_targets) {
co_await _db.local().clear_snapshot(tag, {ks_name}, std::move(resolved_cf_name));
}
co_return;
}
co_await _db.local().clear_snapshot(std::move(tag), std::move(keyspace_names), cf_name);
});
}
@@ -170,7 +215,26 @@ snapshot_ctl::get_snapshot_details() {
using snapshot_map = std::unordered_map<sstring, db_snapshot_details>;
co_return co_await run_snapshot_list_operation(coroutine::lambda([this] () -> future<snapshot_map> {
return _db.local().get_snapshot_details();
auto details = co_await _db.local().get_snapshot_details();
for (auto& [snapshot_name, snapshot_details] : details) {
for (auto& table : snapshot_details) {
auto schema = _db.local().as_data_dictionary().try_find_table(
table.ks, table.cf);
if (!schema || !schema->schema()->is_view()) {
continue;
}
auto helper = replica::make_schema_describe_helper(
schema->schema(), _db.local().as_data_dictionary());
if (helper.type == schema_describe_helper::type::index) {
table.cf = secondary_index::index_name_from_table_name(
table.cf);
}
}
}
co_return details;
}));
}
@@ -235,4 +299,4 @@ future<int64_t> snapshot_ctl::true_snapshots_size(sstring ks, sstring cf) {
}));
}
}
}


@@ -133,6 +133,12 @@ private:
future<> check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter = {});
// Resolve a user-provided table name that may be a logical index name
// (e.g. "myindex") to its backing column family name (e.g.
// "myindex_index"). Returns the name unchanged if it already
// matches a column family.
sstring resolve_table_name(const sstring& ks_name, const sstring& name) const;
future<> run_snapshot_modify_operation(noncopyable_function<future<>()> &&);
template <typename Func>
@@ -151,4 +157,4 @@ private:
future<> do_take_cluster_column_family_snapshot(std::vector<sstring> ks_names, std::vector<sstring> tables, sstring tag, snapshot_options opts = {});
};
}
}


@@ -210,9 +210,7 @@ future<> view_building_worker::run_staging_sstables_registrator() {
while (!_as.abort_requested()) {
bool sleep = false;
try {
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
co_await create_staging_sstable_tasks();
lock.return_all();
_as.check();
co_await _sstables_to_register_event.when();
} catch (semaphore_aborted&) {
@@ -237,13 +235,45 @@ future<> view_building_worker::run_staging_sstables_registrator() {
}
}
future<std::vector<foreign_ptr<semaphore_units<>>>> view_building_worker::lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards) {
SCYLLA_ASSERT(this_shard_id() == 0);
// Collect `_staging_sstables_mutex` locks from multiple shards,
// so other shards won't interact with their `_staging_sstables` map
// until the caller releases them.
std::vector<foreign_ptr<semaphore_units<>>> locks;
locks.resize(smp::count);
// Locks are acquired from multiple shards in parallel.
// This is the only place where multiple-shard locks are acquired at once
// and the method is called only once at a time (from `create_staging_sstable_tasks()`
// on shard 0), so no deadlock can occur.
co_await coroutine::parallel_for_each(shards, [&locks, &sharded_vbw = container()] (auto shard_id) -> future<> {
auto lock_ptr = co_await smp::submit_to(shard_id, [&sharded_vbw] () -> future<foreign_ptr<semaphore_units<>>> {
auto& vbw = sharded_vbw.local();
auto lock = co_await get_units(vbw._staging_sstables_mutex, 1, vbw._as);
co_return make_foreign(std::move(lock));
});
locks[shard_id] = std::move(lock_ptr);
});
co_return std::move(locks);
}
future<> view_building_worker::create_staging_sstable_tasks() {
// Explicitly lock shard0 beforehand to prevent other shards from modifying `_sstables_to_register` from `register_staging_sstable_tasks()`
auto lock0 = co_await get_units(_staging_sstables_mutex, 1, _as);
if (_sstables_to_register.empty()) {
co_return;
}
utils::chunked_vector<canonical_mutation> cmuts;
auto shards = _sstables_to_register
| std::views::values
| std::views::join
| std::views::transform([] (const auto& sst_info) { return sst_info.shard; })
| std::ranges::to<std::flat_set<shard_id>>();
shards.erase(0); // We're already holding shard0 lock
auto locks = co_await lock_staging_mutex_on_multiple_shards(std::move(shards));
utils::chunked_vector<canonical_mutation> cmuts;
auto guard = co_await _group0.client().start_operation(_as);
auto my_host_id = _db.get_token_metadata().get_topology().my_host_id();
for (auto& [table_id, sst_infos]: _sstables_to_register) {
@@ -696,24 +726,34 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
}
future<> view_building_worker::do_process_staging(table_id table_id, dht::token last_token) {
if (_staging_sstables[table_id].empty()) {
auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
std::vector<sstables::shared_sstable> sstables_to_process;
try {
// Acquire `_staging_sstables_mutex` to prevent `create_staging_sstable_tasks()` from
// concurrently modifying `_staging_sstables` (moving entries from `_sstables_to_register`)
// while we read them.
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
auto tid = tablet_map.get_tablet_id(last_token);
auto tablet_range = tablet_map.get_token_range(tid);
// Select sstables belonging to the tablet (identified by `last_token`)
for (auto& sst: _staging_sstables[table_id]) {
auto sst_last_token = sst->get_last_decorated_key().token();
if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
sstables_to_process.push_back(sst);
}
}
lock.return_all();
} catch (semaphore_aborted&) {
vbw_logger.warn("Semaphore was aborted while waiting to remove processed sstables for table {}", table_id);
co_return;
}
auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
auto tid = tablet_map.get_tablet_id(last_token);
auto tablet_range = tablet_map.get_token_range(tid);
// Select sstables belonging to the tablet (identified by `last_token`)
std::vector<sstables::shared_sstable> sstables_to_process;
for (auto& sst: _staging_sstables[table_id]) {
auto sst_last_token = sst->get_last_decorated_key().token();
if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
sstables_to_process.push_back(sst);
}
if (sstables_to_process.empty()) {
co_return;
}
co_await _vug.process_staging_sstables(std::move(table), sstables_to_process);
try {


@@ -14,6 +14,7 @@
#include <seastar/core/shared_future.hh>
#include <unordered_map>
#include <unordered_set>
#include <flat_set>
#include "locator/abstract_replication_strategy.hh"
#include "locator/tablets.hh"
#include "raft/raft.hh"
@@ -172,10 +173,15 @@ private:
future<> do_process_staging(table_id base_id, dht::token last_token);
future<> run_staging_sstables_registrator();
// Caller must hold units from `_staging_sstables_mutex`
// Acquires `_staging_sstables_mutex` on all shards internally,
// so callers must not hold `_staging_sstables_mutex` when invoking it.
future<> create_staging_sstable_tasks();
future<> discover_existing_staging_sstables();
std::unordered_map<table_id, std::vector<staging_sstable_task_info>> discover_local_staging_sstables(building_tasks building_tasks);
// Acquire `_staging_sstables_mutex` on multiple shards in parallel.
// Must be called only from shard 0.
// Must be called ONLY by `create_staging_sstable_tasks()` and only once at a time to avoid deadlock.
future<std::vector<foreign_ptr<semaphore_units<>>>> lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards);
void init_messaging_service();
future<> uninit_messaging_service();


@@ -99,7 +99,7 @@ public:
set_cell(cr, "up", gossiper.is_alive(hostid));
if (gossiper.is_shutdown(endpoint)) {
set_cell(cr, "status", gossiper.get_gossip_status(endpoint));
set_cell(cr, "status", "shutdown");
} else {
set_cell(cr, "status", boost::to_upper_copy<std::string>(fmt::format("{}", ss.get_node_state(hostid))));
}
@@ -224,12 +224,12 @@ public:
}
if (_db.find_keyspace(e.name).get_replication_strategy().uses_tablets()) {
co_await _db.get_tables_metadata().for_each_table_gently([&, this] (table_id, lw_shared_ptr<replica::table> table) -> future<> {
co_await _db.get_tables_metadata().for_each_table_gently([&, this] (table_id tid, lw_shared_ptr<replica::table> table) -> future<> {
if (table->schema()->ks_name() != e.name) {
co_return;
}
const auto& table_name = table->schema()->cf_name();
utils::chunked_vector<dht::token_range_endpoints> ranges = co_await _ss.describe_ring_for_table(e.name, table_name);
utils::chunked_vector<dht::token_range_endpoints> ranges = co_await _ss.describe_ring_for_table(tid);
co_await emit_ring(result, e.key, table_name, std::move(ranges));
});
} else {


@@ -31,7 +31,7 @@ was used. Alternator currently supports two compression algorithms, `gzip`
and `deflate`, both standardized in ([RFC 9110](https://www.rfc-editor.org/rfc/rfc9110.html)).
Other standard compression types which are listed in
[IANA's HTTP Content Coding Registry](https://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding),
including `zstd` ([RFC 8878][https://www.rfc-editor.org/rfc/rfc8878.html]),
including `zstd` ([RFC 8878](https://www.rfc-editor.org/rfc/rfc8878.html)),
are not yet supported by Alternator.
Note that HTTP's compression only compresses the request's _body_ - not the


@@ -261,8 +261,51 @@ The following options are supported for vector indexes. All of them are optional
| | * ``true``: Enable rescoring. | |
| | * ``false``: Disable rescoring. | |
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
| ``source_model`` | The name of the embedding model that produced the vectors (e.g., ``"ada002"``). Cassandra client | *(none)* |
| | libraries such as CassIO send this option to tag the index with the model. Cassandra SAI rejects it as | |
| | an unrecognized property; ScyllaDB accepts and preserves it in ``DESCRIBE`` output for compatibility | |
| | with those libraries, but does not act on it. | |
+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
.. _cassandra-sai-compatibility:
Cassandra SAI Compatibility for Vector Search
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ScyllaDB accepts the Cassandra ``StorageAttachedIndex`` (SAI) class name in ``CREATE CUSTOM INDEX``
statements **for vector columns**. Cassandra libraries such as
`CassIO <https://cassio.org/>`_ and `LangChain <https://www.langchain.com/>`_ use SAI to create
vector indexes; ScyllaDB recognizes these statements for compatibility.
When ScyllaDB encounters an SAI class name on a **vector column**, the index is automatically
created as a native ``vector_index``. The following class names are recognized:
* ``org.apache.cassandra.index.sai.StorageAttachedIndex`` (exact case required)
* ``StorageAttachedIndex`` (case-insensitive)
* ``SAI`` (case-insensitive)
Example::
-- Cassandra SAI statement accepted by ScyllaDB:
CREATE CUSTOM INDEX ON my_table (embedding)
USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'
WITH OPTIONS = {'similarity_function': 'COSINE'};
-- Equivalent to:
CREATE CUSTOM INDEX ON my_table (embedding)
USING 'vector_index'
WITH OPTIONS = {'similarity_function': 'COSINE'};
The ``similarity_function`` option is supported by both Cassandra SAI and ScyllaDB.
.. note::
SAI class names are only supported on **vector columns**. Using an SAI class name on a
non-vector column (e.g., ``text`` or ``int``) will result in an error. General SAI
indexing of non-vector columns is not supported by ScyllaDB; use a
:doc:`secondary index </cql/secondary-indexes>` instead.
.. _drop-index-statement:
DROP INDEX


@@ -0,0 +1,155 @@
# Comparing Build Systems: configure.py vs CMake
ScyllaDB has two build systems: the primary `configure.py` + Ninja pipeline
and an alternative CMake build (used mainly for IDE integration — CLion,
clangd, etc.). Both must produce equivalent compilation and link commands.
`scripts/compare_build_systems.py` verifies this by parsing the `build.ninja`
files generated by each system and comparing:
1. **Per-file compilation flags** — defines, warnings, optimization, language
flags for every Scylla source file.
2. **Link target sets** — are the same executables produced by both systems?
3. **Per-target linker settings** — link flags and libraries for every common
executable.
`configure.py` is treated as the baseline. CMake should match it.
## Quick start
```bash
# Compare a single mode
scripts/compare_build_systems.py -m dev
# Compare all modes
scripts/compare_build_systems.py
# Verbose output — show per-file and per-target differences
scripts/compare_build_systems.py -m debug -v
```
The script automatically configures both build systems into a temporary
directory for every run — the user's existing build tree is never touched.
No manual `configure.py` or `cmake` invocation is required.
## Mode mapping
| configure.py | CMake |
|--------------|------------------|
| `debug` | `Debug` |
| `dev` | `Dev` |
| `release` | `RelWithDebInfo` |
| `sanitize` | `Sanitize` |
| `coverage` | `Coverage` |
## Examples
```bash
# Check dev mode only (fast, most common during development)
scripts/compare_build_systems.py -m dev
# Check all modes
scripts/compare_build_systems.py
# CI mode: quiet, strict (exit 1 on any diff)
scripts/compare_build_systems.py --ci
# Verbose output for debugging a specific mode
scripts/compare_build_systems.py -m sanitize -v
# Quiet mode — only prints summary and errors
scripts/compare_build_systems.py -m dev -q
```
## Exit codes
| Code | Meaning |
|------|--------------------------------------------------------------------------|
| `0` | All checked modes match |
| `1` | Differences found |
| `2` | Configuration failure or some modes could not be compared (e.g. skipped) |
## What it ignores
The script intentionally ignores certain structural differences that are
inherent to how the two build systems work:
- **Include paths** (`-I`, `-isystem`) — directory layout differs between
the two systems.
- **LTO/PGO flags** — these are configuration-dependent options, not
mode-inherent.
- **Internal library targets** — CMake creates intermediate static/shared
libraries (e.g., `scylla-main`, `test-lib`, abseil targets) while
`configure.py` links `.o` files directly.
- **Per-component Boost defines** — CMake adds `BOOST_REGEX_DYN_LINK` etc.
per component; `configure.py` uses a single `BOOST_ALL_DYN_LINK`.
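As a sketch of the idea (illustrative only; the real logic lives in `scripts/compare_build_systems.py` and may differ), structural noise such as include paths can be filtered out before comparing flag lists:

```python
# Hypothetical helper: drop include-path flags before comparing the rest.
IGNORED_PREFIXES = ("-I", "-isystem")

def significant_flags(flags):
    """Drop include paths (and their separate path arguments); keep the rest."""
    result, skip_next = [], False
    for flag in flags:
        if skip_next:
            skip_next = False
            continue
        if flag.startswith(IGNORED_PREFIXES):
            # "-isystem /path" passes the path as a separate argument.
            skip_next = flag in IGNORED_PREFIXES
            continue
        result.append(flag)
    return result

a = significant_flags(["-O2", "-I/x/include", "-DFOO", "-isystem", "/y"])
b = significant_flags(["-O2", "-Iother", "-DFOO"])
print(a == b)  # True: only include paths differed
```

With this kind of filtering, two compile commands that differ only in directory layout compare as equal.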
## Typical workflow
After modifying `CMakeLists.txt` or `cmake/mode.*.cmake`:
```bash
# 1. Run the comparison (auto-configures both systems in a temp dir)
scripts/compare_build_systems.py -m dev -v
# 2. Fix any differences, repeat
```
## AI agent workflow
When the script reports mismatches, you can paste its summary output into
an AI coding agent (GitHub Copilot, etc.) and ask it to fix the
discrepancies. The agent has access to both `configure.py` and the
CMake files and can resolve most differences automatically.
### Example interaction
**1. Run the script:**
```bash
scripts/compare_build_systems.py
```
**2. Copy the summary and paste it to the agent:**
> I ran `scripts/compare_build_systems.py` and got:
>
> ```
> Summary
> ══════════════════════════════════════════════════════════════════════
> debug (CMake: Debug ): ✗ MISMATCH
> Compilation: 3 files with flag diffs, 1 sources only in configure.py
> only-configure.py defines: -DSOME_FLAG (3 files)
> Link targets: 1 only in configure.py
> Linker: 2 targets with lib diffs
> lib only in CMake: boost_filesystem (2 targets)
> dev (CMake: Dev ): ✗ MISMATCH
> Compilation: 1 sources only in configure.py
> Link targets: 1 only in configure.py
> release (CMake: RelWithDebInfo ): ✓ MATCH
> sanitize (CMake: Sanitize ): ✓ MATCH
> coverage (CMake: Coverage ): ✓ MATCH
> ```
>
> Please fix all issues and commit according to project guidelines.
**3. The agent will:**
- Identify each discrepancy (missing sources, missing targets, extra
libraries, missing defines).
- Trace root causes — e.g., a test added to `configure.py` but not to
`test/boost/CMakeLists.txt`, or an unnecessary `Boost::filesystem`
link in a CMake target.
- Apply fixes to the appropriate `CMakeLists.txt` files.
- Re-run cmake and the comparison script to verify the fix.
- Commit each fix to the correct commit in the series (using
`git commit --fixup` + `git rebase --autosquash`).
### Tips
- **Paste the full summary block** — the inline diff details (compilation,
link targets, linker) give the agent enough context to act without
scrolling through verbose output.
- **Use `-v` for stubborn issues** — if the agent needs per-file or
per-target detail, re-run with `-v` and paste the relevant section.

docs/dev/counters.md Normal file

@@ -0,0 +1,81 @@
# Counters
Counters are special kinds of cells whose value can only be incremented, decremented, read, and (with some limitations) deleted. In particular, once deleted, a counter cannot be used again. For example:
```cql
> UPDATE cf SET my_counter = my_counter + 6 WHERE pk = 0
> SELECT * FROM cf;
pk | my_counter
----+------------
0 | 6
(1 rows)
> UPDATE cf SET my_counter = my_counter - 1 WHERE pk = 0
> SELECT * FROM cf;
pk | my_counter
----+------------
0 | 5
(1 rows)
> DELETE my_counter FROM cf WHERE pk = 0;
> SELECT * FROM cf;
pk | my_counter
----+------------
(0 rows)
> UPDATE cf SET my_counter = my_counter + 3 WHERE pk = 0
> SELECT * FROM cf;
pk | my_counter
----+------------
(0 rows)
```
## Counters representation
Counters are represented as sets of so-called *shards*, which are triples containing:
* **counter id**: a UUID identifying the writer owning that shard (see below)
* **logical clock**: incremented each time the owning writer modifies the shard value
* **current value**: the sum of increments and decrements done by the owning writer
During each write operation, one of the replicas is chosen as the leader. The leader reads its shard, increments its logical clock, updates the current value, and then sends the new version of its shard to the other replicas.
Shards owned by the same writer are merged (see below) so that each counter cell contains only one shard per counter id. Reading the actual counter value requires summing values of all shards.
### Counter id
The counter id is a 128-bit UUID that identifies which writer owns a shard. How it is assigned depends on whether the table uses vnodes or tablets.
**Vnodes:** the counter id is the host id of the node that owns the shard. Each node in the cluster gets a unique counter id, so the number of shards in a counter cell grows with the number of distinct nodes that have ever written to it.
**Tablets:** the counter id is rack-based rather than node-based. It is a deterministic type-3 (name-based) UUID derived from the string `"<datacenter>:<rack>"`. All nodes in the same rack share the same counter id.
During tablet migration, since there are two active replicas in a rack and in order to avoid conflicts, the node that is a *pending replica* uses the **negated** rack UUID as its counter id.
This bounds the number of shards in a counter cell to at most `2 × (number of racks)` regardless of node replacements.
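The derivation can be sketched in Python. Note that the UUID namespace and the exact encoding of the negated pending-replica id below are illustrative assumptions; the real values are internal to ScyllaDB:

```python
import uuid

def rack_counter_id(dc: str, rack: str) -> uuid.UUID:
    # Deterministic type-3 (name-based) UUID from "<datacenter>:<rack>".
    # NAMESPACE_DNS is a placeholder; the actual namespace is internal.
    return uuid.uuid3(uuid.NAMESPACE_DNS, f"{dc}:{rack}")

def pending_counter_id(rack_id: uuid.UUID) -> uuid.UUID:
    # The pending replica negates the rack UUID. Shown here as a bitwise
    # complement of the 128-bit value; the exact encoding is internal.
    return uuid.UUID(int=rack_id.int ^ ((1 << 128) - 1))

normal = rack_counter_id("dc1", "rack1")
pending = pending_counter_id(normal)
assert normal != pending                      # the two ids never collide
assert pending_counter_id(pending) == normal  # negation is an involution
```

Because the derivation is deterministic, every node in the same rack independently arrives at the same counter id.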
### Merging and reconciliation
Reconciliation of two counters requires merging all shards belonging to the same counter id. The rule is: the shard with the highest logical clock wins.
Since support for deleting counters is limited (once deleted, a counter cannot be used again), during reconciliation tombstones win over live counter cells regardless of their timestamps.
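Merging and reading can be sketched with a simplified in-memory model (plain tuples, not ScyllaDB's actual data structures):

```python
# Simplified model of counter shards: (counter_id, logical_clock, value).
# Merge rule: for shards with the same counter id, the shard with the
# highest logical clock wins. The counter's value is the sum of the
# merged shards' values.

def merge_shards(shards):
    """Merge shards from multiple sources; highest clock wins per id."""
    merged = {}
    for counter_id, clock, value in shards:
        current = merged.get(counter_id)
        if current is None or clock > current[0]:
            merged[counter_id] = (clock, value)
    return [(cid, clk, val) for cid, (clk, val) in merged.items()]

def counter_value(shards):
    """The externally visible counter value is the sum of shard values."""
    return sum(value for _, _, value in merge_shards(shards))

# Two replicas report shards; replica B has a newer shard for id "a".
replica_a = [("a", 1, 6), ("b", 2, -1)]
replica_b = [("a", 2, 5)]
print(counter_value(replica_a + replica_b))  # 5 + (-1) = 4
```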
### Digest
Computing a digest of counter cells must be based solely on the shard contents (counter id, value, logical clock), not on any structural metadata.
## Writes
1. Counter update starts with a client sending counter delta as a long (CQL3 `bigint`) to the coordinator.
2. CQL3 creates a `CounterMutation` containing a `counter_update` cell which is just a delta.
3. Coordinator chooses the leader of the counter update and sends it the mutation. The leader is always one of the replicas owning the partition the modified counter belongs to.
4. Now, the leader needs to transform counter deltas into shards. To do that it reads the current value of the shard it owns, and produces a new shard with the value modified by the delta and the logical clock incremented.
5. The mutation with the newly created shard is both used to update the memtable on the leader as well as sent to the other nodes for replication.
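Step 4 above can be sketched as follows (names and the in-memory model are illustrative, not ScyllaDB's API):

```python
# The leader turns a delta into a shard by reading its own current shard,
# bumping the logical clock, and adding the delta to the value.

def apply_delta(own_shard, leader_id, delta):
    """own_shard is (clock, value), or None if the leader has no shard yet."""
    if own_shard is None:
        clock, value = 0, 0
    else:
        clock, value = own_shard
    return (leader_id, clock + 1, value + delta)

# The leader had (clock=3, value=10); a client sends delta=+6.
print(apply_delta((3, 10), "rack-uuid", 6))  # ('rack-uuid', 4, 16)
```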
### Choosing leader
Choosing the replica that becomes the leader for a counter update is entirely at the coordinator's discretion. Leadership is not a static role in any way, and any concurrent update could be forwarded to a different leader. This means that all problems related to leader election are avoided.
The coordinator chooses the leader using the following algorithm:
1. If the coordinator can be a leader it chooses itself.
2. Otherwise, a random replica from the local DC is chosen.
3. If there is no eligible node available in the local DC the replica closest to the coordinator (according to the snitch) is chosen.
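The selection algorithm above can be sketched like this (function names and the distance callback are illustrative; the real implementation consults the snitch):

```python
import random

def choose_leader(coordinator, replicas, local_dc, dc_of, distance_to):
    """Pick the leader for a counter update (illustrative sketch)."""
    # 1. If the coordinator is itself a replica, it becomes the leader.
    if coordinator in replicas:
        return coordinator
    # 2. Otherwise, pick a random replica from the local DC.
    local = [r for r in replicas if dc_of(r) == local_dc]
    if local:
        return random.choice(local)
    # 3. Otherwise, fall back to the replica closest to the coordinator
    #    (distance_to stands in for the snitch).
    return min(replicas, key=distance_to)

dc_of = {"n1": "dc1", "n2": "dc2", "n3": "dc2"}.get
print(choose_leader("n1", ["n1", "n2"], "dc1", dc_of, lambda r: 0))  # n1
```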
## Reads
Querying counter values is much simpler than updating them. The first part of the read operation is performed as for all other cell types. When counter cells from different sources are being reconciled, their shards are merged. Once the final counter cell value is known and the `CounterCell` is serialised, the current values of all shards are summed up and the output of serialisation is a long integer.


@@ -192,14 +192,10 @@ For example, to configure ScyllaDB to use listen address `10.0.0.5`:
$ docker run --name some-scylla -d scylladb/scylla --listen-address 10.0.0.5
```
**Since: 1.4**
#### `--alternator-address ADDR`
The `--alternator-address` command line option configures the Alternator API listen address. The default value is the same as `--listen-address`.
**Since: 3.2**
#### `--alternator-port PORT`
The `--alternator-port` command line option configures the Alternator API listen port. The Alternator API is disabled by default. You need to specify the port to enable it.
@@ -210,22 +206,16 @@ For example, to configure ScyllaDB to listen to Alternator API at port `8000`:
$ docker run --name some-scylla -d scylladb/scylla --alternator-port 8000
```
**Since: 3.2**
#### `--alternator-https-port PORT`
The `--alternator-https-port` option is similar to `--alternator-port`, except that it enables an encrypted (HTTPS) port. Either `--alternator-https-port` or `--alternator-port`, or both, can be used to enable Alternator.
Note that the `--alternator-https-port` option also requires that files `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key` be inserted into the image. These files contain an SSL certificate and key, respectively.
**Since: 4.2**
#### `--alternator-write-isolation policy`
The `--alternator-write-isolation` command line option chooses between four allowed write isolation policies described in docs/alternator/alternator.md. This option must be specified if Alternator is enabled - it does not have a default.
**Since: 4.1**
#### `--broadcast-address ADDR`
The `--broadcast-address` command line option configures the IP address the ScyllaDB instance tells other ScyllaDB nodes in the cluster to connect to.
@@ -304,8 +294,6 @@ For example, to skip running I/O setup:
$ docker run --name some-scylla -d scylladb/scylla --io-setup 0
```
**Since: 4.3**
#### `--cpuset CPUSET`
The `--cpuset` command line option restricts ScyllaDB to run only on the CPUs specified by `CPUSET`.
@@ -341,26 +329,18 @@ For example, to enable the User Defined Functions (UDF) feature:
$ docker run --name some-scylla -d scylladb/scylla --experimental-feature=udf
```
**Since: 2.0**
#### `--disable-version-check`
The `--disable-version-check` command line option disables the version validation check.
**Since: 2.2**
#### `--authenticator AUTHENTICATOR`
The `--authenticator` command line option specifies the authenticator class ScyllaDB will use. By default, ScyllaDB uses the `AllowAllAuthenticator`, which performs no credential checks. The alternative is `PasswordAuthenticator`, which relies on username/password pairs to authenticate users.
**Since: 2.3**
#### `--authorizer AUTHORIZER`
The `--authorizer` command line option specifies the authorizer class ScyllaDB will use. By default, ScyllaDB uses the `AllowAllAuthorizer`, which allows any action by any user. The alternative is `CassandraAuthorizer`, which stores permissions in the `system.permissions` table.
**Since: 2025.4**
#### `--dc NAME`
The `--dc` command line option sets the datacenter name for the ScyllaDB node.


@@ -0,0 +1,67 @@
# System Keyspaces Overview
This page gives a high-level overview of several internal keyspaces and what they are used for.
## Table of Contents
- [system_replicated_keys](#system_replicated_keys)
- [system_distributed](#system_distributed)
- [system_distributed_everywhere](#system_distributed_everywhere)
- [system_auth](#system_auth)
- [system](#system)
- [system_schema](#system_schema)
- [system_traces](#system_traces)
- [system_audit/audit](#system_auditaudit)
## `system_replicated_keys`
Internal keyspace for encryption-at-rest key material used by the replicated key provider. It stores encrypted data keys so nodes can retrieve the correct key IDs when reading encrypted data.
This keyspace is created as an internal system keyspace and uses `EverywhereStrategy` so key metadata is available on every node. It is not intended for user data.
## `system_distributed`
Internal distributed metadata keyspace used for cluster-wide coordination data that is shared across nodes.
In practice, it is used for metadata such as:
- materialized view build coordination state
- CDC stream/timestamp metadata exposed to clients
- service level definitions used by workload prioritization
This keyspace is managed by Scylla and is not intended for application tables.
It is created as an internal keyspace (historically with `SimpleStrategy` and RF=3 by default).
## `system_distributed_everywhere`
Legacy keyspace. It is no longer used.
## `system_auth`
Legacy auth keyspace name kept primarily for compatibility.
Auth tables have moved to the `system` keyspace (`roles`, `role_members`, `role_permissions`, and related auth state). `system_auth` may still exist for compatibility with legacy tooling/queries, but it is no longer where current auth state is primarily stored.
## `system`
This keyspace is a local one, so each node has its own, independent content for tables in this keyspace. For some tables, the content is coordinated at a higher level (Raft), but not via the traditional replication system (storage proxy).
See the detailed table-level documentation here: [system_keyspace](system_keyspace.md)
## `system_schema`
This keyspace is a local one, so each node has its own, independent content for tables in this keyspace. All tables in this keyspace are coordinated via the schema replication system.
See the detailed table-level documentation here: [system_schema_keyspace](system_schema_keyspace.md)
## `system_traces`
Internal tracing keyspace used for query tracing and slow-query logging records (`sessions`, `events`, and related index/log tables).
This keyspace is written by Scylla's tracing subsystem for diagnostics and observability. It is operational metadata, not user application data (historically created with `SimpleStrategy` and RF=2).
## `system_audit`/`audit`
Internal audit-logging keyspace used to persist audit events when table-backed auditing is enabled.
Scylla's audit table storage is implemented as an internal audit keyspace for audit records (for example, auth/admin/DCL activity, depending on audit configuration). In current code this keyspace is named `audit`, while operational material may refer to it by its historical name (`system_audit`). It is intended for security/compliance observability, not for application data.


@@ -289,7 +289,7 @@ Yes, but it will require running a full repair (or cleanup) to change the replic
- If you're reducing the replication factor, run ``nodetool cleanup <updated Keyspace>`` on the keyspace you modified to remove surplus replicated data.
Cleanup runs on a per-node basis.
- If you're increasing the replication factor, refer to :doc:`How to Safely Increase the RF </kb/rf-increase>`
- Note that you need to provide the keyspace namr. If you do not, the cleanup or repair operation runs on all keyspaces for the specific node.
- Note that you need to provide the keyspace name. If you do not, the cleanup or repair operation runs on all keyspaces for the specific node.
Why can't I set ``listen_address`` to listen to 0.0.0.0 (all my addresses)?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


@@ -181,6 +181,7 @@ internode_compression controls whether traffic between nodes is compressed.
* all - all traffic is compressed.
* dc - traffic between different datacenters is compressed.
* rack - traffic between different racks is compressed.
* none - nothing is compressed (default).
Configuring TLS/SSL in scylla.yaml


@@ -2,8 +2,8 @@
ScyllaDB Auditing Guide
========================
Auditing allows the administrator to monitor activities on a Scylla cluster, including queries and data changes.
The information is stored in a Syslog or a Scylla table.
Auditing allows the administrator to monitor activities on a ScyllaDB cluster, including CQL queries and data changes, as well as Alternator (DynamoDB-compatible API) requests.
The information is stored in a Syslog or a ScyllaDB table.
Prerequisite
------------
@@ -14,15 +14,15 @@ Enable ScyllaDB :doc:`Authentication </operating-scylla/security/authentication>
Enabling Audit
---------------
By default, table auditing is **enabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
By default, auditing is **enabled** with the ``table`` backend. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
You can set the following options:
* ``none`` - Audit is disabled.
* ``table`` - Audit is enabled, and messages are stored in a Scylla table (default).
* ``table`` - Audit is enabled, and messages are stored in a ScyllaDB table (default).
* ``syslog`` - Audit is enabled, and messages are sent to Syslog.
* ``syslog,table`` - Audit is enabled, and messages are stored in a Scylla table and sent to Syslog.
* ``syslog,table`` - Audit is enabled, and messages are stored in a ScyllaDB table and sent to Syslog.
Configuring any other value results in an error at Scylla startup.
Configuring any other value results in an error at ScyllaDB startup.
Configuring Audit
-----------------
@@ -34,7 +34,9 @@ Flag Default Value Description
================== ================================== ========================================================================================================================
audit_categories "DCL,AUTH,ADMIN" Comma-separated list of statement categories that should be audited
------------------ ---------------------------------- ------------------------------------------------------------------------------------------------------------------------
audit_tables “” Comma-separated list of table names that should be audited, in the format of <keyspacename>.<tablename>
audit_tables “” Comma-separated list of table names that should be audited, in the format ``<keyspace_name>.<table_name>``.
For Alternator tables use the ``alternator.<table_name>`` format (see :ref:`alternator-auditing`).
------------------ ---------------------------------- ------------------------------------------------------------------------------------------------------------------------
audit_keyspaces “” Comma-separated list of keyspaces that should be audited. You must specify at least one keyspace.
If you leave this option empty, no keyspace will be audited.
@@ -47,30 +49,137 @@ You can use DCL, AUTH, and ADMIN audit categories without including any keyspace
audit_categories parameter description
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
========= =========================================================================================
Parameter Logs Description
========= =========================================================================================
AUTH Logs login events
--------- -----------------------------------------------------------------------------------------
DML Logs insert, update, delete, and other data manipulation language (DML) events
--------- -----------------------------------------------------------------------------------------
DDL Logs object and role create, alter, drop, and other data definition language (DDL) events
--------- -----------------------------------------------------------------------------------------
DCL Logs grant, revoke, create role, drop role, and list roles events
--------- -----------------------------------------------------------------------------------------
QUERY Logs all queries
--------- -----------------------------------------------------------------------------------------
ADMIN Logs service level operations: create, alter, drop, attach, detach, list.
========= ========================================================================================= ====================
Parameter Logs Description Applies To
========= ========================================================================================= ====================
AUTH Logs login events CQL
--------- ----------------------------------------------------------------------------------------- --------------------
DML Logs insert, update, delete, and other data manipulation language (DML) events CQL, Alternator
--------- ----------------------------------------------------------------------------------------- --------------------
DDL Logs object and role create, alter, drop, and other data definition language (DDL) events CQL, Alternator
--------- ----------------------------------------------------------------------------------------- --------------------
DCL Logs grant, revoke, create role, drop role, and list roles events CQL
--------- ----------------------------------------------------------------------------------------- --------------------
QUERY Logs all queries CQL, Alternator
--------- ----------------------------------------------------------------------------------------- --------------------
ADMIN Logs service level operations: create, alter, drop, attach, detach, list. CQL
For :ref:`service level <workload-priorization-service-level-management>`
auditing.
========= =========================================================================================
========= ========================================================================================= ====================
For details on auditing Alternator operations, see :ref:`alternator-auditing`.
Note that enabling audit may negatively impact performance and audit-to-table may consume extra storage. That's especially true when auditing DML and QUERY categories, which generate a high volume of audit messages.
.. _alternator-auditing:
Auditing Alternator Requests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
When auditing is enabled, Alternator (DynamoDB-compatible API) requests are audited using the same
backends and the same filtering configuration (``audit_categories``, ``audit_keyspaces``,
``audit_tables``) as CQL operations. No additional configuration is needed.
Both successful and failed Alternator requests are audited.
Alternator Operation Categories
""""""""""""""""""""""""""""""""
Each Alternator API operation is assigned to one of the standard audit categories:
========= ====================================================================================================
Category Alternator Operations
========= ====================================================================================================
DDL CreateTable, DeleteTable, UpdateTable, TagResource, UntagResource, UpdateTimeToLive
--------- ----------------------------------------------------------------------------------------------------
DML PutItem, UpdateItem, DeleteItem, BatchWriteItem
--------- ----------------------------------------------------------------------------------------------------
QUERY GetItem, BatchGetItem, Query, Scan, DescribeTable, ListTables, DescribeEndpoints,
ListTagsOfResource, DescribeTimeToLive, DescribeContinuousBackups,
ListStreams, DescribeStream, GetShardIterator, GetRecords
========= ====================================================================================================
.. note:: AUTH, DCL, and ADMIN categories do not apply to Alternator operations. These categories
are specific to CQL authentication, authorization, and service-level management.
Operation Field Format
"""""""""""""""""""""""
For CQL operations, the ``operation`` field in the audit log contains the raw CQL query string.
For Alternator operations, the format is:
.. code-block:: none
<OperationName>|<JSON request body>
For example:
.. code-block:: none
PutItem|{"TableName":"my_table","Item":{"p":{"S":"pk_val"},"c":{"S":"ck_val"},"v":{"S":"data"}}}
.. note:: The full JSON request body is included in the ``operation`` field. For batch operations
(such as BatchWriteItem), this can be very large (up to 16 MB).
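Consumers of the audit log can split this field on the first ``|`` to separate the operation name from its JSON body. For example (an illustrative sketch, not a supported tool):

```python
import json

def parse_alternator_operation(field: str):
    """Split '<OperationName>|<JSON request body>' on the first '|'."""
    name, _, body = field.partition("|")
    return name, json.loads(body)

field = 'PutItem|{"TableName":"my_table","Item":{"p":{"S":"pk_val"}}}'
name, body = parse_alternator_operation(field)
print(name, body["TableName"])  # PutItem my_table
```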
Keyspace and Table Filtering for Alternator
""""""""""""""""""""""""""""""""""""""""""""
The real keyspace name of an Alternator table ``T`` is ``alternator_T``.
The ``audit_tables`` config flag uses the shorthand format ``alternator.T`` to refer to such
tables -- the parser expands it to the real keyspace name automatically.
For ``audit_keyspaces``, use the real keyspace name directly.
For example, to audit an Alternator table called ``my_table_name`` use either of the below:
.. code-block:: yaml
# Using audit_tables - use 'alternator' as the keyspace name:
audit_tables: "alternator.my_table_name"
# Using audit_keyspaces - use the real keyspace name:
audit_keyspaces: "alternator_my_table_name"
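The shorthand expansion described above can be sketched as follows (an illustrative model of the parsing rule, not ScyllaDB's actual parser):

```python
def expand_audit_table(entry: str):
    """Split an audit_tables entry into (keyspace, table), expanding the
    'alternator.<table>' shorthand to the real keyspace 'alternator_<table>'."""
    keyspace, _, table = entry.partition(".")
    if keyspace == "alternator":
        keyspace = f"alternator_{table}"
    return keyspace, table

print(expand_audit_table("alternator.my_table_name"))
# ('alternator_my_table_name', 'my_table_name')
```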
**Global and batch operations**: Some Alternator operations are not scoped to a single table:
* ``ListTables`` and ``DescribeEndpoints`` have no associated keyspace or table.
* ``BatchWriteItem`` and ``BatchGetItem`` may span multiple tables.
These operations are logged whenever their category matches ``audit_categories``, regardless of
``audit_keyspaces`` or ``audit_tables`` filters. Their ``keyspace_name`` field is empty, and for
batch operations the ``table_name`` field contains a pipe-separated (``|``) list of all involved table names.
**DynamoDB Streams operations**: For streams-related operations (``DescribeStream``, ``GetShardIterator``,
``GetRecords``), the ``table_name`` field contains the base table name and the CDC log table name
separated by a pipe (e.g., ``my_table|my_table_scylla_cdc_log``).
Alternator Audit Log Examples
""""""""""""""""""""""""""""""
Syslog output example (PutItem):
.. code-block:: shell
Mar 18 10:15:03 ip-10-143-2-108 scylla-audit[28387]: node="10.143.2.108", category="DML", cl="LOCAL_QUORUM", error="false", keyspace="alternator_my_table", query="PutItem|{\"TableName\":\"my_table\",\"Item\":{\"p\":{\"S\":\"pk_val\"}}}", client_ip="127.0.0.1", table="my_table", username="anonymous"
Table output example (PutItem):
.. code-block:: shell
SELECT * FROM audit.audit_log ;
returns:
.. code-block:: none
date | node | event_time | category | consistency | error | keyspace_name | operation | source | table_name | username |
-------------------------+--------------+--------------------------------------+----------+--------------+-------+-----------------------+----------------------------------------------------------------------------------+-----------+------------+-----------+
2026-03-18 00:00:00+0000 | 10.143.2.108 | 3429b1a5-2a94-11e8-8f4e-000000000001 | DML | LOCAL_QUORUM | False | alternator_my_table | PutItem|{"TableName":"my_table","Item":{"p":{"S":"pk_val"}}} | 127.0.0.1 | my_table | anonymous |
(1 row)
Configuring Audit Storage
---------------------------
Auditing messages can be sent to :ref:`Syslog <auditing-syslog-storage>` or stored in a Scylla :ref:`table <auditing-table-storage>` or both.
Auditing messages can be sent to :ref:`Syslog <auditing-syslog-storage>` or stored in a ScyllaDB :ref:`table <auditing-table-storage>` or both.
.. _auditing-syslog-storage:
@@ -99,13 +208,13 @@ Storing Audit Messages in Syslog
# All tables in those keyspaces will be audited
audit_keyspaces: "mykeyspace"
#. Restart the Scylla node.
#. Restart the ScyllaDB node.
.. include:: /rst_include/scylla-commands-restart-index.rst
By default, audit messages are written to the same destination as Scylla :doc:`logging </getting-started/logging>`, with ``scylla-audit`` as the process name.
By default, audit messages are written to the same destination as ScyllaDB :doc:`logging </getting-started/logging>`, with ``scylla-audit`` as the process name.
Logging output example (drop table):
Logging output example (CQL drop table):
.. code-block:: shell
@@ -123,7 +232,7 @@ To redirect the Syslog output to a file, follow the steps below (available only
Storing Audit Messages in a Table
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Messages are stored in a Scylla table named ``audit.audit_log``.
Messages are stored in a ScyllaDB table named ``audit.audit_log``.
For example:
@@ -170,11 +279,11 @@ For example:
# All tables in those keyspaces will be audited
audit_keyspaces: "mykeyspace"
#. Restart Scylla node.
#. Restart the ScyllaDB node.
.. include:: /rst_include/scylla-commands-restart-index.rst
Table output example (drop table):
Table output example (CQL drop table):
.. code-block:: shell
@@ -196,7 +305,7 @@ Storing Audit Messages in a Table and Syslog Simultaneously
**Procedure**
#. Follow both procedures from above, and set the ``audit`` parameter in the ``scylla.yaml`` file to both ``syslog`` and ``table``. You need to restart scylla only once.
#. Follow both procedures from above, and set the ``audit`` parameter in the ``scylla.yaml`` file to both ``syslog`` and ``table``. You need to restart ScyllaDB only once.
To have both syslog and table you need to specify both backends separated by a comma:


@@ -227,13 +227,19 @@ Security
Indexing and Caching
^^^^^^^^^^^^^^^^^^^^^
+--------------------------------------------------------------+--------------------------------------------------------------------------------------+
| Options | Support |
+==============================================================+======================================================================================+
|:doc:`Secondary Index </features/secondary-indexes>` | |v| |
+--------------------------------------------------------------+--------------------------------------------------------------------------------------+
|:doc:`Materialized Views </features/materialized-views>` | |v| |
+--------------------------------------------------------------+--------------------------------------------------------------------------------------+
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
| Options | Support |
+================================================================+======================================================================================+
|:doc:`Secondary Index </features/secondary-indexes>` | |v| |
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
|StorageAttachedIndex (SAI) | |x| |
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
|:ref:`SAI for vector search <cassandra-sai-compatibility>` | |v| :sup:`*` |
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
|:doc:`Materialized Views </features/materialized-views>` | |v| |
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
:sup:`*` SAI class name on vector columns is rewritten to native ``vector_index``
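For instance, a Cassandra-style SAI index on a vector column is accepted and stored as a native ``vector_index`` (the keyspace, table, and column names below are illustrative):

.. code-block:: cql

   CREATE CUSTOM INDEX ann_idx ON ks.items (embedding)
   USING 'StorageAttachedIndex';
   -- ScyllaDB rewrites the class name before persisting the index
   -- metadata, so this is equivalent to:
   -- CREATE CUSTOM INDEX ann_idx ON ks.items (embedding) USING 'vector_index';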
Additional Features
@@ -437,7 +437,6 @@ void ldap_connection::poll_results() {
const auto found = _msgid_to_promise.find(id);
if (found == _msgid_to_promise.end()) {
mylog.error("poll_results: got valid result for unregistered id {}, dropping it", id);
ldap_msgfree(result);
} else {
found->second.set_value(std::move(result_ptr));
_msgid_to_promise.erase(found);
@@ -41,7 +41,7 @@ public:
_ip == other._ip;
}
endpoint_state(inet_address ip) noexcept
explicit endpoint_state(inet_address ip) noexcept
: _heart_beat_state()
, _update_timestamp(clk::now())
, _ip(ip)
@@ -59,7 +59,6 @@ using clk = gossiper::clk;
static logging::logger logger("gossip");
constexpr std::chrono::milliseconds gossiper::INTERVAL;
constexpr std::chrono::hours gossiper::A_VERY_LONG_TIME;
constexpr generation_type::value_type gossiper::MAX_GENERATION_DIFFERENCE;
const sstring& gossiper::get_cluster_name() const noexcept {
@@ -648,7 +647,7 @@ future<> gossiper::do_apply_state_locally(locator::host_id node, endpoint_state
}
// Re-take after apply_new_states
es = get_endpoint_state_ptr(node);
if (!is_alive(es->get_host_id()) && !is_dead_state(*es) && !shadow_round) { // unless of course, it was dead
if (!is_alive(es->get_host_id()) && !is_left(*es) && !shadow_round) { // unless of course, it was dead
mark_alive(es);
}
} else {
@@ -767,7 +766,7 @@ future<> gossiper::remove_endpoint(locator::host_id endpoint, permit_id pid) {
if (was_alive) {
try {
logger.info("InetAddress {}/{} is now DOWN, status = {}", state->get_host_id(), ip, get_gossip_status(*state));
logger.info("InetAddress {}/{} is now DOWN, status = {}", host_id, ip, get_node_status(host_id));
co_await do_on_dead_notifications(ip, std::move(state), pid);
} catch (...) {
logger.warn("Fail to call on_dead callback: {}", std::current_exception());
@@ -1174,10 +1173,10 @@ future<> gossiper::unregister_(shared_ptr<i_endpoint_state_change_subscriber> su
std::set<locator::host_id> gossiper::get_live_members() const {
std::set<locator::host_id> live_members(_live_endpoints.begin(), _live_endpoints.end());
auto myip = get_broadcast_address();
auto myid = my_host_id();
logger.debug("live_members before={}", live_members);
if (!is_shutdown(myip)) {
live_members.insert(my_host_id());
if (!is_shutdown(myid)) {
live_members.insert(myid);
}
logger.debug("live_members after={}", live_members);
return live_members;
@@ -1248,7 +1247,6 @@ future<> gossiper::evict_from_membership(locator::host_id hid, permit_id pid) {
}
g._endpoint_state_map.erase(hid);
});
_expire_time_endpoint_map.erase(hid);
logger.debug("evicting {} from gossip", hid);
}
@@ -1321,21 +1319,6 @@ future<> gossiper::replicate(endpoint_state es, permit_id pid) {
}
}
future<> gossiper::advertise_token_removed(locator::host_id host_id, permit_id pid) {
auto permit = co_await lock_endpoint(host_id, pid);
pid = permit.id();
auto eps = get_endpoint_state(host_id);
eps.update_timestamp(); // make sure we don't evict it too soon
eps.get_heart_beat_state().force_newer_generation_unsafe();
auto expire_time = compute_expire_time();
eps.add_application_state(application_state::STATUS, versioned_value::removed_nonlocal(host_id, expire_time.time_since_epoch().count()));
logger.info("Completing removal of {}", host_id);
add_expire_time_for_endpoint(host_id, expire_time);
co_await replicate(std::move(eps), pid);
// ensure at least one gossip round occurs before returning
co_await sleep_abortable(INTERVAL * 2, _abort_source);
}
future<> gossiper::assassinate_endpoint(sstring address) {
throw std::runtime_error("Assassinating endpoint is not supported in topology over raft mode");
}
@@ -1368,13 +1351,10 @@ future<> gossiper::do_gossip_to_unreachable_member(gossip_digest_syn message) {
std::uniform_real_distribution<double> dist(0, 1);
double rand_dbl = dist(_random_engine);
if (rand_dbl < prob) {
std::set<locator::host_id> addrs;
for (auto&& x : _unreachable_endpoints) {
// Ignore the node which is decommissioned
if (get_gossip_status(_address_map.get(x.first)) != sstring(versioned_value::STATUS_LEFT)) {
addrs.insert(x.first);
}
}
auto addrs = _unreachable_endpoints | std::ranges::views::keys | std::views::filter([this] (auto ep) {
// Ignore the node which is no longer part of the cluster
return !_topo_sm._topology.left_nodes.contains(raft::server_id(ep.uuid()));
}) | std::ranges::to<std::set>();
logger.trace("do_gossip_to_unreachable_member: live_endpoint nr={} unreachable_endpoints nr={}",
live_endpoint_count, unreachable_endpoint_count);
return send_gossip(message, addrs);
@@ -1383,17 +1363,6 @@ future<> gossiper::do_gossip_to_unreachable_member(gossip_digest_syn message) {
return make_ready_future<>();
}
clk::time_point gossiper::get_expire_time_for_endpoint(locator::host_id id) const noexcept {
/* default expire_time is A_VERY_LONG_TIME */
auto it = _expire_time_endpoint_map.find(id);
if (it == _expire_time_endpoint_map.end()) {
return compute_expire_time();
} else {
auto stored_time = it->second;
return stored_time;
}
}
endpoint_state_ptr gossiper::get_endpoint_state_ptr(locator::host_id ep) const noexcept {
auto it = _endpoint_state_map.find(ep);
if (it == _endpoint_state_map.end()) {
@@ -1420,7 +1389,7 @@ endpoint_state& gossiper::my_endpoint_state() {
auto ep = get_broadcast_address();
auto it = _endpoint_state_map.find(id);
if (it == _endpoint_state_map.end()) {
it = _endpoint_state_map.emplace(id, make_endpoint_state_ptr({ep})).first;
it = _endpoint_state_map.emplace(id, make_endpoint_state_ptr(endpoint_state{ep})).first;
}
return const_cast<endpoint_state&>(*it->second);
}
@@ -1634,9 +1603,8 @@ future<> gossiper::real_mark_alive(locator::host_id host_id) {
}
// Do not mark a node with status shutdown as UP.
auto status = sstring(get_gossip_status(*es));
if (status == sstring(versioned_value::SHUTDOWN)) {
logger.warn("Skip marking node {} with status = {} as UP", host_id, status);
if (is_shutdown(*es)) {
logger.warn("Skip marking node {} with status = shutdown as UP", host_id);
co_return;
}
@@ -1649,7 +1617,6 @@ future<> gossiper::real_mark_alive(locator::host_id host_id) {
auto [it_, inserted] = data.live.insert(addr);
was_live = !inserted;
});
_expire_time_endpoint_map.erase(host_id);
if (was_live) {
co_return;
}
@@ -1662,7 +1629,7 @@ future<> gossiper::real_mark_alive(locator::host_id host_id) {
auto addr = es->get_ip();
logger.info("InetAddress {}/{} is now UP, status = {}", host_id, addr, status);
logger.info("InetAddress {}/{} is now UP, status = {}", host_id, addr, get_node_status(host_id));
co_await _subscribers.for_each([addr, host_id, es, pid = permit.id()] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) -> future<> {
co_await subscriber->on_alive(addr, host_id, es, pid);
@@ -1678,7 +1645,7 @@ future<> gossiper::mark_dead(locator::host_id addr, endpoint_state_ptr state, pe
data.live.erase(addr);
data.unreachable[addr] = now();
});
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(*state));
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_node_status(addr));
co_await do_on_dead_notifications(state->get_ip(), std::move(state), pid);
}
@@ -1688,14 +1655,14 @@ future<> gossiper::handle_major_state_change(endpoint_state eps, permit_id pid,
endpoint_state_ptr eps_old = get_endpoint_state_ptr(ep);
if (!is_dead_state(eps) && !shadow_round) {
if (!is_left(eps) && !shadow_round) {
if (_endpoint_state_map.contains(ep)) {
logger.info("Node {} has restarted, now UP, status = {}", ep, get_gossip_status(eps));
logger.info("Node {} has restarted, now UP, status = {}", ep, get_node_status(ep));
} else {
logger.debug("Node {} is now part of the cluster, status = {}", ep, get_gossip_status(eps));
logger.debug("Node {} is now part of the cluster, status = {}", ep, get_node_status(ep));
}
}
logger.trace("Adding endpoint state for {}, status = {}", ep, get_gossip_status(eps));
logger.trace("Adding endpoint state for {}, status = {}", ep, get_node_status(ep));
co_await replicate(eps, pid);
if (shadow_round) {
@@ -1713,10 +1680,10 @@ future<> gossiper::handle_major_state_change(endpoint_state eps, permit_id pid,
if (!ep_state) {
throw std::out_of_range(format("ep={}", ep));
}
if (!is_dead_state(*ep_state)) {
if (!is_left(*ep_state)) {
mark_alive(ep_state);
} else {
logger.debug("Not marking {} alive due to dead state {}", ep, get_gossip_status(eps));
logger.debug("Not marking {} alive due to dead state {}", ep, get_node_status(ep));
co_await mark_dead(ep, ep_state, pid);
}
@@ -1730,8 +1697,8 @@ future<> gossiper::handle_major_state_change(endpoint_state eps, permit_id pid,
}
}
bool gossiper::is_dead_state(const endpoint_state& eps) const {
return std::ranges::any_of(DEAD_STATES, [state = get_gossip_status(eps)](const auto& deadstate) { return state == deadstate; });
bool gossiper::is_left(const endpoint_state& eps) const {
return _topo_sm._topology.left_nodes.contains(raft::server_id(eps.get_host_id().uuid()));
}
bool gossiper::is_shutdown(const locator::host_id& endpoint) const {
@@ -1746,10 +1713,6 @@ bool gossiper::is_normal(const locator::host_id& endpoint) const {
return get_gossip_status(endpoint) == versioned_value::STATUS_NORMAL;
}
bool gossiper::is_silent_shutdown_state(const endpoint_state& ep_state) const{
return std::ranges::any_of(SILENT_SHUTDOWN_STATES, [state = get_gossip_status(ep_state)](const auto& deadstate) { return state == deadstate; });
}
future<> gossiper::apply_new_states(endpoint_state local_state, const endpoint_state& remote_state, permit_id pid, bool shadow_round) {
// don't SCYLLA_ASSERT here, since if the node restarts the version will go back to zero
//int oldVersion = local_state.get_heart_beat_state().get_heart_beat_version();
@@ -2173,16 +2136,14 @@ future<> gossiper::do_stop_gossiping() {
logger.info("gossip is already stopped");
co_return;
}
auto my_ep_state = get_this_endpoint_state_ptr();
if (my_ep_state) {
logger.info("My status = {}", get_gossip_status(*my_ep_state));
}
if (my_ep_state && !is_silent_shutdown_state(*my_ep_state)) {
if (my_ep_state && _topo_sm._topology.normal_nodes.contains(raft::server_id(my_host_id().uuid()))) {
auto local_generation = my_ep_state->get_heart_beat_state().get_generation();
logger.info("Announcing shutdown");
co_await add_local_application_state(application_state::STATUS, versioned_value::shutdown(true));
auto live_endpoints = _live_endpoints;
for (locator::host_id id : live_endpoints) {
co_await coroutine::parallel_for_each(live_endpoints, [this, &local_generation] (locator::host_id id) -> future<> {
logger.info("Sending a GossipShutdown to {} with generation {}", id, local_generation);
try {
co_await ser::gossip_rpc_verbs::send_gossip_shutdown(&_messaging, id, get_broadcast_address(), local_generation.value());
@@ -2190,7 +2151,7 @@ future<> gossiper::do_stop_gossiping() {
} catch (...) {
logger.warn("Fail to send GossipShutdown to {}: {}", id, std::current_exception());
}
}
});
co_await sleep(std::chrono::milliseconds(_gcfg.shutdown_announce_ms));
} else {
logger.warn("No local state or state is in silent shutdown, not announcing shutdown");
@@ -2241,19 +2202,6 @@ bool gossiper::is_enabled() const {
return _enabled && !_abort_source.abort_requested();
}
void gossiper::add_expire_time_for_endpoint(locator::host_id endpoint, clk::time_point expire_time) {
auto now_ = now();
auto diff = std::chrono::duration_cast<std::chrono::seconds>(expire_time - now_).count();
logger.info("Node {} will be removed from gossip at [{:%Y-%m-%d %T %z}]: (expire = {}, now = {}, diff = {} seconds)",
endpoint, fmt::gmtime(clk::to_time_t(expire_time)), expire_time.time_since_epoch().count(),
now_.time_since_epoch().count(), diff);
_expire_time_endpoint_map[endpoint] = expire_time;
}
clk::time_point gossiper::compute_expire_time() {
return now() + A_VERY_LONG_TIME;
}
bool gossiper::is_alive(locator::host_id id) const {
if (id == my_host_id()) {
return true;
@@ -2373,91 +2321,22 @@ std::string_view gossiper::get_gossip_status(const locator::host_id& endpoint) c
return do_get_gossip_status(get_application_state_ptr(endpoint, application_state::STATUS));
}
bool gossiper::is_safe_for_bootstrap(inet_address endpoint) const {
// We allow to bootstrap a new node in only two cases:
// 1) The node is a completely new node and no state in gossip at all
// 2) The node has state in gossip and it is already removed from the
// cluster either by nodetool decommission or nodetool removenode
bool allowed = true;
auto host_id = try_get_host_id(endpoint);
if (!host_id) {
logger.debug("is_safe_for_bootstrap: node={}, status=no state in gossip, allowed_to_bootstrap={}", endpoint, allowed);
return allowed;
std::string gossiper::get_node_status(const locator::host_id& endpoint) const noexcept {
if (this_shard_id() != 0) {
on_internal_error(logger, "get_node_status should only be called on shard 0");
}
auto eps = get_endpoint_state_ptr(*host_id);
if (!eps) {
logger.debug("is_safe_for_bootstrap: node={}, status=no state in gossip, allowed_to_bootstrap={}", endpoint, allowed);
return allowed;
if (is_shutdown(endpoint)) {
return "shutdown";
}
auto status = get_gossip_status(*eps);
std::unordered_set<std::string_view> allowed_statuses{
versioned_value::STATUS_LEFT,
versioned_value::REMOVED_TOKEN,
};
allowed = allowed_statuses.contains(status);
logger.debug("is_safe_for_bootstrap: node={}, status={}, allowed_to_bootstrap={}", endpoint, status, allowed);
return allowed;
}
std::set<sstring> gossiper::get_supported_features(locator::host_id endpoint) const {
auto app_state = get_application_state_ptr(endpoint, application_state::SUPPORTED_FEATURES);
if (!app_state) {
return {};
}
return feature_service::to_feature_set(app_state->value());
}
std::set<sstring> gossiper::get_supported_features(const std::unordered_map<locator::host_id, sstring>& loaded_peer_features, ignore_features_of_local_node ignore_local_node) const {
std::unordered_map<locator::host_id, std::set<sstring>> features_map;
std::set<sstring> common_features;
for (auto& x : loaded_peer_features) {
auto features = feature_service::to_feature_set(x.second);
if (features.empty()) {
logger.warn("Loaded empty features for peer node {}", x.first);
} else {
features_map.emplace(x.first, std::move(features));
auto n = _topo_sm._topology.find(raft::server_id{endpoint.uuid()});
if (!n) {
if (_topo_sm._topology.left_nodes.contains(raft::server_id{endpoint.uuid()})) {
return "left";
}
return "unknown";
} else {
return fmt::format("{}", n->second.state);
}
for (auto& x : _endpoint_state_map) {
auto host_id = x.second->get_host_id();
auto features = get_supported_features(host_id);
if (ignore_local_node && host_id == my_host_id()) {
logger.debug("Ignore SUPPORTED_FEATURES of local node: features={}", features);
continue;
}
if (features.empty()) {
auto it = loaded_peer_features.find(host_id);
if (it != loaded_peer_features.end()) {
logger.info("Node {} does not contain SUPPORTED_FEATURES in gossip, using features saved in system table, features={}", host_id, feature_service::to_feature_set(it->second));
} else {
logger.warn("Node {} does not contain SUPPORTED_FEATURES in gossip or system table", host_id);
}
} else {
// Replace the features with live info
features_map[host_id] = std::move(features);
}
}
if (ignore_local_node) {
features_map.erase(my_host_id());
}
if (!features_map.empty()) {
common_features = features_map.begin()->second;
}
for (auto& x : features_map) {
auto& features = x.second;
std::set<sstring> result;
std::set_intersection(features.begin(), features.end(),
common_features.begin(), common_features.end(),
std::inserter(result, result.end()));
common_features = std::move(result);
}
common_features.erase("");
return common_features;
}
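The helper removed above computes the cluster-wide feature set as the intersection of the feature sets advertised by each node. A minimal Python sketch of that reduction (the node names and feature strings are made up for illustration):

```python
# Sketch: the cluster-wide supported features are the intersection of the
# feature sets advertised by individual nodes. Nodes with an empty set are
# skipped (they never enter the map), and the empty-string feature is
# dropped at the end, mirroring the C++ code's common_features.erase("").
def common_supported_features(features_map):
    sets = [s for s in features_map.values() if s]
    if not sets:
        return set()
    common = set(sets[0])
    for s in sets[1:]:
        common &= s
    common.discard("")
    return common

nodes = {
    "n1": {"CDC", "TABLETS", ""},
    "n2": {"CDC", "TABLETS"},
    "n3": {"CDC"},
}
```

Only features every node supports survive the intersection, which is why a single lagging node pins the whole cluster's feature set.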
void gossiper::check_snitch_name_matches(sstring local_snitch_name) const {
@@ -91,7 +91,6 @@ struct loaded_endpoint_state {
class gossiper : public seastar::async_sharded_service<gossiper>, public seastar::peering_sharded_service<gossiper> {
public:
using clk = seastar::lowres_system_clock;
using ignore_features_of_local_node = bool_class<class ignore_features_of_local_node_tag>;
using generation_for_nodes = std::unordered_map<locator::host_id, generation_type>;
private:
using messaging_verb = netw::messaging_verb;
@@ -198,18 +197,7 @@ private:
endpoint_locks_map _endpoint_locks;
public:
static constexpr std::array DEAD_STATES{
versioned_value::REMOVED_TOKEN,
versioned_value::STATUS_LEFT,
};
static constexpr std::array SILENT_SHUTDOWN_STATES{
versioned_value::REMOVED_TOKEN,
versioned_value::STATUS_LEFT,
versioned_value::STATUS_BOOTSTRAPPING,
versioned_value::STATUS_UNKNOWN,
};
static constexpr std::chrono::milliseconds INTERVAL{1000};
static constexpr std::chrono::hours A_VERY_LONG_TIME{24 * 3};
// Maximum difference between remote generation value and generation
// value this node would get if this node were restarted that we are
@@ -241,7 +229,6 @@ private:
/* initial seeds for joining the cluster */
std::set<inet_address> _seeds;
std::map<locator::host_id, clk::time_point> _expire_time_endpoint_map;
bool _in_shadow_round = false;
@@ -341,13 +328,6 @@ private:
utils::chunked_vector<gossip_digest> make_random_gossip_digest() const;
public:
/**
* Handles switching the endpoint's state from REMOVING_TOKEN to REMOVED_TOKEN
*
* @param endpoint
* @param host_id
*/
future<> advertise_token_removed(locator::host_id host_id, permit_id);
/**
* Do not call this method unless you know what you are doing.
@@ -363,7 +343,6 @@ public:
future<generation_type> get_current_generation_number(locator::host_id endpoint) const;
future<version_type> get_current_heart_beat_version(locator::host_id endpoint) const;
bool is_safe_for_bootstrap(inet_address endpoint) const;
private:
/**
* Returns true if the chosen target was also a seed. False otherwise
@@ -383,7 +362,6 @@ private:
future<> do_gossip_to_unreachable_member(gossip_digest_syn message);
public:
clk::time_point get_expire_time_for_endpoint(locator::host_id endpoint) const noexcept;
// Gets a shared pointer to the endpoint_state, if exists.
// Otherwise, returns a null ptr.
@@ -467,7 +445,7 @@ private:
public:
bool is_alive(locator::host_id id) const;
bool is_dead_state(const endpoint_state& eps) const;
bool is_left(const endpoint_state& eps) const;
// Wait for nodes to be alive on all shards
future<> wait_alive(std::vector<gms::inet_address> nodes, std::chrono::milliseconds timeout);
future<> wait_alive(std::vector<locator::host_id> nodes, std::chrono::milliseconds timeout);
@@ -588,17 +566,12 @@ public:
public:
bool is_enabled() const;
public:
void add_expire_time_for_endpoint(locator::host_id endpoint, clk::time_point expire_time);
static clk::time_point compute_expire_time();
public:
bool is_seed(const inet_address& endpoint) const;
bool is_shutdown(const locator::host_id& endpoint) const;
bool is_shutdown(const endpoint_state& eps) const;
bool is_normal(const locator::host_id& endpoint) const;
bool is_cql_ready(const locator::host_id& endpoint) const;
bool is_silent_shutdown_state(const endpoint_state& ep_state) const;
void force_newer_generation();
public:
std::string_view get_gossip_status(const endpoint_state& ep_state) const noexcept;
@@ -615,12 +588,8 @@ private:
gossip_address_map& _address_map;
gossip_config _gcfg;
condition_variable _failure_detector_loop_cv;
// Get features supported by a particular node
std::set<sstring> get_supported_features(locator::host_id endpoint) const;
locator::token_metadata_ptr get_token_metadata_ptr() const noexcept;
public:
// Get features supported by all the nodes this node knows about
std::set<sstring> get_supported_features(const std::unordered_map<locator::host_id, sstring>& loaded_peer_features, ignore_features_of_local_node ignore_local_node) const;
std::string get_node_status(const locator::host_id& endpoint) const noexcept;
private:
seastar::metrics::metric_groups _metrics;
public:
@@ -10,10 +10,6 @@
#include "gms/versioned_value.hh"
#include "message/messaging_service.hh"
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <charconv>
namespace gms {
static_assert(std::is_nothrow_default_constructible_v<versioned_value>);
@@ -23,11 +19,6 @@ versioned_value versioned_value::network_version() {
return versioned_value(format("{}", netw::messaging_service::current_version));
}
sstring versioned_value::make_full_token_string(const std::unordered_set<dht::token>& tokens) {
return fmt::to_string(fmt::join(tokens | std::views::transform([] (const dht::token& t) {
return t.to_sstring(); }), ";"));
}
sstring versioned_value::make_token_string(const std::unordered_set<dht::token>& tokens) {
if (tokens.empty()) {
return "";
@@ -35,16 +26,4 @@ sstring versioned_value::make_token_string(const std::unordered_set<dht::token>&
return tokens.begin()->to_sstring();
}
std::unordered_set<dht::token> versioned_value::tokens_from_string(const sstring& s) {
if (s.size() == 0) {
return {}; // boost::split produces one element for empty string
}
std::vector<sstring> tokens;
boost::split(tokens, s, boost::is_any_of(";"));
std::unordered_set<dht::token> ret;
for (auto str : tokens) {
ret.emplace(dht::token::from_sstring(str));
}
return ret;
}
}
} // namespace gms
@@ -18,7 +18,6 @@
#include "schema/schema_fwd.hh"
#include "service/state_id.hh"
#include "version.hh"
#include "cdc/generation_id.hh"
#include <set>
#include <unordered_set>
@@ -45,11 +44,7 @@ public:
// values for ApplicationState.STATUS
static constexpr std::string_view STATUS_UNKNOWN{"UNKNOWN"};
static constexpr std::string_view STATUS_BOOTSTRAPPING{"BOOT"};
static constexpr std::string_view STATUS_NORMAL{"NORMAL"};
static constexpr std::string_view STATUS_LEFT{"LEFT"};
static constexpr std::string_view REMOVED_TOKEN{"removed"};
static constexpr std::string_view SHUTDOWN{"shutdown"};
@@ -80,26 +75,18 @@ public:
: _version(-1) {
}
static sstring version_string(const std::initializer_list<sstring>& args) {
return fmt::to_string(fmt::join(args, versioned_value::DELIMITER));
}
static sstring make_full_token_string(const std::unordered_set<dht::token>& tokens);
static sstring make_token_string(const std::unordered_set<dht::token>& tokens);
static sstring make_cdc_generation_id_string(std::optional<cdc::generation_id>);
// Reverse of `make_full_token_string`.
static std::unordered_set<dht::token> tokens_from_string(const sstring&);
static versioned_value clone_with_higher_version(const versioned_value& value) noexcept {
return versioned_value(value.value());
}
static versioned_value bootstrapping(const std::unordered_set<dht::token>& tokens) {
return versioned_value(version_string({sstring(versioned_value::STATUS_BOOTSTRAPPING),
make_token_string(tokens)}));
private:
static sstring version_string(const std::initializer_list<sstring>& args) {
return fmt::to_string(fmt::join(args, versioned_value::DELIMITER));
}
static sstring make_token_string(const std::unordered_set<dht::token>& tokens);
public:
static versioned_value normal(const std::unordered_set<dht::token>& tokens) {
return versioned_value(version_string({sstring(versioned_value::STATUS_NORMAL),
make_token_string(tokens)}));
@@ -113,24 +100,10 @@ public:
return versioned_value(new_version.to_sstring());
}
static versioned_value left(const std::unordered_set<dht::token>& tokens, int64_t expire_time) {
return versioned_value(version_string({sstring(versioned_value::STATUS_LEFT),
make_token_string(tokens),
std::to_string(expire_time)}));
}
static versioned_value host_id(const locator::host_id& host_id) {
return versioned_value(host_id.to_sstring());
}
static versioned_value tokens(const std::unordered_set<dht::token>& tokens) {
return versioned_value(make_full_token_string(tokens));
}
static versioned_value removed_nonlocal(const locator::host_id& host_id, int64_t expire_time) {
return versioned_value(sstring(REMOVED_TOKEN) + sstring(DELIMITER) + host_id.to_sstring() + sstring(DELIMITER) + to_sstring(expire_time));
}
static versioned_value shutdown(bool value) {
return versioned_value(sstring(SHUTDOWN) + sstring(DELIMITER) + (value ? "true" : "false"));
}
@@ -169,10 +142,6 @@ public:
return versioned_value(private_ip);
}
static versioned_value severity(double value) {
return versioned_value(to_sstring(value));
}
static versioned_value supported_features(const std::set<std::string_view>& features) {
return versioned_value(fmt::to_string(fmt::join(features, ",")));
}
@@ -202,6 +202,10 @@ std::optional<sstring> secondary_index_manager::custom_index_class(const schema&
// This function returns a factory, as the custom index class should be lightweight, preferably not holding any state.
// We prefer this over a static custom index class instance, as it allows us to avoid any issues with thread safety.
//
// Note: SAI class names (StorageAttachedIndex, sai) are not listed here
// because maybe_rewrite_sai_to_vector_index() in create_index_statement.cc
// rewrites them to "vector_index" before the index metadata is persisted.
std::optional<std::function<std::unique_ptr<custom_index>()>> secondary_index_manager::get_custom_class_factory(const sstring& class_name) {
sstring lower_class_name = class_name;
std::transform(lower_class_name.begin(), lower_class_name.end(), lower_class_name.begin(), ::tolower);
@@ -104,7 +104,11 @@ const static std::unordered_map<sstring, std::function<void(const sstring&, cons
{"oversampling", std::bind_front(validate_factor_option, 1.0f, 100.0f)},
// 'rescoring' enables recalculating of similarity scores of candidates retrieved from vector store when quantization is used.
{"rescoring", std::bind_front(validate_enumerated_option, boolean_values)},
};
// 'source_model' is a Cassandra SAI option specifying the embedding model name.
// Used by Cassandra libraries (e.g., CassIO) to tag indexes with the model that produced the vectors.
// Accepted for compatibility but not used by ScyllaDB.
{"source_model", [](const sstring&, const sstring&) { /* accepted for Cassandra compatibility */ }},
};
static constexpr auto TC_TARGET_KEY = "tc";
static constexpr auto PK_TARGET_KEY = "pk";
@@ -250,11 +254,37 @@ bool vector_index::view_should_exist() const {
}
std::optional<cql3::description> vector_index::describe(const index_metadata& im, const schema& base_schema) const {
static const std::unordered_set<sstring> system_options = {
cql3::statements::index_target::target_option_name,
db::index::secondary_index::custom_class_option_name,
db::index::secondary_index::index_version_option_name,
};
fragmented_ostringstream os;
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " << cql3::util::maybe_quote(base_schema.ks_name()) << "."
<< cql3::util::maybe_quote(base_schema.cf_name()) << "(" << targets_to_cql(im.options().at(cql3::statements::index_target::target_option_name)) << ")"
<< " USING 'vector_index'";
// Collect user-provided options (excluding system keys like target, class_name, index_version).
std::map<sstring, sstring> user_options;
for (const auto& [key, value] : im.options()) {
if (!system_options.contains(key)) {
user_options.emplace(key, value);
}
}
if (!user_options.empty()) {
os << " WITH OPTIONS = {";
bool first = true;
for (const auto& [key, value] : user_options) {
if (!first) {
os << ", ";
}
os << "'" << key << "': '" << value << "'";
first = false;
}
os << "}";
}
return cql3::description{
.keyspace = base_schema.ks_name(),
.type = "index",
@@ -1269,7 +1269,7 @@ private:
return info->next;
}
}
on_internal_error(tablet_logger, format("Invalid replica selector", static_cast<int>(info->writes)));
on_internal_error(tablet_logger, format("Invalid write replica selector: {}", static_cast<int>(info->writes)));
});
tablet_logger.trace("get_replicas_for_write({}): table={}, tablet={}, replicas={}", search_token, _table, tablet, replicas);
return replicas;
@@ -1296,7 +1296,7 @@ private:
case write_replica_set_selector::next:
return {};
}
on_internal_error(tablet_logger, format("Invalid replica selector", static_cast<int>(info->writes)));
on_internal_error(tablet_logger, format("Invalid write replica selector: {}", static_cast<int>(info->writes)));
}
host_id_vector_replica_set get_for_reading_helper(const token& search_token) const {
@@ -1314,7 +1314,7 @@ private:
return info->next;
}
}
on_internal_error(tablet_logger, format("Invalid replica selector", static_cast<int>(info->reads)));
on_internal_error(tablet_logger, format("Invalid read replica selector: {}", static_cast<int>(info->reads)));
});
tablet_logger.trace("get_endpoints_for_reading({}): table={}, tablet={}, replicas={}", search_token, _table, tablet, replicas);
return to_host_set(replicas);
@@ -367,6 +367,8 @@ future<> token_metadata_impl::clear_gently() noexcept {
co_await utils::clear_gently(_sorted_tokens);
co_await _topology.clear_gently();
co_await _tablets.clear_gently();
co_await utils::clear_gently(_topology_change_info);
_topology_change_info.reset();
co_return;
}
@@ -490,6 +490,10 @@ const endpoint_dc_rack& topology::get_location_slow(host_id id) const {
throw std::runtime_error(format("Requested location for node {} not in topology. backtrace {}", id, lazy_backtrace()));
}
utils::UUID topology::get_rack_uuid() const {
return utils::UUID_gen::get_name_UUID(format("{}:{}", get_location().dc, get_location().rack));
}
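`get_rack_uuid()` derives a deterministic, name-based UUID from the "dc:rack" string, so every replica in a rack computes the same ID without coordination. A rough Python analogy using the standard library's name-based UUIDs (the namespace choice here is arbitrary; ScyllaDB's `UUID_gen::get_name_UUID` uses its own hashing scheme):

```python
import uuid

# Name-based UUIDs are deterministic: hashing the same "dc:rack" name
# always yields the same UUID, and different racks yield different ones.
# Illustration only -- not ScyllaDB's actual UUID_gen implementation.
def rack_uuid(dc, rack, namespace=uuid.NAMESPACE_DNS):
    return uuid.uuid3(namespace, f"{dc}:{rack}")
```

Determinism is what lets rack-scoped counter IDs be reused by all replicas in the rack, as described in the counters change above.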
void topology::sort_by_proximity(locator::host_id address, host_id_vector_replica_set& addresses) const {
if (can_sort_by_proximity()) {
do_sort_by_proximity(address, addresses);
@@ -344,6 +344,8 @@ public:
return get_location(id).rack;
}
utils::UUID get_rack_uuid() const;
auto get_local_dc_filter() const noexcept {
return [ this, local_dc = get_datacenter() ] (auto ep) {
return get_datacenter(ep) == local_dc;
main.cc
@@ -1807,7 +1807,17 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
checkpoint(stop_signal, "starting repair service");
auto max_memory_repair = memory::stats().total_memory() * 0.1;
repair.start(std::ref(tsm), std::ref(gossiper), std::ref(messaging), std::ref(db), std::ref(proxy), std::ref(bm), std::ref(sys_ks), std::ref(view_builder), std::ref(view_building_worker), std::ref(task_manager), std::ref(mm), max_memory_repair).get();
auto repair_config = sharded_parameter([&] {
return repair_service::config{
.enable_small_table_optimization_for_rbno = cfg->enable_small_table_optimization_for_rbno,
.repair_hints_batchlog_flush_cache_time_in_ms = cfg->repair_hints_batchlog_flush_cache_time_in_ms,
.repair_partition_count_estimation_ratio = cfg->repair_partition_count_estimation_ratio,
.critical_disk_utilization_level = cfg->critical_disk_utilization_level,
.repair_multishard_reader_buffer_hint_size = cfg->repair_multishard_reader_buffer_hint_size,
.repair_multishard_reader_enable_read_ahead = cfg->repair_multishard_reader_enable_read_ahead,
};
});
repair.start(std::ref(tsm), std::ref(gossiper), std::ref(messaging), std::ref(db), std::ref(proxy), std::ref(bm), std::ref(sys_ks), std::ref(view_builder), std::ref(view_building_worker), std::ref(task_manager), std::ref(mm), max_memory_repair, std::move(repair_config)).get();
auto stop_repair_service = defer_verbose_shutdown("repair service", [&repair] {
repair.stop().get();
});


@@ -175,11 +175,9 @@ std::optional<atomic_cell> counter_cell_view::difference(atomic_cell_view a, ato
}
void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset, locator::host_id local_host_id) {
void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset, counter_id local_id) {
// FIXME: allow current_state to be frozen_mutation
utils::UUID local_id = local_host_id.uuid();
auto transform_new_row_to_shards = [&s = *m.schema(), clock_offset, local_id] (column_kind kind, auto& cells) {
cells.for_each_cell([&] (column_id id, atomic_cell_or_collection& ac_o_c) {
auto& cdef = s.column_at(kind, id);
@@ -188,7 +186,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
return; // continue -- we are in lambda
}
auto delta = acv.counter_update_value();
auto cs = counter_shard(counter_id(local_id), delta, clock_offset + 1);
auto cs = counter_shard(local_id, delta, clock_offset + 1);
ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
});
};
@@ -212,7 +210,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
return; // continue -- we are in lambda
}
auto ccv = counter_cell_view(acv);
auto cs = ccv.get_shard(counter_id(local_id));
auto cs = ccv.get_shard(local_id);
if (!cs) {
return; // continue
}
@@ -232,7 +230,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
auto delta = acv.counter_update_value();
if (shards.empty() || shards.front().first > id) {
auto cs = counter_shard(counter_id(local_id), delta, clock_offset + 1);
auto cs = counter_shard(local_id, delta, clock_offset + 1);
ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
} else {
auto& cs = shards.front().second;
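The transformation above turns a counter *update* (a bare delta) into a counter *shard* carrying (id, clock, value): the local id's shard is looked up in the current cell state, its clock is bumped, and the delta is folded into its value; a fresh shard starts at `clock_offset + 1`. A simplified Python model of that bookkeeping (dict-of-tuples in place of `counter_cell_builder`; names are illustrative):

```python
def apply_update(shards: dict, local_id: str, delta: int, clock_offset: int = 0) -> dict:
    # shards maps counter id -> (clock, value); a missing id starts a
    # fresh shard, mirroring counter_shard(local_id, delta,
    # clock_offset + 1) in the diff.
    shards = dict(shards)
    clock, value = shards.get(local_id, (clock_offset, 0))
    shards[local_id] = (clock + 1, value + delta)
    return shards

cell = {"rack1": (3, 10), "rack2": (1, 5)}
cell = apply_update(cell, "rack1", 7)   # our shard: clock bumped, delta applied
```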


@@ -370,7 +370,7 @@ struct counter_cell_mutable_view : basic_counter_cell_view<mutable_view::yes> {
// Transforms mutation dst from counter updates to counter shards using state
// stored in current_state.
// If current_state is present it has to be in the same schema as dst.
void transform_counter_updates_to_shards(mutation& dst, const mutation* current_state, uint64_t clock_offset, locator::host_id local_id);
void transform_counter_updates_to_shards(mutation& dst, const mutation* current_state, uint64_t clock_offset, counter_id local_id);
template<>
struct appending_hash<counter_shard_view> {


@@ -113,8 +113,6 @@ void tracker::set_configuration(const configuration& configuration, index_t next
}
auto newp = this->progress::find(s.addr.id);
if (newp != this->progress::end()) {
// Processing joint configuration and already added
// an entry for this id.
continue;
}
auto oldp = old_progress.find(s.addr.id);
@@ -123,7 +121,7 @@ void tracker::set_configuration(const configuration& configuration, index_t next
} else {
newp = this->progress::emplace(s.addr.id, follower_progress{s.addr.id, next_idx}).first;
}
newp->second.can_vote = s.can_vote;
newp->second.can_vote = configuration.can_vote(s.addr.id);
}
};
emplace_simple_config(configuration.current, _current_voters);
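The fix above matters for joint configurations, where a server can appear in both the old and new member sets: the voting flag must be derived from the configuration as a whole, not from whichever per-set entry happened to be processed first. A sketch of the likely `configuration.can_vote` semantics (a server votes if any member set it appears in marks it a voter — an assumption about the Raft code, not shown in the diff):

```python
def can_vote(member_sets: dict, server_id: str) -> bool:
    # member_sets maps set name ("current"/"previous") to
    # {server_id: is_voter}; joint configurations carry both sets.
    return any(members.get(server_id, False) for members in member_sets.values())

joint = {
    "current": {"a": True, "b": False},   # b demoted to non-voter...
    "previous": {"b": True},              # ...but still a voter in the old set
}
```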


@@ -46,7 +46,9 @@ private:
const dht::sharder& remote_sharder,
unsigned remote_shard,
gc_clock::time_point compaction_time,
incremental_repair_meta inc);
incremental_repair_meta inc,
uint64_t multishard_reader_buffer_hint_size,
bool multishard_reader_enable_read_ahead);
public:
repair_reader(
@@ -60,7 +62,9 @@ public:
uint64_t seed,
read_strategy strategy,
gc_clock::time_point compaction_time,
incremental_repair_meta inc);
incremental_repair_meta inc,
uint64_t multishard_reader_buffer_hint_size,
bool multishard_reader_enable_read_ahead);
future<mutation_fragment_opt>
read_mutation_fragment();


@@ -6,7 +6,6 @@
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include "db/config.hh"
#include "repair.hh"
#include "gms/gossip_address_map.hh"
#include "locator/abstract_replication_strategy.hh"
@@ -137,9 +136,8 @@ std::string_view format_as(row_level_diff_detect_algorithm algo) {
return "unknown";
}
bool should_enable_small_table_optimization_for_rbno(const replica::database& db, sstring keyspace, streaming::stream_reason reason) {
bool should_enable_small_table_optimization_for_rbno(bool enable_small_table_optimization_for_rbno, sstring keyspace, streaming::stream_reason reason) {
bool small_table_optimization = false;
auto enable_small_table_optimization_for_rbno = db.get_config().enable_small_table_optimization_for_rbno();
if (enable_small_table_optimization_for_rbno) {
static const std::unordered_set<sstring> small_table_optimization_enabled_ks = {
"system_distributed",
@@ -1507,7 +1505,7 @@ future<> repair::data_sync_repair_task_impl::run() {
auto id = get_repair_uniq_id();
size_t ranges_reduced_factor = 1;
bool small_table_optimization = should_enable_small_table_optimization_for_rbno(db, keyspace, _reason);
bool small_table_optimization = should_enable_small_table_optimization_for_rbno(rs.get_config().enable_small_table_optimization_for_rbno(), keyspace, _reason);
if (small_table_optimization) {
auto range = dht::token_range(dht::token_range::bound(dht::minimum_token(), false), dht::token_range::bound(dht::maximum_token(), false));
ranges_reduced_factor = _ranges.size();
@@ -1601,7 +1599,7 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr
continue;
}
auto nr_tables = get_nr_tables(db, keyspace_name);
bool small_table_optimization = should_enable_small_table_optimization_for_rbno(db, keyspace_name, reason);
bool small_table_optimization = should_enable_small_table_optimization_for_rbno(_config.enable_small_table_optimization_for_rbno(), keyspace_name, reason);
if (small_table_optimization) {
nr_ranges_total += 1 * nr_tables;
continue;
@@ -1621,7 +1619,7 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr
rlogger.info("bootstrap_with_repair: keyspace={} does not exist any more, ignoring it", keyspace_name);
continue;
}
bool small_table_optimization = should_enable_small_table_optimization_for_rbno(db, keyspace_name, reason);
bool small_table_optimization = should_enable_small_table_optimization_for_rbno(_config.enable_small_table_optimization_for_rbno(), keyspace_name, reason);
dht::token_range_vector desired_ranges;
// Collects the source that will have its range moved to the new node
std::unordered_map<dht::token_range, repair_neighbors> range_sources;
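After the refactor, `should_enable_small_table_optimization_for_rbno` depends only on values passed in explicitly, which makes it trivially unit-testable — no `replica::database` handle needed. A reduced Python sketch (the keyspace allow-list is abbreviated to the one entry visible in the diff, and the `stream_reason` check is omitted):

```python
# Abbreviated: the real allow-list names more system keyspaces.
SMALL_TABLE_OPTIMIZATION_KS = {"system_distributed"}

def should_enable_small_table_optimization_for_rbno(flag_enabled: bool, keyspace: str) -> bool:
    # Decision depends only on explicit inputs now.
    return flag_enabled and keyspace in SMALL_TABLE_OPTIMIZATION_KS

enabled = should_enable_small_table_optimization_for_rbno(True, "system_distributed")
disabled = should_enable_small_table_optimization_for_rbno(False, "system_distributed")
```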


@@ -47,7 +47,6 @@
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/all.hh>
#include <seastar/coroutine/as_future.hh>
#include "db/config.hh"
#include "db/system_keyspace.hh"
#include "service/storage_proxy.hh"
#include "db/batchlog_manager.hh"
@@ -287,7 +286,9 @@ mutation_reader repair_reader::make_reader(
const dht::sharder& remote_sharder,
unsigned remote_shard,
gc_clock::time_point compaction_time,
incremental_repair_meta inc) {
incremental_repair_meta inc,
uint64_t multishard_reader_buffer_hint_size,
bool multishard_reader_enable_read_ahead) {
switch (strategy) {
case read_strategy::local: {
auto ms = mutation_source([&cf, compaction_time] (
@@ -313,12 +314,11 @@ mutation_reader repair_reader::make_reader(
}
case read_strategy::multishard_split: {
std::optional<size_t> multishard_reader_buffer_size;
const auto& dbconfig = db.local().get_config();
if (dbconfig.repair_multishard_reader_buffer_hint_size()) {
if (multishard_reader_buffer_hint_size) {
// Setting the repair buffer size as the multishard reader's buffer
// size helps avoid extra cross-shard round-trips and possible
// evict-recreate cycles.
multishard_reader_buffer_size = dbconfig.repair_multishard_reader_buffer_hint_size();
multishard_reader_buffer_size = multishard_reader_buffer_hint_size;
}
return make_multishard_streaming_reader(db, _schema, _permit, [this] {
auto shard_range = _sharder.next();
@@ -326,7 +326,7 @@ mutation_reader repair_reader::make_reader(
return std::optional<dht::partition_range>(dht::to_partition_range(*shard_range));
}
return std::optional<dht::partition_range>();
}, compaction_time, multishard_reader_buffer_size, read_ahead(dbconfig.repair_multishard_reader_enable_read_ahead()));
}, compaction_time, multishard_reader_buffer_size, read_ahead(multishard_reader_enable_read_ahead));
}
case read_strategy::multishard_filter: {
return make_filtering_reader(make_multishard_streaming_reader(db, _schema, _permit, _range, compaction_time, {}, read_ahead::yes),
@@ -354,14 +354,17 @@ repair_reader::repair_reader(
uint64_t seed,
read_strategy strategy,
gc_clock::time_point compaction_time,
incremental_repair_meta inc)
incremental_repair_meta inc,
uint64_t multishard_reader_buffer_hint_size,
bool multishard_reader_enable_read_ahead)
: _schema(s)
, _permit(std::move(permit))
, _range(dht::to_partition_range(range))
, _sharder(remote_sharder, range, remote_shard)
, _seed(seed)
, _local_read_op(strategy == read_strategy::local ? std::optional(cf.read_in_progress()) : std::nullopt)
, _reader(make_reader(db, cf, strategy, remote_sharder, remote_shard, compaction_time, inc))
, _reader(make_reader(db, cf, strategy, remote_sharder, remote_shard, compaction_time, inc,
multishard_reader_buffer_hint_size, multishard_reader_enable_read_ahead))
{ }
future<mutation_fragment_opt>
@@ -1321,7 +1324,9 @@ private:
return read_strategy;
}),
_compaction_time,
_incremental_repair_meta);
_incremental_repair_meta,
_rs.get_config().repair_multishard_reader_buffer_hint_size(),
bool(_rs.get_config().repair_multishard_reader_enable_read_ahead()));
}
try {
while (cur_size < _max_row_buf_size) {
@@ -2177,8 +2182,13 @@ public:
auto& cm = table.get_compaction_manager();
int64_t repaired_at = _incremental_repair_meta.sstables_repaired_at + 1;
auto modifier = [repaired_at] (sstables::sstable& new_sst) {
// Keep the new sstables marked as being_repaired until repair_update_compaction_ctrl
// is called (after sstables_repaired_at is committed to Raft). This is an additional
// in-memory guard; the classifier itself also protects these sstables via the
// repaired_at > sstables_repaired_at check.
auto modifier = [repaired_at, session = _frozen_topology_guard] (sstables::sstable& new_sst) {
new_sst.update_repaired_at(repaired_at);
new_sst.mark_as_being_repaired(session);
};
std::unordered_map<compaction::compaction_group_view*, std::vector<sstables::shared_sstable>> sstables_by_group;
@@ -2625,7 +2635,7 @@ future<repair_flush_hints_batchlog_response> repair_service::repair_flush_hints_
auto permit = co_await seastar::get_units(_flush_hints_batchlog_sem, 1);
bool updated = false;
auto now = gc_clock::now();
auto cache_time = std::chrono::milliseconds(get_db().local().get_config().repair_hints_batchlog_flush_cache_time_in_ms());
auto cache_time = std::chrono::milliseconds(_config.repair_hints_batchlog_flush_cache_time_in_ms());
auto cache_disabled = cache_time == std::chrono::milliseconds(0);
auto flush_time = now;
db::all_batches_replayed all_replayed = db::all_batches_replayed::yes;
@@ -3495,7 +3505,7 @@ public:
// To save memory and have fewer distinct conditions, we
// use the estimation for RBNO repair as well.
_estimated_partitions *= _shard_task.db.local().get_config().repair_partition_count_estimation_ratio();
_estimated_partitions *= _shard_task.rs.get_config().repair_partition_count_estimation_ratio();
}
parallel_for_each(master.all_nodes(), coroutine::lambda([&] (repair_node_state& ns) -> future<> {
@@ -3631,7 +3641,8 @@ repair_service::repair_service(sharded<service::topology_state_machine>& tsm,
sharded<db::view::view_building_worker>& vbw,
tasks::task_manager& tm,
service::migration_manager& mm,
size_t max_repair_memory)
size_t max_repair_memory,
config cfg)
: _tsm(tsm)
, _gossiper(gossiper)
, _messaging(ms)
@@ -3646,6 +3657,7 @@ repair_service::repair_service(sharded<service::topology_state_machine>& tsm,
, _node_ops_metrics(_repair_module)
, _max_repair_memory(max_repair_memory)
, _memory_sem(max_repair_memory)
, _config(std::move(cfg))
{
tm.register_module("repair", _repair_module);
if (this_shard_id() == 0) {
@@ -3656,7 +3668,7 @@ repair_service::repair_service(sharded<service::topology_state_machine>& tsm,
future<> repair_service::start(utils::disk_space_monitor* dsm) {
if (dsm && (this_shard_id() == 0)) {
_out_of_space_subscription = dsm->subscribe(_db.local().get_config().critical_disk_utilization_level, [this] (auto threshold_reached) {
_out_of_space_subscription = dsm->subscribe(_config.critical_disk_utilization_level, [this] (auto threshold_reached) {
if (threshold_reached) {
return container().invoke_on_all([] (repair_service& rs) { return rs.drain(); });
}
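The buffer-hint plumbing above follows a simple rule: a hint of 0 means "no hint" and the multishard reader keeps its default buffer, while any non-zero value becomes the reader's buffer size to avoid extra cross-shard round-trips. As an isolated sketch:

```python
from typing import Optional

def reader_buffer_size(hint_bytes: int) -> Optional[int]:
    # 0 (the default) leaves the multishard reader's buffer size unset;
    # a non-zero hint is forwarded verbatim.
    return hint_bytes if hint_bytes else None
```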


@@ -109,6 +109,17 @@ struct repair_task_progress {
};
class repair_service : public seastar::peering_sharded_service<repair_service> {
public:
struct config {
utils::updateable_value<bool> enable_small_table_optimization_for_rbno = utils::updateable_value<bool>(true);
utils::updateable_value<uint32_t> repair_hints_batchlog_flush_cache_time_in_ms = utils::updateable_value<uint32_t>(60*1000);
utils::updateable_value<double> repair_partition_count_estimation_ratio = utils::updateable_value<double>(0.1);
utils::updateable_value<float> critical_disk_utilization_level = utils::updateable_value<float>(0.98);
utils::updateable_value<uint64_t> repair_multishard_reader_buffer_hint_size = utils::updateable_value<uint64_t>(1024 * 1024);
utils::updateable_value<uint64_t> repair_multishard_reader_enable_read_ahead = utils::updateable_value<uint64_t>(0);
};
private:
sharded<service::topology_state_machine>& _tsm;
sharded<gms::gossiper>& _gossiper;
netw::messaging_service& _messaging;
@@ -162,6 +173,9 @@ class repair_service : public seastar::peering_sharded_service<repair_service> {
sstring keyspace, std::vector<sstring> cfs,
std::unordered_set<locator::host_id> ignore_nodes);
config _config;
static config default_config() { return {}; }
public:
std::unordered_map<locator::global_tablet_id, std::vector<seastar::rwlock::holder>> _repair_compaction_locks;
@@ -177,12 +191,15 @@ public:
sharded<db::view::view_building_worker>& vbw,
tasks::task_manager& tm,
service::migration_manager& mm,
size_t max_repair_memory
size_t max_repair_memory,
repair_service::config cfg = default_config()
);
~repair_service();
future<> start(utils::disk_space_monitor* dsm);
future<> stop();
const config& get_config() const noexcept { return _config; }
// shutdown() stops all ongoing repairs started on this node (and
// prevents any further repairs from being started). It returns a future
// saying when all repairs have stopped, and attempts to stop them as
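The config struct bundles `utils::updateable_value` handles, so the repair service re-reads live configuration at each use site instead of holding a `db::config` reference. A minimal Python analogue of such a handle (a sketch of the idea, not Scylla's API):

```python
class UpdateableValue:
    # The holder re-reads the current value on every call, so a config
    # change made elsewhere becomes visible without restarting the
    # service that holds the handle.
    def __init__(self, value):
        self._value = value
    def __call__(self):
        return self._value
    def set(self, value):
        self._value = value

cache_time_ms = UpdateableValue(60 * 1000)   # default from the struct above
before = cache_time_ms()
cache_time_ms.set(0)                         # operator disables caching at runtime
after = cache_time_ms()
```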


@@ -97,6 +97,8 @@ class compaction_group {
std::optional<compaction::compaction_backlog_tracker> _backlog_tracker;
repair_classifier_func _repair_sstable_classifier;
counter_id _counter_id;
lw_shared_ptr<logstor::segment_set> _logstor_segments;
std::optional<logstor::separator_buffer> _logstor_separator;
std::vector<future<>> _separator_flushes;
@@ -191,6 +193,14 @@ public:
future<> update_repaired_at_for_merge();
void set_counter_id(counter_id cid) noexcept {
_counter_id = cid;
}
counter_id get_counter_id() const noexcept {
return _counter_id;
}
void set_compaction_strategy_state(compaction::compaction_strategy_state compaction_strategy_state) noexcept;
lw_shared_ptr<memtable_list>& memtables() noexcept;


@@ -635,8 +635,10 @@ database::setup_metrics() {
sm::description("Counts sstables that survived the clustering key filtering. "
"High value indicates that bloom filter is not very efficient and still have to access a lot of sstables to get data.")),
// NOTE: dropped_view_updates is registered as a metric but never incremented in the current
// codebase. Consider removing it entirely if it is confirmed dead.
sm::make_counter("dropped_view_updates", _cf_stats.dropped_view_updates,
sm::description("Counts the number of view updates that have been dropped due to cluster overload. "))(basic_level),
sm::description("Counts the number of view updates that have been dropped due to cluster overload. "))(basic_level).set_skip_when_empty(),
sm::make_counter("view_building_paused", _cf_stats.view_building_paused,
sm::description("Counts the number of times view building process was paused (e.g. due to node unavailability). ")),
@@ -655,7 +657,7 @@ database::setup_metrics() {
sm::description("Counts write operations which were rejected on the replica side because the per-partition limit was reached."))(basic_level),
sm::make_counter("total_writes_rejected_due_to_out_of_space_prevention", _stats->total_writes_rejected_due_to_out_of_space_prevention,
sm::description("Counts write operations which were rejected due to disabled user tables writes."))(basic_level),
sm::description("Counts write operations which were rejected due to disabled user tables writes."))(basic_level).set_skip_when_empty(),
sm::make_counter("total_reads_rate_limited", _stats->total_reads_rate_limited,
sm::description("Counts read operations which were rejected on the replica side because the per-partition limit was reached.")),
@@ -704,11 +706,13 @@ database::setup_metrics() {
sm::make_counter("multishard_query_unpopped_bytes", _stats->multishard_query_unpopped_bytes,
sm::description("The total number of bytes that were extracted from the shard reader but were unconsumed by the query and moved back into the reader.")),
// NOTE: multishard_query_failed_reader_stops appears to have no increment site in the
// current codebase. Consider removing it entirely if it is confirmed dead.
sm::make_counter("multishard_query_failed_reader_stops", _stats->multishard_query_failed_reader_stops,
sm::description("The number of times the stopping of a shard reader failed.")),
sm::description("The number of times the stopping of a shard reader failed.")).set_skip_when_empty(),
sm::make_counter("multishard_query_failed_reader_saves", _stats->multishard_query_failed_reader_saves,
sm::description("The number of times the saving of a shard reader failed.")),
sm::description("The number of times the saving of a shard reader failed.")).set_skip_when_empty(),
sm::make_total_operations("counter_cell_lock_acquisition", _cl_stats->lock_acquisitions,
sm::description("The number of acquired counter cell locks.")),
@@ -2044,10 +2048,12 @@ future<mutation> database::read_and_transform_counter_mutation_to_shards(mutatio
co_await seastar::sleep(std::chrono::milliseconds(100));
}
counter_id my_counter_id = cf.get_counter_id(m);
// ...now that we have the existing state of all affected counter
// cells we can look for our shard in each of them, increment
// its clock and apply the delta.
transform_counter_updates_to_shards(m, mopt ? &*mopt : nullptr, cf.failed_counter_applies_to_memtable(), get_token_metadata().get_my_id());
transform_counter_updates_to_shards(m, mopt ? &*mopt : nullptr, cf.failed_counter_applies_to_memtable(), my_counter_id);
co_return std::move(m);
}


@@ -700,6 +700,10 @@ public:
future<> maybe_split_compaction_group_of(locator::tablet_id);
dht::token_range get_token_range_after_split(const dht::token&) const noexcept;
// Returns a counter_id for use in local counter updates.
counter_id get_counter_id(const mutation&) const;
private:
// If SSTable doesn't need split, the same input SSTable is returned as output.
// If SSTable needs split, then output SSTables are returned and the input SSTable is deleted.


@@ -861,14 +861,31 @@ private:
return idx;
}
// Returns true if the sstable is currently being repaired. Checks the in-memory
// being_repaired flag first, then falls back to a durable check: if the sstable's
// repaired_at equals sstables_repaired_at+1 and the tablet is undergoing repair
// (i.e. tablet_transition_kind::repair), the sstable belongs to the current repair
// round but sstables_repaired_at+1 hasn't been committed to Raft yet (race window).
bool is_being_repaired(const sstables::shared_sstable& sst, int64_t sstables_repaired_at) const noexcept {
if (!sst->being_repaired.uuid().is_null()) {
return true;
}
auto repaired_at = sst->get_stats_metadata().repaired_at;
if (repaired_at != sstables_repaired_at + 1) {
return false;
}
auto& cg = compaction_group_for_sstable(sst);
auto trinfo = tablet_map().get_tablet_transition_info(locator::tablet_id(cg.group_id()));
return trinfo && trinfo->transition == locator::tablet_transition_kind::repair;
}
repair_classifier_func make_repair_sstable_classifier_func() const {
// FIXME: implement it for incremental repair!
return [] (const sstables::shared_sstable& sst, int64_t sstables_repaired_at) {
return [this] (const sstables::shared_sstable& sst, int64_t sstables_repaired_at) {
bool is_repaired = repair::is_repaired(sstables_repaired_at, sst);
if (is_repaired) {
return repair_sstable_classification::repaired;
} else {
if (!sst->being_repaired.uuid().is_null()) {
if (is_being_repaired(sst, sstables_repaired_at)) {
return repair_sstable_classification::repairing;
} else {
return repair_sstable_classification::unrepaired;
@@ -1266,6 +1283,14 @@ dht::token_range table::get_token_range_after_split(const dht::token& token) con
return _sg_manager->get_token_range_after_split(token);
}
counter_id table::get_counter_id(const mutation& m) const {
if (uses_tablets()) {
return storage_group_for_token(m.token()).main_compaction_group()->get_counter_id();
} else {
return counter_id(_erm->get_token_metadata().get_my_id().uuid());
}
}
std::unique_ptr<storage_group_manager> table::make_storage_group_manager() {
std::unique_ptr<storage_group_manager> ret;
if (uses_tablets()) {
@@ -3400,7 +3425,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effec
auto it = _storage_groups.find(group_id);
if (it == _storage_groups.end()) {
throw std::runtime_error(format("Unable to find sibling tablet of id for table {}", group_id, table_id));
throw std::runtime_error(format("Unable to find sibling tablet of id {} for table {}", group_id, table_id));
}
auto& sg = it->second;
sg->for_each_compaction_group([&new_sg, new_range, new_tid, group_id] (const compaction_group_ptr& cg) {
@@ -3482,10 +3507,22 @@ void tablet_storage_group_manager::update_effective_replication_map(
for_each_storage_group([&] (size_t group_id, storage_group& sg) {
const locator::tablet_id tid = static_cast<locator::tablet_id>(group_id);
const locator::tablet_info& tinfo = new_tablet_map->get_tablet_info(tid);
const bool tombstone_gc_enabled = std::ranges::contains(tinfo.replicas, this_replica);
const bool is_pending_replica = !std::ranges::contains(tinfo.replicas, this_replica);
const bool tombstone_gc_enabled = !is_pending_replica;
sg.for_each_compaction_group([tombstone_gc_enabled] (const compaction_group_ptr& cg_ptr) {
// Construct a counter id for use in local counter updates.
// A rack holds at most a single normal tablet replica, so one counter
// id can be reused by every replica that lives in the rack; replicas
// in different racks use different counter ids.
// During migration a rack briefly has two active counter replicas; the
// pending replica then uses a deterministic variation of the rack's
// counter id, so there are at most two distinct counter ids per rack.
auto rack_uuid = erm.get_topology().get_rack_uuid();
auto my_counter_uuid = is_pending_replica ? utils::UUID_gen::negate(rack_uuid) : rack_uuid;
counter_id my_counter_id(my_counter_uuid);
sg.for_each_compaction_group([tombstone_gc_enabled, my_counter_id] (const compaction_group_ptr& cg_ptr) {
cg_ptr->set_tombstone_gc_enabled(tombstone_gc_enabled);
cg_ptr->set_counter_id(my_counter_id);
});
});
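`UUID_gen::negate` gives the pending replica a counter id that is derived from, yet never equal to, the rack's id. One plausible realization (an assumption — the diff does not show `negate`'s implementation) is flipping all 128 bits, which is deterministic, collision-free with the original, and its own inverse:

```python
import uuid

def negate(u: uuid.UUID) -> uuid.UUID:
    # Hypothetical stand-in for utils::UUID_gen::negate: bitwise
    # complement of the 128-bit value. Any deterministic involution
    # that never maps a UUID to itself would serve the same purpose.
    return uuid.UUID(int=u.int ^ ((1 << 128) - 1))

rack_id = uuid.uuid3(uuid.NAMESPACE_DNS, "dc1:rack1")
pending_id = negate(rack_id)   # second distinct id for the same rack
```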


@@ -30,6 +30,9 @@ class caching_options {
friend class schema;
caching_options();
public:
// do not use schema.cdc_options().enabled(), use cdc::cdc_enabled(schema)
// instead. This is because cdc_enabled() also checks for CDC being enabled
// by a vector index.
bool enabled() const {
return _enabled;
}


@@ -131,12 +131,12 @@ bool operator==(const column_mapping_entry& lhs, const column_mapping_entry& rhs
}
bool operator==(const column_mapping& lhs, const column_mapping& rhs) {
const auto& lhs_columns = lhs.columns(), rhs_columns = rhs.columns();
const auto& lhs_columns = lhs.columns(), &rhs_columns = rhs.columns();
if (lhs_columns.size() != rhs_columns.size()) {
return false;
}
for (size_t i = 0, end = lhs_columns.size(); i < end; ++i) {
const column_mapping_entry& lhs_entry = lhs_columns[i], rhs_entry = rhs_columns[i];
const column_mapping_entry& lhs_entry = lhs_columns[i], &rhs_entry = rhs_columns[i];
if (lhs_entry != rhs_entry) {
return false;
}
@@ -1186,10 +1186,18 @@ cql3::description schema::describe(const schema_describe_helper& helper, cql3::d
}
});
// For indexes, cf_name() returns the backing materialized view's
// table name (e.g. "myindex_index"), not the logical index name
// (e.g. "myindex"). Derive the correct name so all callers get
// the user-facing index name.
sstring name = helper.type == schema_describe_helper::type::index
? secondary_index::index_name_from_table_name(cf_name())
: cf_name();
return cql3::description {
.keyspace = ks_name(),
.type = std::move(type),
.name = cf_name(),
.name = std::move(name),
.create_statement = desc_opt == cql3::describe_option::NO_STMTS
? std::nullopt
: std::make_optional(get_create_statement(helper, desc_opt == cql3::describe_option::STMTS_AND_INTERNALS))
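The derivation above works because an index's backing materialized view is named by appending `_index` to the index name, so recovering the logical name is just stripping that suffix. A sketch of what `secondary_index::index_name_from_table_name` presumably does:

```python
def index_name_from_table_name(table_name: str) -> str:
    # "myindex_index" (backing view) -> "myindex" (user-facing index name)
    suffix = "_index"
    return table_name[: -len(suffix)] if table_name.endswith(suffix) else table_name

derived = index_name_from_table_name("myindex_index")
```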


@@ -142,7 +142,7 @@ class TimeUuid:
# need to translate the timestamp to the UNIX time
unix_time = self.uuid.time - self.UNIX_EPOCH_SINCE_GREGORIAN_DAY0
seconds, decimicro_seconds = divmod(unix_time, DECIMICRO_RATIO)
return datetime.datetime.fromtimestamp(seconds), decimicro_seconds
return datetime.datetime.fromtimestamp(seconds, tz=datetime.timezone.utc), decimicro_seconds
def print_field(self, field: str, print_in_hex: bool) -> None:
def print_num(n: int, bits: int) -> str:
@@ -186,7 +186,7 @@ def test_dencode_base36() -> None:
assert timeuuid.encode_with_base36() == encoded_uuid
timestamp, decimicro_seconds = timeuuid.timestamp
assert timestamp == datetime.datetime(2022, 5, 23, 18, 37, 52)
assert timestamp == datetime.datetime(2022, 5, 23, 10, 37, 52, tzinfo=datetime.timezone.utc)
assert decimicro_seconds == 7040000
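The fix pins the decoded timestamp to UTC: a time-UUID counts 100 ns ticks ("decimicroseconds") since the Gregorian epoch (1582-10-15), and a bare `datetime.datetime.fromtimestamp` silently applies the local timezone — which is why the expected test value shifted from 18:37:52 (evidently a UTC+8 machine) to 10:37:52 UTC. The full conversion, self-contained:

```python
import datetime
import uuid

DECIMICRO_RATIO = 10_000_000                          # 100 ns ticks per second
UNIX_EPOCH_SINCE_GREGORIAN_DAY0 = 0x01B21DD213814000  # ticks from 1582-10-15 to 1970-01-01

def timeuuid_timestamp(u: uuid.UUID):
    # u.time is the UUID's 60-bit tick count since the Gregorian epoch.
    unix_ticks = u.time - UNIX_EPOCH_SINCE_GREGORIAN_DAY0
    seconds, decimicro_seconds = divmod(unix_ticks, DECIMICRO_RATIO)
    return datetime.datetime.fromtimestamp(seconds, tz=datetime.timezone.utc), decimicro_seconds

ts, _ = timeuuid_timestamp(uuid.uuid1())  # a fresh v1 UUID decodes to roughly "now"
```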

scripts/compare_build_systems.py (executable file)

File diff suppressed because it is too large


@@ -606,14 +606,27 @@ class std_unordered_map:
def __bool__(self):
return self.__nonzero__()
absl_kBitCount = 16
class flat_hash_map:
def __init__(self, ref):
kt = ref.type.template_argument(0)
vt = ref.type.template_argument(1)
slot_ptr_type = gdb.lookup_type('::std::pair<const {}, {} >'.format(str(kt), str(vt))).pointer()
self.slots = ref['slots_'].cast(slot_ptr_type)
self.size = ref['size_']
try:
# abseil >= lts_2026_01_07
value = ref['settings_']['value']
self.size = value['size_']['data_'] >> (1 + absl_kBitCount)
capacity = int(value['capacity_'])
heap_or_soo = value['heap_or_soo_']
if capacity <= 1:
# SOO mode: slot data stored inline in soo_data
self.slots = heap_or_soo['soo_data'].address.cast(slot_ptr_type)
else:
self.slots = heap_or_soo['heap']['slot_array']['p'].cast(slot_ptr_type)
except gdb.error:
self.slots = ref['slots_'].cast(slot_ptr_type)
self.size = ref['size_']
def __len__(self):
return self.size
@@ -640,7 +653,13 @@ class absl_container:
def __init__(self, ref):
self.val = ref
HasInfozShift = 1
self.size = ref["settings_"]["value"]["size_"] >> HasInfozShift
try:
# abseil >= lts_2026_01_07
self.size = ref["settings_"]["value"]["size_"]["data_"] >> (HasInfozShift + absl_kBitCount)
self.new_layout = True
except gdb.error:
self.size = ref["settings_"]["value"]["size_"] >> HasInfozShift
self.new_layout = False
def __len__(self):
return self.size
@@ -649,10 +668,29 @@ class absl_container:
if self.size == 0:
return
capacity = int(self.val["settings_"]["value"]["capacity_"])
control = self.val["settings_"]["value"]["control_"]
# for the map the slot_type is std::pair<K, V>
slot_type = gdb.lookup_type(str(self.val.type.strip_typedefs()) + "::slot_type")
slots = self.val["settings_"]["value"]["slots_"].cast(slot_type.pointer())
if self.new_layout:
# abseil >= lts_2026_01_07: SOO (Small Object Optimization) support.
# When capacity <= 1, the table may be in SOO mode where the single
# slot is stored inline in heap_or_soo_.soo_data (no control bytes).
heap_or_soo = self.val["settings_"]["value"]["heap_or_soo_"]
if capacity <= 1:
# SOO mode: slot data is stored directly in soo_data
soo_data = heap_or_soo["soo_data"]
slot = soo_data.address.cast(slot_type.pointer())
if slot[0]['value'].type.name.find("::map_slot_type") != -1:
yield slot[0]['key'], slot[0]['value']['second']
else:
yield slot[0]['key'], slot[0]['value']
return
# Non-SOO mode: control and slots are in heap
heap = heap_or_soo["heap"]
control = heap["control"]["p"]
slots = heap["slot_array"]["p"].cast(slot_type.pointer())
else:
control = self.val["settings_"]["value"]["control_"]
slots = self.val["settings_"]["value"]["slots_"].cast(slot_type.pointer())
for i in range(capacity):
ctrl_t = int(control[i])
# if the control is empty or deleted, its value is less than -1, see


@@ -232,6 +232,7 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_STREAMS),
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_TIMESTAMPS),
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::TABLETS),
};
if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||


@@ -76,14 +76,15 @@ public:
: _cs(cs), _auth_service(auth_service), _sl_controller(sl_controller) {}
friend client_state;
public:
client_state get() const {
return client_state(_cs, _auth_service, _sl_controller);
client_state get(abort_source* as = nullptr) const {
return client_state(_cs, _auth_service, _sl_controller, as);
}
};
private:
client_state(const client_state* cs,
seastar::sharded<auth::service>* auth_service,
seastar::sharded<qos::service_level_controller>* sl_controller)
seastar::sharded<qos::service_level_controller>* sl_controller,
abort_source* as)
: _keyspace(cs->_keyspace)
, _user(cs->_user)
, _auth_state(cs->_auth_state)
@@ -94,6 +95,7 @@ private:
, _sl_controller(sl_controller ? &sl_controller->local() : nullptr)
, _default_timeout_config(cs->_default_timeout_config)
, _timeout_config(cs->_timeout_config)
, _as(as)
, _enabled_protocol_extensions(cs->_enabled_protocol_extensions)
{}
friend client_state_for_another_shard;
@@ -153,6 +155,11 @@ private:
workload_type _workload_type = workload_type::unspecified;
// Used to communicate with the code executing user requests.
// It's a way to indicate that we might abort processing the
// request, e.g. if the corresponding connection has been severed.
abort_source* _as{nullptr};
public:
struct internal_tag {};
struct external_tag {};
@@ -211,14 +218,16 @@ public:
qos::service_level_controller* sl_controller,
timeout_config timeout_config,
const socket_address& remote_address = socket_address(),
bool bypass_auth_checks = false)
bool bypass_auth_checks = false,
abort_source* as = nullptr)
: _is_internal(false)
, _bypass_auth_checks(bypass_auth_checks)
, _remote_address(remote_address)
, _auth_service(&auth_service)
, _sl_controller(sl_controller)
, _default_timeout_config(timeout_config)
, _timeout_config(timeout_config) {
, _timeout_config(timeout_config)
, _as(as) {
if (!auth_service.underlying_authenticator().require_authentication()) {
_user = auth::authenticated_user();
}
@@ -244,29 +253,32 @@ public:
return *_sl_controller;
}
client_state(internal_tag) : client_state(internal_tag{}, infinite_timeout_config)
client_state(internal_tag, abort_source* as = nullptr) : client_state(internal_tag{}, infinite_timeout_config, as)
{}
client_state(internal_tag, const timeout_config& config)
client_state(internal_tag, const timeout_config& config, abort_source* as = nullptr)
: _keyspace("system")
, _is_internal(true)
, _bypass_auth_checks(true)
, _default_timeout_config(config)
, _timeout_config(config)
, _as(as)
{}
client_state(internal_tag, auth::service& auth_service, qos::service_level_controller& sl_controller, sstring username)
client_state(internal_tag, auth::service& auth_service, qos::service_level_controller& sl_controller, sstring username, abort_source* as = nullptr)
: _user(auth::authenticated_user(username))
, _auth_state(auth_state::READY)
, _is_internal(true)
, _bypass_auth_checks(true)
, _auth_service(&auth_service)
, _sl_controller(&sl_controller)
, _as(as)
{}
client_state(auth::service& auth_service,
qos::service_level_controller* sl_controller,
forwarded_client_state&& forwarded_state)
forwarded_client_state&& forwarded_state,
abort_source* as = nullptr)
: _keyspace(std::move(forwarded_state.keyspace))
, _user(forwarded_state.username ? auth::authenticated_user(*forwarded_state.username) : auth::authenticated_user{})
, _auth_state(auth_state::READY)
@@ -277,6 +289,7 @@ public:
, _sl_controller(sl_controller)
, _default_timeout_config(forwarded_state.timeout_config)
, _timeout_config(std::move(forwarded_state.timeout_config))
, _as(as)
, _enabled_protocol_extensions(cql_transport::cql_protocol_extension_enum_set::from_mask(
forwarded_state.protocol_extensions_mask))
{}
@@ -392,6 +405,16 @@ public:
return _keyspace;
}
abort_source& get_abort_source() {
if (_as == nullptr) {
utils::on_internal_error("client_state::get_abort_source(): Tried to dereference nullptr");
}
return *_as;
}
abort_source* get_abort_source_ptr() noexcept {
return _as;
}
/**
* Sets active user. Does _not_ validate anything
*/

View File

@@ -2664,7 +2664,7 @@ future<> paxos_response_handler::learn_decision(lw_shared_ptr<paxos::proposal> d
// We use the first path for CDC mutations (if present) and the latter for "paxos mutations".
// Attempts to send both kinds of mutations in one shot caused an infinite loop.
future<> f_cdc = make_ready_future<>();
if (_schema->cdc_options().enabled()) {
if (cdc::cdc_enabled(*_schema)) {
auto update_mut = decision->update.unfreeze(_schema);
const auto base_tbl_id = update_mut.column_family_id();
utils::chunked_vector<mutation> update_mut_vec{std::move(update_mut)};

View File

@@ -20,7 +20,6 @@
#include "raft/raft.hh"
#include "auth/cache.hh"
#include <ranges>
#include <seastar/core/shard_id.hh>
#include <seastar/core/sleep.hh>
#include "service/qos/raft_service_level_distributed_data_accessor.hh"
#include "service/qos/service_level_controller.hh"
@@ -119,11 +118,9 @@
#include "service/task_manager_module.hh"
#include "service/topology_mutation.hh"
#include "cql3/query_processor.hh"
#include "service/qos/service_level_controller.hh"
#include <csignal>
#include "utils/labels.hh"
#include "view_info.hh"
#include "raft/raft.hh"
#include "debug.hh"
#include <boost/algorithm/string/split.hpp>
@@ -352,30 +349,6 @@ bool storage_service::is_replacing() {
return !cfg.replace_address().empty();
}
bool storage_service::is_first_node() {
if (is_replacing()) {
return false;
}
auto seeds = _gossiper.get_seeds();
if (seeds.empty()) {
return false;
}
// Node with the smallest IP address is chosen as the very first node
// in the cluster. The first node is the only node that does not
// bootstrap in the cluster. All other nodes will bootstrap.
std::vector<gms::inet_address> sorted_seeds(seeds.begin(), seeds.end());
std::sort(sorted_seeds.begin(), sorted_seeds.end());
if (sorted_seeds.front() == get_broadcast_address()) {
slogger.info("I am the first node in the cluster. Skip bootstrap. Node={}", get_broadcast_address());
return true;
}
return false;
}
bool storage_service::should_bootstrap() {
return !_sys_ks.local().bootstrap_complete() && !is_first_node();
}
/* Broadcasts the chosen tokens through gossip,
* together with a CDC generation timestamp and STATUS=NORMAL.
*
@@ -1575,9 +1548,7 @@ future<> storage_service::join_topology(sharded<service::storage_proxy>& proxy,
raft_replace_info = raft_group0::replace_info {
.raft_id = raft::server_id{ri->host_id.uuid()},
};
} else if (should_bootstrap()) {
co_await check_for_endpoint_collision(initial_contact_nodes);
} else {
} else if (_sys_ks.local().bootstrap_complete()) {
slogger.info("Performing gossip shadow round, initial_contact_nodes={}", initial_contact_nodes);
co_await _gossiper.do_shadow_round(initial_contact_nodes, gms::gossiper::mandatory::no);
_gossiper.check_snitch_name_matches(_snitch.local()->get_name());
@@ -1833,7 +1804,7 @@ future<> storage_service::on_change(gms::inet_address endpoint, locator::host_id
slogger.debug("endpoint={} on_change: states={}, permit_id={}", endpoint, states, pid);
auto ep_state = _gossiper.get_endpoint_state_ptr(host_id);
if (!ep_state || _gossiper.is_dead_state(*ep_state)) {
if (!ep_state || _gossiper.is_left(*ep_state)) {
slogger.debug("Ignoring state change for dead or unknown endpoint: {}", endpoint);
co_return;
}
@@ -1918,12 +1889,8 @@ std::optional<db::system_keyspace::peer_info> storage_service::get_peer_info_for
auto set_field = [&]<typename T> (std::optional<T>& field,
const gms::versioned_value& value,
std::string_view name,
bool managed_by_raft)
std::string_view name)
{
if (managed_by_raft) {
return;
}
try {
field = T(value.value());
} catch (...) {
@@ -1932,31 +1899,17 @@ std::optional<db::system_keyspace::peer_info> storage_service::get_peer_info_for
}
};
// Fields managed by raft are skipped here
for (const auto& [state, value] : app_state_map) {
switch (state) {
case application_state::DC:
set_field(get_peer_info().data_center, value, "data_center", true);
break;
case application_state::INTERNAL_IP:
set_field(get_peer_info().preferred_ip, value, "preferred_ip", false);
break;
case application_state::RACK:
set_field(get_peer_info().rack, value, "rack", true);
break;
case application_state::RELEASE_VERSION:
set_field(get_peer_info().release_version, value, "release_version", true);
set_field(get_peer_info().preferred_ip, value, "preferred_ip");
break;
case application_state::RPC_ADDRESS:
set_field(get_peer_info().rpc_address, value, "rpc_address", false);
set_field(get_peer_info().rpc_address, value, "rpc_address");
break;
case application_state::SCHEMA:
set_field(get_peer_info().schema_version, value, "schema_version", false);
break;
case application_state::TOKENS:
// tokens are updated separately
break;
case application_state::SUPPORTED_FEATURES:
set_field(get_peer_info().supported_features, value, "supported_features", true);
set_field(get_peer_info().schema_version, value, "schema_version");
break;
default:
break;
@@ -2425,27 +2378,6 @@ future<> storage_service::wait_for_group0_stop() {
}
}
future<> storage_service::check_for_endpoint_collision(std::unordered_set<gms::inet_address> initial_contact_nodes) {
slogger.debug("Starting shadow gossip round to check for endpoint collision");
return seastar::async([this, initial_contact_nodes] {
bool found_bootstrapping_node = false;
auto local_features = _feature_service.supported_feature_set();
do {
slogger.info("Performing gossip shadow round");
_gossiper.do_shadow_round(initial_contact_nodes, gms::gossiper::mandatory::yes).get();
_gossiper.check_snitch_name_matches(_snitch.local()->get_name());
auto addr = get_broadcast_address();
if (!_gossiper.is_safe_for_bootstrap(addr)) {
throw std::runtime_error(::format("A node with address {} already exists, cancelling join. "
"Use replace_address if you want to replace this node.", addr));
}
} while (found_bootstrapping_node);
slogger.info("Checking bootstrapping/leaving/moving nodes: ok (check_for_endpoint_collision)");
_gossiper.reset_endpoint_state_map().get();
});
}
future<> storage_service::remove_endpoint(inet_address endpoint, gms::permit_id pid) {
auto host_id_opt = _gossiper.try_get_host_id(endpoint);
if (host_id_opt) {
@@ -2498,17 +2430,6 @@ storage_service::prepare_replacement_info(std::unordered_set<gms::inet_address>
replace_host_id = _gossiper.get_host_id(replace_address);
}
auto state = _gossiper.get_endpoint_state_ptr(replace_host_id);
if (!state) {
throw std::runtime_error(::format("Cannot replace_address {} because it doesn't exist in gossip", replace_address));
}
// Reject to replace a node that has left the ring
auto status = _gossiper.get_gossip_status(replace_host_id);
if (status == gms::versioned_value::STATUS_LEFT || status == gms::versioned_value::REMOVED_TOKEN) {
throw std::runtime_error(::format("Cannot replace_address {} because it has left the ring, status={}", replace_address, status));
}
auto dc_rack = get_dc_rack_for(replace_host_id).value_or(locator::endpoint_dc_rack::default_location);
auto ri = replacement_info {
@@ -2802,12 +2723,7 @@ future<> storage_service::raft_decommission() {
rtlogger.info("decommission: waiting for completion (request ID: {})", request_id);
auto error = co_await wait_for_topology_request_completion(request_id);
if (error.empty()) {
// Need to set it otherwise gossiper will try to send shutdown on exit
rtlogger.info("decommission: successfully removed from topology (request ID: {}), updating gossip status", request_id);
co_await _gossiper.add_local_application_state(std::pair(gms::application_state::STATUS, gms::versioned_value::left({}, _gossiper.now().time_since_epoch().count())));
rtlogger.info("Decommission succeeded. Request ID: {}", request_id);
} else {
if (!error.empty()) {
auto err = fmt::format("Decommission failed. See earlier errors ({}). Request ID: {}", error, request_id);
rtlogger.error("{}", err);
throw std::runtime_error(err);
@@ -3824,14 +3740,14 @@ storage_service::describe_ring(const sstring& keyspace, bool include_only_local_
}
future<utils::chunked_vector<dht::token_range_endpoints>>
storage_service::describe_ring_for_table(const sstring& keyspace_name, const sstring& table_name) const {
slogger.debug("describe_ring for table {}.{}", keyspace_name, table_name);
auto& t = _db.local().find_column_family(keyspace_name, table_name);
storage_service::describe_ring_for_table(table_id tid) const {
auto& t = _db.local().find_column_family(tid);
const auto& schema = *t.schema();
slogger.debug("describe_ring for table {}.{}", schema.ks_name(), schema.cf_name());
if (!t.uses_tablets()) {
auto ranges = co_await describe_ring(keyspace_name);
auto ranges = co_await describe_ring(schema.ks_name());
co_return ranges;
}
table_id tid = t.schema()->id();
auto erm = t.get_effective_replication_map();
auto& tmap = erm->get_token_metadata_ptr()->tablets().get_tablet_map(tid);
const auto& topology = erm->get_topology();

View File

@@ -35,7 +35,6 @@
#include <seastar/core/gate.hh>
#include "replica/database_fwd.hh"
#include "streaming/stream_reason.hh"
#include <seastar/core/sharded.hh>
#include "service/migration_listener.hh"
#include <seastar/core/metrics_registration.hh>
#include <seastar/core/shared_ptr.hh>
@@ -56,7 +55,6 @@
class node_ops_cmd_request;
class node_ops_cmd_response;
class node_ops_info;
enum class node_ops_cmd : uint32_t;
class repair_service;
class protocol_server;
@@ -177,7 +175,6 @@ class storage_service : public service::migration_listener, public gms::i_endpoi
private:
using token = dht::token;
using token_range_endpoints = dht::token_range_endpoints;
using endpoint_details = dht::endpoint_details;
using boot_strapper = dht::boot_strapper;
using token_metadata = locator::token_metadata;
using shared_token_metadata = locator::shared_token_metadata;
@@ -214,7 +211,6 @@ private:
sstring _operation_in_progress;
seastar::metrics::metric_groups _metrics;
using client_shutdown_hook = noncopyable_function<void()>;
std::vector<protocol_server*> _protocol_servers;
std::vector<std::any> _listeners;
named_gate _async_gate;
@@ -474,8 +470,6 @@ private:
public:
future<> check_for_endpoint_collision(std::unordered_set<gms::inet_address> initial_contact_nodes);
future<> join_cluster(sharded<service::storage_proxy>& proxy,
start_hint_manager start_hm, gms::generation_type new_generation);
@@ -492,9 +486,7 @@ public:
future<> wait_for_group0_stop();
private:
bool should_bootstrap();
bool is_replacing();
bool is_first_node();
future<> start_sys_dist_ks() const;
future<> join_topology(sharded<service::storage_proxy>& proxy,
std::unordered_set<gms::inet_address> initial_contact_nodes,
@@ -528,7 +520,7 @@ public:
future<utils::chunked_vector<token_range_endpoints>> describe_ring(const sstring& keyspace, bool include_only_local_dc = false) const;
future<utils::chunked_vector<dht::token_range_endpoints>> describe_ring_for_table(const sstring& keyspace_name, const sstring& table_name) const;
future<utils::chunked_vector<dht::token_range_endpoints>> describe_ring_for_table(table_id tid) const;
/**
* Retrieve a map of tokens to endpoints, including the bootstrapping ones.
@@ -735,9 +727,6 @@ public:
future<> removenode(locator::host_id host_id, locator::host_id_or_endpoint_list ignore_nodes);
future<> mark_excluded(const std::vector<locator::host_id>&);
future<node_ops_cmd_response> node_ops_cmd_handler(gms::inet_address coordinator, std::optional<locator::host_id> coordinator_host_id, node_ops_cmd_request req);
void node_ops_cmd_check(gms::inet_address coordinator, const node_ops_cmd_request& req);
future<> node_ops_cmd_heartbeat_updater(node_ops_cmd cmd, node_ops_id uuid, std::list<gms::inet_address> nodes, lw_shared_ptr<bool> heartbeat_updater_done);
void on_node_ops_registered(node_ops_id);
future<mode> get_operation_mode();

View File

@@ -8,12 +8,14 @@
#include "coordinator.hh"
#include "db/consistency_level_type.hh"
#include "exceptions/exceptions.hh"
#include "raft/raft.hh"
#include "schema/schema.hh"
#include "replica/database.hh"
#include "locator/tablet_replication_strategy.hh"
#include "service/strong_consistency/state_machine.hh"
#include "service/strong_consistency/groups_manager.hh"
#include "utils/error_injection.hh"
#include "idl/strong_consistency/state_machine.dist.hh"
#include "idl/strong_consistency/state_machine.dist.impl.hh"
#include "gms/gossiper.hh"
@@ -23,6 +25,30 @@ namespace service::strong_consistency {
static logging::logger logger("sc_coordinator");
// FIXME: Once the drivers support new error codes corresponding
// to timeouts of queries to strongly consistent tables, use
// a new, dedicated exception type instead of this.
struct write_timeout : public exceptions::mutation_write_timeout_exception {
write_timeout(std::string_view ks, std::string_view cf)
: exceptions::mutation_write_timeout_exception(
seastar::format("Query timed out for {}.{}", ks, cf),
db::consistency_level::ONE, 0, 1, db::write_type::SIMPLE
)
{}
};
// FIXME: Once the drivers support new error codes corresponding
// to timeouts of queries to strongly consistent tables, use
// a new, dedicated exception type instead of this.
struct read_timeout : public exceptions::read_timeout_exception {
read_timeout(std::string_view ks, std::string_view cf)
: exceptions::read_timeout_exception(
seastar::format("Query timed out for {}.{}", ks, cf),
db::consistency_level::ONE, 0, 1, false
)
{}
};
static const locator::tablet_replica* find_replica(const locator::tablet_info& tinfo, locator::host_id id) {
const auto it = std::ranges::find_if(tinfo.replicas,
[&] (const locator::tablet_replica& r) {
@@ -31,6 +57,34 @@ static const locator::tablet_replica* find_replica(const locator::tablet_info& t
return it == tinfo.replicas.end() ? nullptr : &*it;
}
// Subscribe target to sources and return an array of the corresponding
// subscriptions.
//
// The subscribing process will follow the order of the passed abort
// sources. The corresponding subscriptions in the returned array will
// also keep the same order.
//
// If some of the passed abort sources have already been triggered,
// they will immediately trigger target. This will be done in their
// relative order in the function's argument list.
template <std::same_as<abort_source>... Ts>
static auto chain_abort_sources(abort_source& target, Ts&... sources) {
static_assert(sizeof...(Ts) > 0, "We need to chain at least one abort source!");
auto source_array = std::array{std::ref(sources)...};
for (abort_source& source : source_array) {
if (source.abort_requested()) {
target.request_abort_ex(source.abort_requested_exception_ptr());
}
}
return std::array{
sources.subscribe([&target] (const std::optional<std::exception_ptr>& eptr) noexcept {
target.request_abort_ex(eptr.value_or(target.get_default_exception()));
})...
};
}
struct coordinator::operation_ctx {
locator::effective_replication_map_ptr erm;
raft_server raft_server;
@@ -65,13 +119,13 @@ static locator::tablet_replica select_closest_replica(const gms::gossiper& gossi
return *it;
}
auto coordinator::create_operation_ctx(const schema& schema, const dht::token& token)
auto coordinator::create_operation_ctx(const schema& schema, const dht::token& token, abort_source& as)
-> future<value_or_redirect<operation_ctx>>
{
auto erm = schema.table().get_effective_replication_map();
if (const auto* tablet_aware_rs = erm->get_replication_strategy().maybe_as_tablet_aware();
!tablet_aware_rs ||
tablet_aware_rs->get_consistency() != data_dictionary::consistency_config_option::local)
tablet_aware_rs->get_consistency() != data_dictionary::consistency_config_option::global)
{
on_internal_error(logger,
format("Unexpected replication strategy '{}' with consistency '{}' for table {}.{}",
@@ -96,7 +150,11 @@ auto coordinator::create_operation_ctx(const schema& schema, const dht::token& t
};
}
const auto& raft_info = tablet_map.get_tablet_raft_info(tablet_id);
auto raft_server = co_await _groups_manager.acquire_server(raft_info.group_id);
co_await utils::get_local_injector().inject("sc_coordinator_wait_before_acquire_server",
utils::wait_for_message(5min));
auto raft_server = co_await _groups_manager.acquire_server(raft_info.group_id, as);
co_return operation_ctx {
.erm = std::move(erm),
@@ -116,71 +174,108 @@ coordinator::coordinator(groups_manager& groups_manager, replica::database& db,
future<value_or_redirect<>> coordinator::mutate(schema_ptr schema,
const dht::token& token,
mutation_gen&& mutation_gen)
mutation_gen&& mutation_gen,
timeout_clock::time_point timeout,
abort_source& as)
{
auto op_result = co_await create_operation_ctx(*schema, token);
if (const auto* redirect = get_if<need_redirect>(&op_result)) {
co_return *redirect;
}
auto& op = get<operation_ctx>(op_result);
auto aoe = abort_on_expiry<timeout_clock>(timeout);
[[maybe_unused]] const auto subs = chain_abort_sources(aoe.abort_source(), as);
while (true) {
auto disposition = op.raft_server.begin_mutate();
if (const auto* not_a_leader = get_if<raft::not_a_leader>(&disposition)) {
const auto leader_host_id = locator::host_id{not_a_leader->leader.uuid()};
const auto* target = find_replica(op.tablet_info, leader_host_id);
if (!target) {
on_internal_error(logger,
::format("table {}.{}, tablet {}, current leader {} is not a replica, replicas {}",
schema->ks_name(), schema->cf_name(), op.tablet_id,
leader_host_id, op.tablet_info.replicas));
try {
auto op_result = co_await create_operation_ctx(*schema, token, aoe.abort_source());
if (const auto* redirect = get_if<need_redirect>(&op_result)) {
co_return *redirect;
}
auto& op = get<operation_ctx>(op_result);
while (true) {
co_await utils::get_local_injector().inject("sc_coordinator_wait_before_begin_mutate",
utils::wait_for_message(5min));
auto disposition = op.raft_server.begin_mutate(aoe.abort_source());
if (const auto* not_a_leader = get_if<raft::not_a_leader>(&disposition)) {
const auto leader_host_id = locator::host_id{not_a_leader->leader.uuid()};
const auto* target = find_replica(op.tablet_info, leader_host_id);
if (!target) {
on_internal_error(logger,
::format("table {}.{}, tablet {}, current leader {} is not a replica, replicas {}",
schema->ks_name(), schema->cf_name(), op.tablet_id,
leader_host_id, op.tablet_info.replicas));
}
co_return need_redirect{*target};
}
co_return need_redirect{*target};
}
if (auto* wait_for_leader = get_if<raft_server::need_wait_for_leader>(&disposition)) {
co_await std::move(wait_for_leader->future);
continue;
}
const auto [ts, term] = get<raft_server::timestamp_with_term>(disposition);
const raft_command command {
.mutation{mutation_gen(ts)}
};
raft::command raft_cmd;
ser::serialize(raft_cmd, command);
logger.debug("mutate(): add_entry({}), term {}",
command.mutation.pretty_printer(schema), term);
try {
co_await op.raft_server.server().add_entry(std::move(raft_cmd),
raft::wait_type::committed,
nullptr);
co_return std::monostate{};
} catch (...) {
auto ex = std::current_exception();
if (try_catch<raft::request_aborted>(ex) || try_catch<raft::stopped_error>(ex)) {
// Holding raft_server.holder guarantees that the raft::server is not
// aborted until the holder is released.
on_internal_error(logger,
format("mutate(): add_entry, unexpected exception {}, table {}.{}, tablet {}, term {}",
ex, schema->ks_name(), schema->cf_name(), op.tablet_id, term));
} else if (try_catch<raft::not_a_leader>(ex) || try_catch<raft::dropped_entry>(ex)) {
logger.debug("mutate(): add_entry, got retriable error {}, table {}.{}, tablet {}, term {}",
ex, schema->ks_name(), schema->cf_name(), op.tablet_id, term);
if (auto* wait_for_leader = get_if<raft_server::need_wait_for_leader>(&disposition)) {
co_await std::move(wait_for_leader->future);
continue;
} else if (try_catch<raft::commit_status_unknown>(ex)) {
logger.debug("mutate(): add_entry, got commit_status_unknown {}, table {}.{}, tablet {}, term {}",
ex, schema->ks_name(), schema->cf_name(), op.tablet_id, term);
// FIXME: use a dedicated ERROR_CODE instead of SERVER_ERROR
throw exceptions::server_exception(
"The outcome of this statement is unknown. It may or may not have been applied. "
"Retrying the statement may be necessary.");
}
const auto [ts, term] = get<raft_server::timestamp_with_term>(disposition);
// We know nothing about other errors, let the cql server convert them to SERVER_ERROR.
const raft_command command {
.mutation{mutation_gen(ts)}
};
raft::command raft_cmd;
ser::serialize(raft_cmd, command);
logger.debug("mutate(): add_entry({}), term {}",
command.mutation.pretty_printer(schema), term);
co_await utils::get_local_injector().inject("sc_coordinator_wait_before_add_entry",
utils::wait_for_message(5min));
try {
co_await op.raft_server.server().add_entry(std::move(raft_cmd),
raft::wait_type::committed,
&aoe.abort_source());
co_return std::monostate{};
} catch (...) {
auto ex = std::current_exception();
if (try_catch<raft::stopped_error>(ex)) {
// Holding raft_server.holder guarantees that the raft::server is not
// aborted until the holder is released.
on_internal_error(logger,
format("mutate(): add_entry, unexpected exception {}, table {}.{}, tablet {}, term {}",
ex, schema->ks_name(), schema->cf_name(), op.tablet_id, term));
} else if (try_catch<raft::not_a_leader>(ex) || try_catch<raft::dropped_entry>(ex)) {
logger.debug("mutate(): add_entry, got retriable error {}, table {}.{}, tablet {}, term {}",
ex, schema->ks_name(), schema->cf_name(), op.tablet_id, term);
continue;
} else if (try_catch<raft::commit_status_unknown>(ex)) {
logger.debug("mutate(): add_entry, got commit_status_unknown {}, table {}.{}, tablet {}, term {}",
ex, schema->ks_name(), schema->cf_name(), op.tablet_id, term);
// FIXME: use a dedicated ERROR_CODE instead of SERVER_ERROR
throw exceptions::server_exception(
"The outcome of this statement is unknown. It may or may not have been applied. "
"Retrying the statement may be necessary.");
}
// Let the outer code handle other errors.
throw;
}
}
} catch (...) {
auto ex = std::current_exception();
// Unfortunately, timeouts can materialize in different forms depending
// on which statement throws the exception.
//
// * raft::request_aborted: If the abort source passed to a raft::server's
// method was triggered.
// * seastar::abort_requested_exception: Can be thrown by create_operation_ctx.
// * timed_out_error: Can be thrown by the abort_on_expiry.
// * condition_variable_timed_out: Can be thrown by begin_mutate.
//
// We handle them collectively here.
if (try_catch<raft::request_aborted>(ex) || try_catch<seastar::abort_requested_exception>(ex)
|| try_catch<seastar::timed_out_error>(ex) || try_catch<seastar::condition_variable_timed_out>(ex)) {
logger.trace("mutate(): request timed out with error {}, table {}.{}, token {}",
ex, schema->ks_name(), schema->cf_name(), token);
co_return coroutine::return_exception(write_timeout(schema->ks_name(), schema->cf_name()));
} else {
logger.trace("mutate(): unknown exception {}, table {}.{}, token {}",
ex, schema->ks_name(), schema->cf_name(), token);
// We know nothing about other errors. Let the CQL server convert them to SERVER_ERROR.
throw;
}
}
@@ -190,21 +285,52 @@ auto coordinator::query(schema_ptr schema,
const query::read_command& cmd,
const dht::partition_range_vector& ranges,
tracing::trace_state_ptr trace_state,
db::timeout_clock::time_point timeout
timeout_clock::time_point timeout,
abort_source& as
) -> future<query_result_type>
{
auto op_result = co_await create_operation_ctx(*schema, ranges[0].start()->value().token());
if (const auto* redirect = get_if<need_redirect>(&op_result)) {
co_return *redirect;
auto aoe = abort_on_expiry<timeout_clock>(timeout);
[[maybe_unused]] const auto subs = chain_abort_sources(aoe.abort_source(), as);
try {
auto op_result = co_await create_operation_ctx(*schema, ranges[0].start()->value().token(), aoe.abort_source());
if (const auto* redirect = get_if<need_redirect>(&op_result)) {
co_return *redirect;
}
auto& op = get<operation_ctx>(op_result);
co_await utils::get_local_injector().inject("sc_coordinator_wait_before_query_read_barrier",
utils::wait_for_message(5min));
co_await op.raft_server.server().read_barrier(&aoe.abort_source());
auto [result, cache_temp] = co_await _db.query(schema, cmd,
query::result_options::only_result(), ranges, trace_state, timeout);
co_return std::move(result);
} catch (...) {
auto ex = std::current_exception();
// Unfortunately, timeouts can materialize in different forms depending
// on which statement throws the exception.
//
// * raft::request_aborted: If the abort source passed to a raft::server's
// method was triggered.
// * seastar::abort_requested_exception: Can be thrown by create_operation_ctx.
// * timed_out_error: Can be thrown by the abort_on_expiry.
//
// We handle them collectively here.
if (try_catch<raft::request_aborted>(ex) || try_catch<seastar::abort_requested_exception>(ex)
|| try_catch<timed_out_error>(ex)) {
logger.trace("query(): request timed out with error {}, table {}.{}, read cmd {}",
ex, schema->ks_name(), schema->cf_name(), cmd);
co_return coroutine::return_exception(read_timeout(schema->ks_name(), schema->cf_name()));
} else {
logger.trace("query(): unknown exception {}, table {}.{}, read cmd {}",
ex, schema->ks_name(), schema->cf_name(), cmd);
// We know nothing about other errors. Let the CQL server convert them to SERVER_ERROR.
throw;
}
}
auto& op = get<operation_ctx>(op_result);
co_await op.raft_server.server().read_barrier(nullptr);
auto [result, cache_temp] = co_await _db.query(schema, cmd,
query::result_options::only_result(), ranges, trace_state, timeout);
co_return std::move(result);
}
}

View File

@@ -28,26 +28,35 @@ template <typename T = std::monostate>
using value_or_redirect = std::variant<T, need_redirect>;
class coordinator : public peering_sharded_service<coordinator> {
public:
using timeout_clock = typename db::timeout_clock;
private:
groups_manager& _groups_manager;
replica::database& _db;
gms::gossiper& _gossiper;
struct operation_ctx;
future<value_or_redirect<operation_ctx>> create_operation_ctx(const schema& schema, const dht::token& token);
future<value_or_redirect<operation_ctx>> create_operation_ctx(const schema& schema,
const dht::token& token,
abort_source& as);
public:
coordinator(groups_manager& groups_manager, replica::database& db, gms::gossiper& gossiper);
using mutation_gen = noncopyable_function<mutation(api::timestamp_type)>;
future<value_or_redirect<>> mutate(schema_ptr schema,
const dht::token& token,
mutation_gen&& mutation_gen);
mutation_gen&& mutation_gen,
timeout_clock::time_point timeout,
abort_source& as);
using query_result_type = value_or_redirect<lw_shared_ptr<query::result>>;
future<query_result_type> query(schema_ptr schema,
const query::read_command& cmd,
const dht::partition_range_vector& ranges,
tracing::trace_state_ptr trace_state,
db::timeout_clock::time_point timeout);
timeout_clock::time_point timeout,
abort_source& as);
};
}

View File

@@ -17,6 +17,8 @@
#include "replica/database.hh"
#include "db/config.hh"
#include <seastar/core/abort_source.hh>
namespace service::strong_consistency {
using namespace locator;
@@ -68,10 +70,20 @@ raft_server::raft_server(groups_manager::raft_group_state& state, gate::holder h
{
}
auto raft_server::begin_mutate() -> begin_mutate_result {
// condition_variable::wait doesn't have an overload taking an abort_source.
// This is a temporary workaround until we extend the interface.
// See: scylladb/seastar#3292.
static future<> wait_with_abort_source(condition_variable& cv, abort_source& as) {
as.check();
const auto _ = as.subscribe([&cv] noexcept { cv.broadcast(); });
co_await cv.wait();
as.check();
}
auto raft_server::begin_mutate(abort_source& as) -> begin_mutate_result {
const auto leader = _state.server->current_leader();
if (!leader) {
return need_wait_for_leader{_state.server->wait_for_leader(nullptr)};
return need_wait_for_leader{_state.server->wait_for_leader(&as)};
}
if (leader != _state.server->id()) {
return raft::not_a_leader{leader};
@@ -86,7 +98,7 @@ auto raft_server::begin_mutate() -> begin_mutate_result {
// after every state change wake-up. This ensures we will not deadlock,
// even if the raft server state changes again (e.g., we lose leadership)
// before the updater gets a chance to run.
return need_wait_for_leader{_state.leader_info_cond.wait()};
return need_wait_for_leader{wait_with_abort_source(_state.leader_info_cond, as)};
}
const auto new_ts = std::max(api::new_timestamp(), _state.leader_info->last_timestamp + 1);
_state.leader_info->last_timestamp = new_ts;
@@ -137,6 +149,12 @@ future<> groups_manager::start_raft_group(global_tablet_id tablet,
auto& persistence_ref = *storage;
auto config = raft::server::configuration {
// Snapshotting is not implemented yet for strong consistency,
// so effectively disable periodic snapshotting.
// TODO: Revert after snapshots are implemented
.snapshot_threshold = std::numeric_limits<size_t>::max(),
.snapshot_threshold_log_size = 10 * 1024 * 1024, // 10MB
.max_log_size = 20 * 1024 * 1024, // 20MB
.enable_forwarding = false,
.on_background_error = [tablet, group_id](std::exception_ptr e) {
on_internal_error(logger,
@@ -163,11 +181,17 @@ void groups_manager::schedule_raft_group_deletion(raft::group_id id, raft_group_
if (state.gate->is_closed()) {
return;
}
logger.info("schedule_raft_group_deletion(): group id {}", id);
logger.info("schedule_raft_group_deletion(): group id {}: scheduling", id);
state.server_control_op = futurize_invoke([this, &state, id, g = state.gate](this auto) -> future<> {
co_await state.server_control_op.get_future();
logger.debug("schedule_raft_group_deletion(): group id {}: starting", id);
co_await g->close();
logger.debug("schedule_raft_group_deletion(): group id {}: gate closed", id);
co_await _raft_gr.abort_server(id);
logger.debug("schedule_raft_group_deletion(): group id {}: server aborted", id);
co_await std::move(state.leader_info_updater);
_raft_gr.destroy_server(id);
@@ -222,7 +246,15 @@ future<> groups_manager::leader_info_updater(raft_group_state& state, global_tab
logger.debug("leader_info_updater({}-{}): current term {}, running read_barrier()",
tablet, gid,
current_term);
// We intentionally pass nullptr here. If the tablet is leaving this node,
// the Raft server will be aborted and the loop will break.
// The same will happen when the node is shutting down.
// There's no reason to abort this operation in any other case.
co_await state.server->read_barrier(nullptr);
co_await utils::get_local_injector().inject("sc_leader_info_updater_wait_before_setting_leader_info",
utils::wait_for_message(5min));
state.leader_info = leader_info {
.term = current_term,
.last_timestamp = schema->table().get_max_timestamp_for_tablet(tablet.tablet)
@@ -239,6 +271,10 @@ future<> groups_manager::leader_info_updater(raft_group_state& state, global_tab
}
state.leader_info_cond.broadcast();
// We intentionally pass nullptr here. If the tablet is leaving this node,
// the Raft server will be aborted and the loop will break.
// The same will happen when the node is shutting down.
// There's no reason to abort this operation in any other case.
co_await state.server->wait_for_state_change(nullptr);
}
} catch (const raft::request_aborted&) {
@@ -253,6 +289,9 @@ future<> groups_manager::leader_info_updater(raft_group_state& state, global_tab
// thrown from find_schema() and schema->table() when the table is dropped
logger.debug("leader_info_updater({}-{}): got replica::no_such_column_family {}",
tablet, gid, std::current_exception());
} catch (...) {
on_internal_error(logger, ::format("leader_info_updater({}-{}): unexpected exception: {}",
tablet, gid, std::current_exception()));
}
}
@@ -293,7 +332,7 @@ void groups_manager::update(token_metadata_ptr new_tm) {
schedule_raft_groups_deletion(false);
}
future<raft_server> groups_manager::acquire_server(raft::group_id group_id) {
future<raft_server> groups_manager::acquire_server(raft::group_id group_id, abort_source& as) {
if (!_features.strongly_consistent_tables) {
on_internal_error(logger, "strongly consistent tables are not enabled on this shard");
}
@@ -303,7 +342,7 @@ future<raft_server> groups_manager::acquire_server(raft::group_id group_id) {
on_internal_error(logger, format("raft group {} not found", group_id));
}
auto& state = it->second;
return state.server_control_op.get_future().then([&state, h = state.gate->hold()] mutable {
return state.server_control_op.get_future(as).then([&state, h = state.gate->hold()] mutable {
return raft_server(state, std::move(h));
});
}
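The abort-aware waits threaded through this file (`wait_with_abort_source(_state.leader_info_cond, as)`, `server_control_op.get_future(as)`) follow one pattern: race the real wait against the abort signal, and cancel the loser. A conceptual Python sketch — the names here are illustrative and assume nothing about the actual Seastar API:

```python
import asyncio

class AbortRequested(Exception):
    """Raised when the abort source fires before the awaited event."""

async def wait_with_abort(cond: asyncio.Condition, abort_event: asyncio.Event):
    # Race the condition-variable wait against the abort signal; whichever
    # finishes first wins, and the pending task is cancelled.
    async def cond_waiter():
        async with cond:
            await cond.wait()

    waiter = asyncio.create_task(cond_waiter())
    aborter = asyncio.create_task(abort_event.wait())
    done, pending = await asyncio.wait({waiter, aborter},
                                       return_when=asyncio.FIRST_COMPLETED)
    for task in pending:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
    if aborter in done:
        raise AbortRequested("aborted while waiting")
```

The caller either resumes normally when the condition is signalled, or gets an abort exception instead of blocking forever — which is exactly what lets `begin_mutate()` be interrupted when the server shuts down or the tablet leaves the node.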

View File

@@ -110,7 +110,7 @@ public:
void update(locator::token_metadata_ptr new_tm);
// The raft_server instance is used to submit write commands and perform read_barrier() before reads.
future<raft_server> acquire_server(raft::group_id group_id);
future<raft_server> acquire_server(raft::group_id group_id, abort_source& as);
// Called during node boot. Waits for all raft::server instances corresponding
// to the latest group0 state to start.
@@ -152,7 +152,7 @@ public:
future<> future;
};
using begin_mutate_result = std::variant<timestamp_with_term, raft::not_a_leader, need_wait_for_leader>;
begin_mutate_result begin_mutate();
begin_mutate_result begin_mutate(abort_source&);
};
}
}

View File

@@ -6,6 +6,7 @@
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include <seastar/core/abort_source.hh>
#include <seastar/core/shard_id.hh>
#include <seastar/core/on_internal_error.hh>
#include "state_machine.hh"
@@ -34,6 +35,8 @@ class state_machine : public raft_state_machine {
service::migration_manager& _mm;
db::system_keyspace& _sys_ks;
abort_source _as;
public:
state_machine(locator::global_tablet_id tablet,
raft::group_id gid,
@@ -70,6 +73,28 @@ public:
// (see `schema_applier::commit_on_shard()` and `storage_service::commit_token_metadata_change()`).
// In this case, we should just ignore mutations without throwing an error.
logger.log(log_level::warn, rate_limit, "apply(): table {} was already dropped, ignoring mutations", _tablet.table);
} catch (const abort_requested_exception& ex) {
// The exception can be thrown by get_schema_and_upgrade_mutations.
// It means that the Raft group is being removed.
//
// Technically, throwing an exception from a state machine
// may result in killing the corresponding Raft instance:
// cf. the description of raft::state_machine:
//
// "Any of the functions may return an error, but it will kill the
// raft instance that uses it. Depending on what state the failure
// leaves the state machine in, the raft instance will either have to be
// recreated with the same state machine and rejoin the cluster with the
// same server_id, or a new raft instance will have to be created with an
// empty state machine and rejoin the cluster with a different server_id
// through a configuration change."
//
// Fortunately, in strong consistency, we use the default Raft server
// implementation, which handles abort_requested_exception thrown by
// raft::state_machine::apply -- it will simply end the applier fiber.
logger.debug("apply(): execution for tablet {}, group_id={} aborted due to: {}",
_tablet, _group_id, ex);
throw;
}
catch (...) {
throw std::runtime_error(::format(
@@ -79,11 +104,16 @@ public:
}
future<raft::snapshot_id> take_snapshot() override {
throw std::runtime_error("take_snapshot() not implemented");
// Until snapshot transfer is fully implemented, return a fake ID
// and don't actually do anything. As long as we don't do snapshot
// transfers (attempting to do that throws an exception), we should
// be safe.
return make_ready_future<raft::snapshot_id>(raft::snapshot_id(utils::make_random_uuid()));
}
void drop_snapshot(raft::snapshot_id id) override {
throw std::runtime_error("drop_snapshot() not implemented");
// Taking a snapshot is a no-op, so dropping a snapshot is also a no-op.
(void) id;
}
future<> load_snapshot(raft::snapshot_id id) override {
@@ -91,6 +121,8 @@ public:
}
future<> abort() override {
logger.debug("abort(): Aborting state machine for group {}", _group_id);
_as.request_abort();
return make_ready_future<>();
}
@@ -109,6 +141,10 @@ private:
bool barrier_executed = false;
auto get_schema = [&] (table_schema_version schema_version) -> future<std::pair<schema_ptr, column_mappings_cache::value_ptr>> {
if (utils::get_local_injector().enter("sc_state_machine_return_empty_schema")) {
co_return std::pair{nullptr, nullptr};
}
auto schema = local_schema_registry().get_or_null(schema_version);
if (schema) {
co_return std::pair{std::move(schema), nullptr};
@@ -147,8 +183,7 @@ private:
if (utils::get_local_injector().enter("disable_raft_drop_append_entries_for_specified_group")) {
utils::get_local_injector().disable("raft_drop_incoming_append_entries_for_specified_group");
}
// TODO: pass valid abort source
co_await _mm.get_group0_barrier().trigger();
co_await _mm.get_group0_barrier().trigger(false, &_as);
barrier_executed = true;
schema_cm = co_await get_schema(schema_version);
}
@@ -189,4 +224,4 @@ std::unique_ptr<raft_state_machine> make_state_machine(locator::global_tablet_id
return std::make_unique<state_machine>(tablet, gid, db, mm, sys_ks);
}
};
};
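The abort flow introduced above — `abort()` calls `_as.request_abort()`, and `apply()` rethrows `abort_requested_exception`, which the default Raft server treats as a clean end of the applier fiber rather than a fatal state-machine error — can be sketched, purely illustratively (not Scylla's actual classes), as:

```python
class AbortRequested(Exception):
    """Stand-in for abort_requested_exception."""

class SketchStateMachine:
    # Illustrative only: mirrors the shape of the state machine above.
    def __init__(self):
        self._aborted = False
        self.applied = []

    def abort(self):
        # Counterpart of _as.request_abort() in abort().
        self._aborted = True

    def apply(self, mutations):
        if self._aborted:
            # The Raft group is being removed; surface the abort so the
            # server implementation can end the applier fiber.
            raise AbortRequested()
        self.applied.extend(mutations)

def applier_fiber(sm, batches):
    # The default server treats AbortRequested from apply() as a clean
    # shutdown of the applier, not as a reason to kill the Raft instance.
    try:
        for batch in batches:
            sm.apply(batch)
        return "done"
    except AbortRequested:
        return "stopped"
```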

View File

@@ -274,7 +274,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
// cancel_requests - no request can be started so cancel the queue
// start_vnodes_cleanup - cleanup needs to be started
// node_to_work_on - the node the topology coordinator should work on
std::variant<group0_guard, cancel_requests, start_vnodes_cleanup, node_to_work_on> get_next_task(group0_guard guard) {
std::variant<group0_guard, cancel_requests, start_vnodes_cleanup, node_to_work_on> get_next_task(group0_guard guard, bool warn = true) {
auto& topo = _topo_sm._topology;
if (topo.transition_nodes.size() != 0) {
@@ -325,7 +325,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
}
// We did not find a request that has enough live nodes to proceed.
// Cancel all requests to let the admin know that no operation can succeed.
rtlogger.warn("topology coordinator: cancel request queue because no request can proceed. Dead nodes: {}", dead_nodes);
if (warn) {
rtlogger.warn("topology coordinator: cancel request queue because no request can proceed. Dead nodes: {}", dead_nodes);
}
return cancel_requests{std::move(guard), std::move(dead_nodes)};
}
@@ -2583,7 +2585,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
// on the fact that the block which calls this is atomic.
// FIXME: Don't take the ownership of the guard to make the above guarantee explicit.
std::pair<bool, group0_guard> should_preempt_balancing(group0_guard guard) {
auto work = get_next_task(std::move(guard));
// Cancellation won't be acted upon here - it will be performed by handle_topology_transition()
// when called from the main loop. Suppress the warning to avoid duplicate messages.
auto work = get_next_task(std::move(guard), false /* warn */);
if (auto* node = std::get_if<node_to_work_on>(&work)) {
return std::make_pair(true, std::move(node->guard));
}

View File

@@ -18,6 +18,13 @@ class service_permit {
friend service_permit empty_service_permit();
public:
size_t count() const { return _permit ? _permit->count() : 0; };
// Merge additional semaphore units into this permit.
// Used to grow the permit after the actual resource cost is known.
void adopt(seastar::semaphore_units<>&& units) {
if (_permit) {
_permit->adopt(std::move(units));
}
}
};
inline service_permit make_service_permit(seastar::semaphore_units<>&& permit) {
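The new `adopt()` lets a permit grow after the fact: take a nominal grant up front, then merge in the real cost once it is known, and release everything in one go. A toy model of these semantics (not Seastar's implementation — `Semaphore`/`SemaphoreUnits` here are simplified stand-ins):

```python
class Semaphore:
    """Toy counting semaphore tracking only the available count."""
    def __init__(self, count):
        self.available = count

    def take(self, n):
        assert n <= self.available, "a real semaphore would block here"
        self.available -= n
        return SemaphoreUnits(self, n)

    def _release(self, n):
        self.available += n

class SemaphoreUnits:
    """Toy stand-in for seastar::semaphore_units."""
    def __init__(self, sem, count):
        self._sem, self._count = sem, count

    def adopt(self, other):
        # Merge another grant from the same semaphore into this one;
        # `other` no longer owns any units afterwards.
        assert other._sem is self._sem
        self._count += other._count
        other._count = 0

    def release(self):
        self._sem._release(self._count)
        self._count = 0
```

A permit can thus start with a 1-unit grant when the request arrives and grow once the request body is parsed, with a single release when the permit is destroyed.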

View File

@@ -81,6 +81,9 @@ public:
future<> put_object(object_name name, ::memory_data_sink_buffers bufs, abort_source* as) override {
return _client->put_object(name.str(), std::move(bufs), as);
}
future<> copy_object(object_name src, object_name dst, abort_source* as) override {
return _client->copy_object(src.str(), dst.str(), std::nullopt, std::nullopt, as);
}
future<> delete_object(object_name name) override {
return _client->delete_object(name.str());
}
@@ -155,6 +158,9 @@ public:
co_await sink.flush();
co_await sink.close();
}
future<> copy_object(object_name src, object_name dst, abort_source*) override {
return _client->copy_object(src.bucket(), src.object(), dst.bucket(), dst.object());
}
future<> delete_object(object_name name) override {
return _client->delete_object(name.bucket(), name.object());
}

View File

@@ -71,6 +71,7 @@ public:
virtual ~object_storage_client() = default;
virtual future<> put_object(object_name, ::memory_data_sink_buffers bufs, abort_source* = nullptr) = 0;
virtual future<> copy_object(object_name src, object_name dst, abort_source* = nullptr) = 0;
virtual future<> delete_object(object_name) = 0;
virtual file make_readable_file(object_name, abort_source* = nullptr) = 0;
virtual data_sink make_data_upload_sink(object_name, std::optional<unsigned> max_parts_per_piece, abort_source* = nullptr) = 0;

View File

@@ -404,6 +404,10 @@ public:
return _version;
}
format_types get_format() const {
return _format;
}
// Returns the total bytes of all components.
uint64_t bytes_on_disk() const;
file_size_stats get_file_size_stats() const;

View File

@@ -623,6 +623,7 @@ protected:
static constexpr auto status_removing = "removing";
object_name make_object_name(const sstable& sst, component_type type) const;
object_name make_object_name(const sstable& sst, sstring comp, generation_type gen) const;
table_id owner() const {
if (std::holds_alternative<sstring>(_location)) {
@@ -678,7 +679,10 @@ public:
future<> put_object(object_name name, ::memory_data_sink_buffers bufs) {
return _client->put_object(std::move(name), std::move(bufs), abort_source());
}
future<> delete_object(object_name name) {
future<> copy_object(object_name src, object_name dst) const {
return _client->copy_object(std::move(src), std::move(dst), abort_source());
}
future<> delete_object(object_name name) const {
return _client->delete_object(std::move(name));
}
file make_readable_file(object_name name) {
@@ -703,16 +707,22 @@ public:
};
object_name object_storage_base::make_object_name(const sstable& sst, component_type type) const {
if (!sst.generation().is_uuid_based()) {
auto comp = sstable_version_constants::get_component_map(sst.get_version()).at(type);
return make_object_name(sst, std::move(comp), sst.generation());
}
object_name object_storage_base::make_object_name(const sstable& sst, sstring comp, generation_type gen) const {
if (!gen.is_uuid_based()) {
throw std::runtime_error(fmt::format("'{}' STORAGE only works with uuid_sstable_identifier enabled", _type));
}
return std::visit(overloaded_functor {
[&] (const sstring& prefix) {
return object_name(_bucket, prefix, sst.component_basename(type));
return object_name(_bucket, prefix,
sstable::component_basename(sst.get_schema()->ks_name(), sst.get_schema()->cf_name(), sst.get_version(), gen, sst.get_format(), comp));
},
[&] (const table_id& owner) {
return object_name(_bucket, sst.generation(), sstable_version_constants::get_component_map(sst.get_version()).at(type));
[&] (const table_id&) {
return object_name(_bucket, gen, comp);
}
}, _location);
}
@@ -878,8 +888,23 @@ future<> object_storage_base::snapshot(const sstable& sst, sstring name) const {
}
future<> object_storage_base::clone(const sstable& sst, generation_type gen, bool leave_unsealed) const {
on_internal_error(sstlog, "Cloning S3 objects not implemented");
co_return;
sstlog.trace("clone sst: {} generation={} leave_unsealed={}", sst.get_filename(), gen, leave_unsealed);
// Register the cloned sstable as "creating" in the registry
entry_descriptor desc(gen, sst.get_version(), sst.get_format(), component_type::TOC);
co_await sst.manager().sstables_registry().create_entry(owner(), status_creating, sst.state(), desc);
// Copy all component objects from the source to the destination generation.
co_await coroutine::parallel_for_each(sst.all_components(), [this, &sst, &gen] (const std::pair<component_type, sstring>& p) -> future<> {
co_await copy_object(make_object_name(sst, p.second, sst.generation()), make_object_name(sst, p.second, gen));
});
if (!leave_unsealed) {
// Mark the cloned sstable as sealed in the registry
co_await sst.manager().sstables_registry().update_entry_status(owner(), gen, status_sealed);
}
sstlog.debug("clone sst: {} generation={}: done", sst.get_filename(), gen);
}
std::unique_ptr<sstables::storage> make_storage(sstables_manager& manager, const data_dictionary::storage_options& s_opts, sstable_state state) {
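The clone flow implemented above — register the destination generation as `creating` in the registry, copy every component object in parallel, then flip the entry to `sealed` unless `leave_unsealed` is set — can be sketched with hypothetical `registry`/`store` interfaces (the method names are illustrative, not Scylla's API):

```python
import asyncio

async def clone_sstable(registry, store, components, src_gen, dst_gen,
                        leave_unsealed):
    # Step 1: record the clone as in-progress so a crash leaves a
    # recognizable "creating" entry behind.
    await registry.create_entry(dst_gen, status="creating")
    # Step 2: components are independent objects, so copy them concurrently.
    await asyncio.gather(*(
        store.copy_object(f"{src_gen}/{comp}", f"{dst_gen}/{comp}")
        for comp in components))
    # Step 3: publish the clone, unless the caller wants it left unsealed.
    if not leave_unsealed:
        await registry.update_status(dst_gen, "sealed")
```

This ordering is what the new `sstable_clone_leaving_unsealed_dest_sstable` tests below check: a clone made with `leave_unsealed = true` shows up in the registry as `creating`, and one made without it as `sealed`.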

View File

@@ -41,49 +41,51 @@ function(add_scylla_test name)
else()
set(src "${name}.cc")
endif()
add_executable(${name} ${src})
add_dependencies(tests ${name})
# CMake requires globally unique target names. Prefix with the
# directory path (e.g., test/manual/hint_test → test_manual_hint_test)
# to avoid collisions between suites, while keeping the output binary
# name matching configure.py via OUTPUT_NAME.
cmake_path(RELATIVE_PATH CMAKE_CURRENT_SOURCE_DIR
BASE_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE dirname)
string(REPLACE "/" "_" _target_prefix "${dirname}")
set(target "${_target_prefix}_${name}")
add_executable(${target} ${src})
set_target_properties(${target} PROPERTIES OUTPUT_NAME ${name})
add_dependencies(tests ${target})
list(APPEND scylla_tests "${dirname}/${name}")
set(scylla_tests "${scylla_tests}" PARENT_SCOPE)
if(Scylla_ENABLE_LTO)
# The runtime benefits of LTO don't outweigh the compile-time costs for tests.
target_link_options(${target} PRIVATE
$<$<CONFIG:RelWithDebInfo>:-fno-lto>)
endif()
target_include_directories(${name}
target_include_directories(${target}
PRIVATE
${CMAKE_SOURCE_DIR})
target_link_libraries(${name}
target_link_libraries(${target}
PRIVATE
test-lib
Seastar::seastar
xxHash::xxhash)
if(kind STREQUAL "SEASTAR")
target_link_libraries(${name}
target_link_libraries(${target}
PRIVATE
Seastar::seastar_testing)
target_compile_definitions(${name}
PRIVATE
SEASTAR_TESTING_MAIN)
# SEASTAR_TESTING_MAIN is provided by add_compile_definitions() in
# the top-level CMakeLists.txt, matching configure.py's global define.
elseif(kind STREQUAL "BOOST")
target_link_libraries(${name}
target_link_libraries(${target}
PRIVATE
Boost::unit_test_framework)
elseif(kind STREQUAL "UNIT")
target_link_libraries(${name}
target_link_libraries(${target}
PRIVATE
Seastar::seastar_testing)
else()
message(FATAL_ERROR "unknown test KIND: ${kind}")
endif()
if(parsed_args_LIBRARIES)
target_link_libraries(${name}
target_link_libraries(${target}
PRIVATE
${parsed_args_LIBRARIES})
endif()

View File

@@ -61,7 +61,7 @@ def pytest_collection_modifyitems(config, items):
if config.getoption("--runveryslow"):
# --runveryslow given in cli: do not skip veryslow tests
return
skip_veryslow = pytest.mark.skip(reason="need --runveryslow option to run")
skip_veryslow = pytest.mark.skip_env(reason="need --runveryslow option to run")
for item in items:
if "veryslow" in item.keywords:
item.add_marker(skip_veryslow)

View File

@@ -14,11 +14,13 @@ import urllib3
import urllib.parse
import ssl
from test.pylib.skip_types import skip_env
@pytest.fixture(scope="module")
def https_url(dynamodb):
url = dynamodb.meta.client._endpoint.host
if not url.startswith('https://'):
pytest.skip("HTTPS-specific tests are skipped without the '--https' option")
skip_env("HTTPS-specific tests are skipped without the '--https' option")
yield url
# Test which TLS versions are supported. We require that both TLS 1.2 and 1.3

View File

@@ -89,7 +89,6 @@ add_scylla_test(encrypted_file_test
add_scylla_test(encryption_at_rest_test
KIND SEASTAR
LIBRARIES
Boost::filesystem
encryption)
add_scylla_test(enum_option_test
KIND BOOST)
@@ -200,10 +199,10 @@ add_scylla_test(reusable_buffer_test
KIND SEASTAR)
add_scylla_test(reservoir_sampling_test
KIND BOOST)
add_scylla_test(rolling_max_tracker_test
KIND BOOST)
add_scylla_test(rest_client_test
KIND SEASTAR)
add_scylla_test(rolling_max_tracker_test
KIND BOOST)
add_scylla_test(rust_test
KIND BOOST
LIBRARIES inc)
@@ -265,9 +264,6 @@ add_scylla_test(string_format_test
KIND BOOST)
add_scylla_test(summary_test
KIND BOOST)
add_scylla_test(symmetric_key_test
KIND SEASTAR
LIBRARIES encryption)
add_scylla_test(file_stream_test
KIND SEASTAR)
add_scylla_test(tagged_integer_test
@@ -314,12 +310,17 @@ add_scylla_test(address_map_test
KIND SEASTAR)
add_scylla_test(object_storage_upload_test
KIND SEASTAR)
add_scylla_test(symmetric_key_test
KIND SEASTAR
LIBRARIES
encryption)
add_scylla_test(combined_tests
KIND SEASTAR
SOURCES
combined_tests.cc
aggregate_fcts_test.cc
auth_cache_test.cc
auth_test.cc
batchlog_manager_test.cc
cache_algorithm_test.cc

View File

@@ -391,6 +391,7 @@ SEASTAR_TEST_CASE(test_counter_update_mutations) {
SEASTAR_TEST_CASE(test_transfer_updates_to_shards) {
return seastar::async([] {
auto s = get_schema();
counter_id default_counter_id(locator::host_id::create_null_id().uuid());
auto pk = partition_key::from_single_value(*s, int32_type->decompose(0));
auto ck = clustering_key::from_single_value(*s, int32_type->decompose(0));
@@ -415,11 +416,11 @@ SEASTAR_TEST_CASE(test_transfer_updates_to_shards) {
m3.set_static_cell(scol, std::move(c3));
auto m0 = m1;
transform_counter_updates_to_shards(m0, nullptr, 0, locator::host_id::create_null_id());
transform_counter_updates_to_shards(m0, nullptr, 0, default_counter_id);
auto empty = mutation(s, pk);
auto m = m1;
transform_counter_updates_to_shards(m, &empty, 0, locator::host_id::create_null_id());
transform_counter_updates_to_shards(m, &empty, 0, default_counter_id);
BOOST_REQUIRE_EQUAL(m, m0);
auto ac = get_counter_cell(m);
@@ -439,7 +440,7 @@ SEASTAR_TEST_CASE(test_transfer_updates_to_shards) {
}
m = m2;
transform_counter_updates_to_shards(m, &m0, 0, locator::host_id::create_null_id());
transform_counter_updates_to_shards(m, &m0, 0, default_counter_id);
ac = get_counter_cell(m);
BOOST_REQUIRE(ac.is_live());
@@ -458,7 +459,7 @@ SEASTAR_TEST_CASE(test_transfer_updates_to_shards) {
}
m = m3;
transform_counter_updates_to_shards(m, &m0, 0, locator::host_id::create_null_id());
transform_counter_updates_to_shards(m, &m0, 0, default_counter_id);
ac = get_counter_cell(m);
BOOST_REQUIRE(!ac.is_live());
ac = get_static_counter_cell(m);

View File

@@ -4777,7 +4777,7 @@ static void prepared_on_shard(cql_test_env& e, const sstring& query,
auto qo = q_serial_opts(std::move(raw_values), cl);
auto msg = e.execute_prepared_with_qo(id, std::move(qo)).get();
if (!msg->move_to_shard()) {
if (!msg->as_bounce()) {
assert_that(msg).is_rows().with_rows_ignore_order(expected_rows);
}
return make_foreign(msg);
@@ -4785,8 +4785,8 @@ static void prepared_on_shard(cql_test_env& e, const sstring& query,
};
auto msg = execute().get();
if (msg->move_to_shard()) {
unsigned shard = *msg->move_to_shard();
if (auto bounce = msg->as_bounce()) {
unsigned shard = bounce->target_shard();
smp::submit_to(shard, std::move(execute)).get();
}
}

View File

@@ -193,7 +193,9 @@ SEASTAR_TEST_CASE(test_reader_with_different_strategies) {
});
auto read_all = [&](repair_reader::read_strategy strategy) -> future<std::vector<mutation_fragment>> {
auto reader = repair_reader(e.db(), cf, cf.schema(), make_reader_permit(e),
random_range, remote_sharder, remote_shard, 0, strategy, gc_clock::now(), incremental_repair_meta());
random_range, remote_sharder, remote_shard, 0, strategy, gc_clock::now(), incremental_repair_meta(),
e.db_config().repair_multishard_reader_buffer_hint_size(),
e.db_config().repair_multishard_reader_enable_read_ahead());
std::vector<mutation_fragment> result;
while (auto mf = co_await reader.read_mutation_fragment()) {
result.push_back(std::move(*mf));
@@ -284,7 +286,9 @@ static future<> run_repair_reader_corruption_test(random_mutation_generator::com
auto test_range = dht::token_range::make_open_ended_both_sides();
auto reader = repair_reader(e.db(), cf, cf.schema(), make_reader_permit(e),
test_range, local_sharder, 0, 0, repair_reader::read_strategy::local,
gc_clock::now(), incremental_repair_meta());
gc_clock::now(), incremental_repair_meta(),
e.db_config().repair_multishard_reader_buffer_hint_size(),
e.db_config().repair_multishard_reader_enable_read_ahead());
try {
while (auto mf = co_await reader.read_mutation_fragment()) {

View File

@@ -7382,22 +7382,71 @@ SEASTAR_TEST_CASE(sstable_clone_leaving_unsealed_dest_sstable) {
return test_env::do_with_async([](test_env& env) { sstable_clone_leaving_unsealed_dest_sstable_fn(env); });
}
void object_storage_sstable_clone_leaving_unsealed_dest_sstable(test_env& env) {
simple_schema ss;
auto s = ss.schema();
auto pk = ss.make_pkey();
auto mut1 = mutation(s, pk);
mut1.partition().apply_insert(*s, ss.make_ckey(0), ss.new_timestamp());
auto sst = make_sstable_containing(env.make_sstable(s), {std::move(mut1)});
auto table = env.make_table_for_tests(s);
auto close_table = deferred_stop(table);
sstable_generation_generator gen_generator;
bool leave_unsealed = true;
auto d = sst->clone(gen_generator(), leave_unsealed).get();
auto sst2 = env.make_sstable(s, d.generation, d.version, d.format);
{
bool checked = false;
env.manager()
.sstables_registry()
.sstables_registry_list(table.schema()->id(),
[&checked, sst_desc = sst2->get_descriptor(component_type::TOC)](
sstring status, sstable_state state, entry_descriptor desc) {
if (desc.generation == sst_desc.generation) {
checked = true;
BOOST_REQUIRE_EQUAL(status, "creating");
}
return make_ready_future();
})
.get();
BOOST_REQUIRE(checked);
}
leave_unsealed = false;
d = sst->clone(gen_generator(), leave_unsealed).get();
auto sst3 = env.make_sstable(s, d.generation, d.version, d.format);
{
bool checked = false;
env.manager()
.sstables_registry()
.sstables_registry_list(table.schema()->id(),
[&checked, sst_desc = sst3->get_descriptor(component_type::TOC)](
sstring status, sstable_state, entry_descriptor desc) {
if (desc.generation == sst_desc.generation) {
checked = true;
BOOST_REQUIRE_EQUAL(status, "sealed");
}
return make_ready_future();
})
.get();
BOOST_REQUIRE(checked);
}
}
SEASTAR_TEST_CASE(sstable_clone_leaving_unsealed_dest_sstable_s3, *boost::unit_test::precondition(tests::has_scylla_test_env)) {
testlog.info("Clone is not supported for S3 storage yet, skipping test");
return make_ready_future();
#if 0
return test_env::do_with_async([](test_env& env) { sstable_clone_leaving_unsealed_dest_sstable_fn(env); },
return test_env::do_with_async([](test_env& env) { object_storage_sstable_clone_leaving_unsealed_dest_sstable(env); },
test_env_config{.storage = make_test_object_storage_options("S3")});
#endif
}
SEASTAR_FIXTURE_TEST_CASE(sstable_clone_leaving_unsealed_dest_sstable_gcs, gcs_fixture, *tests::check_run_test_decorator("ENABLE_GCP_STORAGE_TEST", true)) {
testlog.info("Clone is not supported for GCS storage yet, skipping test");
return make_ready_future();
#if 0
return test_env::do_with_async([](test_env& env) { sstable_clone_leaving_unsealed_dest_sstable_fn(env); },
return test_env::do_with_async([](test_env& env) { object_storage_sstable_clone_leaving_unsealed_dest_sstable(env); },
test_env_config{.storage = make_test_object_storage_options("GS")});
#endif
}
void failure_when_adding_new_sstable_fn(test_env& env) {

View File

@@ -6037,8 +6037,7 @@ SEASTAR_THREAD_TEST_CASE(test_tablets_describe_ring) {
auto ks = add_keyspace(e, {{topo.dc(), num_racks}}, num_racks * nodes_per_rack);
auto table = add_table(e, ks, std::map<sstring, sstring>({{"min_tablet_count", std::to_string(min_tablet_count)}})).get();
auto s = db.find_schema(table);
auto ring = ss.describe_ring_for_table(s->ks_name(), s->cf_name()).get();
auto ring = ss.describe_ring_for_table(table).get();
BOOST_REQUIRE_GE(ring.size(), min_tablet_count);
}, cfg).get();
}

View File

@@ -81,7 +81,7 @@ SEASTAR_TEST_CASE(test_user_function_use_null) {
e.execute_cql("INSERT INTO my_table (key, val) VALUES ('foo', null);").get();
e.execute_cql("CREATE FUNCTION my_func1(val int) CALLED ON NULL INPUT RETURNS int LANGUAGE Lua AS 'return val + 1';").get();
e.execute_cql("CREATE FUNCTION my_func2(val int) CALLED ON NULL INPUT RETURNS int LANGUAGE Lua AS 'return val';").get();
BOOST_REQUIRE_EXCEPTION(e.execute_cql("SELECT my_func1(val) FROM my_table;").get(), ire, message_equals("lua execution failed: ?:-1: attempt to perform arithmetic on a nil value"));
BOOST_REQUIRE_EXCEPTION(e.execute_cql("SELECT my_func1(val) FROM my_table;").get(), ire, message_contains("attempt to perform arithmetic on a nil value"));
auto res = e.execute_cql("SELECT my_func2(val) FROM my_table;").get();
assert_that(res).is_rows().with_rows({{std::nullopt}});
res = e.execute_cql("SELECT val FROM my_table;").get();
@@ -924,7 +924,7 @@ SEASTAR_TEST_CASE(test_user_function_lua_error) {
e.execute_cql("CREATE TABLE my_table (key text PRIMARY KEY, val int);").get();
e.execute_cql("INSERT INTO my_table (key, val) VALUES ('foo', 42);").get();
e.execute_cql("CREATE FUNCTION my_func(val int) RETURNS NULL ON NULL INPUT RETURNS int LANGUAGE Lua AS 'return 2 * bar';").get();
BOOST_REQUIRE_EXCEPTION(e.execute_cql("SELECT my_func(val) FROM my_table;").get(), ire, message_equals("lua execution failed: ?:-1: attempt to perform arithmetic on a nil value (field 'bar')"));
BOOST_REQUIRE_EXCEPTION(e.execute_cql("SELECT my_func(val) FROM my_table;").get(), ire, message_contains("attempt to perform arithmetic on a nil value (field 'bar')"));
});
}

View File

@@ -0,0 +1,100 @@
#
# Copyright (C) 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
"""
Tests that system.clients shows connection_stage = 'READY' after a successful
authentication, regardless of which auth path was taken in process_startup:
1. No-auth path (AllowAllAuthenticator): require_authentication() = false,
connection is immediately marked ready.
2. SASL/password path (PasswordAuthenticator): client sends AUTH_RESPONSE,
process_auth_response() marks the connection ready.
3. Certificate-bypass path (CertificateAuthenticator with
transport_early_auth_bypass injection): authenticate(session_dn_func)
returns a user immediately, process_startup() marks the connection ready
without a SASL round-trip.
This is the path introduced by commit 20e9619bb1 that was missing the
_ready = true / update_scheduling_group() / on_connection_ready() calls.
"""
import logging
import time
import pytest
from cassandra.auth import PlainTextAuthProvider
from test.pylib.manager_client import ManagerClient
from test.pylib.util import wait_for_cql_and_get_hosts, wait_for
logger = logging.getLogger(__name__)
def make_server_config(auth_type: str) -> dict:
"""Return the scylla config dict for a given auth type."""
if auth_type == "allow_all":
return {
"authenticator": "AllowAllAuthenticator",
"authorizer": "AllowAllAuthorizer",
}
if auth_type == "password":
return {
"authenticator": "PasswordAuthenticator",
"authorizer": "CassandraAuthorizer",
}
if auth_type == "cert_bypass":
# CertificateAuthenticator with transport_early_auth_bypass active
# from the very first connection (including the server-add readiness
# check), so no TLS socket is required. The injection makes
# certificate_authenticator::authenticate() return the cassandra
# superuser immediately, bypassing the TLS certificate check.
return {
"authenticator": "CertificateAuthenticator",
"authorizer": "CassandraAuthorizer",
# Minimal valid config; regex is never reached due to injection.
"auth_certificate_role_queries": [{"source": "SUBJECT", "query": "CN=(.+)"}],
"error_injections_at_startup": [
{"name": "transport_early_auth_bypass", "value": "cassandra"},
],
}
raise ValueError(f"Unknown auth_type: {auth_type!r}")
@pytest.mark.asyncio
@pytest.mark.parametrize("auth_type", [
"allow_all",
"password",
"cert_bypass",
])
@pytest.mark.skip_mode("release", reason="error injections are not supported in release mode")
async def test_connection_stage_ready_after_auth(manager: ManagerClient, auth_type: str) -> None:
"""After a successful authentication via any code path in process_startup,
the connection must be reported as READY in system.clients."""
server = await manager.server_add(config=make_server_config(auth_type))
# CertificateAuthenticator bypassed by injection returns a user without a
# SASL challenge, so the driver must not attempt a password handshake.
auth_provider = PlainTextAuthProvider(username="cassandra", password="cassandra") \
if auth_type == "password" else None
await manager.driver_connect(server=server, auth_provider=auth_provider)
cql = manager.get_cql()
await wait_for_cql_and_get_hosts(cql, [server], time.time() + 60)
async def all_connections_ready():
rows = list(cql.execute(
"SELECT connection_stage FROM system.clients WHERE client_type = 'cql' ALLOW FILTERING"
))
if not rows:
return None
if any(r.connection_stage != "READY" for r in rows):
return None
return rows
rows = await wait_for(all_connections_ready, time.time() + 30)
assert rows, "No CQL connections found in system.clients"
logger.info("auth_type=%s: all %d connection(s) are READY", auth_type, len(rows))

View File

@@ -196,7 +196,7 @@ class TestSchemaManagement(Tester):
logger.debug("read and check data")
run_stress("read")
@pytest.mark.skip("unimplemented")
@pytest.mark.skip_not_implemented(reason="unimplemented")
def commitlog_replays_after_schema_change(self):
"""
Commitlog can be replayed even though schema has been changed

View File

@@ -494,31 +494,6 @@ async def create_cluster(topology, manager, logger, object_storage=None):
return servers,host_ids
async def create_dataset(manager, ks, cf, topology, logger, num_keys=256, min_tablet_count=None, schema=None, consistency_level=ConsistencyLevel.ALL):
cql = manager.get_cql()
logger.info(f'Create keyspace, {topology=}')
keys = range(num_keys)
replication_opts = {'class': 'NetworkTopologyStrategy'}
replication_opts['replication_factor'] = f'{topology.rf}'
replication_opts = format_tuples(replication_opts)
print(replication_opts)
cql.execute((f"CREATE KEYSPACE {ks} WITH REPLICATION = {replication_opts};"))
if schema is None:
if min_tablet_count is not None:
logger.info(f'Creating schema with min_tablet_count={min_tablet_count}')
schema = create_schema(ks, cf, min_tablet_count)
cql.execute(schema)
stmt = cql.prepare(f"INSERT INTO {ks}.{cf} ( pk, value ) VALUES (?, ?)")
if consistency_level is not None:
stmt.consistency_level = consistency_level
await asyncio.gather(*(cql.run_async(stmt, (str(k), k)) for k in keys))
return schema, keys, replication_opts
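`format_tuples` above is assumed to render a Python dict as a CQL map literal that can be spliced into a `CREATE KEYSPACE` statement; a minimal sketch (the real helper in `test.pylib` may differ):

```python
def format_tuples(opts: dict) -> str:
    """Render a dict as a CQL map literal, e.g.
    {'class': 'NetworkTopologyStrategy', 'replication_factor': '3'}."""
    body = ", ".join(f"'{k}': '{v}'" for k, v in opts.items())
    return "{" + body + "}"
```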
async def do_restore_server(manager, logger, ks, cf, s, toc_names, scope, primary_replica_only, prefix, object_storage):
logger.info(f'Restore {s.ip_addr} with {toc_names}, scope={scope}')
tid = await manager.api.restore(s.ip_addr, ks, cf, object_storage.address, object_storage.bucket_name, prefix, toc_names, scope, primary_replica_only=primary_replica_only)
@@ -902,32 +877,35 @@ async def test_restore_primary_replica(manager: ManagerClient, object_storage, d
scope = "all"
expected_replicas = 1
ks = 'ks'
cf = 'cf'
keys = range(256)
replication_str = f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {topology.rf}}}"
servers, host_ids = await create_cluster(topology, manager, logger, object_storage)
await manager.disable_tablet_balancing()
cql = manager.get_cql()
schema, keys, replication_opts = await create_dataset(manager, ks, cf, topology, logger)
async with new_test_keyspace(manager, replication_str) as ks:
cql.execute(create_schema(ks, cf))
stmt = cql.prepare(f"INSERT INTO {ks}.{cf} ( pk, value ) VALUES (?, ?)")
stmt.consistency_level = ConsistencyLevel.ALL
await asyncio.gather(*(cql.run_async(stmt, (str(k), k)) for k in keys))
# validate replicas assertions hold on fresh dataset
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf)
snap_name, sstables = await take_snapshot(ks, servers, manager, logger)
prefix = f'{cf}/{snap_name}'
await asyncio.gather(*(do_backup(s, snap_name, prefix, ks, cf, object_storage, manager, logger) for s in servers))
logger.info(f'Re-initialize keyspace')
cql.execute(f'DROP KEYSPACE {ks}')
cql.execute((f"CREATE KEYSPACE {ks} WITH REPLICATION = {replication_opts};"))
cql.execute(schema)
async with new_test_keyspace(manager, replication_str) as ks:
cql.execute(create_schema(ks, cf))
await asyncio.gather(*(do_restore_server(manager, logger, ks, cf, s, sstables[s], scope, True, prefix, object_storage) for s in servers))
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf, expected_replicas=expected_replicas)
logger.info(f'Validate streaming directions')
for i, s in enumerate(servers):


@@ -12,6 +12,7 @@ import functools
import itertools
import logging
import os.path
from pathlib import Path
import re
import socket
import socketserver
@@ -37,6 +38,7 @@ from test.cluster.dtest.tools.data import rows_to_list, run_in_parallel
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import read_barrier
from test.pylib.util import wait_for as wait_for_async
from test.pylib.scylla_cluster import ScyllaVersionDescription
logger = logging.getLogger(__name__)
@@ -61,6 +63,9 @@ class AuditRowMustNotExistError(Exception):
class AuditTester:
audit_default_settings = {"audit": "table", "audit_categories": "ADMIN,AUTH,QUERY,DML,DDL,DCL", "audit_keyspaces": "ks"}
def __init__(self, manager: ManagerClient):
self.manager = manager
def _build_server_config(self, needed: dict[str, str],
enable_compact_storage: bool,
user: str | None) -> dict[str, Any]:
@@ -526,8 +531,7 @@ class CQLAuditTester(AuditTester):
AUDIT_LOG_QUERY = "SELECT * FROM audit.audit_log"
def __init__(self, manager: ManagerClient, helper: AuditBackend | None = None):
super().__init__()
self.manager = manager
super().__init__(manager)
self.server_addresses: list[str] = []
self.helper: AuditBackend | None = helper
@@ -664,28 +668,23 @@ class CQLAuditTester(AuditTester):
count_after = counts_after[mode]
assert count_before == count_after, f"audit entries count changed (before: {count_before} after: {count_after})"
def execute_and_validate_audit_entry( # noqa: PLR0913
def execute_and_validate_new_audit_entry( # noqa: PLR0913
self,
session: Session,
query: Any,
category: str,
audit_settings: dict[str, str] = AuditTester.audit_default_settings,
table: str = "",
ks: str = "ks",
cl: str = "ONE",
user: str = "anonymous",
expected_error: Any = None,
bound_values: list[Any] | None = None,
expect_new_audit_entry: bool = True,
expected_operation: str | None = None,
error: bool = False,
session_for_audit_entry_validation: Session | None = None,
):
"""
Execute a query and validate that an audit entry was added to the audit
log table. Use the audit_settings parameter in combination with category
to determine if the audit entry should be added or not. If the audit
entry is expected, validate that the audit entry's content is as
expected.
log table.
"""
# In some cases, provided session does not have access to the audit
@@ -694,23 +693,50 @@ class CQLAuditTester(AuditTester):
if session_for_audit_entry_validation is None:
session_for_audit_entry_validation = session
if category in audit_settings["audit_categories"].split(",") and expect_new_audit_entry:
operation = query if expected_operation is None else expected_operation
error = expected_error is not None
expected_entries = [AuditEntry(category, cl, error, ks, operation, table, user)]
else:
expected_entries = []
operation = query if expected_operation is None else expected_operation
expected_entries = [AuditEntry(category, cl, error, ks, operation, table, user)]
with self.assert_entries_were_added(session_for_audit_entry_validation, expected_entries):
if expected_error is None:
res = session.execute(query, bound_values)
else:
assert_invalid(session, query, expected=expected_error)
res = None
res = session.execute(query, bound_values)
return res
def execute_and_validate_if_category_enabled( # noqa: PLR0913
self,
session: Session,
query: Any,
category: str,
audit_settings: dict[str, str],
table: str = "",
ks: str = "ks",
cl: str = "ONE",
user: str = "anonymous",
bound_values: list[Any] | None = None,
expected_operation: str | None = None,
session_for_audit_entry_validation: Session | None = None,
):
"""
Execute a query and validate or skip audit entry validation based on
whether the given category is enabled in audit_settings.
"""
if session_for_audit_entry_validation is None:
session_for_audit_entry_validation = session
audit_categories = [c.strip() for c in audit_settings.get("audit_categories", "").split(",")]
if category in audit_categories:
return self.execute_and_validate_new_audit_entry(
session, query, category,
table=table, ks=ks, cl=cl, user=user,
bound_values=bound_values,
expected_operation=expected_operation,
session_for_audit_entry_validation=session_for_audit_entry_validation,
)
else:
with self.assert_no_audit_entries_were_added(session_for_audit_entry_validation):
res = session.execute(query, bound_values)
return res
# Filter out queries that can appear in random moments of the tests,
# such as LOGINs and USE statements.
def filter_out_noise(self, rows_dict, filter_out_all_auth=False, filter_out_cassandra_auth=False, filter_out_use=False) -> dict[str, list[AuditEntry]]:
@@ -801,22 +827,23 @@ class CQLAuditTester(AuditTester):
"""
session = await self.prepare(create_keyspace=False, audit_settings=audit_settings, helper=helper)
def execute_and_validate_audit_entry(query, category, **kwargs):
return self.execute_and_validate_audit_entry(session, query, category, audit_settings, **kwargs)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
"CREATE KEYSPACE ks WITH replication = { 'class':'SimpleStrategy', 'replication_factor':1} AND DURABLE_WRITES = true",
category="DDL",
)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
'USE "ks"',
category="DML",
)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
"ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 1 } AND DURABLE_WRITES = false",
category="DDL",
)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
"DROP KEYSPACE ks",
category="DDL",
)
@@ -846,12 +873,10 @@ class CQLAuditTester(AuditTester):
with helper_class() as helper:
session = await self.prepare(create_keyspace=False, helper=helper)
self.execute_and_validate_audit_entry(
session,
'USE "ks"', # ks doesn't exist because create_keyspace=False in prepare
category="DML",
expected_error=InvalidRequest,
)
expected_entry = AuditEntry(category="DML", cl="ONE", error=True, ks="ks",
statement='USE "ks"', table="", user="anonymous")
with self.assert_entries_were_added(session, [expected_entry]):
assert_invalid(session, 'USE "ks"', expected=InvalidRequest) # ks doesn't exist because create_keyspace=False in prepare
async def verify_table(self, audit_settings=AuditTester.audit_default_settings, helper=None, table_prefix="test", overwrite_audit_tables=False):
"""
@@ -868,20 +893,20 @@ class CQLAuditTester(AuditTester):
session = await self.prepare(audit_settings=audit_settings, helper=helper, enable_compact_storage=True)
def execute_and_validate_audit_entry(query, category, **kwargs):
return self.execute_and_validate_audit_entry(session, query, category, audit_settings, **kwargs)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
f"CREATE TABLE {first_table} (k int PRIMARY KEY, v1 int)",
category="DDL",
table=first_table,
)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
f"CREATE TABLE {second_table} (k int, c1 int, v1 int, PRIMARY KEY (k, c1)) WITH COMPACT STORAGE",
category="DDL",
table=second_table,
)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
f"ALTER TABLE {first_table} ADD v2 int",
category="DDL",
table=first_table,
@@ -894,45 +919,49 @@ class CQLAuditTester(AuditTester):
else:
columns = "(k, c1, v1)"
execute_and_validate_audit_entry(
self.execute_and_validate_if_category_enabled(
session,
f"INSERT INTO {table} {columns} VALUES ({i}, {i}, {i})",
category="DML",
audit_settings=audit_settings,
table=f"{table}",
)
res = execute_and_validate_audit_entry(
res = self.execute_and_validate_if_category_enabled(
session,
f"SELECT * FROM {table}",
category="QUERY",
audit_settings=audit_settings,
table=f"{table}",
)
assert sorted(rows_to_list(res)) == [[i, i, i] for i in range(10)], res
execute_and_validate_audit_entry(
self.execute_and_validate_if_category_enabled(
session,
f"TRUNCATE {table}",
category="DML",
audit_settings=audit_settings,
table=f"{table}",
)
res = execute_and_validate_audit_entry(
res = self.execute_and_validate_if_category_enabled(
session,
f"SELECT * FROM {table}",
category="QUERY",
audit_settings=audit_settings,
table=f"{table}",
)
assert rows_to_list(res) == [], res
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
f"DROP TABLE {table}",
category="DDL",
table=f"{table}",
)
execute_and_validate_audit_entry(
f"SELECT * FROM {table}",
category="QUERY",
table=f"{table}",
expected_error=InvalidRequest,
expect_new_audit_entry=False,
)
with self.assert_no_audit_entries_were_added(session):
assert_invalid(session, f"SELECT * FROM {table}", expected=InvalidRequest)
# Test that the audit entries are not added if the keyspace is not
# specified in the audit_keyspaces setting.
@@ -1048,7 +1077,7 @@ class CQLAuditTester(AuditTester):
audit_settings = {"audit": "table", "audit_categories": "ADMIN,AUTH,QUERY,DML,DDL,DCL", "audit_keyspaces": "audit"}
session = await self.prepare(create_keyspace=False, audit_settings=audit_settings)
self.execute_and_validate_audit_entry(session, query=self.AUDIT_LOG_QUERY, category="QUERY", ks="audit", table="audit_log", audit_settings=audit_settings)
self.execute_and_validate_new_audit_entry(session, query=self.AUDIT_LOG_QUERY, category="QUERY", ks="audit", table="audit_log")
async def test_audit_categories_invalid(self):
"""
@@ -1110,24 +1139,30 @@ class CQLAuditTester(AuditTester):
with helper_class() as helper:
session = await self.prepare(user="cassandra", password="cassandra", helper=helper)
def execute_and_validate_audit_entry(query, category, **kwargs):
return self.execute_and_validate_audit_entry(session, query, category, self.audit_default_settings, **kwargs, user="cassandra", ks="")
tests = [self.PasswordMaskingCase("user1", "secret", "Secret^%$#@!"), self.PasswordMaskingCase("user2", "", "")]
for username, password, new_password in tests:
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
f"CREATE USER {username} WITH PASSWORD '{password}'",
category="DCL",
expected_operation=f"CREATE USER {username} WITH PASSWORD '***'",
user="cassandra",
ks="",
)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
f"ALTER USER {username} WITH PASSWORD '{new_password}'",
category="DCL",
expected_operation=f"ALTER USER {username} WITH PASSWORD '***'",
user="cassandra",
ks="",
)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
f"DROP USER {username}",
category="DCL",
user="cassandra",
ks="",
)
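The `expected_operation` values above assume the audit layer replaces the password literal with `'***'` before logging. That masking can be modeled with a small regex (a hypothetical helper for illustration, not the server's actual implementation):

```python
import re

def mask_password(statement: str) -> str:
    """Replace the quoted password literal in CREATE/ALTER USER/ROLE
    statements with '***', matching what the audit log should record."""
    return re.sub(r"(PASSWORD\s*=?\s*)'[^']*'", r"\1'***'", statement)
```

Note the optional `=` so both the `WITH PASSWORD 'x'` (USER) and `WITH PASSWORD = 'x'` (ROLE) spellings are masked, including empty passwords.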
async def test_negative_audit_records_auth(self):
@@ -1246,24 +1281,30 @@ class CQLAuditTester(AuditTester):
with helper_class() as helper:
session = await self.prepare(user="cassandra", password="cassandra", helper=helper)
def execute_and_validate_audit_entry(query, category, **kwargs):
return self.execute_and_validate_audit_entry(session, query, category, self.audit_default_settings, **kwargs, user="cassandra", ks="")
tests = [self.PasswordMaskingCase("role1", "Secret!@#$", "Secret^%$#@!"), self.PasswordMaskingCase("role2", "", "")]
for role_name, password, new_password in tests:
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
f"CREATE ROLE {role_name} WITH PASSWORD = '{password}'",
category="DCL",
expected_operation=f"CREATE ROLE {role_name} WITH PASSWORD = '***'",
user="cassandra",
ks="",
)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
f"ALTER ROLE {role_name} WITH PASSWORD = '{new_password}'",
category="DCL",
expected_operation=f"ALTER ROLE {role_name} WITH PASSWORD = '***'",
user="cassandra",
ks="",
)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
f"DROP ROLE {role_name}",
category="DCL",
user="cassandra",
ks="",
)
async def test_login(self):
@@ -1300,15 +1341,13 @@ class CQLAuditTester(AuditTester):
"""
session = await self.prepare(audit_settings={"audit": "table", "audit_categories": "DML", "audit_keyspaces": "ks"})
def execute_and_validate_audit_entry(query, category, **kwargs):
return self.execute_and_validate_audit_entry(session, query, category, self.audit_default_settings, **kwargs)
with self.assert_no_audit_entries_were_added(session):
session.execute("CREATE TABLE test1 (k int PRIMARY KEY, v1 int)")
session.execute("ALTER TABLE test1 ADD v2 int")
for i in range(10):
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
f"INSERT INTO test1 (k, v1, v2) VALUES ({i}, {i}, {i})",
category="DML",
table="test1",
@@ -1318,7 +1357,8 @@ class CQLAuditTester(AuditTester):
res = sorted(session.execute("SELECT * FROM test1"))
assert rows_to_list(res) == [[i, i, i] for i in range(10)], res
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
"TRUNCATE test1",
category="DML",
table="test1",
@@ -1486,7 +1526,7 @@ class CQLAuditTester(AuditTester):
query = "INSERT INTO cf (k, c) VALUES (?, ?);"
pq = session.prepare(query)
self.execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
session,
pq,
bound_values=["foo", 4],
@@ -1512,20 +1552,20 @@ class CQLAuditTester(AuditTester):
test_session = await self.manager.get_cql_exclusive(servers[0], auth_provider=PlainTextAuthProvider(username="test", password="test"))
test_session.get_execution_profile(EXEC_PROFILE_DEFAULT).consistency_level = ConsistencyLevel.ONE
def execute_and_validate_audit_entry(query, category, **kwargs):
return self.execute_and_validate_audit_entry(test_session, query, category, session_for_audit_entry_validation=session, user="test", **kwargs)
execute_and_validate_audit_entry(
self.execute_and_validate_new_audit_entry(
test_session,
"SELECT * FROM ks.test1",
category="QUERY",
table="test1",
user="test",
session_for_audit_entry_validation=session,
)
execute_and_validate_audit_entry(
"INSERT INTO ks.test1 (k, v1) VALUES (2, 2)",
category="DML",
table="test1",
expected_error=Unauthorized,
)
expected_entry = AuditEntry(category="DML", cl="ONE", error=True, ks="ks",
statement="INSERT INTO ks.test1 (k, v1) VALUES (2, 2)", table="test1", user="test")
with self.assert_entries_were_added(session, [expected_entry]):
assert_invalid(test_session, "INSERT INTO ks.test1 (k, v1) VALUES (2, 2)", expected=Unauthorized)
session.execute("DROP USER IF EXISTS test")
@@ -1594,7 +1634,7 @@ class CQLAuditTester(AuditTester):
# Execute previously defined service level statements.
# Validate that the audit log contains the expected entries.
for query in query_sequence:
self.execute_and_validate_audit_entry(session, query, category="ADMIN", audit_settings=audit_settings, ks="", user="cassandra")
self.execute_and_validate_new_audit_entry(session, query, category="ADMIN", ks="", user="cassandra")
# Create a session with the ADMIN category disabled to validate that
# the service level statements are not audited in that case.
@@ -1955,3 +1995,62 @@ async def test_config_liveupdate(manager: ManagerClient, helper_class, config_ch
async def test_parallel_syslog_audit(manager: ManagerClient, helper_class):
"""Cluster must not fail when multiple queries are audited in parallel."""
await CQLAuditTester(manager).test_parallel_syslog_audit(helper_class)
@pytest.mark.asyncio
async def test_upgrade_preserves_ddl_audit_for_tables(
manager: ManagerClient,
scylla_2025_1: ScyllaVersionDescription,
scylla_binary: Path):
"""Verify that upgrading from 2025.1 to master preserves DDL auditing
for table-scoped audit configurations (SCYLLADB-1155).
"""
keyspace = "test_audit_upgrade_ks"
table = "audited_tbl"
fq_table = f"{keyspace}.{table}"
audit_settings = {
"audit": "table",
"audit_tables": fq_table,
"audit_keyspaces": keyspace,
}
logger.info("Starting server with version 2025.1 and DDL audit config")
server = await manager.server_add(
version=scylla_2025_1,
config=audit_settings,
)
cql, _ = await manager.get_ready_cql([server])
await cql.run_async(
f"CREATE KEYSPACE IF NOT EXISTS {keyspace}"
f" WITH replication = {{'class': 'SimpleStrategy', 'replication_factor': 1}}")
await cql.run_async(f"CREATE TABLE {fq_table} (pk int PRIMARY KEY, v int)")
t = CQLAuditTester(manager, helper=AuditBackendTable())
t.server_addresses = [server.ip_addr]
cql.get_execution_profile(EXEC_PROFILE_DEFAULT).consistency_level = ConsistencyLevel.ONE
logger.info("Verifying DDL is audited before upgrade (2025.1)")
t.helper.clear_audit_logs(cql)
t.execute_and_validate_new_audit_entry(
cql,
f"ALTER TABLE {fq_table} ADD v2 int",
category="DDL",
table=table,
ks=keyspace,
)
logger.info("Upgrading server to current binary")
await manager.server_change_version(server.server_id, scylla_binary)
cql, _ = await manager.get_ready_cql([server])
cql.get_execution_profile(EXEC_PROFILE_DEFAULT).consistency_level = ConsistencyLevel.ONE
logger.info("Verifying DDL is audited after upgrade (master)")
t.helper.clear_audit_logs(cql)
t.execute_and_validate_new_audit_entry(
cql,
f"ALTER TABLE {fq_table} ADD v3 int",
category="DDL",
table=table,
ks=keyspace,
)


@@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
"use_tablets",
[
pytest.param(False, id="vnodes"),
pytest.param(True, id="tablets", marks=[pytest.mark.skip(reason="issue #20282"), pytest.mark.nightly]),
pytest.param(True, id="tablets", marks=[pytest.mark.skip_bug(reason="issue #20282"), pytest.mark.nightly]),
],
)
@pytest.mark.asyncio


@@ -7,8 +7,10 @@
from test.pylib.manager_client import ManagerClient
from test.cluster.util import new_test_keyspace
from test.pylib.tablets import get_tablet_replica
from test.pylib.rest_client import read_barrier
import asyncio
import json
import logging
import pytest
@@ -91,3 +93,122 @@ async def test_counter_updates_during_tablet_migration(manager: ManagerClient, m
actual_count = result[0].c
assert actual_count == total_updates, f"Counter value mismatch: expected {total_updates}, got {actual_count}"
@pytest.mark.asyncio
async def test_counter_ids_reuse_in_single_rack(manager: ManagerClient):
"""
Migrate a single counter tablet between 3 nodes in a single rack, performing counter updates on each node,
and verify the updates use at most 2 different counter IDs.
The counter ID should be reused when migrated to another node, except in the transition stage where 2 counter IDs
may be used.
"""
cmdline = ['--smp', '1', '--logger-log-level', 'raft_topology=debug', '--logger-log-level', 'storage_service=debug']
servers = await manager.servers_add(3, cmdline=cmdline, property_file=[
{'dc': 'dc1', 'rack': 'rack1'},
{'dc': 'dc1', 'rack': 'rack1'},
{'dc': 'dc1', 'rack': 'rack1'}
])
cql, hosts = await manager.get_ready_cql(servers)
await manager.disable_tablet_balancing()
async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets={'initial': 1}") as ks:
await cql.run_async(f"CREATE TABLE {ks}.counters (pk int PRIMARY KEY, c counter)")
pk = 1 # Single partition key for all updates
tablet_token = 0 # single tablet
total_updates = 0
# Get all host IDs
all_host_ids = [await manager.get_host_id(server.server_id) for server in servers]
# Migrate the tablet between all 3 nodes
for _ in range(3):
await cql.run_async(f"UPDATE {ks}.counters SET c = c + 1 WHERE pk = {pk}")
total_updates += 1
# Get current tablet location
replica = await get_tablet_replica(manager, servers[0], ks, 'counters', tablet_token)
src_host = replica[0]
src_shard = replica[1]
# Migrate to the next node
src_node_idx = all_host_ids.index(src_host)
dst_node_idx = (src_node_idx + 1) % 3
dst_host = all_host_ids[dst_node_idx]
dst_shard = 0
logger.info(f"Migrating tablet from node {src_node_idx} to node {dst_node_idx}")
await manager.api.move_tablet(servers[0].ip_addr, ks, "counters", src_host, src_shard, dst_host, dst_shard, tablet_token)
# Perform final counter updates after the last migration
await cql.run_async(f"UPDATE {ks}.counters SET c = c + 1 WHERE pk = {pk}")
total_updates += 1
# Verify no counter updates were lost
result = await cql.run_async(f"SELECT c FROM {ks}.counters WHERE pk = {pk}")
actual_count = result[0].c
assert actual_count == total_updates, f"Counter value mismatch: expected {total_updates}, got {actual_count}"
# Ensure all tablet transitions are fully completed and committed on all nodes
await manager.api.quiesce_topology(servers[0].ip_addr)
await asyncio.gather(*[read_barrier(manager.api, s.ip_addr) for s in servers])
await asyncio.gather(*[manager.api.flush_keyspace(s.ip_addr, ks) for s in servers])
# Get all counter IDs that were used
counter_ids = set()
for h in hosts:
res = await cql.run_async(f"SELECT * FROM MUTATION_FRAGMENTS({ks}.counters)", host=h)
for row in res:
if row.value:
value_dict = json.loads(row.value)
counter_ids.update({counter_shard["id"] for counter_shard in value_dict["c"]})
logger.info(f"Unique counter IDs found: {counter_ids}")
assert len(counter_ids) >= 1, "Expected at least 1 counter ID, but found none"
assert len(counter_ids) <= 2, f"Expected at most 2 counter IDs, but found {len(counter_ids)}: {counter_ids}"
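The at-most-two-IDs bound checked above follows from the rack-based counter ID scheme: one ID derived deterministically from the rack name, plus a negated variant used by the pending replica during migration. A rough Python model of that scheme (Scylla's actual UUID derivation differs; `uuid5` and bit-negation are stand-ins here):

```python
import uuid

def rack_counter_id(dc: str, rack: str) -> uuid.UUID:
    # Deterministic UUID from the "dc:rack" name; every normal tablet
    # replica in the rack reuses this one counter ID.
    return uuid.uuid5(uuid.NAMESPACE_DNS, f"{dc}:{rack}")

def pending_counter_id(rack_id: uuid.UUID) -> uuid.UUID:
    # The pending replica during migration uses a deterministic
    # variation, modeled here by negating the 128-bit value.
    return uuid.UUID(int=(~rack_id.int) & ((1 << 128) - 1))
```

Because both IDs are functions of the rack alone, a counter cell accumulates at most two shards per rack regardless of how many nodes host the tablet over time.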
@pytest.mark.asyncio
async def test_counter_ids_multi_rack(manager: ManagerClient):
"""
Test counter IDs with 3 nodes in 3 different racks with RF=3.
Each rack should use a different counter ID.
"""
cmdline = ['--smp', '1', '--logger-log-level', 'raft_topology=debug', '--logger-log-level', 'storage_service=debug']
servers = await manager.servers_add(3, cmdline=cmdline, property_file=[
{'dc': 'dc1', 'rack': 'rack1'},
{'dc': 'dc1', 'rack': 'rack2'},
{'dc': 'dc1', 'rack': 'rack3'}
])
cql, hosts = await manager.get_ready_cql(servers)
await manager.disable_tablet_balancing()
async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND tablets={'initial': 1}") as ks:
await cql.run_async(f"CREATE TABLE {ks}.counters (pk int PRIMARY KEY, c counter)")
pk = 1 # Single partition key for all updates
total_updates = 0
# Perform counter updates on each node
for host in hosts:
await cql.run_async(f"UPDATE {ks}.counters SET c = c + 1 WHERE pk = {pk}", host=host)
total_updates += 1
# Verify counter value
result = await cql.run_async(f"SELECT c FROM {ks}.counters WHERE pk = {pk}")
actual_count = result[0].c
assert actual_count == total_updates, f"Counter value mismatch: expected {total_updates}, got {actual_count}"
await asyncio.gather(*[manager.api.flush_keyspace(s.ip_addr, ks) for s in servers])
# Collect counter IDs from all nodes
counter_ids = set()
for h in hosts:
res = await cql.run_async(f"SELECT * FROM MUTATION_FRAGMENTS({ks}.counters)", host=h)
for row in res:
if row.value:
value_dict = json.loads(row.value)
counter_ids.update({counter_shard["id"] for counter_shard in value_dict["c"]})
logger.info(f"Unique counter IDs found: {counter_ids}")
assert len(counter_ids) == 3, f"Expected exactly 3 counter IDs (one per rack), but found {len(counter_ids)}: {counter_ids}"
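Both tests above parse counter cell values returned by `MUTATION_FRAGMENTS` as JSON; the shape assumed is a map from column name to a list of counter shards, each carrying an `id` field. Extracting the shard IDs from one row value:

```python
import json

def extract_counter_ids(value_json: str, column: str = "c") -> set:
    """Collect the counter-shard IDs from one MUTATION_FRAGMENTS row
    value (assumed JSON shape, as consumed by the tests above)."""
    value = json.loads(value_json)
    return {shard["id"] for shard in value[column]}

# Example value with two shards (IDs are illustrative):
sample = '{"c": [{"id": "aaaa", "value": 2, "clock": 1}, {"id": "bbbb", "value": 1, "clock": 1}]}'
```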


@@ -151,8 +151,8 @@ async def trigger_tablet_merge(manager, servers, logs):
await s1_log.wait_for('Detected tablet merge for table', from_mark=s1_mark)
await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
async def prepare_cluster_for_incremental_repair(manager, nr_keys = 100 , cmdline = []):
servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager, nr_keys=nr_keys, cmdline=cmdline)
async def prepare_cluster_for_incremental_repair(manager, nr_keys = 100 , cmdline = [], tablets = 8):
servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager, nr_keys=nr_keys, cmdline=cmdline, tablets=tablets)
repaired_keys = set(range(0, nr_keys))
unrepaired_keys = set()
current_key = nr_keys
@@ -846,3 +846,184 @@ async def test_tablet_incremental_repair_table_drop_compaction_group_gone(manage
await manager.api.message_injection(s.ip_addr, "wait_after_prepare_sstables_for_incremental_repair")
await drop_future
# Reproducer for the race window bug in incremental repair where minor compaction
# promotes unrepaired data into the repaired sstable set.
#
# Root cause: after mark_sstable_as_repaired() writes new sstables with repaired_at=N+1
# on all replicas, there is a window before the coordinator commits sstables_repaired_at=N+1
# to Raft. During this window is_repaired() still uses the old threshold N, so
# repaired_at=N+1 does not satisfy repaired_at <= N and the sstables are misclassified as
# UNREPAIRED. Minor compaction can then merge them with a genuinely unrepaired sstable
# (repaired_at=0). Because compaction propagates max(repaired_at), the output carries
# repaired_at=N+1. Once sstables_repaired_at advances to N+1 the merged sstable is
# classified REPAIRED even though it contains post-repair data that was never part of the
# repair scan. Replicas that did not compact during this window keep that post-repair data
# in UNREPAIRED sstables. Future incremental repairs skip the REPAIRED sstable on the
# affected replica but process the UNREPAIRED sstable on the others, so the classification
# divergence is never corrected. In tombstone scenarios this enables premature tombstone GC
# on the affected replica leading to data resurrection.
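The misclassification described above reduces to two rules: `is_repaired()` compares an sstable's `repaired_at` against the threshold committed to Raft, and compaction propagates `max(repaired_at)` to its output. A toy model of the race window (illustrative only, not Scylla's code):

```python
from dataclasses import dataclass

@dataclass
class SSTable:
    repaired_at: int

def is_repaired(sstables_repaired_at: int, sst: SSTable) -> bool:
    # An sstable counts as repaired only if its repaired_at is at or
    # below the threshold committed to Raft.
    return sst.repaired_at <= sstables_repaired_at

def compact(*ssts: SSTable) -> SSTable:
    # Compaction output carries the maximum repaired_at of its inputs.
    return SSTable(repaired_at=max(s.repaired_at for s in ssts))

# Race window: replicas already rewrote S1' with repaired_at=N+1, but
# the coordinator has not yet committed sstables_repaired_at=N+1.
N = 1
s1_prime = SSTable(repaired_at=N + 1)  # freshly marked by repair 2
e = SSTable(repaired_at=0)             # genuine post-repair data
assert not is_repaired(N, s1_prime)    # misclassified as UNREPAIRED
f = compact(s1_prime, e)               # minor compaction merges them
assert f.repaired_at == N + 1
assert is_repaired(N + 1, f)           # post-repair data now "REPAIRED"
```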
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_incremental_repair_race_window_promotes_unrepaired_data(manager: ManagerClient):
cmdline = ['--hinted-handoff-enabled', '0']
servers, cql, hosts, ks, table_id, logs, _, _, current_key, token = \
await prepare_cluster_for_incremental_repair(manager, nr_keys=10, cmdline=cmdline, tablets=2)
# Lower min_threshold to 2 so STCS fires as soon as two sstables appear in the
# UNREPAIRED compaction view, making the race easy to trigger deterministically.
await cql.run_async(
f"ALTER TABLE {ks}.test WITH compaction = "
f"{{'class': 'SizeTieredCompactionStrategy', 'min_threshold': 2, 'max_threshold': 4}}"
)
# Disable autocompaction everywhere so we control exactly when compaction runs.
for s in servers:
await manager.api.disable_autocompaction(s.ip_addr, ks, 'test')
scylla_path = await manager.server_get_exe(servers[0].server_id)
# Repair 1: establishes sstables_repaired_at=1 on all nodes.
# Keys 0-9 (inserted by prepare_cluster_for_incremental_repair) end up in
# S0'(repaired_at=1) on all nodes.
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental')
# Insert keys 10-19 and flush on all nodes → S1(repaired_at=0).
# These will be the subject of repair 2.
repair2_keys = list(range(current_key, current_key + 10))
await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k})") for k in repair2_keys])
for s in servers:
await manager.api.flush_keyspace(s.ip_addr, ks)
current_key += 10
coord = await get_topology_coordinator(manager)
coord_serv = await find_server_by_host_id(manager, servers, coord)
coord_log = await manager.server_open_log(coord_serv.server_id)
coord_mark = await coord_log.mark()
# Hold the race window open: prevent the coordinator from committing
# end_repair + sstables_repaired_at=2 to Raft.
await manager.api.enable_injection(coord_serv.ip_addr, "delay_end_repair_update", one_shot=False)
repair_response = await manager.api.tablet_repair(
servers[0].ip_addr, ks, "test", token,
await_completion=False, incremental_mode="incremental"
)
task_id = repair_response['tablet_task_id']
# "Finished tablet repair" is logged once per tablet after mark_sstable_as_repaired()
# has completed on all replicas for that tablet. With tablets=2 the coordinator logs
# this message twice (once per tablet). We must wait for BOTH before writing
# post-repair keys; waiting for only the first leaves the second tablet's repair in
# progress, which can flush the memtable and mark newly-flushed sstables as repaired,
# contaminating servers[0] and servers[2] with post-repair data in repaired sstables.
# After both tablets complete, S1 is fully rewritten as S1'(repaired_at=2,
# being_repaired=null) on every replica, but sstables_repaired_at in system.tablets is
# still 1, so is_repaired(1, S1'{repaired_at=2}) == false and S1' lands in the
# UNREPAIRED compaction view on every replica. The race window is now open.
pos, _ = await coord_log.wait_for("Finished tablet repair", from_mark=coord_mark)
await coord_log.wait_for("Finished tablet repair", from_mark=pos)
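Waiting for both "Finished tablet repair" lines can be generalized to a helper that chains `wait_for` calls, each resuming from the position returned by the previous match (a sketch; the real server log API may differ):

```python
async def wait_for_n(log, message, n, from_mark):
    """Wait until `message` has appeared n times after from_mark,
    chaining each wait from the position of the previous match."""
    pos = from_mark
    for _ in range(n):
        pos, _ = await log.wait_for(message, from_mark=pos)
    return pos
```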
# --- Race window is open ---
# Write post-repair keys 20-29. All nodes receive the writes into their memtables
# (RF=3, hinted handoff disabled).
post_repair_keys = list(range(current_key, current_key + 10))
await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k})") for k in post_repair_keys])
current_key += 10
# Flush servers[1] BEFORE the restart so E(repaired_at=0, keys 20-29) lands on disk.
# At this point servers[1] holds on disk:
# S1' repaired_at=2 being_repaired=session_id (keys 10-19, from mark_sstable_as_repaired)
# E repaired_at=0 being_repaired=null (keys 20-29, genuine post-repair data)
# servers[0] and servers[2] still have keys 20-29 only in their memtables.
target = servers[1]
await manager.api.flush_keyspace(target.ip_addr, ks)
# Restart servers[1]. being_repaired is in-memory and is lost on restart.
# After restart both S1' and E are loaded from disk with being_repaired=null.
# Without the classifier fix: is_repaired(sstables_repaired_at=1, S1'{repaired_at=2})
# is false and being_repaired is null, so S1' lands in the UNREPAIRED view where
# autocompaction is active. STCS (min_threshold=2) immediately merges S1' and E into
# F(repaired_at=max(2,0)=2, keys 10-29), wrongly promoting E into the REPAIRED set.
# With the classifier fix: S1' has repaired_at==sstables_repaired_at+1 and the tablet
# is still in the `repair` stage, so it is classified REPAIRING (compaction disabled),
# and the merge never happens.
await manager.server_stop_gracefully(target.server_id)
await manager.server_start(target.server_id)
await manager.servers_see_each_other(servers)
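# Hypothetical sketch of the three-way compaction-view classification the comments above
# describe (names, stages, and the with_fix switch are assumptions for illustration, not
# the real classifier):
def classify(sstables_repaired_at: int, sst_repaired_at: int,
             being_repaired: bool, tablet_in_repair_stage: bool,
             with_fix: bool) -> str:
    if 0 < sst_repaired_at <= sstables_repaired_at:
        return "REPAIRED"
    if being_repaired:
        return "REPAIRING"   # in-memory marker: autocompaction disabled
    if (with_fix and tablet_in_repair_stage
            and sst_repaired_at == sstables_repaired_at + 1):
        return "REPAIRING"   # classifier fix: survives the restart
    return "UNREPAIRED"      # autocompaction active
# S1' after the restart: the in-memory being_repaired marker is lost.
assert classify(1, 2, False, True, with_fix=False) == "UNREPAIRED"  # bug: S1' merges with E
assert classify(1, 2, False, True, with_fix=True) == "REPAIRING"    # fix: merge never happens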
# Poll until compaction has produced F(repaired_at=2) containing post-repair keys,
# confirming that the bug was triggered (S1' and E merged during the race window).
deadline = time.time() + 60
compaction_ran = False
while time.time() < deadline:
    for sst in await get_sstables_for_server(manager, target, ks):
        if get_repaired_at_from_sst(sst, scylla_path) == 2:
            if set(get_keys_from_sst(sst, scylla_path)) & set(post_repair_keys):
                compaction_ran = True
                logger.info(f"Post-restart compaction produced F(repaired_at=2) with post-repair keys: {sst}")
                break
    if compaction_ran:
        break
    await asyncio.sleep(1)
# --- Release the race window ---
await manager.api.disable_injection(coord_serv.ip_addr, "delay_end_repair_update")
await manager.api.wait_task(servers[0].ip_addr, task_id)
if not compaction_ran:
    logger.warning("Compaction did not merge S1' and E after restart during the race window; "
                   "the bug was not triggered. Skipping assertion.")
    return
# Flush servers[0] and servers[2] AFTER the race window closes so their post-repair
# keys land in G(repaired_at=0): correctly classified as UNREPAIRED.
for s in [servers[0], servers[2]]:
    await manager.api.flush_keyspace(s.ip_addr, ks)
# Stop all servers so sstable files on disk are stable.
for s in servers:
    await manager.server_stop_gracefully(s.server_id)
post_repair_key_set = set(post_repair_keys)
async def keys_in_repaired_sstables(server) -> set:
    """Return the set of keys found in any sstable with repaired_at > 0 on this server."""
    result = set()
    for sst in await get_sstables_for_server(manager, server, ks):
        ra = get_repaired_at_from_sst(sst, scylla_path)
        if ra is not None and ra > 0:
            result.update(get_keys_from_sst(sst, scylla_path))
    return result
repaired_keys_0 = await keys_in_repaired_sstables(servers[0])
repaired_keys_1 = await keys_in_repaired_sstables(servers[1])
repaired_keys_2 = await keys_in_repaired_sstables(servers[2])
logger.info(f"Post-repair keys in repaired sstables: "
            f"servers[0]={len(repaired_keys_0 & post_repair_key_set)}, "
            f"servers[1]={len(repaired_keys_1 & post_repair_key_set)}, "
            f"servers[2]={len(repaired_keys_2 & post_repair_key_set)}")
# servers[0] and servers[2] flushed post-repair keys after the race window closed,
# so those keys are in G(repaired_at=0) → correctly UNREPAIRED.
assert not (repaired_keys_0 & post_repair_key_set), \
    f"servers[0] should not have post-repair keys in repaired sstables, " \
    f"got: {repaired_keys_0 & post_repair_key_set}"
assert not (repaired_keys_2 & post_repair_key_set), \
    f"servers[2] should not have post-repair keys in repaired sstables, " \
    f"got: {repaired_keys_2 & post_repair_key_set}"
# BUG: servers[1] restarted during the race window, losing its in-memory being_repaired
# markers. S1'(repaired_at=2) and E(repaired_at=0) both landed in the UNREPAIRED
# compaction view and were merged into F(repaired_at=2) by autocompaction. After
# sstables_repaired_at advances to 2, F is classified REPAIRED even though it contains
# post-repair data that was never part of the repair scan. This diverges from servers[0]
# and servers[2] which keep those keys UNREPAIRED, enabling premature tombstone GC and
# data resurrection.
wrongly_promoted = repaired_keys_1 & post_repair_key_set
assert not wrongly_promoted, \
    f"BUG: {len(wrongly_promoted)} post-repair keys were wrongly promoted to REPAIRED " \
    f"on servers[1] after restart lost the being_repaired markers during the race window. " \
    f"They are UNREPAIRED on servers[0] and servers[2] (classification divergence). " \
    f"Wrongly promoted (first 10): {sorted(wrongly_promoted)[:10]}"
