Replace std::vector with utils::chunked_vector in schema diff structures

Co-authored-by: mykaul <4655593+mykaul@users.noreply.github.com>
Initial plan
2025-12-18 11:09:14 +00:00 · 2025-12-18 11:05:05 +00:00 · 2025-12-17 20:01:00 +02:00 · 2025-12-17 17:29:15 +01:00 · 2025-12-17 14:00:28 +01:00 · 2025-12-17 11:48:39 +01:00
539 changed files with 16938 additions and 6024 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,5 +1,5 @@
 # AUTH
-auth/* @nuivall @ptrsmrn
+auth/* @nuivall

 # CACHE
 row_cache* @tgrabiec
@@ -25,11 +25,11 @@ compaction/* @raphaelsc
 transport/*

 # CQL QUERY LANGUAGE
-cql3/* @tgrabiec @nuivall @ptrsmrn
+cql3/* @tgrabiec @nuivall

 # COUNTERS
-counters* @nuivall @ptrsmrn
-tests/counter_test* @nuivall @ptrsmrn
+counters* @nuivall
+tests/counter_test* @nuivall

 # DOCS
 docs/* @annastuchlik @tzach
@@ -57,7 +57,6 @@ repair/* @tgrabiec @asias

 # SCHEMA MANAGEMENT
 db/schema_tables* @tgrabiec
-db/legacy_schema_migrator* @tgrabiec
 service/migration* @tgrabiec
 schema* @tgrabiec

--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,86 @@
+# ScyllaDB Development Instructions
+
+## Project Context
+High-performance distributed NoSQL database. Core values: performance, correctness, readability.
+
+## Build System
+
+### Modern Build (configure.py + ninja)
+```bash
+# Configure (run once per mode, or when switching modes)
+./configure.py --mode=<mode>  # mode: dev, debug, release, sanitize
+
+# Build everything
+ninja <mode>-build  # e.g., ninja dev-build
+
+# Build Scylla binary only (sufficient for Python integration tests)
+ninja build/<mode>/scylla
+
+# Build specific test
+ninja build/<mode>/test/boost/<test_name>
+```
+
+## Running Tests
+
+### C++ Unit Tests
+```bash
+# Run all tests in a file
+./test.py --mode=<mode> test/<suite>/<test_name>.cc
+
+# Run a single test case from a file
+./test.py --mode=<mode> test/<suite>/<test_name>.cc::<test_case_name>
+
+# Examples
+./test.py --mode=dev test/boost/memtable_test.cc
+./test.py --mode=dev test/raft/raft_server_test.cc::test_check_abort_on_client_api
+```
+
+**Important:** 
+- Use full path with `.cc` extension (e.g., `test/boost/test_name.cc`, not `boost/test_name`)
+- To run a single test case, append `::<test_case_name>` to the file path
+- If you encounter permission issues with cgroup metric gathering, add the `--no-gather-metrics` flag
+
+**Rebuilding Tests:**
+- test.py does NOT automatically rebuild when test source files are modified
+- Many tests are part of composite binaries (e.g., `combined_tests` in test/boost contains multiple test files)
+- To find which binary contains a test, check `configure.py` in the repository root (primary source) or `test/<suite>/CMakeLists.txt`
+- To rebuild a specific test binary: `ninja build/<mode>/test/<suite>/<binary_name>`
+- Examples: 
+  - `ninja build/dev/test/boost/combined_tests` (contains group0_voter_calculator_test.cc and others)
+  - `ninja build/dev/test/raft/replication_test` (standalone Raft test)
+
+### Python Integration Tests
+```bash
+# Only requires Scylla binary (full build usually not needed)
+ninja build/<mode>/scylla
+
+# Run all tests in a file
+./test.py --mode=<mode> <test_path>
+
+# Run a single test case from a file
+./test.py --mode=<mode> <test_path>::<test_function_name>
+
+# Examples
+./test.py --mode=dev alternator/
+./test.py --mode=dev cluster/test_raft_voters::test_raft_limited_voters_retain_coordinator
+
+# Optional flags
+./test.py --mode=dev cluster/test_raft_no_quorum -v  # Verbose output
+./test.py --mode=dev cluster/test_raft_no_quorum --repeat 5  # Repeat test 5 times
+```
+
+**Important:**
+- Use path without `.py` extension (e.g., `cluster/test_raft_no_quorum`, not `cluster/test_raft_no_quorum.py`)
+- To run a single test case, append `::<test_function_name>` to the file path
+- Add `-v` for verbose output
+- Add `--repeat <num>` to repeat a test multiple times
+- After modifying C++ source files, only rebuild the Scylla binary for Python tests - building the entire repository is unnecessary
+
+## Code Philosophy
+- Performance matters in hot paths (data read/write, inner loops)
+- Self-documenting code through clear naming
+- Comments explain "why", not "what"
+- Prefer standard library over custom implementations
+- Strive for simplicity and clarity, add complexity only when clearly justified
+- Question requests: don't blindly implement requests - evaluate trade-offs, identify issues, and suggest better alternatives when appropriate
+- Consider different approaches, weigh pros and cons, and recommend the best fit for the specific context
--- a/.github/instructions/cpp.instructions.md
+++ b/.github/instructions/cpp.instructions.md
@@ -0,0 +1,115 @@
+---
+applyTo: "**/*.{cc,hh}"
+---
+
+# C++ Guidelines
+
+**Important:** Always match the style and conventions of existing code in the file and directory.
+
+## Memory Management
+- Prefer stack allocation whenever possible
+- Use `std::unique_ptr` by default for dynamic allocations
+- `new`/`delete` are forbidden (use RAII)
+- Use `seastar::lw_shared_ptr` or `seastar::shared_ptr` for shared ownership within same shard
+- Use `seastar::foreign_ptr` for cross-shard sharing
+- Avoid `std::shared_ptr` except when interfacing with external C++ APIs
+- Avoid raw pointers except for non-owning references or C API interop
+
+## Seastar Asynchronous Programming
+- Use `seastar::future<T>` for all async operations
+- Prefer coroutines (`co_await`, `co_return`) over `.then()` chains for readability
+- Coroutines are preferred over `seastar::do_with()` for managing temporary state
+- In hot paths where futures are ready, continuations may be more efficient than coroutines
+- Chain futures with `.then()`, don't block with `.get()` (unless in `seastar::thread` context)
+- All I/O must be asynchronous (no blocking calls)
+- Use `seastar::gate` for shutdown coordination
+- Use `seastar::semaphore` for resource limiting (not `std::mutex`)
+- Break long loops with `maybe_yield()` to avoid reactor stalls
+
+## Coroutines
+```cpp
+seastar::future<T> func() {
+    auto result = co_await async_operation();
+    co_return result;
+}
+```
+
+## Error Handling
+- Throw exceptions for errors (futures propagate them automatically)
+- In data path: avoid exceptions, use `std::expected` (or `boost::outcome`) instead
+- Use standard exceptions (`std::runtime_error`, `std::invalid_argument`)
+- Database-specific: throw appropriate schema/query exceptions
+
+## Performance
+- Pass large objects by `const&` or `&&` (move semantics)
+- Use `std::string_view` for non-owning string references
+- Avoid copies: prefer move semantics
+- Use `utils::chunked_vector` instead of `std::vector` for large allocations (>128KB)
+- Minimize dynamic allocations in hot paths
+
+## Database-Specific Types
+- Use `schema_ptr` for schema references
+- Use `mutation` and `mutation_partition` for data modifications
+- Use `partition_key` and `clustering_key` for keys
+- Use `api::timestamp_type` for database timestamps
+- Use `gc_clock` for garbage collection timing
+
+## Style
+- C++23 standard (prefer modern features, especially coroutines)
+- Use `auto` when type is obvious from RHS
+- Avoid `auto` when it obscures the type
+- Use range-based for loops: `for (const auto& item : container)`
+- Use standard algorithms when they clearly simplify code (e.g., replacing 10-line loops)
+- Avoid chaining multiple algorithms if a straightforward loop is clearer
+- Mark functions and variables `const` whenever possible
+- Use scoped enums: `enum class` (not unscoped `enum`)
+
+## Headers
+- Use `#pragma once`
+- Include order: own header, C++ std, Seastar, Boost, project headers
+- Forward declare when possible
+- Never use `using namespace` in headers (exception: `using namespace seastar` is globally available via `seastarx.hh`)
+
+## Documentation
+- Public APIs require clear documentation
+- Implementation details should be self-evident from code
+- Use `///` or Doxygen `/** */` for public documentation, `//` for implementation notes - follow the existing style
+
+## Naming
+- `snake_case` for most identifiers (classes, functions, variables, namespaces)
+- Template parameters: `CamelCase` (e.g., `template<typename ValueType>`)
+- Member variables: prefix with `_` (e.g., `int _count;`)
+- Structs (value-only): no `_` prefix on members
+- Constants and `constexpr`: `snake_case` (e.g., `static constexpr int max_size = 100;`)
+- Files: `.hh` for headers, `.cc` for source
+
+## Formatting
+- 4 spaces indentation, never tabs
+- Opening braces on same line as control structure (except namespaces)
+- Space after keywords: `if (`, `while (`, `return `
+- Whitespace around operators matches precedence: `*a + *b` not `* a+* b`
+- Line length: keep reasonable (<160 chars), use continuation lines with double indent if needed
+- Brace all nested scopes, even single statements
+- Minimal patches: only format code you modify, never reformat entire files
+
+## Logging
+- Use structured logging with appropriate levels: DEBUG, INFO, WARN, ERROR
+- Include context in log messages (e.g., request IDs)
+- Never log sensitive data (credentials, PII)
+
+## Forbidden
+- `malloc`/`free`
+- `printf` family (use logging or fmt)
+- Raw pointers for ownership
+- `using namespace` in headers
+- Blocking operations: `sleep()`/`std::this_thread::sleep_for`, blocking `read()`, `std::mutex` (use Seastar equivalents)
+- `std::atomic` (reserved for very special circumstances only)
+- Macros (use `inline`, `constexpr`, or templates instead)
+
+## Testing
+When modifying existing code, follow TDD: create/update test first, then implement.
+- Examine existing tests for style and structure
+- Use Boost.Test framework
+- Use `SEASTAR_THREAD_TEST_CASE` for Seastar asynchronous tests
+- Aim for high code coverage, especially for new features and bug fixes
+- Maintain bisectability: all tests must pass in every commit. Mark known-failing tests as expected failures (e.g., with the `boost::unit_test::expected_failures` decorator) or disable them, then fix in a subsequent commit
--- a/.github/instructions/python.instructions.md
+++ b/.github/instructions/python.instructions.md
@@ -0,0 +1,51 @@
+---
+applyTo: "**/*.py"
+---
+
+# Python Guidelines
+
+**Important:** Match existing code style. Some directories (like `test/cqlpy` and `test/alternator`) prefer simplicity over type hints and docstrings.
+
+## Style
+- Follow PEP 8
+- Use type hints for function signatures (unless directory style omits them)
+- Use f-strings for formatting
+- Line length: 160 characters max
+- 4 spaces for indentation
+
+## Imports
+Order: standard library, third-party, local imports
+```python
+import os
+import sys
+
+import pytest
+from cassandra.cluster import Cluster
+
+from test.utils import setup_keyspace
+```
+
+Never use `from module import *`
+
+## Documentation
+All public functions/classes need docstrings (unless the current directory conventions omit them):
+```python
+def my_function(arg1: str, arg2: int) -> bool:
+    """
+    Brief summary of function purpose.
+
+    Args:
+        arg1: Description of first argument.
+        arg2: Description of second argument.
+
+    Returns:
+        Description of return value.
+    """
+    pass
+```
+
+## Testing Best Practices
+- Maintain bisectability: all tests must pass in every commit
+- Mark currently-failing tests with `@pytest.mark.xfail`, unmark when fixed
+- Use descriptive names that convey intent
+- Docstrings/comments should explain what the test verifies and why, including whether it reproduces a specific issue and how it fits into the larger test suite
--- a/.github/scripts/auto-backport.py
+++ b/.github/scripts/auto-backport.py
@@ -62,7 +62,7 @@ def create_pull_request(repo, new_branch_name, base_branch_name, pr, backport_pr
        if is_draft:
            labels_to_add.append("conflicts")
            pr_comment = f"@{pr.user.login} - This PR was marked as draft because it has conflicts\n"
-            pr_comment += "Please resolve them and mark this PR as ready for review"
+            pr_comment += "Please resolve them and remove the 'conflicts' label. The PR will be made ready for review automatically."
            backport_pr.create_issue_comment(pr_comment)
        
        # Apply all labels at once if we have any
--- a/.github/workflows/backport-pr-fixes-validation.yaml
+++ b/.github/workflows/backport-pr-fixes-validation.yaml
@@ -18,7 +18,7 @@ jobs:
            
            // Regular expression pattern to check for "Fixes" prefix
            // Adjusted to dynamically insert the repository full name
-            const pattern = `Fixes:? (?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)`;
+            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|([A-Z]+-\\d+))`;
            const regex = new RegExp(pattern);
            
            if (!regex.test(body)) {
--- a/.github/workflows/call_sync_milestone_to_jira.yml
+++ b/.github/workflows/call_sync_milestone_to_jira.yml
@@ -0,0 +1,14 @@
+name: Call Jira release creation for new milestone
+
+on:
+  milestone:
+    types: [created]
+
+jobs:
+  sync-milestone-to-jira:
+    uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
+    with:
+      # Comma-separated list of Jira project keys
+      jira_project_keys: "SCYLLADB,CUSTOMER"
+    secrets:
+      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/docs-validate-metrics.yml
+++ b/.github/workflows/docs-validate-metrics.yml
@@ -0,0 +1,34 @@
+name: Docs / Validate metrics
+
+on:
+  pull_request:
+    branches:
+      - master
+      - enterprise
+    paths:
+      - '**/*.cc'
+      - 'scripts/metrics-config.yml'
+      - 'scripts/get_description.py'
+      - 'docs/_ext/scylladb_metrics.py'
+
+jobs:
+  validate-metrics:
+    runs-on: ubuntu-latest
+    name: Check metrics documentation coverage
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+      with:
+        submodules: true
+
+    - name: Set up Python
+      uses: actions/setup-python@v6
+      with:
+        python-version: '3.10'
+
+    - name: Install dependencies
+      run: pip install PyYAML
+
+    - name: Validate metrics
+      run: python3 scripts/get_description.py --validate -c scripts/metrics-config.yml
--- a/.github/workflows/trigger-scylla-ci.yaml
+++ b/.github/workflows/trigger-scylla-ci.yaml
@@ -3,10 +3,13 @@ name: Trigger Scylla CI Route
 on:
  issue_comment:
    types: [created]
+  pull_request_target:
+    types:
+      - unlabeled

 jobs:
  trigger-jenkins:
-    if: github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')
+    if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
    runs-on: ubuntu-latest
    steps:
      - name: Trigger Scylla-CI-Route Jenkins Job
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,6 +116,7 @@ list(APPEND absl_cxx_flags
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    list(APPEND ABSL_GCC_FLAGS ${absl_cxx_flags})
 elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    list(APPEND absl_cxx_flags "-Wno-deprecated-builtins")
    list(APPEND ABSL_LLVM_FLAGS ${absl_cxx_flags})
 endif()
 set(ABSL_DEFAULT_LINKOPTS
@@ -163,7 +164,45 @@ file(MAKE_DIRECTORY "${scylla_gen_build_dir}")
 include(add_version_library)
 generate_scylla_version()

+option(Scylla_USE_PRECOMPILED_HEADER "Use precompiled header for Scylla" ON)
+add_library(scylla-precompiled-header STATIC exported_templates.cc)
+target_link_libraries(scylla-precompiled-header PRIVATE
+    absl::headers
+    absl::btree
+    absl::hash
+    absl::raw_hash_set
+    Seastar::seastar
+    Snappy::snappy
+    systemd
+    ZLIB::ZLIB
+    lz4::lz4_static
+    zstd::zstd_static)
+if (Scylla_USE_PRECOMPILED_HEADER)
+  set(Scylla_USE_PRECOMPILED_HEADER_USE ON)
+  find_program(DISTCC_EXEC NAMES distcc OPTIONAL)
+  if (DISTCC_EXEC)
+    if(DEFINED ENV{DISTCC_HOSTS})
+      set(Scylla_USE_PRECOMPILED_HEADER_USE OFF)
+      message(STATUS "Disabling precompiled header usage because distcc exists and DISTCC_HOSTS is set, assuming you're using distributed compilation.")
+    else()
+      file(REAL_PATH "~/.distcc/hosts" DIST_CC_HOSTS_PATH EXPAND_TILDE)
+      if (EXISTS ${DIST_CC_HOSTS_PATH})
+        set(Scylla_USE_PRECOMPILED_HEADER_USE OFF)
+        message(STATUS "Disabling precompiled header usage because distcc and ~/.distcc/hosts exists, assuming you're using distributed compilation.")
+      endif()
+    endif()
+  endif()
+  if (Scylla_USE_PRECOMPILED_HEADER_USE)
+    message(STATUS "Using precompiled header for Scylla - remember to add `sloppiness = pch_defines,time_macros` to ccache.conf, if you're using ccache.")
+    target_precompile_headers(scylla-precompiled-header PRIVATE "stdafx.hh")
+    target_compile_definitions(scylla-precompiled-header PRIVATE SCYLLA_USE_PRECOMPILED_HEADER)
+  endif()
+else()
+  set(Scylla_USE_PRECOMPILED_HEADER_USE OFF)
+endif()
+
 add_library(scylla-main STATIC)
+
 target_sources(scylla-main
  PRIVATE
    absl-flat_hash_map.cc
@@ -208,6 +247,7 @@ target_link_libraries(scylla-main
    ZLIB::ZLIB
    lz4::lz4_static
    zstd::zstd_static
+    scylla-precompiled-header
 )

 option(Scylla_CHECK_HEADERS
--- a/alternator/CMakeLists.txt
+++ b/alternator/CMakeLists.txt
@@ -34,5 +34,8 @@ target_link_libraries(alternator
    idl
    absl::headers)

+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(alternator REUSE_FROM scylla-precompiled-header)
+endif()
 check_headers(check-headers alternator
  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -42,7 +42,7 @@ comparison_operator_type get_comparison_operator(const rjson::value& comparison_
    if (!comparison_operator.IsString()) {
        throw api_error::validation(fmt::format("Invalid comparison operator definition {}", rjson::print(comparison_operator)));
    }
-    std::string op = comparison_operator.GetString();
+    std::string op = rjson::to_string(comparison_operator);
    auto it = ops.find(op);
    if (it == ops.end()) {
        throw api_error::validation(fmt::format("Unsupported comparison operator {}", op));
@@ -377,8 +377,8 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
        return cmp(unwrap_number(*v1, cmp.diagnostic), unwrap_number(v2, cmp.diagnostic));
    }
    if (kv1.name == "S") {
-        return cmp(std::string_view(kv1.value.GetString(), kv1.value.GetStringLength()),
-                   std::string_view(kv2.value.GetString(), kv2.value.GetStringLength()));
+        return cmp(rjson::to_string_view(kv1.value),
+                   rjson::to_string_view(kv2.value));
    }
    if (kv1.name == "B") {
        auto d_kv1 = unwrap_bytes(kv1.value, v1_from_query);
@@ -470,9 +470,9 @@ static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const r
        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag), bounds_from_query);
    }
    if (kv_v.name == "S") {
-        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
-                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
-                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()),
+        return check_BETWEEN(rjson::to_string_view(kv_v.value),
+                             rjson::to_string_view(kv_lb.value),
+                             rjson::to_string_view(kv_ub.value),
                             bounds_from_query);
    }
    if (kv_v.name == "B") {
--- a/alternator/consumed_capacity.cc
+++ b/alternator/consumed_capacity.cc
@@ -8,6 +8,8 @@

 #include "consumed_capacity.hh"
 #include "error.hh"
+#include "utils/rjson.hh"
+#include <fmt/format.h>

 namespace alternator {

@@ -32,12 +34,12 @@ bool consumed_capacity_counter::should_add_capacity(const rjson::value& request)
    if (!return_consumed->IsString()) {
        throw api_error::validation("Non-string ReturnConsumedCapacity field in request");
    }
-    std::string consumed = return_consumed->GetString();
+    std::string_view consumed = rjson::to_string_view(*return_consumed);
    if (consumed == "INDEXES") {
        throw api_error::validation("INDEXES consumed capacity is not supported");
    }
    if (consumed != "TOTAL") {
-        throw api_error::validation("Unknown consumed capacity "+ consumed);
+        throw api_error::validation(fmt::format("Unknown consumed capacity {}", consumed));
    }
    return true;
 }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -419,7 +419,7 @@ static std::optional<std::string> find_table_name(const rjson::value& request) {
    if (!table_name_value->IsString()) {
        throw api_error::validation("Non-string TableName field in request");
    }
-    std::string table_name = table_name_value->GetString();
+    std::string table_name = rjson::to_string(*table_name_value);
    return table_name;
 }

@@ -546,7 +546,7 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
            // does exist but the index does not (ValidationException).
            if (proxy.data_dictionary().has_schema(keyspace_name, orig_table_name)) {
                throw api_error::validation(
-                    fmt::format("Requested resource not found: Index '{}' for table '{}'", index_name->GetString(), orig_table_name));
+                    fmt::format("Requested resource not found: Index '{}' for table '{}'", rjson::to_string_view(*index_name), orig_table_name));
            } else {
                throw api_error::resource_not_found(
                    fmt::format("Requested resource not found: Table: {} not found", orig_table_name));
@@ -587,7 +587,7 @@ static std::string get_string_attribute(const rjson::value& value, std::string_v
        throw api_error::validation(fmt::format("Expected string value for attribute {}, got: {}",
                attribute_name, value));
    }
-    return std::string(attribute_value->GetString(), attribute_value->GetStringLength());
+    return rjson::to_string(*attribute_value);
 }

 // Convenience function for getting the value of a boolean attribute, or a
@@ -888,7 +888,7 @@ future<executor::request_return_type> executor::describe_table(client_state& cli

    schema_ptr schema = get_table(_proxy, request);
    get_stats_from_schema(_proxy, *schema)->api_operations.describe_table++;
-    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
+    tracing::add_alternator_table_name(trace_state, schema->cf_name());

    rjson::value table_description = co_await fill_table_description(schema, table_status::active, _proxy, client_state, trace_state, permit);
    rjson::value response = rjson::empty_object();
@@ -989,7 +989,7 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
    std::string table_name = get_table_name(request);

    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
-    tracing::add_table_name(trace_state, keyspace_name, table_name);
+    tracing::add_alternator_table_name(trace_state, table_name);
    auto& p = _proxy.container();

    schema_ptr schema = get_table(_proxy, request);
@@ -1008,8 +1008,8 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
                throw api_error::resource_not_found(fmt::format("Requested resource not found: Table: {} not found", table_name));
            }

-            auto m = co_await service::prepare_column_family_drop_announcement(_proxy, keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
-            auto m2 = co_await service::prepare_keyspace_drop_announcement(_proxy, keyspace_name, group0_guard.write_timestamp());
+            auto m = co_await service::prepare_column_family_drop_announcement(p.local(), keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
+            auto m2 = co_await service::prepare_keyspace_drop_announcement(p.local(), keyspace_name, group0_guard.write_timestamp());

            std::move(m2.begin(), m2.end(), std::back_inserter(m));

@@ -1080,8 +1080,8 @@ static void add_column(schema_builder& builder, const std::string& name, const r
    }
    for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) {
        const rjson::value& attribute_info = *it;
-        if (attribute_info["AttributeName"].GetString() == name) {
-            auto type = attribute_info["AttributeType"].GetString();
+        if (rjson::to_string_view(attribute_info["AttributeName"]) == name) {
+            std::string_view type = rjson::to_string_view(attribute_info["AttributeType"]);
            data_type dt = parse_key_type(type);
            if (computed_column) {
                // Computed column for GSI (doesn't choose a real column as-is
@@ -1116,7 +1116,7 @@ static std::pair<std::string, std::string> parse_key_schema(const rjson::value&
        throw api_error::validation("First element of KeySchema must be an object");
    }
    const rjson::value *v = rjson::find((*key_schema)[0], "KeyType");
-    if (!v || !v->IsString() || v->GetString() != std::string("HASH")) {
+    if (!v || !v->IsString() || rjson::to_string_view(*v) != "HASH") {
        throw api_error::validation("First key in KeySchema must be a HASH key");
    }
    v = rjson::find((*key_schema)[0], "AttributeName");
@@ -1124,14 +1124,14 @@ static std::pair<std::string, std::string> parse_key_schema(const rjson::value&
        throw api_error::validation("First key in KeySchema must have string AttributeName");
    }
    validate_attr_name_length(supplementary_context, v->GetStringLength(), true, "HASH key in KeySchema - ");
-    std::string hash_key = v->GetString();
+    std::string hash_key = rjson::to_string(*v);
    std::string range_key;
    if (key_schema->Size() == 2) {
        if (!(*key_schema)[1].IsObject()) {
            throw api_error::validation("Second element of KeySchema must be an object");
        }
        v = rjson::find((*key_schema)[1], "KeyType");
-        if (!v || !v->IsString() || v->GetString() != std::string("RANGE")) {
+        if (!v || !v->IsString() || rjson::to_string_view(*v) != "RANGE") {
            throw api_error::validation("Second key in KeySchema must be a RANGE key");
        }
        v = rjson::find((*key_schema)[1], "AttributeName");
@@ -1583,7 +1583,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
    std::unordered_set<std::string> unused_attribute_definitions =
        validate_attribute_definitions("", *attribute_definitions);

-    tracing::add_table_name(trace_state, keyspace_name, table_name);
+    tracing::add_alternator_table_name(trace_state, table_name);

    schema_builder builder(keyspace_name, table_name);
    auto [hash_key, range_key] = parse_key_schema(request, "");
@@ -1799,6 +1799,11 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
                }
            }
        }
+        // Creating an index in tablets mode requires the rf_rack_valid_keyspaces option to be enabled.
+        // GSI and LSI indexes are based on materialized views which require this option to avoid consistency issues.
+        if (!view_builders.empty() && ksm->uses_tablets() && !sp.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
+            co_return api_error::validation("GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
+        }
        try {
            schema_mutations = service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
        } catch (exceptions::already_exists_exception&) {
@@ -1865,10 +1870,10 @@ future<executor::request_return_type> executor::create_table(client_state& clien
    _stats.api_operations.create_table++;
    elogger.trace("Creating table {}", request);

-    co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
+    co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
                                        (service::migration_manager& mm) mutable -> future<executor::request_return_type> {
        const db::tablets_mode_t::mode tablets_mode = _proxy.data_dictionary().get_config().tablets_mode_for_new_keyspaces(); // type cast
-        co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local(), enforce_authorization, warn_authorization, _stats, std::move(tablets_mode));
+        co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local(), enforce_authorization, warn_authorization, e.local()._stats, std::move(tablets_mode));
    });
 }

@@ -1887,8 +1892,8 @@ future<executor::request_return_type> executor::create_table(client_state& clien
        std::string def_type = type_to_string(def.type);
        for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) {
            const rjson::value& attribute_info = *it;
-            if (attribute_info["AttributeName"].GetString() == def.name_as_text()) {
-                auto type = attribute_info["AttributeType"].GetString();
+            if (rjson::to_string_view(attribute_info["AttributeName"]) == def.name_as_text()) {
+                std::string_view type = rjson::to_string_view(attribute_info["AttributeType"]);
                if (type != def_type) {
                    throw api_error::validation(fmt::format("AttributeDefinitions redefined {} to {} already a key attribute of type {} in this table", def.name_as_text(), type, def_type));
                }
@@ -1930,7 +1935,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien

            schema_ptr tab = get_table(p.local(), request);

-            tracing::add_table_name(gt, tab->ks_name(), tab->cf_name());
+            tracing::add_alternator_table_name(gt, tab->cf_name());

            // the ugly but harmless conversion to string_view here is because
            // Seastar's sstring is missing a find(std::string_view) :-()
@@ -2019,6 +2024,10 @@ future<executor::request_return_type> executor::update_table(client_state& clien
                            co_return api_error::validation(fmt::format(
                                "LSI {} already exists in table {}, can't use same name for GSI", index_name, table_name));
                        }
+                        if (p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy().uses_tablets() &&
+                                !p.local().data_dictionary().get_config().rf_rack_valid_keyspaces()) {
+                            co_return api_error::validation("GlobalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
+                        }

                        elogger.trace("Adding GSI {}", index_name);
                        // FIXME: read and handle "Projection" parameter. This will
@@ -2223,12 +2232,12 @@ void validate_value(const rjson::value& v, const char* caller) {

 // The put_or_delete_item class builds the mutations needed by the PutItem and
 // DeleteItem operations - either as stand-alone commands or part of a list
-// of commands in BatchWriteItems.
+// of commands in BatchWriteItem.
 // put_or_delete_item splits each operation into two stages: Constructing the
 // object parses and validates the user input (throwing exceptions if there
 // are input errors). Later, build() generates the actual mutation, with a
 // specified timestamp. This split is needed because of the peculiar needs of
-// BatchWriteItems and LWT. BatchWriteItems needs all parsing to happen before
+// BatchWriteItem and LWT. BatchWriteItem needs all parsing to happen before
 // any writing happens (if one of the commands has an error, none of the
 // writes should be done). LWT makes it impossible for the parse step to
 // generate "mutation" objects, because the timestamp still isn't known.
@@ -2362,7 +2371,7 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
    _cells = std::vector<cell>();
    _cells->reserve(item.MemberCount());
    for (auto it = item.MemberBegin(); it != item.MemberEnd(); ++it) {
-        bytes column_name = to_bytes(it->name.GetString());
+        bytes column_name = to_bytes(rjson::to_string_view(it->name));
        validate_value(it->value, "PutItem");
        const column_definition* cdef = find_attribute(*schema, column_name);
        validate_attr_name_length("", column_name.size(), cdef && cdef->is_primary_key());
@@ -2624,14 +2633,14 @@ std::optional<service::cas_shard> rmw_operation::shard_for_execute(bool needs_re
 // Build the return value from the different RMW operations (UpdateItem,
 // PutItem, DeleteItem). All these return nothing by default, but can
 // optionally return Attributes if requested via the ReturnValues option.
-static future<executor::request_return_type> rmw_operation_return(rjson::value&& attributes, const consumed_capacity_counter& consumed_capacity, uint64_t& metric) {
+static executor::request_return_type rmw_operation_return(rjson::value&& attributes, const consumed_capacity_counter& consumed_capacity, uint64_t& metric) {
    rjson::value ret = rjson::empty_object();
    consumed_capacity.add_consumed_capacity_to_response_if_needed(ret);
    metric += consumed_capacity.get_consumed_capacity_units();
    if (!attributes.IsNull()) {
        rjson::add(ret, "Attributes", std::move(attributes));
    }
-    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+    return rjson::print(std::move(ret));
 }

 static future<std::unique_ptr<rjson::value>> get_previous_item(
@@ -2697,7 +2706,10 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
        stats& global_stats,
        stats& per_table_stats,
        uint64_t& wcu_total) {
-    auto cdc_opts = cdc::per_request_options{};
+    auto cdc_opts = cdc::per_request_options{
+        .alternator = true,
+        .alternator_streams_increased_compatibility = schema()->cdc_options().enabled() && proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
+    };
    if (needs_read_before_write) {
        if (_write_isolation == write_isolation::FORBID_RMW) {
            throw api_error::validation("Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information.");
@@ -2736,13 +2748,13 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
    auto read_command = needs_read_before_write ?
            previous_item_read_command(proxy, schema(), _ck, selection) :
            nullptr;
-    return proxy.cas(schema(), std::move(*cas_shard), shared_from_this(), read_command, to_partition_ranges(*schema(), _pk),
+    return proxy.cas(schema(), std::move(*cas_shard), *this, read_command, to_partition_ranges(*schema(), _pk),
            {timeout, std::move(permit), client_state, trace_state},
            db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM, timeout, timeout, true, std::move(cdc_opts)).then([this, read_command, &wcu_total] (bool is_applied) mutable {
        if (!is_applied) {
            return make_ready_future<executor::request_return_type>(api_error::conditional_check_failed("The conditional request failed", std::move(_return_attributes)));
        }
-        return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
+        return make_ready_future<executor::request_return_type>(rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total));
    });
 }

@@ -2780,10 +2792,10 @@ static void verify_all_are_used(const rjson::value* field,
        return;
    }
    for (auto it = field->MemberBegin(); it != field->MemberEnd(); ++it) {
-        if (!used.contains(it->name.GetString())) {
+        if (!used.contains(rjson::to_string(it->name))) {
            throw api_error::validation(
                format("{} has spurious '{}', not used in {}",
-                    field_name, it->name.GetString(), operation));
+                    field_name, rjson::to_string_view(it->name), operation));
        }
    }
 }
@@ -2856,7 +2868,7 @@ future<executor::request_return_type> executor::put_item(client_state& client_st
    elogger.trace("put_item {}", request);

    auto op = make_shared<put_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
-    tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
+    tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
    const bool needs_read_before_write = op->needs_read_before_write();

    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
@@ -2960,7 +2972,7 @@ future<executor::request_return_type> executor::delete_item(client_state& client

    auto op = make_shared<delete_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
    lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(op->schema()));
-    tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
+    tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
    const bool needs_read_before_write = _proxy.data_dictionary().get_config().alternator_force_read_before_write() || op->needs_read_before_write();

    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
@@ -2997,7 +3009,7 @@ future<executor::request_return_type> executor::delete_item(client_state& client
 }

 static schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request) {
-    sstring table_name = batch_request->name.GetString(); // JSON keys are always strings
+    sstring table_name = rjson::to_sstring(batch_request->name); // JSON keys are always strings
    try {
        return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + table_name, table_name);
    } catch(data_dictionary::no_such_column_family&) {
@@ -3023,17 +3035,20 @@ struct primary_key_equal {
 };

 // This is a cas_request subclass for applying given put_or_delete_items to
-// one partition using LWT as part as BatchWriteItems. This is a write-only
+// one partition using LWT as part of BatchWriteItem. This is a write-only
 // operation, not needing the previous value of the item (the mutation to be
 // done is known prior to starting the operation). Nevertheless, we want to
 // do this mutation via LWT to ensure that it is serialized with other LWT
 // mutations to the same partition.
+// 
+// The std::vector<put_or_delete_item> is held by reference, not owned, so
+// it must remain alive until the storage_proxy::cas() future is resolved.
 class put_or_delete_item_cas_request : public service::cas_request {
    schema_ptr schema;
-    std::vector<put_or_delete_item> _mutation_builders;
+    const std::vector<put_or_delete_item>& _mutation_builders;
 public:
-    put_or_delete_item_cas_request(schema_ptr s, std::vector<put_or_delete_item>&& b) :
-        schema(std::move(s)), _mutation_builders(std::move(b)) { }
+    put_or_delete_item_cas_request(schema_ptr s, const std::vector<put_or_delete_item>& b) :
+        schema(std::move(s)), _mutation_builders(b) { }
    virtual ~put_or_delete_item_cas_request() = default;
    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) override {
        std::optional<mutation> ret;
@@ -3049,17 +3064,48 @@ public:
    }
 };

-static future<> cas_write(service::storage_proxy& proxy, schema_ptr schema, service::cas_shard cas_shard, dht::decorated_key dk, std::vector<put_or_delete_item>&& mutation_builders,
-        service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit) {
+future<> executor::cas_write(schema_ptr schema, service::cas_shard cas_shard, const dht::decorated_key& dk,
+        const std::vector<put_or_delete_item>& mutation_builders, service::client_state& client_state,
+        tracing::trace_state_ptr trace_state, service_permit permit)
+{
+    if (!cas_shard.this_shard()) {
+        _stats.shard_bounce_for_lwt++;
+        return container().invoke_on(cas_shard.shard(), _ssg,
+                    [cs = client_state.move_to_other_shard(),
+                    &mb = mutation_builders,
+                    &dk,
+                    ks = schema->ks_name(),
+                    cf = schema->cf_name(),
+                    gt = tracing::global_trace_state_ptr(trace_state),
+                    permit = std::move(permit)]
+                    (executor& self) mutable {
+            return do_with(cs.get(), [&mb, &dk, ks = std::move(ks), cf = std::move(cf),
+                                    trace_state = tracing::trace_state_ptr(gt), &self]
+                                    (service::client_state& client_state) mutable {
+                auto schema = self._proxy.data_dictionary().find_schema(ks, cf);
+                service::cas_shard cas_shard(*schema, dk.token());
+
+                //FIXME: Instead of passing empty_service_permit() to the background operation,
+                // the current permit's lifetime should be prolonged, so that it's destructed
+                // only after all background operations are finished as well.
+                return self.cas_write(schema, std::move(cas_shard), dk, mb, client_state, std::move(trace_state), empty_service_permit());
+            });
+        });
+    }
+
    auto timeout = executor::default_timeout();
-    auto op = seastar::make_shared<put_or_delete_item_cas_request>(schema, std::move(mutation_builders));
+    auto op = std::make_unique<put_or_delete_item_cas_request>(schema, mutation_builders);
+    auto* op_ptr = op.get();
    auto cdc_opts = cdc::per_request_options{
+        .alternator = true,
+        .alternator_streams_increased_compatibility =
+                schema->cdc_options().enabled() && _proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
    };
-    return proxy.cas(schema, std::move(cas_shard), op, nullptr, to_partition_ranges(dk),
+    return _proxy.cas(schema, std::move(cas_shard), *op_ptr, nullptr, to_partition_ranges(dk),
            {timeout, std::move(permit), client_state, trace_state},
            db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM,
-            timeout, timeout, true, std::move(cdc_opts)).discard_result();
-    // We discarded cas()'s future value ("is_applied") because BatchWriteItems
+            timeout, timeout, true, std::move(cdc_opts)).finally([op = std::move(op)]{}).discard_result();
+    // We discarded cas()'s future value ("is_applied") because BatchWriteItem
    // does not need to support conditional updates.
 }

@@ -3081,13 +3127,11 @@ struct schema_decorated_key_equal {

 // FIXME: if we failed writing some of the mutations, need to return a list
 // of these failed mutations rather than fail the whole write (issue #5650).
-static future<> do_batch_write(service::storage_proxy& proxy,
-        smp_service_group ssg,
+future<> executor::do_batch_write(
        std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders,
        service::client_state& client_state,
        tracing::trace_state_ptr trace_state,
-        service_permit permit,
-        stats& stats) {
+        service_permit permit) {
    if (mutation_builders.empty()) {
        return make_ready_future<>();
    }
@@ -3104,64 +3148,62 @@ static future<> do_batch_write(service::storage_proxy& proxy,
        utils::chunked_vector<mutation> mutations;
        mutations.reserve(mutation_builders.size());
        api::timestamp_type now = api::new_timestamp();
+        bool any_cdc_enabled = false;
        for (auto& b : mutation_builders) {
            mutations.push_back(b.second.build(b.first, now));
+            any_cdc_enabled |= b.first->cdc_options().enabled();
        }
-        return proxy.mutate(std::move(mutations),
+        return _proxy.mutate(std::move(mutations),
                db::consistency_level::LOCAL_QUORUM,
                executor::default_timeout(),
                trace_state,
                std::move(permit),
                db::allow_per_partition_rate_limit::yes,
                false,
-                cdc::per_request_options{});
+                cdc::per_request_options{
+                    .alternator = true,
+                    .alternator_streams_increased_compatibility = any_cdc_enabled && _proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
+                });
    } else {
        // Do the write via LWT:
        // Multiple mutations may be destined for the same partition, adding
        // or deleting different items of one partition. Join them together
        // because we can do them in one cas() call.
-        std::unordered_map<schema_decorated_key, std::vector<put_or_delete_item>, schema_decorated_key_hash, schema_decorated_key_equal>
-            key_builders(1, schema_decorated_key_hash{}, schema_decorated_key_equal{});
-        for (auto& b : mutation_builders) {
-            auto dk = dht::decorate_key(*b.first, b.second.pk());
-            auto [it, added] = key_builders.try_emplace(schema_decorated_key{b.first, dk});
+        using map_type = std::unordered_map<schema_decorated_key, 
+            std::vector<put_or_delete_item>, 
+            schema_decorated_key_hash, 
+            schema_decorated_key_equal>;
+        auto key_builders = std::make_unique<map_type>(1, schema_decorated_key_hash{}, schema_decorated_key_equal{});
+        for (auto&& b : std::move(mutation_builders)) {
+            auto [it, added] = key_builders->try_emplace(schema_decorated_key {
+                .schema = b.first,
+                .dk = dht::decorate_key(*b.first, b.second.pk())
+            });
            it->second.push_back(std::move(b.second));
        }
-        return parallel_for_each(std::move(key_builders), [&proxy, &client_state, &stats, trace_state, ssg, permit = std::move(permit)] (auto& e) {
-            stats.write_using_lwt++;
+        auto* key_builders_ptr = key_builders.get();
+        return parallel_for_each(*key_builders_ptr, [this, &client_state, trace_state, permit = std::move(permit)] (const auto& e) {
+            _stats.write_using_lwt++;
            auto desired_shard = service::cas_shard(*e.first.schema, e.first.dk.token());
-            if (desired_shard.this_shard()) {
-                return cas_write(proxy, e.first.schema, std::move(desired_shard), e.first.dk, std::move(e.second), client_state, trace_state, permit);
-            } else {
-                stats.shard_bounce_for_lwt++;
-                return proxy.container().invoke_on(desired_shard.shard(), ssg,
-                            [cs = client_state.move_to_other_shard(),
-                             mb = e.second,
-                             dk = e.first.dk,
-                             ks = e.first.schema->ks_name(),
-                             cf = e.first.schema->cf_name(),
-                             gt =  tracing::global_trace_state_ptr(trace_state),
-                             permit = std::move(permit)]
-                            (service::storage_proxy& proxy) mutable {
-                    return do_with(cs.get(), [&proxy, mb = std::move(mb), dk = std::move(dk), ks = std::move(ks), cf = std::move(cf),
-                                              trace_state = tracing::trace_state_ptr(gt)]
-                                              (service::client_state& client_state) mutable {
-                        auto schema = proxy.data_dictionary().find_schema(ks, cf);
+            auto s = e.first.schema;

-                        // The desired_shard on the original shard remains alive for the duration
-                        // of cas_write on this shard and prevents any tablet operations.
-                        // However, we need a local instance of cas_shard on this shard
-                        // to pass it to sp::cas, so we just create a new one.
-                        service::cas_shard cas_shard(*schema, dk.token());
-
-                        //FIXME: Instead of passing empty_service_permit() to the background operation,
-                        // the current permit's lifetime should be prolonged, so that it's destructed
-                        // only after all background operations are finished as well.
-                        return cas_write(proxy, schema, std::move(cas_shard), dk, std::move(mb), client_state, std::move(trace_state), empty_service_permit());
-                    });
-                }).finally([desired_shard = std::move(desired_shard)]{});
-            }
-        });
+            static const auto* injection_name = "alternator_executor_batch_write_wait";
+            return utils::get_local_injector().inject(injection_name, [s = std::move(s)] (auto& handler) -> future<> {
+                const auto ks = handler.get("keyspace");
+                const auto cf = handler.get("table");
+                const auto shard = std::atoll(handler.get("shard")->data());
+                if (ks == s->ks_name() && cf == s->cf_name() && shard == this_shard_id()) {
+                    elogger.info("{}: hit", injection_name);
+                    co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
+                    elogger.info("{}: continue", injection_name);
+                }
+            }).then([&e, desired_shard = std::move(desired_shard),
+                 &client_state, trace_state = std::move(trace_state), permit = std::move(permit), this]() mutable
+            {
+                return cas_write(e.first.schema, std::move(desired_shard), e.first.dk,
+                    std::move(e.second), client_state, std::move(trace_state), std::move(permit));
+            });
+        }).finally([key_builders = std::move(key_builders)]{});
    }
 }

@@ -3204,7 +3246,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
        per_table_stats->api_operations.batch_write_item++;
        per_table_stats->api_operations.batch_write_item_batch_total += it->value.Size();
        per_table_stats->api_operations.batch_write_item_histogram.add(it->value.Size());
-        tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
+        tracing::add_alternator_table_name(trace_state, schema->cf_name());

        std::unordered_set<primary_key, primary_key_hash, primary_key_equal> used_keys(
                1, primary_key_hash{schema}, primary_key_equal{schema});
@@ -3308,7 +3350,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
    _stats.wcu_total[stats::DELETE_ITEM] += wcu_delete_units;
    _stats.api_operations.batch_write_item_batch_total += total_items;
    _stats.api_operations.batch_write_item_histogram.add(total_items);
-    co_await do_batch_write(_proxy, _ssg, std::move(mutation_builders), client_state, trace_state, std::move(permit), _stats);
+    co_await do_batch_write(std::move(mutation_builders), client_state, trace_state, std::move(permit));
    // FIXME: Issue #5650: If we failed writing some of the updates,
    // need to return a list of these failed updates in UnprocessedItems
    // rather than fail the whole write (issue #5650).
@@ -3353,7 +3395,7 @@ static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>
        }
        rjson::value newv = rjson::empty_object();
        for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) {
-            std::string attr = it->name.GetString();
+            std::string attr = rjson::to_string(it->name);
            auto x = members.find(attr);
            if (x != members.end()) {
                if (x->second) {
@@ -3573,7 +3615,7 @@ static std::optional<attrs_to_get> calculate_attrs_to_get(const rjson::value& re
        const rjson::value& attributes_to_get = req["AttributesToGet"];
        attrs_to_get ret;
        for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
-            attribute_path_map_add("AttributesToGet", ret, it->GetString());
+            attribute_path_map_add("AttributesToGet", ret, rjson::to_string(*it));
            validate_attr_name_length("AttributesToGet", it->GetStringLength(), false);
        }
        if (ret.empty()) {
@@ -4239,12 +4281,12 @@ inline void update_item_operation::apply_attribute_updates(const std::unique_ptr
        attribute_collector& modified_attrs, bool& any_updates, bool& any_deletes) const {
    for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
        // Note that it.key() is the name of the column, *it is the operation
-        bytes column_name = to_bytes(it->name.GetString());
+        bytes column_name = to_bytes(rjson::to_string_view(it->name));
        const column_definition* cdef = _schema->get_column_definition(column_name);
        if (cdef && cdef->is_primary_key()) {
-            throw api_error::validation(format("UpdateItem cannot update key column {}", it->name.GetString()));
+            throw api_error::validation(format("UpdateItem cannot update key column {}", rjson::to_string_view(it->name)));
        }
-        std::string action = (it->value)["Action"].GetString();
+        std::string action = rjson::to_string((it->value)["Action"]);
        if (action == "DELETE") {
            // The DELETE operation can do two unrelated tasks. Without a
            // "Value" option, it is used to delete an attribute. With a
@@ -4464,7 +4506,7 @@ future<executor::request_return_type> executor::update_item(client_state& client
    elogger.trace("update_item {}", request);

    auto op = make_shared<update_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
-    tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
+    tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
    const bool needs_read_before_write = _proxy.data_dictionary().get_config().alternator_force_read_before_write() || op->needs_read_before_write();

    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);
@@ -4545,7 +4587,7 @@ future<executor::request_return_type> executor::get_item(client_state& client_st
    schema_ptr schema = get_table(_proxy, request);
    lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *schema);
    per_table_stats->api_operations.get_item++;
-    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
+    tracing::add_alternator_table_name(trace_state, schema->cf_name());

    rjson::value& query_key = request["Key"];
    db::consistency_level cl = get_read_consistency(request);
@@ -4694,7 +4736,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
    uint batch_size = 0;
    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
        table_requests rs(get_table_from_batch_request(_proxy, it));
-        tracing::add_table_name(trace_state, sstring(executor::KEYSPACE_NAME_PREFIX) + rs.schema->cf_name(), rs.schema->cf_name());
+        tracing::add_alternator_table_name(trace_state, rs.schema->cf_name());
        rs.cl = get_read_consistency(it->value);
        std::unordered_set<std::string> used_attribute_names;
        rs.attrs_to_get = ::make_shared<const std::optional<attrs_to_get>>(calculate_attrs_to_get(it->value, *_parsed_expression_cache, used_attribute_names));
@@ -5130,13 +5172,15 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
    }
    auto pos = paging_state.get_position_in_partition();
    if (pos.has_key()) {
-        auto exploded_ck = pos.key().explode();
-        auto exploded_ck_it = exploded_ck.begin();
-        for (const column_definition& cdef : schema.clustering_key_columns()) {
-            rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object());
-            rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
-            rjson::add_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_ck_it, cdef));
-            ++exploded_ck_it;
+        // Alternator itself allows at most one clustering key column, but a
+        // user can use the Alternator API to access system tables, which may
+        // have multiple clustering key columns, so handle that case here.
+        auto cdef_it = schema.clustering_key_columns().begin();        
+        for(const auto &exploded_ck : pos.key().explode()) {
+            rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef_it->name_as_text()), rjson::empty_object());
+            rjson::value& key_entry = last_evaluated_key[cdef_it->name_as_text()];
+            rjson::add_with_string_name(key_entry, type_to_string(cdef_it->type), json_key_column_value(exploded_ck, *cdef_it));
+            ++cdef_it;
        }
    }
    // To avoid possible conflicts (and thus having to reserve these names) we
@@ -5296,6 +5340,7 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
    elogger.trace("Scanning {}", request);

    auto [schema, table_type] = get_table_or_view(_proxy, request);
+    tracing::add_alternator_table_name(trace_state, schema->cf_name());
    get_stats_from_schema(_proxy, *schema)->api_operations.scan++;
    auto segment = get_int_attribute(request, "Segment");
    auto total_segments = get_int_attribute(request, "TotalSegments");
@@ -5438,7 +5483,7 @@ calculate_bounds_conditions(schema_ptr schema, const rjson::value& conditions) {
    std::vector<query::clustering_range> ck_bounds;

    for (auto it = conditions.MemberBegin(); it != conditions.MemberEnd(); ++it) {
-        std::string key = it->name.GetString();
+        sstring key = rjson::to_sstring(it->name);
        const rjson::value& condition = it->value;

        const rjson::value& comp_definition = rjson::get(condition, "ComparisonOperator");
@@ -5446,13 +5491,13 @@ calculate_bounds_conditions(schema_ptr schema, const rjson::value& conditions) {

        const column_definition& pk_cdef = schema->partition_key_columns().front();
        const column_definition* ck_cdef = schema->clustering_key_size() > 0 ? &schema->clustering_key_columns().front() : nullptr;
-        if (sstring(key) == pk_cdef.name_as_text()) {
+        if (key == pk_cdef.name_as_text()) {
            if (!partition_ranges.empty()) {
                throw api_error::validation("Currently only a single restriction per key is allowed");
            }
            partition_ranges.push_back(calculate_pk_bound(schema, pk_cdef, comp_definition, attr_list));
        }
-        if (ck_cdef && sstring(key) == ck_cdef->name_as_text()) {
+        if (ck_cdef && key == ck_cdef->name_as_text()) {
            if (!ck_bounds.empty()) {
                throw api_error::validation("Currently only a single restriction per key is allowed");
            }
@@ -5775,7 +5820,7 @@ future<executor::request_return_type> executor::query(client_state& client_state

    auto [schema, table_type] = get_table_or_view(_proxy, request);
    get_stats_from_schema(_proxy, *schema)->api_operations.query++;
-    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
+    tracing::add_alternator_table_name(trace_state, schema->cf_name());

    rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey");
    db::consistency_level cl = get_read_consistency(request);
@@ -5853,7 +5898,7 @@ future<executor::request_return_type> executor::list_tables(client_state& client

    rjson::value* exclusive_start_json = rjson::find(request, "ExclusiveStartTableName");
    rjson::value* limit_json = rjson::find(request, "Limit");
-    std::string exclusive_start = exclusive_start_json ? exclusive_start_json->GetString() : "";
+    std::string exclusive_start = exclusive_start_json ? rjson::to_string(*exclusive_start_json) : "";
    int limit = limit_json ? limit_json->GetInt() : 100;
    if (limit < 1 || limit > 100) {
        co_return api_error::validation("Limit must be greater than 0 and no greater than 100");
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -40,6 +40,7 @@ namespace cql3::selection {

 namespace service {
    class storage_proxy;
+    class cas_shard;
 }

 namespace cdc {
@@ -57,6 +58,7 @@ class schema_builder;
 namespace alternator {

 class rmw_operation;
+class put_or_delete_item;

 schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request);
 bool is_alternator_keyspace(const sstring& ks_name);
@@ -219,6 +221,16 @@ private:

    static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr, const std::map<sstring, sstring> *tags = nullptr);

+    future<> do_batch_write(
+        std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders,
+        service::client_state& client_state,
+        tracing::trace_state_ptr trace_state,
+        service_permit permit);
+
+    future<> cas_write(schema_ptr schema, service::cas_shard cas_shard, const dht::decorated_key& dk,
+        const std::vector<put_or_delete_item>& mutation_builders, service::client_state& client_state,
+        tracing::trace_state_ptr trace_state, service_permit permit);
+
 public:
    static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&, const std::map<sstring, sstring> *tags = nullptr);

--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -282,15 +282,23 @@ std::string type_to_string(data_type type) {
    return it->second;
 }

-bytes get_key_column_value(const rjson::value& item, const column_definition& column) {
+std::optional<bytes> try_get_key_column_value(const rjson::value& item, const column_definition& column) {
    std::string column_name = column.name_as_text();
    const rjson::value* key_typed_value = rjson::find(item, column_name);
    if (!key_typed_value) {
-        throw api_error::validation(fmt::format("Key column {} not found", column_name));
+        return std::nullopt;
    }
    return get_key_from_typed_value(*key_typed_value, column);
 }

+bytes get_key_column_value(const rjson::value& item, const column_definition& column) {
+    auto value = try_get_key_column_value(item, column);
+    if (!value) {
+        throw api_error::validation(fmt::format("Key column {} not found", column.name_as_text()));
+    }
+    return std::move(*value);
+}
+
 // Parses the JSON encoding for a key value, which is a map with a single
 // entry whose key is the type and the value is the encoded value.
 // If this type does not match the desired "type_str", an api_error::validation
@@ -380,20 +388,38 @@ clustering_key ck_from_json(const rjson::value& item, schema_ptr schema) {
        return clustering_key::make_empty();
    }
    std::vector<bytes> raw_ck;
-    // FIXME: this is a loop, but we really allow only one clustering key column.
+    // Note: it's possible to get more than one clustering column here, as
+    // Alternator can be used to read scylla internal tables.
    for (const column_definition& cdef : schema->clustering_key_columns()) {
-        bytes raw_value = get_key_column_value(item,  cdef);
+        auto raw_value = get_key_column_value(item,  cdef);
        raw_ck.push_back(std::move(raw_value));
    }

    return clustering_key::from_exploded(raw_ck);
 }

-position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema) {
-    auto ck = ck_from_json(item, schema);
-    if (is_alternator_keyspace(schema->ks_name())) {
-        return position_in_partition::for_key(std::move(ck));
+clustering_key_prefix ck_prefix_from_json(const rjson::value& item, schema_ptr schema) {
+    if (schema->clustering_key_size() == 0) {
+        return clustering_key_prefix::make_empty();
    }
+    std::vector<bytes> raw_ck;
+    for (const column_definition& cdef : schema->clustering_key_columns()) {
+        auto raw_value = try_get_key_column_value(item,  cdef);
+        if (!raw_value) {
+            break; // stop at the first clustering column that is absent from item
+        }
+        raw_ck.push_back(std::move(*raw_value));
+    }
+    // Unlike ck_from_json(), a missing column does not throw: the leading columns found so far form the prefix.
+    return clustering_key_prefix::from_exploded(raw_ck);
+}
+
+position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema) {
+    const bool is_alternator_ks = is_alternator_keyspace(schema->ks_name());
+    if (is_alternator_ks) {
+        return position_in_partition::for_key(ck_from_json(item, schema));
+    }
+    
    const auto region_item = rjson::find(item, scylla_paging_region);
    const auto weight_item = rjson::find(item, scylla_paging_weight);
    if (bool(region_item) != bool(weight_item)) {
@@ -413,8 +439,9 @@ position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema)
        } else {
            throw std::runtime_error(fmt::format("Invalid value for weight: {}", weight_view));
        }
-        return position_in_partition(region, weight, region == partition_region::clustered ? std::optional(std::move(ck)) : std::nullopt);
+        return position_in_partition(region, weight, region == partition_region::clustered ? std::optional(ck_prefix_from_json(item, schema)) : std::nullopt);
    }
+    auto ck = ck_from_json(item, schema);
    if (ck.is_empty()) {
        return position_in_partition::for_partition_start();
    }
@@ -469,7 +496,7 @@ const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value&
        return {"", nullptr};
    }
    auto it = v.MemberBegin();
-    const std::string it_key = it->name.GetString();
+    const std::string it_key = rjson::to_string(it->name);
    if (it_key != "SS" && it_key != "BS" && it_key != "NS") {
        return {std::move(it_key), nullptr};
    }
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -13,6 +13,7 @@
 #include <seastar/http/function_handlers.hh>
 #include <seastar/http/short_streams.hh>
 #include <seastar/core/coroutine.hh>
+#include <seastar/coroutine/maybe_yield.hh>
 #include <seastar/util/defer.hh>
 #include <seastar/util/short_streams.hh>
 #include "seastarx.hh"
@@ -32,6 +33,7 @@
 #include "utils/aws_sigv4.hh"
 #include "client_data.hh"
 #include "utils/updateable_value.hh"
+#include <zlib.h>

 static logging::logger slogger("alternator-server");

@@ -551,6 +553,106 @@ read_entire_stream(input_stream<char>& inp, size_t length_limit) {
    co_return ret;
 }

+// safe_gzip_zstream is an exception-safe wrapper for zlib's z_stream.
+// The "z_stream" struct is used by zlib to hold state while decompressing a
+// stream of data. It allocates memory which must be freed with inflateEnd(),
+// which the destructor of this class does.
+class safe_gzip_zstream {
+    z_stream _zs;
+public:
+    safe_gzip_zstream() {
+        memset(&_zs, 0, sizeof(_zs));
+        // The strange 16 + MAX_WBITS tells zlib to expect and decode
+        // a gzip header, not a zlib header.
+        if (inflateInit2(&_zs, 16 + MAX_WBITS) != Z_OK) {
+            // Should only happen if memory allocation fails
+            throw std::bad_alloc();
+        }
+    }
+    ~safe_gzip_zstream() {
+        inflateEnd(&_zs);
+    }
+    // Non-copyable: a copy would share zlib's internal state with the
+    // original, and both destructors would call inflateEnd() on it.
+    safe_gzip_zstream(const safe_gzip_zstream&) = delete;
+    safe_gzip_zstream& operator=(const safe_gzip_zstream&) = delete;
+    // Access to the wrapped z_stream, e.g. for setting next_in/avail_in.
+    z_stream* operator->() { return &_zs; }
+    z_stream* get() { return &_zs; }
+    // Make the stream ready to decode another gzip member from scratch.
+    void reset() { inflateReset(&_zs); }
+};
+
+// ungzip() takes a chunked_content with a gzip-compressed request body,
+// uncompresses it, and returns the uncompressed content as a chunked_content.
+// If the uncompressed content exceeds length_limit, an error is thrown.
+static future<chunked_content>
+ungzip(chunked_content&& compressed_body, size_t length_limit) {
+    chunked_content ret;
+    // output_buf can be any size - when uncompressing input_buf, it doesn't
+    // need to fit in a single output_buf, we'll use multiple output_buf for
+    // a single input_buf if needed.
+    constexpr size_t OUTPUT_BUF_SIZE = 4096;
+    temporary_buffer<char> output_buf;
+    safe_gzip_zstream strm;
+    bool complete_stream = false; // empty input is not a valid gzip
+    size_t total_out_bytes = 0;
+    for (const temporary_buffer<char>& input_buf : compressed_body) {
+        if (input_buf.empty()) {
+            continue;
+        }
+        complete_stream = false; // more input arrived, so an earlier Z_STREAM_END was not the end of the body
+        strm->next_in = (Bytef*) input_buf.get();
+        strm->avail_in = (uInt) input_buf.size();
+        do {
+            co_await coroutine::maybe_yield();
+            if (output_buf.empty()) {
+                output_buf = temporary_buffer<char>(OUTPUT_BUF_SIZE);
+            }
+            strm->next_out = (Bytef*) output_buf.get();
+            strm->avail_out = OUTPUT_BUF_SIZE;
+            int e = inflate(strm.get(), Z_NO_FLUSH);
+            size_t out_bytes = OUTPUT_BUF_SIZE - strm->avail_out;
+            if (out_bytes > 0) {
+                // If output_buf is nearly full, we save it as-is in ret. But
+                // if it only has little data, better copy to a small buffer.
+                if (out_bytes > OUTPUT_BUF_SIZE/2) {
+                    ret.push_back(std::move(output_buf).prefix(out_bytes));
+                    // output_buf is now empty. if this loop finds more input,
+                    // we'll allocate a new output buffer.
+                } else {
+                    ret.push_back(temporary_buffer<char>(output_buf.get(), out_bytes));
+                }
+                total_out_bytes += out_bytes;
+                if (total_out_bytes > length_limit) {
+                    throw api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", length_limit));
+                }
+            }
+            if (e == Z_STREAM_END) {
+                // There may be more input after the first gzip stream - in
+                // either this input_buf or the next one. The additional input
+                // should be a second concatenated gzip. We need to allow that
+                // by resetting the gzip stream and continuing the input loop
+                // until there's no more input.
+                strm.reset();
+                if (strm->avail_in == 0) {
+                    complete_stream = true;
+                    break;
+                }
+            } else if (e != Z_OK && e != Z_BUF_ERROR) {
+                // DynamoDB returns an InternalServerError when given a bad
+                // gzip request body. See test test_broken_gzip_content
+                throw api_error::internal("Error during gzip decompression of request body");
+            }
+        } while (strm->avail_in > 0 || strm->avail_out == 0); // avail_out == 0: output_buf filled up, zlib may hold more output
+    }
+    if (!complete_stream) {
+        // The gzip stream was not properly finished with Z_STREAM_END
+        throw api_error::internal("Truncated gzip in request body");
+    }
+    co_return ret;
+}
+
 future<executor::request_return_type> server::handle_api_request(std::unique_ptr<request> req) {
    _executor._stats.total_operations++;
    sstring target = req->get_header("X-Amz-Target");
@@ -588,6 +690,21 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
        units.return_units(mem_estimate - new_mem_estimate);
    }
    auto username = co_await verify_signature(*req, content);
+    // If the request is compressed, uncompress it now, after we checked
+    // the signature (the signature is computed on the compressed content).
+    // We apply the request_content_length_limit again to the uncompressed
+    // content - we don't want to allow a tiny compressed request to
+    // expand to a huge uncompressed request.
+    sstring content_encoding = req->get_header("Content-Encoding");
+    if (content_encoding == "gzip") {
+        content = co_await ungzip(std::move(content), request_content_length_limit);
+    } else if (!content_encoding.empty()) {
+        // DynamoDB returns a 500 error for unsupported Content-Encoding.
+        // I'm not sure if this is the best error code, but let's do it too.
+        // See the test test_garbage_content_encoding confirming this case.
+        co_return api_error::internal("Unsupported Content-Encoding");
+    }
+
    // As long as the system_clients_entry object is alive, this request will
    // be visible in the "system.clients" virtual table. When requested, this
    // entry will be formatted by server::ongoing_request::make_client_data().
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -93,7 +93,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
    if (v->GetStringLength() < 1 || v->GetStringLength() > 255) {
        co_return api_error::validation("The length of AttributeName must be between 1 and 255");
    }
-    sstring attribute_name(v->GetString(), v->GetStringLength());
+    sstring attribute_name = rjson::to_sstring(*v);

    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::ALTER, _stats);
    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [&](std::map<sstring, sstring>& tags_map) {
--- a/api/CMakeLists.txt
+++ b/api/CMakeLists.txt
@@ -31,6 +31,7 @@ set(swagger_files
  api-doc/column_family.json
  api-doc/commitlog.json
  api-doc/compaction_manager.json
+  api-doc/client_routes.json
  api-doc/config.json
  api-doc/cql_server_test.json
  api-doc/endpoint_snitch_info.json
@@ -68,6 +69,7 @@ target_sources(api
  PRIVATE
    api.cc
    cache_service.cc
+    client_routes.cc
    collectd.cc
    column_family.cc
    commitlog.cc
@@ -106,5 +108,8 @@ target_link_libraries(api
    wasmtime_bindings
    absl::headers)

+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(api REUSE_FROM scylla-precompiled-header)
+endif()
 check_headers(check-headers api
  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
--- a/api/api-doc/client_routes.def.json
+++ b/api/api-doc/client_routes.def.json
@@ -0,0 +1,23 @@
+    , "client_routes_entry": {
+        "id": "client_routes_entry",
+        "summary": "An entry storing client routes",
+        "properties": {
+            "connection_id": {"type": "string"},
+            "host_id": {"type": "string", "format": "uuid"},
+            "address": {"type": "string"},
+            "port": {"type": "integer"},
+            "tls_port": {"type": "integer"},
+            "alternator_port": {"type": "integer"},
+            "alternator_https_port": {"type": "integer"}
+        },
+        "required": ["connection_id", "host_id", "address"]
+    }
+    , "client_routes_key": {
+        "id": "client_routes_key",
+        "summary": "A key of client_routes_entry",
+        "properties": {
+            "connection_id": {"type": "string"},
+            "host_id": {"type": "string", "format": "uuid"}
+        }
+    }
+
--- a/api/api-doc/client_routes.json
+++ b/api/api-doc/client_routes.json
@@ -0,0 +1,74 @@
+    , "/v2/client-routes":{
+        "get": {
+            "description":"List all client route entries",
+            "operationId":"get_client_routes",
+            "tags":["client_routes"],
+            "produces":[
+                "application/json"
+            ],
+            "parameters":[],
+            "responses":{
+                "200":{
+                    "schema":{
+                        "type":"array",
+                        "items":{ "$ref":"#/definitions/client_routes_entry" }
+                    }
+                },
+                "default":{
+                    "description":"unexpected error",
+                    "schema":{"$ref":"#/definitions/ErrorModel"}
+                }
+            }
+        },
+        "post": {
+            "description":"Upsert one or more client route entries",
+            "operationId":"set_client_routes",
+            "tags":["client_routes"],
+            "parameters":[
+                {
+                    "name":"body",
+                    "in":"body",
+                    "required":true,
+                    "schema":{
+                        "type":"array",
+                        "items":{ "$ref":"#/definitions/client_routes_entry" }
+                    }
+                }
+            ],
+            "responses":{
+                "200":{ "description": "OK" },
+                "default":{
+                    "description":"unexpected error",
+                    "schema":{ "$ref":"#/definitions/ErrorModel" }
+                }
+            }
+        },
+        "delete": {
+            "description":"Delete one or more client route entries",
+            "operationId":"delete_client_routes",
+            "tags":["client_routes"],
+            "parameters":[
+                {
+                    "name":"body",
+                    "in":"body",
+                    "required":true,
+                    "schema":{
+                        "type":"array",
+                        "items":{ "$ref":"#/definitions/client_routes_key" }
+                    }
+                }
+            ],
+            "responses":{
+                "200":{
+                    "description": "OK"
+                },
+                "default":{
+                    "description":"unexpected error",
+                    "schema":{
+                        "$ref":"#/definitions/ErrorModel"
+                    }
+                }
+            }
+        }
+    }
+
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -3051,7 +3051,7 @@
                  },
                  {
                     "name":"incremental_mode",
-                     "description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to incremental mode.",
+                     "description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to 'disabled' mode.",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
--- a/api/api.cc
+++ b/api/api.cc
@@ -37,6 +37,7 @@
 #include "raft.hh"
 #include "gms/gossip_address_map.hh"
 #include "service_levels.hh"
+#include "client_routes.hh"

 logging::logger apilog("api");

@@ -67,9 +68,11 @@ future<> set_server_init(http_context& ctx) {
        rb02->set_api_doc(r);
        rb02->register_api_file(r, "swagger20_header");
        rb02->register_api_file(r, "metrics");
+        rb02->register_api_file(r, "client_routes");
        rb->register_function(r, "system",
                "The system related API");
        rb02->add_definitions_file(r, "metrics");
+        rb02->add_definitions_file(r, "client_routes");
        set_system(ctx, r);
        rb->register_function(r, "error_injection",
            "The error injection API");
@@ -129,6 +132,16 @@ future<> unset_server_storage_service(http_context& ctx) {
    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_storage_service(ctx, r); });
 }

+// Installs the client-routes REST handlers (see set_client_routes()).
+future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr) {
+    return ctx.http_server.set_routes([&ctx, &cr] (routes& r) { set_client_routes(ctx, r, cr); });
+}
+
+// Removes the client-routes REST handlers (see unset_client_routes()).
+future<> unset_server_client_routes(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_client_routes(ctx, r); });
+}
+
 future<> set_load_meter(http_context& ctx, service::load_meter& lm) {
    return ctx.http_server.set_routes([&ctx, &lm] (routes& r) { set_load_meter(ctx, r, lm); });
 }
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -29,6 +29,7 @@ class storage_proxy;
 class storage_service;
 class raft_group0_client;
 class raft_group_registry;
+class client_routes_service;

 } // namespace service

@@ -99,6 +100,8 @@ future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snit
 future<> unset_server_snitch(http_context& ctx);
 future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
 future<> unset_server_storage_service(http_context& ctx);
+future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr);
+future<> unset_server_client_routes(http_context& ctx);
 future<> set_server_sstables_loader(http_context& ctx, sharded<sstables_loader>& sst_loader);
 future<> unset_server_sstables_loader(http_context& ctx);
 future<> set_server_view_builder(http_context& ctx, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g);
--- a/api/client_routes.cc
+++ b/api/client_routes.cc
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ *
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+ #include <seastar/http/short_streams.hh>
+
+#include "client_routes.hh"
+#include "api/api.hh"
+#include "service/storage_service.hh"
+#include "service/client_routes.hh"
+#include "utils/rjson.hh"
+
+
+#include "api/api-doc/client_routes.json.hh"
+
+using namespace seastar::httpd;
+using namespace std::chrono_literals;
+using namespace json;
+
+extern logging::logger apilog;
+
+namespace api {
+
+static void validate_client_routes_endpoint(sharded<service::client_routes_service>& cr, sstring endpoint_name) {
+    // Nothing to do once the client_routes cluster feature is enabled; otherwise log and refuse.
+    if (cr.local().get_feature_service().client_routes) { return; }
+    apilog.warn("{}: called before the cluster feature was enabled", endpoint_name);
+    throw std::runtime_error(fmt::format("{} requires all nodes to support the CLIENT_ROUTES cluster feature", endpoint_name));
+}
+
+static sstring parse_string(const char* name, rapidjson::Value const& v) {
+    const auto member = v.FindMember(name);
+    if (member == v.MemberEnd()) {
+        throw bad_param_exception(fmt::format("Missing '{}'", name));
+    } else if (!member->value.IsString()) {
+        throw bad_param_exception(fmt::format("'{}' must be a string", name));
+    }
+    // Build from pointer + explicit length instead of relying on NUL termination.
+    return sstring(member->value.GetString(), member->value.GetStringLength());
+}
+
+static std::optional<uint32_t> parse_port(const char* name, rapidjson::Value const& v) {
+    const auto member = v.FindMember(name);
+    if (member == v.MemberEnd()) {
+        return std::nullopt; // an absent port field is not an error here
+    }
+    if (!member->value.IsInt()) {
+        throw bad_param_exception(fmt::format("'{}' must be an integer", name));
+    }
+    const auto port = member->value.GetInt();
+    if (!(port >= 1 && port <= 65535)) {
+        throw bad_param_exception(fmt::format("'{}' value={} is outside the allowed port range", name, port));
+    }
+    return port;
+}
+
+static std::vector<service::client_routes_service::client_route_entry> parse_set_client_array(const rapidjson::Document& root) {
+    if (!root.IsArray()) {
+        throw bad_param_exception("Body must be a JSON array");
+    }
+    const auto items = root.GetArray();
+    std::vector<service::client_routes_service::client_route_entry> routes;
+    routes.reserve(items.Size());
+    for (const auto& item : items) {
+        if (!item.IsObject()) {
+            throw bad_param_exception("Each element must be object");
+        }
+        // Each port is optional on its own, but an entry without any port is rejected.
+        const auto port = parse_port("port", item);
+        const auto tls_port = parse_port("tls_port", item);
+        const auto alternator_port = parse_port("alternator_port", item);
+        const auto alternator_https_port = parse_port("alternator_https_port", item);
+        const bool any_port = port || tls_port || alternator_port || alternator_https_port;
+        if (!any_port) {
+            throw bad_param_exception("At least one port field ('port', 'tls_port', 'alternator_port', 'alternator_https_port') must be specified");
+        }
+        routes.emplace_back(
+            parse_string("connection_id", item),
+            utils::UUID{parse_string("host_id", item)},
+            parse_string("address", item),
+            port,
+            tls_port,
+            alternator_port,
+            alternator_https_port
+        );
+    }
+    return routes;
+}
+
+static
+future<json::json_return_type>
+rest_set_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
+    validate_client_routes_endpoint(cr, "set_client_routes");
+    // Read the whole request body and decode it as a JSON array of route entries.
+    rapidjson::Document root;
+    auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
+    root.Parse(content.c_str());
+    const auto route_entries = parse_set_client_array(root);
+    // Hand the validated entries to the local client_routes_service instance.
+    co_await cr.local().set_client_routes(route_entries);
+    co_return seastar::json::json_void();
+}
+
+static std::vector<service::client_routes_service::client_route_key> parse_delete_client_array(const rapidjson::Document& root) {
+    if (!root.IsArray()) {
+        throw bad_param_exception("Body must be a JSON array");
+    }
+    std::vector<service::client_routes_service::client_route_key> v;
+    v.reserve(root.GetArray().Size());
+    for (const auto& element : root.GetArray()) {
+        // Reject non-object elements before looking up members, as parse_set_client_array() does.
+        if (!element.IsObject()) { throw bad_param_exception("Each element must be object"); }
+        v.emplace_back(
+            parse_string("connection_id", element),
+            utils::UUID{parse_string("host_id", element)}
+        );
+    }
+    return v;
+}
+
+static
+future<json::json_return_type>
+rest_delete_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
+    validate_client_routes_endpoint(cr, "delete_client_routes");
+    // Read the whole request body, then decode it as a JSON array of route keys.
+    const auto body = co_await util::read_entire_stream_contiguous(*req->content_stream);
+    rapidjson::Document doc;
+    doc.Parse(body.c_str());
+    const auto keys_to_delete = parse_delete_client_array(doc);
+
+    co_await cr.local().delete_client_routes(keys_to_delete);
+    co_return seastar::json::json_void();
+}
+
+static
+future<json::json_return_type>
+rest_get_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr, std::unique_ptr<http::request> req) {
+    validate_client_routes_endpoint(cr, "get_client_routes");
+    // The listing is produced on shard 0; each entry is converted to its JSON form while streaming.
+    co_return co_await cr.invoke_on(0, [] (service::client_routes_service& svc) -> future<json::json_return_type> {
+        co_return json::json_return_type(stream_range_as_array(co_await svc.get_client_routes(), [] (const service::client_routes_service::client_route_entry& route) {
+            seastar::httpd::client_routes_json::client_routes_entry out;
+            out.connection_id = route.connection_id;
+            out.host_id = fmt::to_string(route.host_id);
+            out.address = route.address;
+            if (route.port) { out.port = *route.port; }
+            if (route.tls_port) { out.tls_port = *route.tls_port; }
+            if (route.alternator_port) { out.alternator_port = *route.alternator_port; }
+            if (route.alternator_https_port) { out.alternator_https_port = *route.alternator_https_port; }
+            return out;
+        }));
+    });
+}
+
+void set_client_routes(http_context& ctx, routes& r, sharded<service::client_routes_service>& cr) {
+    seastar::httpd::client_routes_json::set_client_routes.set(r, [&ctx, &cr] (std::unique_ptr<seastar::http::request> req) { // NOTE(review): handlers capture ctx and cr by reference - presumably both outlive the routes; confirm
+        return rest_set_client_routes(ctx, cr, std::move(req));
+    });
+    seastar::httpd::client_routes_json::delete_client_routes.set(r, [&ctx, &cr] (std::unique_ptr<seastar::http::request> req) {
+        return rest_delete_client_routes(ctx, cr, std::move(req));
+    });
+    seastar::httpd::client_routes_json::get_client_routes.set(r, [&ctx, &cr] (std::unique_ptr<seastar::http::request> req) {
+        return rest_get_client_routes(ctx, cr, std::move(req));
+    });
+}
+// Unsets the same three operations that set_client_routes() binds to handlers.
+void unset_client_routes(http_context& ctx, routes& r) {
+    seastar::httpd::client_routes_json::set_client_routes.unset(r);
+    seastar::httpd::client_routes_json::delete_client_routes.unset(r);
+    seastar::httpd::client_routes_json::get_client_routes.unset(r);
+}
+
+}
--- a/api/client_routes.hh
+++ b/api/client_routes.hh
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ *
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+#pragma once
+
+#include <seastar/core/sharded.hh>
+#include <seastar/json/json_elements.hh>
+#include "api/api_init.hh"
+
+namespace api {
+
+void set_client_routes(http_context& ctx, httpd::routes& r, sharded<service::client_routes_service>& cr);
+void unset_client_routes(http_context& ctx, httpd::routes& r);
+
+}
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -66,6 +66,13 @@ static future<json::json_return_type>  get_cf_stats(sharded<replica::database>&
    }, std::plus<int64_t>());
 }

+// Overload taking an arbitrary extractor: applies f to each table's stats via
+// map_reduce_cf() and combines the results with addition, starting from 0.
+static future<json::json_return_type>  get_cf_stats(sharded<replica::database>& db, std::function<int64_t(const replica::column_family_stats&)> f) {
+    auto per_table = [f] (const replica::column_family& table) { return f(table.get_stats()); };
+    return map_reduce_cf(db, int64_t(0), std::move(per_table), std::plus<int64_t>());
+}
+
 static future<json::json_return_type> for_tables_on_all_shards(sharded<replica::database>& db, std::vector<table_info> tables, std::function<future<>(replica::table&)> set) {
    return do_with(std::move(tables), [&db, set] (const std::vector<table_info>& tables) {
        return db.invoke_on_all([&tables, set] (replica::database& db) {
@@ -1066,10 +1073,14 @@ void set_column_family(http_context& ctx, routes& r, sharded<replica::database>&
    });

    ss::get_load.set(r, [&db] (std::unique_ptr<http::request> req) {
-        return get_cf_stats(db, &replica::column_family_stats::live_disk_space_used);
+        return get_cf_stats(db, [](const replica::column_family_stats& stats) {
+            return stats.live_disk_space_used.on_disk;
+        });
    });
    ss::get_metrics_load.set(r, [&db] (std::unique_ptr<http::request> req) {
-        return get_cf_stats(db, &replica::column_family_stats::live_disk_space_used);
+        return get_cf_stats(db, [](const replica::column_family_stats& stats) {
+            return stats.live_disk_space_used.on_disk;
+        });
    });

    ss::get_keyspaces.set(r, [&db] (const_req req) {
--- a/audit/CMakeLists.txt
+++ b/audit/CMakeLists.txt
@@ -17,4 +17,7 @@ target_link_libraries(scylla_audit
  PRIVATE
    cql3)

+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(scylla_audit REUSE_FROM scylla-precompiled-header)
+endif()
 add_whole_archive(audit scylla_audit)
--- a/auth/CMakeLists.txt
+++ b/auth/CMakeLists.txt
@@ -9,6 +9,7 @@ target_sources(scylla_auth
    allow_all_authorizer.cc
    authenticated_user.cc
    authenticator.cc
+    cache.cc
    certificate_authenticator.cc
    common.cc
    default_authorizer.cc
@@ -44,5 +45,8 @@ target_link_libraries(scylla_auth

 add_whole_archive(auth scylla_auth)

+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(scylla_auth REUSE_FROM scylla-precompiled-header)
+endif()
 check_headers(check-headers scylla_auth
  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
--- a/auth/allow_all_authenticator.cc
+++ b/auth/allow_all_authenticator.cc
@@ -9,7 +9,6 @@
 #include "auth/allow_all_authenticator.hh"

 #include "service/migration_manager.hh"
-#include "utils/alien_worker.hh"
 #include "utils/class_registrator.hh"

 namespace auth {
@@ -23,6 +22,6 @@ static const class_registrator<
        cql3::query_processor&,
        ::service::raft_group0_client&,
        ::service::migration_manager&,
-        utils::alien_worker&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");
+        cache&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");

 }
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -12,8 +12,8 @@

 #include "auth/authenticated_user.hh"
 #include "auth/authenticator.hh"
+#include "auth/cache.hh"
 #include "auth/common.hh"
-#include "utils/alien_worker.hh"

 namespace cql3 {
 class query_processor;
@@ -29,7 +29,7 @@ extern const std::string_view allow_all_authenticator_name;

 class allow_all_authenticator final : public authenticator {
 public:
-    allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&) {
+    allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&) {
    }

    virtual future<> start() override {
--- a/auth/cache.cc
+++ b/auth/cache.cc
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#include "auth/cache.hh"
+#include "auth/common.hh"
+#include "auth/roles-metadata.hh"
+#include "cql3/query_processor.hh"
+#include "cql3/untyped_result_set.hh"
+#include "db/consistency_level_type.hh"
+#include "db/system_keyspace.hh"
+#include "schema/schema.hh"
+#include <iterator>
+#include <seastar/coroutine/maybe_yield.hh>
+#include <seastar/core/format.hh>
+
+namespace auth {
+
+static logging::logger logger("auth-cache"); // file-local, like the static loggers in default_authorizer.cc and standard_role_manager.cc
+
+cache::cache(cql3::query_processor& qp) noexcept
+    : _current_version(0)
+    , _qp(qp) {
+}
+
+lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
+    // Lookup only: an unknown role produces an empty pointer, nothing is fetched.
+    if (const auto pos = _roles.find(role); pos != _roles.end()) {
+        return pos->second;
+    }
+    return {};
+}
+
+future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& role) const { // Builds a fresh record for `role` from four internal queries; resolves to nullptr when the roles query returns no row.
+    auto rec = make_lw_shared<role_record>();
+    rec->version = _current_version; // tag with the current version so prune_all() keeps this record
+
+    auto fetch = [this, &role](const sstring& q) { // runs `q` with `role` as its single bound value
+        return _qp.execute_internal(q, db::consistency_level::LOCAL_ONE,
+                internal_distributed_query_state(), {role},
+                cql3::query_processor::cache_internal::yes);
+    };
+    // roles: login/superuser flags, salted hash and the member_of set
+    {
+        static const sstring q = format("SELECT * FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, meta::roles_table::name);
+        auto rs = co_await fetch(q);
+        if (!rs->empty()) {
+            auto& r = rs->one();
+            rec->is_superuser = r.get_or<bool>("is_superuser", false);
+            rec->can_login = r.get_or<bool>("can_login", false);
+            rec->salted_hash = r.get_or<sstring>("salted_hash", "");
+            if (r.has("member_of")) {
+                auto mo = r.get_set<sstring>("member_of");
+                rec->member_of.insert(
+                        std::make_move_iterator(mo.begin()),
+                        std::make_move_iterator(mo.end()));
+            }
+        } else {
+            // role got deleted
+            co_return nullptr;
+        }
+    }
+    // members: every returned row contributes its `member` value
+    {
+        static const sstring q = format("SELECT role, member FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, ROLE_MEMBERS_CF);
+        auto rs = co_await fetch(q);
+        for (const auto& r : *rs) {
+            rec->members.insert(r.get_as<sstring>("member"));
+            co_await coroutine::maybe_yield();
+        }
+    }
+    // attributes: name -> value pairs, a later row with the same name overwrites the earlier one
+    {
+        static const sstring q = format("SELECT role, name, value FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, ROLE_ATTRIBUTES_CF);
+        auto rs = co_await fetch(q);
+        for (const auto& r : *rs) {
+            rec->attributes[r.get_as<sstring>("name")] =
+                    r.get_as<sstring>("value");
+            co_await coroutine::maybe_yield();
+        }
+    }
+    // permissions: per-resource string sets converted with permissions::from_strings
+    {
+        static const sstring q = format("SELECT role, resource, permissions FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, PERMISSIONS_CF);
+        auto rs = co_await fetch(q);
+        for (const auto& r : *rs) {
+            auto resource = r.get_as<sstring>("resource");
+            auto perms_strings = r.get_set<sstring>("permissions");
+            std::unordered_set<sstring> perms_set(perms_strings.begin(), perms_strings.end());
+            auto pset = permissions::from_strings(perms_set);
+            rec->permissions[std::move(resource)] = std::move(pset);
+            co_await coroutine::maybe_yield();
+        }
+    }
+    co_return rec;
+}
+
+future<> cache::prune_all() noexcept { // Erases every cached role whose version tag differs from _current_version.
+    for (auto it = _roles.begin(); it != _roles.end(); ) {
+        if (it->second->version != _current_version) {
+            _roles.erase(it++); // post-increment: the loop iterator moves on before the entry is erased
+            co_await coroutine::maybe_yield(); // NOTE(review): presumes no other task mutates _roles while suspended here — confirm
+        } else {
+            ++it;
+        }
+    }
+    co_return;
+}
+
+future<> cache::load_all() { // Reloads every role on this shard (asserted to be shard 0), then updates all other shards.
+    if (legacy_mode(_qp)) { // nothing is loaded while legacy_mode() reports true
+        co_return;
+    }
+    SCYLLA_ASSERT(this_shard_id() == 0);
+    ++_current_version; // records fetched below carry the new tag; records still on the old tag are pruned afterwards
+
+    logger.info("Loading all roles");
+    const uint32_t page_size = 128;
+    auto loader = [this](const cql3::untyped_result_set::row& r) -> future<stop_iteration> { // per-row callback: fetch and store one role
+        const auto name = r.get_as<sstring>("role");
+        auto role = co_await fetch_role(name);
+        if (role) {
+            _roles[name] = role;
+        }
+        co_return stop_iteration::no;
+    };
+    co_await _qp.query_internal(format("SELECT * FROM {}.{}",
+            db::system_keyspace::NAME, meta::roles_table::name),
+            db::consistency_level::LOCAL_ONE, {}, page_size, loader);
+
+    co_await prune_all(); // drop local records that were not refreshed in this pass
+    for (const auto& [name, role] : _roles) {
+        co_await distribute_role(name, role); // copy each remaining record to the other shards
+    }
+    co_await container().invoke_on_others([this](cache& c) -> future<> {
+        c._current_version = _current_version; // align the peer's tag so its prune removes records left on an older tag
+        co_await c.prune_all();
+    });
+}
+
+future<> cache::load_roles(std::unordered_set<role_name_t> roles) { // Refreshes the named roles locally, then mirrors each result to the other shards.
+    if (legacy_mode(_qp)) {
+        co_return;
+    }
+    for (const auto& role_name : roles) {
+        logger.info("Loading role {}", role_name);
+        auto record = co_await fetch_role(role_name);
+        if (!record) {
+            _roles.erase(role_name); // fetch_role found no row for this role
+        } else {
+            _roles[role_name] = record;
+        }
+        co_await distribute_role(role_name, record); // a null record erases the role on the other shards as well
+    }
+}
+
+future<> cache::distribute_role(const role_name_t& name, lw_shared_ptr<role_record> role) { // Mirrors one role (or its removal, when `role` is null) onto every other shard.
+    auto role_ptr = role.get(); // raw pointer captured below; only dereferenced to copy-construct a new record
+    co_await container().invoke_on_others([&name, role_ptr](cache& c) {
+        if (!role_ptr) {
+            c._roles.erase(name);
+            return;
+        }
+        auto role_copy = make_lw_shared<role_record>(*role_ptr); // every target instance stores its own copy
+        c._roles[name] = std::move(role_copy);
+    });
+}
+
+bool cache::includes_table(const table_id& id) noexcept { // true for the roles, role_members, role_attributes and role_permissions system tables
+    const bool roles_or_members = id == db::system_keyspace::roles()->id()
+            || id == db::system_keyspace::role_members()->id();
+    return roles_or_members || id == db::system_keyspace::role_attributes()->id()
+            || id == db::system_keyspace::role_permissions()->id();
+}
+
+} // namespace auth
--- a/auth/cache.hh
+++ b/auth/cache.hh
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <unordered_set>
+#include <unordered_map>
+
+#include <seastar/core/sstring.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/sharded.hh>
+#include <seastar/core/shared_ptr.hh>
+
+#include <absl/container/flat_hash_map.h>
+
+#include "auth/permission.hh"
+#include "auth/common.hh"
+
+namespace cql3 { class query_processor; }
+
+namespace auth {
+
+class cache : public peering_sharded_service<cache> { // Sharded in-memory store of role records read from the auth system tables.
+public:
+    using role_name_t = sstring;
+    using version_tag_t = char; // tag incremented by load_all() on every full reload
+
+    struct role_record {
+        bool can_login = false;
+        bool is_superuser = false;
+        std::unordered_set<role_name_t> member_of;
+        std::unordered_set<role_name_t> members;
+        sstring salted_hash;
+        std::unordered_map<sstring, sstring> attributes;
+        std::unordered_map<sstring, permission_set> permissions;
+        version_tag_t version = 0; // used for seamless cache reloads
+    };
+
+    explicit cache(cql3::query_processor& qp) noexcept;
+    lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept; // empty pointer when the role is not cached
+    future<> load_all(); // reloads all roles (asserts it runs on shard 0) and updates every other shard
+    future<> load_roles(std::unordered_set<role_name_t> roles); // refreshes only the given roles, here and on the other shards
+    static bool includes_table(const table_id&) noexcept; // whether the id belongs to one of the four role tables
+
+private:
+    using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
+    roles_map _roles;
+    version_tag_t _current_version;
+    cql3::query_processor& _qp;
+
+    future<lw_shared_ptr<role_record>> fetch_role(const role_name_t& role) const; // reads one role; null when it has no row
+    future<> prune_all() noexcept; // erases records whose version differs from _current_version
+    future<> distribute_role(const role_name_t& name, const lw_shared_ptr<role_record> role); // copies (or erases, when null) one role on all other shards
+};
+
+} // namespace auth
--- a/auth/certificate_authenticator.cc
+++ b/auth/certificate_authenticator.cc
@@ -8,6 +8,7 @@
 */

 #include "auth/certificate_authenticator.hh"
+#include "auth/cache.hh"

 #include <boost/regex.hpp>
 #include <fmt/ranges.h>
@@ -34,13 +35,13 @@ static const class_registrator<auth::authenticator
    , cql3::query_processor&
    , ::service::raft_group0_client&
    , ::service::migration_manager&
-    , utils::alien_worker&> cert_auth_reg(CERT_AUTH_NAME);
+    , auth::cache&> cert_auth_reg(CERT_AUTH_NAME);

 enum class auth::certificate_authenticator::query_source {
    subject, altname
 };

-auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&)
+auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, auth::cache&)
    : _queries([&] {
        auto& conf = qp.db().get_config();
        auto queries = conf.auth_certificate_role_queries();
@@ -75,9 +76,9 @@ auth::certificate_authenticator::certificate_authenticator(cql3::query_processor
                        throw std::invalid_argument(fmt::format("Invalid source: {}", map.at(cfg_source_attr)));
                    }
                    continue;
-                } catch (std::out_of_range&) {
+                } catch (const std::out_of_range&) {
                    // just fallthrough
-                } catch (boost::regex_error&) {
+                } catch (const boost::regex_error&) {
                    std::throw_with_nested(std::invalid_argument(fmt::format("Invalid query expression: {}", map.at(cfg_query_attr))));
                }
            }
--- a/auth/certificate_authenticator.hh
+++ b/auth/certificate_authenticator.hh
@@ -10,7 +10,6 @@
 #pragma once

 #include "auth/authenticator.hh"
-#include "utils/alien_worker.hh"
 #include <boost/regex_fwd.hpp>  // IWYU pragma: keep

 namespace cql3 {
@@ -26,13 +25,15 @@ class raft_group0_client;

 namespace auth {

+class cache;
+
 extern const std::string_view certificate_authenticator_name;

 class certificate_authenticator : public authenticator {
    enum class query_source;
    std::vector<std::pair<query_source, boost::regex>> _queries;
 public:
-    certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);
+    certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);
    ~certificate_authenticator();

    future<> start() override;
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -94,7 +94,7 @@ static future<> create_legacy_metadata_table_if_missing_impl(
        try {
            co_return co_await mm.announce(co_await ::service::prepare_new_column_family_announcement(qp.proxy(), table, ts),
                    std::move(group0_guard), format("auth: create {} metadata table", table->cf_name()));
-        } catch (exceptions::already_exists_exception&) {}
+        } catch (const exceptions::already_exists_exception&) {}
    }
 }

--- a/auth/common.hh
+++ b/auth/common.hh
@@ -48,6 +48,10 @@ extern constinit const std::string_view AUTH_PACKAGE_NAME;

 } // namespace meta

+inline constexpr std::string_view PERMISSIONS_CF = "role_permissions"; // table queried for per-role permissions
+inline constexpr std::string_view ROLE_MEMBERS_CF = "role_members"; // table queried for the members of a role
+inline constexpr std::string_view ROLE_ATTRIBUTES_CF = "role_attributes"; // table queried for role name/value attributes
+
 // This is a helper to check whether auth-v2 is on.
 bool legacy_mode(cql3::query_processor& qp);

--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -37,7 +37,6 @@ std::string_view default_authorizer::qualified_java_name() const {
 static constexpr std::string_view ROLE_NAME = "role";
 static constexpr std::string_view RESOURCE_NAME = "resource";
 static constexpr std::string_view PERMISSIONS_NAME = "permissions";
-static constexpr std::string_view PERMISSIONS_CF = "role_permissions";

 static logging::logger alogger("default_authorizer");

@@ -257,7 +256,7 @@ future<> default_authorizer::revoke_all(std::string_view role_name, ::service::g
        } else {
            co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
        }
-    } catch (exceptions::request_execution_exception& e) {
+    } catch (const exceptions::request_execution_exception& e) {
        alogger.warn("CassandraAuthorizer failed to revoke all permissions of {}: {}", role_name, e);
    }
 }
@@ -294,13 +293,13 @@ future<> default_authorizer::revoke_all_legacy(const resource& resource) {
                                [resource](auto ep) {
                    try {
                        std::rethrow_exception(ep);
-                    } catch (exceptions::request_execution_exception& e) {
+                    } catch (const exceptions::request_execution_exception& e) {
                        alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
                    }

                });
            });
-        } catch (exceptions::request_execution_exception& e) {
+        } catch (const exceptions::request_execution_exception& e) {
            alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
            return make_ready_future();
        }
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -83,17 +83,18 @@ static const class_registrator<
    ldap_role_manager,
    cql3::query_processor&,
    ::service::raft_group0_client&,
-    ::service::migration_manager&> registration(ldap_role_manager_full_name);
+    ::service::migration_manager&,
+    cache&> registration(ldap_role_manager_full_name);

 ldap_role_manager::ldap_role_manager(
        std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
-        cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm)
-        : _std_mgr(qp, rg0c, mm), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
+        cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
+        : _std_mgr(qp, rg0c, mm, cache), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
        , _bind_password(bind_password)
        , _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this))) {
 }

-ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm)
+ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
    : ldap_role_manager(
            qp.db().get_config().ldap_url_template(),
            qp.db().get_config().ldap_attr_role(),
@@ -101,7 +102,8 @@ ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_
            qp.db().get_config().ldap_bind_passwd(),
            qp,
            rg0c,
-            mm) {
+            mm,
+            cache) {
 }

 std::string_view ldap_role_manager::qualified_java_name() const noexcept {
--- a/auth/ldap_role_manager.hh
+++ b/auth/ldap_role_manager.hh
@@ -14,6 +14,7 @@

 #include "ent/ldap/ldap_connection.hh"
 #include "standard_role_manager.hh"
+#include "auth/cache.hh"

 namespace auth {

@@ -43,12 +44,13 @@ class ldap_role_manager : public role_manager {
            std::string_view bind_password, ///< LDAP bind credentials.
            cql3::query_processor& qp, ///< Passed to standard_role_manager.
            ::service::raft_group0_client& rg0c, ///< Passed to standard_role_manager.
-            ::service::migration_manager& mm ///< Passed to standard_role_manager.
+            ::service::migration_manager& mm, ///< Passed to standard_role_manager.
+            cache& cache ///< Passed to standard_role_manager.
    );

    /// Retrieves LDAP configuration entries from qp and invokes the other constructor.  Required by
    /// class_registrator<role_manager>.
-    ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm);
+    ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache);

    /// Thrown when query-template parsing fails.
    struct url_error : public std::runtime_error {
--- a/auth/maintenance_socket_role_manager.cc
+++ b/auth/maintenance_socket_role_manager.cc
@@ -11,6 +11,7 @@
 #include <seastar/core/future.hh>
 #include <stdexcept>
 #include <string_view>
+#include "auth/cache.hh"
 #include "cql3/description.hh"
 #include "utils/class_registrator.hh"

@@ -23,7 +24,8 @@ static const class_registrator<
        maintenance_socket_role_manager,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> registration(sstring{maintenance_socket_role_manager_name});
+        ::service::migration_manager&,
+        cache&> registration(sstring{maintenance_socket_role_manager_name});


 std::string_view maintenance_socket_role_manager::qualified_java_name() const noexcept {
--- a/auth/maintenance_socket_role_manager.hh
+++ b/auth/maintenance_socket_role_manager.hh
@@ -8,6 +8,7 @@

 #pragma once

+#include "auth/cache.hh"
 #include "auth/resource.hh"
 #include "auth/role_manager.hh"
 #include <seastar/core/future.hh>
@@ -29,7 +30,7 @@ extern const std::string_view maintenance_socket_role_manager_name;
 // system_auth keyspace, which may be not yet created when the maintenance socket starts listening.
 class maintenance_socket_role_manager final : public role_manager {
 public:
-    maintenance_socket_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&) {}
+    maintenance_socket_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&) {}

    virtual std::string_view qualified_java_name() const noexcept override;

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -49,7 +49,7 @@ static const class_registrator<
        cql3::query_processor&,
        ::service::raft_group0_client&,
        ::service::migration_manager&,
-        utils::alien_worker&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");
+        cache&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");

 static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());

@@ -63,13 +63,13 @@ std::string password_authenticator::default_superuser(const db::config& cfg) {
 password_authenticator::~password_authenticator() {
 }

-password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, utils::alien_worker& hashing_worker)
+password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
    : _qp(qp)
    , _group0_client(g0)
    , _migration_manager(mm)
+    , _cache(cache)
    , _stopped(make_ready_future<>()) 
    , _superuser(default_superuser(qp.db().get_config()))
-    , _hashing_worker(hashing_worker)
 {}

 static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
@@ -315,24 +315,31 @@ future<authenticated_user> password_authenticator::authenticate(
    const sstring password = credentials.at(PASSWORD_KEY);

    try {
-        const std::optional<sstring> salted_hash = co_await get_password_hash(username);
-        if (!salted_hash) {
-            throw exceptions::authentication_exception("Username and/or password are incorrect");
+        std::optional<sstring> salted_hash;
+        if (legacy_mode(_qp)) {
+            salted_hash = co_await get_password_hash(username);
+            if (!salted_hash) {
+                throw exceptions::authentication_exception("Username and/or password are incorrect");
+            }
+        } else {
+            auto role = _cache.get(username);
+            if (!role || role->salted_hash.empty()) {
+                throw exceptions::authentication_exception("Username and/or password are incorrect");
+            }
+            salted_hash = role->salted_hash;
        }
-        const bool password_match = co_await _hashing_worker.submit<bool>([password = std::move(password), salted_hash = std::move(salted_hash)]{
-            return passwords::check(password, *salted_hash);
-        });
+        const bool password_match = co_await passwords::check(password, *salted_hash);
        if (!password_match) {
            throw exceptions::authentication_exception("Username and/or password are incorrect");
        }
        co_return username;
-    } catch (std::system_error &) {
+    } catch (const std::system_error &) {
        std::throw_with_nested(exceptions::authentication_exception("Could not verify password"));
-    } catch (exceptions::request_execution_exception& e) {
+    } catch (const exceptions::request_execution_exception& e) {
        std::throw_with_nested(exceptions::authentication_exception(e.what()));
-    } catch (exceptions::authentication_exception& e) {
+    } catch (const exceptions::authentication_exception& e) {
        std::throw_with_nested(e);
-    } catch (exceptions::unavailable_exception& e) {
+    } catch (const exceptions::unavailable_exception& e) {
        std::throw_with_nested(exceptions::authentication_exception(e.get_message()));
    } catch (...) {
        std::throw_with_nested(exceptions::authentication_exception("authentication failed"));
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -16,8 +16,8 @@
 #include "db/consistency_level_type.hh"
 #include "auth/authenticator.hh"
 #include "auth/passwords.hh"
+#include "auth/cache.hh"
 #include "service/raft/raft_group0_client.hh"
-#include "utils/alien_worker.hh"

 namespace db {
    class config;
@@ -41,19 +41,19 @@ class password_authenticator : public authenticator {
    cql3::query_processor& _qp;
    ::service::raft_group0_client& _group0_client;
    ::service::migration_manager& _migration_manager;
+    cache& _cache;
    future<> _stopped;
    abort_source _as;
    std::string _superuser; // default superuser name from the config (may or may not be present in roles table)
    shared_promise<> _superuser_created_promise;
    // We used to also support bcrypt, SHA-256, and MD5 (ref. scylladb#24524).
    constexpr static auth::passwords::scheme _scheme = passwords::scheme::sha_512;
-    utils::alien_worker& _hashing_worker;

 public:
    static db::consistency_level consistency_for_user(std::string_view role_name);
    static std::string default_superuser(const db::config&);

-    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);
+    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);

    ~password_authenticator();

--- a/auth/passwords.cc
+++ b/auth/passwords.cc
@@ -7,6 +7,8 @@
 */

 #include "auth/passwords.hh"
+#include "utils/crypt_sha512.hh"
+#include <seastar/core/coroutine.hh>

 #include <cerrno>

@@ -21,27 +23,48 @@ static thread_local crypt_data tlcrypt = {};

 namespace detail {

+void verify_hashing_output(const char * res) {
+    // A null result, or one beginning with '*', is reported as a system_error built from errno.
+    const bool usable = res != nullptr && *res != '*';
+    if (!usable) { throw std::system_error(errno, std::system_category()); }
+}
+
 void verify_scheme(scheme scheme) {
    const sstring random_part_of_salt = "aaaabbbbccccdddd";

    const sstring salt = sstring(prefix_for_scheme(scheme)) + random_part_of_salt;
    const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
-
-    if (e && (e[0] != '*')) {
-        return;
+    try {
+        verify_hashing_output(e);
+    } catch (const std::system_error& ex) {
+        throw no_supported_schemes();
    }
-
-    throw no_supported_schemes();
 }

 sstring hash_with_salt(const sstring& pass, const sstring& salt) {
    auto res = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
-    if (!res || (res[0] == '*')) {
-        throw std::system_error(errno, std::system_category());
-    }
+    verify_hashing_output(res);
    return res;
 }

+seastar::future<sstring> hash_with_salt_async(const sstring& pass, const sstring& salt) {
+    // The awaited __crypt_sha512 routine is used only for SHA-512 salts with
+    // passphrases of at most 255 bytes; every other input is hashed by
+    // crypt_r from `<crypt.h>`, which can stall.
+    const bool sha512_path = salt.starts_with(prefix_for_scheme(scheme::sha_512))
+            && pass.size() <= 255;
+    if (!sha512_path) {
+        const char * hashed = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
+        verify_hashing_output(hashed);
+        co_return sstring(hashed);
+    }
+    char buf[128];
+    // As in the crypt_r branch, the result is copied into an sstring before returning.
+    const char * hashed = co_await __crypt_sha512(pass.c_str(), salt.c_str(), buf);
+    verify_hashing_output(hashed);
+    co_return sstring(hashed);
+}
+
 std::string_view prefix_for_scheme(scheme c) noexcept {
    switch (c) {
    case scheme::bcrypt_y: return "$2y$";
@@ -58,8 +81,9 @@ no_supported_schemes::no_supported_schemes()
        : std::runtime_error("No allowed hashing schemes are supported on this system") {
 }

-bool check(const sstring& pass, const sstring& salted_hash) {
-    return detail::hash_with_salt(pass, salted_hash) == salted_hash;
+seastar::future<bool> check(const sstring& pass, const sstring& salted_hash) {
+    const auto pwd_hash = co_await detail::hash_with_salt_async(pass, salted_hash);
+    co_return pwd_hash == salted_hash;
 }

 } // namespace auth::passwords
--- a/auth/passwords.hh
+++ b/auth/passwords.hh
@@ -11,6 +11,7 @@
 #include <random>
 #include <stdexcept>

+#include <seastar/core/future.hh>
 #include <seastar/core/sstring.hh>

 #include "seastarx.hh"
@@ -75,10 +76,19 @@ sstring generate_salt(RandomNumberEngine& g, scheme scheme) {

 ///
 /// Hash a password combined with an implementation-specific salt string.
+/// Deprecated in favor of `hash_with_salt_async`.
 ///
 /// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
 ///
-sstring hash_with_salt(const sstring& pass, const sstring& salt);
+[[deprecated("Use hash_with_salt_async instead")]] sstring hash_with_salt(const sstring& pass, const sstring& salt);
+
+///
+/// Async version of `hash_with_salt` that returns a future.
+/// If possible, hashing uses `coroutine::maybe_yield` to prevent reactor stalls.
+///
+/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
+///
+seastar::future<sstring> hash_with_salt_async(const sstring& pass, const sstring& salt);

 } // namespace detail

@@ -107,6 +117,6 @@ sstring hash(const sstring& pass, RandomNumberEngine& g, scheme scheme) {
 ///
 /// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
 ///
-bool check(const sstring& pass, const sstring& salted_hash);
+seastar::future<bool> check(const sstring& pass, const sstring& salted_hash);

 } // namespace auth::passwords
--- a/auth/saslauthd_authenticator.cc
+++ b/auth/saslauthd_authenticator.cc
@@ -35,9 +35,9 @@ static const class_registrator<
        cql3::query_processor&,
        ::service::raft_group0_client&,
        ::service::migration_manager&,
-        utils::alien_worker&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");
+        cache&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");

-saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&)
+saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, cache&)
    : _socket_path(qp.db().get_config().saslauthd_socket_path())
 {}

--- a/auth/saslauthd_authenticator.hh
+++ b/auth/saslauthd_authenticator.hh
@@ -11,7 +11,7 @@
 #pragma once

 #include "auth/authenticator.hh"
-#include "utils/alien_worker.hh"
+#include "auth/cache.hh"

 namespace cql3 {
 class query_processor;
@@ -29,7 +29,7 @@ namespace auth {
 class saslauthd_authenticator : public authenticator {
    sstring _socket_path; ///< Path to the domain socket on which saslauthd is listening.
 public:
-    saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);
+    saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);

    future<> start() override;

--- a/auth/service.cc
+++ b/auth/service.cc
@@ -17,6 +17,7 @@
 #include <chrono>

 #include <seastar/core/future-util.hh>
+#include <seastar/core/shard_id.hh>
 #include <seastar/core/sharded.hh>
 #include <seastar/core/shared_ptr.hh>

@@ -157,6 +158,7 @@ static future<> validate_role_exists(const service& ser, std::string_view role_n

 service::service(
        utils::loading_cache_config c,
+        cache& cache,
        cql3::query_processor& qp,
        ::service::raft_group0_client& g0,
        ::service::migration_notifier& mn,
@@ -166,6 +168,7 @@ service::service(
        maintenance_socket_enabled used_by_maintenance_socket)
            : _loading_cache_config(std::move(c))
            , _permissions_cache(nullptr)
+            , _cache(cache)
            , _qp(qp)
            , _group0_client(g0)
            , _mnotifier(mn)
@@ -188,15 +191,16 @@ service::service(
        ::service::migration_manager& mm,
        const service_config& sc,
        maintenance_socket_enabled used_by_maintenance_socket,
-        utils::alien_worker& hashing_worker)
+        cache& cache)
            : service(
                      std::move(c),
+                      cache,
                      qp,
                      g0,
                      mn,
                      create_object<authorizer>(sc.authorizer_java_name, qp, g0, mm),
-                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, hashing_worker),
-                      create_object<role_manager>(sc.role_manager_java_name, qp, g0, mm),
+                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, cache),
+                      create_object<role_manager>(sc.role_manager_java_name, qp, g0, mm, cache),
                      used_by_maintenance_socket) {
 }

@@ -221,7 +225,7 @@ future<> service::create_legacy_keyspace_if_missing(::service::migration_manager
            try {
                co_return co_await mm.announce(::service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts),
                        std::move(group0_guard), seastar::format("auth_service: create {} keyspace", meta::legacy::AUTH_KS));
-            } catch (::service::group0_concurrent_modification&) {
+            } catch (const ::service::group0_concurrent_modification&) {
                log.info("Concurrent operation is detected while creating {} keyspace, retrying.", meta::legacy::AUTH_KS);
            }
        }
@@ -232,6 +236,9 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
    auto auth_version = co_await sys_ks.get_auth_version();
    // version is set in query processor to be easily available in various places we call auth::legacy_mode check.
    _qp.auth_version = auth_version;
+    if (this_shard_id() == 0) {
+        co_await _cache.load_all();
+    }
    if (!_used_by_maintenance_socket) {
        // this legacy keyspace is only used by cqlsh
        // it's needed when executing `list roles` or `list users`
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -21,12 +21,12 @@
 #include "auth/authorizer.hh"
 #include "auth/permission.hh"
 #include "auth/permissions_cache.hh"
+#include "auth/cache.hh"
 #include "auth/role_manager.hh"
 #include "auth/common.hh"
 #include "cql3/description.hh"
 #include "seastarx.hh"
 #include "service/raft/raft_group0_client.hh"
-#include "utils/alien_worker.hh"
 #include "utils/observable.hh"
 #include "utils/serialized_action.hh"
 #include "service/maintenance_mode.hh"
@@ -77,6 +77,7 @@ public:
 class service final : public seastar::peering_sharded_service<service> {
    utils::loading_cache_config _loading_cache_config;
    std::unique_ptr<permissions_cache> _permissions_cache;
+    cache& _cache;

    cql3::query_processor& _qp;

@@ -107,6 +108,7 @@ class service final : public seastar::peering_sharded_service<service> {
 public:
    service(
            utils::loading_cache_config,
+            cache& cache,
            cql3::query_processor&,
            ::service::raft_group0_client&,
            ::service::migration_notifier&,
@@ -128,7 +130,7 @@ public:
            ::service::migration_manager&,
            const service_config&,
            maintenance_socket_enabled,
-            utils::alien_worker&);
+            cache&);

    future<> start(::service::migration_manager&, db::system_keyspace&);

--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -41,21 +41,6 @@

 namespace auth {

-namespace meta {
-
-namespace role_members_table {
-
-constexpr std::string_view name{"role_members" , 12};
-
-}
-
-namespace role_attributes_table {
-
-constexpr std::string_view name{"role_attributes", 15};
-
-}
-
-}

 static logging::logger log("standard_role_manager");

@@ -64,7 +49,8 @@ static const class_registrator<
        standard_role_manager,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> registration("org.apache.cassandra.auth.CassandraRoleManager");
+        ::service::migration_manager&,
+        cache&> registration("org.apache.cassandra.auth.CassandraRoleManager");

 struct record final {
    sstring name;
@@ -121,10 +107,11 @@ static bool has_can_login(const cql3::untyped_result_set_row& row) {
    return row.has("can_login") && !(boolean_type->deserialize(row.get_blob_unfragmented("can_login")).is_null());
 }

-standard_role_manager::standard_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
+standard_role_manager::standard_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
    : _qp(qp)
    , _group0_client(g0)
    , _migration_manager(mm)
+    , _cache(cache)
    , _stopped(make_ready_future<>())
    , _superuser(password_authenticator::default_superuser(qp.db().get_config()))
 {}
@@ -136,7 +123,7 @@ std::string_view standard_role_manager::qualified_java_name() const noexcept {
 const resource_set& standard_role_manager::protected_resources() const {
    static const resource_set resources({
            make_data_resource(meta::legacy::AUTH_KS, meta::roles_table::name),
-            make_data_resource(meta::legacy::AUTH_KS, meta::role_members_table::name)});
+            make_data_resource(meta::legacy::AUTH_KS, ROLE_MEMBERS_CF)});

    return resources;
 }
@@ -160,7 +147,7 @@ future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const
            "  PRIMARY KEY (role, member)"
            ")",
            meta::legacy::AUTH_KS,
-            meta::role_members_table::name);
+            ROLE_MEMBERS_CF);
    static const sstring create_role_attributes_query = seastar::format(
            "CREATE TABLE {}.{} ("
            "  role text,"
@@ -169,7 +156,7 @@ future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const
            "  PRIMARY KEY(role, name)"
            ")",
            meta::legacy::AUTH_KS,
-            meta::role_attributes_table::name);
+            ROLE_ATTRIBUTES_CF);
    return when_all_succeed(
            create_legacy_metadata_table_if_missing(
                    meta::roles_table::name,
@@ -177,12 +164,12 @@ future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const
                    create_roles_query,
                    _migration_manager),
            create_legacy_metadata_table_if_missing(
-                    meta::role_members_table::name,
+                    ROLE_MEMBERS_CF,
                    _qp,
                    create_role_members_query,
                    _migration_manager),
            create_legacy_metadata_table_if_missing(
-                    meta::role_attributes_table::name,
+                    ROLE_ATTRIBUTES_CF,
                    _qp,
                    create_role_attributes_query,
                    _migration_manager)).discard_result();
@@ -205,7 +192,7 @@ future<> standard_role_manager::legacy_create_default_role_if_missing() {
                {_superuser},
                cql3::query_processor::cache_internal::no).discard_result();
        log.info("Created default superuser role '{}'.", _superuser);
-    } catch(const exceptions::unavailable_exception& e) {
+    } catch (const exceptions::unavailable_exception& e) {
        log.warn("Skipped default role setup: some nodes were not ready; will retry");
        throw e;
    }
@@ -429,7 +416,7 @@ future<> standard_role_manager::drop(std::string_view role_name, ::service::grou
    const auto revoke_from_members = [this, role_name, &mc] () -> future<> {
        const sstring query = seastar::format("SELECT member FROM {}.{} WHERE role = ?",
                get_auth_ks_name(_qp),
-                meta::role_members_table::name);
+                ROLE_MEMBERS_CF);
        const auto members = co_await _qp.execute_internal(
                query,
                consistency_for_role(role_name),
@@ -461,7 +448,7 @@ future<> standard_role_manager::drop(std::string_view role_name, ::service::grou
    const auto remove_attributes_of = [this, role_name, &mc] () -> future<> {
        const sstring query = seastar::format("DELETE FROM {}.{} WHERE role = ?",
                get_auth_ks_name(_qp),
-                meta::role_attributes_table::name);
+                ROLE_ATTRIBUTES_CF);
        if (legacy_mode(_qp)) {
            co_await _qp.execute_internal(query, {sstring(role_name)},
                cql3::query_processor::cache_internal::yes).discard_result();
@@ -517,7 +504,7 @@ standard_role_manager::legacy_modify_membership(
            case membership_change::add: {
                const sstring insert_query = seastar::format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
                        get_auth_ks_name(_qp),
-                        meta::role_members_table::name);
+                        ROLE_MEMBERS_CF);
                co_return co_await _qp.execute_internal(
                        insert_query,
                        consistency_for_role(role_name),
@@ -529,7 +516,7 @@ standard_role_manager::legacy_modify_membership(
            case membership_change::remove: {
                const sstring delete_query = seastar::format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
                        get_auth_ks_name(_qp),
-                        meta::role_members_table::name);
+                        ROLE_MEMBERS_CF);
                co_return co_await _qp.execute_internal(
                        delete_query,
                        consistency_for_role(role_name),
@@ -567,12 +554,12 @@ standard_role_manager::modify_membership(
    case membership_change::add:
        modify_role_members = seastar::format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
                get_auth_ks_name(_qp),
-                meta::role_members_table::name);
+                ROLE_MEMBERS_CF);
        break;
    case membership_change::remove:
        modify_role_members = seastar::format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
                get_auth_ks_name(_qp),
-                meta::role_members_table::name);
+                ROLE_MEMBERS_CF);
        break;
    default:
        on_internal_error(log, format("unknown membership_change value: {}", int(ch)));
@@ -666,7 +653,7 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
 future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
    const sstring query = seastar::format("SELECT * FROM {}.{}",
            get_auth_ks_name(_qp),
-            meta::role_members_table::name);
+            ROLE_MEMBERS_CF);

    const auto results = co_await _qp.execute_internal(
            query,
@@ -731,15 +718,21 @@ future<bool> standard_role_manager::is_superuser(std::string_view role_name) {
 }

 future<bool> standard_role_manager::can_login(std::string_view role_name) {
-    return require_record(_qp, role_name).then([](record r) {
-        return r.can_login;
-    });
+    if (legacy_mode(_qp)) { // legacy mode: read the role record via require_record()
+        const auto r = co_await require_record(_qp, role_name);
+        co_return r.can_login;
+    }
+    auto role = _cache.get(sstring(role_name)); // otherwise consult the auth cache
+    if (!role) {
+        throw nonexistant_role(role_name);
+    }
+    co_return role->can_login;
 }

 future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
    const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
            get_auth_ks_name(_qp),
-            meta::role_attributes_table::name);
+            ROLE_ATTRIBUTES_CF);
    const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
    if (!result_set->empty()) {
        const cql3::untyped_result_set_row &row = result_set->one();
@@ -770,7 +763,7 @@ future<> standard_role_manager::set_attribute(std::string_view role_name, std::s
    }
    const sstring query = seastar::format("INSERT INTO {}.{} (role, name, value)  VALUES (?, ?, ?)",
            get_auth_ks_name(_qp),
-            meta::role_attributes_table::name);
+            ROLE_ATTRIBUTES_CF);
    if (legacy_mode(_qp)) {
        co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name), sstring(attribute_value)}, cql3::query_processor::cache_internal::yes).discard_result();
    } else {
@@ -785,7 +778,7 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
    }
    const sstring query = seastar::format("DELETE FROM {}.{} WHERE role = ? AND name = ?",
            get_auth_ks_name(_qp),
-            meta::role_attributes_table::name);
+            ROLE_ATTRIBUTES_CF);
    if (legacy_mode(_qp)) {
        co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes).discard_result();
    } else {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -10,6 +10,7 @@

 #include "auth/common.hh"
 #include "auth/role_manager.hh"
+#include "auth/cache.hh"

 #include <string_view>

@@ -36,13 +37,14 @@ class standard_role_manager final : public role_manager {
    cql3::query_processor& _qp;
    ::service::raft_group0_client& _group0_client;
    ::service::migration_manager& _migration_manager;
+    cache& _cache;
    future<> _stopped;
    abort_source _as;
    std::string _superuser;
    shared_promise<> _superuser_created_promise;

 public:
-    standard_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
+    standard_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);

    virtual std::string_view qualified_java_name() const noexcept override;

--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -13,6 +13,7 @@
 #include "auth/authorizer.hh"
 #include "auth/default_authorizer.hh"
 #include "auth/password_authenticator.hh"
+#include "auth/cache.hh"
 #include "auth/permission.hh"
 #include "service/raft/raft_group0_client.hh"
 #include "utils/class_registrator.hh"
@@ -37,8 +38,8 @@ class transitional_authenticator : public authenticator {
 public:
    static const sstring PASSWORD_AUTHENTICATOR_NAME;

-    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, utils::alien_worker& hashing_worker)
-            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, hashing_worker)) {
+    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
+            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, cache)) {
    }
    transitional_authenticator(std::unique_ptr<authenticator> a)
            : _authenticator(std::move(a)) {
@@ -80,7 +81,7 @@ public:
        }).handle_exception([](auto ep) {
            try {
                std::rethrow_exception(ep);
-            } catch (exceptions::authentication_exception&) {
+            } catch (const exceptions::authentication_exception&) {
                // return anon user
                return make_ready_future<authenticated_user>(anonymous_user());
            }
@@ -125,7 +126,7 @@ public:
            virtual bytes evaluate_response(bytes_view client_response) override {
                try {
                    return _sasl->evaluate_response(client_response);
-                } catch (exceptions::authentication_exception&) {
+                } catch (const exceptions::authentication_exception&) {
                    _complete = true;
                    return {};
                }
@@ -140,7 +141,7 @@ public:
                    return _sasl->get_authenticated_user().handle_exception([](auto ep) {
                        try {
                            std::rethrow_exception(ep);
-                        } catch (exceptions::authentication_exception&) {
+                        } catch (const exceptions::authentication_exception&) {
                            // return anon user
                            return make_ready_future<authenticated_user>(anonymous_user());
                        }
@@ -240,7 +241,7 @@ static const class_registrator<
        cql3::query_processor&,
        ::service::raft_group0_client&,
        ::service::migration_manager&,
-        utils::alien_worker&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");
+        auth::cache&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");

 static const class_registrator<
        auth::authorizer,
--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -15,6 +15,7 @@
 #include <cmath>

 #include "seastarx.hh"
+#include "backlog_controller_fwd.hh"

 // Simple proportional controller to adjust shares for processes for which a backlog can be clearly
 // defined.
@@ -128,11 +129,21 @@ public:
    static constexpr unsigned normalization_factor = 30;
    static constexpr float disable_backlog = std::numeric_limits<double>::infinity();
    static constexpr float backlog_disabled(float backlog) { return std::isinf(backlog); }
-    compaction_controller(backlog_controller::scheduling_group sg, float static_shares, std::chrono::milliseconds interval, std::function<float()> current_backlog)
+    static inline const std::vector<backlog_controller::control_point> default_control_points = {
+            backlog_controller::control_point{0.0, 50}, {1.5, 100}, {normalization_factor, default_compaction_maximum_shares}};
+    compaction_controller(backlog_controller::scheduling_group sg, float static_shares, std::optional<float> max_shares,
+        std::chrono::milliseconds interval, std::function<float()> current_backlog)
        : backlog_controller(std::move(sg), std::move(interval),
-          std::vector<backlog_controller::control_point>({{0.0, 50}, {1.5, 100} , {normalization_factor, 1000}}),
+          default_control_points,
          std::move(current_backlog),
          static_shares
        )
-    {}
+    {
+        if (max_shares) {
+            set_max_shares(*max_shares);
+        }
+    }
+
+    // Updates the maximum output value for control points.
+    void set_max_shares(float max_shares);
 };
--- a/backlog_controller_fwd.hh
+++ b/backlog_controller_fwd.hh
@@ -0,0 +1,15 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <cstdint>
+
+// Default maximum number of compaction shares. Declared `inline` so that every
+// translation unit including this header shares a single definition.
+inline constexpr uint64_t default_compaction_maximum_shares = 1000;
--- a/cdc/CMakeLists.txt
+++ b/cdc/CMakeLists.txt
@@ -17,5 +17,8 @@ target_link_libraries(cdc
  PRIVATE
    replica)

+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(cdc REUSE_FROM scylla-precompiled-header)
+endif()
 check_headers(check-headers cdc
  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -25,6 +25,7 @@
 #include "locator/abstract_replication_strategy.hh"
 #include "locator/topology.hh"
 #include "replica/database.hh"
+#include "db/config.hh"
 #include "db/schema_tables.hh"
 #include "gms/feature_service.hh"
 #include "schema/schema.hh"
@@ -586,11 +587,9 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
    return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
 }

-static schema_ptr create_log_schema(const schema& s, const replica::database& db,
-        const keyspace_metadata& ksm, api::timestamp_type timestamp, std::optional<table_id> uuid, schema_ptr old)
+static void set_default_properties_log_table(schema_builder& b, const schema& s,
+        const replica::database& db, const keyspace_metadata& ksm)
 {
-    schema_builder b(s.ks_name(), log_name(s.cf_name()));
-    b.with_partitioner(cdc::cdc_partitioner::classname);
    b.set_compaction_strategy(compaction::compaction_strategy_type::time_window);
    b.set_comment(fmt::format("CDC log for {}.{}", s.ks_name(), s.cf_name()));
    auto ttl_seconds = s.cdc_options().ttl();
@@ -616,13 +615,22 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
                        std::to_string(std::max(1, window_seconds / 2))},
        });
    }
+    b.set_caching_options(caching_options::get_disabled_caching_options());
+
+    auto rs = generate_replication_strategy(ksm, db.get_token_metadata().get_topology());
+    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(*rs, db.get_token_metadata(), false));
+    b.add_extension(tombstone_gc_extension::NAME, std::move(tombstone_gc_ext));
+}
+
+static void add_columns_to_cdc_log(schema_builder& b, const schema& s,
+        const api::timestamp_type timestamp, const schema_ptr old)
+{
    b.with_column(log_meta_column_name_bytes("stream_id"), bytes_type, column_kind::partition_key);
    b.with_column(log_meta_column_name_bytes("time"), timeuuid_type, column_kind::clustering_key);
    b.with_column(log_meta_column_name_bytes("batch_seq_no"), int32_type, column_kind::clustering_key);
    b.with_column(log_meta_column_name_bytes("operation"), data_type_for<operation_native_type>());
    b.with_column(log_meta_column_name_bytes("ttl"), long_type);
    b.with_column(log_meta_column_name_bytes("end_of_batch"), boolean_type);
-    b.set_caching_options(caching_options::get_disabled_caching_options());

    auto validate_new_column = [&] (const sstring& name) {
        // When dropping a column from a CDC log table, we set the drop timestamp to be
@@ -692,15 +700,28 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
    add_columns(s.clustering_key_columns());
    add_columns(s.static_columns(), true);
    add_columns(s.regular_columns(), true);
+}
+
+static schema_ptr create_log_schema(const schema& s, const replica::database& db,
+        const keyspace_metadata& ksm, api::timestamp_type timestamp, std::optional<table_id> uuid, schema_ptr old)
+{
+    schema_builder b(s.ks_name(), log_name(s.cf_name()));
+
+    b.with_partitioner(cdc::cdc_partitioner::classname);
+
+    if (old) {
+        // If the user reattaches the log table, do not change its properties.
+        b.set_properties(old->get_properties());
+    } else {
+        set_default_properties_log_table(b, s, db, ksm);
+    }
+
+    add_columns_to_cdc_log(b, s, timestamp, old);

    if (uuid) {
        b.set_uuid(*uuid);
    }

-    auto rs = generate_replication_strategy(ksm, db.get_token_metadata().get_topology());
-    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(*rs, db.get_token_metadata()));
-    b.add_extension(tombstone_gc_extension::NAME, std::move(tombstone_gc_ext));
-
    /**
     * #10473 - if we are redefining the log table, we need to ensure any dropped
     * columns are registered in "dropped_columns" table, otherwise clients will not
@@ -931,9 +952,6 @@ static managed_bytes merge(const abstract_type& type, const managed_bytes_opt& p
    throw std::runtime_error(format("cdc merge: unknown type {}", type.name()));
 }

-using cell_map = std::unordered_map<const column_definition*, managed_bytes_opt>;
-using row_states_map = std::unordered_map<clustering_key, cell_map, clustering_key::hashing, clustering_key::equality>;
-
 static managed_bytes_opt get_col_from_row_state(const cell_map* state, const column_definition& cdef) {
    if (state) {
        if (auto it = state->find(&cdef); it != state->end()) {
@@ -943,7 +961,12 @@ static managed_bytes_opt get_col_from_row_state(const cell_map* state, const col
    return std::nullopt;
 }

-static cell_map* get_row_state(row_states_map& row_states, const clustering_key& ck) {
+cell_map* get_row_state(row_states_map& row_states, const clustering_key& ck) {
+    auto it = row_states.find(ck);
+    return it == row_states.end() ? nullptr : &it->second;
+}
+
+const cell_map* get_row_state(const row_states_map& row_states, const clustering_key& ck) {
    auto it = row_states.find(ck);
    return it == row_states.end() ? nullptr : &it->second;
 }
@@ -1413,6 +1436,8 @@ struct process_change_visitor {
    row_states_map& _clustering_row_states;
    cell_map& _static_row_state;

+    const bool _is_update = false;
+
    const bool _generate_delta_values = true;

    void static_row_cells(auto&& visit_row_cells) {
@@ -1436,12 +1461,13 @@ struct process_change_visitor {

        struct clustering_row_cells_visitor : public process_row_visitor {
            operation _cdc_op = operation::update;
+            operation _marker_op = operation::insert;

            using process_row_visitor::process_row_visitor;

            void marker(const row_marker& rm) {
                _ttl_column = get_ttl(rm);
-                _cdc_op = operation::insert;
+                _cdc_op = _marker_op;
            }
        };

@@ -1449,6 +1475,9 @@ struct process_change_visitor {
                log_ck, _touched_parts, _builder,
                _enable_updating_state, &ckey, get_row_state(_clustering_row_states, ckey),
                _clustering_row_states, _generate_delta_values);
+        if (_is_update && _request_options.alternator) {
+            v._marker_op = operation::update;
+        }
        visit_row_cells(v);

        if (_enable_updating_state) {
@@ -1602,6 +1631,11 @@ private:

    row_states_map _clustering_row_states;
    cell_map _static_row_state;
+    // True if the mutated row existed before applying the mutation. In other
+    // words, if the preimage is enabled and it isn't empty (otherwise, we
+    // assume that the row is non-existent). Used for Alternator Streams (see
+    // #6918).
+    bool _is_update = false;

    const bool _uses_tablets;

@@ -1728,6 +1762,7 @@ public:
            ._enable_updating_state = _enable_updating_state,
            ._clustering_row_states = _clustering_row_states,
            ._static_row_state = _static_row_state,
+            ._is_update = _is_update,
            ._generate_delta_values = generate_delta_values(_builder->base_schema())
        };
        cdc::inspect_mutation(m, v);
@@ -1738,6 +1773,10 @@ public:
        _builder->end_record();
    }

+    const row_states_map& clustering_row_states() const override {
+        return _clustering_row_states;
+    }
+
    // Takes and returns generated cdc log mutations and associated statistics about parts touched during transformer's lifetime.
    // The `transformer` object on which this method was called on should not be used anymore.
    std::tuple<utils::chunked_vector<mutation>, stats::part_type_set> finish() && {
@@ -1861,6 +1900,7 @@ public:
                    _static_row_state[&c] = std::move(*maybe_cell_view);
                }
            }
+            _is_update = true;
        }

        if (static_only) {
@@ -1948,6 +1988,7 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
                return make_ready_future<>();
            }

+            const bool alternator_increased_compatibility = options.alternator && options.alternator_streams_increased_compatibility;
            transformer trans(_ctxt, s, m.decorated_key(), options);

            auto f = make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>(nullptr);
@@ -1955,7 +1996,7 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
                // Preimage has been fetched by upper layers.
                tracing::trace(tr_state, "CDC: Using a prefetched preimage");
                f = make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>(options.preimage);
-            } else if (s->cdc_options().preimage() || s->cdc_options().postimage()) {
+            } else if (s->cdc_options().preimage() || s->cdc_options().postimage() || alternator_increased_compatibility) {
                // Note: further improvement here would be to coalesce the pre-image selects into one
                // if a batch contains several modifications to the same table. Otoh, batch is rare(?)
                // so this is premature.
@@ -1972,7 +2013,7 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
                tracing::trace(tr_state, "CDC: Preimage not enabled for the table, not querying current value of {}", m.decorated_key());
            }

-            return f.then([trans = std::move(trans), &mutations, idx, tr_state, &details] (lw_shared_ptr<cql3::untyped_result_set> rs) mutable {
+            return f.then([alternator_increased_compatibility, trans = std::move(trans), &mutations, idx, tr_state, &details, &options] (lw_shared_ptr<cql3::untyped_result_set> rs) mutable {
                auto& m = mutations[idx];
                auto& s = m.schema();

@@ -1987,13 +2028,13 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
                details.had_preimage |= preimage;
                details.had_postimage |= postimage;
                tracing::trace(tr_state, "CDC: Generating log mutations for {}", m.decorated_key());
-                if (should_split(m)) {
+                if (should_split(m, options)) {
                    tracing::trace(tr_state, "CDC: Splitting {}", m.decorated_key());
                    details.was_split = true;
-                    process_changes_with_splitting(m, trans, preimage, postimage);
+                    process_changes_with_splitting(m, trans, preimage, postimage, alternator_increased_compatibility);
                } else {
                    tracing::trace(tr_state, "CDC: No need to split {}", m.decorated_key());
-                    process_changes_without_splitting(m, trans, preimage, postimage);
+                    process_changes_without_splitting(m, trans, preimage, postimage, alternator_increased_compatibility);
                }
                auto [log_mut, touched_parts] = std::move(trans).finish();
                const int generated_count = log_mut.size();
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -52,6 +52,9 @@ class database;

 namespace cdc {

+using cell_map = std::unordered_map<const column_definition*, managed_bytes_opt>;
+using row_states_map = std::unordered_map<clustering_key, cell_map, clustering_key::hashing, clustering_key::equality>;
+
 // cdc log table operation
 enum class operation : int8_t {
    // note: these values will eventually be read by a third party, probably not privvy to this
@@ -73,6 +76,14 @@ struct per_request_options {
    // Scylla. Currently, only TTL expiration implementation for Alternator
    // uses this.
    const bool is_system_originated = false;
+    // True if this mutation was emitted by Alternator.
+    const bool alternator = false;
+    // Sacrifice performance for the sake of better compatibility with DynamoDB
+    // Streams. The alternator_streams_increased_compatibility config flag is
+    // live-updateable, so its value may change between reads. For
+    // correctness it must therefore be read once per request and that value
+    // carried here.
+    const bool alternator_streams_increased_compatibility = false;
 };

 struct operation_result_tracker;
@@ -142,4 +153,7 @@ bool is_cdc_metacolumn_name(const sstring& name);

 utils::UUID generate_timeuuid(api::timestamp_type t);

+cell_map* get_row_state(row_states_map& row_states, const clustering_key& ck);
+const cell_map* get_row_state(const row_states_map& row_states, const clustering_key& ck);
+
 } // namespace cdc
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -6,15 +6,28 @@
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

+#include "bytes.hh"
+#include "bytes_fwd.hh"
+#include "mutation/atomic_cell.hh"
+#include "mutation/atomic_cell_or_collection.hh"
+#include "mutation/collection_mutation.hh"
 #include "mutation/mutation.hh"
+#include "mutation/tombstone.hh"
 #include "schema/schema.hh"

+#include "seastar/core/sstring.hh"
 #include "types/concrete_types.hh"
+#include "types/types.hh"
 #include "types/user.hh"

 #include "split.hh"
 #include "log.hh"
 #include "change_visitor.hh"
+#include "utils/managed_bytes.hh"
+#include <string_view>
+#include <unordered_map>
+
+extern logging::logger cdc_log;

 struct atomic_column_update {
    column_id id;
@@ -490,6 +503,8 @@ struct should_split_visitor {
    // Otherwise we store the change's ttl.
    std::optional<gc_clock::duration> _ttl = std::nullopt;

+    virtual ~should_split_visitor() = default;
+
    inline bool finished() const { return _result; }
    inline void stop() { _result = true; }

@@ -512,7 +527,7 @@ struct should_split_visitor {

    void collection_tombstone(const tombstone& t) { visit(t.timestamp + 1); }

-    void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
+    virtual void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
        if (_had_row_marker) {
            // nonatomic updates cannot be expressed with an INSERT.
            return stop();
@@ -522,7 +537,7 @@ struct should_split_visitor {
    void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
    void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }

-    void marker(const row_marker& rm) {
+    virtual void marker(const row_marker& rm) {
        _had_row_marker = true;
        visit(rm.timestamp(), get_ttl(rm));
    }
@@ -563,7 +578,29 @@ struct should_split_visitor {
    }
 };

-bool should_split(const mutation& m) {
+// Alternator flavour of should_split_visitor: the row marker and live collection
+// cells are visited with their timestamps only, and seeing a marker is not
+// recorded in _had_row_marker. As a result the row marker is not split away
+// from an update, and updates that create an item appear as a single log row.
+struct alternator_should_split_visitor : public should_split_visitor {
+    ~alternator_should_split_visitor() override = default;
+
+    void live_collection_cell(bytes_view, const atomic_cell_view& cell) override {
+        visit(cell.timestamp());
+    }
+
+    void marker(const row_marker& rm) override {
+        visit(rm.timestamp());
+    }
+};
+
+bool should_split(const mutation& m, const per_request_options& options) {
+    if (options.alternator) {
+        alternator_should_split_visitor v;
+        cdc::inspect_mutation(m, v);
+        return v._result || v._ts == api::missing_timestamp;
+    }
+
    should_split_visitor v;

    cdc::inspect_mutation(m, v);
@@ -573,8 +610,109 @@ bool should_split(const mutation& m) {
        || v._ts == api::missing_timestamp;
 }

+// Returns true if the stored row state already equals what the given atomic and
+// nonatomic (Alternator map) entries would write, i.e. the update is a no-op.
+static bool entries_match_row_state(const schema_ptr& base_schema, const cell_map& row_state, const std::vector<atomic_column_update>& atomic_entries,
+        const std::vector<nonatomic_column_update>& nonatomic_entries) {
+    for (const auto& update : atomic_entries) {
+        const column_definition& cdef = base_schema->column_at(column_kind::regular_column, update.id);
+        const auto it = row_state.find(&cdef);
+        if (it == row_state.end()) {
+            return false;
+        }
+        if (to_managed_bytes_opt(update.cell.value().linearize()) != it->second) {
+            return false;
+        }
+    }
+
+    for (const auto& update : nonatomic_entries) {
+        const column_definition& cdef = base_schema->column_at(column_kind::regular_column, update.id);
+        const auto it = row_state.find(&cdef);
+        if (it == row_state.end()) {
+            return false;
+        }
+
+        // The only collection used by Alternator is a non-frozen map.
+        auto current_raw_map = cdef.type->deserialize(*it->second);
+        map_type_impl::native_type current_values = value_cast<map_type_impl::native_type>(current_raw_map);
+
+        if (current_values.size() != update.cells.size()) {
+            return false;
+        }
+
+        std::unordered_map<sstring_view, bytes> current_values_map;
+        current_values_map.reserve(current_values.size());
+        for (const auto& entry : current_values) {
+            const auto attr_name = std::string_view(value_cast<sstring>(entry.first));
+            current_values_map[attr_name] = value_cast<bytes>(entry.second);
+        }
+
+        for (const auto& [key, value] : update.cells) {
+            // Look up without operator[]: it would insert an empty value for a missing
+            // key, which could then compare equal to an empty live cell.
+            const auto found = current_values_map.find(to_string_view(key));
+            if (!value.is_live()) {
+                if (found != current_values_map.end()) {
+                    return false;
+                }
+            } else if (found == current_values_map.end() || found->second != value.value().linearize()) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+bool should_skip(batch& changes, const mutation& base_mutation, change_processor& processor) {
+    // Alternator doesn't use static updates and clustered range deletions.
+    if (!(changes.static_updates.empty() && changes.clustered_range_deletions.empty())) {
+        return false;
+    }
+
+    const schema_ptr& base_schema = base_mutation.schema();
+    const auto& known_rows = processor.clustering_row_states();
+
+    // An insert or update is redundant only when its row is already known and every
+    // written cell matches the stored state.
+    const auto is_redundant_write = [&] (auto& row_change) {
+        const cell_map* stored = get_row_state(known_rows, row_change.key);
+        return stored && entries_match_row_state(base_schema, *stored, row_change.atomic_entries, row_change.nonatomic_entries);
+    };
+    for (clustered_row_insert& ins : changes.clustered_inserts) {
+        if (!is_redundant_write(ins)) {
+            return false;
+        }
+    }
+    for (clustered_row_update& upd : changes.clustered_updates) {
+        if (!is_redundant_write(upd)) {
+            return false;
+        }
+    }
+
+    // Skip only if the row being deleted does not exist (i.e. the deletion is a no-op).
+    for (const auto& row_deletion : changes.clustered_row_deletions) {
+        if (known_rows.contains(row_deletion.key)) {
+            return false;
+        }
+    }
+
+    // Don't skip if the item exists.
+    //
+    // Increased DynamoDB Streams compatibility guarantees that single-item
+    // operations will read the item and store it in the clustering row states.
+    // If it is not found there, we may skip CDC. This is safe as long as the
+    // assumptions of this operation's write isolation are not violated.
+    const bool deletes_existing_item = changes.partition_deletions && known_rows.contains(clustering_key::make_empty());
+    if (deletes_existing_item) {
+        return false;
+    }
+
+    cdc_log.trace("Skipping CDC log for mutation {}", base_mutation);
+    return true;
+}
+
 void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
-        bool enable_preimage, bool enable_postimage) {
+        bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
    const auto base_schema = base_mutation.schema();
    auto changes = extract_changes(base_mutation);
    auto pk = base_mutation.key();
@@ -586,9 +724,6 @@ void process_changes_with_splitting(const mutation& base_mutation, change_proces
    const auto last_timestamp = changes.rbegin()->first;

    for (auto& [change_ts, btch] : changes) {
-        const bool is_last = change_ts == last_timestamp;
-        processor.begin_timestamp(change_ts, is_last);
-
        clustered_column_set affected_clustered_columns_per_row{clustering_key::less_compare(*base_schema)};
        one_kind_column_set affected_static_columns{base_schema->static_columns_count()};

@@ -597,6 +732,12 @@ void process_changes_with_splitting(const mutation& base_mutation, change_proces
            affected_clustered_columns_per_row = btch.get_affected_clustered_columns_per_row(*base_mutation.schema());
        }

+        if (alternator_strict_compatibility && should_skip(btch, base_mutation, processor)) {
+            continue;
+        }
+
+        const bool is_last = change_ts == last_timestamp;
+        processor.begin_timestamp(change_ts, is_last);
        if (enable_preimage) {
            if (affected_static_columns.count() > 0) {
                processor.produce_preimage(nullptr, affected_static_columns);
@@ -684,7 +825,13 @@ void process_changes_with_splitting(const mutation& base_mutation, change_proces
 }

 void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
-        bool enable_preimage, bool enable_postimage) {
+        bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility) {
+    if (alternator_strict_compatibility) {
+        auto changes = extract_changes(base_mutation);
+        if (should_skip(changes.begin()->second, base_mutation, processor)) {
+            return;
+        }
+    }
    auto ts = find_timestamp(base_mutation);
    processor.begin_timestamp(ts, true);

--- a/cdc/split.hh
+++ b/cdc/split.hh
@@ -9,6 +9,7 @@
 #pragma once

 #include <boost/dynamic_bitset.hpp>  // IWYU pragma: keep
+#include "cdc/log.hh"
 #include "replica/database_fwd.hh"
 #include "mutation/timestamp.hh"

@@ -65,12 +66,14 @@ public:
    // Tells processor we have reached end of record - last part
    // of a given timestamp batch
    virtual void end_record() = 0;
+
+    virtual const row_states_map& clustering_row_states() const = 0;
 };

-bool should_split(const mutation& base_mutation);
+bool should_split(const mutation& base_mutation, const per_request_options& options);
 void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
-        bool enable_preimage, bool enable_postimage);
+        bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility);
 void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
-        bool enable_preimage, bool enable_postimage);
+        bool enable_preimage, bool enable_postimage, bool alternator_strict_compatibility);

 }
--- a/compaction/CMakeLists.txt
+++ b/compaction/CMakeLists.txt
@@ -21,5 +21,8 @@ target_link_libraries(compaction
    mutation_writer
    replica)

+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(compaction REUSE_FROM scylla-precompiled-header)
+endif()
 check_headers(check-headers compaction
  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -867,8 +867,8 @@ auto fmt::formatter<compaction::compaction_task_executor>::format(const compacti

 namespace compaction {

-inline compaction_controller make_compaction_controller(const compaction_manager::scheduling_group& csg, uint64_t static_shares, std::function<double()> fn) {
-    return compaction_controller(csg, static_shares, 250ms, std::move(fn));
+inline compaction_controller make_compaction_controller(const compaction_manager::scheduling_group& csg, uint64_t static_shares, std::optional<float> max_shares, std::function<double()> fn) {
+    return compaction_controller(csg, static_shares, max_shares, 250ms, std::move(fn));
 }

 compaction::compaction_state::~compaction_state() {
@@ -1014,7 +1014,7 @@ compaction_manager::compaction_manager(config cfg, abort_source& as, tasks::task
    , _sys_ks("compaction_manager::system_keyspace")
    , _cfg(std::move(cfg))
    , _compaction_submission_timer(compaction_sg(), compaction_submission_callback())
-    , _compaction_controller(make_compaction_controller(compaction_sg(), static_shares(), [this] () -> float {
+    , _compaction_controller(make_compaction_controller(compaction_sg(), static_shares(), _cfg.max_shares.get(), [this] () -> float {
        _last_backlog = backlog();
        auto b = _last_backlog / available_memory();
        // This means we are using an unimplemented strategy
@@ -1033,6 +1033,10 @@ compaction_manager::compaction_manager(config cfg, abort_source& as, tasks::task
    , _throughput_updater(serialized_action([this] { return update_throughput(throughput_mbs()); }))
    , _update_compaction_static_shares_action([this] { return update_static_shares(static_shares()); })
    , _compaction_static_shares_observer(_cfg.static_shares.observe(_update_compaction_static_shares_action.make_observer()))
+    , _compaction_max_shares_observer(_cfg.max_shares.observe([this] (const float& max_shares) {
+        cmlog.info("Updating max shares to {}", max_shares);
+        _compaction_controller.set_max_shares(max_shares);
+    }))
    , _strategy_control(std::make_unique<strategy_control>(*this))
    , _tombstone_gc_state(_shared_tombstone_gc_state) {
    tm.register_module(_task_manager_module->get_name(), _task_manager_module);
@@ -1051,11 +1055,12 @@ compaction_manager::compaction_manager(tasks::task_manager& tm)
    , _sys_ks("compaction_manager::system_keyspace")
    , _cfg(config{ .available_memory = 1 })
    , _compaction_submission_timer(compaction_sg(), compaction_submission_callback())
-    , _compaction_controller(make_compaction_controller(compaction_sg(), 1, [] () -> float { return 1.0; }))
+    , _compaction_controller(make_compaction_controller(compaction_sg(), 1, std::nullopt, [] () -> float { return 1.0; }))
    , _backlog_manager(_compaction_controller)
    , _throughput_updater(serialized_action([this] { return update_throughput(throughput_mbs()); }))
    , _update_compaction_static_shares_action([] { return make_ready_future<>(); })
    , _compaction_static_shares_observer(_cfg.static_shares.observe(_update_compaction_static_shares_action.make_observer()))
+    , _compaction_max_shares_observer(_cfg.max_shares.observe([] (const float& max_shares) {}))
    , _strategy_control(std::make_unique<strategy_control>(*this))
    , _tombstone_gc_state(_shared_tombstone_gc_state) {
    tm.register_module(_task_manager_module->get_name(), _task_manager_module);
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -80,6 +80,7 @@ public:
        scheduling_group maintenance_sched_group;
        size_t available_memory = 0;
        utils::updateable_value<float> static_shares = utils::updateable_value<float>(0);
+        utils::updateable_value<float> max_shares = utils::updateable_value<float>(0);
        utils::updateable_value<uint32_t> throughput_mb_per_sec = utils::updateable_value<uint32_t>(0);
        std::chrono::seconds flush_all_tables_before_major = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::days(1));
    };
@@ -159,6 +160,7 @@ private:
    std::optional<utils::observer<uint32_t>> _throughput_option_observer;
    serialized_action _update_compaction_static_shares_action;
    utils::observer<float> _compaction_static_shares_observer;
+    utils::observer<float> _compaction_max_shares_observer;
    uint64_t _validation_errors = 0;

    class strategy_control;
@@ -291,6 +293,10 @@ public:
        return _cfg.static_shares.get();
    }

+    float max_shares() const noexcept {
+        return _cfg.max_shares.get();
+    }
+
    uint32_t throughput_mbs() const noexcept {
        return _cfg.throughput_mb_per_sec.get();
    }
--- a/compaction/task_manager_module.cc
+++ b/compaction/task_manager_module.cc
@@ -227,7 +227,7 @@ future<> run_table_tasks(replica::database& db, std::vector<table_tasks_info> ta
                // Tables will be kept in descending order.
                std::ranges::sort(table_tasks, std::greater<>(), [&] (const table_tasks_info& tti) {
                    try {
-                        return db.find_column_family(tti.ti.id).get_stats().live_disk_space_used;
+                        return db.find_column_family(tti.ti.id).get_stats().live_disk_space_used.on_disk;
                    } catch (const replica::no_such_column_family& e) {
                        return int64_t(-1);
                    }
@@ -281,7 +281,7 @@ future<> run_keyspace_tasks(replica::database& db, std::vector<keyspace_tasks_in
                    try {
                        return std::accumulate(kti.table_infos.begin(), kti.table_infos.end(), int64_t(0), [&] (int64_t sum, const table_info& t) {
                            try {
-                                sum += db.find_column_family(t.id).get_stats().live_disk_space_used;
+                                sum += db.find_column_family(t.id).get_stats().live_disk_space_used.on_disk;
                            } catch (const replica::no_such_column_family&) {
                                // ignore
                            }
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -888,9 +888,18 @@ rf_rack_valid_keyspaces: false
 #
 # Vector Store options
 #
-# A comma-separated list of URIs for the vector store using DNS name. Only HTTP schema is supported. Port number is mandatory.
-# Default is empty, which means that the vector store is not used.
+# HTTP and HTTPS schemes are supported. Port number is mandatory.
+# If both `vector_store_primary_uri` and `vector_store_secondary_uri` are unset or empty, vector search is disabled.
+#
+# A comma-separated list of primary vector store node URIs. These nodes are preferred for vector search operations.
 # vector_store_primary_uri: http://vector-store.dns.name:{port}
+#
+# A comma-separated list of secondary vector store node URIs. These nodes are used as a fallback when all primary nodes are unavailable, and are typically located in a different availability zone for high availability.
+# vector_store_secondary_uri: http://vector-store.dns.name:{port}
+#
+# Options for encrypted connections to the vector store. These options are used for HTTPS URIs in vector_store_primary_uri and vector_store_secondary_uri.
+# vector_store_encryption_options:
+#    truststore: <not set, use system trust>

 # 
 # io-streaming rate limiting
--- a/configure.py
+++ b/configure.py
@@ -445,6 +445,7 @@ ldap_tests = set([
 scylla_tests = set([
    'test/boost/combined_tests',
    'test/boost/UUID_test',
+    'test/boost/url_parse_test',
    'test/boost/advanced_rpc_compressor_test',
    'test/boost/allocation_strategy_test',
    'test/boost/alternator_unit_test',
@@ -646,6 +647,28 @@ vector_search_tests = set([
    'test/vector_search/client_test'
 ])

+vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
+vector_search_validator_deps = set([
+    'test/vector_search_validator/build-validator',
+    'test/vector_search_validator/Cargo.toml',
+    'test/vector_search_validator/crates/validator/Cargo.toml',
+    'test/vector_search_validator/crates/validator/src/main.rs',
+    'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
+    'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
+    'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
+])
+
+vector_store_bin = 'vector-search-validator/bin/vector-store'
+vector_store_deps = set([
+    'test/vector_search_validator/build-env',
+    'test/vector_search_validator/build-vector-store',
+])
+
+vector_search_validator_bins = set([
+    vector_search_validator_bin,
+    vector_store_bin,
+])
+
 wasms = set([
    'wasm/return_input.wat',
    'wasm/test_complex_null_values.wat',
@@ -679,7 +702,7 @@ other = set([
    'iotune',
 ])

-all_artifacts = apps | cpp_apps | tests | other | wasms
+all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins

 arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -763,6 +786,7 @@ arg_parser.add_argument('--use-cmake', action=argparse.BooleanOptionalAction, de
 arg_parser.add_argument('--coverage', action = 'store_true', help = 'Compile scylla with coverage instrumentation')
 arg_parser.add_argument('--build-dir', action='store', default='build',
                        help='Build directory path')
+arg_parser.add_argument('--disable-precompiled-header', action='store_true', default=False, help='Disable precompiled header for scylla binary')
 arg_parser.add_argument('-h', '--help', action='store_true', help='show this help message and exit')
 args = arg_parser.parse_args()
 if args.help:
@@ -835,6 +859,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/alien_worker.cc',
                'utils/array-search.cc',
                'utils/base64.cc',
+                'utils/crypt_sha512.cc',
                'utils/logalloc.cc',
                'utils/large_bitset.cc',
                'utils/buffer_input_stream.cc',
@@ -1038,7 +1063,6 @@ scylla_core = (['message/messaging_service.cc',
                'db/hints/resource_manager.cc',
                'db/hints/sync_point.cc',
                'db/large_data_handler.cc',
-                'db/legacy_schema_migrator.cc',
                'db/marshal/type_parser.cc',
                'db/per_partition_rate_limit_options.cc',
                'db/rate_limiter.cc',
@@ -1134,6 +1158,7 @@ scylla_core = (['message/messaging_service.cc',
                'locator/topology.cc',
                'locator/util.cc',
                'service/client_state.cc',
+                'service/client_routes.cc',
                'service/storage_service.cc',
                'service/session.cc',
                'service/task_manager_module.cc',
@@ -1172,6 +1197,7 @@ scylla_core = (['message/messaging_service.cc',
                'auth/allow_all_authorizer.cc',
                'auth/authenticated_user.cc',
                'auth/authenticator.cc',
+                'auth/cache.cc',
                'auth/common.cc',
                'auth/default_authorizer.cc',
                'auth/resource.cc',
@@ -1268,7 +1294,8 @@ scylla_core = (['message/messaging_service.cc',
                'vector_search/vector_store_client.cc',
                'vector_search/dns.cc',
                'vector_search/client.cc',
-                'vector_search/clients.cc'
+                'vector_search/clients.cc',
+                'vector_search/truststore.cc'
                ] + [Antlr3Grammar('cql3/Cql.g')] \
                  + scylla_raft_core
               )
@@ -1292,6 +1319,8 @@ api = ['api/api.cc',
       'api/storage_proxy.cc',
       Json2Code('api/api-doc/cache_service.json'),
       'api/cache_service.cc',
+       Json2Code('api/api-doc/client_routes.json'),
+       'api/client_routes.cc',
       Json2Code('api/api-doc/collectd.json'),
       'api/collectd.cc',
       Json2Code('api/api-doc/endpoint_snitch_info.json'),
@@ -1454,7 +1483,6 @@ deps = {

 pure_boost_tests = set([
    'test/boost/anchorless_list_test',
-    'test/boost/auth_passwords_test',
    'test/boost/auth_resource_test',
    'test/boost/big_decimal_test',
    'test/boost/caching_options_test',
@@ -1579,6 +1607,7 @@ deps['test/boost/combined_tests'] += [
    'test/boost/query_processor_test.cc',
    'test/boost/reader_concurrency_semaphore_test.cc',
    'test/boost/repair_test.cc',
+    'test/boost/replicator_test.cc',
    'test/boost/restrictions_test.cc',
    'test/boost/role_manager_test.cc',
    'test/boost/row_cache_test.cc',
@@ -1621,6 +1650,7 @@ deps['test/boost/bytes_ostream_test'] = [
 ]
 deps['test/boost/input_stream_test'] = ['test/boost/input_stream_test.cc']
 deps['test/boost/UUID_test'] = ['clocks-impl.cc', 'utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/dynamic_bitset.cc', 'utils/hashers.cc', 'utils/on_internal_error.cc']
+deps['test/boost/url_parse_test'] = ['utils/http.cc', 'test/boost/url_parse_test.cc', ]
 deps['test/boost/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'test/boost/murmur_hash_test.cc']
 deps['test/boost/allocation_strategy_test'] = ['test/boost/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc', 'utils/labels.cc']
 deps['test/boost/log_heap_test'] = ['test/boost/log_heap_test.cc']
@@ -2185,7 +2215,15 @@ if os.path.exists(kmipc_lib):
    user_cflags += f' -I{kmipc_dir}/include -DHAVE_KMIP'

 def get_extra_cxxflags(mode, mode_config, cxx, debuginfo):
-    cxxflags = []
+    cxxflags = [
+        # we need this flag for correct precompiled header handling in connection with ccache (or similar)
+        # `git` tools don't preserve timestamps, so when using ccache it might be possible to add pch to ccache
+        # and then later (after for example rebase) get `stdafx.hh` with different timestamp, but the same content.
+        # this will tell ccache to bring pch from its cache. Later on clang will check if timestamps match and complain.
+        # Adding `-fpch-validate-input-files-content` tells clang to check content of stdafx.hh if timestamps don't match.
+        # The flag seems to be present in gcc as well.
+        "" if args.disable_precompiled_header else '-fpch-validate-input-files-content'
+    ]

    optimization_level = mode_config['optimization-level']
    cxxflags.append(f'-O{optimization_level}')
@@ -2250,6 +2288,7 @@ def write_build_file(f,
                     scylla_version,
                     scylla_release,
                     args):
+    use_precompiled_header = not args.disable_precompiled_header
    warnings = get_warning_options(args.cxx)
    rustc_target = pick_rustc_target('wasm32-wasi', 'wasm32-wasip1')
    f.write(textwrap.dedent('''\
@@ -2356,7 +2395,10 @@ def write_build_file(f,

    for mode in build_modes:
        modeval = modes[mode]
-
+        seastar_lib_ext = 'so' if modeval['build_seastar_shared_libs'] else 'a'
+        seastar_dep = f'$builddir/{mode}/seastar/libseastar.{seastar_lib_ext}'
+        seastar_testing_dep = f'$builddir/{mode}/seastar/libseastar_testing.{seastar_lib_ext}'
+        abseil_dep = ' '.join(f'$builddir/{mode}/abseil/{lib}' for lib in abseil_libs)
        fmt_lib = 'fmt'
        f.write(textwrap.dedent('''\
            cxx_ld_flags_{mode} = {cxx_ld_flags}
@@ -2369,6 +2411,14 @@ def write_build_file(f,
              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags_{mode} $cxxflags $obj_cxxflags -c -o $out $in
              description = CXX $out
              depfile = $out.d
+            rule cxx_build_precompiled_header.{mode}
+              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags_{mode} $cxxflags $obj_cxxflags -c -o $out $in -Winvalid-pch -fpch-instantiate-templates -Xclang -emit-pch -DSCYLLA_USE_PRECOMPILED_HEADER
+              description = CXX-PRECOMPILED-HEADER $out
+              depfile = $out.d
+            rule cxx_with_pch.{mode}
+              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags_{mode} $cxxflags $obj_cxxflags -c -o $out $in -Winvalid-pch -Xclang -include-pch -Xclang $builddir/{mode}/stdafx.hh.pch
+              description = CXX $out
+              depfile = $out.d
            rule link.{mode}
              command = $cxx  $ld_flags_{mode} $ldflags -o $out $in $libs $libs_{mode}
              description = LINK $out
@@ -2402,7 +2452,7 @@ def write_build_file(f,
                        $builddir/{mode}/gen/${{stem}}Parser.cpp
                description = ANTLR3 $in
            rule checkhh.{mode}
-              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags --include $in -c -o $out $builddir/{mode}/gen/empty.cc
+              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags --include $in -c -o $out $builddir/{mode}/gen/empty.cc -USCYLLA_USE_PRECOMPILED_HEADER
              description = CHECKHH $in
              depfile = $out.d
            rule test.{mode}
@@ -2416,10 +2466,11 @@ def write_build_file(f,
              description = RUST_LIB $out
            ''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, **modeval))
        f.write(
-            'build {mode}-build: phony {artifacts} {wasms}\n'.format(
+            'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
                mode=mode,
-                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
+                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
                wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
+                vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
            )
        )
        if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2428,6 +2479,7 @@ def write_build_file(f,
        include_dist_target = f'dist-{mode}' if args.enable_dist is None or args.enable_dist else ''
        f.write(f'build {mode}: phony {include_cxx_target} {include_dist_target}\n')
        compiles = {}
+        compiles_with_pch = set()
        swaggers = set()
        serializers = {}
        ragels = {}
@@ -2442,16 +2494,16 @@ def write_build_file(f,
        # object code. And we enable LTO when linking the main Scylla executable, while disable
        # it when linking anything else.

-        seastar_lib_ext = 'so' if modeval['build_seastar_shared_libs'] else 'a'
        for binary in sorted(build_artifacts):
            if modeval['is_profile'] and binary != "scylla":
                # Just to avoid clutter in build.ninja
                continue
            profile_dep = modes[mode].get('profile_target', "")

-            if binary in other or binary in wasms:
+            if binary in other or binary in wasms or binary in vector_search_validator_bins:
                continue
            srcs = deps[binary]
+            # 'scylla'
            objs = ['$builddir/' + mode + '/' + src.replace('.cc', '.o')
                    for src in srcs
                    if src.endswith('.cc')]
@@ -2487,9 +2539,6 @@ def write_build_file(f,
                continue

            do_lto = modes[mode]['has_lto'] and binary in lto_binaries
-            seastar_dep = f'$builddir/{mode}/seastar/libseastar.{seastar_lib_ext}'
-            seastar_testing_dep = f'$builddir/{mode}/seastar/libseastar_testing.{seastar_lib_ext}'
-            abseil_dep = ' '.join(f'$builddir/{mode}/abseil/{lib}' for lib in abseil_libs)
            seastar_testing_libs = f'$seastar_testing_libs_{mode}'

            local_libs = f'$seastar_libs_{mode} $libs'
@@ -2499,6 +2548,7 @@ def write_build_file(f,
                local_libs += ' -flto=thin -ffat-lto-objects'
            else:
                local_libs += ' -fno-lto'
+            use_pch = use_precompiled_header and binary == 'scylla'
            if binary in tests:
                if binary in pure_boost_tests:
                    local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
@@ -2527,6 +2577,8 @@ def write_build_file(f,
                if src.endswith('.cc'):
                    obj = '$builddir/' + mode + '/' + src.replace('.cc', '.o')
                    compiles[obj] = src
+                    if use_pch:
+                        compiles_with_pch.add(obj)
                elif src.endswith('.idl.hh'):
                    hh = '$builddir/' + mode + '/gen/' + src.replace('.idl.hh', '.dist.hh')
                    serializers[hh] = src
@@ -2559,10 +2611,11 @@ def write_build_file(f,
        )

        f.write(
-            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
+            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
                mode=mode,
                test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
                wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
+                vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
            )
        )
        f.write(
@@ -2605,7 +2658,9 @@ def write_build_file(f,
            src = compiles[obj]
            seastar_dep = f'$builddir/{mode}/seastar/libseastar.{seastar_lib_ext}'
            abseil_dep = ' '.join(f'$builddir/{mode}/abseil/{lib}' for lib in abseil_libs)
-            f.write(f'build {obj}: cxx.{mode} {src} | {profile_dep} || {seastar_dep} {abseil_dep} {gen_headers_dep}\n')
+            pch_dep = f'$builddir/{mode}/stdafx.hh.pch' if obj in compiles_with_pch else ''
+            cxx_cmd = 'cxx_with_pch' if obj in compiles_with_pch else 'cxx'
+            f.write(f'build {obj}: {cxx_cmd}.{mode} {src} | {profile_dep} {seastar_dep} {abseil_dep} {gen_headers_dep} {pch_dep}\n')
            if src in modeval['per_src_extra_cxxflags']:
                f.write('    cxxflags = {seastar_cflags} $cxxflags $cxxflags_{mode} {extra_cxxflags}\n'.format(mode=mode, extra_cxxflags=modeval["per_src_extra_cxxflags"][src], **modeval))
        for swagger in swaggers:
@@ -2666,6 +2721,8 @@ def write_build_file(f,
            f.write('  target = {lib}\n'.format(**locals()))
            f.write('  profile_dep = {profile_dep}\n'.format(**locals()))

+        f.write(f'build $builddir/{mode}/stdafx.hh.pch: cxx_build_precompiled_header.{mode} stdafx.hh | {profile_dep} {seastar_dep} {abseil_dep} {gen_headers_dep}\n')
+
        f.write('build $builddir/{mode}/seastar/apps/iotune/iotune: ninja $builddir/{mode}/seastar/build.ninja | $builddir/{mode}/seastar/libseastar.{seastar_lib_ext}\n'
                .format(**locals()))
        f.write('  pool = submodule_pool\n')
@@ -2729,6 +2786,19 @@ def write_build_file(f,
            'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
    )

+    f.write(textwrap.dedent(f'''\
+        rule build-vector-search-validator
+            command = test/vector_search_validator/build-validator $builddir
+        rule build-vector-store
+            command = test/vector_search_validator/build-vector-store $builddir
+        '''))
+    f.write(
+            'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
+    )
+    f.write(
+            'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
+    )
+
    f.write(textwrap.dedent(f'''\
        build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
        build dist-unified: phony dist-unified-tar
@@ -2942,7 +3012,7 @@ def configure_using_cmake(args):
        'CMAKE_DEFAULT_CONFIGS': selected_configs,
        'CMAKE_C_COMPILER': args.cc,
        'CMAKE_CXX_COMPILER': args.cxx,
-        'CMAKE_CXX_FLAGS': args.user_cflags,
+        'CMAKE_CXX_FLAGS': args.user_cflags + ("" if args.disable_precompiled_header else " -fpch-validate-input-files-content"),
        'CMAKE_EXE_LINKER_FLAGS': args.user_ldflags,
        'CMAKE_EXPORT_COMPILE_COMMANDS': 'ON',
        'Scylla_CHECK_HEADERS': 'ON',
@@ -2951,6 +3021,7 @@ def configure_using_cmake(args):
        'Scylla_TEST_REPEAT': args.test_repeat,
        'Scylla_ENABLE_LTO': 'ON' if args.lto else 'OFF',
        'Scylla_WITH_DEBUG_INFO' : 'ON' if args.debuginfo else 'OFF',
+        'Scylla_USE_PRECOMPILED_HEADER': 'OFF' if args.disable_precompiled_header else 'ON',
    }
    if args.date_stamp:
        settings['Scylla_DATE_STAMP'] = args.date_stamp
--- a/cql3/CMakeLists.txt
+++ b/cql3/CMakeLists.txt
@@ -138,5 +138,8 @@ target_link_libraries(cql3
    lang
    transport)

+if (Scylla_USE_PRECOMPILED_HEADER_USE) # only wire up the precompiled header when this variable is true
+  target_precompile_headers(cql3 REUSE_FROM scylla-precompiled-header) # REUSE_FROM: cql3 consumes the precompiled header already produced for the scylla-precompiled-header target instead of building its own
+endif()
 check_headers(check-headers cql3
  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -575,6 +575,15 @@ usingTimeoutServiceLevelClauseObjective[std::unique_ptr<cql3::attributes::raw>&
    | serviceLevel sl_name=serviceLevelOrRoleName { attrs->service_level = std::move(sl_name); }
    ;

+usingTimeoutConcurrencyClause[std::unique_ptr<cql3::attributes::raw>& attrs] // USING clause whose objectives are limited to TIMEOUT and CONCURRENCY
+    : K_USING usingTimeoutConcurrencyClauseObjective[attrs] ( K_AND usingTimeoutConcurrencyClauseObjective[attrs] )* // one objective, then any number of further objectives joined by AND
+    ;
+
+usingTimeoutConcurrencyClauseObjective[std::unique_ptr<cql3::attributes::raw>& attrs] // a single objective; writes its term into attrs
+    : K_TIMEOUT to=term { attrs->timeout = std::move(to); } // plain assignment: a repeated TIMEOUT objective overwrites the earlier term
+    | K_CONCURRENCY c=term { attrs->concurrency = std::move(c); } // plain assignment: a repeated CONCURRENCY objective overwrites the earlier term
+    ;
+
 /**
 * UPDATE <CF>
 * USING TIMESTAMP <long>
@@ -666,7 +675,7 @@ pruneMaterializedViewStatement returns [std::unique_ptr<raw::select_statement> e
        auto attrs = std::make_unique<cql3::attributes::raw>();
        expression wclause = conjunction{};
    }
-	: K_PRUNE K_MATERIALIZED K_VIEW cf=columnFamilyName (K_WHERE w=whereClause { wclause = std::move(w); } )? ( usingClause[attrs] )?
+	: K_PRUNE K_MATERIALIZED K_VIEW cf=columnFamilyName (K_WHERE w=whereClause { wclause = std::move(w); } )? ( usingTimeoutConcurrencyClause[attrs] )?
 	  {
 	        auto params = make_lw_shared<raw::select_statement::parameters>(std::move(orderings), is_distinct, allow_filtering, statement_subtype, bypass_cache);
 	        return std::make_unique<raw::select_statement>(std::move(cf), std::move(params),
@@ -1560,6 +1569,10 @@ serviceLevelOrRoleName returns [sstring name]
 | t=QUOTED_NAME        { $name = sstring($t.text); }
 | k=unreserved_keyword { $name = k;
 						 std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
+// The literal `default` will not be parsed by any of the previous
+// rules, so we need to cover it manually. Needed by CREATE SERVICE
+// LEVEL and ATTACH SERVICE LEVEL.
+| t=K_DEFAULT          { $name = sstring("default"); }
 | QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
 ;

@@ -2366,6 +2379,7 @@ K_LIKE:        L I K E;

 K_TIMEOUT:     T I M E O U T;
 K_PRUNE:       P R U N E;
+K_CONCURRENCY: C O N C U R R E N C Y;

 K_EXECUTE:     E X E C U T E;

--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -20,19 +20,21 @@
 namespace cql3 {

 std::unique_ptr<attributes> attributes::none() {
-    return std::unique_ptr<attributes>{new attributes{{}, {}, {}, {}}};
+    return std::unique_ptr<attributes>{new attributes{{}, {}, {}, {}, {}}}; // one empty initializer per constructor argument: timestamp, time_to_live, timeout, service_level, concurrency
 }

 attributes::attributes(std::optional<cql3::expr::expression>&& timestamp,
                        std::optional<cql3::expr::expression>&& time_to_live,
                        std::optional<cql3::expr::expression>&& timeout,
-                       std::optional<sstring> service_level)
+                       std::optional<sstring> service_level,
+                       std::optional<cql3::expr::expression>&& concurrency) // optional concurrency expression; moved into _concurrency below
     : _timestamp_unset_guard(timestamp)
     , _timestamp{std::move(timestamp)}
     , _time_to_live_unset_guard(time_to_live)
     , _time_to_live{std::move(time_to_live)}
     , _timeout{std::move(timeout)}
     , _service_level(std::move(service_level))
+    , _concurrency{std::move(concurrency)}
 { }

 bool attributes::is_timestamp_set() const {
@@ -51,6 +53,10 @@ bool attributes::is_service_level_set() const {
    return bool(_service_level);
 }

+bool attributes::is_concurrency_set() const { // Reports whether _concurrency holds an expression.
+    return bool(_concurrency);
+}
+
 int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    if (!_timestamp.has_value() || _timestamp_unset_guard.is_unset(options)) {
        return now;
@@ -123,6 +129,27 @@ qos::service_level_options attributes::get_service_level(qos::service_level_cont
    return sl_controller.get_service_level(sl_name).slo;
 }

+std::optional<int32_t> attributes::get_concurrency(const query_options& options) const { // Evaluates the concurrency expression against `options`; throws invalid_request_exception for null, malformed or non-positive values.
+    if (!_concurrency.has_value()) {
+        return std::nullopt; // no concurrency expression was supplied
+    }
+
+    cql3::raw_value concurrency_raw = expr::evaluate(*_concurrency, options);
+    if (concurrency_raw.is_null()) { // a value that evaluates to null is rejected, not treated as "absent"
+        throw exceptions::invalid_request_exception("Invalid null value of concurrency");
+    }
+    int32_t concurrency;
+    try {
+        concurrency = concurrency_raw.view().validate_and_deserialize<int32_t>(*int32_type);
+    } catch (marshal_exception& e) { // a marshalling failure is re-reported as an invalid request
+        throw exceptions::invalid_request_exception("Invalid concurrency value");
+    }
+    if (concurrency <= 0) { // zero and negative values are rejected
+        throw exceptions::invalid_request_exception("Concurrency must be a positive integer");
+    }
+    return concurrency;
+}
+
 void attributes::fill_prepare_context(prepare_context& ctx) {
    if (_timestamp.has_value()) {
        expr::fill_prepare_context(*_timestamp, ctx);
@@ -133,10 +160,13 @@ void attributes::fill_prepare_context(prepare_context& ctx) {
    if (_timeout.has_value()) {
        expr::fill_prepare_context(*_timeout, ctx);
    }
+    if (_concurrency.has_value()) {
+        expr::fill_prepare_context(*_concurrency, ctx);
+    }
 }

 std::unique_ptr<attributes> attributes::raw::prepare(data_dictionary::database db, const sstring& ks_name, const sstring& cf_name) const {
-    std::optional<expr::expression> ts, ttl, to;
+    std::optional<expr::expression> ts, ttl, to, conc;

    if (timestamp.has_value()) {
        ts = prepare_expression(*timestamp, db, ks_name, nullptr, timestamp_receiver(ks_name, cf_name));
@@ -153,7 +183,12 @@ std::unique_ptr<attributes> attributes::raw::prepare(data_dictionary::database d
        verify_no_aggregate_functions(*timeout, "USING clause");
    }

-    return std::unique_ptr<attributes>{new attributes{std::move(ts), std::move(ttl), std::move(to), std::move(service_level)}};
+    if (concurrency.has_value()) {
+        conc = prepare_expression(*concurrency, db, ks_name, nullptr, concurrency_receiver(ks_name, cf_name));
+        verify_no_aggregate_functions(*concurrency, "USING clause");
+    }
+
+    return std::unique_ptr<attributes>{new attributes{std::move(ts), std::move(ttl), std::move(to), std::move(service_level), std::move(conc)}};
 }

 lw_shared_ptr<column_specification> attributes::raw::timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const {
@@ -168,4 +203,8 @@ lw_shared_ptr<column_specification> attributes::raw::timeout_receiver(const sstr
    return make_lw_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[timeout]", true), duration_type);
 }

+lw_shared_ptr<column_specification> attributes::raw::concurrency_receiver(const sstring& ks_name, const sstring& cf_name) const { // Builds the column specification named "[concurrency]" with the int32_t data type for the given keyspace/table names.
+    return make_lw_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[concurrency]", true), data_type_for<int32_t>());
+}
+
 }
--- a/cql3/attributes.hh
+++ b/cql3/attributes.hh
@@ -36,13 +36,15 @@ private:
    std::optional<cql3::expr::expression> _time_to_live;
    std::optional<cql3::expr::expression> _timeout;
    std::optional<sstring> _service_level;
+    std::optional<cql3::expr::expression> _concurrency;
 public:
    static std::unique_ptr<attributes> none();
 private:
    attributes(std::optional<cql3::expr::expression>&& timestamp,
               std::optional<cql3::expr::expression>&& time_to_live,
               std::optional<cql3::expr::expression>&& timeout,
-               std::optional<sstring> service_level);
+               std::optional<sstring> service_level,
+               std::optional<cql3::expr::expression>&& concurrency);
 public:
    bool is_timestamp_set() const;

@@ -52,6 +54,8 @@ public:

    bool is_service_level_set() const;

+    bool is_concurrency_set() const;
+
    int64_t get_timestamp(int64_t now, const query_options& options);

    std::optional<int32_t> get_time_to_live(const query_options& options);
@@ -60,6 +64,8 @@ public:

    qos::service_level_options get_service_level(qos::service_level_controller& sl_controller) const;

+    std::optional<int32_t> get_concurrency(const query_options& options) const;
+
    void fill_prepare_context(prepare_context& ctx);

    class raw final {
@@ -68,6 +74,7 @@ public:
        std::optional<cql3::expr::expression> time_to_live;
        std::optional<cql3::expr::expression> timeout;
        std::optional<sstring> service_level;
+        std::optional<cql3::expr::expression> concurrency;

        std::unique_ptr<attributes> prepare(data_dictionary::database db, const sstring& ks_name, const sstring& cf_name) const;
    private:
@@ -76,6 +83,8 @@ public:
        lw_shared_ptr<column_specification> time_to_live_receiver(const sstring& ks_name, const sstring& cf_name) const;

        lw_shared_ptr<column_specification> timeout_receiver(const sstring& ks_name, const sstring& cf_name) const;
+
+        lw_shared_ptr<column_specification> concurrency_receiver(const sstring& ks_name, const sstring& cf_name) const;
    };
 };

--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -64,6 +64,10 @@ bool query_processor::topology_global_queue_empty() {
    return remote().first.get().ss.topology_global_queue_empty();
 }

+future<bool> query_processor::ongoing_rf_change(const service::group0_guard& guard, sstring ks) { // Thin forwarder: returns whatever ss.ongoing_rf_change() yields for `guard` and keyspace `ks`.
+    return remote().first.get().ss.ongoing_rf_change(guard, std::move(ks)); // `ks` is moved into the callee; `ss` is reached through remote()
+}
+
 static service::query_state query_state_for_internal_call() {
    return {service::client_state::for_internal_calls(), empty_service_permit()};
 }
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -474,6 +474,7 @@ public:
    void reset_cache();

    bool topology_global_queue_empty();
+    future<bool> ongoing_rf_change(const service::group0_guard& guard, sstring ks);

    query_options make_internal_options(
            const statements::prepared_statement::checked_weak_ptr& p,
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -1322,6 +1322,10 @@ const std::vector<expr::expression>& statement_restrictions::index_restrictions(
    return _index_restrictions;
 }

+bool statement_restrictions::is_empty() const { // True exactly when _where holds no value.
+    return !_where.has_value();
+}
+
 // Current score table:
 // local and restrictions include full partition key: 2
 // global: 1
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -408,6 +408,8 @@ public:

    /// Checks that the primary key restrictions don't contain null values, throws invalid_request_exception otherwise.
    void validate_primary_key(const query_options& options) const;
+
+    bool is_empty() const;
 };

 statement_restrictions analyze_statement_restrictions(
--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -19,6 +19,7 @@
 #include "locator/abstract_replication_strategy.hh"
 #include "mutation/canonical_mutation.hh"
 #include "prepared_statement.hh"
+#include "seastar/coroutine/exception.hh"
 #include "service/migration_manager.hh"
 #include "service/storage_proxy.hh"
 #include "service/topology_mutation.hh"
@@ -138,6 +139,7 @@ bool cql3::statements::alter_keyspace_statement::changes_tablets(query_processor
 future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>
 cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_processor& qp, service::query_state& state, const query_options& options, service::group0_batch& mc) const {
    using namespace cql_transport;
+    bool unknown_keyspace = false;
    try {
        event::schema_change::target_type target_type = event::schema_change::target_type::KEYSPACE;
        auto ks = qp.db().find_keyspace(_name);
@@ -158,8 +160,12 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
        //       when in reality nothing or only schema is being changed
        if (changes_tablets(qp)) {
            if (!qp.proxy().features().topology_global_request_queue && !qp.topology_global_queue_empty()) {
-                return make_exception_future<std::tuple<::shared_ptr<::cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(
-                        exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
+                co_await coroutine::return_exception(
+                    exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
+            }
+            if (qp.proxy().features().rack_list_rf && co_await qp.ongoing_rf_change(mc.guard(),_name)) {
+                co_await coroutine::return_exception(
+                        exceptions::invalid_request_exception(format("Another RF change for this keyspace {} ongoing, please retry.", _name)));
            }
            qp.db().real_database().validate_keyspace_update(*ks_md_update);

@@ -242,10 +248,15 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
                target_type,
                keyspace());
        mc.add_mutations(std::move(muts), "CQL alter keyspace");
-        return make_ready_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(std::make_tuple(std::move(ret), warnings));
+        co_return std::make_tuple(std::move(ret), warnings);
    } catch (data_dictionary::no_such_keyspace& e) {
-        return make_exception_future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(exceptions::invalid_request_exception("Unknown keyspace " + _name));
+        unknown_keyspace = true;
    }
+    if (unknown_keyspace) {
+        co_await coroutine::return_exception(
+                exceptions::invalid_request_exception("Unknown keyspace " + _name));
+    }
+    std::unreachable();
 }

 std::unique_ptr<cql3::statements::prepared_statement>
--- a/cql3/statements/alter_service_level_statement.cc
+++ b/cql3/statements/alter_service_level_statement.cc
@@ -37,6 +37,12 @@ future<::shared_ptr<cql_transport::messages::result_message>>
 alter_service_level_statement::execute(query_processor& qp,
        service::query_state &state,
        const query_options &, std::optional<service::group0_guard> guard) const {
+    if (_service_level == qos::service_level_controller::default_service_level_name) {
+        sstring reason = seastar::format("The default service level, {}, cannot be altered",
+                qos::service_level_controller::default_service_level_name);
+        throw exceptions::invalid_request_exception(std::move(reason));
+    }
+
    service::group0_batch mc{std::move(guard)};
    validate_shares_option(qp, _slo);
    qos::service_level& sl = state.get_service_level_controller().get_service_level(_service_level);
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -422,7 +422,14 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
                throw exceptions::invalid_request_exception(format("The synchronous_updates option is only applicable to materialized views, not to base tables"));
            }

-            _properties->apply_to_builder(cfm, std::move(schema_extensions), db, keyspace());
+            if (is_cdc_log_table) {
+                auto gc_opts = _properties->get_tombstone_gc_options(schema_extensions);
+                if (gc_opts && gc_opts->mode() == tombstone_gc_mode::repair) {
+                    throw exceptions::invalid_request_exception("The 'repair' mode for tombstone_gc is not allowed on CDC log tables.");
+                }
+            }
+
+            _properties->apply_to_builder(cfm, std::move(schema_extensions), db, keyspace(), !is_cdc_log_table);
        }
        break;

--- a/cql3/statements/alter_view_statement.cc
+++ b/cql3/statements/alter_view_statement.cc
@@ -55,8 +55,29 @@ view_ptr alter_view_statement::prepare_view(data_dictionary::database db) const
    auto schema_extensions = _properties->make_schema_extensions(db.extensions());
    _properties->validate(db, keyspace(), schema_extensions);

+    bool is_colocated = [&] {
+        if (!db.find_keyspace(keyspace()).get_replication_strategy().uses_tablets()) {
+            return false;
+        }
+        auto base_schema = db.find_schema(schema->view_info()->base_id());
+        if (!base_schema) {
+            return false;
+        }
+        return std::ranges::equal(
+            schema->partition_key_columns(),
+            base_schema->partition_key_columns(),
+            [](const column_definition& a, const column_definition& b) { return a.name() == b.name(); });
+    }();
+
+    if (is_colocated) {
+        auto gc_opts = _properties->get_tombstone_gc_options(schema_extensions);
+        if (gc_opts && gc_opts->mode() == tombstone_gc_mode::repair) {
+            throw exceptions::invalid_request_exception("The 'repair' mode for tombstone_gc is not allowed on co-located materialized view tables.");
+        }
+    }
+
    auto builder = schema_builder(schema);
-    _properties->apply_to_builder(builder, std::move(schema_extensions), db, keyspace());
+    _properties->apply_to_builder(builder, std::move(schema_extensions), db, keyspace(), !is_colocated);

    if (builder.get_gc_grace_seconds() == 0) {
        throw exceptions::invalid_request_exception(
--- a/cql3/statements/attach_service_level_statement.cc
+++ b/cql3/statements/attach_service_level_statement.cc
@@ -43,6 +43,14 @@ attach_service_level_statement::execute(query_processor& qp,
        service::query_state &state,
        const query_options &,
        std::optional<service::group0_guard> guard) const {
+    if (_service_level == qos::service_level_controller::default_service_level_name) {
+        sstring reason = seastar::format("The default service level, {}, cannot be "
+                "attached to a role. If you want to detach an attached service level, "
+                "use the DETACH SERVICE LEVEL statement",
+                qos::service_level_controller::default_service_level_name);
+        throw exceptions::invalid_request_exception(std::move(reason));
+    }
+
    auto sli = co_await state.get_service_level_controller().get_distributed_service_level(_service_level);
    if (sli.empty()) {
        throw qos::nonexistant_service_level_exception(_service_level);
--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -331,7 +331,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
    if (!cl_for_paxos) [[unlikely]] {
        return make_exception_future<shared_ptr<cql_transport::messages::result_message>>(std::move(cl_for_paxos).assume_error());
    }
-    seastar::shared_ptr<cas_request> request;
+    std::unique_ptr<cas_request> request;
    schema_ptr schema;

    db::timeout_clock::time_point now = db::timeout_clock::now();
@@ -354,9 +354,9 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
        if (keys.empty()) {
            continue;
        }
-        if (request.get() == nullptr) {
+        if (!request) {
            schema = statement.s;
-            request = seastar::make_shared<cas_request>(schema, std::move(keys));
+            request = std::make_unique<cas_request>(schema, std::move(keys));
        } else if (keys.size() != 1 || keys.front().equal(request->key().front(), dht::ring_position_comparator(*schema)) == false) {
            throw exceptions::invalid_request_exception("BATCH with conditions cannot span multiple partitions");
        }
@@ -366,7 +366,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe

        request->add_row_update(statement, std::move(ranges), std::move(json_cache), statement_options);
    }
-    if (request.get() == nullptr) {
+    if (!request) {
        throw exceptions::invalid_request_exception(format("Unrestricted partition key in a conditional BATCH"));
    }

@@ -377,9 +377,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
            );
    }

-    return qp.proxy().cas(schema, std::move(cas_shard), request, request->read_command(qp), request->key(),
+    auto* request_ptr = request.get();
+    return qp.proxy().cas(schema, std::move(cas_shard), *request_ptr, request->read_command(qp), request->key(),
            {read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()},
-            std::move(cl_for_paxos).assume_value(), cl_for_learn, batch_timeout, cas_timeout).then([this, request] (bool is_applied) {
+            std::move(cl_for_paxos).assume_value(), cl_for_learn, batch_timeout, cas_timeout).then([this, request = std::move(request)] (bool is_applied) {
        return request->build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied);
    });
 }
--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -293,7 +293,7 @@ std::optional<db::tablet_options::map_type> cf_prop_defs::get_tablet_options() c
    return std::nullopt;
 }

-void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name) const {
+void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name, bool supports_repair) const {
    if (has_property(KW_COMMENT)) {
        builder.set_comment(get_string(KW_COMMENT, ""));
    }
@@ -379,7 +379,7 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
    }
    // Set default tombstone_gc mode.
    if (!schema_extensions.contains(tombstone_gc_extension::NAME)) {
-        auto ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(db, ks_name));
+        auto ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(db, ks_name, supports_repair));
        schema_extensions.emplace(tombstone_gc_extension::NAME, std::move(ext));
    }
    builder.set_extensions(std::move(schema_extensions));
--- a/cql3/statements/cf_prop_defs.hh
+++ b/cql3/statements/cf_prop_defs.hh
@@ -110,7 +110,7 @@ public:
    bool get_synchronous_updates_flag() const;
    std::optional<db::tablet_options::map_type> get_tablet_options() const;

-    void apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name) const;
+    void apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name, bool supports_repair) const;
    void validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const;
 };

--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -201,7 +201,14 @@ view_ptr create_index_statement::create_view_for_index(const schema_ptr schema,
        "";
    builder.with_view_info(schema, false, where_clause);

-    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(db, schema->ks_name()));
+    bool is_colocated = [&] {
+        if (!db.find_keyspace(keyspace()).get_replication_strategy().uses_tablets()) {
+            return false;
+        }
+        return im.local();
+    }();
+
+    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(db, schema->ks_name(), !is_colocated));
    builder.add_extension(tombstone_gc_extension::NAME, std::move(tombstone_gc_ext));

    // A local secondary index should be backed by a *synchronous* view,
@@ -272,11 +279,15 @@ std::vector<::shared_ptr<index_target>> create_index_statement::validate_while_e
        throw exceptions::invalid_request_exception(format("index names shouldn't be more than {:d} characters long (got \"{}\")", schema::NAME_LENGTH, _index_name.c_str()));
    }

-    try {
-        db::view::validate_view_keyspace(db, keyspace());
-    } catch (const std::exception& e) {
-        // The type of the thrown exception is not specified, so we need to wrap it here.
-        throw exceptions::invalid_request_exception(e.what());
+    // Regular secondary indexes require rf-rack-validity.
+    // Custom indexes need to validate this property themselves, if they need it.
+    if (!_properties || !_properties->custom_class) {
+        try {
+            db::view::validate_view_keyspace(db, keyspace());
+        } catch (const std::exception& e) {
+            // The type of the thrown exception is not specified, so we need to wrap it here.
+            throw exceptions::invalid_request_exception(e.what());
+        }
    }

    validate_for_local_index(*schema);
@@ -292,7 +303,7 @@ std::vector<::shared_ptr<index_target>> create_index_statement::validate_while_e
            throw exceptions::invalid_request_exception(format("Non-supported custom class \'{}\' provided", *(_properties->custom_class)));
        }
        auto custom_index = (*custom_index_factory)();
-        custom_index->validate(*schema, *_properties, targets, db.features());
+        custom_index->validate(*schema, *_properties, targets, db.features(), db);
        _properties->index_version = custom_index->index_version(*schema);
    }

--- a/cql3/statements/create_service_level_statement.cc
+++ b/cql3/statements/create_service_level_statement.cc
@@ -45,6 +45,12 @@ create_service_level_statement::execute(query_processor& qp,
        throw exceptions::invalid_request_exception("Names starting with '$' are reserved for internal tenants. Use a different name.");
    }

+    if (_service_level == qos::service_level_controller::default_service_level_name) {
+        sstring reason = seastar::format("The default service level, {}, already exists "
+                "and cannot be created", qos::service_level_controller::default_service_level_name);
+        throw exceptions::invalid_request_exception(std::move(reason));
+    }
+
    service::group0_batch mc{std::move(guard)};
    validate_shares_option(qp, _slo);
    
--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -128,7 +128,7 @@ void create_table_statement::apply_properties_to(schema_builder& builder, const
        builder.set_compressor_params(db.get_config().sstable_compression_user_table_options());
    }

-    _properties->apply_to_builder(builder, _properties->make_schema_extensions(db.extensions()), db, keyspace());
+    _properties->apply_to_builder(builder, _properties->make_schema_extensions(db.extensions()), db, keyspace(), true);
 }

 void create_table_statement::add_column_metadata_from_aliases(schema_builder& builder, std::vector<bytes> aliases, const std::vector<data_type>& types, column_kind kind) const
--- a/cql3/statements/create_view_statement.cc
+++ b/cql3/statements/create_view_statement.cc
@@ -373,7 +373,30 @@ std::pair<view_ptr, cql3::cql_warnings_vec> create_view_statement::prepare_view(
            db::view::create_virtual_column(builder, def->name(), def->type);
        }
    }
-    _properties.properties()->apply_to_builder(builder, std::move(schema_extensions), db, keyspace());
+
+    bool is_colocated = [&] {
+        if (!db.find_keyspace(keyspace()).get_replication_strategy().uses_tablets()) {
+            return false;
+        }
+        if (target_partition_keys.size() != schema->partition_key_columns().size()) {
+            return false;
+        }
+        for (size_t i = 0; i < target_partition_keys.size(); ++i) {
+            if (target_partition_keys[i] != &schema->partition_key_columns()[i]) {
+                return false;
+            }
+        }
+        return true;
+    }();
+
+    if (is_colocated) {
+        auto gc_opts = _properties.properties()->get_tombstone_gc_options(schema_extensions);
+        if (gc_opts && gc_opts->mode() == tombstone_gc_mode::repair) {
+            throw exceptions::invalid_request_exception("The 'repair' mode for tombstone_gc is not allowed on co-located materialized view tables.");
+        }
+    }
+
+    _properties.properties()->apply_to_builder(builder, std::move(schema_extensions), db, keyspace(), !is_colocated);

    if (builder.default_time_to_live().count() > 0) {
        throw exceptions::invalid_request_exception(
--- a/cql3/statements/drop_service_level_statement.cc
+++ b/cql3/statements/drop_service_level_statement.cc
@@ -34,6 +34,11 @@ drop_service_level_statement::execute(query_processor& qp,
        service::query_state &state,
        const query_options &,
        std::optional<service::group0_guard> guard) const {
+    if (_service_level == qos::service_level_controller::default_service_level_name) {
+        sstring reason = seastar::format("The default service level, {}, cannot be dropped",
+                qos::service_level_controller::default_service_level_name);
+        throw exceptions::invalid_request_exception(std::move(reason));
+    }
    service::group0_batch mc{std::move(guard)};
    auto& sl = state.get_service_level_controller();
    co_await sl.drop_distributed_service_level(_service_level, _if_exists, mc);
--- a/cql3/statements/ks_prop_defs.cc
+++ b/cql3/statements/ks_prop_defs.cc
@@ -8,6 +8,7 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

+#include "seastar/core/format.hh"
 #include "seastar/core/sstring.hh"
 #include "utils/assert.hh"
 #include "cql3/statements/ks_prop_defs.hh"
@@ -60,7 +61,7 @@ expand_to_racks(const locator::token_metadata& tm,

    // Handle ALTER:
    // ([]|0) -> numeric is allowed, there are no existing replicas
-    // numeric -> numeric' is not supported. User should convert RF to rack list of equal count first.
+    // numeric -> numeric' is not supported unless numeric == numeric'. User should convert RF to rack list of equal count first.
    // rack_list -> len(rack_list) is allowed (no-op)
    // rack_list -> numeric is not allowed
    if (old_options.contains(dc)) {
@@ -74,6 +75,8 @@ expand_to_racks(const locator::token_metadata& tm,
                        "Cannot change replication factor for '{}' from {} to numeric {}, use rack list instead",
                        dc, old_rf_val, data.count()));
            }
+        } else if (old_rf.count() == data.count()) {
+            return rf;
        } else if (old_rf.count() > 0) {
            throw exceptions::configuration_exception(fmt::format(
                    "Cannot change replication factor for '{}' from {} to {}, only rack list is allowed",
@@ -113,6 +116,17 @@ static locator::replication_strategy_config_options prepare_options(
        return options;
    }

+    if (uses_tablets) {
+        for (const auto& opt: old_options) {
+            if (opt.first == ks_prop_defs::REPLICATION_FACTOR_KEY) {
+                on_internal_error(logger, format("prepare_options: old_options contains invalid key '{}'", ks_prop_defs::REPLICATION_FACTOR_KEY));
+            }
+            if (!options.contains(opt.first)) {
+                throw exceptions::configuration_exception(fmt::format("Attempted to implicitly drop replicas in datacenter {}. If this is the desired behavior, set replication factor to 0 in {} explicitly.", opt.first, opt.first));
+            }
+        }
+    }
+
    // For users' convenience, expand the 'replication_factor' option into a replication factor for each DC.
    // If the user simply switches from another strategy without providing any options,
    // but the other strategy used the 'replication_factor' option, it will also be expanded.
@@ -141,6 +155,8 @@ static locator::replication_strategy_config_options prepare_options(
    }

    // Validate options.
+    bool numeric_to_rack_list_transition = false;
+    bool rf_change = false;
    for (auto&& [dc, opt] : options) {
        locator::replication_factor_data rf(opt);

@@ -150,6 +166,7 @@ static locator::replication_strategy_config_options prepare_options(
            old_rf = locator::replication_factor_data(i->second);
        }

+        rf_change = rf_change || (old_rf && old_rf->count() != rf.count()) || (!old_rf && rf.count() != 0);
        if (!rf.is_rack_based()) {
            if (old_rf && old_rf->is_rack_based() && rf.count() != 0) {
                if (old_rf->count() != rf.count()) {
@@ -175,12 +192,11 @@ static locator::replication_strategy_config_options prepare_options(
            throw exceptions::configuration_exception(fmt::format(
                    "Rack list for '{}' contains duplicate entries", dc));
        }
-        if (old_rf && !old_rf->is_rack_based() && old_rf->count() != 0) {
-            // FIXME: Allow this if replicas already conform to the given rack list.
-            // FIXME: Implement automatic colocation to allow transition to rack list.
-            throw exceptions::configuration_exception(fmt::format(
-                    "Cannot change replication factor from numeric to rack list for '{}'", dc));
-        }
+        numeric_to_rack_list_transition = numeric_to_rack_list_transition || (old_rf && !old_rf->is_rack_based() && old_rf->count() != 0);
+    }
+
+    if (numeric_to_rack_list_transition && rf_change) {
+        throw exceptions::configuration_exception("Cannot change replication factor from numeric to rack list and rf value at the same time");
    }

    if (!rf && options.empty() && old_options.empty()) {
@@ -400,7 +416,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(s
            ? std::optional<unsigned>(0) : std::nullopt;
    auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
    bool uses_tablets = initial_tablets.has_value();
-    bool rack_list_enabled = feat.rack_list_rf;
+    bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
    auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
    return data_dictionary::keyspace_metadata::new_keyspace(ks_name, sc,
            std::move(options), initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
@@ -416,7 +432,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
        throw exceptions::invalid_request_exception("Cannot alter replication strategy vnode/tablets flavor");
    }
    auto sc = get_replication_strategy_class();
-    bool rack_list_enabled = feat.rack_list_rf;
+    bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
    if (sc) {
        options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
    } else {
--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -401,7 +401,8 @@ modification_statement::execute_with_condition(query_processor& qp, service::que
                    type.is_update() ? "update" : "deletion"));
    }

-    auto request = seastar::make_shared<cas_request>(s, std::move(keys));
+    auto request = std::make_unique<cas_request>(s, std::move(keys));
+    auto* request_ptr = request.get();
    // cas_request can be used for batches as well single statements; Here we have just a single
    // modification in the list of CAS commands, since we're handling single-statement execution.
    request->add_row_update(*this, std::move(ranges), std::move(json_cache), options);
@@ -427,9 +428,9 @@ modification_statement::execute_with_condition(query_processor& qp, service::que
        tablet_info = erm->check_locality(token);
    }

-    return qp.proxy().cas(s, std::move(cas_shard), request, request->read_command(qp), request->key(),
+    return qp.proxy().cas(s, std::move(cas_shard), *request_ptr, request->read_command(qp), request->key(),
            {read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()},
-            std::move(cl_for_paxos).assume_value(), cl_for_learn, statement_timeout, cas_timeout).then([this, request, tablet_replicas = std::move(tablet_info->tablet_replicas), token_range = tablet_info->token_range] (bool is_applied) {
+            std::move(cl_for_paxos).assume_value(), cl_for_learn, statement_timeout, cas_timeout).then([this, request = std::move(request), tablet_replicas = std::move(tablet_info->tablet_replicas), token_range = tablet_info->token_range] (bool is_applied) {
        auto result = request->build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied);
        result->add_tablet_info(tablet_replicas, token_range);
        return result;
--- a/cql3/statements/prune_materialized_view_statement.cc
+++ b/cql3/statements/prune_materialized_view_statement.cc
@@ -21,7 +21,7 @@ namespace cql3 {
 namespace statements {

 static future<> delete_ghost_rows(dht::partition_range_vector partition_ranges, std::vector<query::clustering_range> clustering_bounds, view_ptr view,
-        service::storage_proxy& proxy, service::query_state& state, const query_options& options, cql_stats& stats, db::timeout_clock::duration timeout_duration) {
+        service::storage_proxy& proxy, service::query_state& state, const query_options& options, cql_stats& stats, db::timeout_clock::duration timeout_duration, size_t concurrency) {
    auto key_columns = std::ranges::to<std::vector<const column_definition*>>(
        view->all_columns()
        | std::views::filter([] (const column_definition& cdef) { return cdef.is_primary_key(); })
@@ -35,7 +35,7 @@ static future<> delete_ghost_rows(dht::partition_range_vector partition_ranges,
    tracing::trace(state.get_trace_state(), "Deleting ghost rows from partition ranges {}", partition_ranges);

    auto p = service::pager::query_pagers::ghost_row_deleting_pager(schema_ptr(view), selection, state,
-            options, std::move(command), std::move(partition_ranges), stats, proxy, timeout_duration);
+            options, std::move(command), std::move(partition_ranges), stats, proxy, timeout_duration, concurrency);

    int32_t page_size = std::max(options.get_page_size(), 1000);
    auto now = gc_clock::now();
@@ -62,7 +62,8 @@ future<::shared_ptr<cql_transport::messages::result_message>> prune_materialized
    auto timeout_duration = get_timeout(state.get_client_state(), options);
    dht::partition_range_vector key_ranges = _restrictions->get_partition_key_ranges(options);
    std::vector<query::clustering_range> clustering_bounds = _restrictions->get_clustering_bounds(options);
-    return delete_ghost_rows(std::move(key_ranges), std::move(clustering_bounds), view_ptr(_schema), qp.proxy(), state, options, _stats, timeout_duration).then([] {
+    size_t concurrency = _attrs->is_concurrency_set() ? _attrs->get_concurrency(options).value() : 1;
+    return delete_ghost_rows(std::move(key_ranges), std::move(clustering_bounds), view_ptr(_schema), qp.proxy(), state, options, _stats, timeout_duration, concurrency).then([] {
        return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>(::make_shared<cql_transport::messages::result_message::void_message>());
    });
 }
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -1976,7 +1976,7 @@ mutation_fragments_select_statement::do_execute(query_processor& qp, service::qu
    if (it == indexes.end()) {
        throw exceptions::invalid_request_exception("ANN ordering by vector requires the column to be indexed using 'vector_index'");
    }
-    if (index_opt || parameters->allow_filtering() || restrictions->need_filtering() || check_needs_allow_filtering_anyway(*restrictions)) {
+    if (index_opt || parameters->allow_filtering() || !(restrictions->is_empty()) || check_needs_allow_filtering_anyway(*restrictions)) {
        throw exceptions::invalid_request_exception("ANN ordering by vector does not support filtering");
    }
    index_opt = *it;
@@ -2031,14 +2031,16 @@ future<shared_ptr<cql_transport::messages::result_message>> vector_indexed_table
                    fmt::format("Use of ANN OF in an ORDER BY clause requires a LIMIT that is not greater than {}. LIMIT was {}", max_ann_query_limit, limit)));
        }

-        auto as = abort_source();
-        auto pkeys = co_await qp.vector_store_client().ann(_schema->ks_name(), _index.metadata().name(), _schema, get_ann_ordering_vector(options), limit, as);
+        auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
+        auto aoe = abort_on_expiry(timeout);
+        auto pkeys = co_await qp.vector_store_client().ann(
+                _schema->ks_name(), _index.metadata().name(), _schema, get_ann_ordering_vector(options), limit, aoe.abort_source());
        if (!pkeys.has_value()) {
            co_await coroutine::return_exception(
                    exceptions::invalid_request_exception(std::visit(vector_search::vector_store_client::ann_error_visitor{}, pkeys.error())));
        }

-        co_return co_await query_base_table(qp, state, options, pkeys.value());
+        co_return co_await query_base_table(qp, state, options, pkeys.value(), timeout);
    });

    auto page_size = options.get_page_size();
@@ -2073,10 +2075,10 @@ std::vector<float> vector_indexed_table_select_statement::get_ann_ordering_vecto
    return util::to_vector<float>(values);
 }

-future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::query_base_table(
-        query_processor& qp, service::query_state& state, const query_options& options, const std::vector<vector_search::primary_key>& pkeys) const {
+future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::query_base_table(query_processor& qp,
+        service::query_state& state, const query_options& options, const std::vector<vector_search::primary_key>& pkeys,
+        lowres_clock::time_point timeout) const {
    auto command = prepare_command_for_base_query(qp, state, options);
-    auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);

    // For tables without clustering columns, we can optimize by querying
    // partition ranges instead of individual primary keys, since the
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -389,8 +389,8 @@ private:

    std::vector<float> get_ann_ordering_vector(const query_options& options) const;

-    future<::shared_ptr<cql_transport::messages::result_message>> query_base_table(
-            query_processor& qp, service::query_state& state, const query_options& options, const std::vector<vector_search::primary_key>& pkeys) const;
+    future<::shared_ptr<cql_transport::messages::result_message>> query_base_table(query_processor& qp, service::query_state& state,
+            const query_options& options, const std::vector<vector_search::primary_key>& pkeys, lowres_clock::time_point timeout) const;

    future<::shared_ptr<cql_transport::messages::result_message>> query_base_table(query_processor& qp, service::query_state& state,
            const query_options& options, lw_shared_ptr<query::read_command> command, lowres_clock::time_point timeout,
--- a/data_dictionary/CMakeLists.txt
+++ b/data_dictionary/CMakeLists.txt
@@ -12,5 +12,8 @@ target_link_libraries(data_dictionary
    Seastar::seastar
    xxHash::xxhash)

+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(data_dictionary REUSE_FROM scylla-precompiled-header)
+endif()
 check_headers(check-headers data_dictionary
  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
--- a/data_dictionary/data_dictionary.cc
+++ b/data_dictionary/data_dictionary.cc
@@ -42,6 +42,11 @@ table::get_index_manager() const {
    return _ops->get_index_manager(*this);
 }

+db_clock::time_point
+table::get_truncation_time() const {
+    return _ops->get_truncation_time(*this); // forwarded to the implementation object behind _ops
+}
+
 lw_shared_ptr<keyspace_metadata>
 keyspace::metadata() const {
    return _ops->get_keyspace_metadata(*this);
--- a/data_dictionary/data_dictionary.hh
+++ b/data_dictionary/data_dictionary.hh
@@ -77,6 +77,7 @@ public:
    schema_ptr schema() const;
    const std::vector<view_ptr>& views() const;
    const secondary_index::secondary_index_manager& get_index_manager() const;
+    db_clock::time_point get_truncation_time() const;
 };

 class keyspace {
--- a/data_dictionary/impl.hh
+++ b/data_dictionary/impl.hh
@@ -27,6 +27,7 @@ public:
    virtual std::optional<table> try_find_table(database db, table_id id) const = 0;
    virtual const secondary_index::secondary_index_manager& get_index_manager(table t) const = 0;
    virtual schema_ptr get_table_schema(table t) const = 0;
+    virtual db_clock::time_point get_truncation_time(table t) const = 0;
    virtual lw_shared_ptr<keyspace_metadata> get_keyspace_metadata(keyspace ks) const = 0;
    virtual bool is_internal(keyspace ks) const = 0;
    virtual const locator::abstract_replication_strategy& get_replication_strategy(keyspace ks) const = 0;
--- a/db/CMakeLists.txt
+++ b/db/CMakeLists.txt
@@ -10,7 +10,6 @@ target_sources(db
    schema_applier.cc
    schema_tables.cc
    cql_type_parser.cc
-    legacy_schema_migrator.cc
    commitlog/commitlog.cc
    commitlog/commitlog_replayer.cc
    commitlog/commitlog_entry.cc
@@ -60,5 +59,8 @@ target_link_libraries(db
    data_dictionary
    cql3)

+if (Scylla_USE_PRECOMPILED_HEADER_USE)
+  target_precompile_headers(db REUSE_FROM scylla-precompiled-header)
+endif()
 check_headers(check-headers db
  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
--- a/db/batchlog.hh
+++ b/db/batchlog.hh
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+#pragma once
+
+#include "mutation/mutation.hh"
+#include "utils/UUID.hh"
+
+namespace db {
+
+// Builds the mutation that stores the serialized `mutations` as an initial-stage batchlog entry keyed by version, `now` and `id`.
+mutation get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, int32_t version, db_clock::time_point now, const utils::UUID& id);
+// Builds the mutation that deletes the initial-stage batchlog entry keyed by version, `now` and `id`.
+mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clock::time_point now, const utils::UUID& id);
+}
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -10,6 +10,7 @@

 #include <chrono>
 #include <exception>
+#include <ranges>
 #include <seastar/core/future-util.hh>
 #include <seastar/core/do_with.hh>
 #include <seastar/core/semaphore.hh>
@@ -18,12 +19,14 @@
 #include <seastar/core/sleep.hh>

 #include "batchlog_manager.hh"
+#include "batchlog.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "mutation/canonical_mutation.hh"
 #include "service/storage_proxy.hh"
 #include "system_keyspace.hh"
 #include "utils/rate_limiter.hh"
 #include "utils/log.hh"
+#include "utils/murmur_hash.hh"
 #include "db_clock.hh"
 #include "unimplemented.hh"
 #include "idl/frozen_schema.dist.hh"
@@ -33,17 +36,94 @@
 #include "cql3/untyped_result_set.hh"
 #include "service_permit.hh"
 #include "cql3/query_processor.hh"
-#include "replica/database.hh"

 static logging::logger blogger("batchlog_manager");

+namespace db {
+
+// Yields 256 batchlog shards. Even on the largest nodes we currently run on,
+// this should be enough to give every core a batchlog partition.
+static constexpr unsigned batchlog_shard_bits = 8;
+
+int32_t batchlog_shard_of(db_clock::time_point written_at) {
+    // Hash the raw tick count of written_at and keep only the low batchlog_shard_bits bits of the folded digest.
+    const int64_t ticks = written_at.time_since_epoch().count();
+    std::array<uint64_t, 2> digest;
+    utils::murmur_hash::hash3_x64_128(bytes_view(reinterpret_cast<const signed char*>(&ticks), sizeof(ticks)), 0, digest);
+    return (digest[0] ^ digest[1]) & ((1ULL << batchlog_shard_bits) - 1);
+}
+
+std::pair<partition_key, clustering_key>
+get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, int32_t batchlog_shard, db_clock::time_point written_at, std::optional<utils::UUID> id) {
+    // Clustering key components: written_at first, then id when one was supplied.
+    std::vector<bytes> ck_parts;
+    ck_parts.reserve(2);
+    ck_parts.emplace_back(serialized(written_at));
+    if (id.has_value()) {
+        ck_parts.emplace_back(serialized(id.value()));
+    }
+    // Partition key components: version, stage (as int8_t) and batchlog shard.
+    return {partition_key::from_exploded(schema, {serialized(version), serialized(int8_t(stage)), serialized(batchlog_shard)}),
+            clustering_key::from_exploded(schema, ck_parts)};
+}
+
+std::pair<partition_key, clustering_key>
+get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, db_clock::time_point written_at, std::optional<utils::UUID> id) {
+    const auto shard = batchlog_shard_of(written_at);
+    return get_batchlog_key(schema, version, stage, shard, written_at, id);
+}
+
+mutation get_batchlog_mutation_for(schema_ptr schema, managed_bytes data, int32_t version, db::batchlog_stage stage, db_clock::time_point now, const utils::UUID& id) {
+    // Builds a mutation that writes `data` into the "data" column of the batchlog row keyed by
+    // (version, stage, shard of `now`) / (`now`, id), stamped with a freshly generated timestamp.
+    auto [pk, ck] = get_batchlog_key(*schema, version, stage, now, id);
+    const auto write_ts = api::new_timestamp();
+    const auto* data_col = schema->get_column_definition(to_bytes("data"));
+
+    mutation result(schema, pk);
+    // Avoid going through data_value and therefore `bytes`, as it can be large (#24809).
+    result.set_cell(ck, *data_col, atomic_cell::make_live(*data_col->type, write_ts, std::move(data)));
+    return result;
+}
+
+mutation get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, int32_t version, db::batchlog_stage stage, db_clock::time_point now, const utils::UUID& id) {
+    // Serialize the batch as back-to-back canonical_mutation records, converting one mutation at a
+    // time so only a single canonical_mutation is alive instead of a second copy of the whole batch.
+    bytes_ostream out;
+    for (const auto& m : mutations) {
+        const canonical_mutation cm(m);
+        ser::serialize(out, cm);
+    }
+
+    return get_batchlog_mutation_for(std::move(schema), std::move(out).to_managed_bytes(), version, stage, now, id);
+}
+
+// Convenience overload: stores the batch in the initial stage.
+mutation get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, int32_t version, db_clock::time_point now, const utils::UUID& id) {
+    return get_batchlog_mutation_for(std::move(schema), mutations, version, batchlog_stage::initial, now, id);
+}
+
+mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db::batchlog_stage stage, db_clock::time_point now, const utils::UUID& id) {
+    // Deletes the batchlog entry keyed by (version, stage, shard of `now`) / (`now`, id) with a fresh timestamp.
+    auto [pk, ck] = get_batchlog_key(*schema, version, stage, now, id);
+    mutation deletion(schema, pk);
+    deletion.partition().apply_delete(*schema, ck, tombstone(api::new_timestamp(), gc_clock::now()));
+    return deletion;
+}
+
+mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clock::time_point now, const utils::UUID& id) {
+    return get_batchlog_delete_mutation(std::move(schema), version, batchlog_stage::initial, now, id);
+}
+
+} // namespace db
+
 const std::chrono::seconds db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;

 db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
        : _qp(qp)
        , _sys_ks(sys_ks)
-        , _write_request_timeout(std::chrono::duration_cast<db_clock::duration>(config.write_request_timeout))
+        , _replay_timeout(config.replay_timeout)
        , _replay_rate(config.replay_rate)
        , _delay(config.delay)
        , _replay_cleanup_after_replays(config.replay_cleanup_after_replays)
@@ -152,18 +232,75 @@ future<> db::batchlog_manager::stop() {
 }

 future<size_t> db::batchlog_manager::count_all_batches() const {
-    sstring query = format("SELECT count(*) FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG);
+    sstring query = format("SELECT count(*) FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
    return _qp.execute_internal(query, cql3::query_processor::cache_internal::yes).then([](::shared_ptr<cql3::untyped_result_set> rs) {
       return size_t(rs->one().get_as<int64_t>("count"));
    });
 }

-db_clock::duration db::batchlog_manager::get_batch_log_timeout() const {
-    // enough time for the actual write + BM removal mutation
-    return _write_request_timeout * 2;
+// Copies v1 batchlog rows of the current messaging version into the v2 table (stage failed_replay) and
+// deletes them from v1. If the scan throws, a warning is logged and the next call tries again.
+future<> db::batchlog_manager::maybe_migrate_v1_to_v2() {
+    if (_migration_done) {
+        co_return;
+    }
+    // A coroutine holding the gate, not a coroutine lambda handed to with_gate(): that lambda would be
+    // a temporary destroyed on return, leaving its captured `this` dangling after the first suspension.
+    const auto gate_holder = _gate.hold();
+    blogger.info("Migrating batchlog entries from v1 -> v2");
+    auto schema_v1 = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
+    auto schema_v2 = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
+
+    auto batch = [this, schema_v1, schema_v2] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+        // check version of serialization format
+        if (!row.has("version")) {
+            blogger.warn("Not migrating logged batch because of unknown version");
+            co_return stop_iteration::no;
+        }
+
+        auto version = row.get_as<int32_t>("version");
+        if (version != netw::messaging_service::current_version) {
+            blogger.warn("Not migrating logged batch because of incorrect version");
+            co_return stop_iteration::no;
+        }
+
+        auto id = row.get_as<utils::UUID>("id");
+        auto written_at = row.get_as<db_clock::time_point>("written_at");
+        auto data = row.get_blob_fragmented("data");
+        auto& sp = _qp.proxy();
+        utils::get_local_injector().inject("batchlog_manager_fail_migration", [] { throw std::runtime_error("Error injection: failing batchlog migration"); });
+
+        auto migrate_mut = get_batchlog_mutation_for(schema_v2, std::move(data), version, batchlog_stage::failed_replay, written_at, id);
+        co_await sp.mutate_locally(migrate_mut, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+
+        mutation delete_mut(schema_v1, partition_key::from_single_value(*schema_v1, serialized(id)));
+        delete_mut.partition().apply_delete(*schema_v1, clustering_key_prefix::make_empty(), tombstone(api::new_timestamp(), gc_clock::now()));
+        co_await sp.mutate_locally(delete_mut, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+
+        co_return stop_iteration::no;
+    };
+    try {
+        co_await _qp.query_internal(
+                format("SELECT * FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
+                db::consistency_level::ONE,
+                {},
+                page_size,
+                std::move(batch));
+    } catch (...) {
+        blogger.warn("Batchlog v1 to v2 migration failed: {}; will retry", std::current_exception());
+        co_return;
+    }
+
+    co_await container().invoke_on_all([] (auto& bm) {
+        bm._migration_done = true;
+    });
+
+    blogger.info("Done migrating batchlog entries from v1 -> v2");
 }

 future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
+    co_await maybe_migrate_v1_to_v2();
+
    typedef db_clock::rep clock_type;

    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
@@ -172,21 +309,26 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
    auto limiter = make_lw_shared<utils::rate_limiter>(throttle);

-    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
-    auto delete_batch = [this, schema = std::move(schema)] (utils::UUID id) {
-        auto key = partition_key::from_singular(*schema, id);
-        mutation m(schema, key);
-        auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
-        m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
-        return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
+
+    struct replay_stats {
+        std::optional<db_clock::time_point> min_too_fresh;
+        bool need_cleanup = false;
    };

-    auto batch = [this, limiter, delete_batch = std::move(delete_batch), &all_replayed](const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
+
+    // Use a stable `now` across all batches, so skip/replay decisions are the
+    // same across a whole prefix of written_at (across all ids).
+    const auto now = db_clock::now();
+
+    auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+        const auto stage = static_cast<batchlog_stage>(row.get_as<int8_t>("stage"));
+        const auto batch_shard = row.get_as<int32_t>("shard");
        auto written_at = row.get_as<db_clock::time_point>("written_at");
        auto id = row.get_as<utils::UUID>("id");
        // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
-        auto now = db_clock::now();
-        auto timeout = get_batch_log_timeout();
+        auto timeout = _replay_timeout;

        if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
            blogger.debug("Skipping batch replay due to skip_batch_replay injection");
@@ -194,52 +336,48 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
            co_return stop_iteration::no;
        }

-        // check version of serialization format
-        if (!row.has("version")) {
-            blogger.warn("Skipping logged batch because of unknown version");
-            co_await delete_batch(id);
-            co_return stop_iteration::no;
-        }
-
-        auto version = row.get_as<int32_t>("version");
-        if (version != netw::messaging_service::current_version) {
-            blogger.warn("Skipping logged batch because of incorrect version {}; current version = {}", version, netw::messaging_service::current_version);
-            co_await delete_batch(id);
-            co_return stop_iteration::no;
-        }
-
        auto data = row.get_blob_unfragmented("data");

-        blogger.debug("Replaying batch {}", id);
+        blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
+
+        utils::chunked_vector<mutation> mutations;
+        bool send_failed = false;
+
+        auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;

        try {
-            auto fms = make_lw_shared<std::deque<canonical_mutation>>();
+            utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
            auto in = ser::as_input_stream(data);
            while (in.size()) {
-                fms->emplace_back(ser::deserialize(in, std::type_identity<canonical_mutation>()));
-                schema_ptr s = _qp.db().find_schema(fms->back().column_family_id());
-                timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
+                auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
+                const auto tbl = _qp.db().try_find_table(fm.column_family_id());
+                if (!tbl) {
+                    continue;
+                }
+                if (written_at <= tbl->get_truncation_time()) {
+                    continue;
+                }
+                schema_ptr s = tbl->schema();
+                if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
+                    timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
+                }
+                fms.emplace_back(std::move(fm), std::move(s));
            }

            if (now < written_at + timeout) {
                blogger.debug("Skipping replay of {}, too fresh", id);
+
+                shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
+
                co_return stop_iteration::no;
            }

            auto size = data.size();

-            auto mutations = co_await map_reduce(*fms, [this, written_at] (canonical_mutation& fm) {
-                const auto& cf = _qp.proxy().local_db().find_column_family(fm.column_family_id());
-                return make_ready_future<canonical_mutation*>(written_at > cf.get_truncation_time() ? &fm : nullptr);
-            },
-            utils::chunked_vector<mutation>(),
-            [this] (utils::chunked_vector<mutation> mutations, canonical_mutation* fm) {
-                if (fm) {
-                    schema_ptr s = _qp.db().find_schema(fm->column_family_id());
-                    mutations.emplace_back(fm->to_mutation(s));
-                }
-                return mutations;
-            });
+            for (const auto& [fm, s] : fms) {
+                mutations.emplace_back(fm.to_mutation(s));
+                co_await maybe_yield();
+            }

            if (!mutations.empty()) {
                const auto ttl = [written_at]() -> clock_type {
@@ -265,7 +403,11 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
                    co_await limiter->reserve(size);
                    _stats.write_attempts += mutations.size();
                    auto timeout = db::timeout_clock::now() + write_timeout;
-                    co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
+                    if (cleanup) {
+                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
+                    } else {
+                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
+                    }
                }
            }
        } catch (data_dictionary::no_such_keyspace& ex) {
@@ -279,31 +421,80 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
            // Do _not_ remove the batch, assuning we got a node write error.
            // Since we don't have hints (which origin is satisfied with),
            // we have to resort to keeping this batch to next lap.
-            co_return stop_iteration::no;
+            if (!cleanup || stage == batchlog_stage::failed_replay) {
+                co_return stop_iteration::no;
+            }
+            send_failed = true;
        }
+
+        auto& sp = _qp.proxy();
+
+        if (send_failed) {
+            blogger.debug("Moving batch {} to stage failed_replay", id);
+            auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, batchlog_stage::failed_replay, written_at, id);
+            co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+        }
+
        // delete batch
-        co_await delete_batch(id);
+        auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
+        co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+
+        shard_written_at.need_cleanup = true;
+
        co_return stop_iteration::no;
    };

-    co_await with_gate(_gate, [this, cleanup, batch = std::move(batch)] () mutable -> future<> {
-        blogger.debug("Started replayAllFailedBatches (cpu {})", this_shard_id());
+    co_await with_gate(_gate, [this, cleanup, &all_replayed, batch = std::move(batch), now, &replay_stats_per_shard] () mutable -> future<> {
+        blogger.debug("Started replayAllFailedBatches with cleanup: {}", cleanup);
        co_await utils::get_local_injector().inject("add_delay_to_batch_replay", std::chrono::milliseconds(1000));
-        co_await _qp.query_internal(
-                format("SELECT id, data, written_at, version FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
-                db::consistency_level::ONE,
-                {},
-                page_size,
-                std::move(batch)).then([this, cleanup] {
-            if (cleanup == post_replay_cleanup::no) {
-                return make_ready_future<>();
+
+        auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
+
+        co_await coroutine::parallel_for_each(std::views::iota(0, 16), [&] (int32_t chunk) -> future<> {
+            const int32_t batchlog_chunk_base = chunk * 16;
+            for (int32_t i = 0; i < 16; ++i) {
+                int32_t batchlog_shard = batchlog_chunk_base + i;
+
+                co_await _qp.query_internal(
+                        format("SELECT * FROM {}.{} WHERE version = ? AND stage = ? AND shard = ? BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG_V2),
+                        db::consistency_level::ONE,
+                        {data_value(netw::messaging_service::current_version), data_value(int8_t(batchlog_stage::failed_replay)), data_value(batchlog_shard)},
+                        page_size,
+                        batch);
+
+                co_await _qp.query_internal(
+                        format("SELECT * FROM {}.{} WHERE version = ? AND stage = ? AND shard = ? BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG_V2),
+                        db::consistency_level::ONE,
+                        {data_value(netw::messaging_service::current_version), data_value(int8_t(batchlog_stage::initial)), data_value(batchlog_shard)},
+                        page_size,
+                        batch);
+
+                if (cleanup != post_replay_cleanup::yes) {
+                    continue;
+                }
+
+                auto it = replay_stats_per_shard.find(batchlog_shard);
+                if (it == replay_stats_per_shard.end() || !it->second.need_cleanup) {
+                    // Nothing was replayed on this batchlog shard, nothing to cleanup.
+                    continue;
+                }
+
+                const auto write_time = it->second.min_too_fresh.value_or(now - _replay_timeout);
+                const auto end_weight  = it->second.min_too_fresh ? bound_weight::before_all_prefixed : bound_weight::after_all_prefixed;
+                auto [key, ckey] = get_batchlog_key(*schema, netw::messaging_service::current_version, batchlog_stage::initial, batchlog_shard, write_time, {});
+                auto end_pos = position_in_partition(partition_region::clustered, end_weight, std::move(ckey));
+
+                range_tombstone rt(position_in_partition::before_all_clustered_rows(), std::move(end_pos), tombstone(api::new_timestamp(), gc_clock::now()));
+
+                blogger.trace("Clean up batchlog shard {} with range tombstone {}", batchlog_shard, rt);
+
+                mutation m(schema, key);
+                m.partition().apply_row_tombstone(*schema, std::move(rt));
+                co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
            }
-            // Replaying batches could have generated tombstones, flush to disk,
-            // where they can be compacted away.
-            return replica::database::flush_table_on_all_shards(_qp.proxy().get_db(), system_keyspace::NAME, system_keyspace::BATCHLOG);
-        }).then([] {
-            blogger.debug("Finished replayAllFailedBatches");
        });
+
+        blogger.debug("Finished replayAllFailedBatches with all_replayed: {}", all_replayed);
    });

    co_return all_replayed;
--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -34,12 +34,17 @@ class system_keyspace;
 using all_batches_replayed = bool_class<struct all_batches_replayed_tag>;

 struct batchlog_manager_config {
-    std::chrono::duration<double> write_request_timeout;
+    db_clock::duration replay_timeout;
    uint64_t replay_rate = std::numeric_limits<uint64_t>::max();
    std::chrono::milliseconds delay = std::chrono::milliseconds(0);
    unsigned replay_cleanup_after_replays;
 };

+enum class batchlog_stage : int8_t { // Stage of a batchlog entry; replay binds it to the `stage` query column as an int8_t.
+    initial,       // Replay queries these entries after the failed_replay entries of the same batchlog shard.
+    failed_replay  // Replay rewrites a batch under this stage when sending it fails during a cleanup pass.
+};
+
 class batchlog_manager : public peering_sharded_service<batchlog_manager> {
 public:
    using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;
@@ -59,7 +64,7 @@ private:

    cql3::query_processor& _qp;
    db::system_keyspace& _sys_ks;
-    db_clock::duration _write_request_timeout;
+    db_clock::duration _replay_timeout;
    uint64_t _replay_rate;
    std::chrono::milliseconds _delay;
    unsigned _replay_cleanup_after_replays = 100;
@@ -71,6 +76,14 @@ private:

    gc_clock::time_point _last_replay;

+    // Was the v1 -> v2 migration already done since last restart?
+    // The migration is attempted once after each restart. This is redundant but
+    // keeps things simple. Once no upgrade path exists from a ScyllaDB version
+    // which can still produce v1 entries, this migration code can be removed.
+    bool _migration_done = false;
+
+    future<> maybe_migrate_v1_to_v2();
+
    future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
 public:
    // Takes a QP, not a distributes. Because this object is supposed
@@ -85,10 +98,13 @@ public:
    future<all_batches_replayed> do_batch_log_replay(post_replay_cleanup cleanup);

    future<size_t> count_all_batches() const;
-    db_clock::duration get_batch_log_timeout() const;
    gc_clock::time_point get_last_replay() const {
        return _last_replay;
    }
+
+    const stats& stats() const {
+        return _stats;
+    }
 private:
    future<> batchlog_replay_loop();
 };
--- a/Show More
+++ b/Show More