Simplify ungzip implementation per review feedback

- Remove manual gzip header parsing - libdeflate handles all format details
- Rename linearize_chunked_content to build_input_buffer and free chunks as we copy
- Add output chunking to split large decompressed data into 1MB chunks
- Add comment explaining libdeflate's whole-buffer requirement
- Use better initial size heuristic based on compression ratio

Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
Add edge case check for length limit in ungzip
2025-11-19 12:47:02 +00:00 · 2025-11-19 11:50:31 +00:00 · 2025-11-19 11:48:35 +00:00 · 2025-11-19 11:46:29 +00:00 · 2025-11-19 11:32:38 +00:00 · 2025-11-19 05:54:12 +01:00
599 changed files with 25682 additions and 8507 deletions
--- a/.github/scripts/auto-backport.py
+++ b/.github/scripts/auto-backport.py
@@ -142,20 +142,31 @@ def backport(repo, pr, version, commits, backport_base_branch, is_collaborator):


 def with_github_keyword_prefix(repo, pr):
-    pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
-    match = re.findall(pattern, pr.body, re.IGNORECASE)
-    if not match:
-        for commit in pr.get_commits():
-            match = re.findall(pattern, commit.commit.message, re.IGNORECASE)
-            if match:
-                print(f'{pr.number} has a valid close reference in commit message {commit.sha}')
-                break
-    if not match:
-        print(f'No valid close reference for {pr.number}')
-        return False
-    else:
+    """Return True if the PR body or a commit message has a fix/fixes/fixed reference to a GitHub or JIRA issue."""
+    # GitHub issue pattern: #123, scylladb/scylladb#123, or full GitHub URLs
+    github_pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
+    # JIRA issue pattern: PKG-92 or https://scylladb.atlassian.net/browse/PKG-92
+    jira_pattern = r"(?:fix(?:|es|ed))\s*:?\s*(?:(?:https://scylladb\.atlassian\.net/browse/)?([A-Z]+-\d+))"
+
+    # An empty PR description is reported as None, which re.findall() rejects
+    body = pr.body or ""
+    # Check PR body for GitHub issues
+    github_match = re.findall(github_pattern, body, re.IGNORECASE)
+    # Check PR body for JIRA issues
+    jira_match = re.findall(jira_pattern, body, re.IGNORECASE)
+
+    if github_match or jira_match:
        return True

+    for commit in pr.get_commits():
+        github_match = re.findall(github_pattern, commit.commit.message, re.IGNORECASE)
+        jira_match = re.findall(jira_pattern, commit.commit.message, re.IGNORECASE)
+        if github_match or jira_match:
+            print(f'{pr.number} has a valid close reference in commit message {commit.sha}')
+            return True
+
+    print(f'No valid close reference for {pr.number}')
+    return False

 def main():
    args = parse_args()
--- a/.github/workflows/trigger_ci.yaml
+++ b/.github/workflows/trigger_ci.yaml
@@ -0,0 +1,242 @@
+name: Trigger next gating
+
+on:
+  pull_request_target:
+    types: [opened, reopened, synchronize]
+  issue_comment:
+    types: [created]
+    
+jobs:
+  trigger-ci:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Dump GitHub context
+        env:
+          GITHUB_CONTEXT: ${{ toJson(github) }}
+        run: echo "$GITHUB_CONTEXT"
+      - name: Checkout PR code
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0  # Needed to access full history
+          ref: ${{ github.event.pull_request.head.ref }}
+
+      - name: Fetch before commit if needed
+        run: |
+          if ! git cat-file -e ${{ github.event.before }} 2>/dev/null; then
+            echo "Fetching before commit ${{ github.event.before }}"
+            git fetch --depth=1 origin ${{ github.event.before }}
+          fi
+
+      - name: Compare commits for file changes
+        if: github.event.action == 'synchronize'  # github.action is the running action/step name, not the event type
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          echo "Base: ${{ github.event.before }}"
+          echo "Head: ${{ github.event.after }}"
+          # %T prints the tree hash of each commit; the values are exported for the next step to compare
+          TREE_BEFORE=$(git show -s --format=%T ${{ github.event.before }})
+          TREE_AFTER=$(git show -s --format=%T ${{ github.event.after }})
+
+          echo "TREE_BEFORE=$TREE_BEFORE" >> $GITHUB_ENV
+          echo "TREE_AFTER=$TREE_AFTER" >> $GITHUB_ENV
+
+      - name: Check if last push has file changes
+        run: |
+          if [[ "${{ env.TREE_BEFORE }}" == "${{ env.TREE_AFTER }}" ]]; then
+            echo "No file changes detected in the last push, only commit message edit."
+            echo "has_file_changes=false" >> $GITHUB_ENV
+          else
+            echo "File changes detected in the last push."
+            echo "has_file_changes=true" >> $GITHUB_ENV
+          fi
+
+      - name: Rule 1 - Check PR draft or conflict status
+        run: |
+          # Check if PR is in draft mode
+          IS_DRAFT="${{ github.event.pull_request.draft }}"
+
+          # Check if PR has 'conflict' label
+          HAS_CONFLICT_LABEL="false"
+          LABELS='${{ toJson(github.event.pull_request.labels) }}'
+          if echo "$LABELS" | jq -r '.[].name' | grep -q "^conflict$"; then
+            HAS_CONFLICT_LABEL="true"
+          fi
+
+          # Keep the result in a shell variable: GITHUB_ENV values are visible only to later steps
+          if [[ "$IS_DRAFT" == "true" || "$HAS_CONFLICT_LABEL" == "true" ]]; then
+            draft_or_conflict="true"
+            echo "✅ Rule 1: PR is in draft mode or has conflict label - setting draft_or_conflict=true"
+          else
+            draft_or_conflict="false"
+            echo "✅ Rule 1: PR is ready and has no conflict label - setting draft_or_conflict=false"
+          fi
+          echo "draft_or_conflict=$draft_or_conflict" >> $GITHUB_ENV
+          echo "Draft status: $IS_DRAFT"
+          echo "Has conflict label: $HAS_CONFLICT_LABEL"
+          echo "Result: draft_or_conflict = $draft_or_conflict"
+
+      - name: Rule 2 - Check labels
+        run: |
+          # Check if PR has P0 or P1 labels
+          HAS_P0_P1_LABEL="false"
+          LABELS='${{ toJson(github.event.pull_request.labels) }}'
+          if echo "$LABELS" | jq -r '.[].name' | grep -E "^(P0|P1)$" > /dev/null; then
+            HAS_P0_P1_LABEL="true"
+          fi
+
+          # Check if PR already has force_on_cloud label (default "false" so the comparisons below can match)
+          HAS_FORCE_ON_CLOUD_LABEL="false"
+          if echo "$LABELS" | jq -r '.[].name' | grep -q "^force_on_cloud$"; then
+            HAS_FORCE_ON_CLOUD_LABEL="true"
+          fi
+          echo "HAS_FORCE_ON_CLOUD_LABEL=$HAS_FORCE_ON_CLOUD_LABEL" >> $GITHUB_ENV
+
+          echo "Has P0/P1 label: $HAS_P0_P1_LABEL"
+          echo "Has force_on_cloud label: $HAS_FORCE_ON_CLOUD_LABEL"
+
+          # Add force_on_cloud label if PR has P0/P1 and doesn't already have force_on_cloud
+          if [[ "$HAS_P0_P1_LABEL" == "true" && "$HAS_FORCE_ON_CLOUD_LABEL" == "false" ]]; then
+            echo "✅ Rule 2: PR has P0 or P1 label - adding force_on_cloud label"
+            curl -X POST \
+              -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+              -H "Accept: application/vnd.github.v3+json" \
+              "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/labels" \
+              -d '{"labels":["force_on_cloud"]}'
+          elif [[ "$HAS_P0_P1_LABEL" == "true" && "$HAS_FORCE_ON_CLOUD_LABEL" == "true" ]]; then
+            echo "✅ Rule 2: PR has P0 or P1 label and already has force_on_cloud label - no action needed"
+          else
+            echo "✅ Rule 2: PR does not have P0 or P1 label - no force_on_cloud label needed"
+          fi
+
+          SKIP_UNIT_TEST_CUSTOM="false"
+          if echo "$LABELS" | jq -r '.[].name' | grep -q "^ci/skip_unit-tests_custom$"; then
+            SKIP_UNIT_TEST_CUSTOM="true"
+          fi
+          echo "SKIP_UNIT_TEST_CUSTOM=$SKIP_UNIT_TEST_CUSTOM" >> $GITHUB_ENV
+
+      - name: Rule 3 - Analyze changed files and set build requirements
+        run: |
+          # Get list of changed files
+          CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }})
+          echo "Changed files:"
+          echo "$CHANGED_FILES"
+          echo ""
+
+          # Initialize all requirements to false
+          REQUIRE_BUILD="false"
+          REQUIRE_DTEST="false"
+          REQUIRE_UNITTEST="false"
+          REQUIRE_ARTIFACTS="false"
+          REQUIRE_SCYLLA_GDB="false"
+
+          # Check each file against patterns
+          while IFS= read -r file; do
+            if [[ -n "$file" ]]; then
+              echo "Checking file: $file"
+
+              # Build pattern: ^(?!scripts\/pull_github_pr.sh).*$
+              # Everything except scripts/pull_github_pr.sh
+              if [[ "$file" != "scripts/pull_github_pr.sh" ]]; then
+                REQUIRE_BUILD="true"
+                echo "  ✓ Matches build pattern"
+              fi
+
+              # Dtest pattern: ^(?!test(.py|\/)|dist\/docker\/|dist\/common\/scripts\/).*$
+              # Everything except test.py, files under test/, dist/docker/, dist/common/scripts/
+              if [[ ! "$file" =~ ^test(\.py|/).*$ ]] && [[ ! "$file" =~ ^dist/docker/.*$ ]] && [[ ! "$file" =~ ^dist/common/scripts/.*$ ]]; then
+                REQUIRE_DTEST="true"
+                echo "  ✓ Matches dtest pattern"
+              fi
+
+              # Unittest pattern: ^(?!dist\/docker\/|dist\/common\/scripts).*$
+              # Everything except dist/docker/, dist/common/scripts/
+              if [[ ! "$file" =~ ^dist/docker/.*$ ]] && [[ ! "$file" =~ ^dist/common/scripts.*$ ]]; then
+                REQUIRE_UNITTEST="true"
+                echo "  ✓ Matches unittest pattern"
+              fi
+
+              # Artifacts pattern: ^(?:dist|tools\/toolchain).*$
+              # Files starting with dist or tools/toolchain
+              if [[ "$file" =~ ^dist.*$ ]] || [[ "$file" =~ ^tools/toolchain.*$ ]]; then
+                REQUIRE_ARTIFACTS="true"
+                echo "  ✓ Matches artifacts pattern"
+              fi
+
+              # Scylla GDB pattern: ^(scylla-gdb.py).*$
+              # Files starting with scylla-gdb.py
+              if [[ "$file" =~ ^scylla-gdb\.py.*$ ]]; then
+                REQUIRE_SCYLLA_GDB="true"
+                echo "  ✓ Matches scylla_gdb pattern"
+              fi
+            fi
+          done <<< "$CHANGED_FILES"
+
+          # Set environment variables
+          echo "requireBuild=$REQUIRE_BUILD" >> $GITHUB_ENV
+          echo "requireDtest=$REQUIRE_DTEST" >> $GITHUB_ENV
+          echo "requireUnittest=$REQUIRE_UNITTEST" >> $GITHUB_ENV
+          echo "requireArtifacts=$REQUIRE_ARTIFACTS" >> $GITHUB_ENV
+          echo "requireScyllaGdb=$REQUIRE_SCYLLA_GDB" >> $GITHUB_ENV
+
+          echo ""
+          echo "✅ Rule 3: File analysis complete"
+          echo "Build required: $REQUIRE_BUILD"
+          echo "Dtest required: $REQUIRE_DTEST"
+          echo "Unittest required: $REQUIRE_UNITTEST"
+          echo "Artifacts required: $REQUIRE_ARTIFACTS"
+          echo "Scylla GDB required: $REQUIRE_SCYLLA_GDB"
+
+      - name: Determine Jenkins Job Name
+        run: |
+          if [[ "${{ github.ref_name }}" == "next" ]]; then
+            FOLDER_NAME="scylla-master"
+          elif [[ "${{ github.ref_name }}" == "next-enterprise" ]]; then
+            FOLDER_NAME="scylla-enterprise"
+          else
+            VERSION=$(echo "${{ github.ref_name }}" | awk -F'-' '{print $2}')
+            if [[ "$VERSION" =~ ^202[0-4]\.[0-9]+$ ]]; then
+              FOLDER_NAME="enterprise-$VERSION"
+            elif [[ "$VERSION" =~ ^[0-9]+\.[0-9]+$ ]]; then
+              FOLDER_NAME="scylla-$VERSION"
+            fi
+          fi
+          echo "JOB_NAME=${FOLDER_NAME}/job/scylla-ci" >> $GITHUB_ENV
+
+      - name: Trigger Jenkins Job
+        if: env.draft_or_conflict == 'false' && (env.has_file_changes == 'true' || github.event.action == 'opened' || github.event.action == 'reopened')  # ready PR: push with file changes, or PR opened/reopened
+        env:
+          JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
+          JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
+          JENKINS_URL: "https://jenkins.scylladb.com"
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+        run: |
+          PR_NUMBER=${{ github.event.pull_request.number }}  # same field the label API call in Rule 2 uses
+          PR_REPO_NAME=${{ github.event.repository.full_name }}
+          echo "Triggering Jenkins Job: $JOB_NAME"  # requireXxx below are the names Rule 3 wrote to GITHUB_ENV
+          curl -X POST \
+            "$JENKINS_URL/job/$JOB_NAME/buildWithParameters" \
+            --data-urlencode "PR_NUMBER=$PR_NUMBER" \
+            --data-urlencode "RUN_DTEST=$requireDtest" \
+            --data-urlencode "RUN_ONLY_SCYLLA_GDB=$requireScyllaGdb" \
+            --data-urlencode "RUN_UNIT_TEST=$requireUnittest" \
+            --data-urlencode "FORCE_ON_CLOUD=$HAS_FORCE_ON_CLOUD_LABEL" \
+            --data-urlencode "SKIP_UNIT_TEST_CUSTOM=$SKIP_UNIT_TEST_CUSTOM" \
+            --data-urlencode "RUN_ARTIFACT_TESTS=$requireArtifacts" \
+            --fail \
+            --user "$JENKINS_USER:$JENKINS_API_TOKEN" \
+            -i -v
+  trigger-ci-via-comment:
+    if: github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Trigger Scylla-CI Jenkins Job
+        env:
+          JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
+          JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
+          JENKINS_URL: "https://jenkins.scylladb.com"
+        run: |
+          PR_NUMBER=${{ github.event.issue.number }}  # number of the issue/PR the comment was posted on
+          PR_REPO_NAME=${{ github.event.repository.full_name }}  # NOTE(review): JOB_NAME below is only exported to GITHUB_ENV by the trigger-ci job; it looks unset in this job - confirm
+          curl -X POST "$JENKINS_URL/job/$JOB_NAME/buildWithParameters?PR_NUMBER=$PR_NUMBER" \
+          --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,7 +49,7 @@ include(limit_jobs)
 set(CMAKE_CXX_STANDARD "23" CACHE INTERNAL "")
 set(CMAKE_CXX_EXTENSIONS ON CACHE INTERNAL "")
 set(CMAKE_CXX_SCAN_FOR_MODULES OFF CACHE INTERNAL "")
-set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)

 if(is_multi_config)
    find_package(Seastar)
@@ -90,13 +90,13 @@ if(is_multi_config)
    add_dependencies(Seastar::seastar_testing Seastar)
 else()
    set(Seastar_TESTING ON CACHE BOOL "" FORCE)
-    set(Seastar_API_LEVEL 8 CACHE STRING "" FORCE)
+    set(Seastar_API_LEVEL 9 CACHE STRING "" FORCE)
    set(Seastar_DEPRECATED_OSTREAM_FORMATTERS OFF CACHE BOOL "" FORCE)
    set(Seastar_APPS ON CACHE BOOL "" FORCE)
    set(Seastar_EXCLUDE_APPS_FROM_ALL ON CACHE BOOL "" FORCE)
    set(Seastar_EXCLUDE_TESTS_FROM_ALL ON CACHE BOOL "" FORCE)
    set(Seastar_IO_URING ON CACHE BOOL "" FORCE)
-    set(Seastar_SCHEDULING_GROUPS_COUNT 20 CACHE STRING "" FORCE)
+    set(Seastar_SCHEDULING_GROUPS_COUNT 21 CACHE STRING "" FORCE)
    set(Seastar_UNUSED_RESULT_ERROR ON CACHE BOOL "" FORCE)
    add_subdirectory(seastar)
    target_compile_definitions (seastar
@@ -178,7 +178,6 @@ target_sources(scylla-main
    mutation_query.cc
    node_ops/task_manager_module.cc
    partition_slice_builder.cc
-    querier.cc
    query/query.cc
    query_ranges_to_vnodes.cc
    query/query-result-set.cc
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -12,7 +12,7 @@ Please use the [issue tracker](https://github.com/scylladb/scylla/issues/) to re

 ## Contributing code to Scylla

-Before you can contribute code to Scylla for the first time, you should sign the [Contributor License Agreement](https://www.scylladb.com/open-source/contributor-agreement/) and send the signed form cla@scylladb.com. You can then submit your changes as patches to the [scylladb-dev mailing list](https://groups.google.com/forum/#!forum/scylladb-dev) or as a pull request to the [Scylla project on github](https://github.com/scylladb/scylla).
+Before you can contribute code to Scylla for the first time, you should sign the [Contributor License Agreement](https://www.scylladb.com/open-source/contributor-agreement/) and send the signed form to cla@scylladb.com. You can then submit your changes as patches to the [scylladb-dev mailing list](https://groups.google.com/forum/#!forum/scylladb-dev) or as a pull request to the [Scylla project on github](https://github.com/scylladb/scylla).
 If you need help formatting or sending patches, [check out these instructions](https://github.com/scylladb/scylla/wiki/Formatting-and-sending-patches).

 The Scylla C++ source code uses the [Seastar coding style](https://github.com/scylladb/seastar/blob/master/coding-style.md) so please adhere to that in your patches. Note that Scylla code is written with `using namespace seastar`, so should not explicitly add the `seastar::` prefix to Seastar symbols. You will usually not need to add `using namespace seastar` to new source files, because most Scylla header files have `#include "seastarx.hh"`, which does this.
--- a/HACKING.md
+++ b/HACKING.md
@@ -43,7 +43,7 @@ $ ./tools/toolchain/dbuild ninja build/release/scylla
 $ ./tools/toolchain/dbuild ./build/release/scylla --developer-mode 1
 ```

-Note: do not mix environemtns - either perform all your work with dbuild, or natively on the host.
+Note: do not mix environments - either perform all your work with dbuild, or natively on the host.
 Note2: you can get to an interactive shell within dbuild by running it without any parameters:
 ```bash
 $ ./tools/toolchain/dbuild
@@ -91,7 +91,7 @@ You can also specify a single mode. For example
 $ ninja-build release
 ```

-Will build everytihng in release mode. The valid modes are
+Will build everything in release mode. The valid modes are

 * Debug: Enables [AddressSanitizer](https://github.com/google/sanitizers/wiki/AddressSanitizer)
  and other sanity checks. It has no optimizations, which allows for debugging with tools like
@@ -361,7 +361,7 @@ avoid that the gold linker can be told to create an index with

 More info at https://gcc.gnu.org/wiki/DebugFission.

-Both options can be enable by passing `--split-dwarf` to configure.py.
+Both options can be enabled by passing `--split-dwarf` to configure.py.

 Note that distcc is *not* compatible with it, but icecream
 (https://github.com/icecc/icecream) is.
@@ -370,7 +370,7 @@ Note that distcc is *not* compatible with it, but icecream

 Sometimes Scylla development is closely tied with a feature being developed in Seastar. It can be useful to compile Scylla with a particular check-out of Seastar.

-One way to do this it to create a local remote for the Seastar submodule in the Scylla repository:
+One way to do this is to create a local remote for the Seastar submodule in the Scylla repository:

 ```bash
 $ cd $HOME/src/scylla
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ Scylla is fairly fussy about its build environment, requiring very recent
 versions of the C++23 compiler and of many libraries to build. The document
 [HACKING.md](HACKING.md) includes detailed information on building and
 developing Scylla, but to get Scylla building quickly on (almost) any build
-machine, Scylla offers a [frozen toolchain](tools/toolchain/README.md),
+machine, Scylla offers a [frozen toolchain](tools/toolchain/README.md).
 This is a pre-configured Docker image which includes recent versions of all
 the required compilers, libraries and build tools. Using the frozen toolchain
 allows you to avoid changing anything in your build machine to meet Scylla's
--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2025.4.0-dev
+VERSION=2026.1.0-dev

 if test -f version
 then
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -11,7 +11,6 @@
 #include "utils/log.hh"
 #include <string>
 #include <string_view>
-#include "bytes.hh"
 #include "alternator/auth.hh"
 #include <fmt/format.h>
 #include "auth/password_authenticator.hh"
--- a/alternator/controller.cc
+++ b/alternator/controller.cc
@@ -136,6 +136,8 @@ future<> controller::start_server() {
                [this, addr, alternator_port, alternator_https_port, creds = std::move(creds)] (server& server) mutable {
            return server.init(addr, alternator_port, alternator_https_port, creds,
                    _config.alternator_enforce_authorization,
+                    _config.alternator_warn_authorization,
+                    _config.alternator_max_users_query_size_in_trace_output,
                    &_memory_limiter.local().get_semaphore(),
                    _config.max_concurrent_requests_per_shard);
        }).handle_exception([this, addr, alternator_port, alternator_https_port] (std::exception_ptr ep) {
--- a/alternator/error.hh
+++ b/alternator/error.hh
@@ -94,6 +94,9 @@ public:
    static api_error internal(std::string msg) {
        return api_error("InternalServerError", std::move(msg), http::reply::status_type::internal_server_error);
    }
+    static api_error payload_too_large(std::string msg) {
+        return api_error("PayloadTooLarge", std::move(msg), status_type::payload_too_large);
+    }

    // Provide the "std::exception" interface, to make it easier to print this
    // exception in log messages. Note that this function is *not* used to
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -139,6 +139,7 @@ class executor : public peering_sharded_service<executor> {
    db::system_distributed_keyspace& _sdks;
    cdc::metadata& _cdc_metadata;
    utils::updateable_value<bool> _enforce_authorization;
+    utils::updateable_value<bool> _warn_authorization;
    // An smp_service_group to be used for limiting the concurrency when
    // forwarding Alternator request between shards - if necessary for LWT.
    smp_service_group _ssg;
@@ -228,12 +229,15 @@ public:
        const std::optional<attrs_to_get>&,
        uint64_t* = nullptr);

+    // Converts a multi-row selection result to JSON compatible with DynamoDB.
+    // For each row, this method calls item_callback, which takes the size of
+    // the item as the parameter.
    static future<std::vector<rjson::value>> describe_multi_item(schema_ptr schema,
        const query::partition_slice&& slice,
        shared_ptr<cql3::selection::selection> selection,
        foreign_ptr<lw_shared_ptr<query::result>> query_result,
        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get,
-        uint64_t& rcu_half_units);
+        noncopyable_function<void(uint64_t)> item_callback = {});

    static void describe_single_item(const cql3::selection::selection&,
        const std::vector<managed_bytes_opt>&,
@@ -261,7 +265,7 @@ bool is_big(const rjson::value& val, int big_size = 100'000);
 // Check CQL's Role-Based Access Control (RBAC) permission (MODIFY,
 // SELECT, DROP, etc.) on the given table. When permission is denied an
 // appropriate user-readable api_error::access_denied is thrown.
-future<> verify_permission(bool enforce_authorization, const service::client_state&, const schema_ptr&, auth::permission);
+future<> verify_permission(bool enforce_authorization, bool warn_authorization, const service::client_state&, const schema_ptr&, auth::permission, alternator::stats& stats);

 /**
 * Make return type for serializing the object "streamed",
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -8,6 +8,8 @@

 #pragma once

+#include "cdc/cdc_options.hh"
+#include "cdc/log.hh"
 #include "seastarx.hh"
 #include "service/paxos/cas_request.hh"
 #include "service/cas_shard.hh"
@@ -56,7 +58,7 @@ public:
    static write_isolation get_write_isolation_for_schema(schema_ptr schema);

    static write_isolation default_write_isolation;
-public:
+
    static void set_default_write_isolation(std::string_view mode);

 protected:
@@ -107,10 +109,11 @@ public:
    // violating this). We mark apply() "const" to let the compiler validate
    // this for us. The output-only field _return_attributes is marked
    // "mutable" above so that apply() can still write to it.
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const = 0;
    // Convert the above apply() into the signature needed by cas_request:
-    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override;
+    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) override;
    virtual ~rmw_operation() = default;
+    const wcu_consumed_capacity_counter& consumed_capacity() const noexcept { return _consumed_capacity; }
    schema_ptr schema() const { return _schema; }
    const rjson::value& request() const { return _request; }
    rjson::value&& move_request() && { return std::move(_request); }
@@ -124,6 +127,9 @@ public:
            stats& per_table_stats,
            uint64_t& wcu_total);
    std::optional<service::cas_shard> shard_for_execute(bool needs_read_before_write);
+
+private:
+    inline bool should_fill_preimage() const { return _schema->cdc_options().enabled(); }
 };

 } // namespace alternator
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -12,7 +12,7 @@
 #include "serialization.hh"
 #include "error.hh"
 #include "types/concrete_types.hh"
-#include "cql3/type_json.hh"
+#include "types/json_utils.hh"
 #include "mutation/position_in_partition.hh"

 static logging::logger slogger("alternator-serialization");
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -31,6 +31,7 @@
 #include "utils/overloaded_functor.hh"
 #include "utils/aws_sigv4.hh"
 #include "client_data.hh"
+#include "utils/updateable_value.hh"

 static logging::logger slogger("alternator-server");

@@ -270,24 +271,57 @@ protected:
    }
 };

+// Records an authentication failure in the authentication_failures counter
+// and then, depending on enforce_authorization and warn_authorization,
+// optionally logs a warn-level message and/or throws the given exception.
+// The user name and client address appear only in the log message; they
+// are not added to the error returned to the client, which already knows
+// who it is.
+// When enforce_authorization is false this function returns normally
+// instead of throwing, so a caller that must not proceed after an
+// authentication error has to return explicitly after calling it.
+template<typename Exception>
+static void authentication_error(alternator::stats& stats, bool enforce_authorization, bool warn_authorization, Exception&& e, std::string_view user, gms::inet_address client_address) {
+    stats.authentication_failures++;
+    if (warn_authorization) {
+        if (enforce_authorization) {
+            slogger.warn("alternator_warn_authorization=true: {} for user {}, client address {}", e.what(), user, client_address);
+        } else {
+            slogger.warn("If you set alternator_enforce_authorization=true the following will be enforced: {} for user {}, client address {}", e.what(), user, client_address);
+        }
+    }
+    if (enforce_authorization) {
+        throw std::move(e);
+    }
+}
+
 future<std::string> server::verify_signature(const request& req, const chunked_content& content) {
-    if (!_enforce_authorization) {
+    if (!_enforce_authorization.get() && !_warn_authorization.get()) {
        slogger.debug("Skipping authorization");
        return make_ready_future<std::string>();
    }
    auto host_it = req._headers.find("Host");
    if (host_it == req._headers.end()) {
-        throw api_error::invalid_signature("Host header is mandatory for signature verification");
+        authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
+            api_error::invalid_signature("Host header is mandatory for signature verification"), 
+            "", req.get_client_address());
+        return make_ready_future<std::string>();
    }
    auto authorization_it = req._headers.find("Authorization");
    if (authorization_it == req._headers.end()) {
-        throw api_error::missing_authentication_token("Authorization header is mandatory for signature verification");
+        authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
+            api_error::missing_authentication_token("Authorization header is mandatory for signature verification"),
+            "", req.get_client_address());
+        return make_ready_future<std::string>();
    }
    std::string host = host_it->second;
    std::string_view authorization_header = authorization_it->second;
    auto pos = authorization_header.find_first_of(' ');
    if (pos == std::string_view::npos || authorization_header.substr(0, pos) != "AWS4-HMAC-SHA256") {
-        throw api_error::invalid_signature(fmt::format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header));
+        authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
+            api_error::invalid_signature(fmt::format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header)),
+            "", req.get_client_address());
+        return make_ready_future<std::string>();
    }
    authorization_header.remove_prefix(pos+1);
    std::string credential;
@@ -322,7 +356,9 @@ future<std::string> server::verify_signature(const request& req, const chunked_c

    std::vector<std::string_view> credential_split = split(credential, '/');
    if (credential_split.size() != 5) {
-        throw api_error::validation(fmt::format("Incorrect credential information format: {}", credential));
+        authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
+            api_error::validation(fmt::format("Incorrect credential information format: {}", credential)), "", req.get_client_address());
+        return make_ready_future<std::string>();
    }
    std::string user(credential_split[0]);
    std::string datestamp(credential_split[1]);
@@ -346,7 +382,7 @@ future<std::string> server::verify_signature(const request& req, const chunked_c
    auto cache_getter = [&proxy = _proxy, &as = _auth_service] (std::string username) {
        return get_key_from_roles(proxy, as, std::move(username));
    };
-    return _key_cache.get_ptr(user, cache_getter).then([this, &req, &content,
+    return _key_cache.get_ptr(user, cache_getter).then_wrapped([this, &req, &content,
                                                    user = std::move(user),
                                                    host = std::move(host),
                                                    datestamp = std::move(datestamp),
@@ -354,18 +390,32 @@ future<std::string> server::verify_signature(const request& req, const chunked_c
                                                    signed_headers_map = std::move(signed_headers_map),
                                                    region = std::move(region),
                                                    service = std::move(service),
-                                                    user_signature = std::move(user_signature)] (key_cache::value_ptr key_ptr) {
+                                                    user_signature = std::move(user_signature)] (future<key_cache::value_ptr> key_ptr_fut) {
+        key_cache::value_ptr key_ptr(nullptr);
+        try {
+            key_ptr = key_ptr_fut.get();
+        } catch (const api_error& e) {
+            authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
+                e, user, req.get_client_address());
+            return std::string();
+        }
        std::string signature;
        try {
            signature = utils::aws::get_signature(user, *key_ptr, std::string_view(host), "/", req._method,
                datestamp, signed_headers_str, signed_headers_map, &content, region, service, "");
        } catch (const std::exception& e) {
-            throw api_error::invalid_signature(e.what());
+            authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
+                api_error::invalid_signature(fmt::format("invalid signature: {}", e.what())),
+                user, req.get_client_address());
+            return std::string();
        }

        if (signature != std::string_view(user_signature)) {
            _key_cache.remove(user);
-            throw api_error::unrecognized_client("The security token included in the request is invalid.");
+            authentication_error(_executor._stats, _enforce_authorization.get(), _warn_authorization.get(),
+                api_error::unrecognized_client("wrong signature"),
+                user, req.get_client_address());
+            return std::string();
        }
        return user;
    });
@@ -378,35 +428,82 @@ static tracing::trace_state_ptr create_tracing_session(tracing::tracing& tracing
    return tracing_instance.create_session(tracing::trace_type::QUERY, props);
 }

-// truncated_content_view() prints a potentially long chunked_content for
-// debugging purposes. In the common case when the content is not excessively
-// long, it just returns a view into the given content, without any copying.
-// But when the content is very long, it is truncated after some arbitrary
-// max_len (or one chunk, whichever comes first), with "<truncated>" added at
-// the end. To do this modification to the string, we need to create a new
-// std::string, so the caller must pass us a reference to one, "buf", where
-// we can store the content. The returned view is only alive for as long this
-// buf is kept alive.
-static std::string_view truncated_content_view(const chunked_content& content, std::string& buf) {
-    constexpr size_t max_len = 1024;
-    if (content.empty()) {
-        return std::string_view();
-    } else if (content.size() == 1 && content.begin()->size() <= max_len) {
-        return std::string_view(content.begin()->get(), content.begin()->size());
-    } else {
-        buf = std::string(content.begin()->get(), std::min(content.begin()->size(), max_len)) + "<truncated>";
-        return std::string_view(buf);
+// A helper class to represent a potentially truncated view of a chunked_content.
+// If the content is short enough and single chunked, it just holds a view into the content.
+// Otherwise it will be copied into an internal buffer, possibly truncated (depending on maximum allowed size passed in),
+// and the view will point into that buffer.
+// `as_view()` method will return the view.
+// `take_as_sstring()` will either move out the internal buffer (if any), or create a new sstring from the view.
+// You should consider `as_view()` valid as long as both the original chunked_content and the truncated_content object are alive.
+class truncated_content {
+    std::string_view _view;
+    sstring _content_maybe;
+
+    void copy_from_content(const chunked_content& content) {
+        size_t offset = 0;
+        for(auto &tmp : content) {
+            size_t to_copy = std::min(tmp.size(), _content_maybe.size() - offset);
+            std::copy(tmp.get(), tmp.get() + to_copy, _content_maybe.data() + offset);
+            offset += to_copy;
+            if (offset >= _content_maybe.size()) {
+                break;
+            }
+        }
    }
+public:
+    truncated_content(const chunked_content& content, size_t max_len = std::numeric_limits<size_t>::max()) {
+        if (content.empty()) return;
+        if (content.size() == 1 && content.begin()->size() <= max_len) {
+            _view = std::string_view(content.begin()->get(), content.begin()->size());
+            return;
+        }
+
+        constexpr std::string_view truncated_text = "<truncated>";
+        size_t content_size = 0;
+        for(auto &tmp : content) {
+            content_size += tmp.size();
+        }
+        if (content_size <= max_len) {
+            _content_maybe = sstring{ sstring::initialized_later{}, content_size };
+            copy_from_content(content);
+        }
+        else {
+            _content_maybe = sstring{ sstring::initialized_later{}, max_len + truncated_text.size() };
+            copy_from_content(content);
+            std::copy(truncated_text.begin(), truncated_text.end(), _content_maybe.data() + _content_maybe.size() - truncated_text.size());
+        }
+        _view = std::string_view(_content_maybe);
+    }
+
+    std::string_view as_view() const { return _view; }
+    sstring take_as_sstring() && {
+        if (_content_maybe.empty() && !_view.empty()) {
+            return sstring{_view};
+        }
+        return std::move(_content_maybe);
+    }
+};
+
+// `truncated_content_view` will produce an object representing a view to a passed content
+// possibly truncated at some length. The value returned is used in two ways:
+// - to print it in logs (use `as_view()` method for this)
+// - to pass it to tracing object, where it will be stored and used later
+//   (use `take_as_sstring()` method as this produces a copy in form of a sstring)
+// `truncated_content` delays constructing `sstring` object until it's actually needed.
+// `truncated_content` is valid as long as passed `content` is alive.
+// if the content is truncated, `<truncated>` will be appended at the maximum size limit
+// and total size will be `max_users_query_size_in_trace_output() + strlen("<truncated>")`.
+static truncated_content truncated_content_view(const chunked_content& content, size_t max_size) {
+    return truncated_content{content, max_size};
 }

-static tracing::trace_state_ptr maybe_trace_query(service::client_state& client_state, std::string_view username, std::string_view op, const chunked_content& query) {
+static tracing::trace_state_ptr maybe_trace_query(service::client_state& client_state, std::string_view username, std::string_view op, const chunked_content& query, size_t max_users_query_size_in_trace_output) {
    tracing::trace_state_ptr trace_state;
    tracing::tracing& tracing_instance = tracing::tracing::get_local_tracing_instance();
    if (tracing_instance.trace_next_query() || tracing_instance.slow_query_tracing_enabled()) {
        trace_state = create_tracing_session(tracing_instance);
-        std::string buf;
        tracing::add_session_param(trace_state, "alternator_op", op);
-        tracing::add_query(trace_state, truncated_content_view(query, buf));
+        tracing::add_query(trace_state, truncated_content_view(query, max_users_query_size_in_trace_output).take_as_sstring());
        tracing::begin(trace_state, seastar::format("Alternator {}", op), client_state.get_client_address());
        if (!username.empty()) {
            tracing::set_username(trace_state, auth::authenticated_user(username));
@@ -415,25 +512,81 @@ static tracing::trace_state_ptr maybe_trace_query(service::client_state& client_
    return trace_state;
 }

+// This read_entire_stream() is similar to Seastar's read_entire_stream()
+// which reads the given content_stream until its end into non-contiguous
+// memory. The difference is that this implementation takes an extra length
+// limit, and throws an error if we read more than this limit.
+// This length-limited variant would not have been needed if Seastar's HTTP
+// server's set_content_length_limit() worked in every case, but unfortunately
+// it does not - it only works if the request has a Content-Length header (see
+// issue #8196). In contrast this function can limit the request's length no
+// matter how it's encoded. We need this limit to protect Alternator from
+// oversized requests that can deplete memory.
+static future<chunked_content>
+read_entire_stream(input_stream<char>& inp, size_t length_limit) {
+    chunked_content ret;
+    // We try to read length_limit + 1 bytes, so that we can throw an
+    // exception if we managed to read more than length_limit.
+    ssize_t remain = length_limit + 1;
+    do {
+        temporary_buffer<char> buf = co_await inp.read_up_to(remain);
+        if (buf.empty()) {
+            break;
+        }
+        remain -= buf.size();
+        ret.push_back(std::move(buf));
+    } while (remain > 0);
+    // If we read the full length_limit + 1 bytes, we went over the limit:
+    if (remain <= 0) {
+        // By throwing an error here, we may send a reply (the error message)
+        // without having read the full request body. Seastar's httpd will
+        // realize that we have not read the entire content stream, and
+        // correctly mark the connection unreusable, i.e., close it.
+        // This means we are currently exposed to issue #12166 (caused by
+        // Seastar issue 1325), where the client may get an RST instead of
+        // a FIN, and may rarely get a "Connection reset by peer" before
+        // reading the error we send.
+        throw api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", length_limit));
+    }
+    co_return ret;
+}
+
 future<executor::request_return_type> server::handle_api_request(std::unique_ptr<request> req) {
    _executor._stats.total_operations++;
    sstring target = req->get_header("X-Amz-Target");
    // target is DynamoDB API version followed by a dot '.' and operation type (e.g. CreateTable)
    auto dot = target.find('.');
    std::string_view op = (dot == sstring::npos) ? std::string_view() : std::string_view(target).substr(dot+1);
+    if (req->content_length > request_content_length_limit) {
+        // If we have a Content-Length header and know the request will be too
+        // long, we don't need to wait for read_entire_stream() below to
+        // discover it. And we definitely mustn't try to get_units() below for
+        // such a size.
+        co_return api_error::payload_too_large(fmt::format("Request content length limit of {} bytes exceeded", request_content_length_limit));
+    }
    // JSON parsing can allocate up to roughly 2x the size of the raw
    // document, + a couple of bytes for maintenance.
-    // TODO: consider the case where req->content_length is missing. Maybe
-    // we need to take the content_length_limit and return some of the units
-    // when we finish read_content_and_verify_signature?
-    size_t mem_estimate = req->content_length * 2 + 8000;
+    // If the Content-Length of the request is not available, we assume
+    // the largest possible request (request_content_length_limit, i.e., 16 MB)
+    // and after reading the request we return_units() the excess.
+    size_t mem_estimate = (req->content_length ? req->content_length : request_content_length_limit) * 2 + 8000;
    auto units_fut = get_units(*_memory_limiter, mem_estimate);
    if (_memory_limiter->waiters()) {
        ++_executor._stats.requests_blocked_memory;
    }
    auto units = co_await std::move(units_fut);
    SCYLLA_ASSERT(req->content_stream);
-    chunked_content content = co_await util::read_entire_stream(*req->content_stream);
+    chunked_content content = co_await read_entire_stream(*req->content_stream, request_content_length_limit);
+    // If the request had no Content-Length, we reserved too many units
+    // so we need to return some
+    if (req->content_length == 0) {
+        size_t content_length = 0;
+        for (const auto& chunk : content) {
+            content_length += chunk.size();
+        }
+        size_t new_mem_estimate = content_length * 2 + 8000;
+        units.return_units(mem_estimate - new_mem_estimate);
+    }
    auto username = co_await verify_signature(*req, content);
    // As long as the system_clients_entry object is alive, this request will
    // be visible in the "system.clients" virtual table. When requested, this
@@ -444,8 +597,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
        req->get_protocol_name() == "https");

    if (slogger.is_enabled(log_level::trace)) {
-        std::string buf;
-        slogger.trace("Request: {} {} {}", op, truncated_content_view(content, buf), req->_headers);
+        slogger.trace("Request: {} {} {}", op, truncated_content_view(content, _max_users_query_size_in_trace_output).as_view(), req->_headers);
    }
    auto callback_it = _callbacks.find(op);
    if (callback_it == _callbacks.end()) {
@@ -465,7 +617,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
    }
    co_await client_state.maybe_update_per_service_level_params();

-    tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content);
+    tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content, _max_users_query_size_in_trace_output.get());
    tracing::trace(trace_state, "{}", op);

    auto user = client_state.user();
@@ -516,7 +668,7 @@ server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gos
        , _auth_service(auth_service)
        , _sl_controller(sl_controller)
        , _key_cache(1024, 1min, slogger)
-        , _enforce_authorization(false)
+        , _max_users_query_size_in_trace_output(1024)
        , _enabled_servers{}
        , _pending_requests("alternator::server::pending_requests")
        , _timeout_config(_proxy.data_dictionary().get_config())
@@ -597,10 +749,13 @@ server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gos
 }

 future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
-        utils::updateable_value<bool> enforce_authorization, semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests) {
+        utils::updateable_value<bool> enforce_authorization, utils::updateable_value<bool> warn_authorization, utils::updateable_value<uint64_t> max_users_query_size_in_trace_output,
+        semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests) {
    _memory_limiter = memory_limiter;
    _enforce_authorization = std::move(enforce_authorization);
+    _warn_authorization = std::move(warn_authorization);
    _max_concurrent_requests = std::move(max_concurrent_requests);
+    _max_users_query_size_in_trace_output = std::move(max_users_query_size_in_trace_output);
    if (!port && !https_port) {
        return make_exception_future<>(std::runtime_error("Either regular port or TLS port"
                " must be specified in order to init an alternator HTTP server instance"));
@@ -610,14 +765,12 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:

        if (port) {
            set_routes(_http_server._routes);
-            _http_server.set_content_length_limit(server::content_length_limit);
            _http_server.set_content_streaming(true);
            _http_server.listen(socket_address{addr, *port}).get();
            _enabled_servers.push_back(std::ref(_http_server));
        }
        if (https_port) {
            set_routes(_https_server._routes);
-            _https_server.set_content_length_limit(server::content_length_limit);
            _https_server.set_content_streaming(true);

            if (this_shard_id() == 0) {
--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -28,7 +28,11 @@ namespace alternator {
 using chunked_content = rjson::chunked_content;

 class server : public peering_sharded_service<server> {
-    static constexpr size_t content_length_limit = 16*MB;
+    // The maximum size of a request body that Alternator will accept,
+    // in bytes. This is a safety measure to prevent Alternator from
+    // running out of memory when a client sends a very large request.
+    // DynamoDB also has the same limit set to 16 MB.
+    static constexpr size_t request_content_length_limit = 16*MB;
    using alternator_callback = std::function<future<executor::request_return_type>(executor&, executor::client_state&,
            tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<http::request>)>;
    using alternator_callbacks_map = std::unordered_map<std::string_view, alternator_callback>;
@@ -43,6 +47,8 @@ class server : public peering_sharded_service<server> {

    key_cache _key_cache;
    utils::updateable_value<bool> _enforce_authorization;
+    utils::updateable_value<bool> _warn_authorization;
+    utils::updateable_value<uint64_t> _max_users_query_size_in_trace_output;
    utils::small_vector<std::reference_wrapper<seastar::httpd::http_server>, 2> _enabled_servers;
    named_gate _pending_requests;
    // In some places we will need a CQL updateable_timeout_config object even
@@ -94,7 +100,8 @@ public:
    server(executor& executor, service::storage_proxy& proxy, gms::gossiper& gossiper, auth::service& service, qos::service_level_controller& sl_controller);

    future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
-            utils::updateable_value<bool> enforce_authorization, semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests);
+            utils::updateable_value<bool> enforce_authorization, utils::updateable_value<bool> warn_authorization, utils::updateable_value<uint64_t> max_users_query_size_in_trace_output,
+            semaphore* memory_limiter, utils::updateable_value<uint32_t> max_concurrent_requests);
    future<> stop();
    // get_client_data() is called (on each shard separately) when the virtual
    // table "system.clients" is read. It is expected to generate a list of
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -154,6 +154,18 @@ static void register_metrics_with_optional_table(seastar::metrics::metric_groups
                    [&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_get_item_histogram);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
            seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"), labels,
                    [&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_write_item_histogram);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
+                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.get_item_op_size_kb);})(op("GetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
+                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.put_item_op_size_kb);})(op("PutItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
+                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.delete_item_op_size_kb);})(op("DeleteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
+                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.update_item_op_size_kb);})(op("UpdateItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
+                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.batch_get_item_op_size_kb);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
+                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.batch_write_item_op_size_kb);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
    });

    seastar::metrics::label expression_label("expression");
@@ -176,6 +188,16 @@ static void register_metrics_with_optional_table(seastar::metrics::metric_groups
            seastar::metrics::make_total_operations("expression_cache_misses", stats.expression_cache.requests[stats::expression_types::PROJECTION_EXPRESSION].misses,
                    seastar::metrics::description("Counts number of misses of cached expressions"), labels)(expression_label("ProjectionExpression")).aggregate(aggregate_labels).set_skip_when_empty()
    });
+
+    // Only register the following metrics for the global metrics, not per-table
+    if (!has_table) {
+        metrics.add_group("alternator", {
+            seastar::metrics::make_counter("authentication_failures", stats.authentication_failures,
+                seastar::metrics::description("total number of authentication failures"), labels).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+            seastar::metrics::make_counter("authorization_failures", stats.authorization_failures,
+                seastar::metrics::description("total number of authorization failures"), labels).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+        });
+    }
 }

 void register_metrics(seastar::metrics::metric_groups& metrics, const stats& stats) {
--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -79,6 +79,43 @@ public:
        utils::estimated_histogram batch_get_item_histogram{22}; // a histogram that covers the range 1 - 100
        utils::estimated_histogram batch_write_item_histogram{22}; // a histogram that covers the range 1 - 100
    } api_operations;
+    // Operation size metrics
+    struct {
+        // Item size statistics collected per table and aggregated per node.
+        // Each histogram covers the range 0 - 446. Resolves #25143.
+        // A size is the retrieved item's size.
+        utils::estimated_histogram get_item_op_size_kb{30};
+        // A size is the maximum of the new item's size and the old item's size.
+        utils::estimated_histogram put_item_op_size_kb{30};
+        // A size is the deleted item's size. If the deleted item's size is
+        // unknown (i.e. read-before-write wasn't necessary and it wasn't
+        // forced by a configuration option), it won't be recorded on the
+        // histogram.
+        utils::estimated_histogram delete_item_op_size_kb{30};
+        // A size is the maximum of existing item's size and the estimated size
+        // of the update. This will be changed to the maximum of the existing item's
+        // size and the new item's size in a subsequent PR.
+        utils::estimated_histogram update_item_op_size_kb{30};
+
+        // A size is the sum of the sizes of all items per table. This means
+        // that a single BatchGetItem / BatchWriteItem updates the histogram
+        // for each table that it has items in.
+        // The sizes are the retrieved items' sizes grouped per table.
+        utils::estimated_histogram batch_get_item_op_size_kb{30};
+        // The sizes are the written items' sizes grouped per table.
+        utils::estimated_histogram batch_write_item_op_size_kb{30};
+    } operation_sizes;
+    // Count of authentication and authorization failures, counted if either
+    // alternator_enforce_authorization or alternator_warn_authorization are
+    // set to true. If both are false, no authentication or authorization
+    // checks are performed, so failures are not recognized or counted.
+    // "authentication" failure means the request was not signed with a valid
+    // user and key combination. "authorization" failure means the request was
+    // authenticated to a valid user - but this user did not have permissions
+    // to perform the operation (considering RBAC settings and the user's
+    // superuser status).
+    uint64_t authentication_failures = 0;
+    uint64_t authorization_failures = 0;
    // Miscellaneous event counters
    uint64_t total_operations = 0;
    uint64_t unsupported_operations = 0;
@@ -126,4 +163,8 @@ struct table_stats {
 };
 void register_metrics(seastar::metrics::metric_groups& metrics, const stats& stats);

+inline uint64_t bytes_to_kb_ceil(uint64_t bytes) {
+    return (bytes + 1023) / 1024;
+}
+
 }
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -13,7 +13,6 @@

 #include <seastar/json/formatter.hh>

-#include "auth/permission.hh"
 #include "db/config.hh"

 #include "cdc/log.hh"
@@ -127,7 +126,7 @@ public:
    }
 };

-}
+} // namespace alternator

 template<typename ValueType>
 struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_arn>
@@ -297,7 +296,7 @@ sequence_number::sequence_number(std::string_view v)
    }())
 {}

-}
+} // namespace alternator

 template<typename ValueType>
 struct rapidjson::internal::TypeHelper<ValueType, alternator::shard_id>
@@ -357,7 +356,7 @@ static stream_view_type cdc_options_to_steam_view_type(const cdc::options& opts)
    return type;
 }

-}
+} // namespace alternator

 template<typename ValueType>
 struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_view_type>
@@ -476,10 +475,10 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
        } else {
            status = "ENABLED";
        }
-    } 
+    }

    auto ttl = std::chrono::seconds(opts.ttl());
-    
+
    rjson::add(stream_desc, "StreamStatus", rjson::from_string(status));

    stream_view_type type = cdc_options_to_steam_view_type(opts);
@@ -715,7 +714,7 @@ future<executor::request_return_type> executor::get_shard_iterator(client_state&

    auto type = rjson::get<shard_iterator_type>(request, "ShardIteratorType");
    auto seq_num = rjson::get_opt<sequence_number>(request, "SequenceNumber");
-    
+
    if (type < shard_iterator_type::TRIM_HORIZON && !seq_num) {
        throw api_error::validation("Missing required parameter \"SequenceNumber\"");
    }
@@ -725,7 +724,7 @@ future<executor::request_return_type> executor::get_shard_iterator(client_state&

    auto stream_arn = rjson::get<alternator::stream_arn>(request, "StreamArn");
    auto db = _proxy.data_dictionary();
-    
+
    schema_ptr schema = nullptr;
    std::optional<shard_id> sid;

@@ -790,7 +789,7 @@ struct event_id {
        return os;
    }
 };
-}
+} // namespace alternator

 template<typename ValueType>
 struct rapidjson::internal::TypeHelper<ValueType, alternator::event_id>
@@ -828,7 +827,7 @@ future<executor::request_return_type> executor::get_records(client_state& client

    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());

-    co_await verify_permission(_enforce_authorization, client_state, schema, auth::permission::SELECT);
+    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::SELECT, _stats);

    db::consistency_level cl = db::consistency_level::LOCAL_QUORUM;
    partition_key pk = iter.shard.id.to_partition_key(*schema);
@@ -941,7 +940,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
                rjson::add(record, "awsRegion", rjson::from_string(dc_name));
                rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
                rjson::add(record, "eventSource", "scylladb:alternator");
-                rjson::add(record, "eventVersion", "1.0");
+                rjson::add(record, "eventVersion", "1.1");
                rjson::push_back(records, std::move(record));
                record = rjson::empty_object();
                --limit;
@@ -1000,6 +999,16 @@ future<executor::request_return_type> executor::get_records(client_state& client
            case cdc::operation::insert:
                rjson::add(record, "eventName", "INSERT");
                break;
+            case cdc::operation::service_row_delete:
+            case cdc::operation::service_partition_delete:
+            {
+                auto user_identity = rjson::empty_object();
+                rjson::add(user_identity, "Type", "Service");
+                rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
+                rjson::add(record, "userIdentity", std::move(user_identity));
+                rjson::add(record, "eventName", "REMOVE");
+                break;
+            }
            default:
                rjson::add(record, "eventName", "REMOVE");
                break;
@@ -1064,9 +1073,7 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
    }

    if (stream_enabled->GetBool()) {
-        auto db = sp.data_dictionary();
-
-        if (!db.features().alternator_streams) {
+        if (!sp.features().alternator_streams) {
            throw api_error::validation("StreamSpecification: alternator streams feature not enabled in cluster.");
        }

@@ -1125,4 +1132,4 @@ void executor::supplement_table_stream_info(rjson::value& descr, const schema& s
    }
 }

-}
+} // namespace alternator
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -17,6 +17,7 @@
 #include <seastar/core/lowres_clock.hh>
 #include <seastar/coroutine/maybe_yield.hh>

+#include "cdc/log.hh"
 #include "exceptions/exceptions.hh"
 #include "gms/gossiper.hh"
 #include "gms/inet_address.hh"
@@ -67,7 +68,7 @@ extern const sstring TTL_TAG_KEY;

 future<executor::request_return_type> executor::update_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.update_time_to_live++;
-    if (!_proxy.data_dictionary().features().alternator_ttl) {
+    if (!_proxy.features().alternator_ttl) {
        co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Experimental support is available if the 'alternator-ttl' experimental feature is enabled on all nodes.");
    }

@@ -94,7 +95,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
    }
    sstring attribute_name(v->GetString(), v->GetStringLength());

-    co_await verify_permission(_enforce_authorization, client_state, schema, auth::permission::ALTER);
+    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::ALTER, _stats);
    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [&](std::map<sstring, sstring>& tags_map) {
        if (enabled) {
            if (tags_map.contains(TTL_TAG_KEY)) {
@@ -292,7 +293,12 @@ static future<> expire_item(service::storage_proxy& proxy,
        db::consistency_level::LOCAL_QUORUM,
        executor::default_timeout(), // FIXME - which timeout?
        qs.get_trace_state(), qs.get_permit(),
-        db::allow_per_partition_rate_limit::no);
+        db::allow_per_partition_rate_limit::no,
+        false,
+        cdc::per_request_options{
+            .is_system_originated = true,
+        }
+    );
 }

 static size_t random_offset(size_t min, size_t max) {
@@ -747,7 +753,7 @@ static future<bool> scan_table(
        auto my_host_id = erm->get_topology().my_host_id();
        const auto &tablet_map = erm->get_token_metadata().tablets().get_tablet_map(s->id());
        for (std::optional tablet = tablet_map.first_tablet(); tablet; tablet = tablet_map.next_tablet(*tablet)) {
-            auto tablet_primary_replica = tablet_map.get_primary_replica(*tablet);
+            auto tablet_primary_replica = tablet_map.get_primary_replica(*tablet, erm->get_topology());
            // check if this is the primary replica for the current tablet
            if (tablet_primary_replica.host == my_host_id && tablet_primary_replica.shard == this_shard_id()) {
                co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -220,6 +220,25 @@
            }
         ]
      },
+      {
+         "path":"/storage_service/nodes/excluded",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Retrieve host ids of nodes which are marked as excluded",
+               "type":"array",
+               "items":{
+                  "type":"string"
+               },
+               "nickname":"get_excluded_nodes",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+               ]
+            }
+         ]
+      },
      {
         "path":"/storage_service/nodes/joining",
         "operations":[
@@ -594,6 +613,50 @@
            }
         ]
      },
+      {
+         "path": "/storage_service/natural_endpoints/v2/{keyspace}",
+         "operations": [
+            {
+               "method": "GET",
+               "summary":"This method returns the N endpoints that are responsible for storing the specified key, i.e. the replicas of this key",
+               "type": "array",
+               "items": {
+                  "type": "string"
+               },
+               "nickname": "get_natural_endpoints_v2",
+               "produces": [
+                  "application/json"
+               ],
+               "parameters": [
+                  {
+                     "name": "keyspace",
+                     "description": "The keyspace to query about.",
+                     "required": true,
+                     "allowMultiple": false,
+                     "type": "string",
+                     "paramType": "path"
+                  },
+                  {
+                     "name": "cf",
+                     "description": "Column family name.",
+                     "required": true,
+                     "allowMultiple": false,
+                     "type": "string",
+                     "paramType": "query"
+                  },
+                  {
+                     "name": "key_component",
+                     "description": "Each component of the key for which we need to find the endpoint (e.g. ?key_component=part1&key_component=part2).",
+                     "required": true,
+                     "allowMultiple": true,
+                     "type": "string",
+                     "paramType": "query"
+                  }
+               ]
+            }
+         ]
+      },
+
      {
         "path":"/storage_service/cdc_streams_check_and_repair",
         "operations":[
@@ -898,6 +961,14 @@
                          "type":"string",
                          "paramType":"query",
                          "enum": ["all", "dc", "rack", "node"]
+                      },
+                      {
+                         "name":"primary_replica_only",
+                         "description":"Load the sstables and stream to the primary replica node within the scope, if one is specified. If not, stream to the global primary replica.",
+                         "required":false,
+                         "allowMultiple":false,
+                         "type":"boolean",
+                         "paramType":"query"
                      }
                  ]
              }
@@ -984,7 +1055,7 @@
         ]
      },
      {
-         "path":"/storage_service/cleanup_all",
+         "path":"/storage_service/cleanup_all/",
         "operations":[
            {
               "method":"POST",
@@ -994,6 +1065,30 @@
               "produces":[
                  "application/json"
               ],
+               "parameters":[
+                    {
+                     "name":"global",
+                     "description":"true if cleanup of entire cluster is requested",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/storage_service/mark_node_as_clean",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Mark the node as clean. After that, the node will not be considered as needing cleanup during the automatic cleanup that is triggered by some topology operations",
+               "type":"void",
+               "nickname":"reset_cleanup_needed",
+               "produces":[
+                  "application/json"
+               ],
               "parameters":[]
            }
         ]
@@ -1100,6 +1195,14 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
+                  },
+                  {
+                     "name": "drop_unfixable_sstables",
+                     "description": "When set to true, drop unfixable sstables. Applies only to scrub mode SEGREGATE.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
                  }
               ]
            }
@@ -1519,6 +1622,30 @@
            }
         ]
      },
+      {
+         "path":"/storage_service/exclude_node",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Marks the node as permanently down (excluded).",
+               "type":"void",
+               "nickname":"exclude_node",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"hosts",
+                     "description":"Comma-separated list of host ids to exclude",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
      {
         "path":"/storage_service/removal_status",
         "operations":[
@@ -2924,7 +3051,7 @@
                  },
                  {
                     "name":"incremental_mode",
-                     "description":"Set the incremental repair mode. Can be 'disabled', 'regular', or 'full'. 'regular': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to regular.",
+                     "description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to incremental mode.",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
--- a/api/api-doc/tasks.json
+++ b/api/api-doc/tasks.json
@@ -42,6 +42,14 @@
                     "allowMultiple":false,
                     "type":"boolean",
                     "paramType":"query"
+                  },
+                  {
+                     "name":"consider_only_existing_data",
+                     "description":"Set to \"true\" to flush all memtables and force tombstone garbage collection to check only the sstables being compacted (false by default). The memtable, commitlog and other uncompacted sstables will not be checked during tombstone garbage collection.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
                  }
               ]
            }
--- a/api/api.cc
+++ b/api/api.cc
@@ -216,10 +216,10 @@ future<> unset_server_gossip(http_context& ctx) {
    });
 }

-future<> set_server_column_family(http_context& ctx, sharded<replica::database>& db, sharded<db::system_keyspace>& sys_ks) {
+future<> set_server_column_family(http_context& ctx, sharded<replica::database>& db) {
    co_await register_api(ctx, "column_family",
-                "The column family API", [&db, &sys_ks] (http_context& ctx, routes& r) {
-                    set_column_family(ctx, r, db, sys_ks);
+                "The column family API", [&db] (http_context& ctx, routes& r) {
+                    set_column_family(ctx, r, db);
                });
    co_await register_api(ctx, "cache_service",
            "The cache service API", [&db] (http_context& ctx, routes& r) {
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -58,7 +58,6 @@ class sstables_format_selector;
 namespace view {
 class view_builder;
 }
-class system_keyspace;
 }
 namespace netw { class messaging_service; }
 class repair_service;
@@ -118,7 +117,7 @@ future<> set_server_token_metadata(http_context& ctx, sharded<locator::shared_to
 future<> unset_server_token_metadata(http_context& ctx);
 future<> set_server_gossip(http_context& ctx, sharded<gms::gossiper>& g);
 future<> unset_server_gossip(http_context& ctx);
-future<> set_server_column_family(http_context& ctx, sharded<replica::database>& db, sharded<db::system_keyspace>& sys_ks);
+future<> set_server_column_family(http_context& ctx, sharded<replica::database>& db);
 future<> unset_server_column_family(http_context& ctx);
 future<> set_server_messaging_service(http_context& ctx, sharded<netw::messaging_service>& ms);
 future<> unset_server_messaging_service(http_context& ctx);
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -18,7 +18,6 @@
 #include "utils/assert.hh"
 #include "utils/estimated_histogram.hh"
 #include <algorithm>
-#include "db/system_keyspace.hh"
 #include "db/data_listeners.hh"
 #include "storage_service.hh"
 #include "compaction/compaction_manager.hh"
@@ -336,7 +335,7 @@ uint64_t accumulate_on_active_memtables(replica::table& t, noncopyable_function<
    return ret;
 }

-void set_column_family(http_context& ctx, routes& r, sharded<replica::database>& db, sharded<db::system_keyspace>& sys_ks) {
+void set_column_family(http_context& ctx, routes& r, sharded<replica::database>& db) {
    cf::get_column_family_name.set(r, [&db] (const_req req){
        std::vector<sstring> res;
        const replica::database::tables_metadata& meta = db.local().get_tables_metadata();
@@ -937,30 +936,6 @@ void set_column_family(http_context& ctx, routes& r, sharded<replica::database>&
        return set_tables_tombstone_gc(db, std::move(tables), false);
    });

-    cf::get_built_indexes.set(r, [&db, &sys_ks](std::unique_ptr<http::request> req) {
-        auto [ks, cf_name] = parse_fully_qualified_cf_name(req->get_path_param("name"));
-        // Use of load_built_views() as filtering table should be in sync with
-        // built_indexes_virtual_reader filtering with BUILT_VIEWS table
-        return sys_ks.local().load_built_views().then([ks, cf_name, &db](const std::vector<db::system_keyspace::view_name>& vb) mutable {
-            std::set<sstring> vp;
-            for (auto b : vb) {
-                if (b.first == ks) {
-                    vp.insert(b.second);
-                }
-            }
-            std::vector<sstring> res;
-            auto uuid = validate_table(db.local(), ks, cf_name);
-            replica::column_family& cf = db.local().find_column_family(uuid);
-            res.reserve(cf.get_index_manager().list_indexes().size());
-            for (auto&& i : cf.get_index_manager().list_indexes()) {
-                if (vp.contains(secondary_index::index_table_name(i.metadata().name()))) {
-                    res.emplace_back(i.metadata().name());
-                }
-            }
-            return make_ready_future<json::json_return_type>(res);
-        });
-    });
-
    cf::get_compression_metadata_off_heap_memory_used.set(r, [](const_req) {
        // FIXME
        // Currently there are no information on the compression
@@ -1215,7 +1190,6 @@ void unset_column_family(http_context& ctx, routes& r) {
    cf::disable_tombstone_gc.unset(r);
    ss::enable_tombstone_gc.unset(r);
    ss::disable_tombstone_gc.unset(r);
-    cf::get_built_indexes.unset(r);
    cf::get_compression_metadata_off_heap_memory_used.unset(r);
    cf::get_compression_parameters.unset(r);
    cf::get_compression_ratio.unset(r);
--- a/api/column_family.hh
+++ b/api/column_family.hh
@@ -13,13 +13,9 @@
 #include <any>
 #include "api/api_init.hh"

-namespace db {
-class system_keyspace;
-}
-
 namespace api {

-void set_column_family(http_context& ctx, httpd::routes& r, sharded<replica::database>& db, sharded<db::system_keyspace>& sys_ks);
+void set_column_family(http_context& ctx, httpd::routes& r, sharded<replica::database>& db);
 void unset_column_family(http_context& ctx, httpd::routes& r);

 table_info parse_table_info(const sstring& name, const replica::database& db);
--- a/api/error_injection.cc
+++ b/api/error_injection.cc
@@ -21,10 +21,10 @@ namespace hf = httpd::error_injection_json;

 void set_error_injection(http_context& ctx, routes& r) {

-    hf::enable_injection.set(r, [](std::unique_ptr<request> req) {
+    hf::enable_injection.set(r, [](std::unique_ptr<request> req) -> future<json::json_return_type> {
        sstring injection = req->get_path_param("injection");
        bool one_shot = req->get_query_param("one_shot") == "True";
-        auto params = req->content;
+        auto params = co_await util::read_entire_stream_contiguous(*req->content_stream);

        const size_t max_params_size = 1024 * 1024;
        if (params.size() > max_params_size) {
@@ -39,12 +39,11 @@ void set_error_injection(http_context& ctx, routes& r) {
                : rjson::parse_to_map<utils::error_injection_parameters>(params);

            auto& errinj = utils::get_local_injector();
-            return errinj.enable_on_all(injection, one_shot, std::move(parameters)).then([] {
-                return make_ready_future<json::json_return_type>(json::json_void());
-            });
+            co_await errinj.enable_on_all(injection, one_shot, std::move(parameters));
        } catch (const rjson::error& e) {
            throw httpd::bad_param_exception(format("Failed to parse injections parameters: {}", e.what()));
        }
+        co_return json::json_void();
    });

    hf::get_enabled_injections_on_all.set(r, [](std::unique_ptr<request> req) {
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -20,6 +20,7 @@
 #include "utils/hash.hh"
 #include <optional>
 #include <sstream>
+#include <stdexcept>
 #include <time.h>
 #include <algorithm>
 #include <functional>
@@ -36,6 +37,7 @@
 #include "gms/gossiper.hh"
 #include "db/system_keyspace.hh"
 #include <seastar/http/exception.hh>
+#include <seastar/http/short_streams.hh>
 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/parallel_for_each.hh>
 #include <seastar/coroutine/exception.hh>
@@ -272,6 +274,13 @@ scrub_info parse_scrub_options(const http_context& ctx, std::unique_ptr<http::re
        throw httpd::bad_param_exception(fmt::format("Unknown argument for 'quarantine_mode' parameter: {}", quarantine_mode_str));
    }

+    if(req_param<bool>(*req, "drop_unfixable_sstables", false)) {
+        if(scrub_mode != compaction::compaction_type_options::scrub::mode::segregate) {
+            throw httpd::bad_param_exception("The 'drop_unfixable_sstables' parameter is only valid when 'scrub_mode' is 'SEGREGATE'");
+        }
+        info.opts.drop_unfixable = compaction::compaction_type_options::scrub::drop_unfixable_sstables::yes;
+    }
+
    return info;
 }

@@ -496,17 +505,17 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
        auto bucket = req->get_query_param("bucket");
        auto prefix = req->get_query_param("prefix");
        auto scope = parse_stream_scope(req->get_query_param("scope"));
+        auto primary_replica_only = validate_bool_x(req->get_query_param("primary_replica_only"), false);

-        // TODO: the http_server backing the API does not use content streaming
-        // should use it for better performance
-        rjson::value parsed = rjson::parse(req->content);
+        rjson::chunked_content content = co_await util::read_entire_stream(*req->content_stream);
+        rjson::value parsed = rjson::parse(std::move(content));
        if (!parsed.IsArray()) {
            throw httpd::bad_param_exception("malformatted sstables in body");
        }
        auto sstables = parsed.GetArray() |
            std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
            std::ranges::to<std::vector>();
-        auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope);
+        auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
        co_return json::json_return_type(fmt::to_string(task_id));
    });

@@ -527,10 +536,35 @@ void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_build
        });
    });

+    cf::get_built_indexes.set(r, [&vb](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        auto [ks, cf_name] = parse_fully_qualified_cf_name(req->get_path_param("name"));
+        // Filtering by the result of load_built_views() should be kept in sync with
+        // built_indexes_virtual_reader, which filters using the BUILT_VIEWS table
+        std::vector<db::system_keyspace::view_name> vn = co_await vb.local().get_sys_ks().load_built_views();
+        std::set<sstring> vp;
+        for (auto b : vn) {
+            if (b.first == ks) {
+                vp.insert(b.second);
+            }
+        }
+        std::vector<sstring> res;
+        replica::database& db = vb.local().get_db();
+        auto uuid = validate_table(db, ks, cf_name);
+        replica::column_family& cf = db.find_column_family(uuid);
+        res.reserve(cf.get_index_manager().list_indexes().size());
+        for (auto&& i : cf.get_index_manager().list_indexes()) {
+            if (vp.contains(secondary_index::index_table_name(i.metadata().name()))) {
+                res.emplace_back(i.metadata().name());
+            }
+        }
+        co_return res;
+    });
+
 }

 void unset_view_builder(http_context& ctx, routes& r) {
    ss::view_build_statuses.unset(r);
+    cf::get_built_indexes.unset(r);
 }

 static future<json::json_return_type> describe_ring_as_json(sharded<service::storage_service>& ss, sstring keyspace) {
@@ -710,6 +744,14 @@ rest_get_natural_endpoints(http_context& ctx, sharded<service::storage_service>&
        return res | std::views::transform([] (auto& ep) { return fmt::to_string(ep); }) | std::ranges::to<std::vector>();
 }

+static
+json::json_return_type
+rest_get_natural_endpoints_v2(http_context& ctx, sharded<service::storage_service>& ss, const_req req) {
+        auto keyspace = validate_keyspace(ctx, req);
+        auto res = ss.local().get_natural_endpoints(keyspace, req.get_query_param("cf"), req.get_query_param_array("key_component"));
+        return res | std::views::transform([] (auto& ep) { return fmt::to_string(ep); }) | std::ranges::to<std::vector>();
+}
+
 static
 future<json::json_return_type>
 rest_cdc_streams_check_and_repair(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -723,25 +765,52 @@ rest_cdc_streams_check_and_repair(sharded<service::storage_service>& ss, std::un
 static
 future<json::json_return_type>
 rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
-        apilog.info("cleanup_all");
-        auto done = co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
+        bool global = true;
+        if (auto global_param = req->get_query_param("global"); !global_param.empty()) {
+            global = validate_bool(global_param);
+        }
+
+        apilog.info("cleanup_all global={}", global);
+
+        auto done = !global ? false : co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
            if (!ss.is_topology_coordinator_enabled()) {
                co_return false;
            }
-            co_await ss.do_cluster_cleanup();
+            co_await ss.do_clusterwide_vnodes_cleanup();
            co_return true;
        });
        if (done) {
            co_return json::json_return_type(0);
        }
-        // fall back to the local global cleanup if topology coordinator is not enabled
+        // fall back to the local cleanup if topology coordinator is not enabled or local cleanup is requested
        auto& db = ctx.db;
        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<compaction::global_cleanup_compaction_task_impl>({}, db);
        co_await task->done();
+
+        // Mark this node as clean
+        co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
+            if (ss.is_topology_coordinator_enabled()) {
+                co_await ss.reset_cleanup_needed();
+            }
+        });
+
        co_return json::json_return_type(0);
 }

+static
+future<json::json_return_type>
+rest_reset_cleanup_needed(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
+        apilog.info("reset_cleanup_needed");
+        co_await ss.invoke_on(0, [] (service::storage_service& ss) {
+            if (!ss.is_topology_coordinator_enabled()) {
+                throw std::runtime_error("mark_node_as_clean is only supported when topology over raft is enabled");
+            }
+            return ss.reset_cleanup_needed();
+        });
+        co_return json_void();
+}
+
 static
 future<json::json_return_type>
 rest_force_flush(http_context& ctx, std::unique_ptr<http::request> req) {
@@ -804,6 +873,25 @@ rest_remove_node(sharded<service::storage_service>& ss, std::unique_ptr<http::re
        });
 }

+static
+future<json::json_return_type>
+rest_exclude_node(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
+    auto hosts = utils::split_comma_separated_list(req->get_query_param("hosts"))
+        | std::views::transform([] (const sstring& s) { return locator::host_id(utils::UUID(s)); })
+        | std::ranges::to<std::vector<locator::host_id>>();
+
+    auto& topo = ss.local().get_token_metadata().get_topology();
+    for (auto host : hosts) {
+        if (!topo.has_node(host)) {
+            throw bad_param_exception(fmt::format("Host ID {} does not belong to this cluster", host));
+        }
+    }
+
+    apilog.info("exclude_node: hosts={}", hosts);
+    co_await ss.local().mark_excluded(hosts);
+    co_return json_void();
+}
+
 static
 future<json::json_return_type>
 rest_get_removal_status(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -1721,13 +1809,16 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::describe_ring.set(r, rest_bind(rest_describe_ring, ctx, ss));
    ss::get_current_generation_number.set(r, rest_bind(rest_get_current_generation_number, ss));
    ss::get_natural_endpoints.set(r, rest_bind(rest_get_natural_endpoints, ctx, ss));
+    ss::get_natural_endpoints_v2.set(r, rest_bind(rest_get_natural_endpoints_v2, ctx, ss));
    ss::cdc_streams_check_and_repair.set(r, rest_bind(rest_cdc_streams_check_and_repair, ss));
    ss::cleanup_all.set(r, rest_bind(rest_cleanup_all, ctx, ss));
+    ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
    ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
    ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
    ss::decommission.set(r, rest_bind(rest_decommission, ss));
    ss::move.set(r, rest_bind(rest_move, ss));
    ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
+    ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
    ss::get_removal_status.set(r, rest_bind(rest_get_removal_status, ss));
    ss::force_remove_completion.set(r, rest_bind(rest_force_remove_completion, ss));
    ss::set_logging_level.set(r, rest_bind(rest_set_logging_level));
@@ -1800,11 +1891,13 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::get_natural_endpoints.unset(r);
    ss::cdc_streams_check_and_repair.unset(r);
    ss::cleanup_all.unset(r);
+    ss::reset_cleanup_needed.unset(r);
    ss::force_flush.unset(r);
    ss::force_keyspace_flush.unset(r);
    ss::decommission.unset(r);
    ss::move.unset(r);
    ss::remove_node.unset(r);
+    ss::exclude_node.unset(r);
    ss::get_removal_status.unset(r);
    ss::force_remove_completion.unset(r);
    ss::set_logging_level.unset(r);
--- a/api/system.cc
+++ b/api/system.cc
@@ -54,7 +54,8 @@ void set_system(http_context& ctx, routes& r) {

    hm::set_metrics_config.set(r, [](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        rapidjson::Document doc;
-        doc.Parse(req->content.c_str());
+        auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
+        doc.Parse(content.c_str());
        if (!doc.IsArray()) {
            throw bad_param_exception("Expected a json array");
        }
@@ -87,21 +88,19 @@ void set_system(http_context& ctx, routes& r) {
                relabels[i].expr = element["regex"].GetString();
            }
        }
-        return do_with(std::move(relabels), false, [](const std::vector<seastar::metrics::relabel_config>& relabels, bool& failed) {
-            return smp::invoke_on_all([&relabels, &failed] {
-                return metrics::set_relabel_configs(relabels).then([&failed](const metrics::metric_relabeling_result& result) {
-                    if (result.metrics_relabeled_due_to_collision > 0) {
-                        failed = true;
-                    }
-                    return;
-                });
-            }).then([&failed](){
-                if (failed) {
-                    throw bad_param_exception("conflicts found during relabeling");
+        bool failed = false;
+        co_await smp::invoke_on_all([&relabels, &failed] {
+            return metrics::set_relabel_configs(relabels).then([&failed](const metrics::metric_relabeling_result& result) {
+                if (result.metrics_relabeled_due_to_collision > 0) {
+                    failed = true;
                }
-                return make_ready_future<json::json_return_type>(seastar::json::json_void());
+                return;
            });
        });
+        if (failed) {
+            throw bad_param_exception("conflicts found during relabeling");
+        }
+        co_return seastar::json::json_void();
    });

    hs::get_system_uptime.set(r, [](const_req req) {
--- a/api/tasks.cc
+++ b/api/tasks.cc
@@ -38,76 +38,78 @@ static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
    };
 }

+// Shared body of the sync and async keyspace-compaction endpoints: parses the request, logs it and starts a major compaction task.
+// flush_mode::skip is requested only when both flush_memtables and consider_only_existing_data are false; otherwise the optional stays empty.
+static future<shared_ptr<compaction::major_keyspace_compaction_task_impl>> force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
+    auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
+    const auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
+    const auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
+    apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
+
+    std::optional<compaction::flush_mode> fmopt;
+    if (!(flush || consider_only_existing_data)) {
+        fmopt = compaction::flush_mode::skip;
+    }
+    return ctx.db.local().get_compaction_manager().get_task_manager_module().make_and_start_task<compaction::major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), ctx.db, table_infos, fmopt, consider_only_existing_data);
+}
+
+// Common implementation of the sync and async upgrade_sstables endpoints: reads the
+// exclude_current_version request parameter (default false), logs the request and starts the task.
+static future<shared_ptr<compaction::upgrade_sstables_compaction_task_impl>> upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) {
+    const bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
+    apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
+
+    auto& task_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
+    return task_module.make_and_start_task<compaction::upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, exclude_current_version);
+}
+
+// Shared implementation of the sync and async keyspace-cleanup endpoints. Yields nullptr (no task started) when the keyspace's
+// replication strategy is local or not vnode-based; fails with std::runtime_error when is_vnodes_cleanup_allowed() reports false.
+static future<shared_ptr<compaction::cleanup_keyspace_compaction_task_impl>> force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
+    auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
+    const auto& rs = ctx.db.local().find_keyspace(keyspace).get_replication_strategy();
+    if (const bool local = rs.is_local(); local || !rs.is_vnode_based()) {
+        apilog.info("Keyspace {} does not {} cleanup", keyspace, local ? "require" : "support");
+        co_return nullptr;
+    }
+    apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
+    if (const bool allowed = co_await ss.local().is_vnodes_cleanup_allowed(keyspace); !allowed) {
+        const auto msg = "Can not perform cleanup operation when topology changes";
+        apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
+        co_await coroutine::return_exception(std::runtime_error(msg));
+    }
+
+    auto& task_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
+    co_return co_await task_module.make_and_start_task<compaction::cleanup_keyspace_compaction_task_impl>(
+        {}, std::move(keyspace), ctx.db, table_infos, compaction::flush_mode::all_tables, tasks::is_user_task::yes);
+}
+
 void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& snap_ctl) {
    t::force_keyspace_compaction_async.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
-        auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
-        apilog.debug("force_keyspace_compaction_async: keyspace={} tables={}, flush={}", keyspace, table_infos, flush);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        std::optional<compaction::flush_mode> fmopt;
-        if (!flush) {
-            fmopt = compaction::flush_mode::skip;
-        }
-        auto task = co_await compaction_module.make_and_start_task<compaction::major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt);
-
+        auto task = co_await force_keyspace_compaction(ctx, std::move(req));
        co_return json::json_return_type(task->get_status().id.to_sstring());
    });

    ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
-        auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
-        auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
-        apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        std::optional<compaction::flush_mode> fmopt;
-        if (!flush && !consider_only_existing_data) {
-            fmopt = compaction::flush_mode::skip;
-        }
-        auto task = co_await compaction_module.make_and_start_task<compaction::major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
+        auto task = co_await force_keyspace_compaction(ctx, std::move(req));
        co_await task->done();
        co_return json_void();
    });

    t::force_keyspace_cleanup_async.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
-        apilog.info("force_keyspace_cleanup_async: keyspace={} tables={}", keyspace, table_infos);
-        if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
-            auto msg = "Can not perform cleanup operation when topology changes";
-            apilog.warn("force_keyspace_cleanup_async: keyspace={} tables={}: {}", keyspace, table_infos, msg);
-            co_await coroutine::return_exception(std::runtime_error(msg));
+        tasks::task_id id = tasks::task_id::create_null_id();
+        auto task = co_await force_keyspace_cleanup(ctx, ss, std::move(req));
+        if (task) {
+            id = task->get_status().id;
        }
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<compaction::cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, table_infos, compaction::flush_mode::all_tables, tasks::is_user_task::yes);
-
-        co_return json::json_return_type(task->get_status().id.to_sstring());
+        co_return json::json_return_type(id.to_sstring());
    });

    ss::force_keyspace_cleanup.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
-        const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
-        if (rs.is_local() || !rs.is_vnode_based()) {
-            auto reason = rs.is_local() ? "require" : "support";
-            apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
-            co_return json::json_return_type(0);
+        auto task = co_await force_keyspace_cleanup(ctx, ss, std::move(req));
+        if (task) {
+            co_await task->done();
        }
-        apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
-        if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
-            auto msg = "Can not perform cleanup operation when topology changes";
-            apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
-            co_await coroutine::return_exception(std::runtime_error(msg));
-        }
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<compaction::cleanup_keyspace_compaction_task_impl>(
-            {}, std::move(keyspace), db, table_infos, compaction::flush_mode::all_tables, tasks::is_user_task::yes);
-        co_await task->done();
        co_return json::json_return_type(0);
    });

@@ -129,25 +131,12 @@ void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::
    }));

    t::upgrade_sstables_async.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
-
-        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<compaction::upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
-
+        auto task = co_await upgrade_sstables(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
        co_return json::json_return_type(task->get_status().id.to_sstring());
    }));

    ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
-
-        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<compaction::upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
+        auto task = co_await upgrade_sstables(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
        co_await task->done();
        co_return json::json_return_type(0);
    }));
--- a/api/token_metadata.cc
+++ b/api/token_metadata.cc
@@ -62,6 +62,17 @@ void set_token_metadata(http_context& ctx, routes& r, sharded<locator::shared_to
        return addr | std::ranges::to<std::vector>();
    });

+    ss::get_excluded_nodes.set(r, [&tm](const_req req) {
+        const auto& local_tm = *tm.local().get();
+        std::vector<sstring> eps;
+        local_tm.get_topology().for_each_node([&] (auto& node) {
+            if (node.is_excluded()) {
+                eps.push_back(node.host_id().to_sstring());
+            }
+        });
+        return eps;
+    });
+
    ss::get_joining_nodes.set(r, [&tm, &g](const_req req) {
        const auto& local_tm = *tm.local().get();
        const auto& points = local_tm.get_bootstrap_tokens();
@@ -130,6 +141,7 @@ void unset_token_metadata(http_context& ctx, routes& r) {
    ss::get_leaving_nodes.unset(r);
    ss::get_moving_nodes.unset(r);
    ss::get_joining_nodes.unset(r);
+    ss::get_excluded_nodes.unset(r);
    ss::get_host_id_map.unset(r);
    httpd::endpoint_snitch_info_json::get_datacenter.unset(r);
    httpd::endpoint_snitch_info_json::get_rack.unset(r);
--- a/audit/CMakeLists.txt
+++ b/audit/CMakeLists.txt
@@ -5,6 +5,7 @@ target_sources(scylla_audit
  PRIVATE
    audit.cc
    audit_cf_storage_helper.cc
+    audit_composite_storage_helper.cc
    audit_syslog_storage_helper.cc)
 target_include_directories(scylla_audit
  PUBLIC
--- a/audit/audit.cc
+++ b/audit/audit.cc
@@ -13,9 +13,11 @@
 #include "cql3/statements/batch_statement.hh"
 #include "cql3/statements/modification_statement.hh"
 #include "storage_helper.hh"
+#include "audit_cf_storage_helper.hh"
+#include "audit_syslog_storage_helper.hh"
+#include "audit_composite_storage_helper.hh"
 #include "audit.hh"
 #include "../db/config.hh"
-#include "utils/class_registrator.hh"

 #include <boost/algorithm/string/split.hpp>
 #include <boost/algorithm/string/trim.hpp>
@@ -26,6 +28,47 @@ namespace audit {

 logging::logger logger("audit");

+// Parses the comma-separated 'audit' setting into a set of modes. Tokens are trimmed and checked in order: the first "none" (or an empty
+// input) yields an empty set, any token other than "none"/"table"/"syslog" throws audit_exception. boost::split() never yields zero tokens.
+static std::set<sstring> parse_audit_modes(const sstring& data) {
+    if (data.empty()) {
+        return {};
+    }
+    std::vector<sstring> audit_modes;
+    boost::split(audit_modes, data, boost::is_any_of(","));
+    std::set<sstring> result;
+    for (sstring& audit_mode : audit_modes) {
+        boost::trim(audit_mode);
+        if (audit_mode == "none") {
+            return {};
+        }
+        if (audit_mode != "table" && audit_mode != "syslog") {
+            throw audit_exception(fmt::format("Bad configuration: invalid 'audit': {}", audit_mode));
+        }
+        result.insert(std::move(audit_mode));
+    }
+    return result;
+}
+
+// Creates one storage helper per recognized audit mode ("table", "syslog"); a lone helper is returned as-is, several are wrapped in a composite.
+static std::unique_ptr<storage_helper> create_storage_helper(const std::set<sstring>& audit_modes, cql3::query_processor& qp, service::migration_manager& mm) {
+    SCYLLA_ASSERT(!audit_modes.empty() && !audit_modes.contains("none"));
+    std::vector<std::unique_ptr<storage_helper>> backends;
+    for (const sstring& mode : audit_modes) {
+        if (mode == "table") {
+            backends.emplace_back(std::make_unique<audit_cf_storage_helper>(qp, mm));
+        } else if (mode == "syslog") {
+            backends.emplace_back(std::make_unique<audit_syslog_storage_helper>(qp, mm));
+        }
+    }
+    // Modes other than the two above are skipped silently, so re-check that something was built.
+    SCYLLA_ASSERT(!backends.empty());
+    if (backends.size() > 1) {
+        return std::make_unique<audit_composite_storage_helper>(std::move(backends));
+    }
+    return std::move(backends.front());
+}
+
 static sstring category_to_string(statement_category category)
 {
    switch (category) {
@@ -103,7 +146,9 @@ static std::set<sstring> parse_audit_keyspaces(const sstring& data) {
 }

 audit::audit(locator::shared_token_metadata& token_metadata,
-             sstring&& storage_helper_name,
+             cql3::query_processor& qp,
+             service::migration_manager& mm,
+             std::set<sstring>&& audit_modes,
             std::set<sstring>&& audited_keyspaces,
             std::map<sstring, std::set<sstring>>&& audited_tables,
             category_set&& audited_categories,
@@ -112,28 +157,21 @@ audit::audit(locator::shared_token_metadata& token_metadata,
    , _audited_keyspaces(std::move(audited_keyspaces))
    , _audited_tables(std::move(audited_tables))
    , _audited_categories(std::move(audited_categories))
-    , _storage_helper_class_name(std::move(storage_helper_name))
    , _cfg(cfg)
    , _cfg_keyspaces_observer(cfg.audit_keyspaces.observe([this] (sstring const& new_value){ update_config<std::set<sstring>>(new_value, parse_audit_keyspaces, _audited_keyspaces); }))
    , _cfg_tables_observer(cfg.audit_tables.observe([this] (sstring const& new_value){ update_config<std::map<sstring, std::set<sstring>>>(new_value, parse_audit_tables, _audited_tables); }))
    , _cfg_categories_observer(cfg.audit_categories.observe([this] (sstring const& new_value){ update_config<category_set>(new_value, parse_audit_categories, _audited_categories); }))
-{ }
+{
+    _storage_helper_ptr = create_storage_helper(std::move(audit_modes), qp, mm);
+}

 audit::~audit() = default;

-future<> audit::create_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm) {
-    sstring storage_helper_name;
-    if (cfg.audit() == "table") {
-        storage_helper_name = "audit_cf_storage_helper";
-    } else if (cfg.audit() == "syslog") {
-        storage_helper_name = "audit_syslog_storage_helper";
-    } else if (cfg.audit() == "none") {
-        // Audit is off
+future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm) {
+    std::set<sstring> audit_modes = parse_audit_modes(cfg.audit());
+    if (audit_modes.empty()) {
        logger.info("Audit is disabled");
-
        return make_ready_future<>();
-    } else {
-        throw audit_exception(fmt::format("Bad configuration: invalid 'audit': {}", cfg.audit()));
    }
    category_set audited_categories = parse_audit_categories(cfg.audit_categories());
    std::map<sstring, std::set<sstring>> audited_tables = parse_audit_tables(cfg.audit_tables());
@@ -143,19 +181,20 @@ future<> audit::create_audit(const db::config& cfg, sharded<locator::shared_toke
                cfg.audit(), cfg.audit_categories(), cfg.audit_keyspaces(), cfg.audit_tables());

    return audit_instance().start(std::ref(stm),
-                                  std::move(storage_helper_name),
+                                  std::ref(qp),
+                                  std::ref(mm),
+                                  std::move(audit_modes),
                                  std::move(audited_keyspaces),
                                  std::move(audited_tables),
                                  std::move(audited_categories),
-                                  std::cref(cfg));
-}
-
-future<> audit::start_audit(const db::config& cfg, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm) {
-    if (!audit_instance().local_is_initialized()) {
-        return make_ready_future<>();
-    }
-    return audit_instance().invoke_on_all([&cfg, &qp, &mm] (audit& local_audit) {
-        return local_audit.start(cfg, qp.local(), mm.local());
+                                  std::cref(cfg))
+    .then([&cfg] {
+        if (!audit_instance().local_is_initialized()) {
+            return make_ready_future<>();
+        }
+        return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
+            return local_audit.start(cfg);
+        });
    });
 }

@@ -181,15 +220,7 @@ audit_info_ptr audit::create_no_audit_info() {
    return audit_info_ptr();
 }

-future<> audit::start(const db::config& cfg, cql3::query_processor& qp, service::migration_manager& mm) {
-    try {
-        _storage_helper_ptr = create_object<storage_helper>(_storage_helper_class_name, qp, mm);
-    } catch (no_such_class& e) {
-        logger.error("Can't create audit storage helper {}: not supported", _storage_helper_class_name);
-        throw;
-    } catch (...) {
-        throw;
-    }
+future<> audit::start(const db::config& cfg) {
    return _storage_helper_ptr->start(cfg);
 }

--- a/audit/audit.hh
+++ b/audit/audit.hh
@@ -102,7 +102,6 @@ class audit final : public seastar::async_sharded_service<audit> {
    std::map<sstring, std::set<sstring>> _audited_tables;
    category_set _audited_categories;

-    sstring _storage_helper_class_name;
    std::unique_ptr<storage_helper> _storage_helper_ptr;

    const db::config& _cfg;
@@ -125,18 +124,20 @@ public:
    static audit& local_audit_instance() {
        return audit_instance().local();
    }
-    static future<> create_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm);
-    static future<> start_audit(const db::config& cfg, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
+    static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
    static future<> stop_audit();
    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table);
    static audit_info_ptr create_no_audit_info();
-    audit(locator::shared_token_metadata& stm, sstring&& storage_helper_name,
+    audit(locator::shared_token_metadata& stm,
+          cql3::query_processor& qp,
+          service::migration_manager& mm,
+          std::set<sstring>&& audit_modes,
          std::set<sstring>&& audited_keyspaces,
          std::map<sstring, std::set<sstring>>&& audited_tables,
          category_set&& audited_categories,
          const db::config& cfg);
    ~audit();
-    future<> start(const db::config& cfg, cql3::query_processor& qp, service::migration_manager& mm);
+    future<> start(const db::config& cfg);
    future<> stop();
    future<> shutdown();
    bool should_log(const audit_info* audit_info) const;
--- a/audit/audit_cf_storage_helper.cc
+++ b/audit/audit_cf_storage_helper.cc
@@ -11,11 +11,11 @@
 #include "cql3/query_processor.hh"
 #include "data_dictionary/keyspace_metadata.hh"
 #include "utils/UUID_gen.hh"
-#include "utils/class_registrator.hh"
 #include "cql3/query_options.hh"
 #include "cql3/statements/ks_prop_defs.hh"
 #include "service/migration_manager.hh"
 #include "service/storage_proxy.hh"
+#include "locator/abstract_replication_strategy.hh"

 namespace audit {

@@ -64,8 +64,8 @@ future<> audit_cf_storage_helper::migrate_audit_table(service::group0_guard grou
            data_dictionary::database db = _qp.db();
            cql3::statements::ks_prop_defs old_ks_prop_defs;
            auto old_ks_metadata = old_ks_prop_defs.as_ks_metadata_update(
-                    ks->metadata(), *_qp.proxy().get_token_metadata_ptr(), db.features());
-            std::map<sstring, sstring> strategy_opts;
+                    ks->metadata(), *_qp.proxy().get_token_metadata_ptr(), db.features(), db.get_config());
+            locator::replication_strategy_config_options strategy_opts;
            for (const auto &dc: _qp.proxy().get_token_metadata_ptr()->get_topology().get_datacenters())
                strategy_opts[dc] = "3";

@@ -73,6 +73,7 @@ future<> audit_cf_storage_helper::migrate_audit_table(service::group0_guard grou
                                                                   "org.apache.cassandra.locator.NetworkTopologyStrategy",
                                                                   strategy_opts,
                                                                   std::nullopt, // initial_tablets
+                                                                   std::nullopt, // consistency_option
                                                                   old_ks_metadata->durable_writes(),
                                                                   old_ks_metadata->get_storage_options(),
                                                                   old_ks_metadata->tables());
@@ -196,7 +197,4 @@ cql3::query_options audit_cf_storage_helper::make_login_data(socket_address node
    return cql3::query_options(cql3::default_cql_config, db::consistency_level::ONE, std::nullopt, std::move(values), false, cql3::query_options::specific_options::DEFAULT);
 }

-using registry = class_registrator<storage_helper, audit_cf_storage_helper, cql3::query_processor&, service::migration_manager&>;
-static registry registrator1("audit_cf_storage_helper");
-
 }
--- a/audit/audit_composite_storage_helper.cc
+++ b/audit/audit_composite_storage_helper.cc
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2025 ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#include <seastar/core/loop.hh>
+#include <seastar/core/future-util.hh>
+
+#include "audit/audit_composite_storage_helper.hh"
+
+#include "utils/class_registrator.hh"
+
+namespace audit {
+
+audit_composite_storage_helper::audit_composite_storage_helper(std::vector<std::unique_ptr<storage_helper>>&& storage_helpers)
+    : _storage_helpers(std::move(storage_helpers)) // take ownership of the helpers every call is forwarded to
+{}
+
+// Starts every wrapped storage helper through parallel_for_each; the returned future
+// resolves once each helper's start() has completed.
+// NOTE(review): cfg is handed to each helper by reference; presumably it must outlive that future.
+future<> audit_composite_storage_helper::start(const db::config& cfg) {
+    auto start_one = [&cfg] (std::unique_ptr<storage_helper>& helper) {
+        return helper->start(cfg);
+    };
+    return seastar::parallel_for_each(_storage_helpers, std::move(start_one));
+}
+
+// Stops every wrapped storage helper through parallel_for_each; the returned future
+// resolves once each helper's stop() has completed.
+future<> audit_composite_storage_helper::stop() {
+    auto stop_one = [] (std::unique_ptr<storage_helper>& helper) {
+        return helper->stop();
+    };
+
+    return seastar::parallel_for_each(_storage_helpers, std::move(stop_one));
+}
+
+// Forwards one audit record to every wrapped helper through parallel_for_each; the future resolves once all writes complete.
+// `audit_info` and `username` are passed through by pointer/reference and are not copied here.
+future<> audit_composite_storage_helper::write(const audit_info* audit_info,
+                                               socket_address node_ip,
+                                               socket_address client_ip,
+                                               db::consistency_level cl,
+                                               const sstring& username,
+                                               bool error) {
+    auto forward = [audit_info, node_ip, client_ip, cl, &username, error](std::unique_ptr<storage_helper>& helper) {
+        return helper->write(audit_info, node_ip, client_ip, cl, username, error);
+    };
+    return seastar::parallel_for_each(_storage_helpers, std::move(forward));
+}
+
+// Forwards a login audit record to every wrapped helper through parallel_for_each;
+// the returned future resolves once each helper's write_login() has completed.
+future<> audit_composite_storage_helper::write_login(const sstring& username,
+                                                     socket_address node_ip,
+                                                     socket_address client_ip,
+                                                     bool error) {
+    auto forward = [&username, node_ip, client_ip, error](std::unique_ptr<storage_helper>& helper) {
+        return helper->write_login(username, node_ip, client_ip, error);
+    };
+    return seastar::parallel_for_each(_storage_helpers, std::move(forward));
+}
+
+} // namespace audit
--- a/audit/audit_composite_storage_helper.hh
+++ b/audit/audit_composite_storage_helper.hh
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2025 ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+#pragma once
+
+#include "audit/audit.hh"
+#include <seastar/core/future.hh>
+
+#include "storage_helper.hh"
+
+namespace audit {
+
+class audit_composite_storage_helper : public storage_helper { // storage_helper that delegates to several wrapped helpers
+    std::vector<std::unique_ptr<storage_helper>> _storage_helpers; // owned helpers that every call is forwarded to
+
+public:
+    explicit audit_composite_storage_helper(std::vector<std::unique_ptr<storage_helper>>&&); // takes ownership of the helpers
+    virtual ~audit_composite_storage_helper() = default;
+    virtual future<> start(const db::config& cfg) override; // starts all wrapped helpers
+    virtual future<> stop() override; // stops all wrapped helpers
+    virtual future<> write(const audit_info* audit_info, // forwards the audit record to all wrapped helpers
+                           socket_address node_ip,
+                           socket_address client_ip,
+                           db::consistency_level cl,
+                           const sstring& username,
+                           bool error) override;
+    virtual future<> write_login(const sstring& username, // forwards the login record to all wrapped helpers
+                                 socket_address node_ip,
+                                 socket_address client_ip,
+                                 bool error) override;
+};
+
+} // namespace audit
--- a/audit/audit_syslog_storage_helper.cc
+++ b/audit/audit_syslog_storage_helper.cc
@@ -21,7 +21,6 @@
 #include <fmt/chrono.h>

 #include "cql3/query_processor.hh"
-#include "utils/class_registrator.hh"

 namespace cql3 {

@@ -143,7 +142,4 @@ future<> audit_syslog_storage_helper::write_login(const sstring& username,
    co_await syslog_send_helper(msg.c_str());
 }

-using registry = class_registrator<storage_helper, audit_syslog_storage_helper, cql3::query_processor&, service::migration_manager&>;
-static registry registrator1("audit_syslog_storage_helper");
-
 }
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -233,9 +233,9 @@ future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name,
 }

 future<role_to_directly_granted_map>
-ldap_role_manager::query_all_directly_granted() {
+ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
    role_to_directly_granted_map result;
-    auto roles = co_await query_all();
+    auto roles = co_await query_all(qs);
    for (auto& role: roles) {
        auto granted_set = co_await query_granted(role, recursive_role_query::no);
        for (auto& granted: granted_set) {
@@ -247,8 +247,8 @@ ldap_role_manager::query_all_directly_granted() {
    co_return result;
 }

-future<role_set> ldap_role_manager::query_all() {
-    return _std_mgr.query_all();
+future<role_set> ldap_role_manager::query_all(::service::query_state& qs) {
+    return _std_mgr.query_all(qs);
 }

 future<> ldap_role_manager::create_role(std::string_view role_name) {
@@ -311,12 +311,12 @@ future<bool> ldap_role_manager::can_login(std::string_view role_name) {
 }

 future<std::optional<sstring>> ldap_role_manager::get_attribute(
-        std::string_view role_name, std::string_view attribute_name) {
-    return _std_mgr.get_attribute(role_name, attribute_name);
+        std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
+    return _std_mgr.get_attribute(role_name, attribute_name, qs);
 }

-future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name) {
-    return _std_mgr.query_attribute_for_all(attribute_name);
+future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
+    return _std_mgr.query_attribute_for_all(attribute_name, qs);
 }

 future<> ldap_role_manager::set_attribute(
--- a/auth/ldap_role_manager.hh
+++ b/auth/ldap_role_manager.hh
@@ -75,9 +75,9 @@ class ldap_role_manager : public role_manager {

    future<role_set> query_granted(std::string_view, recursive_role_query) override;

-    future<role_to_directly_granted_map> query_all_directly_granted() override;
+    future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    future<role_set> query_all() override;
+    future<role_set> query_all(::service::query_state&) override;

    future<bool> exists(std::string_view) override;

@@ -85,9 +85,9 @@ class ldap_role_manager : public role_manager {

    future<bool> can_login(std::string_view) override;

-    future<std::optional<sstring>> get_attribute(std::string_view, std::string_view) override;
+    future<std::optional<sstring>> get_attribute(std::string_view, std::string_view, ::service::query_state&) override;

-    future<role_manager::attribute_vals> query_attribute_for_all(std::string_view) override;
+    future<role_manager::attribute_vals> query_attribute_for_all(std::string_view, ::service::query_state&) override;

    future<> set_attribute(std::string_view, std::string_view, std::string_view, ::service::group0_batch& mc) override;

--- a/auth/maintenance_socket_role_manager.cc
+++ b/auth/maintenance_socket_role_manager.cc
@@ -78,11 +78,11 @@ future<role_set> maintenance_socket_role_manager::query_granted(std::string_view
    return operation_not_supported_exception<role_set>("QUERY GRANTED");
 }

-future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted() {
+future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state&) {
    return operation_not_supported_exception<role_to_directly_granted_map>("QUERY ALL DIRECTLY GRANTED");
 }

-future<role_set> maintenance_socket_role_manager::query_all() {
+future<role_set> maintenance_socket_role_manager::query_all(::service::query_state&) {
    return operation_not_supported_exception<role_set>("QUERY ALL");
 }

@@ -98,11 +98,11 @@ future<bool> maintenance_socket_role_manager::can_login(std::string_view role_na
    return make_ready_future<bool>(true);
 }

-future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
+future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) {
    return operation_not_supported_exception<std::optional<sstring>>("GET ATTRIBUTE");
 }

-future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name) {
+future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) {
    return operation_not_supported_exception<role_manager::attribute_vals>("QUERY ATTRIBUTE");
 }

--- a/auth/maintenance_socket_role_manager.hh
+++ b/auth/maintenance_socket_role_manager.hh
@@ -53,9 +53,9 @@ public:

    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;

-    virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    virtual future<role_set> query_all() override;
+    virtual future<role_set> query_all(::service::query_state&) override;

    virtual future<bool> exists(std::string_view role_name) override;

@@ -63,9 +63,9 @@ public:

    virtual future<bool> can_login(std::string_view role_name) override;

-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;

-    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
+    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;

    virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

--- a/auth/permission.cc
+++ b/auth/permission.cc
@@ -36,7 +36,8 @@ static const std::unordered_map<sstring, auth::permission> permission_names({
        {"MODIFY", auth::permission::MODIFY},
        {"AUTHORIZE", auth::permission::AUTHORIZE},
        {"DESCRIBE", auth::permission::DESCRIBE},
-        {"EXECUTE", auth::permission::EXECUTE}});
+        {"EXECUTE", auth::permission::EXECUTE},
+        {"VECTOR_SEARCH_INDEXING", auth::permission::VECTOR_SEARCH_INDEXING}});

 const sstring& auth::permissions::to_string(permission p) {
    for (auto& v : permission_names) {
--- a/auth/permission.hh
+++ b/auth/permission.hh
@@ -33,6 +33,7 @@ enum class permission {
    // data access
    SELECT, // required for SELECT.
    MODIFY, // required for INSERT, UPDATE, DELETE, TRUNCATE.
+    VECTOR_SEARCH_INDEXING, // required for SELECT from tables with vector indexes if SELECT permission is not granted.

    // permission management
    AUTHORIZE, // required for GRANT and REVOKE.
@@ -54,7 +55,8 @@ typedef enum_set<
                permission::MODIFY,
                permission::AUTHORIZE,
                permission::DESCRIBE,
-                permission::EXECUTE>> permission_set;
+                permission::EXECUTE,
+                permission::VECTOR_SEARCH_INDEXING>> permission_set;

 bool operator<(const permission_set&, const permission_set&);

--- a/auth/resource.cc
+++ b/auth/resource.cc
@@ -41,22 +41,26 @@ static const std::unordered_map<resource_kind, std::size_t> max_parts{
        {resource_kind::functions, 2}};

 static permission_set applicable_permissions(const data_resource_view& dv) {
-    if (dv.table()) {
-        return permission_set::of<
+    
+    // VECTOR_SEARCH_INDEXING is only applicable to the root data resource (ALL KEYSPACES), i.e. when neither a keyspace nor a table is named.
+
+    auto set = permission_set::of<
                permission::ALTER,
                permission::DROP,
                permission::SELECT,
                permission::MODIFY,
                permission::AUTHORIZE>();
+
+    if (!dv.table()) { // no table component: CREATE is applicable as well
+        set.add(permission_set::of<permission::CREATE>());
    }

-    return permission_set::of<
-            permission::CREATE,
-            permission::ALTER,
-            permission::DROP,
-            permission::SELECT,
-            permission::MODIFY,
-            permission::AUTHORIZE>();
+    if (!dv.table() && !dv.keyspace()) { // neither a table nor a keyspace component is present
+        set.add(permission_set::of<permission::VECTOR_SEARCH_INDEXING>());
+    }
+
+    return set;
+        
 }

 static permission_set applicable_permissions(const role_resource_view& rv) {
--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -17,12 +17,17 @@
 #include <seastar/core/format.hh>
 #include <seastar/core/sstring.hh>

+#include "auth/common.hh"
 #include "auth/resource.hh"
 #include "cql3/description.hh"
 #include "seastarx.hh"
 #include "exceptions/exceptions.hh"
 #include "service/raft/raft_group0_client.hh"

+namespace service {
+class query_state; // forward declaration; this header only takes query_state by reference
+} // namespace service
+
 namespace auth {

 struct role_config final {
@@ -167,9 +172,9 @@ public:
    ///   (role2, role3)
    /// }
    ///  
-    virtual future<role_to_directly_granted_map> query_all_directly_granted() = 0;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state& = internal_distributed_query_state()) = 0;

-    virtual future<role_set> query_all() = 0;
+    virtual future<role_set> query_all(::service::query_state& = internal_distributed_query_state()) = 0;

    virtual future<bool> exists(std::string_view role_name) = 0;

@@ -186,12 +191,12 @@ public:
    ///
    /// \returns the value of the named attribute, if one is set.
    ///
-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) = 0;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;

    ///
    /// \returns a mapping of each role's value for the named attribute, if one is set for the role.
    ///
-    virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name) = 0;
+    virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;

    /// Sets `attribute_name` with `attribute_value` for `role_name`.
    /// \returns an exceptional future with nonexistant_role if the role does not exist.
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -215,6 +215,7 @@ future<> service::create_legacy_keyspace_if_missing(::service::migration_manager
                    meta::legacy::AUTH_KS,
                    "org.apache.cassandra.locator.SimpleStrategy",
                    opts,
+                    std::nullopt,
                    std::nullopt);

            try {
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -231,6 +231,17 @@ struct command_desc {
    } type_ = type::OTHER;
 };

+/// Similar to command_desc, but used in cases where multiple permissions allow the access to the resource.
+struct command_desc_with_permission_set {
+    permission_set permission; ///< The permissions that allow the access (see the struct comment).
+    const ::auth::resource& resource; ///< Resource being accessed; stored by reference, so it must outlive this object.
+    enum class type {
+        ALTER_WITH_OPTS,
+        ALTER_SYSTEM_WITH_ALLOWED_OPTS,
+        OTHER
+    } type_ = type::OTHER; ///< Kind of command; defaults to OTHER.
+};
+
 ///
 /// Protected resources cannot be modified even if the performer has permissions to do so.
 ///
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -663,21 +663,30 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
    });
 }

-future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted() {
+future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
    const sstring query = seastar::format("SELECT * FROM {}.{}",
            get_auth_ks_name(_qp),
            meta::role_members_table::name);

+    const auto results = co_await _qp.execute_internal( // executed with the caller-supplied query state `qs`
+            query,
+            db::consistency_level::ONE,
+            qs,
+            cql3::query_processor::cache_internal::yes);
+
    role_to_directly_granted_map roles_map;
-    co_await _qp.query_internal(query, [&roles_map] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
-        roles_map.insert({row.get_as<sstring>("member"), row.get_as<sstring>("role")});
-        co_return stop_iteration::no;
-    });
+    std::transform( // insert one (member, role) pair into roles_map per result row
+            results->begin(),
+            results->end(),
+            std::inserter(roles_map, roles_map.begin()),
+            [] (const cql3::untyped_result_set_row& row) {
+                return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
+    );

    co_return roles_map;
 }

-future<role_set> standard_role_manager::query_all() {
+future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
    const sstring query = seastar::format("SELECT {} FROM {}.{}",
            meta::roles_table::role_col_name,
            get_auth_ks_name(_qp),
@@ -695,7 +704,7 @@ future<role_set> standard_role_manager::query_all() {
    const auto results = co_await _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
-            internal_distributed_query_state(),
+            qs,
            cql3::query_processor::cache_internal::yes);

    role_set roles;
@@ -727,11 +736,11 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
    });
 }

-future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
+future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
    const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
            get_auth_ks_name(_qp),
            meta::role_attributes_table::name);
-    const auto result_set = co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
+    const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
    if (!result_set->empty()) {
        const cql3::untyped_result_set_row &row = result_set->one();
        co_return std::optional<sstring>(row.get_as<sstring>("value"));
@@ -739,11 +748,11 @@ future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_
    co_return std::optional<sstring>{};
 }

-future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name) {
-    return query_all().then([this, attribute_name] (role_set roles) {
-        return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles)] (attribute_vals &role_to_att_val) {
-            return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name] (sstring role) {
-                return get_attribute(role, attribute_name).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
+future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
+    return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
+        return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
+            return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
+                return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
                    if (att_val) {
                        role_to_att_val.emplace(std::move(role), std::move(*att_val));
                    }
@@ -788,7 +797,7 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
 future<std::vector<cql3::description>> standard_role_manager::describe_role_grants() {
    std::vector<cql3::description> result{};

-    const auto grants = co_await query_all_directly_granted();
+    const auto grants = co_await query_all_directly_granted(internal_distributed_query_state());
    result.reserve(grants.size());

    for (const auto& [grantee_role, granted_role] : grants) {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -66,9 +66,9 @@ public:

    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;

-    virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    virtual future<role_set> query_all() override;
+    virtual future<role_set> query_all(::service::query_state&) override;

    virtual future<bool> exists(std::string_view role_name) override;

@@ -76,9 +76,9 @@ public:

    virtual future<bool> can_login(std::string_view role_name) override;

-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;

-    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
+    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;

    virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -1209,6 +1209,23 @@ future<mutation> create_table_streams_mutation(table_id table, db_clock::time_po
    co_return std::move(m);
 }

+future<mutation> create_table_streams_mutation(table_id table, db_clock::time_point stream_ts, const utils::chunked_vector<cdc::stream_id>& stream_ids, api::timestamp_type ts) {
+    // Builds one cdc_streams_state mutation for `table`: a static "timestamp" cell plus one row per stream id.
+    const auto schema = db::system_keyspace::cdc_streams_state();
+    auto pk = partition_key::from_single_value(*schema, data_value(table.uuid()).serialize_nonnull());
+
+    mutation result(schema, std::move(pk));
+    result.set_static_cell("timestamp", stream_ts, ts);
+
+    for (const auto& stream : stream_ids) {
+        // Each row is keyed by the stream's token converted to int64.
+        const auto row_key = clustering_key::from_singular(*schema, dht::token::to_int64(stream.token()));
+        result.set_cell(row_key, "stream_id", data_value(stream.to_bytes()), ts);
+        co_await coroutine::maybe_yield();
+    }
+    co_return std::move(result);
+}
+
 utils::chunked_vector<mutation>
 make_drop_table_streams_mutations(table_id table, api::timestamp_type ts) {
    utils::chunked_vector<mutation> mutations;
@@ -1235,32 +1252,50 @@ future<> generation_service::load_cdc_tablet_streams(std::optional<std::unordere
        tables_to_process = _cdc_metadata.get_tables_with_cdc_tablet_streams() | std::ranges::to<std::unordered_set<table_id>>();
    }

-    auto read_streams_state = [this] (const std::optional<std::unordered_set<table_id>>& tables, noncopyable_function<future<>(table_id, db_clock::time_point, std::vector<cdc::stream_id>)> f) -> future<> {
+    auto read_streams_state = [this] (const std::optional<std::unordered_set<table_id>>& tables, noncopyable_function<future<>(table_id, db_clock::time_point, utils::chunked_vector<cdc::stream_id>)> f) -> future<> {
        if (tables) {
            for (auto table : *tables) {
-                co_await _sys_ks.local().read_cdc_streams_state(table, [&] (table_id table, db_clock::time_point base_ts, std::vector<cdc::stream_id> base_stream_set) -> future<> {
+                co_await _sys_ks.local().read_cdc_streams_state(table, [&] (table_id table, db_clock::time_point base_ts, utils::chunked_vector<cdc::stream_id> base_stream_set) -> future<> {
                    return f(table, base_ts, std::move(base_stream_set));
                });
            }
        } else {
-            co_await _sys_ks.local().read_cdc_streams_state(std::nullopt, [&] (table_id table, db_clock::time_point base_ts, std::vector<cdc::stream_id> base_stream_set) -> future<> {
+            co_await _sys_ks.local().read_cdc_streams_state(std::nullopt, [&] (table_id table, db_clock::time_point base_ts, utils::chunked_vector<cdc::stream_id> base_stream_set) -> future<> {
                return f(table, base_ts, std::move(base_stream_set));
            });
        }
    };

-    co_await read_streams_state(changed_tables, [this, &tables_to_process] (table_id table, db_clock::time_point base_ts, std::vector<cdc::stream_id> base_stream_set) -> future<> {
+    co_await read_streams_state(changed_tables, [this, &tables_to_process] (table_id table, db_clock::time_point base_ts, utils::chunked_vector<cdc::stream_id> base_stream_set) -> future<> {
        table_streams new_table_map;

-        auto append_stream = [&new_table_map] (db_clock::time_point stream_tp, std::vector<cdc::stream_id> stream_set) {
+        auto append_stream = [&new_table_map] (db_clock::time_point stream_tp, utils::chunked_vector<cdc::stream_id> stream_set) {
            auto ts = std::chrono::duration_cast<api::timestamp_clock::duration>(stream_tp.time_since_epoch()).count();
            new_table_map[ts] = committed_stream_set {stream_tp, std::move(stream_set)};
        };

-        append_stream(base_ts, std::move(base_stream_set));
+        // if we already have a loaded streams map, and the base timestamp is unchanged, then read
+        // the history entries starting from the latest one we have and append it to the existing map.
+        // we can do it because we only append new rows with higher timestamps to the history table.
+        std::optional<std::reference_wrapper<const committed_stream_set>> from_streams;
+        std::optional<db_clock::time_point> from_ts;
+        const auto& all_streams = _cdc_metadata.get_all_tablet_streams();
+        if (auto it = all_streams.find(table); it != all_streams.end()) {
+            const auto& current_map = *it->second;
+            if (current_map.cbegin()->second.ts == base_ts) {
+                const auto& latest_entry = current_map.crbegin()->second;
+                from_streams = std::cref(latest_entry);
+                from_ts = latest_entry.ts;
+            }
+        }

-        co_await _sys_ks.local().read_cdc_streams_history(table, [&] (table_id tid, db_clock::time_point ts, cdc_stream_diff diff) -> future<> {
-            const auto& prev_stream_set = std::crbegin(new_table_map)->second.streams;
+        if (!from_ts) {
+            append_stream(base_ts, std::move(base_stream_set));
+        }
+
+        co_await _sys_ks.local().read_cdc_streams_history(table, from_ts, [&] (table_id tid, db_clock::time_point ts, cdc_stream_diff diff) -> future<> {
+            const auto& prev_stream_set = new_table_map.empty() ?
+                    from_streams->get().streams : std::crbegin(new_table_map)->second.streams;

            append_stream(ts, co_await cdc::metadata::construct_next_stream_set(
                    prev_stream_set, std::move(diff.opened_streams), diff.closed_streams));
@@ -1272,7 +1307,11 @@ future<> generation_service::load_cdc_tablet_streams(std::optional<std::unordere
                new_table_map_copy[ts] = entry;
                co_await coroutine::maybe_yield();
            }
-            svc._cdc_metadata.load_tablet_streams_map(table, std::move(new_table_map_copy));
+            if (!from_ts) {
+                svc._cdc_metadata.load_tablet_streams_map(table, std::move(new_table_map_copy));
+            } else {
+                svc._cdc_metadata.append_tablet_streams_map(table, std::move(new_table_map_copy));
+            }
        }));

        tables_to_process.erase(table);
@@ -1306,7 +1345,7 @@ future<> generation_service::query_cdc_timestamps(table_id table, bool ascending
    }
 }

-future<> generation_service::query_cdc_streams(table_id table, noncopyable_function<future<>(db_clock::time_point, const std::vector<cdc::stream_id>& current, cdc::cdc_stream_diff)> f) {
+future<> generation_service::query_cdc_streams(table_id table, noncopyable_function<future<>(db_clock::time_point, const utils::chunked_vector<cdc::stream_id>& current, cdc::cdc_stream_diff)> f) {
    const auto& all_tables = _cdc_metadata.get_all_tablet_streams();
    auto table_it = all_tables.find(table);
    if (table_it == all_tables.end()) {
@@ -1363,8 +1402,8 @@ future<> generation_service::generate_tablet_resize_update(utils::chunked_vector
        co_return;
    }

-    std::vector<cdc::stream_id> new_streams;
-    new_streams.reserve(new_tablet_map.tablet_count());
+    utils::chunked_vector<cdc::stream_id> new_streams;
+    co_await utils::reserve_gently(new_streams, new_tablet_map.tablet_count());
    for (auto tid : new_tablet_map.tablet_ids()) {
        new_streams.emplace_back(new_tablet_map.get_last_token(tid), 0);
        co_await coroutine::maybe_yield();
@@ -1386,4 +1425,113 @@ future<> generation_service::generate_tablet_resize_update(utils::chunked_vector
    muts.emplace_back(std::move(mut));
 }

+future<utils::chunked_vector<mutation>> get_cdc_stream_gc_mutations(table_id table, db_clock::time_point base_ts, const utils::chunked_vector<cdc::stream_id>& base_stream_set, api::timestamp_type ts) {
+    utils::chunked_vector<mutation> muts; // result: [cdc_streams_state rewrite, cdc_streams_history cleanup]
+    muts.reserve(2);
+
+    auto gc_now = gc_clock::now(); // deletion time shared by both tombstones below
+    auto tombstone_ts = ts - 1; // one below `ts`, the timestamp of the cells written into the same partition
+
+    {
+        // rewrite the table's cdc_streams_state partition: partition tombstone, then `base_ts` and `base_stream_set` as the new base
+        auto s = db::system_keyspace::cdc_streams_state();
+        mutation m(s, partition_key::from_single_value(*s,
+            data_value(table.uuid()).serialize_nonnull()
+        ));
+        m.partition().apply(tombstone(tombstone_ts, gc_now));
+        m.set_static_cell("timestamp", data_value(base_ts), ts);
+
+        for (const auto& sid : base_stream_set) {
+            co_await coroutine::maybe_yield();
+            auto ck = clustering_key::from_singular(*s, dht::token::to_int64(sid.token())); // row key: the stream's token as int64
+            m.set_cell(ck, "stream_id", data_value(sid.to_bytes()), ts);
+        }
+        muts.emplace_back(std::move(m));
+    }
+
+    {
+        // delete the table's cdc_streams_history rows up to the new base (the `true` flag presumably makes the `base_ts` bound inclusive)
+        auto s = db::system_keyspace::cdc_streams_history();
+        mutation m(s, partition_key::from_single_value(*s,
+            data_value(table.uuid()).serialize_nonnull()
+        ));
+        auto range = query::clustering_range::make_ending_with({
+                clustering_key_prefix::from_single_value(*s, timestamp_type->decompose(base_ts)), true});
+        auto bv = bound_view::from_range(range);
+        m.partition().apply_delete(*s, range_tombstone{bv.first, bv.second, tombstone{ts, gc_now}}); // range tombstone written at `ts`
+        muts.emplace_back(std::move(m));
+    }
+
+    co_return std::move(muts);
+}
+
+table_streams::const_iterator get_new_base_for_gc(const table_streams& streams_map, std::chrono::seconds ttl) {
+    // Selects the entry that should become the new base for garbage collection.
+    // Starting from the first entry, the candidate moves forward (in map order) for as long as
+    // the following entry has a timestamp that is at least `ttl` in the past: such a successor
+    // replaced the candidate's streams at least `ttl` ago, so the candidate can be removed.
+    // Returns end() for an empty map and begin() when there is nothing to collect.
+
+    const auto cutoff = db_clock::now() - ttl;
+
+    auto base = streams_map.begin();
+    if (base == streams_map.end()) {
+        return base;
+    }
+
+    // Stop at the first successor that is newer than the cutoff, or at the end of the map.
+    for (auto succ = std::next(base); succ != streams_map.end(); ++succ) {
+        if (succ->second.ts > cutoff) {
+            break;
+        }
+        // the successor is old enough, so the current candidate is obsolete
+        base = succ;
+    }
+
+    return base;
+}
+
+future<utils::chunked_vector<mutation>> generation_service::garbage_collect_cdc_streams_for_table(table_id table, std::optional<std::chrono::seconds> ttl, api::timestamp_type ts) {
+    // Returns the mutations that drop `table`'s CDC stream metadata older than the TTL, or an empty vector if there is nothing to collect.
+    const auto& streams_map = *_cdc_metadata.get_all_tablet_streams().at(table);
+    // if TTL is not provided by the caller then use the table's CDC TTL
+    auto base_schema = cdc::get_base_table(_db, *_db.find_schema(table));
+    ttl = ttl.or_else([&] -> std::optional<std::chrono::seconds> {
+        auto ttl_seconds = base_schema->cdc_options().ttl();
+        if (ttl_seconds > 0) {
+            return std::chrono::seconds(ttl_seconds);
+        } else {
+            // ttl=0 means no ttl
+            return std::nullopt;
+        }
+    });
+    if (!ttl) {
+        co_return utils::chunked_vector<mutation>{};
+    }
+
+    auto new_base_it = get_new_base_for_gc(streams_map, *ttl);
+    if (new_base_it == streams_map.begin() || new_base_it == streams_map.end()) {
+        // nothing to gc: the first entry stays the base, or the map is empty
+        co_return utils::chunked_vector<mutation>{};
+    }
+
+    for (auto it = streams_map.begin(); it != new_base_it; ++it) {
+        cdc_log.info("Garbage collecting CDC stream metadata for table {}: removing generation {} because it is older than the CDC TTL of {} seconds",
+                table, it->second.ts, ttl->count()); // count(): the message already spells out the unit
+    }
+    // NOTE(review): `streams_map` refers into _cdc_metadata and is read during the co_await below - confirm it cannot be replaced meanwhile
+    co_return co_await get_cdc_stream_gc_mutations(table, new_base_it->second.ts, new_base_it->second.streams, ts);
+}
+
+future<> generation_service::garbage_collect_cdc_streams(utils::chunked_vector<canonical_mutation>& muts, api::timestamp_type ts) {
+    // Appends to `muts` the GC mutations of every table with CDC tablet streams; no TTL override is passed, so each table's own CDC TTL applies.
+    for (auto tid : _cdc_metadata.get_tables_with_cdc_tablet_streams()) {
+        co_await coroutine::maybe_yield();
+        auto gc_muts = co_await garbage_collect_cdc_streams_for_table(tid, std::nullopt, ts);
+        for (auto& gc_mut : gc_muts) {
+            muts.emplace_back(std::move(gc_mut));
+        }
+    }
+}
+
 } // namespace cdc
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -143,12 +143,12 @@ stream_state read_stream_state(int8_t val);

 struct committed_stream_set {
    db_clock::time_point ts;
-    std::vector<cdc::stream_id> streams;
+    utils::chunked_vector<cdc::stream_id> streams;
 };

 struct cdc_stream_diff {
-    std::vector<stream_id> closed_streams;
-    std::vector<stream_id> opened_streams;
+    utils::chunked_vector<stream_id> closed_streams;
+    utils::chunked_vector<stream_id> opened_streams;
 };

 using table_streams = std::map<api::timestamp_type, committed_stream_set>;
@@ -220,8 +220,11 @@ future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
    size_t mutation_size_threshold, api::timestamp_type mutation_timestamp);

 future<mutation> create_table_streams_mutation(table_id, db_clock::time_point, const locator::tablet_map&, api::timestamp_type);
+future<mutation> create_table_streams_mutation(table_id, db_clock::time_point, const utils::chunked_vector<cdc::stream_id>&, api::timestamp_type);
 utils::chunked_vector<mutation> make_drop_table_streams_mutations(table_id, api::timestamp_type ts);

 future<mutation> get_switch_streams_mutation(table_id table, db_clock::time_point stream_ts, cdc_stream_diff diff, api::timestamp_type ts);
+future<utils::chunked_vector<mutation>> get_cdc_stream_gc_mutations(table_id table, db_clock::time_point base_ts, const utils::chunked_vector<cdc::stream_id>& base_stream_set, api::timestamp_type ts);
+table_streams::const_iterator get_new_base_for_gc(const table_streams&, std::chrono::seconds ttl);

 } // namespace cdc
--- a/cdc/generation_service.hh
+++ b/cdc/generation_service.hh
@@ -149,10 +149,13 @@ public:
    future<> load_cdc_tablet_streams(std::optional<std::unordered_set<table_id>> changed_tables);

    future<> query_cdc_timestamps(table_id table, bool ascending, noncopyable_function<future<>(db_clock::time_point)> f);
-    future<> query_cdc_streams(table_id table, noncopyable_function<future<>(db_clock::time_point, const std::vector<cdc::stream_id>& current, cdc::cdc_stream_diff)> f);
+    future<> query_cdc_streams(table_id table, noncopyable_function<future<>(db_clock::time_point, const utils::chunked_vector<cdc::stream_id>& current, cdc::cdc_stream_diff)> f);

    future<> generate_tablet_resize_update(utils::chunked_vector<canonical_mutation>& muts, table_id table, const locator::tablet_map& new_tablet_map, api::timestamp_type ts);

+    future<utils::chunked_vector<mutation>> garbage_collect_cdc_streams_for_table(table_id table, std::optional<std::chrono::seconds> ttl, api::timestamp_type ts);
+    future<> garbage_collect_cdc_streams(utils::chunked_vector<canonical_mutation>& muts, api::timestamp_type ts);
+
 private:
    /* Retrieve the CDC generation which starts at the given timestamp (from a distributed table created for this purpose)
     * and start using it for CDC log writes if it's not obsolete.
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -23,6 +23,7 @@
 #include "bytes.hh"
 #include "index/vector_index.hh"
 #include "locator/abstract_replication_strategy.hh"
+#include "locator/topology.hh"
 #include "replica/database.hh"
 #include "db/schema_tables.hh"
 #include "gms/feature_service.hh"
@@ -62,15 +63,20 @@ logging::logger cdc_log("cdc");

 namespace {

-shared_ptr<locator::abstract_replication_strategy> generate_replication_strategy(const keyspace_metadata& ksm) {
-    locator::replication_strategy_params params(ksm.strategy_options(), ksm.initial_tablets());
-    return locator::abstract_replication_strategy::create_replication_strategy(ksm.strategy_name(), params);
+shared_ptr<locator::abstract_replication_strategy> generate_replication_strategy(const keyspace_metadata& ksm, const locator::topology& topo) {
+    locator::replication_strategy_params params(ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option());
+    return locator::abstract_replication_strategy::create_replication_strategy(ksm.strategy_name(), params, topo);
 }

+// When dropping a column from a CDC log table, we set the drop timestamp
+// `column_drop_leeway` seconds into the future to ensure that for writes concurrent
+// with column drop, the write timestamp is before the column drop timestamp.
+constexpr auto column_drop_leeway = std::chrono::seconds(5);
+
 } // anonymous namespace

 namespace cdc {
-static schema_ptr create_log_schema(const schema&, const replica::database&, const keyspace_metadata&,
+static schema_ptr create_log_schema(const schema&, const replica::database&, const keyspace_metadata&, api::timestamp_type,
        std::optional<table_id> = {}, schema_ptr = nullptr);
 }

@@ -182,7 +188,7 @@ public:
        muts.emplace_back(std::move(mut));
    }

-    void on_pre_create_column_families(const keyspace_metadata& ksm, std::vector<schema_ptr>& cfms) override {
+    void on_pre_create_column_families(const keyspace_metadata& ksm, std::vector<schema_ptr>& cfms, api::timestamp_type ts) override {
        std::vector<schema_ptr> new_cfms;

        for (auto sp : cfms) {
@@ -197,11 +203,11 @@ public:
            check_that_cdc_log_table_does_not_exist(db, schema, logname);
            ensure_that_table_has_no_counter_columns(schema);
            if (!db.features().cdc_with_tablets) {
-                ensure_that_table_uses_vnodes(ksm, schema);
+                ensure_that_table_uses_vnodes(ksm, schema, db.get_token_metadata().get_topology());
            }

            // in seastar thread
-            auto log_schema = create_log_schema(schema, db, ksm);
+            auto log_schema = create_log_schema(schema, db, ksm, ts);
            new_cfms.push_back(std::move(log_schema));
        }

@@ -244,11 +250,11 @@ public:
            check_for_attempt_to_create_nested_cdc_log(db, new_schema);
            ensure_that_table_has_no_counter_columns(new_schema);
            if (!db.features().cdc_with_tablets) {
-                ensure_that_table_uses_vnodes(*keyspace.metadata(), new_schema);
+                ensure_that_table_uses_vnodes(*keyspace.metadata(), new_schema, db.get_token_metadata().get_topology());
            }

            std::optional<table_id> maybe_id = log_schema ? std::make_optional(log_schema->id()) : std::nullopt;
-            auto new_log_schema = create_log_schema(new_schema, db, *keyspace.metadata(), std::move(maybe_id), log_schema);
+            auto new_log_schema = create_log_schema(new_schema, db, *keyspace.metadata(), timestamp, std::move(maybe_id), log_schema);

            auto log_mut = log_schema 
                ? db::schema_tables::make_update_table_mutations(_ctxt._proxy, keyspace.metadata(), log_schema, new_log_schema, timestamp)
@@ -311,7 +317,8 @@ public:
        lowres_clock::time_point timeout,
        utils::chunked_vector<mutation>&& mutations,
        tracing::trace_state_ptr tr_state,
-        db::consistency_level write_cl
+        db::consistency_level write_cl,
+        per_request_options options
    );

    template<typename Iter>
@@ -345,8 +352,8 @@ private:
    // Until we support CDC with tablets (issue #16317), we can't allow this
    // to be attempted - in particular the log table we try to create will not
    // have tablets, and will cause a failure.
-    static void ensure_that_table_uses_vnodes(const keyspace_metadata& ksm, const schema& schema) {
-        auto rs = generate_replication_strategy(ksm);
+    static void ensure_that_table_uses_vnodes(const keyspace_metadata& ksm, const schema& schema, const locator::topology& topo) {
+        auto rs = generate_replication_strategy(ksm, topo);
        if (rs->uses_tablets()) {
            throw exceptions::invalid_request_exception(format("Cannot create CDC log for a table {}.{}, because the keyspace uses tablets, and not all nodes support the CDC with tablets feature.",
                schema.ks_name(), schema.cf_name()));
@@ -580,7 +587,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
 }

 static schema_ptr create_log_schema(const schema& s, const replica::database& db,
-        const keyspace_metadata& ksm, std::optional<table_id> uuid, schema_ptr old)
+        const keyspace_metadata& ksm, api::timestamp_type timestamp, std::optional<table_id> uuid, schema_ptr old)
 {
    schema_builder b(s.ks_name(), log_name(s.cf_name()));
    b.with_partitioner(cdc::cdc_partitioner::classname);
@@ -616,6 +623,28 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
    b.with_column(log_meta_column_name_bytes("ttl"), long_type);
    b.with_column(log_meta_column_name_bytes("end_of_batch"), boolean_type);
    b.set_caching_options(caching_options::get_disabled_caching_options());
+
+    auto validate_new_column = [&] (const sstring& name) {
+        // When dropping a column from a CDC log table, we set the drop timestamp to be
+        // `column_drop_leeway` seconds into the future (see the column-drop loop later in this function).
+        // Therefore, when recreating a column with the same name, we need to validate
+        // that it's not recreated too soon and that the drop timestamp has passed.
+        if (old && old->dropped_columns().contains(name)) {
+            const auto& drop_info = old->dropped_columns().at(name);
+            auto create_time = api::timestamp_clock::time_point(api::timestamp_clock::duration(timestamp));
+            auto drop_time = api::timestamp_clock::time_point(api::timestamp_clock::duration(drop_info.timestamp));
+            if (drop_time > create_time) {
+                throw exceptions::invalid_request_exception(format("Cannot add column {} because a column with the same name was dropped too recently. Please retry after {} seconds",
+                        name, std::chrono::duration_cast<std::chrono::seconds>(drop_time - create_time).count() + 1));
+            }
+        }
+    };
+
+    auto add_column = [&] (sstring name, data_type type) {
+        validate_new_column(name);
+        b.with_column(to_bytes(name), type);
+    };
+
    auto add_columns = [&] (const schema::const_iterator_range_type& columns, bool is_data_col = false) {
        for (const auto& column : columns) {
            auto type = column.type;
@@ -637,9 +666,9 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
                    }
                ));
            }
-            b.with_column(log_data_column_name_bytes(column.name()), type);
+            add_column(log_data_column_name(column.name_as_text()), type);
            if (is_data_col) {
-                b.with_column(log_data_column_deleted_name_bytes(column.name()), boolean_type);
+                add_column(log_data_column_deleted_name(column.name_as_text()), boolean_type);
            }
            if (column.type->is_multi_cell()) {
                auto dtype = visit(*type, make_visitor(
@@ -655,7 +684,7 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
                        throw std::invalid_argument("Should not reach");
                    }
                ));
-                b.with_column(log_data_column_deleted_elements_name_bytes(column.name()), dtype);
+                add_column(log_data_column_deleted_elements_name(column.name_as_text()), dtype);
            }
        }
    };
@@ -668,7 +697,7 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
        b.set_uuid(*uuid);
    }

-    auto rs = generate_replication_strategy(ksm);
+    auto rs = generate_replication_strategy(ksm, db.get_token_metadata().get_topology());
    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(*rs, db.get_token_metadata()));
    b.add_extension(tombstone_gc_extension::NAME, std::move(tombstone_gc_ext));

@@ -681,7 +710,8 @@ static schema_ptr create_log_schema(const schema& s, const replica::database& db
        // not super efficient, but we don't do this often.
        for (auto& col : old->all_columns()) {
            if (!b.has_column({col.name(), col.name_as_text() })) {
-                b.without_column(col.name_as_text(), col.type, api::new_timestamp());
+                auto drop_ts = api::timestamp_clock::now() + column_drop_leeway;
+                b.without_column(col.name_as_text(), col.type, drop_ts.time_since_epoch().count());
            }
        }
    }
@@ -1366,6 +1396,13 @@ struct process_row_visitor {
 };

 struct process_change_visitor {
+    const per_request_options& _request_options;
+    // The types of the operations used for row / partition deletes. Introduced
+    // to differentiate service operations (e.g. operation::service_row_delete
+    // vs operation::row_delete).
+    const operation _row_delete_op = operation::row_delete;
+    const operation _partition_delete_op = operation::partition_delete;
+
    stats::part_type_set& _touched_parts;

    log_mutation_builder& _builder;
@@ -1428,7 +1465,7 @@ struct process_change_visitor {
    void clustered_row_delete(const clustering_key& ckey, const tombstone&) {
        _touched_parts.set<stats::part_type::ROW_DELETE>();

-        auto log_ck = _builder.allocate_new_log_row(operation::row_delete);
+        auto log_ck = _builder.allocate_new_log_row(_row_delete_op);
        _builder.set_clustering_columns(log_ck, ckey);

        if (_enable_updating_state && get_row_state(_clustering_row_states, ckey)) {
@@ -1472,7 +1509,7 @@ struct process_change_visitor {

    void partition_delete(const tombstone&) {
        _touched_parts.set<stats::part_type::PARTITION_DELETE>();
-        auto log_ck = _builder.allocate_new_log_row(operation::partition_delete);
+        auto log_ck = _builder.allocate_new_log_row(_partition_delete_op);
        if (_enable_updating_state) {
            _clustering_row_states.clear();
        }
@@ -1487,6 +1524,7 @@ private:
    schema_ptr _schema;
    dht::decorated_key _dk;
    schema_ptr _log_schema;
+    const per_request_options& _options;

    /**
     * #6070, #6084
@@ -1576,11 +1614,12 @@ private:
    stats::part_type_set _touched_parts;

 public:
-    transformer(db_context ctx, schema_ptr s, dht::decorated_key dk)
+    transformer(db_context ctx, schema_ptr s, dht::decorated_key dk, const per_request_options& options)
        : _ctx(ctx)
        , _schema(std::move(s))
        , _dk(std::move(dk))
-        , _log_schema(ctx._proxy.get_db().local().find_schema(_schema->ks_name(), log_name(_schema->cf_name())))
+        , _log_schema(_schema->cdc_schema() ? _schema->cdc_schema() : ctx._proxy.get_db().local().find_schema(_schema->ks_name(), log_name(_schema->cf_name())))
+        , _options(options)
        , _clustering_row_states(0, clustering_key::hashing(*_schema), clustering_key::equality(*_schema))
        , _uses_tablets(ctx._proxy.get_db().local().find_keyspace(_schema->ks_name()).uses_tablets())
    {
@@ -1595,7 +1634,7 @@ public:
    }

    void produce_preimage(const clustering_key* ck, const one_kind_column_set& columns_to_include) override {
-        // iff we want full preimage, just ignore the affected columns and include everything. 
+        // if we want full preimage, just ignore the affected columns and include everything. 
        generate_image(operation::pre_image, ck, _schema->cdc_options().full_preimage() ? nullptr : &columns_to_include);
    };

@@ -1681,6 +1720,9 @@ public:
    void process_change(const mutation& m) override {
        SCYLLA_ASSERT(_builder);
        process_change_visitor v {
+            ._request_options = _options,
+            ._row_delete_op = _options.is_system_originated ? operation::service_row_delete : operation::row_delete,
+            ._partition_delete_op = _options.is_system_originated ? operation::service_partition_delete : operation::partition_delete,
            ._touched_parts = _touched_parts,
            ._builder = *_builder,
            ._enable_updating_state = _enable_updating_state,
@@ -1712,7 +1754,8 @@ public:
            const mutation& m)
    {
        auto& p = m.partition();
-        if (p.clustered_rows().empty() && p.static_row().empty()) {
+        const bool no_ck_schema_partition_deletion = m.schema()->clustering_key_size() == 0 && bool(p.partition_tombstone());
+        if (p.clustered_rows().empty() && p.static_row().empty() && !no_ck_schema_partition_deletion) {
            return make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>();
        }

@@ -1761,12 +1804,12 @@ public:
                });
            }
        }
-        if (!p.clustered_rows().empty()) {
+        if (!p.clustered_rows().empty() || no_ck_schema_partition_deletion) {
            const bool has_row_delete = std::any_of(p.clustered_rows().begin(), p.clustered_rows().end(), [] (const rows_entry& re) {
                return re.row().deleted_at();
            });
            // for postimage we need everything...
-            if (has_row_delete || _schema->cdc_options().postimage() || _schema->cdc_options().full_preimage()) {
+            if (has_row_delete || _schema->cdc_options().postimage() || _schema->cdc_options().full_preimage() || no_ck_schema_partition_deletion) {
                for (const column_definition& c: _schema->regular_columns()) {
                    regular_columns.emplace_back(c.id);
                    columns.emplace_back(&c);
@@ -1881,7 +1924,7 @@ transform_mutations(utils::chunked_vector<mutation>& muts, decltype(muts.size())
 } // namespace cdc

 future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
-cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
+cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl, per_request_options options) {
    // we do all this because in the case of batches, we can have mixed schemas.
    auto e = mutations.end();
    auto i = std::find_if(mutations.begin(), e, [](const mutation& m) {
@@ -1895,9 +1938,9 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
    tracing::trace(tr_state, "CDC: Started generating mutations for log rows");
    mutations.reserve(2 * mutations.size());

-    return do_with(std::move(mutations), service::query_state(service::client_state::for_internal_calls(), empty_service_permit()), operation_details{},
-            [this, tr_state = std::move(tr_state), write_cl] (utils::chunked_vector<mutation>& mutations, service::query_state& qs, operation_details& details) {
-        return transform_mutations(mutations, 1, [this, &mutations, &qs, tr_state = tr_state, &details, write_cl] (int idx) mutable {
+    return do_with(std::move(mutations), service::query_state(service::client_state::for_internal_calls(), empty_service_permit()), operation_details{}, std::move(options),
+            [this, tr_state = std::move(tr_state), write_cl] (utils::chunked_vector<mutation>& mutations, service::query_state& qs, operation_details& details, per_request_options& options) {
+        return transform_mutations(mutations, 1, [this, &mutations, &qs, tr_state = tr_state, &details, write_cl, &options] (int idx) mutable {
            auto& m = mutations[idx];
            auto s = m.schema();

@@ -1905,12 +1948,16 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
                return make_ready_future<>();
            }

-            transformer trans(_ctxt, s, m.decorated_key());
+            transformer trans(_ctxt, s, m.decorated_key(), options);

            auto f = make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>(nullptr);
-            if (s->cdc_options().preimage() || s->cdc_options().postimage()) {
+            if (options.preimage && !options.preimage->empty()) {
+                // Preimage has been fetched by upper layers.
+                tracing::trace(tr_state, "CDC: Using a prefetched preimage");
+                f = make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>(options.preimage);
+            } else if (s->cdc_options().preimage() || s->cdc_options().postimage()) {
                // Note: further improvement here would be to coalesce the pre-image selects into one
-                // iff a batch contains several modifications to the same table. Otoh, batch is rare(?)
+                // if a batch contains several modifications to the same table. Otoh, batch is rare(?)
                // so this is premature.
                tracing::trace(tr_state, "CDC: Selecting preimage for {}", m.decorated_key());
                f = trans.pre_image_select(qs.get_client_state(), write_cl, m).then_wrapped([this] (future<lw_shared_ptr<cql3::untyped_result_set>> f) {
@@ -1971,11 +2018,11 @@ bool cdc::cdc_service::needs_cdc_augmentation(const utils::chunked_vector<mutati
 }

 future<std::tuple<utils::chunked_vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
-cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
+cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, utils::chunked_vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl, per_request_options options) {
    if (utils::get_local_injector().enter("sleep_before_cdc_augmentation")) {
-        return seastar::sleep(std::chrono::milliseconds(100)).then([this, timeout, mutations = std::move(mutations), tr_state = std::move(tr_state), write_cl] () mutable {
-            return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
+        return seastar::sleep(std::chrono::milliseconds(100)).then([this, timeout, mutations = std::move(mutations), tr_state = std::move(tr_state), write_cl, options = std::move(options)] () mutable {
+            return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl, std::move(options));
        });
    }
-    return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
+    return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl, std::move(options));
 }
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -21,6 +21,7 @@
 #include <seastar/core/shared_ptr.hh>
 #include <seastar/core/sstring.hh>

+#include "cql3/untyped_result_set.hh"
 #include "mutation/timestamp.hh"
 #include "tracing/trace_state.hh"
 #include "utils/UUID.hh"
@@ -51,6 +52,29 @@ class database;

 namespace cdc {

+// cdc log table operation
+enum class operation : int8_t {
+    // note: these values will eventually be read by a third party, probably not privy to this
+    // enum decl, so don't change the constant values (or the datatype).
+    pre_image = 0, update = 1, insert = 2, row_delete = 3, partition_delete = 4,
+    range_delete_start_inclusive = 5, range_delete_start_exclusive = 6, range_delete_end_inclusive = 7, range_delete_end_exclusive = 8,
+    post_image = 9,
+
+    // Operations initiated internally by Scylla. Currently used only by Alternator
+    service_row_delete = -3, service_partition_delete = -4,
+};
+
+struct per_request_options {
+    // The value of the base row before the current operation, as queried by
+    // layers above CDC. We assume that CDC could have seen the row in this
+    // state, i.e. the value isn't 'stale'/'too recent'.
+    lw_shared_ptr<cql3::untyped_result_set> preimage;
+    // Whether this mutation is a result of an internal operation initiated by
+    // Scylla. Currently, only TTL expiration implementation for Alternator
+    // uses this.
+    const bool is_system_originated = false;
+};
+
 struct operation_result_tracker;
 class db_context;
 class metadata;
@@ -80,8 +104,9 @@ public:
        lowres_clock::time_point timeout,
        utils::chunked_vector<mutation>&& mutations,
        tracing::trace_state_ptr tr_state,
-        db::consistency_level write_cl
-        );
+        db::consistency_level write_cl,
+        per_request_options options = {}
+    );
    bool needs_cdc_augmentation(const utils::chunked_vector<mutation>&) const;
 };

@@ -93,15 +118,6 @@ struct db_context final {
        : _proxy(proxy), _migration_notifier(notifier), _cdc_metadata(cdc_meta) {}
 };

-// cdc log table operation
-enum class operation : int8_t {
-    // note: these values will eventually be read by a third party, probably not privvy to this
-    // enum decl, so don't change the constant values (or the datatype).
-    pre_image = 0, update = 1, insert = 2, row_delete = 3, partition_delete = 4,
-    range_delete_start_inclusive = 5, range_delete_start_exclusive = 6, range_delete_end_inclusive = 7, range_delete_end_exclusive = 8,
-    post_image = 9,
-};
-
 bool is_log_for_some_table(const replica::database& db, const sstring& ks_name, const std::string_view& table_name);

 schema_ptr get_base_table(const replica::database&, const schema&);
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -54,7 +54,7 @@ cdc::stream_id get_stream(
 }

 static cdc::stream_id get_stream(
-        const std::vector<cdc::stream_id>& streams,
+        const utils::chunked_vector<cdc::stream_id>& streams,
        dht::token tok) {
    if (streams.empty()) {
        on_internal_error(cdc_log, "get_stream: streams empty");
@@ -159,7 +159,7 @@ cdc::stream_id cdc::metadata::get_vnode_stream(api::timestamp_type ts, dht::toke
    return ret;
 }

-const std::vector<cdc::stream_id>& cdc::metadata::get_tablet_stream_set(table_id tid, api::timestamp_type ts) const {
+const utils::chunked_vector<cdc::stream_id>& cdc::metadata::get_tablet_stream_set(table_id tid, api::timestamp_type ts) const {
    auto now = api::new_timestamp();
    if (ts > now + get_generation_leeway().count()) {
        throw exceptions::invalid_request_exception(seastar::format(
@@ -259,10 +259,10 @@ bool cdc::metadata::prepare(db_clock::time_point tp) {
    return !it->second;
 }

-future<std::vector<cdc::stream_id>> cdc::metadata::construct_next_stream_set(
-        const std::vector<cdc::stream_id>& prev_stream_set,
-        std::vector<cdc::stream_id> opened,
-        const std::vector<cdc::stream_id>& closed) {
+future<utils::chunked_vector<cdc::stream_id>> cdc::metadata::construct_next_stream_set(
+        const utils::chunked_vector<cdc::stream_id>& prev_stream_set,
+        utils::chunked_vector<cdc::stream_id> opened,
+        const utils::chunked_vector<cdc::stream_id>& closed) {

    if (closed.size() == prev_stream_set.size()) {
        // all previous streams are closed, so the next stream set is just the opened streams.
@@ -273,8 +273,8 @@ future<std::vector<cdc::stream_id>> cdc::metadata::construct_next_stream_set(
    // streams and removing the closed streams. we assume each stream set is
    // sorted by token, and the result is sorted as well.

-    std::vector<cdc::stream_id> next_stream_set;
-    next_stream_set.reserve(prev_stream_set.size() + opened.size() - closed.size());
+    utils::chunked_vector<cdc::stream_id> next_stream_set;
+    co_await utils::reserve_gently(next_stream_set, prev_stream_set.size() + opened.size() - closed.size());

    auto next_prev = prev_stream_set.begin();
    auto next_closed = closed.begin();
@@ -306,6 +306,10 @@ void cdc::metadata::load_tablet_streams_map(table_id tid, table_streams new_tabl
    _tablet_streams[tid] = make_lw_shared(std::move(new_table_map));
 }

+void cdc::metadata::append_tablet_streams_map(table_id tid, table_streams new_table_map) {
+    _tablet_streams[tid]->insert(std::make_move_iterator(new_table_map.begin()), std::make_move_iterator(new_table_map.end()));
+}
+
 void cdc::metadata::remove_tablet_streams_map(table_id tid) {
    _tablet_streams.erase(tid);
 }
@@ -314,8 +318,8 @@ std::vector<table_id> cdc::metadata::get_tables_with_cdc_tablet_streams() const
    return _tablet_streams | std::views::keys | std::ranges::to<std::vector<table_id>>();
 }

-future<cdc::cdc_stream_diff> cdc::metadata::generate_stream_diff(const std::vector<stream_id>& before, const std::vector<stream_id>& after) {
-    std::vector<stream_id> closed, opened;
+future<cdc::cdc_stream_diff> cdc::metadata::generate_stream_diff(const utils::chunked_vector<stream_id>& before, const utils::chunked_vector<stream_id>& after) {
+    utils::chunked_vector<stream_id> closed, opened;

    auto before_it = before.begin();
    auto after_it = after.begin();
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -37,7 +37,9 @@ class metadata final {
    using container_t = std::map<api::timestamp_type, std::optional<topology_description>>;
    container_t _gens;

-    using table_streams_ptr = lw_shared_ptr<const table_streams>;
+    // per-table streams map for tables in tablets-based keyspaces.
+    // the streams map is shared with the virtual tables reader, hence we can only insert new entries into it, not erase.
+    using table_streams_ptr = lw_shared_ptr<table_streams>;
    using tablet_streams_map = std::unordered_map<table_id, table_streams_ptr>;

    tablet_streams_map _tablet_streams;
@@ -47,7 +49,7 @@ class metadata final {

    container_t::const_iterator gen_used_at(api::timestamp_type ts) const;

-    const std::vector<stream_id>& get_tablet_stream_set(table_id tid, api::timestamp_type ts) const;
+    const utils::chunked_vector<stream_id>& get_tablet_stream_set(table_id tid, api::timestamp_type ts) const;

 public:
    /* Is a generation with the given timestamp already known or obsolete? It is obsolete if and only if
@@ -100,6 +102,7 @@ public:
    bool prepare(db_clock::time_point ts);

    void load_tablet_streams_map(table_id tid, table_streams new_table_map);
+    void append_tablet_streams_map(table_id tid, table_streams new_table_map);
    void remove_tablet_streams_map(table_id tid);

    const tablet_streams_map& get_all_tablet_streams() const {
@@ -108,14 +111,14 @@ public:

    std::vector<table_id> get_tables_with_cdc_tablet_streams() const;

-    static future<std::vector<stream_id>> construct_next_stream_set(
-        const std::vector<cdc::stream_id>& prev_stream_set,
-        std::vector<cdc::stream_id> opened,
-        const std::vector<cdc::stream_id>& closed);
+    static future<utils::chunked_vector<stream_id>> construct_next_stream_set(
+        const utils::chunked_vector<cdc::stream_id>& prev_stream_set,
+        utils::chunked_vector<cdc::stream_id> opened,
+        const utils::chunked_vector<cdc::stream_id>& closed);

    static future<cdc_stream_diff> generate_stream_diff(
-        const std::vector<stream_id>& before,
-        const std::vector<stream_id>& after);
+        const utils::chunked_vector<stream_id>& before,
+        const utils::chunked_vector<stream_id>& after);

 };

--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -111,6 +111,15 @@ struct batch {
                ret.insert(std::make_pair(change.key, all_columns));
            }
        }
+        // While deleting a full partition avoids row-by-row logging for performance
+        // reasons, we must explicitly log single-row deletions for tables without a
+        // clustering key. This ensures consistent behavior with deletions of single
+        // rows from tables with a clustering key. See issue #26382.
+        if (partition_deletions && s.clustering_key_size() == 0) {
+            cdc::one_kind_column_set all_columns{s.regular_columns_count()};
+            all_columns.set(0, s.regular_columns_count(), true);
+            ret.emplace(clustering_key::make_empty(), all_columns);
+        }

        auto process_change_type = [&] (const auto& changes) {
            for (const auto& change : changes) {
--- a/cmake/mode.common.cmake
+++ b/cmake/mode.common.cmake
@@ -117,6 +117,9 @@ add_compile_options("-ffile-prefix-map=${CMAKE_BINARY_DIR}=.")
 cmake_path(GET CMAKE_BINARY_DIR FILENAME build_dir_name)
 add_compile_options("-ffile-prefix-map=${CMAKE_BINARY_DIR}/=${build_dir_name}")

+# https://github.com/llvm/llvm-project/issues/163007
+add_compile_options("-fextend-variable-liveness=none")
+
 default_target_arch(target_arch)
 if(target_arch)
  add_compile_options("-march=${target_arch}")
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -129,6 +129,7 @@ static const std::unordered_map<compaction_type, sstring> compaction_types = {
    { compaction_type::Upgrade, "UPGRADE" },
    { compaction_type::Reshape, "RESHAPE" },
    { compaction_type::Split, "SPLIT" },
+    { compaction_type::Major, "MAJOR" },
 };

 sstring compaction_name(compaction_type type) {
@@ -159,6 +160,7 @@ std::string_view to_string(compaction_type type) {
    case compaction_type::Upgrade: return "Upgrade";
    case compaction_type::Reshape: return "Reshape";
    case compaction_type::Split: return "Split";
+    case compaction_type::Major: return "Major";
    }
    on_internal_error_noexcept(clogger, format("Invalid compaction type {}", int(type)));
    return "(invalid)";
@@ -1537,6 +1539,8 @@ private:
        mutation_fragment_stream_validator _validator;
        bool _skip_to_next_partition = false;
        uint64_t& _validation_errors;
+        bool& _failed_to_fix_sstable;
+        compaction_type_options::scrub::drop_unfixable_sstables _drop_unfixable_sstables;

    private:
        void maybe_abort_scrub(std::function<void()> report_error) {
@@ -1547,7 +1551,7 @@ private:
            ++_validation_errors;
        }

-        void on_unexpected_partition_start(const mutation_fragment_v2& ps, sstring error) {
+        skip on_unexpected_partition_start(const mutation_fragment_v2& ps, sstring error) {
            auto report_fn = [this, error] (std::string_view action = "") {
                report_validation_error(compaction_type::Scrub, *_schema, error, action);
            };
@@ -1556,6 +1560,11 @@ private:

            auto pe = mutation_fragment_v2(*_schema, _permit, partition_end{});
            if (!_validator(pe)) {
+                if (_drop_unfixable_sstables) {
+                    _failed_to_fix_sstable = true;
+                    end_stream();
+                    return skip::yes;
+                }
                throw compaction_aborted_exception(
                        _schema->ks_name(),
                        _schema->cf_name(),
@@ -1564,11 +1573,17 @@ private:
            push_mutation_fragment(std::move(pe));

            if (!_validator(ps)) {
+                if (_drop_unfixable_sstables) {
+                    _failed_to_fix_sstable = true;
+                    end_stream();
+                    return skip::yes;
+                }
                throw compaction_aborted_exception(
                        _schema->ks_name(),
                        _schema->cf_name(),
                        "scrub compaction failed to rectify unexpected partition-start, validator rejects it even after the injected partition-end");
            }
+            return skip::no;
        }

        skip on_invalid_partition(const dht::decorated_key& new_key, sstring error) {
@@ -1596,6 +1611,11 @@ private:
            const auto& key = _validator.previous_partition_key();

            if (_validator.current_tombstone()) {
+                if (_drop_unfixable_sstables) {
+                    _failed_to_fix_sstable = true;
+                    end_stream();
+                    return skip::yes;
+                }
                throw compaction_aborted_exception(
                        _schema->ks_name(),
                        _schema->cf_name(),
@@ -1635,13 +1655,21 @@ private:
        }

        void on_malformed_sstable_exception(std::exception_ptr e) {
-            if (_scrub_mode != compaction_type_options::scrub::mode::skip) {
+            bool should_abort = _scrub_mode == compaction_type_options::scrub::mode::abort ||
+                    (_scrub_mode == compaction_type_options::scrub::mode::segregate && !_drop_unfixable_sstables);
+            if (should_abort) {
                throw compaction_aborted_exception(
                        _schema->ks_name(),
                        _schema->cf_name(),
                        format("scrub compaction failed due to unrecoverable error: {}", e));
            }
+            if (_drop_unfixable_sstables) {
+                _failed_to_fix_sstable = true;
+            }
+            end_stream();
+        }

+        void end_stream() {
            // Closes the active range tombstone if needed, before emitting partition end.
            if (auto current_tombstone = _validator.current_tombstone(); current_tombstone) {
                const auto& last_pos = _validator.previous_position();
@@ -1662,6 +1690,10 @@ private:
        void fill_buffer_from_underlying() {
            utils::get_local_injector().inject("rest_api_keyspace_scrub_abort", [] { throw compaction_aborted_exception("", "", "scrub compaction found invalid data"); });
            while (!_reader.is_buffer_empty() && !is_buffer_full()) {
+                if (_end_of_stream && _failed_to_fix_sstable) {
+                    return;
+                }
+
                auto mf = _reader.pop_mutation_fragment();
                if (mf.is_partition_start()) {
                    // First check that fragment kind monotonicity stands.
@@ -1672,7 +1704,9 @@ private:
                    // will confuse it.
                    if (!_skip_to_next_partition) {
                        if (auto res = _validator(mf); !res) {
-                            on_unexpected_partition_start(mf, res.what());
+                            if (on_unexpected_partition_start(mf, res.what()) == skip::yes) {
+                                continue;
+                            }
                        }
                        // Continue processing this partition start.
                    }
@@ -1696,6 +1730,10 @@ private:
                push_mutation_fragment(std::move(mf));
            }

+            if (_end_of_stream && _failed_to_fix_sstable) {
+                return;
+            }
+
            _end_of_stream = _reader.is_end_of_stream() && _reader.is_buffer_empty();

            if (_end_of_stream) {
@@ -1706,12 +1744,15 @@ private:
        }

    public:
-        reader(mutation_reader underlying, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors)
+        reader(mutation_reader underlying, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors,
+                bool& failed_to_fix_sstable, compaction_type_options::scrub::drop_unfixable_sstables drop_unfixable_sstables)
            : impl(underlying.schema(), underlying.permit())
            , _scrub_mode(scrub_mode)
            , _reader(std::move(underlying))
            , _validator(*_schema)
            , _validation_errors(validation_errors)
+            , _failed_to_fix_sstable(failed_to_fix_sstable)
+            , _drop_unfixable_sstables(drop_unfixable_sstables)
        { }
        virtual future<> fill_buffer() override {
            if (_end_of_stream) {
@@ -1762,6 +1803,7 @@ private:
    mutable std::string _scrub_finish_description;
    uint64_t _bucket_count = 0;
    uint64_t _validation_errors = 0;
+    bool _failed_to_fix_sstable = false;

 public:
    scrub_compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_type_options::scrub options, compaction_progress_monitor& progress_monitor)
@@ -1793,7 +1835,7 @@ public:
            on_internal_error(clogger, fmt::format("Scrub compaction in mode {} expected full partition range, but got {} instead", _options.operation_mode, range));
        }
        auto full_scan_reader = _compacting->make_full_scan_reader(std::move(s), std::move(permit), nullptr, unwrap_monitor_generator(), sstables::integrity_check::yes);
-        return make_mutation_reader<reader>(std::move(full_scan_reader), _options.operation_mode, _validation_errors);
+        return make_mutation_reader<reader>(std::move(full_scan_reader), _options.operation_mode, _validation_errors, _failed_to_fix_sstable, _options.drop_unfixable);
    }

    uint64_t partitions_per_sstable() const override {
@@ -1830,11 +1872,45 @@ public:
        return ret;
    }

-    friend mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors);
+    void drop_unfixable_sstables() {
+        if (!_sstables.empty() || !used_garbage_collected_sstables().empty()) {
+            std::vector<sstables::shared_sstable> old_sstables;
+            std::move(_sstables.begin(), _sstables.end(), std::back_inserter(old_sstables));
+
+            // Also remove any garbage-collected SSTables that were previously added to the SSTable set.
+            auto& used_gc_sstables = used_garbage_collected_sstables();
+            old_sstables.insert(old_sstables.end(), used_gc_sstables.begin(), used_gc_sstables.end());
+
+            _replacer(get_compaction_completion_desc(std::move(old_sstables), {}));
+        }
+
+        // Mark the partially written and unused new sstables for deletion as well.
+        for (auto& sst : boost::range::join(_new_partial_sstables, _new_unused_sstables)) {
+            sst->mark_for_deletion();
+        }
+    }
+
+    virtual void on_end_of_compaction() override {
+        if (_options.drop_unfixable && _failed_to_fix_sstable) {
+            drop_unfixable_sstables();
+        } else {
+            regular_compaction::on_end_of_compaction();
+        }
+    }
+
+    virtual void stop_sstable_writer(compaction_writer* writer) override {
+        if (_options.drop_unfixable && _failed_to_fix_sstable && writer) {
+            finish_new_sstable(writer);
+        } else {
+            regular_compaction::stop_sstable_writer(writer);
+        }
+    }
+
+    friend mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors, bool& failed_to_fix_sstable, compaction_type_options::scrub::drop_unfixable_sstables drop_unfixable_sstables);
 };

-mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors) {
-    return make_mutation_reader<scrub_compaction::reader>(std::move(rd), scrub_mode, validation_errors);
+mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors, bool& failed_to_fix_sstable, compaction_type_options::scrub::drop_unfixable_sstables drop_unfixable_sstables) {
+    return make_mutation_reader<scrub_compaction::reader>(std::move(rd), scrub_mode, validation_errors, failed_to_fix_sstable, drop_unfixable_sstables);
 }

 class resharding_compaction final : public compaction {
@@ -1971,6 +2047,7 @@ compaction_type compaction_type_options::type() const {
        compaction_type::Reshard,
        compaction_type::Reshape,
        compaction_type::Split,
+        compaction_type::Major,
    };
    static_assert(std::variant_size_v<compaction_type_options::options_variant> == std::size(index_to_type));
    return index_to_type[_options.index()];
@@ -1992,6 +2069,9 @@ static std::unique_ptr<compaction> make_compaction(compaction_group_view& table_
        std::unique_ptr<compaction> operator()(compaction_type_options::regular) {
            return std::make_unique<regular_compaction>(table_s, std::move(descriptor), cdata, progress_monitor);
        }
+        std::unique_ptr<compaction> operator()(compaction_type_options::major) {
+            return std::make_unique<regular_compaction>(table_s, std::move(descriptor), cdata, progress_monitor);
+        }
        std::unique_ptr<compaction> operator()(compaction_type_options::cleanup) {
            return std::make_unique<cleanup_compaction>(table_s, std::move(descriptor), cdata, progress_monitor);
        }
--- a/compaction/compaction.hh
+++ b/compaction/compaction.hh
@@ -138,6 +138,6 @@ std::unordered_set<sstables::shared_sstable>
 get_fully_expired_sstables(const compaction_group_view& table_s, const std::vector<sstables::shared_sstable>& compacting, gc_clock::time_point gc_before);

 // For tests, can drop after we virtualize sstables.
-mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors);
+mutation_reader make_scrubbing_reader(mutation_reader rd, compaction_type_options::scrub::mode scrub_mode, uint64_t& validation_errors, bool& failed_to_fix_sstable, compaction_type_options::scrub::drop_unfixable_sstables drop_unfixable_sstables);

 }
--- a/compaction/compaction_descriptor.hh
+++ b/compaction/compaction_descriptor.hh
@@ -20,7 +20,7 @@
 namespace compaction {

 enum class compaction_type {
-    Compaction = 0,
+    Compaction = 0, // Used only for regular compactions
    Cleanup = 1,
    Validation = 2, // Origin uses this for a compaction that is used exclusively for repair
    Scrub = 3,
@@ -29,6 +29,7 @@ enum class compaction_type {
    Upgrade = 6,
    Reshape = 7,
    Split = 8,
+    Major = 9,
 };

 struct compaction_completion_desc {
@@ -49,6 +50,8 @@ class compaction_type_options {
 public:
    struct regular {
    };
+    struct major {
+    };
    struct cleanup {
    };
    struct upgrade {
@@ -74,6 +77,11 @@ public:
        // Should invalid sstables be moved into quarantine.
        // Only applies to validate-mode.
        quarantine_invalid_sstables quarantine_sstables = quarantine_invalid_sstables::yes;
+
+        using drop_unfixable_sstables = bool_class<class drop_unfixable_sstables_tag>;
+        // Drop sstables that cannot be fixed, rather than aborting the scrub.
+        // Only applies to segregate-mode.
+        drop_unfixable_sstables drop_unfixable = drop_unfixable_sstables::no;
    };
    struct reshard {
    };
@@ -83,7 +91,7 @@ public:
        mutation_writer::classify_by_token_group classifier;
    };
 private:
-    using options_variant = std::variant<regular, cleanup, upgrade, scrub, reshard, reshape, split>;
+    using options_variant = std::variant<regular, cleanup, upgrade, scrub, reshard, reshape, split, major>;

 private:
    options_variant _options;
@@ -105,6 +113,10 @@ public:
        return compaction_type_options(regular{});
    }

+    static compaction_type_options make_major() {
+        return compaction_type_options(major{});
+    }
+
    static compaction_type_options make_cleanup() {
        return compaction_type_options(cleanup{});
    }
@@ -113,8 +125,8 @@ public:
        return compaction_type_options(upgrade{});
    }

-    static compaction_type_options make_scrub(scrub::mode mode, scrub::quarantine_invalid_sstables quarantine_sstables = scrub::quarantine_invalid_sstables::yes) {
-        return compaction_type_options(scrub{.operation_mode = mode, .quarantine_sstables = quarantine_sstables});
+    static compaction_type_options make_scrub(scrub::mode mode, scrub::quarantine_invalid_sstables quarantine_sstables = scrub::quarantine_invalid_sstables::yes, scrub::drop_unfixable_sstables drop_unfixable_sstables = scrub::drop_unfixable_sstables::no) {
+        return compaction_type_options(scrub{.operation_mode = mode, .quarantine_sstables = quarantine_sstables, .drop_unfixable = drop_unfixable_sstables});
    }

    static compaction_type_options make_split(mutation_writer::classify_by_token_group classifier) {
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -547,7 +547,7 @@ public:
            compaction_group_view* t,
            tasks::task_id parent_id,
            bool consider_only_existing_data)
-        : compaction_task_executor(mgr, do_throw_if_stopping, t, compaction_type::Compaction, "Major compaction")
+        : compaction_task_executor(mgr, do_throw_if_stopping, t, compaction_type::Major, "Major compaction")
        , major_compaction_task_impl(mgr._task_manager_module, tasks::task_id::create_random_id(), 0, "compaction group", t->schema()->ks_name(), t->schema()->cf_name(), "", parent_id, flush_mode::compacted_tables, consider_only_existing_data)
    {
        _status.progress_units = "bytes";
@@ -1506,7 +1506,7 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
        co_return;
    }
    auto num_runs_for_compaction = [&, this] -> future<size_t> {
-        auto& cs = t.get_compaction_strategy();
+        auto cs = t.get_compaction_strategy();
        auto desc = co_await cs.get_sstables_for_compaction(t, get_strategy_control());
        co_return std::ranges::size(desc.sstables
            | std::views::transform(std::mem_fn(&sstables::sstable::run_identifier))
@@ -2313,7 +2313,7 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sst
    }
    owned_ranges_ptr owned_ranges_ptr = {};
    sstring option_desc = fmt::format("mode: {};\nquarantine_mode: {}\n", opts.operation_mode, opts.quarantine_operation_mode);
-    co_return co_await rewrite_sstables(t, compaction_type_options::make_scrub(scrub_mode), std::move(owned_ranges_ptr), [&t, opts] -> future<std::vector<sstables::shared_sstable>> {
+    co_return co_await rewrite_sstables(t, compaction_type_options::make_scrub(scrub_mode, opts.quarantine_sstables, opts.drop_unfixable), std::move(owned_ranges_ptr), [&t, opts] -> future<std::vector<sstables::shared_sstable>> {
        auto all_sstables = co_await get_all_sstables(t);
        std::vector<sstables::shared_sstable> sstables = all_sstables
                | std::views::filter([&opts] (const sstables::shared_sstable& sst) {
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -569,7 +569,7 @@ protected:
                                sstables::offstrategy offstrategy = sstables::offstrategy::no);
    future<> update_history(::compaction::compaction_group_view& t, compaction_result&& res, const compaction_data& cdata);
    bool should_update_history(compaction_type ct) {
-        return ct == compaction_type::Compaction;
+        return ct == compaction_type::Compaction || ct == compaction_type::Major;
    }
 public:
    compaction_manager::compaction_stats_opt get_stats() const noexcept {
--- a/compaction/compaction_strategy.cc
+++ b/compaction/compaction_strategy.cc
@@ -41,7 +41,7 @@ using timestamp_type = api::timestamp_type;

 compaction_descriptor compaction_strategy_impl::make_major_compaction_job(std::vector<sstables::shared_sstable> candidates, int level, uint64_t max_sstable_bytes) {
    // run major compaction in maintenance priority
-    return compaction_descriptor(std::move(candidates), level, max_sstable_bytes);
+    return compaction_descriptor(std::move(candidates), level, max_sstable_bytes, sstables::run_id::create_random_id(), compaction_type_options::make_major());
 }

 std::vector<compaction_descriptor> compaction_strategy_impl::get_cleanup_compaction_jobs(compaction_group_view& table_s, std::vector<sstables::shared_sstable> candidates) const {
@@ -804,9 +804,9 @@ compaction_strategy_state compaction_strategy_state::make(const compaction_strat
        case compaction_strategy_type::incremental:
            return compaction_strategy_state(default_empty_state{});
        case compaction_strategy_type::leveled:
-            return compaction_strategy_state(leveled_compaction_strategy_state{});
+            return compaction_strategy_state(seastar::make_shared<leveled_compaction_strategy_state>());
        case compaction_strategy_type::time_window:
-            return compaction_strategy_state(time_window_compaction_strategy_state{});
+            return compaction_strategy_state(seastar::make_shared<time_window_compaction_strategy_state>());
        default:
            throw std::runtime_error("strategy not supported");
    }
--- a/compaction/compaction_strategy_state.hh
+++ b/compaction/compaction_strategy_state.hh
@@ -18,7 +18,7 @@ namespace compaction {
 class compaction_strategy_state {
 public:
    struct default_empty_state {};
-    using states_variant = std::variant<default_empty_state, leveled_compaction_strategy_state, time_window_compaction_strategy_state>;
+    using states_variant = std::variant<default_empty_state, leveled_compaction_strategy_state_ptr, time_window_compaction_strategy_state_ptr>;
 private:
    states_variant _state;
 public:
--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -14,12 +14,12 @@

 namespace compaction {

-leveled_compaction_strategy_state& leveled_compaction_strategy::get_state(compaction_group_view& table_s) const {
-    return table_s.get_compaction_strategy_state().get<leveled_compaction_strategy_state>();
+leveled_compaction_strategy_state_ptr leveled_compaction_strategy::get_state(compaction_group_view& table_s) const {
+    return table_s.get_compaction_strategy_state().get<leveled_compaction_strategy_state_ptr>();
 }

 future<compaction_descriptor> leveled_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
-    auto& state = get_state(table_s);
+    auto state = get_state(table_s);
    auto candidates = co_await control.candidates(table_s);
    // NOTE: leveled_manifest creation may be slightly expensive, so later on,
    // we may want to store it in the strategy itself. However, the sstable
@@ -27,10 +27,10 @@ future<compaction_descriptor> leveled_compaction_strategy::get_sstables_for_comp
    // sstable in it may be marked for deletion after compacted.
    // Currently, we create a new manifest whenever it's time for compaction.
    leveled_manifest manifest = leveled_manifest::create(table_s, candidates, _max_sstable_size_in_mb, _stcs_options);
-    if (!state.last_compacted_keys) {
-        generate_last_compacted_keys(state, manifest);
+    if (!state->last_compacted_keys) {
+        generate_last_compacted_keys(*state, manifest);
    }
-    auto candidate = manifest.get_compaction_candidates(*state.last_compacted_keys, state.compaction_counter);
+    auto candidate = manifest.get_compaction_candidates(*state->last_compacted_keys, state->compaction_counter);

    if (!candidate.sstables.empty()) {
        auto main_set = co_await table_s.main_sstable_set();
@@ -78,12 +78,12 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(comp
 }

 void leveled_compaction_strategy::notify_completion(compaction_group_view& table_s, const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added) {
-    auto& state = get_state(table_s);
+    auto state = get_state(table_s);
    // All the update here is only relevant for regular compaction's round-robin picking policy, and if
    // last_compacted_keys wasn't generated by regular, it means regular is disabled since last restart,
    // therefore we can skip the updates here until regular runs for the first time. Once it runs,
    // it will be able to generate last_compacted_keys correctly by looking at metadata of files.
-    if (removed.empty() || added.empty() || !state.last_compacted_keys) {
+    if (removed.empty() || added.empty() || !state->last_compacted_keys) {
        return;
    }
    auto min_level = std::numeric_limits<uint32_t>::max();
@@ -99,16 +99,16 @@ void leveled_compaction_strategy::notify_completion(compaction_group_view& table
        }
        target_level = std::max(target_level, int(candidate->get_sstable_level()));
    }
-    state.last_compacted_keys.value().at(min_level) = last->get_last_decorated_key();
+    state->last_compacted_keys.value().at(min_level) = last->get_last_decorated_key();

    for (int i = leveled_manifest::MAX_LEVELS - 1; i > 0; i--) {
-        state.compaction_counter[i]++;
+        state->compaction_counter[i]++;
    }
-    state.compaction_counter[target_level] = 0;
+    state->compaction_counter[target_level] = 0;

    if (leveled_manifest::logger.level() == logging::log_level::debug) {
-        for (auto j = 0U; j < state.compaction_counter.size(); j++) {
-            leveled_manifest::logger.debug("CompactionCounter: {}: {}", j, state.compaction_counter[j]);
+        for (auto j = 0U; j < state->compaction_counter.size(); j++) {
+            leveled_manifest::logger.debug("CompactionCounter: {}: {}", j, state->compaction_counter[j]);
        }
    }
 }
--- a/compaction/leveled_compaction_strategy.hh
+++ b/compaction/leveled_compaction_strategy.hh
@@ -36,6 +36,8 @@ struct leveled_compaction_strategy_state {
    leveled_compaction_strategy_state();
 };

+using leveled_compaction_strategy_state_ptr = seastar::shared_ptr<leveled_compaction_strategy_state>;
+
 class leveled_compaction_strategy : public compaction_strategy_impl {
    static constexpr int32_t DEFAULT_MAX_SSTABLE_SIZE_IN_MB = 160;
    static constexpr auto SSTABLE_SIZE_OPTION = "sstable_size_in_mb";
@@ -45,7 +47,7 @@ class leveled_compaction_strategy : public compaction_strategy_impl {
 private:
    int32_t calculate_max_sstable_size_in_mb(std::optional<sstring> option_value) const;

-    leveled_compaction_strategy_state& get_state(compaction_group_view& table_s) const;
+    leveled_compaction_strategy_state_ptr get_state(compaction_group_view& table_s) const;
 public:
    static unsigned ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size);
    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -13,6 +13,7 @@
 #include "sstables/sstables.hh"
 #include "sstables/sstable_set_impl.hh"
 #include "compaction_strategy_state.hh"
+#include "utils/error_injection.hh"

 #include <ranges>

@@ -22,8 +23,8 @@ extern logging::logger clogger;

 using timestamp_type = api::timestamp_type;

-time_window_compaction_strategy_state& time_window_compaction_strategy::get_state(compaction_group_view& table_s) const {
-    return table_s.get_compaction_strategy_state().get<time_window_compaction_strategy_state>();
+time_window_compaction_strategy_state_ptr time_window_compaction_strategy::get_state(compaction_group_view& table_s) const {
+    return table_s.get_compaction_strategy_state().get<time_window_compaction_strategy_state_ptr>();
 }

 const std::unordered_map<sstring, std::chrono::seconds> time_window_compaction_strategy_options::valid_window_units = {
@@ -335,7 +336,7 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<sstables::shared_

 future<compaction_descriptor>
 time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_view& table_s, strategy_control& control) {
-    auto& state = get_state(table_s);
+    auto state = get_state(table_s);
    auto compaction_time = gc_clock::now();
    auto candidates = co_await control.candidates(table_s);

@@ -344,7 +345,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_vi
    }

    auto now = db_clock::now();
-    if (now - state.last_expired_check > _options.expired_sstable_check_frequency) {
+    if (now - state->last_expired_check > _options.expired_sstable_check_frequency) {
        clogger.debug("[{}] TWCS expired check sufficiently far in the past, checking for fully expired SSTables", fmt::ptr(this));

        // Find fully expired SSTables. Those will be included no matter what.
@@ -356,12 +357,14 @@ time_window_compaction_strategy::get_sstables_for_compaction(compaction_group_vi
        // Keep checking for fully_expired_sstables until we don't find
        // any among the candidates, meaning they are either already compacted
        // or registered for compaction.
-        state.last_expired_check = now;
+        state->last_expired_check = now;
    } else {
        clogger.debug("[{}] TWCS skipping check for fully expired SSTables", fmt::ptr(this));
    }

-    auto compaction_candidates = get_next_non_expired_sstables(table_s, control, std::move(candidates), compaction_time);
+    co_await utils::get_local_injector().inject("twcs_get_sstables_for_compaction", utils::wait_for_message(30s));
+
+    auto compaction_candidates = get_next_non_expired_sstables(table_s, control, std::move(candidates), compaction_time, *state);
    clogger.debug("[{}] Going to compact {} non-expired sstables", fmt::ptr(this), compaction_candidates.size());
    co_return compaction_descriptor(std::move(compaction_candidates));
 }
@@ -384,8 +387,8 @@ time_window_compaction_strategy::compaction_mode(const time_window_compaction_st

 std::vector<sstables::shared_sstable>
 time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control,
-        std::vector<sstables::shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time) {
-    auto most_interesting = get_compaction_candidates(table_s, control, non_expiring_sstables);
+        std::vector<sstables::shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time, time_window_compaction_strategy_state& state) {
+    auto most_interesting = get_compaction_candidates(table_s, control, non_expiring_sstables, state);

    if (!most_interesting.empty()) {
        return most_interesting;
@@ -410,14 +413,14 @@ time_window_compaction_strategy::get_next_non_expired_sstables(compaction_group_
 }

 std::vector<sstables::shared_sstable>
-time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidate_sstables) {
-    auto& state = get_state(table_s);
+time_window_compaction_strategy::get_compaction_candidates(compaction_group_view& table_s, strategy_control& control,
+    std::vector<sstables::shared_sstable> candidate_sstables, time_window_compaction_strategy_state& state) {
    auto [buckets, max_timestamp] = get_buckets(std::move(candidate_sstables), _options);
    // Update the highest window seen, if necessary
    state.highest_window_seen = std::max(state.highest_window_seen, max_timestamp);

    return newest_bucket(table_s, control, std::move(buckets), table_s.min_compaction_threshold(), table_s.schema()->max_compaction_threshold(),
-        state.highest_window_seen);
+        state.highest_window_seen, state);
 }

 timestamp_type
@@ -465,8 +468,7 @@ namespace compaction {

 std::vector<sstables::shared_sstable>
 time_window_compaction_strategy::newest_bucket(compaction_group_view& table_s, strategy_control& control, std::map<timestamp_type, std::vector<sstables::shared_sstable>> buckets,
-        int min_threshold, int max_threshold, timestamp_type now) {
-    auto& state = get_state(table_s);
+        int min_threshold, int max_threshold, timestamp_type now, time_window_compaction_strategy_state& state) {
    clogger.debug("time_window_compaction_strategy::newest_bucket:\n  now {}\n{}", now, buckets);

    for (auto&& [key, bucket] : buckets | std::views::reverse) {
@@ -517,7 +519,7 @@ time_window_compaction_strategy::trim_to_threshold(std::vector<sstables::shared_
 }

 future<int64_t> time_window_compaction_strategy::estimated_pending_compactions(compaction_group_view& table_s) const {
-    auto& state = get_state(table_s);
+    auto state = get_state(table_s);
    auto min_threshold = table_s.min_compaction_threshold();
    auto max_threshold = table_s.schema()->max_compaction_threshold();
    auto main_set = co_await table_s.main_sstable_set();
@@ -526,7 +528,7 @@ future<int64_t> time_window_compaction_strategy::estimated_pending_compactions(c

    int64_t n = 0;
    for (auto& [bucket_key, bucket] : buckets) {
-        switch (compaction_mode(state, bucket, bucket_key, max_timestamp, min_threshold)) {
+        switch (compaction_mode(*state, bucket, bucket_key, max_timestamp, min_threshold)) {
        case bucket_compaction_mode::size_tiered:
            n += size_tiered_compaction_strategy::estimated_pending_compactions(bucket, min_threshold, max_threshold, _stcs_options);
            break;
--- a/compaction/time_window_compaction_strategy.hh
+++ b/compaction/time_window_compaction_strategy.hh
@@ -67,6 +67,8 @@ struct time_window_compaction_strategy_state {
    std::unordered_set<api::timestamp_type> recent_active_windows;
 };

+using time_window_compaction_strategy_state_ptr = seastar::shared_ptr<time_window_compaction_strategy_state>;
+
 class time_window_compaction_strategy : public compaction_strategy_impl {
    time_window_compaction_strategy_options _options;
    size_tiered_compaction_strategy_options _stcs_options;
@@ -87,7 +89,7 @@ public:

    static void validate_options(const std::map<sstring, sstring>& options, std::map<sstring, sstring>& unchecked_options);
 private:
-    time_window_compaction_strategy_state& get_state(compaction_group_view& table_s) const;
+    time_window_compaction_strategy_state_ptr get_state(compaction_group_view& table_s) const;

    static api::timestamp_type
    to_timestamp_type(time_window_compaction_strategy_options::timestamp_resolutions resolution, int64_t timestamp_from_sstable) {
@@ -110,9 +112,11 @@ private:
    compaction_mode(const time_window_compaction_strategy_state&, const bucket_t& bucket, api::timestamp_type bucket_key, api::timestamp_type now, size_t min_threshold) const;

    std::vector<sstables::shared_sstable>
-    get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control, std::vector<sstables::shared_sstable> non_expiring_sstables, gc_clock::time_point compaction_time);
+    get_next_non_expired_sstables(compaction_group_view& table_s, strategy_control& control, std::vector<sstables::shared_sstable> non_expiring_sstables,
+        gc_clock::time_point compaction_time, time_window_compaction_strategy_state& state);

-    std::vector<sstables::shared_sstable> get_compaction_candidates(compaction_group_view& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidate_sstables);
+    std::vector<sstables::shared_sstable> get_compaction_candidates(compaction_group_view& table_s, strategy_control& control,
+        std::vector<sstables::shared_sstable> candidate_sstables, time_window_compaction_strategy_state& state);
 public:
    // Find the lowest timestamp for window of given size
    static api::timestamp_type
@@ -126,7 +130,7 @@ public:

    std::vector<sstables::shared_sstable>
    newest_bucket(compaction_group_view& table_s, strategy_control& control, std::map<api::timestamp_type, std::vector<sstables::shared_sstable>> buckets,
-        int min_threshold, int max_threshold, api::timestamp_type now);
+        int min_threshold, int max_threshold, api::timestamp_type now, time_window_compaction_strategy_state& state);

    static std::vector<sstables::shared_sstable>
    trim_to_threshold(std::vector<sstables::shared_sstable> bucket, int max_threshold);
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -855,7 +855,7 @@ maintenance_socket: ignore
 # enable_create_table_with_compact_storage: false

 # Control tablets for new keyspaces.
-# Can be set to: disabled|enabled
+# Can be set to: disabled|enabled|enforced
 #
 # When enabled, newly created keyspaces will have tablets enabled by default.
 # That can be explicitly disabled in the CREATE KEYSPACE query
--- a/configure.py
+++ b/configure.py
@@ -526,6 +526,7 @@ scylla_tests = set([
    'test/boost/mutation_test',
    'test/boost/mvcc_test',
    'test/boost/nonwrapping_interval_test',
+    'test/boost/object_storage_upload_test',
    'test/boost/observable_test',
    'test/boost/partitioner_test',
    'test/boost/pretty_printers_test',
@@ -619,6 +620,7 @@ perf_tests = set([
    'test/perf/perf_idl',
    'test/perf/perf_vint',
    'test/perf/perf_big_decimal',
+    'test/perf/perf_bti_key_translation',
    'test/perf/perf_sort_by_proximity',
 ])

@@ -640,7 +642,8 @@ raft_tests = set([

 vector_search_tests = set([
    'test/vector_search/vector_store_client_test',
-    'test/vector_search/load_balancer_test'
+    'test/vector_search/load_balancer_test',
+    'test/vector_search/client_test'
 ])

 wasms = set([
@@ -789,6 +792,9 @@ scylla_raft_core = [
 ]

 scylla_core = (['message/messaging_service.cc',
+                'message/advanced_rpc_compressor.cc',
+                'message/stream_compressor.cc',
+                'message/dict_trainer.cc',
                'replica/database.cc',
                'replica/schema_describe_helper.cc',
                'replica/table.cc',
@@ -799,6 +805,7 @@ scylla_core = (['message/messaging_service.cc',
                'replica/dirty_memory_manager.cc',
                'replica/multishard_query.cc',
                'replica/mutation_dump.cc',
+                'replica/querier.cc',
                'mutation/atomic_cell.cc',
                'mutation/canonical_mutation.cc',
                'mutation/frozen_mutation.cc',
@@ -833,7 +840,6 @@ scylla_core = (['message/messaging_service.cc',
                'utils/buffer_input_stream.cc',
                'utils/limiting_data_source.cc',
                'utils/updateable_value.cc',
-                'utils/dict_trainer.cc',
                'message/dictionary_service.cc',
                'utils/directories.cc',
                'gms/generation-number.cc',
@@ -843,7 +849,6 @@ scylla_core = (['message/messaging_service.cc',
                'utils/io-wrappers.cc',
                'utils/on_internal_error.cc',
                'utils/pretty_printers.cc',
-                'utils/stream_compressor.cc',
                'utils/labels.cc',
                'mutation/converting_mutation_partition_applier.cc',
                'readers/combined.cc',
@@ -877,6 +882,7 @@ scylla_core = (['message/messaging_service.cc',
                'compaction/incremental_compaction_strategy.cc',
                'compaction/incremental_backlog_tracker.cc',
                'sstables/integrity_checked_file_impl.cc',
+                'sstables/object_storage_client.cc',
                'sstables/prepended_input_stream.cc',
                'sstables/m_format_read_helpers.cc',
                'sstables/sstable_directory.cc',
@@ -901,7 +907,6 @@ scylla_core = (['message/messaging_service.cc',
                'cdc/split.cc',
                'cdc/generation.cc',
                'cdc/metadata.cc',
-                'cql3/type_json.cc',
                'cql3/attributes.cc',
                'cql3/cf_name.cc',
                'cql3/cql3_type.cc',
@@ -988,6 +993,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/uuid.cc',
                'utils/big_decimal.cc',
                'types/comparable_bytes.cc',
+                'types/json_utils.cc',
                'types/types.cc',
                'validation.cc',
                'service/migration_manager.cc',
@@ -1056,6 +1062,7 @@ scylla_core = (['message/messaging_service.cc',
                'db/virtual_table.cc',
                'db/virtual_tables.cc',
                'db/tablet_options.cc',
+                'db/object_storage_endpoint_param.cc',
                'index/secondary_index_manager.cc',
                'index/secondary_index.cc',
                'index/vector_index.cc',
@@ -1076,16 +1083,13 @@ scylla_core = (['message/messaging_service.cc',
                'utils/rest/client.cc',
                'utils/s3/aws_error.cc',
                'utils/s3/client.cc',
-                'utils/s3/retryable_http_client.cc',
-                'utils/s3/retry_strategy.cc',
-                'utils/s3/s3_retry_strategy.cc',
+                'utils/s3/default_aws_retry_strategy.cc',
                'utils/s3/credentials_providers/aws_credentials_provider.cc',
                'utils/s3/credentials_providers/environment_aws_credentials_provider.cc',
                'utils/s3/credentials_providers/instance_profile_credentials_provider.cc',
                'utils/s3/credentials_providers/sts_assume_role_credentials_provider.cc',
                'utils/s3/credentials_providers/aws_credentials_provider_chain.cc',
                'utils/s3/utils/manip_s3.cc',
-                'utils/advanced_rpc_compressor.cc',
                'utils/azure/identity/credentials.cc',
                'utils/azure/identity/service_principal_credentials.cc',
                'utils/azure/identity/managed_identity_credentials.cc',
@@ -1192,6 +1196,7 @@ scylla_core = (['message/messaging_service.cc',
                'table_helper.cc',
                'audit/audit.cc',
                'audit/audit_cf_storage_helper.cc',
+                'audit/audit_composite_storage_helper.cc',
                'audit/audit_syslog_storage_helper.cc',
                'tombstone_gc_options.cc',
                'tombstone_gc.cc',
@@ -1200,7 +1205,6 @@ scylla_core = (['message/messaging_service.cc',
                'utils/aws_sigv4.cc',
                'types/duration.cc',
                'vint-serialization.cc',
-                'querier.cc',
                'mutation_writer/multishard_writer.cc',
                'ent/encryption/encryption_config.cc',
                'ent/encryption/encryption.cc',
@@ -1263,6 +1267,8 @@ scylla_core = (['message/messaging_service.cc',
                'utils/disk_space_monitor.cc',
                'vector_search/vector_store_client.cc',
                'vector_search/dns.cc',
+                'vector_search/client.cc',
+                'vector_search/clients.cc'
                ] + [Antlr3Grammar('cql3/Cql.g')] \
                  + scylla_raft_core
               )
@@ -1405,6 +1411,9 @@ scylla_tests_dependencies = scylla_core + alternator + idls + scylla_tests_gener
    'test/lib/random_schema.cc',
    'test/lib/key_utils.cc',
    'test/lib/proc_utils.cc',
+    'test/lib/gcs_fixture.cc',
+    'test/lib/aws_kms_fixture.cc',
+    'test/lib/azure_kms_fixture.cc',
 ]

 scylla_raft_dependencies = scylla_raft_core + ['utils/uuid.cc', 'utils/error_injection.cc', 'utils/exceptions.cc']
@@ -1657,6 +1666,7 @@ deps['test/raft/discovery_test'] =  ['test/raft/discovery_test.cc',

 deps['test/vector_search/vector_store_client_test'] =  ['test/vector_search/vector_store_client_test.cc'] + scylla_tests_dependencies
 deps['test/vector_search/load_balancer_test'] = ['test/vector_search/load_balancer_test.cc'] + scylla_tests_dependencies
+deps['test/vector_search/client_test'] = ['test/vector_search/client_test.cc'] + scylla_tests_dependencies

 wasm_deps = {}

@@ -1811,6 +1821,9 @@ user_cflags = args.user_cflags + f" -ffile-prefix-map={curdir}=."
 # Since gcc 13, libgcc doesn't need the exception workaround
 user_cflags += ' -DSEASTAR_NO_EXCEPTION_HACK'

+# https://github.com/llvm/llvm-project/issues/163007
+user_cflags += ' -fextend-variable-liveness=none'
+
 if args.target != '':
    user_cflags += ' -march=' + args.target

@@ -1998,11 +2011,11 @@ def configure_seastar(build_dir, mode, mode_config):
        '-DCMAKE_CXX_EXTENSIONS=ON',
        '-DSeastar_CXX_FLAGS=SHELL:{}'.format(mode_config['lib_cflags'] + extra_file_prefix_map),
        '-DSeastar_LD_FLAGS={}'.format(semicolon_separated(mode_config['lib_ldflags'], seastar_cxx_ld_flags)),
-        '-DSeastar_API_LEVEL=8',
+        '-DSeastar_API_LEVEL=9',
        '-DSeastar_DEPRECATED_OSTREAM_FORMATTERS=OFF',
        '-DSeastar_UNUSED_RESULT_ERROR=ON',
        '-DCMAKE_EXPORT_COMPILE_COMMANDS=ON',
-        '-DSeastar_SCHEDULING_GROUPS_COUNT=20',
+        '-DSeastar_SCHEDULING_GROUPS_COUNT=21',
        '-DSeastar_IO_URING=ON',
    ]

--- a/cql3/CMakeLists.txt
+++ b/cql3/CMakeLists.txt
@@ -28,7 +28,6 @@ set_property(
 add_library(cql3 STATIC)
 target_sources(cql3
  PRIVATE
-    type_json.cc
    attributes.cc
    cf_name.cc
    cql3_type.cc
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -219,44 +219,17 @@ using uexpression = uninitialized<expression>;
        return token->getText();
    }

+    error_sink_fn get_error_sink() {
+        return [this] (const std::string& msg) { add_recognition_error(msg); };
+    }
+
    std::map<sstring, sstring> convert_property_map(const collection_constructor& map) {
-        if (map.elements.empty()) {
-            return std::map<sstring, sstring>{};
-        }
-        std::map<sstring, sstring> res;
-        for (auto&& entry : map.elements) {
-            auto entry_tuple = expr::as_if<tuple_constructor>(&entry);
-            // Because the parser tries to be smart and recover on error (to
-            // allow displaying more than one error I suppose), we have default-constructed
-            // entries in map.elements. Just skip those, a proper error will be thrown in the end.
-            if (!entry_tuple || entry_tuple->elements.size() != 2) {
-                break;
-            }
-            auto left = expr::as_if<untyped_constant>(&entry_tuple->elements[0]);
-            if (!left) {
-                sstring msg = fmt::format("Invalid property name: {}", entry_tuple->elements[0]);
-                if (expr::is<bind_variable>(entry_tuple->elements[0])) {
-                    msg += " (bind variables are not supported in DDL queries)";
-                }
-                add_recognition_error(msg);
-                break;
-            }
-            auto right = expr::as_if<untyped_constant>(&entry_tuple->elements[1]);
-            if (!right) {
-                sstring msg = fmt::format("Invalid property value: {} for property: {}", entry_tuple->elements[0], entry_tuple->elements[1]);
-                if (expr::is<bind_variable>(entry_tuple->elements[1])) {
-                    msg += " (bind variables are not supported in DDL queries)";
-                }
-                add_recognition_error(msg);
-                break;
-            }
-            if (!res.emplace(left->raw_text, right->raw_text).second) {
-                sstring msg = fmt::format("Multiple definition for property {}", left->raw_text);
-                add_recognition_error(msg);
-                break;
-            }
-        }
-        return res;
+        return cql3::expr::convert_property_map(map, get_error_sink());
+    }
+
+    property_definitions::extended_map_type
+    convert_extended_property_map(const collection_constructor& map) {
+        return cql3::expr::convert_extended_property_map(map, get_error_sink());
    }

    sstring to_lower(std::string_view s) {
@@ -1224,7 +1197,7 @@ listPermissionsStatement returns [std::unique_ptr<list_permissions_statement> st
    ;

 permission returns [auth::permission perm = auth::permission{}]
-    : p=(K_CREATE | K_ALTER | K_DROP | K_SELECT | K_MODIFY | K_AUTHORIZE | K_DESCRIBE | K_EXECUTE)
+    : p=(K_CREATE | K_ALTER | K_DROP | K_SELECT | K_MODIFY | K_AUTHORIZE | K_DESCRIBE | K_EXECUTE | K_VECTOR_SEARCH_INDEXING)
    { $perm = auth::permissions::from_string($p.text); }
    ;

@@ -1834,7 +1807,7 @@ properties[cql3::statements::property_definitions& props]

 property[cql3::statements::property_definitions& props]
    : k=ident '=' simple=propertyValue { try { $props.add_property(k->to_string(), simple); } catch (exceptions::syntax_exception e) { add_recognition_error(e.what()); } }
-    | k=ident '=' map=mapLiteral { try { $props.add_property(k->to_string(), convert_property_map(map)); } catch (exceptions::syntax_exception e) { add_recognition_error(e.what()); } }
+    | k=ident '=' map=mapLiteral { try { $props.add_property(k->to_string(), convert_extended_property_map(map)); } catch (exceptions::syntax_exception e) { add_recognition_error(e.what()); } }
    ;

 propertyValue returns [sstring str]
@@ -2398,6 +2371,8 @@ K_EXECUTE:     E X E C U T E;

 K_MUTATION_FRAGMENTS:    M U T A T I O N '_' F R A G M E N T S;

+K_VECTOR_SEARCH_INDEXING: V E C T O R '_' S E A R C H '_' I N D E X I N G;
+
 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
 fragment B: ('b'|'B');
--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -1349,7 +1349,7 @@ static managed_bytes reserialize_value(View value_bytes,
    if (type.is_map()) {
        std::vector<std::pair<managed_bytes, managed_bytes>> elements = partially_deserialize_map(value_bytes);

-        const map_type_impl mapt = dynamic_cast<const map_type_impl&>(type);
+        const map_type_impl& mapt = dynamic_cast<const map_type_impl&>(type);
        const abstract_type& key_type = mapt.get_keys_type()->without_reversed();
        const abstract_type& value_type = mapt.get_values_type()->without_reversed();

@@ -1391,7 +1391,7 @@ static managed_bytes reserialize_value(View value_bytes,
        const vector_type_impl& vtype = dynamic_cast<const vector_type_impl&>(type);
        std::vector<managed_bytes> elements = vtype.split_fragmented(value_bytes);

-        auto elements_type = vtype.get_elements_type()->without_reversed();
+        const auto& elements_type = vtype.get_elements_type()->without_reversed();

        if (elements_type.bound_value_needs_to_be_reserialized()) {
            for (size_t i = 0; i < elements.size(); i++) {
@@ -2397,6 +2397,107 @@ split_aggregation(std::span<const expression> aggregation) {
    };
 }

+// Flattens a parsed map literal into a map from property name to raw value text.
+// Errors are reported through add_recognition_error, and conversion stops at the
+// first offending entry, returning what was collected before it.
+std::map<sstring, sstring> convert_property_map(const collection_constructor& map, error_sink_fn add_recognition_error) {
+    std::map<sstring, sstring> res;
+    for (auto&& entry : map.elements) {
+        auto entry_tuple = expr::as_if<tuple_constructor>(&entry);
+        // The parser recovers from errors (presumably to report more than one), which can
+        // leave default-constructed entries here. Stop at those; a proper error is thrown later.
+        if (!entry_tuple || entry_tuple->elements.size() != 2) {
+            break;
+        }
+        auto left = expr::as_if<untyped_constant>(&entry_tuple->elements[0]);
+        if (!left) {
+            sstring msg = fmt::format("Invalid property name: {}", entry_tuple->elements[0]);
+            if (expr::is<bind_variable>(entry_tuple->elements[0])) {
+                msg += " (bind variables are not supported in DDL queries)";
+            }
+            add_recognition_error(msg);
+            break;
+        }
+        auto right = expr::as_if<untyped_constant>(&entry_tuple->elements[1]);
+        if (!right) {
+            // elements[1] is the offending value, elements[0] the property it belongs to.
+            sstring msg = fmt::format("Invalid property value: {} for property: {}", entry_tuple->elements[1], entry_tuple->elements[0]);
+            if (expr::is<bind_variable>(entry_tuple->elements[1])) {
+                msg += " (bind variables are not supported in DDL queries)";
+            }
+            add_recognition_error(msg);
+            break;
+        }
+        if (!res.emplace(left->raw_text, right->raw_text).second) {
+            sstring msg = fmt::format("Multiple definition for property {}", left->raw_text);
+            add_recognition_error(msg);
+            break;
+        }
+    }
+    return res;
+}
+
+// Variant of convert_property_map() in which a property value may be either a
+// single constant or a collection literal of constants; the latter is stored as
+// a vector holding the raw text of each element.
+// Errors are reported through add_recognition_error. A bad entry ends the scan,
+// except that a non-constant collection element is replaced by "<invalid>".
+std::map<sstring, std::variant<sstring, std::vector<sstring>>>
+convert_extended_property_map(const collection_constructor& map, error_sink_fn add_recognition_error) {
+    // Reports msg, first appending a hint when culprit is a bind variable.
+    auto report = [&] (sstring msg, const auto& culprit) {
+        if (expr::is<bind_variable>(culprit)) {
+            msg += " (bind variables are not supported in DDL queries)";
+        }
+        add_recognition_error(msg);
+    };
+
+    std::map<sstring, std::variant<sstring, std::vector<sstring>>> res;
+    for (const auto& entry : map.elements) {
+        const auto* kv = expr::as_if<tuple_constructor>(&entry);
+        // Error recovery in the parser can leave default-constructed entries in
+        // map.elements; stop at the first one, a proper error is thrown in the end.
+        if (!kv || kv->elements.size() != 2) {
+            break;
+        }
+        const auto& name_expr = kv->elements[0];
+        const auto& value_expr = kv->elements[1];
+        const auto* name = expr::as_if<untyped_constant>(&name_expr);
+        if (!name) {
+            report(fmt::format("Invalid property name: {}", name_expr), name_expr);
+            break;
+        }
+
+        std::variant<sstring, std::vector<sstring>> value;
+        if (const auto* text = expr::as_if<untyped_constant>(&value_expr)) {
+            value = text->raw_text;
+        } else if (const auto* list = expr::as_if<collection_constructor>(&value_expr)) {
+            std::vector<sstring> items;
+            items.reserve(list->elements.size());
+            for (const auto& item : list->elements) {
+                if (const auto* elem = expr::as_if<untyped_constant>(&item)) {
+                    items.push_back(elem->raw_text);
+                    continue;
+                }
+                // Record the error but keep going, substituting a placeholder for the element.
+                report(fmt::format("Invalid property vector value: {} for property: {}", item, name_expr), item);
+                items.emplace_back("<invalid>");
+            }
+            value = std::move(items);
+        } else {
+            report(fmt::format("Invalid property value: {} for property: {}", value_expr, name_expr), value_expr);
+            break;
+        }
+
+        // Element errors, if any, have already been reported before this duplicate check.
+        if (!res.emplace(name->raw_text, std::move(value)).second) {
+            sstring msg = fmt::format("Multiple definition for property {}", name->raw_text);
+            add_recognition_error(msg);
+            break;
+        }
+    }
+    return res;
+}

 } // namespace expr
 } // namespace cql3
--- a/cql3/expr/expression.hh
+++ b/cql3/expr/expression.hh
@@ -430,6 +430,14 @@ struct collection_constructor {
    friend bool operator==(const collection_constructor&, const collection_constructor&) = default;
 };

+// Callback invoked with the text of each error found while converting a property map.
+using error_sink_fn = std::function<void(const std::string&)>;
+
+std::map<sstring, sstring> convert_property_map(const collection_constructor&, error_sink_fn);
+
+std::map<sstring, std::variant<sstring, std::vector<sstring>>>
+convert_extended_property_map(const collection_constructor&, error_sink_fn);
+
 // Constructs an object of a user-defined type
 // For example: "{field1: 23343, field2: ?}"
 // During preparation usertype constructors with constant values are converted to expr::constant.
--- a/cql3/functions/as_json_function.hh
+++ b/cql3/functions/as_json_function.hh
@@ -13,10 +13,10 @@
 #include "cql3/functions/scalar_function.hh"
 #include "cql3/functions/function_name.hh"
 #include "cql3/cql3_type.hh"
-#include "cql3/type_json.hh"

 #include "bytes_ostream.hh"
 #include "types/types.hh"
+#include "types/json_utils.hh"

 namespace cql3 {

--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -10,7 +10,6 @@
 #include "functions.hh"
 #include "token_fct.hh"
 #include "cql3/ut_name.hh"
-#include "cql3/type_json.hh"
 #include "cql3/functions/aggregate_fcts.hh"
 #include "cql3/functions/bytes_conversion_fcts.hh"
 #include "cql3/functions/time_uuid_fcts.hh"
@@ -22,6 +21,7 @@
 #include "cql3/prepare_context.hh"
 #include "user_aggregate.hh"
 #include "cql3/expr/expression.hh"
+#include "types/json_utils.hh"
 #include "types/set.hh"
 #include "types/listlike_partial_deserializing_iterator.hh"

--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -14,7 +14,9 @@
 #include <stdexcept>
 #include <vector>
 #include "alter_keyspace_statement.hh"
+#include "cql3/statements/property_definitions.hh"
 #include "locator/tablets.hh"
+#include "locator/abstract_replication_strategy.hh"
 #include "mutation/canonical_mutation.hh"
 #include "prepared_statement.hh"
 #include "service/migration_manager.hh"
@@ -49,16 +51,8 @@ future<> cql3::statements::alter_keyspace_statement::check_access(query_processo
    return state.has_keyspace_access(_name, auth::permission::ALTER);
 }

-static unsigned get_abs_rf_diff(const std::string& curr_rf, const std::string& new_rf) {
-    try {
-        return std::abs(std::stoi(curr_rf) - std::stoi(new_rf));
-    } catch (std::invalid_argument const& ex) {
-        on_internal_error(mylogger, fmt::format("get_abs_rf_diff expects integer arguments, "
-                                                "but got curr_rf:{} and new_rf:{}", curr_rf, new_rf));
-    } catch (std::out_of_range const& ex) {
-        on_internal_error(mylogger, fmt::format("get_abs_rf_diff expects integer arguments to fit into `int` type, "
-                                                "but got curr_rf:{} and new_rf:{}", curr_rf, new_rf));
-    }
+static unsigned get_abs_rf_diff(const locator::replication_strategy_config_option& curr_rf, const locator::replication_strategy_config_option& new_rf) {
+    return std::abs(ssize_t(locator::get_replication_factor(curr_rf)) - ssize_t(locator::get_replication_factor(new_rf)));
 }

 void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, const service::client_state& state) const {
@@ -85,19 +79,22 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
                        current_options.type_string(), new_options.type_string()));
            }

-            auto new_ks = _attrs->as_ks_metadata_update(ks.metadata(), *qp.proxy().get_token_metadata_ptr(), qp.proxy().features());
+            auto new_ks = _attrs->as_ks_metadata_update(ks.metadata(), *qp.proxy().get_token_metadata_ptr(), qp.proxy().features(), qp.db().get_config());
+
+            auto tmptr = qp.proxy().get_token_metadata_ptr();
+            const auto& topo = tmptr->get_topology();

            if (ks.get_replication_strategy().uses_tablets()) {
-                const std::map<sstring, sstring>& current_rf_per_dc = ks.metadata()->strategy_options();
+                auto& current_rf_per_dc = ks.metadata()->strategy_options();
                auto new_rf_per_dc = _attrs->get_replication_options();
                new_rf_per_dc.erase(ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY);
                unsigned total_abs_rfs_diff = 0;
                for (const auto& [new_dc, new_rf] : new_rf_per_dc) {
-                    sstring old_rf = "0";
+                    auto old_rf = locator::replication_strategy_config_option(sstring("0"));
                    if (auto new_dc_in_current_mapping = current_rf_per_dc.find(new_dc);
                             new_dc_in_current_mapping != current_rf_per_dc.end()) {
                        old_rf = new_dc_in_current_mapping->second;
-                    } else if (!qp.proxy().get_token_metadata_ptr()->get_topology().get_datacenters().contains(new_dc)) {
+                    } else if (!topo.get_datacenters().contains(new_dc)) {
                        // This means that the DC listed in ALTER doesn't exist. This error will be reported later,
                        // during validation in abstract_replication_strategy::validate_replication_strategy.
                        // We can't report this error now, because it'd change the order of errors reported:
@@ -110,11 +107,14 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
                }
            }

-            locator::replication_strategy_params params(new_ks->strategy_options(), new_ks->initial_tablets());
-            auto new_rs = locator::abstract_replication_strategy::create_replication_strategy(new_ks->strategy_name(), params);
+            locator::replication_strategy_params params(new_ks->strategy_options(), new_ks->initial_tablets(), new_ks->consistency_option());
+            auto new_rs = locator::abstract_replication_strategy::create_replication_strategy(new_ks->strategy_name(), params, topo);
            if (new_rs->is_per_table() != ks.get_replication_strategy().is_per_table()) {
                throw exceptions::invalid_request_exception(format("Cannot alter replication strategy vnode/tablets flavor"));
            }
+            if (new_ks->consistency_option() && new_ks->consistency_option() != ks.metadata()->consistency_option()) {
+                throw exceptions::invalid_request_exception(format("Cannot alter consistency option"));
+            }
        } catch (const std::runtime_error& e) {
            throw exceptions::invalid_request_exception(e.what());
        }
@@ -135,62 +135,6 @@ bool cql3::statements::alter_keyspace_statement::changes_tablets(query_processor
    return ks.get_replication_strategy().uses_tablets() && !_attrs->get_replication_options().empty();
 }

-namespace {
-// These functions are used to flatten all the options in the keyspace definition into a single-level map<string, string>.
-// (Currently options are stored in a nested structure that looks more like a map<string, map<string, string>>).
-// Flattening is simply joining the keys of maps from both levels with a colon ':' character,
-// or in other words: prefixing the keys in the output map with the option type, e.g. 'replication', 'storage', etc.,
-// so that the output map contains entries like: "replication:dc1" -> "3".
-// This is done to avoid key conflicts and to be able to de-flatten the map back into the original structure.
-
-void add_prefixed_key(const sstring& prefix, const std::map<sstring, sstring>& in, std::map<sstring, sstring>& out) {
-    for (const auto& [in_key, in_value]: in) {
-        out[prefix + ":" + in_key] = in_value;
-    }
-};
-
-std::map<sstring, sstring> get_current_options_flattened(const shared_ptr<cql3::statements::ks_prop_defs>& ks,
-                                                         const gms::feature_service& feat) {
-    std::map<sstring, sstring> all_options;
-
-    add_prefixed_key(ks->KW_REPLICATION, ks->get_replication_options(), all_options);
-    add_prefixed_key(ks->KW_STORAGE, ks->get_storage_options().to_map(), all_options);
-    // if no tablet options are specified in ATLER KS statement,
-    // we want to preserve the old ones and hence cannot overwrite them with defaults
-    if (ks->has_property(ks->KW_TABLETS)) {
-        auto initial_tablets = ks->get_initial_tablets(std::nullopt);
-        add_prefixed_key(ks->KW_TABLETS,
-                         {{"enabled", initial_tablets ? "true" : "false"},
-                         {"initial", std::to_string(initial_tablets.value_or(0))}},
-                         all_options);
-    }
-    add_prefixed_key(ks->KW_DURABLE_WRITES,
-                     {{sstring(ks->KW_DURABLE_WRITES), to_sstring(ks->get_boolean(ks->KW_DURABLE_WRITES, true))}},
-                     all_options);
-
-    return all_options;
-}
-
-std::map<sstring, sstring> get_old_options_flattened(const data_dictionary::keyspace& ks) {
-    std::map<sstring, sstring> all_options;
-
-    using namespace cql3::statements;
-    add_prefixed_key(ks_prop_defs::KW_REPLICATION, ks.get_replication_strategy().get_config_options(), all_options);
-    add_prefixed_key(ks_prop_defs::KW_STORAGE, ks.metadata()->get_storage_options().to_map(), all_options);
-    if (ks.metadata()->initial_tablets()) {
-        add_prefixed_key(ks_prop_defs::KW_TABLETS,
-                         {{"enabled", ks.metadata()->initial_tablets() ? "true" : "false"},
-                          {"initial", std::to_string(ks.metadata()->initial_tablets().value_or(0))}},
-                         all_options);
-    }
-    add_prefixed_key(ks_prop_defs::KW_DURABLE_WRITES,
-                     {{sstring(ks_prop_defs::KW_DURABLE_WRITES), to_sstring(ks.metadata()->durable_writes())}},
-                     all_options);
-
-    return all_options;
-}
-} // <anonymous> namespace
-
 future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, cql3::cql_warnings_vec>>
 cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_processor& qp, service::query_state& state, const query_options& options, service::group0_batch& mc) const {
    using namespace cql_transport;
@@ -199,36 +143,15 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
        auto ks = qp.db().find_keyspace(_name);
        auto ks_md = ks.metadata();
        const auto tmptr = qp.proxy().get_token_metadata_ptr();
+        const auto& topo = tmptr->get_topology();
        const auto& feat = qp.proxy().features();
-        auto ks_md_update = _attrs->as_ks_metadata_update(ks_md, *tmptr, feat);
+        auto ks_md_update = _attrs->as_ks_metadata_update(ks_md, *tmptr, feat, qp.db().get_config());
        utils::chunked_vector<mutation> muts;
        std::vector<sstring> warnings;
-        auto old_ks_options = get_old_options_flattened(ks);
-        auto ks_options = get_current_options_flattened(_attrs, feat);
-        ks_options.merge(old_ks_options);

        auto ts = mc.write_timestamp();
        auto global_request_id = mc.new_group0_state_id();

-        // #22688 - filter out any dc*:0 entries - consider these
-        // null and void (removed). Migration planning will treat it
-        // as dc*=0 still.
-        std::erase_if(ks_options, [](const auto& i) {
-            static constexpr std::string replication_prefix = ks_prop_defs::KW_REPLICATION + ":"s;
-            // Flattened map, replication entries starts with "replication:".
-            // Only valid options are replication_factor, class and per-dc rf:s. We want to
-            // filter out any dcN=0 entries.
-            auto& [key, val] = i;
-            if (key.starts_with(replication_prefix) && val == "0") {
-                std::string_view v(key);
-                v.remove_prefix(replication_prefix.size());
-                return v != ks_prop_defs::REPLICATION_FACTOR_KEY 
-                    && v != ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY
-                    ;
-            }
-            return false;
-        });
-
        // we only want to run the tablets path if there are actually any tablets changes, not only schema changes
        // TODO: the current `if (changes_tablets(qp))` is insufficient: someone may set the same RFs as before,
        //       and we'll unnecessarily trigger the processing path for ALTER tablets KS,
@@ -238,10 +161,6 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
                return make_exception_future<std::tuple<::shared_ptr<::cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(
                        exceptions::invalid_request_exception("Another global topology request is ongoing, please retry."));
            }
-            if (_attrs->get_replication_options().contains(ks_prop_defs::REPLICATION_FACTOR_KEY)) {
-                return make_exception_future<std::tuple<::shared_ptr<::cql_transport::event::schema_change>, cql3::cql_warnings_vec>>(
-                       exceptions::invalid_request_exception("'replication_factor' tag is not allowed when executing ALTER KEYSPACE with tablets, please list the DCs explicitly"));
-            }
            qp.db().real_database().validate_keyspace_update(*ks_md_update);

            service::topology_mutation_builder builder(ts);
@@ -251,11 +170,11 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
            if (!qp.proxy().features().topology_global_request_queue) {
                builder.set_global_topology_request(service::global_topology_request::keyspace_rf_change);
                builder.set_global_topology_request_id(global_request_id);
-                builder.set_new_keyspace_rf_change_data(_name, ks_options);
+                builder.set_new_keyspace_rf_change_data(_name, _attrs->flattened());
            } else {
                builder.queue_global_topology_request_id(global_request_id);
                rtbuilder.set("request_type", service::global_topology_request::keyspace_rf_change)
-                         .set_new_keyspace_rf_change_data(_name, ks_options);
+                         .set_new_keyspace_rf_change_data(_name, _attrs->flattened());

            };
            service::topology_change change{{builder.build()}};
@@ -278,7 +197,8 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce

        auto rs = locator::abstract_replication_strategy::create_replication_strategy(
                ks_md_update->strategy_name(),
-                locator::replication_strategy_params(ks_md_update->strategy_options(), ks_md_update->initial_tablets()));
+                locator::replication_strategy_params(ks_md_update->strategy_options(), ks_md_update->initial_tablets(), ks_md_update->consistency_option()),
+                topo);

        // If `rf_rack_valid_keyspaces` is enabled, it's forbidden to perform a schema change that
        // would lead to an RF-rack-valid keyspace. Verify that this change does not.
--- a/cql3/statements/cas_request.cc
+++ b/cql3/statements/cas_request.cc
@@ -113,7 +113,7 @@ bool cas_request::applies_to() const {
 }

 std::optional<mutation> cas_request::apply(foreign_ptr<lw_shared_ptr<query::result>> qr,
-        const query::partition_slice& slice, api::timestamp_type ts) {
+        const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options&) {
    _rows = update_parameters::build_prefetch_data(_schema, *qr, slice);
    if (applies_to()) {
        return apply_updates(ts);
--- a/cql3/statements/cas_request.hh
+++ b/cql3/statements/cas_request.hh
@@ -8,6 +8,7 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */
 #pragma once
+#include "cdc/log.hh"
 #include "utils/assert.hh"
 #include "service/paxos/cas_request.hh"
 #include "cql3/statements/modification_statement.hh"
@@ -67,7 +68,7 @@ public:
        modification_statement::json_cache_opt json_cache_arg, const query_options& options_arg);

    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr,
-            const query::partition_slice& slice, api::timestamp_type ts) override;
+            const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options&) override;

    /// Build a result set with prefetched rows, but return only
    /// the columns required by CAS.
--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -9,6 +9,7 @@
 */

 #include "cql3/statements/cf_prop_defs.hh"
+#include "cql3/statements/property_definitions.hh"
 #include "cql3/statements/request_validations.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "db/extensions.hh"
@@ -23,6 +24,7 @@
 #include "db/per_partition_rate_limit_options.hh"
 #include "db/tablet_options.hh"
 #include "utils/bloom_calculations.hh"
+#include "utils/overloaded_functor.hh"
 #include "db/config.hh"

 #include <boost/algorithm/string/predicate.hpp>
@@ -62,12 +64,19 @@ schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions
    for (auto& p : exts.schema_extensions()) {
        auto i = _properties.find(p.first);
        if (i != _properties.end()) {
-            std::visit([&](auto& v) {
+            std::visit(overloaded_functor{
+            [&](const sstring& v) {
                auto ep = p.second(v);
                if (ep) {
                    er.emplace(p.first, std::move(ep));
                }
-            }, i->second);
+            },
+            [&](const property_definitions::extended_map_type& xmap) {
+                auto ep = p.second(property_definitions::to_simple_map(std::move(xmap)));
+                if (ep) {
+                    er.emplace(p.first, std::move(ep));
+                }
+            }}, i->second);
        }
    }
    return er;
@@ -136,9 +145,7 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
            throw exceptions::configuration_exception(sstring("Missing sub-option '") + compression_parameters::SSTABLE_COMPRESSION + "' for the '" + KW_COMPRESSION + "' option.");
        }
        compression_parameters cp(*compression_options);
-        cp.validate(
-            compression_parameters::dicts_feature_enabled(bool(db.features().sstable_compression_dicts)),
-            compression_parameters::dicts_usage_allowed(db.get_config().sstable_compression_dictionaries_allow_in_ddl()));
+        cp.validate(compression_parameters::dicts_feature_enabled(bool(db.features().sstable_compression_dicts)));
    }

    auto per_partition_rate_limit_options = get_per_partition_rate_limit_options(schema_extensions);
@@ -240,8 +247,8 @@ std::optional<caching_options> cf_prop_defs::get_caching_options() const {
        return {};
    }
    return std::visit(make_visitor(
-        [] (const property_definitions::map_type& map) {
-            return map.empty() ? std::nullopt : std::optional<caching_options>(caching_options::from_map(map));
+        [] (const property_definitions::extended_map_type& map) {
+            return map.empty() ? std::nullopt : std::optional<caching_options>(caching_options::from_map(to_simple_map(map)));
        },
        [] (const sstring& str) {
            return std::optional<caching_options>(caching_options::from_sstring(str));
--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -10,8 +10,12 @@

 #include <seastar/core/coroutine.hh>
 #include "create_index_statement.hh"
+#include "db/config.hh"
+#include "db/view/view.hh"
 #include "exceptions/exceptions.hh"
+#include "index/vector_index.hh"
 #include "prepared_statement.hh"
+#include "replica/database.hh"
 #include "types/types.hh"
 #include "validation.hh"
 #include "service/storage_proxy.hh"
@@ -27,6 +31,11 @@
 #include "cql3/statements/index_prop_defs.hh"
 #include "index/secondary_index_manager.hh"
 #include "mutation/mutation.hh"
+#include "db/schema_tables.hh"
+#include "index/secondary_index_manager.hh"
+#include "types/concrete_types.hh"
+#include "db/tags/extension.hh"
+#include "tombstone_gc_extension.hh"

 #include <stdexcept>

@@ -34,6 +43,177 @@ namespace cql3 {

 namespace statements {

+static const data_type collection_keys_type(const abstract_type& t) {
+    struct keys_visitor { // picks the key type for each supported collection kind
+        const data_type operator()(const map_type_impl& map) {
+            return map.get_keys_type();
+        }
+        const data_type operator()(const set_type_impl& set) {
+            return set.get_elements_type(); // a set's elements act as its keys
+        }
+        const data_type operator()(const list_type_impl&) {
+            return timeuuid_type; // lists always report timeuuid as their key type
+        }
+        const data_type operator()(const abstract_type& other) {
+            throw std::logic_error(format("collection_keys_type: only collections (maps, lists and sets) supported, but received {}", other.cql3_type_name()));
+        }
+    };
+    return visit(t, keys_visitor{});
+}
+
+static const data_type collection_values_type(const abstract_type& t) {
+    struct values_visitor { // picks the value type; anything but a map or a list is rejected
+        const data_type operator()(const list_type_impl& list) {
+            return list.get_elements_type();
+        }
+        const data_type operator()(const map_type_impl& map) {
+            return map.get_values_type();
+        }
+        const data_type operator()(const abstract_type& other) {
+            throw std::logic_error(format("collection_values_type: only maps and lists supported, but received {}", other.cql3_type_name()));
+        }
+    };
+    return visit(t, values_visitor{});
+}
+
+static const data_type collection_entries_type(const abstract_type& t) {
+    struct entries_visitor { // a map entry is typed as the tuple (key type, value type)
+        const data_type operator()(const map_type_impl& map) {
+            return tuple_type_impl::get_instance({map.get_keys_type(), map.get_values_type()});
+        }
+        const data_type operator()(const abstract_type& other) {
+            throw std::logic_error(format("collection_entries_type: only maps supported, but received {}", other.cql3_type_name()));
+        }
+    };
+    return visit(t, entries_visitor{});
+}
+
+static bytes get_available_column_name(const schema& schema, const bytes& root) {
+    // Try `root` itself first, then `root_1`, `root_2`, ... until the schema has no column of that name.
+    bytes candidate = root;
+    for (int suffix = 1; schema.get_column_definition(candidate); ++suffix) {
+        candidate = root + to_bytes("_") + to_bytes(std::to_string(suffix));
+    }
+    return candidate;
+}
+
+static bytes get_available_token_column_name(const schema& schema) {
+    return get_available_column_name(schema, bytes("idx_token")); // unused column name rooted at "idx_token"
+}
+
+static bytes get_available_computed_collection_column_name(const schema& schema) {
+    return get_available_column_name(schema, bytes("coll_value")); // unused column name rooted at "coll_value"
+}
+
+static data_type type_for_computed_column(cql3::statements::index_target::target_type target, const abstract_type& collection_type) {
+    using kind = cql3::statements::index_target::target_type; // only the three collection targets are accepted
+    switch (target) {
+        case kind::keys_and_values:    return collection_entries_type(collection_type);
+        case kind::collection_values:  return collection_values_type(collection_type);
+        case kind::keys:               return collection_keys_type(collection_type);
+        default: throw std::logic_error("reached regular values or full when only collection index target types were expected");
+    }
+}
+
+view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im,
+        const data_dictionary::database& db) const
+{ // Builds the schema of the view backing index `im` on base table `schema`; `db` is only used to pick the tombstone-GC mode.
+    sstring index_target_name = im.options().at(cql3::statements::index_target::target_option_name); // NOTE(review): never read below; its only effect here is that .at() throws when the option is missing - confirm it is still wanted
+    schema_builder builder{schema->ks_name(), secondary_index::index_table_name(im.name())}; // the view is created in the base table's keyspace
+    auto target_info = secondary_index::target_parser::parse(schema, im);
+    const auto* index_target = im.local() ? target_info.ck_columns.front() : target_info.pk_columns.front(); // local index: first parsed ck column; otherwise: first parsed pk column
+    auto target_type = target_info.type;
+
+    // For local indexing, start with base partition key
+    if (im.local()) {
+        if (index_target->is_partition_key()) {
+            throw exceptions::invalid_request_exception("Local indexing based on partition key column is not allowed,"
+                    " since whole base partition key must be used in queries anyway. Use global indexing instead.");
+        }
+        for (auto& col : schema->partition_key_columns()) {
+            builder.with_column(col.name(), col.type, column_kind::partition_key);
+        }
+        builder.with_column(index_target->name(), index_target->type, column_kind::clustering_key); // the indexed column becomes the first clustering column
+    } else {
+        if (target_type == cql3::statements::index_target::target_type::regular_values) {
+            builder.with_column(index_target->name(), index_target->type, column_kind::partition_key); // plain column index: the indexed column itself is the view's partition key
+        } else {
+            bytes key_column_name = get_available_computed_collection_column_name(*schema); // collection index: the partition key is a computed column
+            column_computation_ptr collection_column_computation_ptr = [&name = index_target->name(), target_type] { // immediately-invoked: selects the computation matching the target type
+                switch (target_type) {
+                    case cql3::statements::index_target::target_type::keys:
+                        return collection_column_computation::for_keys(name);
+                    case cql3::statements::index_target::target_type::collection_values:
+                        return collection_column_computation::for_values(name);
+                    case cql3::statements::index_target::target_type::keys_and_values:
+                        return collection_column_computation::for_entries(name);
+                    default:
+                        throw std::logic_error(format("create_view_for_index: invalid target_type, received {}", to_sstring(target_type)));
+                }
+            }().clone();
+
+            data_type t = type_for_computed_column(target_type, *index_target->type);
+            builder.with_computed_column(key_column_name, t, column_kind::partition_key, std::move(collection_column_computation_ptr));
+        }
+        // Additional token column is added to ensure token order on secondary index queries
+        bytes token_column_name = get_available_token_column_name(*schema);
+        builder.with_computed_column(token_column_name, long_type, column_kind::clustering_key, std::make_unique<token_column_computation>());
+
+        for (auto& col : schema->partition_key_columns()) { // base partition key columns follow as clustering columns, skipping the indexed one
+            if (col == *index_target) {
+                continue;
+            }
+            builder.with_column(col.name(), col.type, column_kind::clustering_key);
+        }
+    }
+
+    if (!index_target->is_static()) { // base clustering columns are appended only when the indexed column is not static
+        for (auto& col : schema->clustering_key_columns()) {
+            if (col == *index_target) {
+                continue;
+            }
+            builder.with_column(col.name(), col.type, column_kind::clustering_key);
+        }
+    }
+
+    // This column needs to be after the base clustering key.
+    if (!im.local()) {
+        // If two cells within the same collection share the same value but not liveness information, then
+        // for the index on the values, the rows generated would share the same primary key and thus the
+        // liveness information as well. Prevent that by distinguishing them in the clustering key.
+        if (target_type == cql3::statements::index_target::target_type::collection_values) {
+            data_type t = type_for_computed_column(cql3::statements::index_target::target_type::keys, *index_target->type);
+            bytes column_name = get_available_column_name(*schema, "keys_for_values_idx");
+            builder.with_computed_column(column_name, t, column_kind::clustering_key, collection_column_computation::for_keys(index_target->name()).clone());
+        }
+    }
+
+    if (index_target->is_primary_key()) { // indexed column belongs to the base primary key: add one virtual column per base regular column
+        for (auto& def : schema->regular_columns()) {
+            db::view::create_virtual_column(builder, def.name(), def.type);
+        }
+    }
+    // "WHERE col IS NOT NULL" is not needed (and doesn't work)
+    // when col is a collection.
+    const sstring where_clause =
+        (target_type == cql3::statements::index_target::target_type::regular_values) ?
+        format("{} IS NOT NULL", index_target->name_as_cql_string()) :
+        "";
+    builder.with_view_info(schema, false, where_clause);
+
+    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(db, schema->ks_name())); // mode is looked up from the base keyspace name
+    builder.add_extension(tombstone_gc_extension::NAME, std::move(tombstone_gc_ext));
+
+    // A local secondary index should be backed by a *synchronous* view,
+    // see #16371. A view is marked synchronous with a tag. Non-local indexes
+    // do not need the tags schema extension at all.
+    if (im.local()) {
+        std::map<sstring, sstring> tags_map = {{db::SYNCHRONOUS_VIEW_UPDATES_TAG_KEY, "true"}};
+        builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));
+    }
+    return view_ptr{builder.build()};
+}
+
 create_index_statement::create_index_statement(cf_name name,
                                               ::shared_ptr<index_name> index_name,
                                               std::vector<::shared_ptr<index_target::raw>> raw_targets,
@@ -92,9 +272,13 @@ std::vector<::shared_ptr<index_target>> create_index_statement::validate_while_e
        throw exceptions::invalid_request_exception(format("index names shouldn't be more than {:d} characters long (got \"{}\")", schema::NAME_LENGTH, _index_name.c_str()));
    }

-    if (!db.features().views_with_tablets && db.find_keyspace(keyspace()).get_replication_strategy().uses_tablets()) {
-        throw exceptions::invalid_request_exception(format("Secondary indexes are not supported on base tables with tablets (keyspace '{}')", keyspace()));
+    try {
+        db::view::validate_view_keyspace(db, keyspace());
+    } catch (const std::exception& e) {
+        // The type of the thrown exception is not specified, so we need to wrap it here.
+        throw exceptions::invalid_request_exception(e.what());
    }
+
    validate_for_local_index(*schema);

    std::vector<::shared_ptr<index_target>> targets;
@@ -375,6 +559,15 @@ std::optional<create_index_statement::base_schema_with_new_index> create_index_s
                    format("Index {} is a duplicate of existing index {}", index.name(), existing_index.value().name()));
        }
    }
+    bool existing_vector_index = _properties->custom_class && _properties->custom_class == "vector_index" && secondary_index::vector_index::has_vector_index_on_column(*schema, targets[0]->column_name());
+    bool custom_index_with_same_name = _properties->custom_class && db.existing_index_names(keyspace()).contains(_index_name);
+    if (existing_vector_index || custom_index_with_same_name) {
+        if (_if_not_exists) {
+            return {};
+        } else {
+            throw exceptions::invalid_request_exception("There exists a duplicate custom index");
+        }
+    }
    auto index_table_name = secondary_index::index_table_name(accepted_name);
    if (db.has_schema(keyspace(), index_table_name)) {
        // We print this error even if _if_not_exists - in this case the user
@@ -395,10 +588,25 @@ create_index_statement::prepare_schema_mutations(query_processor& qp, const quer
    auto res = build_index_schema(qp.db());

    ::shared_ptr<event::schema_change> ret;
-    utils::chunked_vector<mutation> m;
+    utils::chunked_vector<mutation> muts;

    if (res) {
-        m = co_await service::prepare_column_family_update_announcement(qp.proxy(), std::move(res->schema), {}, ts);
+        const replica::database& db = qp.proxy().local_db();
+        const auto& cf = db.find_column_family(keyspace(), column_family());
+
+        // Produce statements to update schema tables with index-specific information.
+        muts = co_await service::prepare_column_family_update_announcement(qp.proxy(), std::move(res->schema), {}, ts);
+
+        // Produce the underlying view for the index.
+        if (db::schema_tables::view_should_exist(res->index)) {
+            view_ptr view = create_view_for_index(cf.schema(), res->index, qp.db());
+            utils::chunked_vector<mutation> view_muts = co_await service::prepare_new_view_announcement(qp.proxy(), std::move(view), ts);
+
+            muts.reserve(muts.size() + view_muts.size());
+            for (mutation& view_mutation : view_muts) {
+                muts.push_back(std::move(view_mutation));
+            }
+        }

        ret = ::make_shared<event::schema_change>(
                event::schema_change::change_type::UPDATED,
@@ -407,7 +615,7 @@ create_index_statement::prepare_schema_mutations(query_processor& qp, const quer
                column_family());
    }

-    co_return std::make_tuple(std::move(ret), std::move(m), std::vector<sstring>());
+    co_return std::make_tuple(std::move(ret), std::move(muts), std::vector<sstring>());
 }

 std::unique_ptr<cql3::statements::prepared_statement>
--- a/cql3/statements/create_index_statement.hh
+++ b/cql3/statements/create_index_statement.hh
@@ -54,6 +54,7 @@ public:
        index_metadata index;
    };
    std::optional<base_schema_with_new_index> build_index_schema(data_dictionary::database db) const;
+    view_ptr create_view_for_index(const schema_ptr, const index_metadata& im, const data_dictionary::database&) const;
 private:
    void validate_for_local_index(const schema& schema) const;
    void validate_for_frozen_collection(const index_target& target) const;
--- a/cql3/statements/create_keyspace_statement.cc
+++ b/cql3/statements/create_keyspace_statement.cc
@@ -22,6 +22,7 @@
 #include "cql3/query_processor.hh"
 #include "db/config.hh"
 #include "gms/feature_service.hh"
+#include "replica/database.hh"

 #include <boost/regex.hpp>
 #include <stdexcept>
@@ -109,16 +110,10 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, utils::chun
        // remove this check.
        auto rs = locator::abstract_replication_strategy::create_replication_strategy(
            ksm->strategy_name(),
-            locator::replication_strategy_params(ksm->strategy_options(), ksm->initial_tablets()));
-        if (rs->uses_tablets()) {
-            warnings.push_back(
-                "Tables in this keyspace will be replicated using Tablets "
-                "and will not support Materialized Views, Secondary Indexes and counters features. "
-                "To use Materialized Views, Secondary Indexes or counters, drop this keyspace and re-create it "
-                "without tablets by adding AND TABLETS = {'enabled': false} to the CREATE KEYSPACE statement.");
-            if (ksm->initial_tablets().value()) {
-                warnings.push_back("Keyspace `initial` tablets option is deprecated.  Use per-table tablet options instead.");
-            }
+            locator::replication_strategy_params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option()),
+            tmptr->get_topology());
+        if (rs->uses_tablets() && ksm->initial_tablets().value()) {
+            warnings.push_back("Keyspace `initial` tablets option is deprecated.  Use per-table tablet options instead.");
        }

        // If `rf_rack_valid_keyspaces` is enabled, it's forbidden to create an RF-rack-invalid keyspace.
@@ -202,10 +197,11 @@ std::vector<sstring> check_against_restricted_replication_strategies(

    std::vector<sstring> warnings;
    locator::replication_strategy_config_options opts;
-    locator::replication_strategy_params params(opts, std::nullopt);
+    locator::replication_strategy_params params(opts, std::nullopt, std::nullopt);
    auto replication_strategy = locator::abstract_replication_strategy::create_replication_strategy(
            locator::abstract_replication_strategy::to_qualified_class_name(
-                    *attrs.get_replication_strategy_class()), params)->get_type();
+                    *attrs.get_replication_strategy_class()), params,
+                    qp.db().real_database().get_token_metadata().get_topology())->get_type();
    auto rs_warn_list = qp.db().get_config().replication_strategy_warn_list();
    auto rs_fail_list = qp.db().get_config().replication_strategy_fail_list();

@@ -249,7 +245,12 @@ std::vector<sstring> check_against_restricted_replication_strategies(
    // these are checked and reported elsewhere.
    for (auto opt : attrs.get_replication_options()) {
        try {
-            auto rf = std::stol(opt.second);
+            long rf = 0;
+            try {
+                rf = locator::get_replication_factor(opt.second);
+            } catch (const exceptions::configuration_exception&) {
+            }
+
            if (rf > 0) {
                if (auto min_fail = qp.proxy().data_dictionary().get_config().minimum_replication_factor_fail_threshold();
                    min_fail >= 0 && rf < min_fail) {
--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -222,7 +222,7 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
            throw exceptions::invalid_request_exception("Cannot set default_time_to_live on a table with counters");
        }

-        if (ks_uses_tablets && pt.is_counter()) {
+        if (ks_uses_tablets && pt.is_counter() && !db.features().counters_with_tablets) {
            throw exceptions::invalid_request_exception(format("Cannot use the 'counter' type for table {}.{}: Counters are not yet supported with tablets", keyspace(), cf_name));
        }

--- a/cql3/statements/create_view_statement.cc
+++ b/cql3/statements/create_view_statement.cc
@@ -152,9 +152,13 @@ std::pair<view_ptr, cql3::cql_warnings_vec> create_view_statement::prepare_view(

    schema_ptr schema = validation::validate_column_family(db, _base_name.get_keyspace(), _base_name.get_column_family());

-    if (!db.features().views_with_tablets && db.find_keyspace(keyspace()).get_replication_strategy().uses_tablets()) {
-        throw exceptions::invalid_request_exception(format("Materialized views are not supported on base tables with tablets"));
+    try {
+        db::view::validate_view_keyspace(db, keyspace());
+    } catch (const std::exception& e) {
+        // The type of the thrown exception is not specified, so we need to wrap it here.
+        throw exceptions::invalid_request_exception(e.what());
    }
+
    if (schema->is_counter()) {
        throw exceptions::invalid_request_exception(format("Materialized views are not supported on counter tables"));
    }
--- a/cql3/statements/index_prop_defs.cc
+++ b/cql3/statements/index_prop_defs.cc
@@ -13,16 +13,15 @@
 #include "index_prop_defs.hh"
 #include "index/secondary_index.hh"
 #include "exceptions/exceptions.hh"
-#include "schema/schema.hh"

-void check_system_option_specified(const index_options_map& options, const sstring& option_name) {
+static void check_system_option_specified(const index_options_map& options, const sstring& option_name) {
    if (options.count(option_name)) {
        throw exceptions::invalid_request_exception(
                fmt::format("Cannot specify {} as a CUSTOM option", option_name));
    }
 }

-void cql3::statements::index_prop_defs::validate() {
+void cql3::statements::index_prop_defs::validate() const {
    static std::set<sstring> keywords({ sstring(KW_OPTIONS) });

    property_definitions::validate(keywords);
@@ -41,13 +40,13 @@ void cql3::statements::index_prop_defs::validate() {
 }

 index_options_map
-cql3::statements::index_prop_defs::get_raw_options() {
+cql3::statements::index_prop_defs::get_raw_options() const {
    auto options = get_map(KW_OPTIONS);
    return !options ? std::unordered_map<sstring, sstring>() : std::unordered_map<sstring, sstring>(options->begin(), options->end());
 }

 index_options_map
-cql3::statements::index_prop_defs::get_options() {
+cql3::statements::index_prop_defs::get_options() const {
    auto options = get_raw_options();
    options.emplace(db::index::secondary_index::custom_class_option_name, *custom_class);
    if (index_version.has_value()) {
--- a/cql3/statements/index_prop_defs.hh
+++ b/cql3/statements/index_prop_defs.hh
@@ -12,7 +12,7 @@

 #include "property_definitions.hh"
 #include <seastar/core/sstring.hh>
-#include "schema/schema.hh"
+#include "schema/schema_fwd.hh"

 #include <unordered_map>
 #include <optional>
@@ -32,9 +32,9 @@ public:
    // The only assumption about the value of `index_version` should be that it is different for every index.
    std::optional<table_schema_version> index_version;

-    void validate();
-    index_options_map get_raw_options();
-    index_options_map get_options();
+    void validate() const;
+    index_options_map get_raw_options() const;
+    index_options_map get_options() const;
 };

 }
--- a/cql3/statements/ks_prop_defs.cc
+++ b/cql3/statements/ks_prop_defs.cc
@@ -8,6 +8,7 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

+#include "seastar/core/sstring.hh"
 #include "utils/assert.hh"
 #include "cql3/statements/ks_prop_defs.hh"
 #include "cql3/statements/request_validations.hh"
@@ -18,19 +19,97 @@
 #include "exceptions/exceptions.hh"
 #include "gms/feature_service.hh"
 #include "db/config.hh"
+#include <random>

 namespace cql3 {

 namespace statements {

-static std::map<sstring, sstring> prepare_options(
+static logging::logger logger("ks_prop_defs");
+
+static
+locator::replication_strategy_config_option
+expand_to_racks(const locator::token_metadata& tm,
+                const sstring& dc,
+                const locator::replication_strategy_config_option& rf,
+                const locator::replication_strategy_config_options& old_options)
+{
+    // Candidate racks for this DC; narrowed down to the final selection at the bottom.
+    auto racks = locator::get_allowed_racks(tm, dc);
+    logger.debug("expand_to_racks: dc={} rf={} allowed_racks={}", dc, rf, racks);
+
+    if (!tm.get_topology().get_datacenters().contains(dc)) {
+        throw exceptions::configuration_exception(fmt::format("Unrecognized datacenter name '{}'", dc));
+    }
+
+    auto requested = locator::abstract_replication_strategy::parse_replication_factor(rf);
+    requested.validate(std::ranges::to<std::unordered_set<sstring>>(racks));
+
+    // A value that is already a rack list is returned untouched.
+    if (requested.is_rack_based()) {
+        return rf;
+    }
+    // A numeric zero becomes an empty rack list.
+    if (requested.count() == 0) {
+        return locator::rack_list();
+    }
+    if (requested.count() > racks.size()) {
+        throw exceptions::configuration_exception(fmt::format(
+                "Replication factor {} exceeds the number of racks ({}) in dc {}", requested.count(), racks.size(), dc));
+    }
+
+    // ALTER rules, applied when the DC already has a setting:
+    //   empty rack list or 0 -> numeric: accepted, there are no existing replicas
+    //   numeric -> numeric: rejected, the RF must first be converted to a rack list of equal count
+    //   rack list -> numeric of the same size: accepted as a no-op, the old list is kept
+    //   rack list -> numeric of another size: rejected
+    if (const auto prev = old_options.find(dc); prev != old_options.end()) {
+        const auto& prev_value = prev->second;
+        auto prev_rf = locator::replication_factor_data(prev_value);
+        const bool prev_is_rack_list = prev_rf.is_rack_based();
+        if (prev_is_rack_list && prev_rf.count() == requested.count()) {
+            return prev_value;
+        }
+        if (prev_is_rack_list && prev_rf.count() > 0) {
+            throw exceptions::configuration_exception(fmt::format(
+                    "Cannot change replication factor for '{}' from {} to numeric {}, use rack list instead",
+                    dc, prev_value, requested.count()));
+        }
+        if (!prev_is_rack_list && prev_rf.count() > 0) {
+            throw exceptions::configuration_exception(fmt::format(
+                    "Cannot change replication factor for '{}' from {} to {}, only rack list is allowed",
+                    dc, prev_value, requested.count()));
+        }
+    }
+
+    // Fewer replicas than racks: shuffle, then keep the first `count` racks.
+    if (requested.count() < racks.size()) {
+        static thread_local auto gen = std::default_random_engine(std::random_device{}());
+        std::ranges::shuffle(racks, gen);
+        racks.resize(requested.count());
+    }
+    return racks;
+}
+
+static locator::replication_strategy_config_options prepare_options(
        const sstring& strategy_class,
        const locator::token_metadata& tm,
-        std::map<sstring, sstring> options,
-        const std::map<sstring, sstring>& old_options = {}) {
+        bool rf_rack_valid_keyspaces,
+        locator::replication_strategy_config_options options,
+        const locator::replication_strategy_config_options& old_options,
+        bool rack_list_enabled,
+        bool uses_tablets) {
    options.erase(ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY);

-    if (locator::abstract_replication_strategy::to_qualified_class_name(strategy_class) != "org.apache.cassandra.locator.NetworkTopologyStrategy") {
+    auto is_nts = locator::abstract_replication_strategy::to_qualified_class_name(strategy_class) == "org.apache.cassandra.locator.NetworkTopologyStrategy";
+    auto is_alter = !old_options.empty();
+    const auto& all_dcs = tm.get_datacenter_racks_token_owners();
+    auto auto_expand_racks = uses_tablets && rf_rack_valid_keyspaces && rack_list_enabled;
+
+    logger.debug("prepare_options: {}: is_nts={} auto_expand_racks={} rack_list_enabled={} old_options={} new_options={} all_dcs={}",
+                 strategy_class, is_nts, auto_expand_racks, rack_list_enabled, old_options, options, all_dcs);
+
+    if (!is_nts) {
        return options;
    }

@@ -38,26 +117,86 @@ static std::map<sstring, sstring> prepare_options(
    // If the user simply switches from another strategy without providing any options,
    // but the other strategy used the 'replication_factor' option, it will also be expanded.
    // See issue CASSANDRA-14303.
-
    std::optional<sstring> rf;
    auto it = options.find(ks_prop_defs::REPLICATION_FACTOR_KEY);
    if (it != options.end()) {
        // Expand: the user explicitly provided a 'replication_factor'.
-        rf = it->second;
+        try {
+            rf = std::get<sstring>(it->second);
+        } catch (...) {
+            throw exceptions::configuration_exception(fmt::format("Invalid replication factor: {}: must be a string holding a numerical value", it->second));
+        }
        options.erase(it);
    } else if (options.empty()) {
        auto it = old_options.find(ks_prop_defs::REPLICATION_FACTOR_KEY);
        if (it != old_options.end()) {
            // Expand: the user switched from another strategy that specified a 'replication_factor'
            // and didn't provide any additional options.
-            rf = it->second;
+            rf = std::get<sstring>(it->second);
+        }
+    }
+
+    if (rf && uses_tablets && is_alter) {
+        throw exceptions::invalid_request_exception("'replication_factor' tag is not allowed when executing ALTER KEYSPACE with tablets, please list the DCs explicitly");
+    }
+
+    // Validate options.
+    for (auto&& [dc, opt] : options) {
+        locator::replication_factor_data rf(opt);
+
+        std::optional<locator::replication_factor_data> old_rf;
+        auto i = old_options.find(dc);
+        if (i != old_options.end()) {
+            old_rf = locator::replication_factor_data(i->second);
+        }
+
+        if (!rf.is_rack_based()) {
+            if (old_rf && old_rf->is_rack_based() && rf.count() != 0) {
+                if (old_rf->count() != rf.count()) {
+                    throw exceptions::configuration_exception(fmt::format(
+                            "Cannot change replication factor for '{}' from {} to {} when the old value was a rack list",
+                            dc, old_options.at(dc), opt));
+                } else {
+                    options[dc] = i->second; // Preserve rack list.
+                }
+            }
+            continue;
+        }
+        if (!rack_list_enabled) {
+            throw exceptions::configuration_exception(fmt::format(
+                    "Using rack list for '{}' is not allowed because the 'rf_rack_list' feature is disabled", dc));
+        }
+        if (!uses_tablets) {
+            throw exceptions::configuration_exception(fmt::format(
+                    "Using rack list for '{}' is not allowed because the keyspace is not using tablets", dc));
+        }
+        auto& racks = rf.get_rack_list();
+        if (std::unordered_set<sstring>(racks.begin(), racks.end()).size() != rf.count()) {
+            throw exceptions::configuration_exception(fmt::format(
+                    "Rack list for '{}' contains duplicate entries", dc));
+        }
+        if (old_rf && !old_rf->is_rack_based() && old_rf->count() != 0) {
+            // FIXME: Allow this if replicas already conform to the given rack list.
+            // FIXME: Implement automatic colocation to allow transition to rack list.
+            throw exceptions::configuration_exception(fmt::format(
+                    "Cannot change replication factor from numeric to rack list for '{}'", dc));
+        }
+    }
+
+    if (!rf && options.empty() && old_options.empty()) {
+        if (all_dcs.empty()) {
+            throw request_validations::invalid_request("No data centers found in the cluster, cannot determine replication factor");
+        }
+        for (const auto& [dc, racks_map] : all_dcs) {
+            if (racks_map.empty()) {
+                continue;
+            }
+            options.emplace(dc, std::to_string(racks_map.size()));
        }
    }

    if (rf.has_value()) {
-        // The code below may end up not using "rf" at all (if all the DCs
-        // already have rf settings), so let's validate it once (#8880).
-        locator::abstract_replication_strategy::parse_replication_factor(*rf);
+        locator::replication_factor_data::parse(*rf);

        // We keep previously specified DC factors for safety.
        for (const auto& opt : old_options) {
@@ -69,17 +208,11 @@ static std::map<sstring, sstring> prepare_options(
        for (const auto& dc : tm.get_topology().get_datacenters()) {
            options.emplace(dc, *rf);
        }
-    } else if (options.empty() && old_options.empty()) {
-        // For default replication factor consider only racks with nodes that are NOT zero-token only nodes,
-        auto dc_racks = tm.get_datacenter_racks_token_owners();
-        if (dc_racks.empty()) {
-            throw request_validations::invalid_request("No data centers found in the cluster, cannot determine replication factor");
-        }
-        for (const auto& [dc, racks_map] : dc_racks) {
-            if (racks_map.empty()) {
-                continue;
-            }
-            options.emplace(dc, std::to_string(racks_map.size()));
+    }
+
+    if (auto_expand_racks) {
+        for (const auto& [dc, dc_rf] : options) {
+            options[dc] = expand_to_racks(tm, dc, dc_rf, old_options);
        }
    }

@@ -93,14 +226,34 @@ static std::map<sstring, sstring> prepare_options(
        throw exceptions::configuration_exception("Configuration for at least one datacenter must be present");
    }

+    if (uses_tablets) {
+        // We keep previously specified DC factors for safety.
+        for (const auto& opt: old_options) {
+            if (opt.first != ks_prop_defs::REPLICATION_FACTOR_KEY) {
+                options.insert(opt);
+            }
+        }
+    }
+
+    // #22688 - filter out any dc*:0 and dc*:[] entries - consider these
+    // null and void (removed).
+    std::erase_if(options, [] (const auto& e) {
+        auto& [dc, rf] = e;
+        return locator::replication_factor_data(rf).count() == 0;
+    });
+
    return options;
 }

-ks_prop_defs::ks_prop_defs(std::map<sstring, sstring> options) {
-    std::map<sstring, sstring> replication_opts, storage_opts, tablets_opts, durable_writes_opts;
+ks_prop_defs::ks_prop_defs(property_definitions::map_type options) {
+    map_type replication_opts, storage_opts, tablets_opts, durable_writes_opts, consistency_opts;

    auto read_property_into = [] (auto& map, const sstring& name, const sstring& value, const sstring& tag) {
-        map[name.substr(sstring(tag).size() + 1)] = value;
+        auto prefix = sstring(tag) + ":";
+        if (!name.starts_with(prefix)) {
+            throw std::runtime_error(seastar::format("ks_prop_defs: Expected name to start with \"{}\", but got: \"{}\"", prefix, name));
+        }
+        map[name.substr(prefix.size())] = value;
    };

    for (const auto& [name, value] : options) {
@@ -112,17 +265,22 @@ ks_prop_defs::ks_prop_defs(std::map<sstring, sstring> options) {
            read_property_into(tablets_opts, name, value, KW_TABLETS);
        } else if (name.starts_with(KW_STORAGE)) {
            read_property_into(storage_opts, name, value, KW_STORAGE);
+        } else if (name.starts_with(KW_CONSISTENCY)) {
+            read_property_into(consistency_opts, name, value, KW_CONSISTENCY);
        }
    }

    if (!replication_opts.empty())
-        add_property(KW_REPLICATION, replication_opts);
+        add_property(KW_REPLICATION, from_flattened_map(replication_opts));
    if (!storage_opts.empty())
        add_property(KW_STORAGE, storage_opts);
    if (!tablets_opts.empty())
        add_property(KW_TABLETS, tablets_opts);
    if (!durable_writes_opts.empty())
        add_property(KW_DURABLE_WRITES, durable_writes_opts.begin()->second);
+    if (!consistency_opts.empty()) {
+        add_property(KW_CONSISTENCY, consistency_opts.begin()->second);
+    }
 }

 void ks_prop_defs::validate() {
@@ -132,21 +290,25 @@ void ks_prop_defs::validate() {
        return;
    }

-    static std::set<sstring> keywords({ sstring(KW_DURABLE_WRITES), sstring(KW_REPLICATION), sstring(KW_STORAGE), sstring(KW_TABLETS) });
+    static std::set<sstring> keywords({ sstring(KW_DURABLE_WRITES), sstring(KW_REPLICATION), sstring(KW_STORAGE), sstring(KW_TABLETS), sstring(KW_CONSISTENCY) });
    property_definitions::validate(keywords);

    auto replication_options = get_replication_options();
    if (replication_options.contains(REPLICATION_STRATEGY_CLASS_KEY)) {
-        _strategy_class = replication_options[REPLICATION_STRATEGY_CLASS_KEY];
+        const auto& class_name = replication_options[REPLICATION_STRATEGY_CLASS_KEY];
+        if (!std::holds_alternative<sstring>(class_name)) {
+            throw exceptions::configuration_exception(seastar::format("Invalid replication strategy class: {}", class_name));
+        }
+        _strategy_class = std::get<sstring>(class_name);
    }
 }

-std::map<sstring, sstring> ks_prop_defs::get_replication_options() const {
-    auto replication_options = get_map(KW_REPLICATION);
+locator::replication_strategy_config_options ks_prop_defs::get_replication_options() const {
+    auto replication_options = get_extended_map(KW_REPLICATION);
    if (replication_options) {
        return replication_options.value();
    }
-    return std::map<sstring, sstring>{};
+    return {};
 }

 data_dictionary::storage_options ks_prop_defs::get_storage_options() const {
@@ -204,6 +366,15 @@ std::optional<unsigned> ks_prop_defs::get_initial_tablets(std::optional<unsigned
    return initial_count;
 }

+std::optional<data_dictionary::consistency_config_option> ks_prop_defs::get_consistency_option() const {
+    // The option is only parsed when the KW_CONSISTENCY property is present.
+    auto raw = get_simple(KW_CONSISTENCY);
+    if (!raw) {
+        return std::nullopt;
+    }
+    return data_dictionary::consistency_config_option_from_string(*raw);
+}
+
 std::optional<sstring> ks_prop_defs::get_replication_strategy_class() const {
    return _strategy_class;
 }
@@ -228,26 +399,68 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(s
    std::optional<unsigned> default_initial_tablets = enable_tablets && locator::abstract_replication_strategy::to_qualified_class_name(sc) == "org.apache.cassandra.locator.NetworkTopologyStrategy"
            ? std::optional<unsigned>(0) : std::nullopt;
    auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
-    auto options = prepare_options(sc, tm, get_replication_options());
+    bool uses_tablets = initial_tablets.has_value();
+    bool rack_list_enabled = feat.rack_list_rf;
+    auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
    return data_dictionary::keyspace_metadata::new_keyspace(ks_name, sc,
-            std::move(options), initial_tablets, get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
+            std::move(options), initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
 }

-lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_update(lw_shared_ptr<data_dictionary::keyspace_metadata> old, const locator::token_metadata& tm, const gms::feature_service& feat) {
-    std::map<sstring, sstring> options;
+lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_update(lw_shared_ptr<data_dictionary::keyspace_metadata> old, const locator::token_metadata& tm, const gms::feature_service& feat, const db::config& cfg) {
+    locator::replication_strategy_config_options options;
    const auto& old_options = old->strategy_options();
+    // if tablets options have not been specified, inherit them if it's tablets-enabled KS
+    auto initial_tablets = get_initial_tablets(old->initial_tablets());
+    auto uses_tablets = initial_tablets.has_value();
+    if (old->uses_tablets() != uses_tablets) {
+        throw exceptions::invalid_request_exception("Cannot alter replication strategy vnode/tablets flavor");
+    }
    auto sc = get_replication_strategy_class();
+    bool rack_list_enabled = feat.rack_list_rf;
    if (sc) {
-        options = prepare_options(*sc, tm, get_replication_options(), old_options);
+        options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
    } else {
        sc = old->strategy_name();
        options = old_options;
    }
-    // if tablets options have not been specified, inherit them if it's tablets-enabled KS
-    auto initial_tablets = get_initial_tablets(old->initial_tablets());
-    return data_dictionary::keyspace_metadata::new_keyspace(old->name(), *sc, options, initial_tablets, get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
+    return data_dictionary::keyspace_metadata::new_keyspace(old->name(), *sc, options, initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
 }

+namespace {
+
+void add_prefixed_key(const sstring& prefix, const property_definitions::map_type& in, property_definitions::map_type& out) {
+    for (const auto& entry : in) { // every entry of `in` is stored in `out` under "<prefix>:<key>"
+        out[fmt::format("{}:{}", prefix, entry.first)] = entry.second;
+    }
+}
+
+void add_prefixed_key(const sstring& prefix, const property_definitions::extended_map_type& in, property_definitions::map_type& out) {
+    add_prefixed_key(prefix, to_flattened_map(in), out); // flatten list values first, then prefix like a plain map
+}
+
+} // namespace
+
+property_definitions::map_type ks_prop_defs::flattened() const {
+    map_type result;
+
+    // Map-valued properties: every map entry becomes a "<keyword>:<key>" entry.
+    for (auto kw : {ks_prop_defs::KW_REPLICATION, ks_prop_defs::KW_STORAGE, ks_prop_defs::KW_TABLETS}) {
+        if (auto val_opt = get_extended_map(kw)) {
+            add_prefixed_key(kw, *val_opt, result);
+        }
+    }
+
+    // Simple properties. KW_CONSISTENCY is listed because the ks_prop_defs(map_type) constructor parses
+    // it back the same way as KW_DURABLE_WRITES; leaving it out would drop it on a round trip.
+    for (auto kw : {ks_prop_defs::KW_DURABLE_WRITES, ks_prop_defs::KW_CONSISTENCY}) {
+        if (auto val_opt = get_simple(kw)) {
+            // Use nested map for backwards compatibility, ks_prop_defs() constructor expects this.
+            add_prefixed_key(kw, std::map<sstring, sstring>({{sstring(kw), to_sstring(*val_opt)}}), result);
+        }
+    }
+
+    return result;
+}

 }

--- a/cql3/statements/ks_prop_defs.hh
+++ b/cql3/statements/ks_prop_defs.hh
@@ -12,6 +12,8 @@

 #include "cql3/statements/property_definitions.hh"
 #include "data_dictionary/storage_options.hh"
+#include "locator/abstract_replication_strategy.hh"
+#include "data_dictionary/consistency_config_options.hh"

 #include <seastar/core/shared_ptr.hh>
 #include <seastar/core/sstring.hh>
@@ -48,6 +50,7 @@ public:
    static constexpr auto KW_REPLICATION = "replication";
    static constexpr auto KW_STORAGE = "storage";
    static constexpr auto KW_TABLETS = "tablets";
+    static constexpr auto KW_CONSISTENCY = "consistency";

    static constexpr auto REPLICATION_STRATEGY_CLASS_KEY = "class";
    static constexpr auto DEFAULT_REPLICATION_STRATEGY_CLASS = "NetworkTopologyStrategy";
@@ -57,17 +60,27 @@ private:
    std::optional<sstring> _strategy_class;
 public:
    ks_prop_defs() = default;
-    explicit ks_prop_defs(std::map<sstring, sstring> options);
+
+    explicit ks_prop_defs(map_type options);
+
+    /// Converts options to a flattened map of properties.
+    ///
+    /// It holds that:
+    ///
+    ///   ks_prop_defs(flattened()) == *this
+    ///
+    map_type flattened() const;

    void validate();
-    std::map<sstring, sstring> get_replication_options() const;
+    locator::replication_strategy_config_options get_replication_options() const;
    std::optional<sstring> get_replication_strategy_class() const;
    void set_default_replication_strategy_class_option();
    std::optional<unsigned> get_initial_tablets(std::optional<unsigned> default_value, bool enforce_tablets = false) const;
+    std::optional<data_dictionary::consistency_config_option> get_consistency_option() const;
    data_dictionary::storage_options get_storage_options() const;
    bool get_durable_writes() const;
    lw_shared_ptr<data_dictionary::keyspace_metadata> as_ks_metadata(sstring ks_name, const locator::token_metadata&, const gms::feature_service&, const db::config&);
-    lw_shared_ptr<data_dictionary::keyspace_metadata> as_ks_metadata_update(lw_shared_ptr<data_dictionary::keyspace_metadata> old, const locator::token_metadata&, const gms::feature_service&);
+    lw_shared_ptr<data_dictionary::keyspace_metadata> as_ks_metadata_update(lw_shared_ptr<data_dictionary::keyspace_metadata> old, const locator::token_metadata&, const gms::feature_service&, const db::config&);
 };

 }
--- a/cql3/statements/property_definitions.cc
+++ b/cql3/statements/property_definitions.cc
@@ -8,9 +8,12 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

+#include <ranges>
+
 #include <seastar/core/format.hh>
 #include "cql3/statements/property_definitions.hh"
 #include "exceptions/exceptions.hh"
+#include "utils/overloaded_functor.hh"

 namespace cql3 {

@@ -26,7 +29,7 @@ void property_definitions::add_property(const sstring& name, sstring value) {
    }
 }

-void property_definitions::add_property(const sstring& name, const std::map<sstring, sstring>& value) {
+void property_definitions::add_property(const sstring& name, const extended_map_type& value) {
    if (auto [ignored, added] = _properties.try_emplace(name, value); !added) {
        throw exceptions::syntax_exception(format("Multiple definition for property '{}'", name));
    }
@@ -60,18 +63,41 @@ std::optional<sstring> property_definitions::get_simple(const sstring& name) con
    }
 }

-std::optional<std::map<sstring, sstring>> property_definitions::get_map(const sstring& name) const {
+std::optional<property_definitions::extended_map_type> property_definitions::get_extended_map(const sstring& name) const {
    auto it = _properties.find(name);
    if (it == _properties.end()) {
        return std::nullopt;
    }
    try {
-        return std::get<map_type>(it->second);
+        return std::get<extended_map_type>(it->second);
    } catch (const std::bad_variant_access& e) {
        throw exceptions::syntax_exception(format("Invalid value for property '{}'. It should be a map.", name));
    }
 }

+std::optional<property_definitions::map_type> property_definitions::get_map(const sstring& name) const {
+    // Same lookup as get_extended_map(); the result is then narrowed to string-only values.
+    if (auto extended = get_extended_map(name)) {
+        return to_simple_map(*extended);
+    }
+    return std::nullopt;
+}
+
+property_definitions::map_type property_definitions::to_simple_map(const extended_map_type& xmap) {
+    map_type simple;
+    for (const auto& [key, value] : xmap) {
+        if (!std::holds_alternative<sstring>(value)) { // a list value cannot be represented in a simple map
+            throw exceptions::syntax_exception(seastar::format("Invalid map value '{}' for key '{}'. It should be a simple string.", std::get<list_type>(value), key));
+        }
+        simple.emplace(key, std::get<sstring>(value));
+    }
+    return simple;
+}
+
+property_definitions::extended_map_type property_definitions::to_extended_map(const map_type& map) {
+    return extended_map_type(map.begin(), map.end()); // each string value becomes the variant's string alternative
+}
+
 bool property_definitions::has_property(const sstring& name) const {
    return _properties.contains(name);
 }
@@ -166,7 +192,7 @@ void property_definitions::remove_from_map_if_exists(const sstring& name, const
        return;
    }
    try {
-        auto map = std::get<map_type>(it->second);
+        auto map = std::get<extended_map_type>(it->second);
        map.erase(key);
        _properties[name] = map;
    } catch (const std::bad_variant_access& e) {
@@ -174,6 +200,68 @@ void property_definitions::remove_from_map_if_exists(const sstring& name, const
    }
 }

+/// Converts extended map into a flat map.
+///
+/// Values which are lists are represented as multiple entries in the map
+/// with the list index appended to the key, with ':' as a separator.
+/// Empty list is represented as a single entry with index -1 and empty string as value.
+///
+/// For example:
+///
+///    {'dc1': '3', 'dc2': ['rack1', 'rack2'], 'dc3': []}
+///
+/// has a flattened representation of:
+///
+///   {'dc1': '3', 'dc2:0': 'rack1', 'dc2:1': 'rack2', 'dc3:-1': ''}
+///
+property_definitions::map_type to_flattened_map(const property_definitions::extended_map_type& in) {
+    property_definitions::map_type flat;
+    for (const auto& [name, entry] : in) {
+        // ':' separates a key from its list index in the flattened form, so it cannot appear in a key.
+        if (name.find(':') != sstring::npos) {
+            throw std::invalid_argument(fmt::format("key '{}' contains reserved character ':'", name));
+        }
+        if (const auto* scalar = std::get_if<sstring>(&entry)) {
+            // Plain string values are copied through unchanged.
+            flat[name] = *scalar;
+            continue;
+        }
+        const auto& items = std::get<std::vector<sstring>>(entry);
+        if (items.empty()) {
+            // An empty list is still given one entry: index -1 with an empty value.
+            flat[fmt::format("{}:{}", name, -1)] = "";
+            continue;
+        }
+        // Non-empty list: one entry per element, keyed by "<key>:<position>".
+        for (size_t pos = 0; pos < items.size(); ++pos) {
+            flat[fmt::format("{}:{}", name, pos)] = items[pos];
+        }
+    }
+    return flat;
+}
+
+property_definitions::extended_map_type from_flattened_map(const property_definitions::map_type& in) {
+    property_definitions::extended_map_type out; // rebuilt from the "<key>" / "<key>:<index>" entries of `in`
+    for (const auto& [key, value] : in) {
+        if (const auto pos = key.find(':'); pos == sstring::npos) {
+            out.emplace(key, value);
+        } else {
+            const auto index = std::stol(key.substr(pos + 1)); // -1 stands for an empty list; throws if not numeric
+            auto* vec = std::get_if<std::vector<sstring>>(&out.try_emplace(key.substr(0, pos), std::vector<sstring>()).first->second);
+            if (!vec || index < -1) { // the key is already stored as a plain string, or the index is not a position
+                throw std::invalid_argument(fmt::format("invalid flattened list entry '{}'", key));
+            }
+            if (index >= 0) {
+                if (vec->size() <= size_t(index)) {
+                    vec->resize(index + 1);
+                }
+                (*vec)[index] = value;
+            }
+        }
+    }
+    return out;
+}
+
 }

 }
--- a/cql3/statements/property_definitions.hh
+++ b/cql3/statements/property_definitions.hh
@@ -27,19 +27,26 @@ namespace statements {
 class property_definitions {
 public:
    using map_type = std::map<sstring, sstring>;
-    using value_type = std::variant<sstring, map_type>;
+    using list_type = std::vector<sstring>;
+    using extended_map_type = std::map<sstring, std::variant<sstring, list_type>>;
+    using value_type = std::variant<sstring, extended_map_type>;
+    using properties_map_type = std::unordered_map<sstring, value_type>;
 protected:
 #if 0
    protected static final Logger logger = LoggerFactory.getLogger(PropertyDefinitions.class);
 #endif

-    mutable std::unordered_map<sstring, value_type> _properties;
+    mutable properties_map_type _properties;

    property_definitions();
 public:
    void add_property(const sstring& name, sstring value);

-    void add_property(const sstring& name, const std::map<sstring, sstring>& value);
+    void add_property(const sstring& name, const map_type& value) {
+        add_property(name, to_extended_map(value));
+    }
+
+    void add_property(const sstring& name, const extended_map_type& value);

    void validate(const std::set<sstring>& keywords, const std::set<sstring>& exts = {}, const std::set<sstring>& obsolete = {}) const;

@@ -54,7 +61,11 @@ public:

    std::optional<value_type> get(const sstring& name) const;

-    std::optional<std::map<sstring, sstring>> get_map(const sstring& name) const;
+    std::optional<extended_map_type> get_extended_map(const sstring& name) const;
+    std::optional<map_type> get_map(const sstring& name) const;
+
+    static map_type to_simple_map(const extended_map_type&);
+    static extended_map_type to_extended_map(const map_type&);

    sstring get_string(sstring key, sstring default_value) const;

@@ -78,6 +89,9 @@ public:
    }
 };

+property_definitions::map_type to_flattened_map(const property_definitions::extended_map_type&);
+property_definitions::extended_map_type from_flattened_map(const property_definitions::map_type&);
+
 }

 }
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -131,8 +131,6 @@ private:

    void verify_ordering_is_valid(const prepared_orderings_type&, const schema&, const restrictions::statement_restrictions& restrictions) const;

-    void verify_ann_ordering_is_valid(const std::optional<expr::expression>& limit, const std::optional<expr::expression>& per_partition_limit, const selection::selection& selection) const;
-
    prepared_ann_ordering_type prepare_ann_ordering(const schema& schema, prepare_context& ctx, data_dictionary::database db) const;

    // Checks whether this ordering reverses all results.
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -21,9 +21,9 @@
 #include "exceptions/exceptions.hh"
 #include <seastar/core/future.hh>
 #include <seastar/coroutine/exception.hh>
+#include "index/vector_index.hh"
 #include "service/broadcast_tables/experimental/lang.hh"
 #include "service/qos/qos_common.hh"
-#include "vector_search/vector_store_client.hh"
 #include "transport/messages/result_message.hh"
 #include "cql3/functions/as_json_function.hh"
 #include "cql3/selection/selection.hh"
@@ -50,7 +50,6 @@
 #include "db/timeout_clock.hh"
 #include "db/consistency_level_validations.hh"
 #include "data_dictionary/data_dictionary.hh"
-#include "test/lib/select_statement_utils.hh"
 #include "gms/feature_service.hh"
 #include "utils/assert.hh"
 #include "utils/result_combinators.hh"
@@ -68,6 +67,25 @@ bool is_internal_keyspace(std::string_view name);
 namespace cql3 {

 namespace statements {
+namespace {
+
+constexpr std::string_view ANN_CUSTOM_INDEX_OPTION = "vector_index";
+
+template <typename Func>
+auto measure_index_latency(const schema& schema, const secondary_index::index& index, Func&& func) -> std::invoke_result_t<Func> {
+    auto begin = lowres_system_clock::now();
+    auto outcome = co_await func();
+    auto elapsed = lowres_system_clock::now() - begin;
+
+    // Reached only when func() completed without throwing; skipped when no stats object is returned for the index.
+    if (auto stats = schema.table().get_index_manager().get_index_stats(index.metadata().name())) {
+        stats->add_latency(elapsed);
+    }
+
+    co_return outcome;
+}
+
+} // namespace

 static logging::logger logger("select_statement");

@@ -108,8 +126,6 @@ failed_result_to_result_message(coordinator_result<T>&& r) {
    return ::make_shared<cql_transport::messages::result_message::exception>(std::move(r).assume_error());
 }

-static constexpr int DEFAULT_INTERNAL_PAGING_SIZE = select_statement::DEFAULT_COUNT_PAGE_SIZE;
-thread_local int internal_paging_size = DEFAULT_INTERNAL_PAGING_SIZE;
 thread_local const lw_shared_ptr<const select_statement::parameters> select_statement::_default_parameters = make_lw_shared<select_statement::parameters>();

 select_statement::parameters::parameters()
@@ -245,7 +261,8 @@ future<> select_statement::check_access(query_processor& qp, const service::clie
        auto& cf_name = s->is_view()
            ? s->view_info()->base_name()
            : (cdc ? cdc->cf_name() : column_family());
-        co_await state.has_column_family_access(keyspace(), cf_name, auth::permission::SELECT);
+        bool is_vector_indexed = secondary_index::vector_index::has_vector_index(*_schema);
+        co_await state.has_column_family_access(keyspace(), cf_name, auth::permission::SELECT, auth::command_desc::type::OTHER, is_vector_indexed);
    } catch (const data_dictionary::no_such_column_family& e) {
        // Will be validated afterwards.
        co_return;
@@ -442,7 +459,7 @@ select_statement::do_execute(query_processor& qp,
    const bool aggregate = _selection->is_aggregate() || has_group_by();
    const bool nonpaged_filtering = _restrictions_need_filtering && page_size <= 0;
    if (aggregate || nonpaged_filtering) {
-        page_size = page_size <= 0 ? internal_paging_size : page_size;
+        page_size = page_size <= 0 ? qp.db().get_config().select_internal_page_size() : page_size;
    }

    auto key_ranges = _restrictions->get_partition_key_ranges(options);
@@ -623,7 +640,7 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option
 }

 lw_shared_ptr<query::read_command>
-indexed_table_select_statement::prepare_command_for_base_query(query_processor& qp, const query_options& options,
+view_indexed_table_select_statement::prepare_command_for_base_query(query_processor& qp, const query_options& options,
        service::query_state& state, gc_clock::time_point now, bool use_paging) const {
    auto slice = make_partition_slice(options);
    if (use_paging) {
@@ -651,7 +668,7 @@ indexed_table_select_statement::prepare_command_for_base_query(query_processor&
 }

 future<coordinator_result<std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>>>
-indexed_table_select_statement::do_execute_base_query(
+view_indexed_table_select_statement::do_execute_base_query(
        query_processor& qp,
        dht::partition_range_vector&& partition_ranges,
        service::query_state& state,
@@ -746,7 +763,7 @@ indexed_table_select_statement::do_execute_base_query(
 }

 future<shared_ptr<cql_transport::messages::result_message>>
-indexed_table_select_statement::execute_base_query(
+view_indexed_table_select_statement::execute_base_query(
        query_processor& qp,
        dht::partition_range_vector&& partition_ranges,
        service::query_state& state,
@@ -754,14 +771,14 @@ indexed_table_select_statement::execute_base_query(
        gc_clock::time_point now,
        lw_shared_ptr<const service::pager::paging_state> paging_state) const {
    return do_execute_base_query(qp, std::move(partition_ranges), state, options, now, paging_state).then(wrap_result_to_error_message(
-            [this, &state, &options, now, paging_state = std::move(paging_state)] (std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>> result_and_cmd) {
+            [this, &state, &options, now, paging_state = std::move(paging_state), internal_page_size = qp.db().get_config().select_internal_page_size()] (std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>> result_and_cmd) {
        auto&& [result, cmd] = result_and_cmd;
-        return process_base_query_results(std::move(result), std::move(cmd), state, options, now, std::move(paging_state));
+        return process_base_query_results(std::move(result), std::move(cmd), state, options, now, std::move(paging_state), internal_page_size);
    }));
 }

 future<coordinator_result<std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>>>
-indexed_table_select_statement::do_execute_base_query(
+view_indexed_table_select_statement::do_execute_base_query(
        query_processor& qp,
        std::vector<primary_key>&& primary_keys,
        service::query_state& state,
@@ -826,7 +843,7 @@ indexed_table_select_statement::do_execute_base_query(
 }

 future<shared_ptr<cql_transport::messages::result_message>>
-indexed_table_select_statement::execute_base_query(
+view_indexed_table_select_statement::execute_base_query(
        query_processor& qp,
        std::vector<primary_key>&& primary_keys,
        service::query_state& state,
@@ -834,9 +851,9 @@ indexed_table_select_statement::execute_base_query(
        gc_clock::time_point now,
        lw_shared_ptr<const service::pager::paging_state> paging_state) const {
    return do_execute_base_query(qp, std::move(primary_keys), state, options, now, paging_state).then(wrap_result_to_error_message(
-            [this, &state, &options, now, paging_state = std::move(paging_state)] (std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>> result_and_cmd){
+            [this, &state, &options, now, paging_state = std::move(paging_state), internal_page_size = qp.db().get_config().select_internal_page_size()] (std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>> result_and_cmd){
        auto&& [result, cmd] = result_and_cmd;
-        return process_base_query_results(std::move(result), std::move(cmd), state, options, now, std::move(paging_state));
+        return process_base_query_results(std::move(result), std::move(cmd), state, options, now, std::move(paging_state), internal_page_size);
    }));
 }

@@ -894,16 +911,17 @@ select_statement::execute_without_checking_exception_message_non_aggregate_unpag
 }

 future<shared_ptr<cql_transport::messages::result_message>>
-indexed_table_select_statement::process_base_query_results(
+view_indexed_table_select_statement::process_base_query_results(
        foreign_ptr<lw_shared_ptr<query::result>> results,
        lw_shared_ptr<query::read_command> cmd,
        service::query_state& state,
        const query_options& options,
        gc_clock::time_point now,
-        lw_shared_ptr<const service::pager::paging_state> paging_state) const
+        lw_shared_ptr<const service::pager::paging_state> paging_state,
+        uint32_t internal_page_size) const
 {
    if (paging_state) {
-        paging_state = generate_view_paging_state_from_base_query_results(paging_state, results, state, options);
+        paging_state = generate_view_paging_state_from_base_query_results(paging_state, results, state, options, internal_page_size);
        _selection->get_result_metadata()->maybe_set_paging_state(std::move(paging_state));
    }
    return process_results(std::move(results), std::move(cmd), options, now);
@@ -996,7 +1014,7 @@ bool check_needs_allow_filtering_anyway(const restrictions::statement_restrictio
 }

 ::shared_ptr<cql3::statements::select_statement>
-indexed_table_select_statement::prepare(data_dictionary::database db,
+view_indexed_table_select_statement::prepare(data_dictionary::database db,
                                        schema_ptr schema,
                                        uint32_t bound_terms,
                                        lw_shared_ptr<const parameters> parameters,
@@ -1005,7 +1023,6 @@ indexed_table_select_statement::prepare(data_dictionary::database db,
                                        ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
                                        bool is_reversed,
                                        ordering_comparator_type ordering_comparator,
-                                        std::optional<prepared_ann_ordering_type> prepared_ann_ordering,
                                        std::optional<expr::expression> limit,
                                         std::optional<expr::expression> per_partition_limit,
                                         cql_stats &stats,
@@ -1015,36 +1032,18 @@ indexed_table_select_statement::prepare(data_dictionary::database db,
    auto& sim = cf.get_index_manager();
    auto [index_opt, used_index_restrictions] = restrictions->find_idx(sim);

-    if (prepared_ann_ordering.has_value()) {
-        auto indexes = sim.list_indexes();
-        auto it = std::find_if(indexes.begin(), indexes.end(), [&prepared_ann_ordering](const auto& ind) {
-            return (ind.metadata().options().contains(db::index::secondary_index::custom_class_option_name)
-                && ind.metadata().options().at(db::index::secondary_index::custom_class_option_name) == ann_custom_index_option)
-                && (ind.target_column() == prepared_ann_ordering->first->name_as_text());
-        });
-
-        if (it == indexes.end()) {
-            throw exceptions::invalid_request_exception("ANN ordering by vector requires the column to be indexed using 'vector_index'");
-        } else {
-            if (index_opt || parameters->allow_filtering() || restrictions->need_filtering() || check_needs_allow_filtering_anyway(*restrictions)) {
-                throw exceptions::invalid_request_exception("ANN ordering by vector does not support filtering");
-            }
-            index_opt = *it;
-        }
-    } else if (index_opt) {
-        auto it = index_opt->metadata().options().find(db::index::secondary_index::custom_class_option_name);
-        if (it != index_opt->metadata().options().end() && it->second == ann_custom_index_option) {
-            throw exceptions::invalid_request_exception("Vector indexes only support ANN queries");
-        }
-    }
-
    if (!index_opt) {
        throw std::runtime_error("No index found.");
    }

+    auto it = index_opt->metadata().options().find(db::index::secondary_index::custom_class_option_name);
+    if (it != index_opt->metadata().options().end() && it->second == ANN_CUSTOM_INDEX_OPTION) {
+        throw exceptions::invalid_request_exception("Vector indexes only support ANN queries");
+    }
+
    schema_ptr view_schema = restrictions->get_view_schema();

-    return ::make_shared<cql3::statements::indexed_table_select_statement>(
+    return ::make_shared<cql3::statements::view_indexed_table_select_statement>(
            schema,
            bound_terms,
            parameters,
@@ -1053,7 +1052,6 @@ indexed_table_select_statement::prepare(data_dictionary::database db,
            std::move(group_by_cell_indices),
            is_reversed,
            std::move(ordering_comparator),
-            std::move(prepared_ann_ordering),
            std::move(limit),
            std::move(per_partition_limit),
            stats,
@@ -1064,14 +1062,13 @@ indexed_table_select_statement::prepare(data_dictionary::database db,

 }

-indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
+view_indexed_table_select_statement::view_indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
                                                           lw_shared_ptr<const parameters> parameters,
                                                           ::shared_ptr<selection::selection> selection,
                                                           ::shared_ptr<const restrictions::statement_restrictions> restrictions,
                                                           ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
                                                           bool is_reversed,
                                                           ordering_comparator_type ordering_comparator,
-                                                           std::optional<prepared_ann_ordering_type> prepared_ann_ordering,
                                                           std::optional<expr::expression> limit,
                                                           std::optional<expr::expression> per_partition_limit,
                                                           cql_stats &stats,
@@ -1083,8 +1080,8 @@ indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema
    , _index{index}
    , _used_index_restrictions(std::move(used_index_restrictions))
    , _view_schema(view_schema)
-    , _prepared_ann_ordering(std::move(prepared_ann_ordering))
 {
+    SCYLLA_ASSERT(_view_schema);
    if (_index.metadata().local()) {
        _get_partition_ranges_for_posting_list = [this] (const query_options& options) { return get_partition_ranges_for_local_index_posting_list(options); };
        _get_partition_slice_for_posting_list = [this] (const query_options& options) { return get_partition_slice_for_local_index_posting_list(options); };
@@ -1108,7 +1105,7 @@ static void append_base_key_to_index_ck(std::vector<managed_bytes_view>& explode
    std::move(begin, key_view.end(), std::back_inserter(exploded_index_ck));
 }

-bytes indexed_table_select_statement::compute_idx_token(const partition_key& key) const {
+bytes view_indexed_table_select_statement::compute_idx_token(const partition_key& key) const {
    const column_definition& cdef = *_view_schema->clustering_key_columns().begin();
    if (!cdef.is_computed()) {
        throw std::logic_error{format(
@@ -1118,8 +1115,8 @@ bytes indexed_table_select_statement::compute_idx_token(const partition_key& key
    return cdef.get_computation().compute_value(*_schema, key);
 }

-lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement::generate_view_paging_state_from_base_query_results(lw_shared_ptr<const service::pager::paging_state> paging_state,
-        const foreign_ptr<lw_shared_ptr<query::result>>& results, service::query_state& state, const query_options& options) const {
+lw_shared_ptr<const service::pager::paging_state> view_indexed_table_select_statement::generate_view_paging_state_from_base_query_results(lw_shared_ptr<const service::pager::paging_state> paging_state,
+        const foreign_ptr<lw_shared_ptr<query::result>>& results, service::query_state& state, const query_options& options, uint32_t internal_page_size) const {
    const column_definition* cdef = _schema->get_column_definition(to_bytes(_index.target_column()));
    if (!cdef) {
        throw exceptions::invalid_request_exception("Indexed column not found in schema");
@@ -1166,27 +1163,22 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
    }

    auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
-    paging_state_copy->set_remaining(internal_paging_size);
+    paging_state_copy->set_remaining(internal_page_size);
    paging_state_copy->set_partition_key(std::move(index_pk));
    paging_state_copy->set_clustering_key(std::move(index_ck));
    return paging_state_copy;
 }

-future<shared_ptr<cql_transport::messages::result_message>> indexed_table_select_statement::do_execute(
+future<shared_ptr<cql_transport::messages::result_message>> view_indexed_table_select_statement::do_execute(
        query_processor& qp, service::query_state& state, const query_options& options) const {
-        
-        auto start_time = lowres_system_clock::now();
-        auto result = co_await actually_do_execute(qp, state, options);
-        auto duration = lowres_system_clock::now() - start_time;
-        auto stats = _schema->table().get_index_manager().get_index_stats(_index.metadata().name());
-        if (stats) {
-            stats->add_latency(duration);
-        }
-        co_return result;
+
+    return measure_index_latency(*_schema, _index, [this, &qp, &state, &options]() -> future<shared_ptr<cql_transport::messages::result_message>> {
+        return actually_do_execute(qp, state, options);
+    });
 }

 future<shared_ptr<cql_transport::messages::result_message>>
-indexed_table_select_statement::actually_do_execute(query_processor& qp,
+view_indexed_table_select_statement::actually_do_execute(query_processor& qp,
                             service::query_state& state,
                             const query_options& options) const
 {
@@ -1207,38 +1199,7 @@ indexed_table_select_statement::actually_do_execute(query_processor& qp,
            ? source_selector::INTERNAL : source_selector::USER;
    ++_stats.query_cnt(src_sel, _ks_sel, cond_selector::NO_CONDITIONS, statement_type::SELECT);

-    SCYLLA_ASSERT(_restrictions->uses_secondary_indexing() || _prepared_ann_ordering.has_value());
-
-    if (_prepared_ann_ordering.has_value()) {
-        auto limit = get_limit(options, _limit);
-        if (limit > max_ann_query_limit) {
-            co_await coroutine::return_exception(exceptions::invalid_request_exception(fmt::format("Use of ANN OF in an ORDER BY clause requires a LIMIT that is not greater than {}. LIMIT was {}", max_ann_query_limit, limit)));
-        }
-
-        auto [ann_column, ann_vector_expr] = _prepared_ann_ordering.value();
-
-        auto values = value_cast<vector_type_impl::native_type>(ann_column->type->deserialize(expr::evaluate(ann_vector_expr, options).to_bytes()));
-        auto ann_vector = util::to_vector<float>(values);
-
-        auto as = abort_source();
-        auto pkeys = co_await qp.vector_store_client().ann(_schema->ks_name(), _index.metadata().name(), _schema , std::move(ann_vector), limit, as);
-        if (!pkeys.has_value()) {
-            co_await coroutine::return_exception(
-                    exceptions::invalid_request_exception(std::visit(vector_search::vector_store_client::ann_error_visitor{}, pkeys.error())));
-        }
-
-        // If there are no clustering columns, we have to convert the partition keys to partition ranges.
-        if (_schema->clustering_key_size() == 0) {
-            std::vector<dht::partition_range> partition_ranges;
-            std::ranges::transform(pkeys.value(), std::back_inserter(partition_ranges), [](const auto& pkey) {
-                    return dht::partition_range::make_singular(pkey.partition);
-                });
-
-            co_return co_await this->execute_base_query(qp, std::move(partition_ranges), state, options, now, nullptr);
-        }
-
-        co_return co_await this->execute_base_query(qp, std::move(*pkeys), state, options, now, nullptr);
-    }
+    SCYLLA_ASSERT(_restrictions->uses_secondary_indexing());

    _stats.unpaged_select_queries(_ks_sel) += options.get_page_size() <= 0;

@@ -1292,11 +1253,12 @@ indexed_table_select_statement::actually_do_execute(query_processor& qp,
        std::unique_ptr<cql3::query_options> internal_options = std::make_unique<cql3::query_options>(cql3::query_options(options));
        stop_iteration stop;
        // page size is set to the internal count page size, regardless of the user-provided value
-        internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), internal_paging_size));
+        auto internal_page_size = qp.db().get_config().select_internal_page_size();
+        internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), internal_page_size));
        do {
-            auto consume_results = [this, &builder, &options, &internal_options, &state] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd, lw_shared_ptr<const service::pager::paging_state> paging_state) -> stop_iteration {
+            auto consume_results = [this, &builder, &options, &internal_options, &state, internal_page_size] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd, lw_shared_ptr<const service::pager::paging_state> paging_state) -> stop_iteration {
                if (paging_state) {
-                    paging_state = generate_view_paging_state_from_base_query_results(paging_state, results, state, options);
+                    paging_state = generate_view_paging_state_from_base_query_results(paging_state, results, state, options, internal_page_size);
                }
                internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? make_lw_shared<service::pager::paging_state>(*paging_state) : nullptr));
                if (_restrictions_need_filtering) {
@@ -1373,11 +1335,11 @@ indexed_table_select_statement::actually_do_execute(query_processor& qp,
    }
 }

-dht::partition_range_vector indexed_table_select_statement::get_partition_ranges_for_local_index_posting_list(const query_options& options) const {
+dht::partition_range_vector view_indexed_table_select_statement::get_partition_ranges_for_local_index_posting_list(const query_options& options) const {
    return _restrictions->get_partition_key_ranges(options);
 }

-dht::partition_range_vector indexed_table_select_statement::get_partition_ranges_for_global_index_posting_list(const query_options& options) const {
+dht::partition_range_vector view_indexed_table_select_statement::get_partition_ranges_for_global_index_posting_list(const query_options& options) const {
    dht::partition_range_vector partition_ranges;

    const column_definition* cdef = _schema->get_column_definition(to_bytes(_index.target_column()));
@@ -1396,7 +1358,7 @@ dht::partition_range_vector indexed_table_select_statement::get_partition_ranges
    return partition_ranges;
 }

-query::partition_slice indexed_table_select_statement::get_partition_slice_for_global_index_posting_list(const query_options& options) const {
+query::partition_slice view_indexed_table_select_statement::get_partition_slice_for_global_index_posting_list(const query_options& options) const {
    partition_slice_builder partition_slice_builder{*_view_schema};

    if (!_restrictions->has_partition_key_unrestricted_components()) {
@@ -1415,7 +1377,7 @@ query::partition_slice indexed_table_select_statement::get_partition_slice_for_g
    return partition_slice_builder.build();
 }

-query::partition_slice indexed_table_select_statement::get_partition_slice_for_local_index_posting_list(const query_options& options) const {
+query::partition_slice view_indexed_table_select_statement::get_partition_slice_for_local_index_posting_list(const query_options& options) const {
    partition_slice_builder partition_slice_builder{*_view_schema};

    partition_slice_builder.with_ranges(
@@ -1428,7 +1390,7 @@ query::partition_slice indexed_table_select_statement::get_partition_slice_for_l
 // the posting-list for a particular value of the indexed column.
 // Remember a secondary index can only be created on a single column.
 future<coordinator_result<::shared_ptr<cql_transport::messages::result_message::rows>>>
-indexed_table_select_statement::read_posting_list(query_processor& qp,
+view_indexed_table_select_statement::read_posting_list(query_processor& qp,
                  const query_options& options,
                  uint64_t limit,
                  service::query_state& state,
@@ -1489,7 +1451,7 @@ indexed_table_select_statement::read_posting_list(query_processor& qp,
 // Note: the partitions keys returned by this function are sorted
 // in token order. See issue #3423.
 future<coordinator_result<std::tuple<dht::partition_range_vector, lw_shared_ptr<const service::pager::paging_state>>>>
-indexed_table_select_statement::find_index_partition_ranges(query_processor& qp,
+view_indexed_table_select_statement::find_index_partition_ranges(query_processor& qp,
                                             service::query_state& state,
                                             const query_options& options) const
 {
@@ -1565,7 +1527,7 @@ indexed_table_select_statement::find_index_partition_ranges(query_processor& qp,
 // Note: the partitions keys returned by this function are sorted
 // in token order. See issue #3423.
 future<coordinator_result<std::tuple<std::vector<primary_key>, lw_shared_ptr<const service::pager::paging_state>>>>
-indexed_table_select_statement::find_index_clustering_rows(query_processor& qp, service::query_state& state, const query_options& options) const
+view_indexed_table_select_statement::find_index_clustering_rows(query_processor& qp, service::query_state& state, const query_options& options) const
 {
    using value_type = std::tuple<std::vector<primary_key>, lw_shared_ptr<const service::pager::paging_state>>;
    auto now = gc_clock::now();
@@ -1901,7 +1863,7 @@ mutation_fragments_select_statement::do_execute(query_processor& qp, service::qu
    const bool aggregate = _selection->is_aggregate() || has_group_by();
    const bool nonpaged_filtering = _restrictions_need_filtering && page_size <= 0;
    if (aggregate || nonpaged_filtering) {
-        page_size = internal_paging_size;
+        page_size = qp.db().get_config().select_internal_page_size();
    }

    auto key_ranges = _restrictions->get_partition_key_ranges(options);
@@ -1995,6 +1957,181 @@ mutation_fragments_select_statement::do_execute(query_processor& qp, service::qu
            }));
 }

+// Prepares an ANN-ordered SELECT. Looks for a vector index (custom class ANN_CUSTOM_INDEX_OPTION) on the
+// ANN-ordered column and throws invalid_request_exception when none exists or when the query needs filtering.
+::shared_ptr<cql3::statements::select_statement> vector_indexed_table_select_statement::prepare(data_dictionary::database db, schema_ptr schema,
+        uint32_t bound_terms, lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
+        ::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
+        ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
+        std::optional<expr::expression> per_partition_limit, cql_stats& stats, std::unique_ptr<attributes> attrs) {
+    auto cf = db.find_column_family(schema);
+    auto& sim = cf.get_index_manager();
+    auto [index_opt, _] = restrictions->find_idx(sim);
+
+    auto indexes = sim.list_indexes();
+    auto it = std::find_if(indexes.begin(), indexes.end(), [&prepared_ann_ordering](const auto& ind) {
+        return (ind.metadata().options().contains(db::index::secondary_index::custom_class_option_name) &&
+                       ind.metadata().options().at(db::index::secondary_index::custom_class_option_name) == ANN_CUSTOM_INDEX_OPTION) &&
+               (ind.target_column() == prepared_ann_ordering.first->name_as_text());
+    });
+
+    if (it == indexes.end()) {
+        throw exceptions::invalid_request_exception("ANN ordering by vector requires the column to be indexed using 'vector_index'");
+    }
+    // ANN queries cannot be combined with filtering. An index returned by find_idx() (presumably one
+    // selected by the WHERE restrictions) is rejected as well, so only the vector index found above is used.
+    if (index_opt || parameters->allow_filtering() || restrictions->need_filtering() || check_needs_allow_filtering_anyway(*restrictions)) {
+        throw exceptions::invalid_request_exception("ANN ordering by vector does not support filtering");
+    }
+
+    // `it` was checked against end() above, so the index it refers to is passed on directly.
+    return ::make_shared<cql3::statements::vector_indexed_table_select_statement>(schema, bound_terms, parameters, std::move(selection), std::move(restrictions),
+            std::move(group_by_cell_indices), is_reversed, std::move(ordering_comparator), std::move(prepared_ann_ordering), std::move(limit),
+            std::move(per_partition_limit), stats, *it, std::move(attrs));
+}
+
+vector_indexed_table_select_statement::vector_indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms, lw_shared_ptr<const parameters> parameters,
+        ::shared_ptr<selection::selection> selection, ::shared_ptr<const restrictions::statement_restrictions> restrictions,
+        ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed, ordering_comparator_type ordering_comparator,
+        prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
+        std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<attributes> attrs)
+    : select_statement{schema, bound_terms, parameters, selection, restrictions, group_by_cell_indices, is_reversed, ordering_comparator, limit,
+              per_partition_limit, stats, std::move(attrs)}
+    , _index{index}
+    , _prepared_ann_ordering(std::move(prepared_ann_ordering)) {
+    // Validates the ANN query shape; the checks run in this order, so the first failing one decides the error.
+    if (!limit.has_value()) {
+        throw exceptions::invalid_request_exception("Vector ANN queries must have a limit specified");
+    }
+
+    if (per_partition_limit.has_value()) {
+        throw exceptions::invalid_request_exception("Vector ANN queries do not support per-partition limits");
+    }
+
+    if (selection->is_aggregate()) {
+        throw exceptions::invalid_request_exception("Vector ANN queries cannot be run with aggregation");
+    }
+}
+
+future<shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::do_execute(
+        query_processor& qp, service::query_state& state, const query_options& options) const {
+    // Executes the ANN query: asks the vector store for up to `limit` keys, then reads those rows from the base table.
+    auto limit = get_limit(options, _limit);
+
+    auto result = co_await measure_index_latency(*_schema, _index, [this, &qp, &state, &options, &limit](this auto) -> future<shared_ptr<cql_transport::messages::result_message>> {
+        tracing::add_table_name(state.get_trace_state(), keyspace(), column_family());
+        validate_for_read(options.get_consistency());
+        // NOTE(review): _query_start_time_point is a member written from a const method; overlapping executions of one statement would share it - confirm.
+        _query_start_time_point = gc_clock::now();
+
+        update_stats();
+
+        if (limit > max_ann_query_limit) {
+            co_await coroutine::return_exception(exceptions::invalid_request_exception(
+                    fmt::format("Use of ANN OF in an ORDER BY clause requires a LIMIT that is not greater than {}. LIMIT was {}", max_ann_query_limit, limit)));
+        }
+
+        auto as = abort_source();
+        auto pkeys = co_await qp.vector_store_client().ann(_schema->ks_name(), _index.metadata().name(), _schema, get_ann_ordering_vector(options), limit, as);
+        if (!pkeys.has_value()) {
+            co_await coroutine::return_exception(
+                    exceptions::invalid_request_exception(std::visit(vector_search::vector_store_client::ann_error_visitor{}, pkeys.error())));
+        }
+
+        co_return co_await query_base_table(qp, state, options, pkeys.value());
+    });
+    // The result is not paged: when the client asked for a page size smaller than the limit, a warning is attached instead.
+    auto page_size = options.get_page_size();
+    if (page_size > 0 && (uint64_t) page_size < limit) {
+        result->add_warning("Paging is not supported for Vector Search queries. The entire result set has been returned.");
+    }
+    co_return result;
+}
+
+void vector_indexed_table_select_statement::update_stats() const { // Bumps the counters for one executed ANN SELECT.
+    ++_stats.secondary_index_reads;
+    ++_stats.query_cnt(source_selector::USER, _ks_sel, cond_selector::NO_CONDITIONS, statement_type::SELECT); // NOTE(review): source is always USER here, never INTERNAL - confirm
+}
+
+lw_shared_ptr<query::read_command> vector_indexed_table_select_statement::prepare_command_for_base_query(
+        query_processor& qp, service::query_state& state, const query_options& options) const { // Builds the read command used for the base-table reads.
+    auto slice = make_partition_slice(options); // NOTE(review): moved and read in the same call below; presumably safe because make_lw_shared takes its arguments by reference - confirm
+    return ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), std::move(slice), qp.proxy().get_max_result_size(slice),
+            query::tombstone_limit(qp.proxy().get_tombstone_limit()),
+            query::row_limit(get_inner_loop_limit(get_limit(options, _limit), _selection->is_aggregate())), query::partition_limit(query::max_partitions),
+            _query_start_time_point, tracing::make_trace_info(state.get_trace_state()), query_id::create_null_id(), query::is_first_page::no,
+            options.get_timestamp(state));
+}
+
+std::vector<float> vector_indexed_table_select_statement::get_ann_ordering_vector(const query_options& options) const {
+    const auto& [ann_column, ann_vector_expr] = _prepared_ann_ordering; // bound by reference: no copy of the column/expression pair
+    auto expr_value = expr::evaluate(ann_vector_expr, options); // evaluated once; reused for the null check and the conversion below
+    if (expr_value.is_null()) {
+        throw exceptions::invalid_request_exception(fmt::format("Unsupported null value for column {}", ann_column->name_as_text()));
+    }
+    auto values = value_cast<vector_type_impl::native_type>(ann_column->type->deserialize(std::move(expr_value).to_bytes()));
+    return util::to_vector<float>(values); // the deserialized query vector, converted to floats
+}
+
+// Fetches the rows identified by `pkeys` from the base table, choosing between one
+// partition-range query (no clustering columns) and per-key reads (otherwise).
+future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::query_base_table(
+        query_processor& qp, service::query_state& state, const query_options& options, const std::vector<vector_search::primary_key>& pkeys) const {
+    auto command = prepare_command_for_base_query(qp, state, options);
+    auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
+
+    // For tables without clustering columns, we can optimize by querying
+    // partition ranges instead of individual primary keys, since the
+    // partition key alone uniquely identifies each row.
+    if (_schema->clustering_key_size() == 0) {
+        std::vector<dht::partition_range> partition_ranges;
+        partition_ranges.reserve(pkeys.size()); // exactly one singular range per key
+        std::ranges::transform(pkeys, std::back_inserter(partition_ranges), [](const auto& pkey) {
+            return dht::partition_range::make_singular(pkey.partition);
+        });
+        co_return co_await query_base_table(qp, state, options, std::move(command), timeout, std::move(partition_ranges));
+    }
+    // Otherwise the keys are handed to the per-primary-key overload as they are.
+    co_return co_await query_base_table(qp, state, options, std::move(command), timeout, pkeys);
+}
+
+future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::query_base_table(query_processor& qp,
+        service::query_state& state, const query_options& options, lw_shared_ptr<query::read_command> command, lowres_clock::time_point timeout,
+        const std::vector<vector_search::primary_key>& pkeys) const {
+    // Reads each primary key with its own query (singular partition range, singular clustering range) and reduces the results with a result_merger.
+    coordinator_result<foreign_ptr<lw_shared_ptr<query::result>>> result = co_await utils::result_map_reduce(
+            pkeys.begin(), pkeys.end(),
+            [&](this auto, auto& key) -> future<coordinator_result<foreign_ptr<lw_shared_ptr<query::result>>>> {
+                auto cmd = ::make_lw_shared<query::read_command>(*command);
+                cmd->slice._row_ranges = query::clustering_row_ranges{query::clustering_range::make_singular(key.clustering)};
+                coordinator_result<service::storage_proxy::coordinator_query_result> rqr =
+                        co_await qp.proxy().query_result(_schema, cmd, {dht::partition_range::make_singular(key.partition)}, options.get_consistency(),
+                                {timeout, state.get_permit(), state.get_client_state(), state.get_trace_state()});
+                if (!rqr) {
+                    co_return std::move(rqr).as_failure();
+                }
+                co_return std::move(rqr.value().query_result);
+            },
+            query::result_merger{command->get_row_limit(), query::max_partitions});
+    // NOTE(review): unlike the partition-range overload, this path queries with _schema (not _query_schema) and does not pass node_local_only - confirm.
+    co_return co_await wrap_result_to_error_message([this, &command, &options](auto result) {
+        return process_results(std::move(result), command, options, _query_start_time_point);
+    })(std::move(result));
+}
+
+future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::query_base_table(query_processor& qp,
+        service::query_state& state, const query_options& options, lw_shared_ptr<query::read_command> command, lowres_clock::time_point timeout,
+        std::vector<dht::partition_range> partition_ranges) const {
+    // Reads all given partition ranges with a single storage-proxy query and passes the query result to process_results().
+    co_return co_await qp.proxy()
+            .query_result(_query_schema, command, std::move(partition_ranges), options.get_consistency(),
+                    {timeout, state.get_permit(), state.get_client_state(), state.get_trace_state(), {}, {}, options.get_specific_options().node_local_only},
+                    std::nullopt)
+            .then(wrap_result_to_error_message([this, &options, command](service::storage_proxy::coordinator_query_result qr) {
+                return this->process_results(std::move(qr.query_result), command, options, _query_start_time_point);
+            }));
+}
+
 namespace raw {

 static void validate_attrs(const cql3::attributes::raw& attrs) {
@@ -2153,7 +2290,6 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
        std::visit([&](auto&& ordering) {
            using T = std::decay_t<decltype(ordering)>;
            if constexpr (std::is_same_v<T, select_statement::ann_vector>) {
-                verify_ann_ordering_is_valid(_limit, _per_partition_limit, *selection);
                prepared_ann_ordering = prepare_ann_ordering(*schema, ctx, db);
            } else {
                SCYLLA_ASSERT(!for_view);
@@ -2251,8 +2387,12 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
                prepare_limit(db, ctx, _per_partition_limit),
                stats,
                std::move(prepared_attrs));
-    } else if (restrictions->uses_secondary_indexing() || prepared_ann_ordering) {
-        stmt = indexed_table_select_statement::prepare(
+    } else if (prepared_ann_ordering) {
+        stmt = vector_indexed_table_select_statement::prepare(db, schema, ctx.bound_variables_size(), _parameters, std::move(selection), std::move(restrictions),
+                std::move(group_by_cell_indices), is_reversed_, std::move(ordering_comparator), std::move(*prepared_ann_ordering),
+                prepare_limit(db, ctx, _limit), prepare_limit(db, ctx, _per_partition_limit), stats, std::move(prepared_attrs));
+    } else if (restrictions->uses_secondary_indexing()) {
+        stmt = view_indexed_table_select_statement::prepare(
                db,
                schema,
                ctx.bound_variables_size(),
@@ -2262,7 +2402,6 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
                std::move(group_by_cell_indices),
                is_reversed_,
                std::move(ordering_comparator),
-                std::move(prepared_ann_ordering),
                prepare_limit(db, ctx, _limit),
                prepare_limit(db, ctx, _per_partition_limit),
                stats,
@@ -2473,22 +2612,6 @@ void select_statement::verify_ordering_is_valid(const prepared_orderings_type& o
    }
 }

-void select_statement::verify_ann_ordering_is_valid(const std::optional<expr::expression>& limit,
-                                                    const std::optional<expr::expression>& per_partition_limit,
-                                                    const selection::selection& selection) const {
-    if (!limit.has_value()) {
-        throw exceptions::invalid_request_exception("Vector ANN queries must have a limit specified");
-    }
-
-    if (per_partition_limit.has_value()) {
-        throw exceptions::invalid_request_exception("Vector ANN queries do not support per-partition limits");
-    }
-
-    if (selection.is_aggregate()) {
-        throw exceptions::invalid_request_exception("Vector ANN queries cannot be run with aggregation");
-    }
-}
-
 select_statement::prepared_ann_ordering_type select_statement::prepare_ann_ordering(const schema& schema, prepare_context& ctx, data_dictionary::database db) const {
    auto [column_id, ordering] = _parameters->orderings().front();
    const auto& ann_vector = std::get_if<select_statement::ann_vector>(&ordering);
@@ -2742,16 +2865,6 @@ std::vector<size_t> select_statement::prepare_group_by(const schema& schema, sel

 }

-future<> set_internal_paging_size(int paging_size) {
-    return seastar::smp::invoke_on_all([paging_size] {
-        internal_paging_size = paging_size;
-    });
-}
-
-future<> reset_internal_paging_size() {
-    return set_internal_paging_size(DEFAULT_INTERNAL_PAGING_SIZE);
-}
-
 }

 namespace util {
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -21,6 +21,7 @@
 #include "exceptions/coordinator_result.hh"
 #include "locator/host_id.hh"
 #include "service/cas_shard.hh"
+#include "vector_search/vector_store_client.hh"

 namespace service {
    class client_state;
@@ -63,7 +64,6 @@ public:
    using parameters = raw::select_statement::parameters;
    using ordering_comparator_type = raw::select_statement::ordering_comparator_type;
    using prepared_ann_ordering_type = raw::select_statement::prepared_ann_ordering_type;
-    static constexpr int DEFAULT_COUNT_PAGE_SIZE = 10000;
    bool _may_use_token_aware_routing;
 protected:
    static thread_local const lw_shared_ptr<const parameters> _default_parameters;
@@ -184,17 +184,14 @@ public:
                     std::unique_ptr<cql3::attributes> attrs);
 };

-class indexed_table_select_statement : public select_statement {
+class view_indexed_table_select_statement : public select_statement {
    secondary_index::index _index;
    expr::expression _used_index_restrictions;
    schema_ptr _view_schema;
-    std::optional<prepared_ann_ordering_type>  _prepared_ann_ordering;
    noncopyable_function<dht::partition_range_vector(const query_options&)> _get_partition_ranges_for_posting_list;
    noncopyable_function<query::partition_slice(const query_options&)> _get_partition_slice_for_posting_list;
 public:
    static constexpr size_t max_base_table_query_concurrency = 4096;
-    static constexpr size_t max_ann_query_limit = 1000;
-    static constexpr std::string_view ann_custom_index_option = "vector_index";

    static ::shared_ptr<cql3::statements::select_statement> prepare(data_dictionary::database db,
                                                                    schema_ptr schema,
@@ -205,13 +202,12 @@ public:
                                                                    ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
                                                                    bool is_reversed,
                                                                    ordering_comparator_type ordering_comparator,
-                                                                    std::optional<prepared_ann_ordering_type> prepared_ann_ordering,
                                                                    std::optional<expr::expression> limit,
                                                                    std::optional<expr::expression> per_partition_limit,
                                                                    cql_stats &stats,
                                                                    std::unique_ptr<cql3::attributes> attrs);

-    indexed_table_select_statement(schema_ptr schema,
+    view_indexed_table_select_statement(schema_ptr schema,
                                   uint32_t bound_terms,
                                   lw_shared_ptr<const parameters> parameters,
                                   ::shared_ptr<selection::selection> selection,
@@ -219,7 +215,6 @@ public:
                                   ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
                                   bool is_reversed,
                                   ordering_comparator_type ordering_comparator,
-                                   std::optional<prepared_ann_ordering_type> prepared_ann_ordering,
                                   std::optional<expr::expression> limit,
                                   std::optional<expr::expression> per_partition_limit,
                                   cql_stats &stats,
@@ -236,7 +231,7 @@ private:
            service::query_state& state, const query_options& options) const;

    lw_shared_ptr<const service::pager::paging_state> generate_view_paging_state_from_base_query_results(lw_shared_ptr<const service::pager::paging_state> paging_state,
-            const foreign_ptr<lw_shared_ptr<query::result>>& results, service::query_state& state, const query_options& options) const;
+            const foreign_ptr<lw_shared_ptr<query::result>>& results, service::query_state& state, const query_options& options, uint32_t internal_page_size) const;

    future<coordinator_result<std::tuple<dht::partition_range_vector, lw_shared_ptr<const service::pager::paging_state>>>> find_index_partition_ranges(query_processor& qp,
                                                                    service::query_state& state,
@@ -253,7 +248,8 @@ private:
            service::query_state& state,
            const query_options& options,
            gc_clock::time_point now,
-            lw_shared_ptr<const service::pager::paging_state> paging_state) const;
+            lw_shared_ptr<const service::pager::paging_state> paging_state,
+            uint32_t internal_page_size) const;

    lw_shared_ptr<query::read_command>
    prepare_command_for_base_query(query_processor& qp, const query_options& options, service::query_state& state, gc_clock::time_point now,
@@ -362,6 +358,48 @@ private:
            service::query_state& state, const query_options& options) const override;
 };

-}
+
+// select_statement subclass that carries a secondary index and a prepared ANN ordering.
+// NOTE(review): judging by the names this serves SELECTs ordered by ANN through a vector
+// index -- confirm against the member definitions, which are not part of this header.
+class vector_indexed_table_select_statement : public select_statement {
+    secondary_index::index _index;
+    prepared_ann_ordering_type _prepared_ann_ordering;
+    // mutable: may be assigned from const member functions.
+    mutable gc_clock::time_point _query_start_time_point;
+
+public:
+    static constexpr size_t max_ann_query_limit = 1000;
+
+    // Static factory. Unlike the constructor below it takes non-const restrictions and no
+    // secondary_index::index argument.
+    static ::shared_ptr<cql3::statements::select_statement> prepare(data_dictionary::database db, schema_ptr schema, uint32_t bound_terms,
+            lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
+            ::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
+            ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
+            std::optional<expr::expression> per_partition_limit, cql_stats& stats, std::unique_ptr<cql3::attributes> attrs);
+
+    // Constructor; the ANN ordering is mandatory here (passed by value, not as std::optional).
+    vector_indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms, lw_shared_ptr<const parameters> parameters,
+            ::shared_ptr<selection::selection> selection, ::shared_ptr<const restrictions::statement_restrictions> restrictions,
+            ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed, ordering_comparator_type ordering_comparator,
+            prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit, std::optional<expr::expression> per_partition_limit,
+            cql_stats& stats, const secondary_index::index& index, std::unique_ptr<cql3::attributes> attrs);
+
+private:
+    // Overrides the execution hook declared by select_statement.
+    future<::shared_ptr<cql_transport::messages::result_message>> do_execute(
+            query_processor& qp, service::query_state& state, const query_options& options) const override;
+
+    void update_stats() const;
+
+    lw_shared_ptr<query::read_command> prepare_command_for_base_query(query_processor& qp, service::query_state& state, const query_options& options) const;
+
+    std::vector<float> get_ann_ordering_vector(const query_options& options) const;
+
+    // Three overloads of query_base_table follow. They differ in their trailing parameters:
+    // primary keys only; read command + timeout + primary keys; read command + timeout +
+    // partition ranges.
+    future<::shared_ptr<cql_transport::messages::result_message>> query_base_table(
+            query_processor& qp, service::query_state& state, const query_options& options, const std::vector<vector_search::primary_key>& pkeys) const;
+
+    future<::shared_ptr<cql_transport::messages::result_message>> query_base_table(query_processor& qp, service::query_state& state,
+            const query_options& options, lw_shared_ptr<query::read_command> command, lowres_clock::time_point timeout,
+            const std::vector<vector_search::primary_key>& pkeys) const;
+
+    future<::shared_ptr<cql_transport::messages::result_message>> query_base_table(query_processor& qp, service::query_state& state,
+            const query_options& options, lw_shared_ptr<query::read_command> command, lowres_clock::time_point timeout,
+            std::vector<dht::partition_range> partition_ranges) const;
+};

 }
+}
--- a/cql3/statements/update_statement.cc
+++ b/cql3/statements/update_statement.cc
@@ -21,11 +21,11 @@
 #include "unimplemented.hh"

 #include "cql3/operation_impl.hh"
-#include "cql3/type_json.hh"
 #include "cql3/lists.hh"
 #include "cql3/maps.hh"
 #include "cql3/sets.hh"
 #include "cql3/user_types.hh"
+#include "types/json_utils.hh"
 #include "types/list.hh"
 #include "types/map.hh"
 #include "types/set.hh"
--- a/data_dictionary/consistency_config_options.hh
+++ b/data_dictionary/consistency_config_options.hh
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <seastar/core/sstring.hh>
+
+namespace data_dictionary {
+// Consistency setting of a keyspace (keyspace_metadata stores it as an optional member).
+// The underlying type is a single byte.
+enum class consistency_config_option : uint8_t {
+    eventual,
+    local,
+    global
+};
+
+// Parses "eventual", "local" or "global"; any other input raises
+// exceptions::configuration_exception.
+consistency_config_option consistency_config_option_from_string(const seastar::sstring& str);
+// Returns the lower-case spelling of the enumerator ("eventual", "local" or "global").
+seastar::sstring consistency_config_option_to_string(consistency_config_option option);
+}
--- a/data_dictionary/data_dictionary.cc
+++ b/data_dictionary/data_dictionary.cc
@@ -9,6 +9,7 @@
 #include <ranges>
 #include "data_dictionary.hh"
 #include "cql3/description.hh"
+#include "data_dictionary/consistency_config_options.hh"
 #include "impl.hh"
 #include "user_types_metadata.hh"
 #include "keyspace_metadata.hh"
@@ -22,6 +23,7 @@
 #include <ostream>
 #include <array>
 #include "replica/database.hh"
+#include "utils/overloaded_functor.hh"

 namespace data_dictionary {

@@ -213,6 +215,7 @@ keyspace_metadata::keyspace_metadata(std::string_view name,
             std::string_view strategy_name,
             locator::replication_strategy_config_options strategy_options,
             std::optional<unsigned> initial_tablets,
+             std::optional<consistency_config_option> consistency_option,
             bool durable_writes,
             std::vector<schema_ptr> cf_defs,
             user_types_metadata user_types,
@@ -224,6 +227,7 @@ keyspace_metadata::keyspace_metadata(std::string_view name,
    , _durable_writes{durable_writes}
    , _user_types{std::move(user_types)}
    , _storage_options(make_lw_shared<storage_options>(std::move(storage_opts)))
+    , _consistency_option(consistency_option)
 {
    for (auto&& s : cf_defs) {
        _cf_meta_data.emplace(s->cf_name(), s);
@@ -232,9 +236,28 @@ keyspace_metadata::keyspace_metadata(std::string_view name,

 void keyspace_metadata::validate(const gms::feature_service& fs, const locator::topology& topology) const {
    using namespace locator;
-    locator::replication_strategy_params params(strategy_options(), initial_tablets());
-    auto strategy = locator::abstract_replication_strategy::create_replication_strategy(strategy_name(), params);
+    locator::replication_strategy_params params(strategy_options(), initial_tablets(), consistency_option());
+    auto strategy = locator::abstract_replication_strategy::create_replication_strategy(strategy_name(), params, topology);
    strategy->validate_options(fs, topology);
+    if (!params.initial_tablets && params.consistency.value_or(data_dictionary::consistency_config_option::eventual) != data_dictionary::consistency_config_option::eventual) {
+        throw exceptions::configuration_exception("Only eventual consistency is supported for non-tablet keyspaces");
+    }
+    if (params.consistency && !fs.strongly_consistent_tables) {
+        throw exceptions::configuration_exception("The strongly_consistent_tables feature must be enabled to use a consistency option");
+    }
+    if (params.consistency && *params.consistency == data_dictionary::consistency_config_option::global) {
+        throw exceptions::configuration_exception("Global consistency is not supported yet");
+    }
+}
+
+// Returns a copy of the strategy options in which every rack_list value has been replaced
+// by the number of entries it holds, rendered as a string. Other values are left as they are.
+locator::replication_strategy_config_options keyspace_metadata::strategy_options_v1() const {
+    auto flattened = _strategy_options;
+    for (auto& entry : flattened) {
+        if (const auto* racks = std::get_if<locator::rack_list>(&entry.second)) {
+            // The count is evaluated before the assignment replaces the list racks points into.
+            entry.second = to_sstring(racks->size());
+        }
+    }
+    return flattened;
 }

 lw_shared_ptr<keyspace_metadata>
@@ -242,16 +265,17 @@ keyspace_metadata::new_keyspace(std::string_view name,
                                std::string_view strategy_name,
                                locator::replication_strategy_config_options options,
                                std::optional<unsigned> initial_tablets,
+                                std::optional<consistency_config_option> consistency_option,
                                bool durables_writes,
                                storage_options storage_opts,
                                std::vector<schema_ptr> cf_defs)
 {
-    return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, initial_tablets, durables_writes, cf_defs, user_types_metadata{}, storage_opts);
+    return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, initial_tablets, consistency_option, durables_writes, cf_defs, user_types_metadata{}, storage_opts);
 }

 lw_shared_ptr<keyspace_metadata>
 keyspace_metadata::new_keyspace(const keyspace_metadata& ksm) {
-    return new_keyspace(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.durable_writes(), ksm.get_storage_options());
+    return new_keyspace(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(), ksm.get_storage_options());
 }

 void keyspace_metadata::add_user_type(const user_type ut) {
@@ -277,7 +301,7 @@ std::vector<view_ptr> keyspace_metadata::views() const {
            | std::ranges::to<std::vector<view_ptr>>();
 }

-storage_options::local storage_options::local::from_map(const std::map<sstring, sstring>& values) {
+static storage_options::local local_from_map(const std::map<sstring, sstring>& values) {
    if (!values.empty()) {
        throw std::runtime_error("Local storage does not accept any custom options");
    }
@@ -288,9 +312,13 @@ std::map<sstring, sstring> storage_options::local::to_map() const {
    return {};
 }

-storage_options::s3 storage_options::s3::from_map(const std::map<sstring, sstring>& values) {
-    s3 options;
-    const std::array<std::pair<sstring, sstring*>, 2> allowed_options {
+// Type name reported for local storage: a view of the static LOCAL_NAME string.
+std::string_view storage_options::local::name() const {
+    return LOCAL_NAME;
+}
+
+static storage_options::object_storage object_storage_from_map(std::string_view type, const std::map<sstring, sstring>& values) {
+    storage_options::object_storage options;
+    const std::array<std::pair<sstring, std::string*>, 2> allowed_options {
        std::make_pair("bucket", &options.bucket),
        std::make_pair("endpoint", &options.endpoint),
    };
@@ -298,38 +326,61 @@ storage_options::s3 storage_options::s3::from_map(const std::map<sstring, sstrin
        if (auto it = values.find(option.first); it != values.end()) {
            *option.second = it->second;
        } else {
-            throw std::runtime_error(fmt::format("Missing S3 option: {}", option.first));
+            throw std::runtime_error(fmt::format("Missing {} option: {}", type, option.first));
        }
    }
    if (values.size() > allowed_options.size()) {
-        throw std::runtime_error(fmt::format("Extraneous options for S3: {}; allowed: {}",
-            fmt::join(values | std::views::keys, ","),
+        // Arguments are listed in placeholder order: storage type, supplied keys, allowed keys.
+        throw std::runtime_error(fmt::format("Extraneous options for {}: {}; allowed: {}",
+            type, fmt::join(values | std::views::keys, ","),
            fmt::join(allowed_options | std::views::keys, ",")));
    }
+    options.type = std::string(type);
    return options;
 }

-std::map<sstring, sstring> storage_options::s3::to_map() const {
+std::map<sstring, sstring> storage_options::object_storage::to_map() const {
    return {{"bucket", bucket},
            {"endpoint", endpoint}};
 }

+// Type name of this object-storage backend, taken from the type member.
+// The returned view refers to that member, so it must not outlive this object.
+std::string_view storage_options::object_storage::name() const {
+    return type;
+}
+
+// Defaulted, i.e. a member-wise comparison generated by the compiler.
+// NOTE(review): object_storage also has an abort_source pointer member (set in
+// make_object_storage_options), which a member-wise comparison includes -- confirm intended.
+bool storage_options::object_storage::operator==(const object_storage&) const = default;
+
 bool storage_options::is_local_type() const noexcept {
    return std::holds_alternative<local>(value);
 }

-storage_options::value_type storage_options::from_map(std::string_view type, std::map<sstring, sstring> values) {
-    if (type == local::name) {
-        return local::from_map(values);
+// True when the stored variant currently holds the object_storage alternative.
+bool storage_options::is_object_storage_type() const noexcept {
+    return std::holds_alternative<object_storage>(value);
+}
+
+// True for object storage whose type string equals S3_NAME.
+// NOTE(review): declared noexcept but calls type_string(), which is not -- confirm it cannot
+// throw on this path.
+bool storage_options::is_s3_type() const noexcept {
+    return is_object_storage_type() && type_string() == S3_NAME;
+}
+
+// True for object storage whose type string equals GS_NAME.
+// NOTE(review): declared noexcept but calls type_string(), which is not -- confirm it cannot
+// throw on this path.
+bool storage_options::is_gs_type() const noexcept {
+    return is_object_storage_type() && type_string() == GS_NAME;
+}
+
+// Storage type names recognised by storage_options::from_map(). The comparison there is
+// exact, so these upper-case spellings are the accepted ones.
+const std::string storage_options::LOCAL_NAME = "LOCAL";
+const std::string storage_options::S3_NAME = "S3";
+const std::string storage_options::GS_NAME = "GS";
+
+storage_options::value_type storage_options::from_map(std::string_view type, const std::map<sstring, sstring>& values) {
+    if (type == LOCAL_NAME) {
+        return local_from_map(values);
    }
-    if (type == s3::name) {
-        return s3::from_map(values);
+    if (type == S3_NAME || type == GS_NAME) {
+        return object_storage_from_map(type, values);
    }
    throw std::runtime_error(fmt::format("Unknown storage type: {}", type));
 }

 std::string_view storage_options::type_string() const {
-    return std::visit([] (auto& opt) { return opt.name; }, value);
+    return std::visit([] (auto& opt) { return opt.name(); }, value);
 }

 std::map<sstring, sstring> storage_options::to_map() const {
@@ -340,7 +391,7 @@ bool storage_options::can_update_to(const storage_options& new_options) {
    return value == new_options.value;
 }

-storage_options storage_options::append_to_s3_prefix(const sstring& s) const {
+storage_options storage_options::append_to_object_storage_prefix(const sstring& s) const {
    // when restoring from object storage, the API of /storage_service/restore
    // provides:
    // 1. a shared prefix
@@ -355,7 +406,7 @@ storage_options storage_options::append_to_s3_prefix(const sstring& s) const {
    //
    // note, this example shows three sstables from two different snapshot backups.
    //
-    // we assume all sstables' locations share the same base prefix (storage_options::s3::prefix).
+    // we assume all sstables' locations share the same base prefix (storage_options::object_storage::prefix).
    // however, sstable in different backups have different prefixes. to handle this, we compose
    // a per-sstable prefix by concatenating the shared prefix and the "parent directory" of the
    // sstable's location. the resulting structure looks like:
@@ -375,14 +426,109 @@ storage_options storage_options::append_to_s3_prefix(const sstring& s) const {
        return ret;
    }

-    s3 s3_options = std::get<s3>(value);
-    SCYLLA_ASSERT(std::holds_alternative<sstring>(s3_options.location));
-    sstring prefix = std::get<sstring>(s3_options.location);
-    s3_options.location = seastar::format("{}/{}", prefix, s);
-    ret.value = std::move(s3_options);
+    object_storage options = std::get<object_storage>(value);
+    SCYLLA_ASSERT(std::holds_alternative<sstring>(options.location));
+    sstring prefix = std::get<sstring>(options.location);
+    options.location = seastar::format("{}/{}", prefix, s);
+    ret.value = std::move(options);
    return ret;
 }

+// Builds storage options of the local kind rooted at the given directory.
+storage_options make_local_options(std::filesystem::path dir) {
+    storage_options::local local_opts { .dir = std::move(dir) };
+    storage_options result;
+    result.value = std::move(local_opts);
+    return result;
+}
+
+// Extracts the scheme of a fully qualified object name -- everything before the first ':' --
+// and returns it upper-cased. Without a ':' the whole string is upper-cased, because
+// substr() treats npos as "to the end".
+static std::string fqn_type(const std::string& fqn) {
+    auto i = fqn.find_first_of(':');
+    // toupper() is undefined for negative char values, so route each byte through
+    // unsigned char instead of handing the C function to the view directly.
+    return fqn.substr(0, i)
+        | std::views::transform([] (unsigned char c) { return static_cast<char>(::toupper(c)); })
+        | std::ranges::to<std::string>();
+}
+
+// Builds object-storage options from an endpoint and a fully qualified object name.
+// The storage type is taken from the name's scheme; the prefix is the object's parent
+// directory (file name and trailing separator removed).
+// NOTE(review): the boolean result of object_storage_fqn_to_parts() is not checked, so a
+// name it rejects yields options with an empty bucket and prefix.
+storage_options make_object_storage_options(const std::string& endpoint, const std::string& fqn, abort_source* as) {
+    const auto scheme = fqn_type(fqn);
+    std::string bucket_name;
+    std::string object_path;
+    object_storage_fqn_to_parts(fqn, scheme, bucket_name, object_path);
+    const std::string parent_prefix = std::filesystem::path(object_path).parent_path().string();
+    return make_object_storage_options(endpoint, scheme, bucket_name, parent_prefix, as);
+}
+
+// Builds object-storage options from explicit parts. The type string is stored upper-cased;
+// bucket, endpoint and prefix are copied as given.
+storage_options make_object_storage_options(const std::string& endpoint, const std::string& type, const std::string& bucket, const std::string& prefix, abort_source* as) {
+    storage_options so;
+    storage_options::object_storage os{
+        // The parameters are const references, so std::move() on them could only ever copy;
+        // plain copies say what actually happens.
+        .bucket = bucket, .endpoint = endpoint, .location = prefix,
+        .abort_source = as,
+        // toupper() is undefined for negative char values; go through unsigned char.
+        .type = type
+            | std::views::transform([] (unsigned char c) { return static_cast<char>(::toupper(c)); })
+            | std::ranges::to<std::string>()
+    };
+    so.value = std::move(os);
+    return so;
+}
+
+namespace fs = std::filesystem;
+using namespace std::string_literals;
+
+// Lexically normalizes an object-storage name while keeping a "<type>://" prefix, with the
+// type lower-cased. Paths that are not object-storage names of the given type, or that are
+// too short to carry the prefix, are returned unchanged.
+static fs::path object_store_canonicalize(const fs::path& path, std::string_view type) {
+    // Length of "<type>:/", which is what "<type>://" turns into after normalization.
+    const auto normalized_prefix_len = type.length() + 2;
+    if (!is_object_storage_fqn(path, type) || path.string().length() < normalized_prefix_len) {
+        return path;
+    }
+    const auto normalized = path.lexically_normal().string();
+    if (normalized.length() < normalized_prefix_len) {
+        // Normalization can shrink the path below the prefix length (e.g. "<type>://.."
+        // collapses to "."); substr() would then throw std::out_of_range.
+        return path;
+    }
+    // Canonicalizing the original "<type>://" changes it to "<type>:/". Trim and re-add the "type://" prefix.
+    auto canonical = normalized.substr(normalized_prefix_len);
+    // tolower() is undefined for negative char values; go through unsigned char.
+    auto scheme = type
+        | std::views::transform([] (unsigned char c) { return static_cast<char>(::tolower(c)); })
+        | std::ranges::to<std::string>();
+    return scheme + "://"s + canonical;
+}
+
+// Tells whether the first component of fqn is "<type>:", compared case-insensitively.
+// An empty path never matches.
+bool is_object_storage_fqn(const fs::path& fqn, std::string_view type) {
+    if (fqn.empty()) {
+        return false;
+    }
+    std::string tmp = *(fqn.begin());
+    return tmp.size() == (type.size() + 1) // additional ':'
+        && tmp.back() == ':'
+        // allow case insensitive checks, like type=S3 as well as type=s3. Only because ::name
+        // members (history) are upper case.
+        // The lambda takes unsigned char because tolower() is undefined for negative char values.
+        && std::equal(tmp.begin(), tmp.begin() + type.size(), type.begin(), [](unsigned char c1, unsigned char c2) {
+            return ::tolower(c1) == ::tolower(c2);
+        })
+        ;
+}
+
+// Splits an object-storage name of the given type into its bucket and object path.
+// Returns false, leaving both outputs untouched, when fqn is not such a name or has no
+// bucket component. On success object_name is "/" when nothing follows the bucket.
+bool object_storage_fqn_to_parts(const fs::path& fqn, std::string_view type, std::string& bucket_name, std::string& object_name) {
+    if (!is_object_storage_fqn(fqn, type)) {
+        return false;
+    }
+
+    const auto normalized = object_store_canonicalize(fqn, type);
+    auto component = normalized.begin();
+    const auto last = normalized.end();
+
+    // Need the scheme (e.g. "s3:") plus a bucket name at the very least.
+    if (std::distance(component, last) < 2) {
+        return false;
+    }
+
+    // Step over the scheme; the component after it names the bucket.
+    ++component;
+    bucket_name = component->string();
+    ++component;
+
+    // Whatever is left forms the object path; an empty result stands for the root.
+    fs::path object_path;
+    while (component != last) {
+        object_path /= *component;
+        ++component;
+    }
+
+    const std::string joined = object_path.string();
+    if (joined.empty()) {
+        object_name = "/";
+    } else {
+        object_name = joined;
+    }
+    return true;
+}
+
 no_such_keyspace::no_such_keyspace(std::string_view ks_name)
    : runtime_error{fmt::format("Can't find a keyspace {}", ks_name)}
 {
@@ -414,7 +560,22 @@ cql3::description keyspace_metadata::describe(const replica::database& db, cql3:
        os << "CREATE KEYSPACE " << cql3::util::maybe_quote(_name)
           << " WITH replication = {'class': " << cql3::util::single_quote(_strategy_name);
        for (const auto& opt: _strategy_options) {
-            os << ", " << cql3::util::single_quote(opt.first) << ": " << cql3::util::single_quote(opt.second);
+            os << ", " << cql3::util::single_quote(opt.first) << ": ";
+            std::visit(overloaded_functor{
+                [&os] (const sstring& str) {
+                    os << cql3::util::single_quote(str);
+                },
+                [&os] (const std::vector<sstring>& vec) {
+                    os << "[";
+                    for (auto it = vec.begin(); it != vec.end(); ++it) {
+                        if (it != vec.begin()) {
+                            os << ", ";
+                        }
+                        os << cql3::util::single_quote(*it);
+                    }
+                    os << "]";
+                }
+            }, opt.second);
        }
        if (!_storage_options->is_local_type()) {
            os << "} AND storage = {'type': " << cql3::util::single_quote(sstring(_storage_options->type_string()));
@@ -424,6 +585,9 @@ cql3::description keyspace_metadata::describe(const replica::database& db, cql3:
        }
        os << "} AND durable_writes = " << fmt::to_string(_durable_writes);
        if (db.features().tablets) {
+            if (_consistency_option) {
+                os << " AND consistency = " << cql3::util::single_quote(consistency_config_option_to_string(*_consistency_option));
+            }
            if (!_initial_tablets.has_value()) {
                os << " AND tablets = {'enabled': false}";
            } else {
@@ -443,6 +607,29 @@ cql3::description keyspace_metadata::describe(const replica::database& db, cql3:
    };
 }

+// Maps the textual consistency option to its enumerator. The match is exact; anything other
+// than "eventual", "local" or "global" raises exceptions::configuration_exception.
+consistency_config_option consistency_config_option_from_string(const seastar::sstring& str) {
+    if (str == "eventual") {
+        return consistency_config_option::eventual;
+    }
+    if (str == "local") {
+        return consistency_config_option::local;
+    }
+    if (str == "global") {
+        return consistency_config_option::global;
+    }
+    throw exceptions::configuration_exception(fmt::format("Consistency option must be one of 'eventual', 'local', or 'global'; found: {}", str));
+}
+
+// Returns the lower-case spelling of the given consistency option.
+// Throws std::runtime_error when option holds a value that is not a declared enumerator.
+seastar::sstring consistency_config_option_to_string(consistency_config_option option) {
+    switch (option) {
+    case consistency_config_option::eventual:
+        return "eventual";
+    case consistency_config_option::local:
+        return "local";
+    case consistency_config_option::global:
+        return "global";
+    }
+    // Reached only for an out-of-range value (e.g. produced by a cast). Flowing off the end
+    // of a non-void function would be undefined behaviour, so fail loudly instead.
+    throw std::runtime_error(fmt::format("Unknown consistency_config_option value: {}", static_cast<unsigned>(option)));
+}
+
 } // namespace data_dictionary

 template <>
@@ -472,17 +659,18 @@ auto fmt::formatter<data_dictionary::keyspace_metadata>::format(const data_dicti
 }

 auto fmt::formatter<data_dictionary::storage_options>::format(const data_dictionary::storage_options& so, fmt::format_context& ctx) const -> decltype(ctx.out()) {
+    auto type = so.type_string() | std::views::transform(&tolower) | std::ranges::to<std::string>();
    return std::visit(overloaded_functor {
        [&ctx] (const data_dictionary::storage_options::local& so) -> decltype(ctx.out()) {
            return fmt::format_to(ctx.out(), "{}", so.dir);
        },
-        [&ctx] (const data_dictionary::storage_options::s3& so) -> decltype(ctx.out()) {
+        [&ctx, &type] (const data_dictionary::storage_options::object_storage& so) -> decltype(ctx.out()) {
            return std::visit(overloaded_functor {
-                [&ctx, &so] (const sstring& prefix) -> decltype(ctx.out()) {
-                    return fmt::format_to(ctx.out(), "s3://{}/{}", so.bucket, prefix);
+                [&] (const sstring& prefix) -> decltype(ctx.out()) {
+                    return fmt::format_to(ctx.out(), "{}://{}/{}", type, so.bucket, prefix);
                },
-                [&ctx, &so] (const table_id& owner) -> decltype(ctx.out()) {
-                    return fmt::format_to(ctx.out(), "s3://{} (owner {})", so.bucket, owner);
+                [&] (const table_id& owner) -> decltype(ctx.out()) {
+                    return fmt::format_to(ctx.out(), "{}://{} (owner {})", type, so.bucket, owner);
                }
            }, so.location);
        }
--- a/data_dictionary/keyspace_metadata.hh
+++ b/data_dictionary/keyspace_metadata.hh
@@ -17,6 +17,7 @@
 #include "locator/abstract_replication_strategy.hh"
 #include "data_dictionary/user_types_metadata.hh"
 #include "data_dictionary/storage_options.hh"
+#include "data_dictionary/consistency_config_options.hh"

 namespace gms {
 class feature_service;
@@ -33,11 +34,13 @@ class keyspace_metadata final {
    bool _durable_writes;
    user_types_metadata _user_types;
    lw_shared_ptr<const storage_options> _storage_options;
+    std::optional<consistency_config_option> _consistency_option;
 public:
    keyspace_metadata(std::string_view name,
                 std::string_view strategy_name,
                 locator::replication_strategy_config_options strategy_options,
                 std::optional<unsigned> initial_tablets,
+                 std::optional<consistency_config_option> consistency_option,
                 bool durable_writes,
                 std::vector<schema_ptr> cf_defs = std::vector<schema_ptr>{},
                 user_types_metadata user_types = user_types_metadata{},
@@ -47,6 +50,7 @@ public:
                 std::string_view strategy_name,
                 locator::replication_strategy_config_options options,
                 std::optional<unsigned> initial_tablets,
+                 std::optional<consistency_config_option> consistency_option,
                 bool durables_writes = true,
                 storage_options storage_opts = {},
                 std::vector<schema_ptr> cf_defs = {});
@@ -62,9 +66,13 @@ public:
    const locator::replication_strategy_config_options& strategy_options() const {
        return _strategy_options;
    }
+    locator::replication_strategy_config_options strategy_options_v1() const;
    std::optional<unsigned> initial_tablets() const {
        return _initial_tablets;
    }
+    std::optional<data_dictionary::consistency_config_option> consistency_option() const {
+        return _consistency_option;
+    }
    bool uses_tablets() const noexcept {
        return _initial_tablets.has_value();
    }
--- a/Show More
+++ b/Show More