Squashed 7 commits into a single commit with a clean commit message
following repository conventions.
Co-authored-by: avikivity <1017210+avikivity@users.noreply.github.com>
Removed leftover temporary comment and .gitignore entries that were
added during earlier push attempts.
Co-authored-by: mykaul <4655593+mykaul@users.noreply.github.com>
Per review feedback, returning std::optional is cleaner than using value_or.
Updated the function signature and call site to handle the optional properly.
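For illustration, a minimal sketch of the pattern (hypothetical names, not the actual Scylla function):

```cpp
#include <map>
#include <optional>
#include <string>

// Hypothetical example: the callee returns an optional instead of collapsing
// the missing case with value_or() internally.
std::optional<std::string> find_rack(const std::map<std::string, std::string>& dc_to_rack,
                                     const std::string& dc) {
    if (auto it = dc_to_rack.find(dc); it != dc_to_rack.end()) {
        return it->second;
    }
    return std::nullopt;
}

// The call site decides explicitly what a missing value means.
std::string rack_or_default(const std::map<std::string, std::string>& dc_to_rack,
                            const std::string& dc) {
    auto rack = find_rack(dc_to_rack, dc);
    return rack ? *rack : std::string("rack1");
}
```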
Co-authored-by: mykaul <4655593+mykaul@users.noreply.github.com>
We want to move towards rack-list based replication factor for tablets being the default mode, and in the future the only supported mode. This PR is a step towards that. We auto-expand numeric RF to rack list on keyspace creation and ALTER when rf_rack_valid_keyspaces option is enabled.
The PR is mostly about adjusting tests. The main logic change is in the last patch, which modifies option post-processing in ks_prop_defs.
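As a rough illustration of the expansion (an assumed helper, not the actual ks_prop_defs code), a numeric replication factor could be turned into an explicit per-rack list like this:

```cpp
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical expansion helper: a numeric RF for a DC becomes an explicit
// rack list with one replica per rack, taken from the racks known in that DC.
std::vector<std::string> expand_rf_to_rack_list(unsigned rf,
                                                const std::vector<std::string>& racks_in_dc) {
    if (rf > racks_in_dc.size()) {
        throw std::invalid_argument("RF exceeds the number of racks in the DC");
    }
    // Take the first `rf` racks; the real post-processing may choose differently.
    return {racks_in_dc.begin(), racks_in_dc.begin() + rf};
}
```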
Fixes #26397
Closes scylladb/scylladb#26692
* github.com:scylladb/scylladb:
cql3: ks_prop_defs: Expand numeric RF to rack list
locator: Move rack_list to topology.hh
alternator: Do not set RF for zero-token DCs
alternator: Switch keyspace creation to use ks_prop_defs
test: alternator: Adjust for rack lists
cql3: Move validation of invalid ALTER KEYSPACE earlier, to ks_prop_defs
test: cqlpy: Mark tests using rack lists as scylla-only
test: Switch to rack-list based RF
test: Generalize tests to work with both numeric RF and rack lists
test: cluster: test_zero_token_nodes_multidc: Adjust to rack list RF
test: Prepare for handling errors specific to rack list path
test: cluster: dtest: alternator: Force RF=1 in test_putitem_contention
test: Create cluster with multiple racks in multi-dc setups
test: boost: network_topology_strategy_test: Adjust to rack-list RF
test: tablets: Adjust to rack list
test: cluster: test_group0_schema_versioning: Use smaller RF to respect rf-rack-validness
test: tablets_test: Convert test_per_shard_goal_mixed_dc_rf to be rack-valid
test: object_store: test_backup: Adjust for rack lists
test: cluster: tablets: Do not move tablet across racks in test_tablet_transition_sanity
test: cluster: mv: Do not move tablets across racks
test: cluster: util: Fix docstring for parse_replication_options()
tablets, topology_coordinator: Skip tablet draining on replace
When auto compaction is disabled, all ongoing compactions, including
major compactions, are stopped. However, major compactions should not be
stopped, since the disable request applies only to regular auto
compactions.
This PR fixes the issue by tagging major compaction tasks with a newly
introduced `compaction_type::Major` enum. Since
`table::disable_auto_compaction()` already requests the compaction
manager to stop only tasks of type `compaction_type::Compaction`, major
compactions will no longer be stopped.
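A simplified sketch of the mechanism (the enum values `Compaction` and `Major` come from the PR; the surrounding types are invented for illustration):

```cpp
#include <vector>

// Simplified model: disabling auto compaction asks the manager to stop only
// tasks of type Compaction, so tasks tagged Major keep running.
enum class compaction_type { Compaction, Major, Cleanup, Scrub };

struct compaction_task {
    compaction_type type;
    bool stopping = false;
};

void stop_tasks_of_type(std::vector<compaction_task>& tasks, compaction_type to_stop) {
    for (auto& t : tasks) {
        if (t.type == to_stop) {   // Major tasks are skipped here
            t.stopping = true;
        }
    }
}
```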
Fixes #24501
This PR improves how compactions are stopped when a disable auto compaction request is executed.
No need to backport
Closes scylladb/scylladb#26288
* github.com:scylladb/scylladb:
replica/table: do not stop major compaction when disabling auto compaction
compaction/compaction_descriptor: introduce compaction_type::Major
We adjust the test to RF-rack-validity and then re-enable
index random events, which requires the configuration option
`rf_rack_valid_keyspaces` to be enabled.
Fixes scylladb/scylladb#26422
Backport: I'd rather not backport these changes. They're almost a hack and pose too much risk for little gain.
Closes scylladb/scylladb#26591
* github.com:scylladb/scylladb:
test/cluster/random_failures: Re-enable index events
test/cluster/random_failures: Enable rf_rack_valid_keyspaces
test/cluster/random_failures: Adjust to RF-rack-validity
Auto-expands numeric RF in CREATE/ALTER KEYSPACE statements for
new DCs specified in the statement.
Doesn't auto-expand existing options, as the rack choice may not be in
line with current replica placement. This requires co-locating tablet
replicas, and tracking of co-location state, which is not implemented yet.
Signed-off-by: Tomasz Grabiec <tgrabiec@scylladb.com>
So that we get the same validation and option post-processing as
during regular keyspace creation.
RF auto-expansion logic happens in ks_prop_defs, and we want that
for tablets.
To achieve RF=3 with tablets and rf_rack_valid_keyspaces, we need 3
racks. So change the test to create 3 racks. Alternator was bypassing the
standard keyspace creation path, so it escaped validation. But this
will change, and the test would stop working.
Also, after auto-expansion of RF to rack list, not all of the 4 nodes
will host replicas, so we need to adjust expectations.
Tests expect this failure in some scenarios, but later changes make us
fail earlier due to topology constraints.
As a rule, more general validation should come before more specific
validation. So syntax validation before topology validation.
Have to do that before we enable auto-expansion of numeric RF to
rack-lists, because those tests alter the replication factor, and
altering from rack-list to numeric will not be allowed.
Two changes here:
1) Allocate nodes in dc2 in separate racks to make the test stronger
- it exposes bugs where RF==nr_racks succeeds despite there being
zero-token nodes, rather than simply failing due to rack count.
2) Due to auto-expansion to rack list, scylla throws in keyspace
creation rather than table creation.
With rf_rack_valid_keyspaces enabled, RF of alternator tables will be
equal to the number of racks (in this test: nodes). Prior to that, if the
number of nodes is smaller than 3, alternator creates the keyspace
with RF=1. It turns out that with RF=2 the test fails with write timeouts due
to contention. Enforce RF=1 by creating the table with one node before
adding the second node.
test_decommission_rack_load_failure expects some tablets to land in
the rack which only has the decommissioning node. Since the table uses
RF=1, auto-expansion may choose the other rack and put all tablets
there, and the expected failure will not happen. Force placement by
using rack-list RF.
Choose old_replica and new_replica so that they're both in rack r1.
After later changes (rack list auto expansion), it's no longer
guaranteed that the first replica will be on r1.
Replace doesn't drain (rebuild) tablets during topology change. They
are rebuilt afterwards when the replaced node is in "left" state and
replacing node is in normal state. So there is no point in attempting
to drain, as nothing will be drained.
Not only that, doing so has a risk, because the load balancer is
invoked on a transitional topology state in which we can end up with
no normal nodes in a rack. That's the case if the replaced node was
the last one in the rack. This tripped one of the algorithms which
computes a rack's shard count for the purpose of determining the ideal
tablet count; it was not prepared to find an empty rack to which a
table is still replicated. That was fixed separately, but to avoid
this, we'd better skip tablet draining here.
In commit a3ec6c7d1d we supposedly
implemented the feature of distinguishing TTL expiration events from regular
user-sent deletions. However, that implementation did not actually work
at all... It had two bugs:
1. It created a null rjson::value() instead of an empty dictionary
rjson::empty_object(), so GetRecords failed every time such a
TTL expiration event was generated.
2. In the output, it used lowercase field names "type" and "principalId"
instead of the uppercase "Type" and "PrincipalId". This is not the
correct capitalization, and when boto3 receives such incorrect
fields it silently deletes them and never passes them to the user's
get_records() call.
This patch fixes those two bugs, and importantly - enables a test for
this feature. We already had such a test, but it was marked as
"veryslow", so it doesn't run in CI and apparently was never run even once
to check the new feature. This test is not actually very long on Alternator
when the TTL period is set very low (as we do in our tests), so I replaced
the "veryslow" marker with "waits_for_expiration". The latter marker means
that the test is still very slow - as much as half an hour - on DynamoDB,
but runs quickly on Scylla in our test setup, and is enabled in CI by
default.
The enabled test failed badly before this patch (a server error during
GetRecords), and passes with this patch.
Also, the aforementioned commit forgot to remove the paragraph in
Alternator's compatibility.md that claims we don't have that feature yet.
So we do it now.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#26633
Until this patch, CDC didn't fetch a preimage for mutations
containing only a partition tombstone. Therefore, single-row deletions
in a table without a clustering key didn't include a preimage, which was
inconsistent with single-row clustered deletions. This commit addresses
this inconsistency.
The second reason is compatibility with DynamoDB Streams, which doesn't
support entire-partition deletes. Alternator uses partition tombstones
for single-row deletions, though, and in these cases the 'OldImage' was
missing from REMOVE records.
Fixes https://github.com/scylladb/scylladb/issues/26382
Closes scylladb/scylladb#26578
cql3: Refactor vector search select impl into a dedicated class
The motivation for this change is the crash fixed in https://github.com/scylladb/scylladb/pull/25500.
This commit refactors how ANN ordered select statements are handled to prevent a potential null pointer dereference and improve code organization.
Previously, vector search selects were managed by `indexed_table_select_statement`, which unconditionally dereferenced a `view_ptr`. This assumption is invalid for vector search indexes where no view exists, creating a risk of crashes.
To address this, the refactoring introduces the following changes:
- A new `vector_indexed_table_select_statement` class is created to specifically handle ANN-ordered selects. This class operates without a view_ptr, resolving the null pointer risk.
- The `indexed_table_select_statement` is renamed to `view_indexed_table_select_statement` to more accurately reflect its function with view-based indexes.
- An assertion has been added to the `indexed_table_select_statement` constructor to ensure `view_ptr` is not null, preventing similar issues in the future.
Fixes: VECTOR-162
No backport is needed, as this is refactoring.
Closes scylladb/scylladb#25798
* github.com:scylladb/scylladb:
cql3: Rename indexed_table_select_statement
cql3: Move vector search select to dedicated class
When auto compaction is disabled, all ongoing compactions, including
major compactions, are stopped. However, major compactions should not be
stopped, since the disable request applies only to regular auto
compactions.
This patch fixes the issue by tagging major compaction tasks with the
newly introduced `compaction_type::Major`. Since
`table::disable_auto_compaction()` already requests the compaction
manager to stop only tasks of type `compaction_type::Compaction`, major
compactions will no longer be stopped.
Fixes #24501
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
Introduce a new compaction_type enum : `Major`.
This type will be used by the next patches to differentiate between
major compaction and regular compaction (compaction_type::Compaction).
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
When loading CDC streams metadata for tablets from the tables, read only
new entries from the history table instead of reading all entries. This
improves the CDC metadata reloading, making it more efficient and
predictable.
The CDC metadata is loaded as part of group0 reload whenever the
internal CDC tables are modified. On tablet split / merge, we create a
new CDC timestamp and streams by writing them to the cdc_streams_history
table by group0 operation, and when it's applied we reload the in-memory
CDC streams map by reading from the tables and constructing the updated map.
Previously, on every update, we would read the entire
cdc_streams_history entries for the changed table, constructing all its
streams and creating a new map from scratch.
We improve this now by reading only new entries from cdc_streams_history
and appending them to the existing map. We can do this because we only
append new entries to cdc_streams_history with a higher timestamp than all
previous entries.
This makes this reloading more efficient and predictable, because
previously we would read a number of entries that depends on the number
of tablets splits and merges, which increases over time and is
unbounded, whereas now we read only a single stream set on each update.
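The gist of the change can be sketched as follows (made-up types; the real code reads new entries from the cdc_streams_history table):

```cpp
#include <cstdint>
#include <map>
#include <vector>

using stream_set = std::vector<int64_t>;            // stand-in for CDC stream ids
using streams_map = std::map<int64_t, stream_set>;  // CDC timestamp -> streams

// Append only entries newer than the latest timestamp we already hold.
// Valid because cdc_streams_history is append-only with increasing timestamps.
void append_new_entries(streams_map& current, const streams_map& read_from_history) {
    int64_t latest = current.empty() ? INT64_MIN : current.rbegin()->first;
    for (const auto& [ts, streams] : read_from_history) {
        if (ts > latest) {
            current.emplace(ts, streams);
        }
    }
}
```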
Fixes https://github.com/scylladb/scylladb/issues/26732
Backport to 2025.4, where CDC with tablets was introduced.
Closes scylladb/scylladb#26160
* github.com:scylladb/scylladb:
test: cdc: extend cdc with tablets tests
cdc: improve cdc metadata loading
We currently allow creating multiple vector indexes on one column.
This doesn't make much sense, as we do not support picking one when
making ANN queries.
To make this less confusing and to make our behavior similar
to Cassandra we disallow the creation of multiple vector indexes
on one column.
We also add a test that checks this behavior.
Fixes: VECTOR-254
Fixes: #26672
Closes scylladb/scylladb#26508
This is a follow-up for https://github.com/scylladb/scylladb/pull/26315. Fixes several review comments that were left unresolved in the original PR.
backport: not needed, this PR contains only renames and code comment fixes
Closes scylladb/scylladb#26745
* https://github.com/scylladb/scylladb:
test_automatic_cleanup: fix comment
storage_proxy: remove stale comment
storage_proxy: improve run_fenceable_write comment
topology_coordinator: rename start_cleanup_on_dirty_nodes -> start_vnodes_cleanup_on_dirty_nodes
storage_service: rename is_cleanup_allowed -> is_vnodes_cleanup_allowed
storage_service: rename do_cluster_cleanup -> do_clusterwide_vnodes_cleanup
We migrate `tablets_test.py::TestTablets::test_moving_tablets_replica_on_node`
from dtests to the Scylla repository. We divide the test into two
steps to make testing easier and even possible with RF-rack-valid
keyspaces being enforced.
Closes scylladb/scylladb#26285
To align with `vector_indexed_table_select_statement`, this commit renames
`indexed_table_select_statement` to `view_indexed_table_select_statement`
to clarify its usage with materialized views.
The execution of SELECT statements with ANN ordering (vector search) was
previously implemented within `indexed_table_select_statement`. This was
not ideal, as vector search logic is independent of secondary index selects.
This resulted in unnecessary complexity because vector search queries don't
use features like aggregates or paging. More importantly,
`indexed_table_select_statement` assumed a non-null `view_schema` pointer,
which doesn't hold for vector indexes (where `view_ptr` is null).
This caused null pointer dereferences during ANN ordered selects, leading
to crashes (VECTOR-179). Other parts of the class still dereference
`view_schema` without null checks.
Moving the vector search select logic out of
`indexed_table_select_statement` simplifies the code and prevents these
null pointer dereferences.
- Workload: N workers perform CAS updates
UPDATE … SET s{i}=new WHERE pk=? IF (∀j≠i: s{j}>=guard_j) AND s{i}=prev
at CL=LOCAL_QUORUM / SERIAL=LOCAL_SERIAL. Non-apply without timeout is treated
as contention; “uncertainty” timeouts are resolved via LOCAL_SERIAL read.
- Enable balancing and increase min_tablet_count to force split,
flush and lower min_tablet_count to merge.
- “Uncertainty” timeouts (write timeout due to uncertainty) are resolved via a
LOCAL_SERIAL read to determine whether the CAS actually applied.
- Invariants: after the run, for every pk and column s{i}, the stored value
equals the number of confirmed CAS by worker i (no lost or phantom updates)
despite ongoing tablet moves.
Closes scylladb/scylladb#26113
Extend and improve the tests of virtual tables for CDC with tablets.
Split the existing virtual tables test into one test that validates the
virtual tables against the internal CDC tables, triggering some
tablet splits in order to create entries in the cdc_streams_history
table, and add another test with basic validation of the virtual tables
when there are multiple CDC tables.
The directory_lister uses utils::lister under the hood, which accepts a
callback to put directory_entry-s in. The directory_lister's callback
then puts the entries into a queue, and its .get() method pops entries
from there to return to the caller.
This patch simplifies this code by switching the directory_lister to use
the experimental generator-based lister from Seastar. With it, the entries
to be returned from .get() are simply co_await-ed from calling the
generator object (which co_yield-s them).
As a result the directory_lister becomes smaller and drops the need for
utils::lister. Since directory_lister was created as a replacement for
that callback-based lister, the latter can eventually be removed.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#26586
Recently (#26231) a test was added to check that several API
endpoints, which return tokens and corresponding replica nodes, are
consistent with the tablet map. This patch adds one more API endpoint to the
validation -- the /storage_service/tokens_endpoint one.
The extension is pretty straightforward, but the new endpoint returns
a single (primary) replica for a token, so the test check is
slightly modified to account for that.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#26580
We've enabled the configuration option `rf_rack_valid_keyspaces`,
so we can finally re-enable the events creating and dropping secondary
indexes.
Fixes scylladb/scylladb#26422
We adjust the test to work with the configuration option
`rf_rack_valid_keyspaces` enabled. For that, we ensure that there is
always at least one node in each of the three racks. This way, all
keyspaces we create and manipulate will remain RF-rack-valid since they
all use RF=3.
------------------------------------------------------------------------
To achieve that, we only need to adjust the following events:
1. `init_tablet_transfer`
The event creates a new keyspace and table and manually migrates
a tablet belonging to it. As long as we make sure the migration occurs
within the same rack, there will be no problem.
Since RF == #racks, each rack will have exactly one tablet replica,
so we can migrate the tablet to an arbitrary node in the same rack.
Note that there must exist a node that's not a replica. If there weren't
such a node, the test wouldn't have worked before this commit because
it's not possible to migrate a tablet from one node being its replica to
another. In other words, we have a guarantee that there are at least 4 nodes
in the cluster when we try to migrate a tablet replica.
That said, we check it anyway. If there's no viable node to migrate the
tablet replica to, we log that information and do nothing. That should be
an acceptable solution.
2. `add_new_node`
As long as we add a node to an existing rack, there's no way to
violate the invariant imposed by the configuration option, so we pick
a random rack out of the existing three and create a node in it.
3. `decommission_node`
We need to ensure that the node we'll be trying to decommission is
not the only one in its rack.
Following pretty much the same reasoning as in `init_tablet_transfer`,
we conclude there must be a rack with at least two nodes in it. Otherwise
we'd end up having to migrate a tablet from one replica node to another,
which is not possible.
What's more, decommissioning a node is not possible if any node in
the cluster is dead, so we can assume that `manager.running_servers`
returns the whole cluster.
4. `remove_node`
The same as `decommission_node`. Just note that although the node we choose to
remove must first be stopped, no other node can be dead, so the whole
cluster must be returned by `manager.running_servers`.
------------------------------------------------------------------------
There's one more important thing to note. The test may sometimes trigger
a sequence of events where a new node is started, but, due to an error
injection, its initialization is not completed. Among other things, the
node may NOT have a host ID recognized by the rest of the nodes in the
cluster, and operations like tablet migration will fail if they target
it.
Thankfully, there seems to be a way to avoid problems stemming from
that. When a new node is added to the cluster, it should appear at the
end of the list returned by `manager.running_servers`. This most likely
stems from how dictionaries work in Python:
"Keys and values are iterated over in insertion order."
-- https://docs.python.org/3/library/stdtypes.html#dict-views
and the fact that we keep track of running servers using a dictionary.
Furthermore, we rely on the assumption that the test currently works
correctly.
Assume, to the contrary, that among the nodes taking part in the operations
listed above, there is at most one node per rack that has its host ID recognized
by the rest of the cluster. Note that only those nodes can store any tablets.
Let's refer to the set of those nodes as X.
Assume that we're dealing with tablet migration, decommissioning, or removing
a node. Since those operations involve tablet migration, at least one tablet
will need to be migrated from the node in question to another node in X.
However, since X consists of at most three nodes, and one of them is losing
its tablet, there is no viable target for the tablet, so the operation fails.
Using those assumptions, an auxiliary function, `select_viable_rack`,
was designed to carefully choose a correct rack, which we'll then pick nodes
from to perform the topological operations. It's simple: we just find the first
rack in the list that has at least two nodes in it. That should ensure that we
perform an operation that doesn't lead to any unforeseen disaster.
------------------------------------------------------------------------
Since the test effectively becomes more complex due to more care for keeping
the topology of the cluster valid, we extend the log messages to make them
more helpful when debugging a failure.
This patch adds reproducing tests in test/alternator for issue #23438,
which is about missing checks for the length of headers and the URL
in Alternator requests. These should be limited, because Seastar's
HTTP server, which Scylla uses, reads them into memory so they can OOM
Scylla.
The tests demonstrate that DynamoDB enforces a 16 KB limit on the
headers and the URL of the request, but Scylla doesn't (a code
inspection suggests it does not in fact have any limit).
The two tests pass on DynamoDB and currently xfail on Alternator.
Refs #23438.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#23442
Sometimes file::list_directory() returns entries without the type set. In
that case the lister calls file_type() on the entry name to get it. If
the call returns a disengaged type, the code assumes that some error
occurred and resolves into an exception.
That's not correct. The file_type() method returns a disengaged type only
if the file being inspected is missing (i.e. on ENOENT errno). But this
can validly happen if a file is removed between readdir and stat. In
that case it's not "some error happened", and the entry should just be
skipped. If some error had happened, file_type() would resolve into an
exceptional future on its own.
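A hedged sketch of the fixed behavior; seastar::file_type() is a real Seastar call, while the wrapper and the header choice are illustrative and may differ between Seastar versions:

```cpp
#include <seastar/core/coroutine.hh>
#include <seastar/core/file-types.hh>
#include <seastar/core/future.hh>
#include <seastar/core/seastar.hh>
#include <optional>
#include <string>

// A disengaged result from file_type() means the entry vanished between
// readdir and stat (ENOENT), so it should be skipped, not treated as an error.
seastar::future<bool> entry_still_exists(std::string path) {
    std::optional<seastar::directory_entry_type> type = co_await seastar::file_type(path);
    co_return type.has_value();
}
```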
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#26595
Currently, batchlog replay is considered successful even if all batches fail
to be sent (they are replayed later). However, repair requires all batches
to be sent successfully. Currently, if the batchlog isn't cleared, the repair never
learns about it and still updates the repair_time. If GC mode is set to "repair", this means
that the tombstones written before the repair_time (minus propagation_delay)
can be GC'd while not all batches were replayed.
Consider a scenario:
- Table t has a row with (pk=1, v=0);
- There is an entry in the batchlog that sets (pk=1, v=1) in table t;
- The row with pk=1 is deleted from table t;
- Table t is repaired:
- batchlog replay fails;
- repair_time is updated;
- propagation_delay seconds passes and the tombstone of pk=1 is GC'd;
- batchlog is replayed and (pk=1, v=1) inserted - data resurrection!
Do not update repair_time if sending any batch fails. The data is still repaired.
For tablet repair the repair runs, but at the end the exception is passed
to the topology coordinator. Thanks to that, the repair_time isn't updated.
The repair request isn't removed either, so the repair will need
to rerun.
Apart from that, a batch is removed from the batchlog if its version is invalid
or unknown. The condition on which we consider a batch too fresh to replay
is updated to consider propagation_delay.
Fixes: https://github.com/scylladb/scylladb/issues/24415
Data resurrection fix; needs backport to all versions
Closes scylladb/scylladb#26319
* github.com:scylladb/scylladb:
db: fix indentation
test: add reproducer for data resurrection
repair: fail tablet repair if any batch wasn't sent successfully
db/batchlog_manager: fix making decision to skip batch replay
db: repair: throw if replay fails
db/batchlog_manager: delete batch with incorrect or unknown version
db/batchlog_manager: coroutinize replay_all_failed_batches
Before this commit, when the underlying materialized view was created,
it didn't have the property `tombstone_gc` set to any value. We fix the
bug in this PR.
Implementation strategy:
1. Move code responsible for producing the schema
of a secondary index to the file that handles
`CREATE INDEX`.
2. Set the property when creating the view.
3. Add reproducer tests.
Fixes scylladb/scylladb#26542
Backport: we can discuss it.
Closes scylladb/scylladb#26543
* github.com:scylladb/scylladb:
index: Set tombstone_gc when creating secondary index
index: Make `create_view_for_index` method of `create_index_statement`
index: Move code for creating MV of secondary index to cql3
db, cql3: Move creation of underlying MV for index
Following DynamoDB, Alternator also places a 16 MB limit on the size of
a request. Such a limit is necessary to avoid running out of memory -
because the AWS message authentication protocol requires reading the
entire request into memory before its signature can be verified.
Our implementation for this limit used Seastar's HTTP server's
content_length_limit feature. However, this Seastar feature is
incomplete - it only works when the request uses the Content-Length
header, and doesn't do anything if the request doesn't have a
Content-Length (it may use chunked encoding, or have no length at all).
So malicious users can cause Scylla to OOM by sending a huge request
without a Content-Length.
So in this patch we stop using the incomplete Seastar feature, and
implement the length limit in Scylla in a way that works correctly with
or without Content-Length: We read from the input stream and if we go
over 16MB, we generate an error.
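A rough sketch of the approach (an assumed helper name, not the actual Alternator code):

```cpp
#include <seastar/core/coroutine.hh>
#include <seastar/core/iostream.hh>
#include <seastar/core/temporary_buffer.hh>
#include <cstddef>
#include <stdexcept>
#include <string>

// Read the request body chunk by chunk; fail once the accumulated size passes
// the limit. Works whether or not the client supplied Content-Length.
seastar::future<std::string> read_body_with_limit(seastar::input_stream<char>& in, size_t limit) {
    std::string body;
    while (true) {
        seastar::temporary_buffer<char> buf = co_await in.read();
        if (buf.empty()) {
            co_return body;  // end of request body
        }
        body.append(buf.get(), buf.size());
        if (body.size() > limit) {
            throw std::runtime_error("Payload Too Large");  // 413 in the real server
        }
    }
}
```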
Because we dropped Seastar's protection against a long Content-Length,
we also need to fix a piece of code which used Content-Length to reserve
some semaphore units to prevent reading many large requests in parallel.
We fix two problems in the code:
1. If Content-Length is over the limit, we shouldn't attempt to reserve
semaphore units - this should just be a Payload Too Large error.
2. If Content-Length is missing, the existing code did nothing and had
a TODO about what we should do. In this patch we implement what was suggested
in that TODO: We temporarily reserve the whole 16 MB limit, and
after reading the actual request, we return part of the reservation
according to the real request size.
That last fix is important, because typically the largest requests will be
BatchWriteItem where a well-written client would want to use chunked
encoding, not Content-Length, to avoid materializing the entire request
up-front. For such clients, the memory use semaphore did nothing, and
now it does the right thing.
Note that this patch does *not* solve the problem #12166 that existed
with Seastar's length-limiting implementation but still exists in the
new in-Scylla length-limiting implementation: The fact we send an
error response in the middle of the request and then close the
connection, while the client continues to send the request, can lead
to an RST being sent by the server kernel. Usually this will be fine -
well-written client libraries will be able to read the response before
the RST. But even with a well-written library in some rare timings
the client may get the RST before the response, and will miss the
response, and get an empty or partial response or "connection reset
by peer". This issue existed before this patch, and still exists, but
is probably of minor impact.
Fixes #8196
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#23434
The utils library requires OpenSSL's libcrypto for cryptographic
operations and without linking libcrypto, builds fail with undefined
symbol errors. Fix that by linking `crypto` to `utils` library when
compiled with cmake. The build files generated with configure.py already
have `crypto` lib linked, so they do not have this issue.
Fix #26705
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
Closes scylladb/scylladb#26707
In #24031 users complained that the trace message is truncated, namely that it's
no longer JSON-parsable and the table name might not be part of the output.
This patch enables users to configure the maximum size of the trace message.
In case a user wants the `table` name but doesn't care about message size,
#26634 will help.
- add configuration variable `alternator_max_users_query_size_in_trace_output`
with a default value of 4096 (4 times the old default value).
- modify the `truncated_content_view` function to use the new configuration
variable for the truncation limit
- update `truncated_content_view` to consistently truncate at the given
size; previously truncation would also happen when data arrived in
more than one chunk
- update `truncated_content_view` to better handle the truncated value
(limit the number of copies)
- fix the `scylla_config_read` call - a call to `query` for a configuration
name that does not exist will return an empty (but present) `Items`
array - this would raise an array access exception a few lines
below.
- add test
Refs #26634
Refs #24031
Closes scylladb/scylladb#26618
When streaming SSTables across tablets, a single SSTable may be streamed to multiple tablets. The previous implementation unlinked SSTables immediately after streaming them for the first tablet, potentially making them partially unavailable for subsequent tablets. This patch replaces unlink() with mark_for_deletion(), deferring actual unlinking until sstable::close_files.
test_tablets2::test_tablet_load_and_stream was enhanced to also verify that SSTables are removed after being streamed.
Fixes#26606
Backport is not required; although it is a bug fix, it isn't visible. This is more of a preparatory fix for https://github.com/scylladb/scylladb/pull/26444.
Closes scylladb/scylladb#26622
* github.com:scylladb/scylladb:
test_tablets2: verify SSTable cleanup after tablet load and stream
tablet_sstable_streamer: replace unlink() call with mark_for_deletion()
It turns out that #21477 wasn't sufficient to fix the issue. The driver
may still decide to reconnect the connection after `rolling_restart`
returns. One possible explanation is that the driver sometimes handles
the DOWN notification after all nodes consider each other UP.
Reconnecting the driver after restarting nodes seems to be a reliable
workaround that many tests use. We also use it here.
Fixes #19959
Closes scylladb/scylladb#26638
With the recent introduction of retry_strategy to Seastar, the pure virtual class previously defined in ScyllaDB is now redundant. This change allows us to streamline our codebase by directly inheriting from Seastar’s implementation, eliminating duplication in ScyllaDB.
Although this update is purely a refactoring effort and does not introduce functional changes, it should be backported to 2025.3 and 2025.4; otherwise future backports of bugfixes/improvements related to `s3_client` will be nearly impossible.
ref: https://github.com/scylladb/seastar/issues/2803
depends on: https://github.com/scylladb/seastar/pull/2960
Closes scylladb/scylladb#25801
* github.com:scylladb/scylladb:
s3_client: remove unnecessary `co_await` in `make_request`
s3 cleanup: remove obsolete retry-related classes
s3_client: remove unused `filler_exception`
s3_client: fix indentation
s3_client: simplify chunked download error handling using `make_request`
s3_client: reformat `make_request` functions for readability
s3_client: eliminate duplication in `make_request` by using overload
s3_client: reformat `make_request` function declarations for readability
s3_client: reorder `make_request` and helper declarations
s3_client: add `make_request` override with custom retry and error handler
s3_client: migrate s3_client to Seastar HTTP client
s3_client: fix crash in `copy_s3_object` due to dangling stream
s3_client: coroutinize `copy_s3_object` response callback
aws_error: handle missing `unexpected_status_error` case
s3_creds: use Seastar HTTP client with retry strategy
retry_strategy: add exponential backoff to `default_aws_retry_strategy`
retry_strategy: introduce Seastar-based retry strategy
retry_strategy: update CMake and configure.py for new strategy
retry_strategy: rename `default_retry_strategy` to `default_aws_retry_strategy`
retry_strategy: fix include
retry_strategy: Copied utils/s3/retry_strategy.hh to utils/s3/default_aws_retry_strategy.hh
retry_strategy: Copied utils/s3/retry_strategy.cc to utils/s3/default_aws_retry_strategy.cc
When a tablet is migrated between shards on the same node, during the write_both_read_new state we begin switching reads to the new shard. Until the corresponding global barrier completes, some requests may still use write_both_read_old erm, while others already use the write_both_read_new erm. To ensure mutual exclusion between these two types of requests, we must acquire locks on both the old and new shards. Once the global barrier completes, no requests remain on the old shard, so we can safely switch to acquiring locks only on the new shard.
The idea came from the similar locking problem in the [counters for tablets PR](https://github.com/scylladb/scylladb/pull/26636#discussion_r2463932395).
Fixes scylladb/scylladb#26727
backport: need to backport to 2025.4
Closes scylladb/scylladb#26719
* https://github.com/scylladb/scylladb:
paxos_state: use shards_ready_for_reads
paxos_state: inline shards_for_writes into get_replica_lock
Before this patch, when a base table has many materialized views,
each write to this table can start up to 128 view updates in parallel.
With high client write concurrency, the actual concurrency of writes
executed on the node may grow unexpectedly, which can lead to higher
latency and higher memory usage compared to a sequential approach.
In this patch we add a per-shard, per-service-level semaphore which
limits the number of concurrent view updates processed on the shard
in this service level to a constant value. We take one unit from the
semaphore for each local view update write, and release it when it
finishes. The remote view updates do not take units from the semaphore
because they don't consume nearly as much processing power and they
are limited by another semaphore based on their memory usage.
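A minimal sketch of such a cap, assuming a plain Seastar semaphore per shard (names and wiring are illustrative, not the actual Scylla code):

```cpp
#include <seastar/core/future.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/util/noncopyable_function.hh>

// One unit per local view update; with_semaphore() releases it when the
// update future resolves. The semaphore would be per shard and per service level.
seastar::future<> apply_local_view_update(seastar::semaphore& view_update_concurrency_sem,
                                          seastar::noncopyable_function<seastar::future<>()> update) {
    return seastar::with_semaphore(view_update_concurrency_sem, 1, std::move(update));
}
```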
Fixes https://github.com/scylladb/scylladb/issues/25341
Closes scylladb/scylladb#25456
* github.com:scylladb/scylladb:
mv: limit concurrent view updates from all sources
database: rename _view_update_concurrency_sem to _view_update_memory_sem
Currently, tablet_sstable_streamer::get_primary_endpoints is out of
sync with tablet_map::get_primary_replica. The get_primary_replica
optimizes the choice of the replica so that the work is fairly
distributed among nodes. Meanwhile, get_primary_endpoints always
chooses the first replica.
Use get_primary_replica for get_primary_endpoints.
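A minimal sketch of the idea (the modulo choice is an assumption for illustration, not necessarily Scylla's exact formula):

```cpp
#include <cstddef>
#include <vector>

struct tablet_replica { unsigned host; unsigned shard; };

// Pick the "primary" replica as a deterministic function of the tablet id,
// so primaries rotate across replicas instead of always being the first entry.
const tablet_replica& pick_primary(size_t tablet_id, const std::vector<tablet_replica>& replicas) {
    return replicas[tablet_id % replicas.size()];
}
```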
Fixes: https://github.com/scylladb/scylladb/issues/21883.
Closes scylladb/scylladb#26385
When loading CDC streams metadata for tablets from the tables, read only
new entries from the history table instead of reading all entries. This
improves the CDC metadata reloading, making it more efficient and
predictable.
The CDC metadata is loaded as part of group0 reload whenever the
internal CDC tables are modified. On tablet split / merge, we create a
new CDC timestamp and streams by writing them to the cdc_streams_history
table by group0 operation, and when it's applied we reload the in-memory
CDC streams map by reading from the tables and constructing the updated map.
Previously, on every update, we would read the entire
cdc_streams_history entries for the changed table, constructing all its
streams and creating a new map from scratch.
We improve this now by reading only new entries from cdc_streams_history
and appending them to the existing map. We can do this because we only
append new entries to cdc_streams_history with a higher timestamp than all
previous entries.
This makes this reloading more efficient and predictable, because
previously we would read a number of entries that depends on the number
of tablets splits and merges, which increases over time and is
unbounded, whereas now we read only a single stream set on each update.
Fixes scylladb/scylladb#26732
Before this patch, when a base table has many materialized views,
each write to this table can start up to 128 view updates in parallel.
With high client write concurrency, the actual concurrency of writes
executed on the node may grow unexpectedly, which can lead to higher
latency and higher memory usage compared to a sequential approach.
In this patch we add a per-shard, per-service-level semaphore which
limits the number of concurrent view updates processed on the shard
in this service level to a constant value. We take one unit from the
semaphore for each local view update write, and release it when it
finishes. The remote view updates do not take units from the semaphore
because they don't consume nearly as much processing power and they
are limited by another semaphore based on their memory usage.
The effect of this patch can also be observed when writing to a base
table with a large number of materialized views, like in the
materialized_views_test.py::TestMaterializedViews::test_many_mv_concurrent
dtest. In that test, if we perform a full scan in parallel to a write
workload with a concurrency of 100 to a table with 100 views, the scan
would sometimes time out because it would effectively get 1/10000 of the cpu.
With this patch, the cpu concurrency of view updates was limited to 128
(we ran both writes and scan in the same service level), and the scan
no longer timed out.
Fixes https://github.com/scylladb/scylladb/issues/25341
The std::optional<T> inject_parameter(...) method is a template, and in
dev/debug modes its template parameter is defaulted to std::string_view, but for
release mode it's not. This patch makes it symmetrical.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#26706
When streaming SSTables across tablets, a single SSTable may be streamed to multiple tablets.
The previous implementation unlinked SSTables immediately after streaming them for the first tablet,
potentially making them partially unavailable for subsequent tablets.
This patch replaces the unlink() call with mark_for_deletion().
Acquiring locks on both shards for the entire tablet migration period
is redundant. In most cases, locking only the old shard or only the new
shard is sufficient. Using shards_ready_for_reads reduces the
situations in which we need to lock both shards to:
* intra-node migrations only
* only during the write_both_read_new state
Once the global barrier completes in the write_both_read_new state, no
requests remain on the old shard, so we can safely acquire locks
only on the new shard.
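A hedged sketch of the resulting locking rule (invented types, not the actual paxos_state code):

```cpp
#include <vector>

struct tablet_transition {
    unsigned old_shard;
    unsigned new_shard;
    bool write_both_read_new;
    bool read_barrier_completed;
};

// Lock both shards only for an intra-node migration in write_both_read_new
// before the global barrier; afterwards the new shard alone is enough.
std::vector<unsigned> shards_to_lock(const tablet_transition& t) {
    if (t.write_both_read_new && !t.read_barrier_completed && t.old_shard != t.new_shard) {
        return {t.old_shard, t.new_shard};
    }
    return {t.new_shard};
}
```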
Fixes scylladb/scylladb#26727
Introduce helper functions that can be used for garbage collecting old
CDC streams for tablets-based keyspaces.
Add a background fiber to the topology coordinator that runs
periodically and checks for old CDC streams of tablets keyspaces that
can be garbage collected.
The garbage collection works by finding the newest CDC timestamp that has been
closed for more than the configured CDC TTL, and removing all information from
the CDC internal tables about CDC timestamps and streams up to this timestamp.
In general it should be safe to remove information about these streams, because
they have been closed for more than the TTL, so all rows that were written to these streams
with the configured TTL should be dead.
The exception is if the TTL is altered to a smaller value; then we may remove information
about streams that still have live rows that were written with the longer TTL.
Fixes https://github.com/scylladb/scylladb/issues/26669
Closes scylladb/scylladb#26410
* github.com:scylladb/scylladb:
cdc: garbage collect CDC streams periodically
cdc: helpers for garbage collecting old streams for tablets
Currently when a null vector is passed to an ANN query we fail with a
quite confusing error ("NoHostAvailable: ('Unable to complete the
operation against any hosts', {<Host: 127.0.0.1:9042 datacenter1>:
<Error from server: code=0000 [Server error] message="to_bytes() called
on raw value that is null">})").
This patch fixes that by throwing an InvalidRequestException with an
appropriate message instead.
We also add a test case that validates this behavior.
Fixes: VECTOR-257
Closes scylladb/scylladb#26510
Fixes #26641
* Adds a shared abstraction for dockerized mock services for our pytests (not using the python docker library, due to both the library and podman)
* Adds test fixtures for our key providers (except GCS KMS, for which we have no mock server) to do local testing
* Ports (and prunes and sharpens) the test cases from dtest::encryption_at_rest_test to our pytest.
* Shares the KMIP mock between the boost test and pytest and speeds up boost test shutdown.
When merged, the dtest counterpart can be decommissioned.
Closes scylladb/scylladb#26642
* github.com:scylladb/scylladb:
test::cluster::object_store::conftest: Make GS proxy use shared docker mock server wrapper
test::cluster::test_encryption: Port dtest EAR tests
test::cluster::conftest: Add key_provider fixture
test::pylib::encryption_provider: Port dtest encryption provider classes
test::pylib::dockerized_service: Add helper for running docker/podman
test::pylib::kmip_wrapper: Modify to be usable by pytest fixtures
test::boost::kmip_wrapper: Move python script for PyKMIP to pylib
Problems addressed by this PR
* Missing barrier before cleanup: If a node was bootstrapped before cleanup, some request coordinators could still be in `write_both_read_new` and send stale requests to replicas being cleaned up.
* Sessions not drained before cleanup: We lacked protection against stale streaming or repair operations.
* `sstable_vnodes_cleanup_fiber()` calling `flush_all_tables()` under group0 lock: This caused SCT test failures (see [this comment](https://github.com/scylladb/scylladb/issues/25333#issuecomment-3298859046) for details).
* Issues with `storage_proxy::start_write()` used by `sstable_vnodes_cleanup_fiber`:
* The result of `start_write()` was not held during `abstract_write_response_handler::apply_locally`, so coordinator-local writes were not properly awaited.
* Synchronization was racy — `start_write()` was not atomic with the fence check, allowing stale writes to sneak in if `fence_version` changed in between.
* It waited for all writes, including local tables and tablet-based tables, which is redundant because `sstable_vnodes_cleanup_fiber` does not apply to them.
* It also waited for writes with versions greater than the current `fence_version`, which is unnecessary.
Fixes scylladb/scylladb#26150
backport: this PR fixes several issues with the vnodes cleanup procedure, but it doesn't seem they are critical enough to deserve backporting
Closes scylladb/scylladb#26315
* https://github.com/scylladb/scylladb:
test_automatic_cleanup: add test_cleanup_waits_for_stale_writes
test_fencing: fix due to new version increment
test_automatic_cleanup: clean it up
storage_proxy: wait for closing sessions in sstable cleanup fiber
storage_proxy: rename await_pending_writes -> await_stale_pending_writes
storage_proxy: use run_fenceable_write
storage_proxy: abstract_write_response_handler: apply_locally: extract post fence check
storage_proxy: introduce run_fenceable_write
storage_proxy: move update_fence_version from shared_token_metadata
storage_proxy: fix start_write() operation scope in apply_locally
storage_proxy: move post fence check into handle_write
storage_proxy: move fencing into mutate_counter_on_leader_and_replicate
storage_proxy::handle_read: add fence check before get_schema
storage_service: rebrand cleanup_fiber to vnodes_cleanup_fiber
sstable_cleanup_fiber: use coroutine::parallel_for_each
storage_service: sstable_cleanup_fiber: move flush_all_tables out of the group0 lock
topology_coordinator: barrier before cleanup
topology_coordinator: small start_cleanup refactoring
global_token_metadata_barrier: add fenced flag
No need to have two functions since both callers of get_replica_lock()
use shards_for_writes() to compute the shards where the locks
must be acquired.
Also while at it, inline the acquire() lambda in get_replica_lock()
and replace it with a loop over shards. This makes the code
more straightforward.
Add a background fiber to the topology coordinator that runs
periodically and checks for old CDC streams for tablets keyspaces that
can be garbage collected.
Introduce helper functions that can be used for garbage collecting old
cdc streams for tablets-based keyspaces.
- get_new_base_for_gc: finds a new base timestamp given a TTL, such that
all older timestamps and streams can be removed.
- get_cdc_stream_gc_mutations: given new base timestamp and streams,
builds mutations that update the internal cdc tables and remove the
older streams.
- garbage_collect_cdc_streams_for_table: combines the two functions
above to find a new base and build mutations to update it for a
specific table
- garbage_collect_cdc_streams: builds gc mutations for all cdc tables
In theory, scylla-sstable write is an awesome and flexible tool to generate sstables with arbitrary content. This is convenient for tests and could come in clutch in a disaster scenario, where certain system tables' content needs to be manually re-created -- system tables that are not writable directly via CQL.
In practice, in its current form this operation is so convoluted to use that even its own author shuns it. This is because the JSON specification of the sstable content is the same as that of the scylla-sstable dump-data output: containing every single piece of information on the mutation content. While this is an advantage for dump-data, allowing users to inspect the data in its entirety, it is a huge disadvantage for write, because all these details have to be filled in, down to the last timestamp, to generate an sstable. On top of that, the tool doesn't even support any of the more advanced data types, like collections, UDTs and counters.
This PR proposes a new way of generating sstables: based on the success of scylla-sstable query, it introduces CQL support for scylla-sstable write. The content of the sstable can now be specified via standard INSERT, UPDATE and DELETE statements, which are applied to a memtable, then flushed into the sstable.
To avoid boundless memory consumption, the memtable is flushed every time it reaches 1MiB in size, consequently the command can generate multiple output sstables.
The new CQL input-format is made the default; this is safe as nobody is using this command anyway. Hopefully this PR will change that.
Fixes: https://github.com/scylladb/scylladb/issues/26506
New feature, no backport.
Closes scylladb/scylladb#26515
* github.com:scylladb/scylladb:
test/cqlpy/test_tools.py: add test for scylla-sstable write --input-format=cql
replica/mutation_dump: add support for virtual tables
tools/scylla-sstable: print_query_results_json(): handle empty value buffer
tools/scylla-sstable: add cql support to write operation
tools/scylla-sstable: write_operation(): fix indentation
tools/scylla-sstable: write_operation(): prepare for a new input-format
tools/scylla-sstable: generalize query_operation_validate_query()
tools/scylla-sstable: move query_operation_validate_query()
tools/scylla-sstable: extract schema transformation from query operation
replica/table: add virtual write hook to the other apply() overload too
`select * from mutation_fragment()` queries don't return partitions which are completely empty or only contain tombstones which are all garbage collectible. This is because the underlying `mutation_dump` mechanism has a separate query to discover partitions for scans. This query is a regular mutation scan, which is subject to query compaction and garbage collection. Disable the query compaction for mutation queries executed on behalf of mutation fragment queries, so *all* data is visible in the result, even that which is fully garbage collectible.
Fixes scylladb/scylladb#23707.
Scans for mutation-fragment are very rare, so a backport is not necessary. We can backport on-demand.
Closes scylladb/scylladb#26227
* github.com:scylladb/scylladb:
replica/mutation_dump: multi_range_partition_generator: disable garbage-collection
replica: add tombstone_gc_enabled parameter to mutation query methods
mutation/mutation_compactor: remove _can_gc member
tombstone_gc: add tombstone_gc_state factory methods for gc_all and no_gc
The primary goal of this test is to reproduce scylladb/scylladb#26040
so the fix (278019c328) can be backported
to older branches.
Scenario: connect via CQL as an anonymous user and verify that the
`sl:default` scheduling group is used. Before the fix for #26040
`main` scheduling group was incorrectly used instead of `sl:default`.
Control connections may legitimately use `sl:driver`, so the test
accepts those occurrences while still asserting that regular anonymous
queries use `sl:default`.
This adds explicit coverage on master. After scylladb#24411 was
implemented, some other tests started to fail when scylladb#26040
was unfixed. However, none of the tests asserted this exact behavior.
Refs: scylladb/scylladb#26040
Refs: scylladb/scylladb#26581
Closes scylladb/scylladb#26589
Refactor `chunked_download_source` to eliminate redundant exception
handling by leveraging the new `make_request` override with custom
retry strategy. This streamlines the download fiber logic, improving
readability and maintainability.
Reformats the `make_request` function declarations to improve readability
due to the large number of arguments. This aligns with our formatting
guidelines and makes the code easier to maintain.
Introduce an override for `make_request` in `s3_client` to support
custom retry strategies and error handlers, enabling flexibility
beyond the default client behavior and improving control over request
handling
In the `copy_part` method, move the `input_stream<char>` argument
into a local variable before use. Failing to do so can lead to a
SIGSEGV or trigger an abort under address sanitizer.
Add a missing `case` clause to the `switch` statement to correctly
handle scenarios where `unexpected_status_error` is thrown. This
fixes overlooked error handling and improves robustness.
In AWS credentials providers, replace `retryable_http_client` with
Seastar's native HTTP client. Integrate the newly added
`default_aws_retry_strategy` to handle retries more efficiently and
reduce dependency on external retry logic.
Add a new class derived from Seastar's `default_retry_strategy`.
Relocate the `should_retry` implementation from Scylla's
`default_retry_strategy` into the new class to centralize and
standardize retry behavior.
Renames the `default_retry_strategy` class to `default_aws_retry_strategy`
to clarify its association with the S3 client implementation. This avoids
confusion with the unrelated `seastar::default_retry_strategy` class.
Add a reproducer to check that the repair_time isn't updated
if the batchlog replay fails.
If repair_time was updated, tombstones could be GC'd before the
batchlog is replayed. The replay could later cause the data
resurrection.
If any batch replay failed, we cannot update repair_time as we risk the
data resurrection.
If replay of any batch needs to be retried, run the whole repair but
fail at the very end, so that the repair_time for it won't be updated.
Currently, we skip batch replay if less than batch_log_timeout passed
from the moment the batch was written. batch_log_timeout value can
be configured. If it is large, it won't be replayed for a long time.
If the tombstone will be GC'd before the batch is replayed, then we
risk the data resurrection.
To ensure safety we can skip only the batches that won't be GC'd.
In this patch we skip replay of the batches for which:
now() < written_at + min(timeout, propagation_delay)
repair_time is set as a start of batchlog replay, so at the moment
of the check we will have:
repair_time <= now()
So we know that:
repair_time < written_at + propagation_delay
With this condition we are sure that GC won't happen.
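Expressed as code, the skip condition looks roughly like this (assumed names; the real check lives in batchlog_manager):

```cpp
#include <algorithm>
#include <chrono>

using db_clock = std::chrono::system_clock;  // stand-in for Scylla's db_clock

// A batch is "too fresh to replay" (and may be skipped) only if skipping it
// cannot outlive tombstone GC after a repair.
bool too_fresh_to_replay(db_clock::time_point written_at,
                         db_clock::duration write_timeout,
                         db_clock::duration propagation_delay,
                         db_clock::time_point now) {
    return now < written_at + std::min(write_timeout, propagation_delay);
}
```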
Return a flag determining whether all the batches were sent successfully in
batchlog_manager::replay_all_failed_batches (batches skipped due to being
too fresh are not counted). Throw in repair_flush_hints_batchlog_handler
if not all batches were replayed, to ensure that repair_time isn't updated.
batchlog_manager::replay_all_failed_batches skips batches that have
an unknown or incorrect version. The next round will process these batches
again.
Such batches will probably be skipped every time, so there is no point
in keeping them. Even if at some point the version becomes correct,
we should not replay the batch - it might be old and this may lead
to data resurrection.
This PR introduces support for a new scrub option, `--drop-unfixable-sstables`, which enables dropping corrupted SSTables during scrub, only in segregate mode. The patch includes the implementation, validation, and a set of tests to ensure correct behavior and error handling.
Fixes #19060
Backport is not required, it is a new feature
Closes scylladb/scylladb#26579
* github.com:scylladb/scylladb:
sstable_compaction_test: add segregate mode tests for drop-unfixable-sstables option
test/nodetool: add scrub drop-unfixable-sstables option testcase
scrub: add support for dropping unfixable sstables in segregate mode
In the following commit, we'll introduce a new semaphore for view updates
that limits their concurrency by view update count. To avoid confusion,
we rename the existing semaphore that tracks the memory used by concurrent
view updates and related objects accordingly.
The guard should stop refreshing the ERM when the number of tablets changes. Tablet splits or merges invalidate the `tablet_id` field (`_tablet`), which means the guard can no longer correctly protect ongoing operations from tablet migrations.
The problem is specific to LWT, since `tablet_metadata_guard` is used mostly for heavy topology operations, which are mutually exclusive with split and merge. The guard was used for LWT as an optimization -- we don't need to block topology operations or migrations of unrelated tablets. In the future, we could use the guard for regular reads/writes as well (via the `token_metadata_guard` wrapper).
Fixes [scylladb/scylladb#26437](https://github.com/scylladb/scylladb/issues/26437)
backports: need to backport to 2025.4 since the bug is relevant to LWT over tablets.
Closes scylladb/scylladb#26619
* github.com:scylladb/scylladb:
test_tablets_lwt: add test_tablets_merge_waits_for_lwt
test.py: add universalasync_typed_wrap
tablet_metadata_guard: fix split/merge handling
tablet_metadata_guard: add debug logs
paxos_state: shards_for_writes: improve the error message
storage_service: barrier_and_drain – change log level to info
topology_coordinator: fix log message
Added a new test case, sstable_scrub_segregate_mode_drop_unfixable_sstables_test,
which verifies that when the drop-unfixable-sstables flag is enabled in segregate
mode, corrupted SSTables are correctly dropped.
This patch introduces the test_scrub_drop_unfixable_sstables_option testcase,
which verifies that the correct request is generated when the --drop-unfixable-sstables flag is used.
It also validates that an error is thrown if the drop-unfixable-sstables
flag is enabled and mode is not set to SEGREGATE.
This patch adds a new flag `drop-unfixable-sstables` to the scrub operation
in segregate mode, allowing SSTables that cannot be fixed during scrub to be
dropped automatically. It also includes API support for the 'drop_unfixable_sstables'
parameter and validation to ensure this flag is not enabled in modes other than segregate.
Topology version is now bumped when a node finishes bootstrapping.
As a result, fence_version == version - 1, and decrementing version
in the test no longer triggers a stale topology exception.
Fix: run cleanup_all to invoke the global barrier, which synchronizes
fence_version := version on all nodes.
Remove redundant imports and variables. Extract cleanup_all
function. Add logs. Remove pytest.mark.prepare_3_racks_cluster --
the test doesn't actually need a 3 node cluster, one initial
node is enough.
All mutation_holder::apply_locally() implementations now do the same
post-apply fence check. In this commit we hoist this check up to
abstract_write_response_handler::apply_locally().
This function is intended to replace start_write() in subsequent
commits. It provides the following benefits:
* Remove duplication: All start_write() call sites must run the fence
check after the operation completes. run_fenceable_write() encapsulates
this pattern.
* Fix a race: To ensure no new stale write operations occur during
cleanup, a fence check before start_write() was previously used.
However, yields in several code paths between the check and
start_write() made it non-atomic, allowing a stale operation to slip in
if the fence_version was updated in between.
* Optimize waiting: We do not need to wait for all operations—only for
vnode-based, non-local tables with versions smaller than the current
fence_version.
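A rough sketch of the shape of this helper (hypothetical, heavily simplified names and types; not the actual Scylla code, and the operation/gate handling is elided):

#include <cstdint>
#include <functional>
#include <stdexcept>

struct fencing_token { int64_t version; };

struct stale_topology_exception : std::runtime_error {
    stale_topology_exception() : std::runtime_error("stale topology fence version") {}
};

class proxy {
    int64_t _fence_version = 0;
public:
    void update_fence_version(int64_t v) { _fence_version = v; }

    // One place performs the write and the post-write fence check,
    // so call sites cannot yield between the check and the write.
    void run_fenceable_write(fencing_token t, const std::function<void()>& write) {
        write();                           // local storage modification
        if (t.version < _fence_version) {  // fence check after the operation completes
            throw stale_topology_exception();
        }
    }
};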
Future commits will extend update_fence_version, and it is simpler to do
so if the function resides in storage_proxy. Additionally, fence_version
is the only field this function accesses, and it is used solely within
storage_proxy, making this change natural on its own.
The operation must be held during the local write. Before this commit,
its scope ended after returning from apply_locally(), so it
did not actually provide any protection.
handle_write() is invoked from receive_mutation_handler() and
handle_paxos_learn(), and both previously performed a fence check in
apply_fn. This commit hoists the fence check into handle_write() to
reduce code duplication.
Additionally, move start_write() after get_schema_for_write(), since
there is no need to hold the operation while querying the schema.
As noted in the code comments, start_write() does not need to be held
during counter replication; it is required only while performing local
storage modifications. Move the start_write() call and the fence
check down to mutate_counter_on_leader_and_replicate().
Additionally, mutate_counters_on_leader() is updated to check for
possible stale_topology_exception() and properly package them
in the resulting exception_variant structure.
The flush_all_tables() call ensures that no obsolete, cleanup-eligible
writes remain in the commitlog. This does not need to run under the
group0 lock, so move it outside.
Also, run await_pending_writes() before flush_all_tables(), since
pending writes may include data that must be cleaned up.
Finally, add more detailed info-level logs to trace the stages of the
cleanup procedure.
Cleanup needs a barrier to make sure that no request coordinators
are sending requests to old replicas/ranges that we're going to cleanup.
For example, during node bootstrap, the cleanup
process on replicas must be protected against coordinators running
write_both_read_new and sending requests to old ranges.
We run a barrier to ensure that most data-plane requests with the old
topology finish before cleanup starts. At the same time, we do not want
to block cleanup if the barrier fails on some replicas. Once the fence
is committed to group0, we can safely proceed, since any late request
with the old topology will be fenced out on the replica.
The test for this case is added in a separate commit
"test_automatic_cleanup: add test_cleanup_waits_for_stale_writes"
Rename start_cleanup -> start_vnodes_cleanup for clarity.
Pass topology_request and server_id in start_vnodes_cleanup, we will
need them for better logging later.
Cleanup needs a barrier. For example, during node bootstrap, the cleanup
process on replicas must be protected against coordinators running
write_both_read_new and sending requests to old ranges.
We run a barrier to ensure that most data-plane requests with the old
topology finish before cleanup starts. At the same time, we do not want
to block cleanup if the barrier fails on some replicas. Once the fence is
committed to group0, we can safely proceed, since any late request with
the old topology will be fenced out on the replica.
To support this, introduce a "fenced" flag. The client can pass a pointer
to a bool, which will be set to true after the new fenced_version is
committed.
The patch e34deb72f9 (repair: Rename incremental mode name)
missed one place that references the removed regular mode name.
Fixes #26503
Closes scylladb/scylladb#26660
Group0 tombstone GC considers only the current group 0 members
while computing the group 0 tombstone GC time. It's not enough
because in the Raft-based recovery procedure, there can be nodes
that haven't joined the current group 0 yet, but they have belonged
to a different group 0 and thus have a non-empty group 0 state ID.
The current code can cause a data resurrection in group 0 tables.
We fix this issue in this PR and add a regression test.
This issue was uncovered by `test_raft_recovery_entry_loss`, which
became flaky recently. We skipped this test for now. We will unskip
it in a following PR because it's skipped only on master, while we
want to backport this PR.
Fixes #26534
This PR contains an important bugfix, so we should backport it
to all branches with the Raft-based recovery procedure (2025.2
and newer).
Closes scylladb/scylladb#26612
* github.com:scylladb/scylladb:
test: test group0 tombstone GC in the Raft-based recovery procedure
group0_state_id_handler: remove unused group0_server_accessor
group0_state_id_handler: consider state IDs of all non-ignored topology members
The universalasync.wrap function doesn't preserve the
type information, which confuses the VS Code Pylance
plugin and makes code navigation hard.
In this commit we fix the problem by adding a typed
wrapper around universalasync.wrap.
Fixes: scylladb/scylladb#26639
The guard should stop refreshing the ERM when the number of tablets
changes. Tablet splits or merges invalidate the tablet_id field
(_tablet), which means the guard can no longer correctly protect
ongoing operations from tablet migrations.
Fixes scylladb/scylladb#26437
Debugging global barrier issues is difficult without these logs.
Since barriers do not occur frequently, increasing the log level should not produce excessive output.
Among other things, the merge includes the patch "http: add "Connection:
close" header to final server response.". This Fixes#26298: A missing
response header meant that a test's client code sometimes didn't notice
that the server closed the connection (since the client didn't need to
use the connection again), which made one test flaky.
* seastar bd74b3fa...63900e03 (6):
> Merge 'Rework output_stream::slow_write()' from Pavel Emelyanov
output_stream: Fix indentation of the slow_write() method
output_stream: Remove pointless else
output_stream: Replace std::swap with std::exchange
output_stream: Unify some code-paths of slow_write()
> Merge 'Deprecate in/out streams move-assignment operator' from Pavel Emelyanov
iostream: Deprecate input/output stream default constructor and move-assignment operator
test: Sub-split test-cases
test: Don't reuse output_stream in file demo
test: Keep input_/output_stream as optional
util: Construct file_data_source in with_file_input_stream()
websocket: Construct in/out in initializer list
rpc: Wrap socket and buffers
> scripts/perftune.py: detect corrupted NUMA topology information
> Merge 'memory, smp: support more than 256 shards' from Avi Kivity
reactor, smp: allocate smp queues across all shards
memory: increase maximum shard count
memory: make cpu_id_shift and related mask dynamic
resource, memory: move memory limit calculation to memory.cc
resource: don't error if --overprovisioned and asking for more vcpus than available
> Merge 'Update perf_test text output, make columns selectable' from Travis Downs
perf_tests: enhance text output
perf_test_tests: add some check_output tests
When a `CreateTable` request creates a GSI/LSI together with the base table,
the base table is empty and we don't need to actually build the view.
In tablet-based keyspaces we can simply not create view building tasks
and mark the view build status as SUCCESS on all nodes. Then, the view
building worker on each node will mark the view as built in
`system.built_views` (`view_building_worker::update_built_views()`).
Vnode-based keyspaces will use the "old" logic of view builder, which
will process the view and mark it as built.
Fixes scylladb/scylladb#26615
This fix should be backported to 2025.4.
Closes scylladb/scylladb#26657
* github.com:scylladb/scylladb:
test/alternator/test_tablets: add test for GSI backfill with tablets
test/alternator/test_tablets: add reproducer for GSI with tablets
alternator/executor: instantly mark view as built when creating it with base table
While there is a docker interface for python, it needs to deal with
docker-in-docker issues etc. This uses pure subprocess and
stream parsing. It is meant to provide enough flexibility for all our
docker mock server needs.
Other than patching Scylla sinks to implement the new data_sink_impl::put(std::span<temporary_buffer>) overload, the PR changes the transport write_response() method to stop using output_stream::write(scattered_message), because that API is also gone.
Using newer seastar API, no need to backport
Closes scylladb/scylladb#26592
* github.com:scylladb/scylladb:
code: Fix indentation after previous patch
code: Switch to seastar API level 9
transport: Open-code invoke_with_counting into counting_data_sink::put
transport: Don't use scattered_message
utils: Implement memory_data_sink::put(net::packet)
The test should pass without the fix for scylladb/scylladb#26615,
because the `executor::update_table()` uses
`service::prepare_new_view_announcement()`, which creates view building
tasks for the view.
But it's better to add this test.
Rewrite wait_for_first_completed to return only the first completed task,
guaranteeing that all cancelled and finished tasks are awaited (and disappear).
Use wait_for_first_completed to avoid falsely passing tests in the future and issues
like #26148.
Use gather_safely to await tasks, removing the warning that a coroutine was
not awaited.
Closes scylladb/scylladb#26435
There are some environments which have corrupted NUMA topology
information, such as some instance types on AWS EC2 with specific Linux
kernel images.
On such environments, we cannot get HW information correctly from hwloc,
so we cannot proceed with optimization in perftune.
To avoid causing a script error, check the NUMA topology information and skip
running perftune if the information is corrupted.
Related scylladb/seastar#2925
Closes scylladb/scylladb#26344
When a `CreateTable` request creates a GSI/LSI together with the base table,
the base table is empty and we don't need to actually build the view.
In tablet-based keyspaces we can simply not create view building tasks
and mark the view build status as SUCCESS on all nodes. Then, the view
building worker on each node will mark the view as built in
`system.built_views` (`view_building_worker::update_built_views()`).
Vnode-based keyspaces will use the "old" logic of view builder, which
will process the view and mark it as built.
Fixes scylladb/scylladb#26615
`shared_ptr<abstract_write_response_handler>` instances are captured in the `lmutate` and `rmutate` lambdas of `send_to_live_endpoints()`. As a result, an `abstract_write_response_handler` object may outlive its removal from the `storage_proxy::_response_handlers` map -> `cancel_all_write_response_handlers()` doesn't actually wait for requests completion -> `sp::drain_on_shutdown()` doesn't guarantee all requests are drained -> `sp::stop_remote()` completes too early and `paxos_store` is destroyed while LWT local writes might still be in progress. In this PR we introduce a `write_handler_destroy_promise` to wait for such pending instances in `cancel_write_handlers()` and `cancel_all_write_response_handlers()` to prevent the `use-after-free`.
A better long-term solution might be to replace `shared_ptr` with `unique_ptr` for `abstract_write_response_handler` and use a separate gate to track the `lmutate/rmutate` lambdas. We do not actually need to wait for these lambdas to finish before sending a timeout or error response to the client, as we currently do in `~abstract_write_response_handler`.
Fixes scylladb/scylladb#26355
backport: needs to be backported to 2025.4 since #26355 is reproduced on LWT over tablets
Closes scylladb/scylladb#26408
* github.com:scylladb/scylladb:
test_tablets_lwt: add test_lwt_shutdown
storage_proxy: wait for write handler destruction
storage_proxy: coroutinize cancel_write_handlers
storage_proxy: cancel_write_handlers: don't hold a strong pointer to handler
This patch removes the dependence of the vector search module
on the cql3 module by moving the contents of cql3/type_json.hh
to types/json_utils.hh and removing the usage of the cql3 primary_key
object in vector_store_client. We also make the needed adjustments
to files that were previously using the aforementioned type_json.hh
file.
This fixes the circular dependency cql3 <-> vector_search.
Closes scylladb/scylladb#26482
The recursive call to alter_system_schema() was missing the await
keyword, which meant the coroutine was never actually executed and
the test wasn't doing what it was supposed to do.
Not backporting: Test fix only.
Closes scylladb/scylladb#26623
Add `serve` impl that does not mess with signals, and shutdown
that does not mess with threads. Also speed up standalone shutdown
to make boost tests less slow.
In clang 21, the -fextend-variable-liveness option was made
default [1] with -Og. It helps reduce "optimized out" problems while
debugging.
However, it conflicts [2] with coroutines.
To prevent problems during the upgrade to Clang 21, disable the option.
[1] 36af7345df
[2] https://github.com/llvm/llvm-project/issues/163007
Closes scylladb/scylladb#26573
Apply two main changes to the s3_client error handling
1. Add a loop to s3_client's `make_request` for the case when the retry strategy will not help since the request itself has to be updated. For example, authentication token expiration or the timestamp in the request header
2. Refine the way we handle exceptions in the `chunked_download_source` background fiber: now we carry the original `exception_ptr` and also wrap EVERY exception in `filler_exception` to prevent the retry strategy from retrying the request altogether
Fixes: https://github.com/scylladb/scylladb/issues/26483
Should be ported back to 2025.3 and 2025.4 to prevent deadlocks and failures in these versions
Closes scylladb/scylladb#26527
* github.com:scylladb/scylladb:
s3_client: tune logging level
s3_client: add logging
s3_client: improve exception handling for chunked downloads
s3_client: fix indentation
s3_client: add max for client level retries
s3_client: remove `s3_retry_strategy`
s3_client: support high-level request retries
s3_client: just reformat `make_request`
s3_client: unify `make_request` implementation
Load-and-stream is broken when running concurrently to the finalization step of tablet split.
Consider this:
1) split starts
2) split finalization executes barrier and succeeds
3) load-and-stream runs now, starts writing sstable (pre-split)
4) split finalization publishes changes to tablet metadata
5) load-and-stream finishes writing sstable
6) sstable cannot be loaded since it spans two tablets
two possible fixes (maybe both):
1) load-and-stream awaits for topology to quiesce
2) perform split compaction on sstable that spans both sibling tablets
This patch implements #1. By awaiting for topology to quiesce,
we guarantee that load-and-stream only starts when there's no
chance coordinator is handling some topology operation like
split finalization.
Fixes https://github.com/scylladb/scylladb/issues/26455.
Closes scylladb/scylladb#26456
* github.com:scylladb/scylladb:
test: Add reproducer for l-a-s and split synchronization issue
sstables_loader: Synchronize tablet split and load-and-stream
In 380f243986 we added support for rack
lists in replication options. Drivers which are not prepared to parse
that (as of now, all of them), will not create metadata object for
that keyspace. This breaks, for example, the "copy to/from" cqlsh
command. Potentially other things too.
To fix that, keep the "replication" column in the old format, and
store numeric RF there, which corresponds to the number of
replicas. Accurate options in the new format are put in
"replication_v2".
We set replication_v2 in the schema only when it differs from the old
"replication" so that the new column is not set during upgrade,
otherwise downgrade would fail. Partition tombstone is added to ensure
that pre-alter replication_v2 value is deleted on alters which change
replication to a value which is the same as the post-alter
"replication" value.
Fixes #26415
Closes scylladb/scylladb#26429
Tablet merge of base tables is only safe if there is at most one replica in each rack. For more details on why it is the case please see scylladb/scylladb#17265. If the rf-rack-valid-keyspaces is turned on, this condition is satisfied, so allow it in that case.
Fixes: scylladb/scylladb#26273
Marked for backport to 2025.4 as MVs stop being experimental there.
Closes scylladb/scylladb#26278
* github.com:scylladb/scylladb:
test: mv: add a test for tablet merge
tablet_allocator, tests: remove allow_tablet_merge_with_views injection
tablet_allocator: allow merges in base tables if rf-rack-valid=true
Load-and-stream is broken when running concurrently to the
finalization step of tablet split.
Consider this:
1) split starts
2) split finalization executes barrier and succeeds
3) load-and-stream runs now, starts writing sstable (pre-split)
4) split finalization publishes changes to tablet metadata
5) load-and-stream finishes writing sstable
6) sstable cannot be loaded since it spans two tablets
two possible fixes (maybe both):
1) load-and-stream awaits for topology to quiesce
2) perform split compaction on sstable that spans both sibling tablets
This patch implements #1. By awaiting for topology to quiesce,
we guarantee that load-and-stream only starts when there's no
chance coordinator is handling some topology operation like
split finalization.
Fixes #26455.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Currently, the data returned by `database::get_tables_metadata()` and
`database::get_token_metadata()` may not be consistent. Specifically,
the tables metadata may contain some tablet-based tables before their
tablet maps appear in the token metadata. This is going to be fixed
after issue scylladb/scylladb#24414 is closed, but for the time being
work around it by accessing the token metadata via
`table`->effective_replication_map() - that token metadata is guaranteed
to have the tablet map of the `table`.
Fixes: scylladb/scylladb#26403
Closes scylladb/scylladb#26588
shared_ptr<abstract_write_response_handler> instances are captured in
the lmutate/rmutate lambdas of send_to_live_endpoints(). As a result,
an abstract_write_response_handler object may outlive its removal from
the _response_handlers map. We use write_handler_destroy_promise to
wait for such pending instances in cancel_write_handlers() and
cancel_all_write_response_handlers() to prevent use-after-free.
A better long-term solution might be to replace shared_ptr with
unique_ptr for abstract_write_response_handler and use a separate gate
to track the lmutate/rmutate lambdas. We do not actually need to wait
for these lambdas to finish before sending a timeout or error response
to the client, as we currently do in ~abstract_write_response_handler.
Fixes scylladb/scylladb#26355
The cancel_write_handlers() method was assumed to be called in a thread
context, likely because it was first used from gossiper events, where a
thread context already existed. Later, this method was reused in
abort_view_writes() and abort_batch_writes(), where threads are created
on the fly and appear redundant.
The drain_on_shutdown() method also used a thread, justified by some
"delicate lifetime issues", but it is unclear what that actually means.
It seems that a straightforward co_await should work just fine.
A strong pointer was held for the duration of thread::yield(),
preventing abstract_write_response_handler destruction and possibly
delaying the sending of timeout or error responses to the client.
This commit removes the strong pointer. Instead, we compute the
next iterator before calling timeout_cb(), so if the handler is
destroyed inside timeout_cb(), we already have a valid next iterator.
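The iterator-safety idea in isolation (plain standard containers, purely illustrative; not the actual storage_proxy code):

#include <cassert>
#include <functional>
#include <iterator>
#include <map>

// Invoke cb on every element; cb is allowed to erase the element it is given.
// Computing the next iterator before the call keeps iteration valid even if
// the current element is destroyed inside the callback.
void for_each_may_erase(std::map<int, int>& handlers,
                        const std::function<void(std::map<int, int>::iterator)>& cb) {
    for (auto it = handlers.begin(); it != handlers.end();) {
        auto next = std::next(it);  // take the next iterator first
        cb(it);                     // cb may erase *it
        it = next;
    }
}

int main() {
    std::map<int, int> handlers{{1, 1}, {2, 2}, {3, 3}};
    for_each_may_erase(handlers, [&](auto it) {
        if (it->first == 2) {
            handlers.erase(it);  // like a handler destroyed inside timeout_cb()
        }
    });
    assert(handlers.size() == 2);
}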
UserIdentity is a map of two fields in GetRecords responses, which
always has the same value. It may be missing, or contain a constant
object with value `{"type": "Service", "principalId":
"dynamodb.amazonaws.com"}`. Currently, the latter is set only for
`REMOVE`s triggered by TTL.
This commit introduces two new CDC operation types: `service_row_delete`
and `service_partition_delete`, emitted in place of `row_delete` and
`partition_delete`. Alternator Streams treats them as regular `REMOVE`s,
but in addition adds the `userIdentity` field to the record.
This change may break existing Scylla libraries for reading raw CDC
tables, but we doubt that anybody has this use case.
Refs https://github.com/scylladb/scylladb/pull/26149
Refs https://github.com/scylladb/scylladb/pull/26121
Fixes https://github.com/scylladb/scylladb/issues/11523
Closes scylladb/scylladb#26460
This patch implements the changes required by the Vector Store authorization, as described in https://scylladb.atlassian.net/wiki/spaces/RND/pages/107085899/Vector+Store+Authentication+And+Authorization+To+ScyllaDB, that is:
- adding a new permission VECTOR_SEARCH_INDEXING, grantable only on ALL KEYSPACES
- allowing users with that permission to perform SELECT queries, but only on tables with a vector index
- increasing the number of scheduling groups by one to allow users to create a service level for a vector store user
- adjusting the tests and documentation
These changes are needed, as the vector indexes are managed by an external service, Vector Store, which needs to read the tables to create the indexes in its memory. We would like to limit the privileges of that service to a minimum to maintain the principle of least privilege; therefore we add a new permission, one that allows SELECTs conditional on the existence of a vector index on the table.
Fixes: VECTOR-201
Backport reasoning:
Backport to 2025.4 is required, as this can make upgrading clusters more difficult if we add it in 2026.1. As of now, Scylla Cloud requires version 2025.4 to enable vector search and the permission is set by the orchestrator, so there is no chance that someone will try to add this permission during an upgrade. In 2026.1 it will be more difficult.
Closes scylladb/scylladb#25976
* github.com:scylladb/scylladb:
docs: adjust docs for VS auth changes
test: add tests for VECTOR_SEARCH_INDEXING permission
cql: allow VECTOR_SEARCH_INDEXING users to select
auth: add possibilty to check for any permission in set
auth: add a new permission VECTOR_SEARCH_INDEXING
Refactor the wrapping exception used in `chunked_download_source` to
prevent the retry strategy from reattempting failed requests. The new
implementation preserves the original `exception_ptr`, making the root
cause clearer and easier to diagnose.
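As a minimal sketch of the idea (the name and exact shape of the real filler_exception are assumptions here), the wrapper carries the original std::exception_ptr so the root cause survives:

#include <exception>
#include <stdexcept>
#include <utility>

// Illustrative wrapper: signals "do not retry" to the outer logic while
// preserving the original exception for logging and diagnosis.
class download_filler_error : public std::runtime_error {
    std::exception_ptr _inner;
public:
    explicit download_filler_error(std::exception_ptr inner)
        : std::runtime_error("chunked download filler fiber failed")
        , _inner(std::move(inner)) {}
    std::exception_ptr inner() const noexcept { return _inner; }
};

// Typical use inside the background fiber:
//   try { fill_next_chunk(); }
//   catch (...) { throw download_filler_error(std::current_exception()); }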
It never worked as intended, so the credentials handling is moved to the same place where we handle time skew, since we have to reauthenticate the request.
Add an option to retry S3 requests at the highest level, including
reinitializing headers and reauthenticating. This addresses cases
where retrying the same request fails, such as when the S3 server
rejects a timestamp older than 15 minutes.
Before this commit, when the underlying materialized view was created,
it didn't have the property `tombstone_gc` set to any value. That
was a bug and we fix it now.
Two reproducer tests are added for validation. They reproduce the problem
and don't pass before this commit.
Fixes scylladb/scylladb#26542
We move the code responsible for creating the schema for the underlying
materialized view of a secondary index from `index/` to `cql3/` so that
it's close to that responsible for performing `CREATE INDEX`. That's in
line with how other CQL statements are designed.
Note that the moved method is still a method of `secondary_index_manager`.
We'll make it a method of `create_index_statement` in the following
commit.
The main goal of this patch is to give more control over the creation
of the underlying view on an index to `create_index_statement.cc`.
That goal is in line with how the other statements are executed:
the schema is built in the cql3 module and only the ready schema_ptr
is passed further. That should also make the code cleaner and easier
to understand.
There are a few important things to note here:
* A call to `service::prepare_new_view_announcement` appears out of nowhere.
Aside from some validation checks and logging, that function does pretty
much the same as the pre-existing code we remove:
a. It creates Raft mutations based on the passed `view_ptr`.
b. It creates Raft mutations responsible for view building tasks.
c. It notifies about a new column family.
* We seemingly get rid of the code that creates view building tasks. That's not
true: we still do that via `service::prepare_new_view_announcement`.
That should explain why the change doesn't remove any relevant logic.
On the other hand, it might be more difficult to explain why moving the
code is correct. I'll touch on it below.
Before that, it may also be important to highlight that this commit only
affects the logic responsible for creating an index. There should be no
effect on any other part of how Scylla behaves.
---
Proving the correctness of the solution would take quite a lot of space,
so I'll only summarize it. It relies on a few things:
1. Two schema changes cannot happen in one operation. We allow for more
but only when those changes are dependent on each other and when
the additional ones are internal for Scylla, e.g. creating an index
leads to creating the underlying materialized view.
2. There are no entities or components that rely on indexes.
3. Each index is uniquely defined by the keyspace it belongs to
and the name of the index.
4. There is a bijection between rows in `system_schema.indexes`
and the currently existing indexes.
5. The name of an unnamed index depends on the name of the base table
and the names of the indexed columns. The name of an unnamed index
may have a number attached to it, but that number only depends on
the state of the schema at the time of creation of the index, and
it never changes later on. There are no other things the name of
an unnamed index depends on.
6. Scylla doesn't allow for changing any column in the base table
that has an index depending on it.
Based on that, we conclude that every existing index has exactly one
entry in `system_schema.indexes`, and the primary key of that entry
never changes.
The columns of `system_schema.indexes` that are not part of the primary
key are: `kind` and `options`. Both values are only decided at the time
of creation of an index, and currently there's no way to modify them.
That implies that there are only two events when an entry in the system
table can change: when creating an index and when dropping an index.
---
When we consider the previous place of the logic that this commit moves
to `cql3/statements/create_index_statement.cc`, it works like this:
1. We compare the sets of indexes defined on a specific table
(in the form of a structure called `index_metadata`) before and
after an operation.
2. We divide the entries into three sets: those present in both sets
and those present in only one of them.
3. We handle each of those three sets separately.
The structure `index_metadata` is a reflection of entries in
`system_schema.indexes`. It stores one more parameter -- `local` --
but its value depends on the other values of an entry, so we can ignore
it in this reasoning.
Because an index cannot be modified -- it can only be created or dropped
-- there are at most two non-empty sets: the set of new indexes and the
set of dropped indexes. Those sets are only non-empty during an operation
like `CREATE INDEX`, `DROP INDEX`, `DROP TABLE (base table)`,
`DROP KEYSPACE`. Note that it's impossible to drop an index by dropping
the underlying materialized view -- Scylla doesn't allow for that.
However, the code in `migration_manager.cc` we call
(`prepare_column_family_update_announcement`) and the code that we call
in `schema_tables.cc` (`make_update_table_mutations`) is only triggered
by *updates* related to the base table. In the context of `DROP TABLE`
or `DROP KEYSPACE`, we'd call `prepare_column_family_drop_announcement`
instead. In other words, we're only concerned with `CREATE INDEX` and
`DROP INDEX`.
---
A conclusion from this reasoning is that we only need to consider those
two situations when talking about correctness of this change. The impact
of this commit is that we may have potentially reordered mutations in the
resulting vector that will be applied to the Raft log.
The only mutations we may have reordered are the mutations responsible for
creating the underlying view and the mutations responsible for updating
columns in the base table. It's clear then that this commit brings no change
at all: we only give `cql3/statements/create_index_statement.cc` more
control over creating the underlying view.
---
We leave a remnant of the code in `db/schema_tables.cc` responsible
for dropping an index along with its underlying view. It would require
changing a bit more of the logic, and we don't need it for the rest
of this sequence of changes.
Refs scylladb/scylladb#16454
This is a follow-up of the previous fix: https://github.com/scylladb/scylladb/pull/26030
The test test_user_writes_rejection starts a 3-node cluster and
creates a large file on one of the nodes, to trigger the out-of-space
prevention mechanism, which should reject writes on that node.
It waits for the log message 'Setting critical disk utilization mode: true'
and then executes a write expecting the node to reject it.
Currently, the message is logged before the `_critical_disk_utilization`
variable is actually updated. This causes the test to fail sporadically
if it runs quickly enough.
The fix splits the logging into two steps:
1. "Asked to set critical disk utilization mode" - logged before any action
2) "Set critical disk utilization mode" - logged after `_critical_disk_utilization` has been updated
The tests are updated to wait for the second message.
Fixes https://github.com/scylladb/scylladb/issues/26004
Closes scylladb/scylladb#26392
Integrates GCP object storage as a working storage backend for scylla sstables as well as backup storage.
Adds an abstraction layer (atm very heavily designed around the s3 client interface and usage) to allow the "storage" etc layers of sstable management to pick transparently between "s3" and "gs" providers.
This modifies the scylla config such that endpoints can optionally (through a "type" param) ref a GS backend.
Similarly with storage_options.
Also adds some IO wrapping primitives to make it more feasible to place some logic at a mid level of the implementation stack (such as making networked storage files, ranged reading etc).
Test s3 fixture is replaced (where appropriate) with an `object_storage` fixture that multiplexes the test across both backends.
Unit tests are duplicated; the GS versions use a boost test fixture for GCS, defaulting to a local fake.
Fixes #25359
Fixes #26453
Closes scylladb/scylladb#26186
* github.com:scylladb/scylladb:
docs::dev::object_storage: Add some initial info on GS storage
docs/dev: Add mention of (nested) docker usage in testing.md
sstables::object_storage_client: Forward memory limit semaphore to GS instance
utils::gcp::object_storage: Add optional memory limits to up/download
sstables::object_storage_client: Add multi-upload support for GS
utils::gcp::storage: Add merge objects operation
test_backup/test_basic: Make tests multiplex both s3 and gs backends
test::cluster::conftest: Add support for multiple object storage backends
boost::gcs_storage_test: reindent
boost::gcs_storage_test: Convert to use fixture
tests::boost: Add GS object storage cases to mirror S3 ones
tests::lib::gcs_fixture: Add a reusable test fixture for real/fake GS/GCS
tests::lib::test_utils: Add overloads/helpers for reading and (temp) writing env
sstables::object_storage_client: Add google storage implementation
test_services: Allow testing with GS object storage parameters
utils::gcp::gcp_credentials: Add option to create uninitialized credentials
utils::gcp::object_storage: Make create_download_source return seekable_data_source
utils::gcp::object_storage: Add defensive copies of string_view params
utils::gcp::object_storage: Add missing retry backoff increate
utils::gcp::object_storage: Add timestamp to object listing
utils::gcp::object_storage: Add paging support to list_objects
object_storage_client: Add object_name wrapper type
utils::gcp::object_storage: Add optional abort_source
utils::rest::client: Add abort_source support
sstables: Use object_storage_client for remote storage
sstables::object_storage_client: Add abstraction layer for OS cliens (s3 initial)
s3::upload_progress: Promote to general util type
storage_options: Abstract s3 to "object_storage" and add gs as option
sstables::file_io_extension: Change "creator" callback to just data_source
utils::io-wrappers: Add ranged data_source
utils::io-wrappers: Add file wrapper type for seekable_source
utils::seekable_source: Add a seekable IO source type
object_storage_endpoint_param: Add gs storage as option
config: break out object_storage_endpoint_param preparing for multi storage
It's not enough to consider only the current group 0 members. In the
Raft-based recovery procedure, there can be nodes that haven't joined
the current group 0 yet, but they have belonged to a different group 0
and thus have a non-empty group 0 state ID.
We fix this issue in this commit by considering topology members
instead.
We don't consider ignored nodes as an optimization. When some nodes are
dead, the group 0 state ID handler won't have to wait until all these
nodes leave the cluster. It will only have to wait until all these nodes
are ignored, which happens at the beginning of the first
removenode/replace. As a result, tombstones of group 0 tables will be
purged much sooner.
We don't rename the `group0_members` variable to keep the change
minimal. There seems to be no precise and succinct name for the used set
of nodes anyway.
We use `std::ranges::join_view` in one place because:
- `std::ranges::concat` will become available in C++26,
- `boost::range::join` is not a good option, as there is an ongoing
effort to minimize external dependencies in Scylla.
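For reference, a self-contained example of joining two existing containers with the standard facility (plain standard C++, not the actual Scylla code):

#include <array>
#include <iostream>
#include <ranges>
#include <vector>

int main() {
    std::vector<int> group0_members{1, 2, 3};
    std::vector<int> other_topology_members{4, 5};

    // An array of ref_views over the two containers, flattened lazily.
    std::array parts{std::views::all(group0_members), std::views::all(other_topology_members)};
    for (int id : parts | std::views::join) {
        std::cout << id << ' ';   // prints: 1 2 3 4 5
    }
    std::cout << '\n';
}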
llvm recently updated [1] their coroutine debugging instructions.
They now recommend looking up the variable __coro_frame in the coroutine
function rather than constructing the name of the coroutine frame type
from the ramp function plus __coro_frame_ty.
Since the latter method no longer works with Clang 21 (I did not check
why), and since the former method is blessed as being more compatible,
switch to the recommended method. Since it works with both Clang 20 and
Clang 21, it future proofs the script.
[1] 6e784afcb5
Closes scylladb/scylladb#26590
The `compaction_strategy_state` class holds strategy specific state via
a `std::variant` containing different state types. When a compaction
strategy performs compaction, it retrieves a reference to its state from
the `compaction_strategy_state` object. If the table's compaction
strategy is ALTERed while a compaction is in progress, the
`compaction_strategy_state` object gets replaced, destroying the old
state. This leaves the ongoing compaction holding a dangling reference,
resulting in a use after free.
Fix this by using `seastar::shared_ptr` for the state variant
alternatives(`leveled_compaction_strategy_state_ptr` and
`time_window_compaction_strategy_state_ptr`). The compaction strategies
now hold a copy of the shared_ptr, ensuring the state remains valid for
the duration of the compaction even if the strategy is altered.
The `compaction_strategy_state` itself is still passed by reference and
only the variant alternatives use shared_ptrs. This allows ongoing
compactions to retain ownership of the state independently of the
wrapper's lifetime.
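The ownership pattern, reduced to a self-contained sketch (hypothetical simplified types; the real classes are richer): the owner can swap the variant at any time, but an in-flight compaction holds its own shared_ptr copy, so the state it uses stays alive.

#include <cassert>
#include <memory>
#include <variant>

struct leveled_state { int level = 0; };
struct time_window_state { long window = 0; };

using leveled_state_ptr = std::shared_ptr<leveled_state>;
using time_window_state_ptr = std::shared_ptr<time_window_state>;

struct strategy_state {
    std::variant<leveled_state_ptr, time_window_state_ptr> state =
        std::make_shared<leveled_state>();
};

int main() {
    strategy_state owner;
    // An ongoing compaction copies the shared_ptr instead of keeping a reference.
    auto in_flight = std::get<leveled_state_ptr>(owner.state);

    // ALTER replaces the state while the compaction is still running...
    owner.state = std::make_shared<time_window_state>();

    // ...and the old state remains valid for the in-flight compaction.
    assert(in_flight->level == 0);
}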
The method `maybe_wait_for_sstable_count_reduction()`, when retrieving
the list of sstables for a possible compaction, holds a reference to the
compaction strategy. If the strategy is updated during execution, it can
cause a use after free issue. To prevent this, hold a copy of the
compaction strategy so it isn’t yanked away during the method’s
execution.
Fixes #25913
Issue probably started after 9d3755f276, so backport to 2025.4
Closes scylladb/scylladb#26593
* github.com:scylladb/scylladb:
compaction: fix use after free when strategy is altered during compaction
compaction/twcs: pass compaction_strategy_state to internal methods
compaction_manager: hold a copy to compaction strategy in maybe_wait_for_sstable_count_reduction
* tools/cqlsh ff3f572...f852b1f5 (2):
> Add LZ4 as a required package - so ScyllaDB Python driver could use LZ4 compression
> github actions: replace macos-13 with macos-15-intel
Closes scylladb/scylladb#26608
When requesting repair for tablets of a colocated table, the request
fails with an error. Improve the error message to show the table names
instead of table IDs, because the table names are more useful for users.
Fixes scylladb/scylladb#26567
Closes scylladb/scylladb#26568
This patch series introduces several tests that check the number of exceptions that happen during various replica operations. The goal is to have a set of tests that can catch situations where the number of exceptions per operation increases. It makes exception-throw regressions easier to catch.
The tests cover apply counter update and apply functionalities in the database layer.
There are more paths that can be checked, like various semaphore wait timeouts located deeper in the code. This set of tests does not cover all code paths.
Fixes #18164
This is an improvement. No backport needed.
Closes scylladb/scylladb#25992
* github.com:scylladb/scylladb:
test: cluster: test replica write timeout
database: parameterize apply_counter_update_delay_5s injector value
test: cluster: test replica exceptions - test rate limit exceptions
This PR adds operation per-table histograms to Alternator with item sizes involved in an operation, for each of the operations: `GetItem`, `PutItem`, `DeleteItem`, `UpdateItem`, `BatchGetItem`, `BatchWriteItem`. If read-before-write wasn't performed (i.e. it was not needed by the operation and the flag `alternator_force_read_before_write` was disabled), then we log sizes of the items that are in the request. Also, `UpdateItem` logs the maximum of the update size and the existing item size. We'll change it in a next PR.
Fixes: #25143
Closes scylladb/scylladb#25529
* github.com:scylladb/scylladb:
alternator: Add UpdateItem and BatchWriteItem response size metrics
alternator: Add PutItem and DeleteItem response size metrics
alternator: Add BatchGetItem response size metrics
alternator: Add GetItem response size metrics
alternator/test: Add more context to test_metrics.py asserts
The `compaction_strategy_state` class holds strategy specific state via
a `std::variant` containing different state types. When a compaction
strategy performs compaction, it retrieves a reference to its state from
the `compaction_strategy_state` object. If the table's compaction
strategy is ALTERed while a compaction is in progress, the
`compaction_strategy_state` object gets replaced, destroying the old
state. This leaves the ongoing compaction holding a dangling reference,
resulting in a use after free.
Fix this by using `seastar::shared_ptr` for the state variant
alternatives(`leveled_compaction_strategy_state_ptr` and
`time_window_compaction_strategy_state_ptr`). The compaction strategies
now hold a copy of the shared_ptr, ensuring the state remains valid for
the duration of the compaction even if the strategy is altered.
The `compaction_strategy_state` itself is still passed by reference and
only the variant alternatives use shared_ptrs. This allows ongoing
compactions to retain ownership of the state independently of the
wrapper's lifetime.
Fixes #25913
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
During TWCS compaction, multiple methods independently fetch the
compaction_strategy_state using get_state(). This can lead to
inconsistencies if the compaction strategy is ALTERed while the
compaction is in progress.
This patch fixes a part of this issue by passing down the state to the
lower level methods as parameters instead of fetching it repeatedly.
Refs #25913
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
The method `maybe_wait_for_sstable_count_reduction()`, when retrieving
the list of sstables for a possible compaction, holds a reference to the
compaction strategy. If the strategy is updated during execution, it can
cause a use after free issue. To prevent this, hold a copy of the
compaction strategy so it isn’t yanked away during the method’s
execution.
Refs #26546
Refs #25913
Signed-off-by: Lakshmi Narayanan Sreethar <lakshmi.sreethar@scylladb.com>
This patch introduces test `test_replica_database_apply_timeout`.
It tests timeout on database write. The test uses error injection
that returns timeout error if the injection `database_apply_force_timeout`
is enabled.
Refs #18164
Parameterize `apply_counter_update_delay_5s` injector value. Instead of
sleeping 5s when the injection is active, read parameter value that
specifies sleep duration. To reflect these changes, it is renamed to
`apply_counter_update_delay_ms` and the sleep duration is specified in
milliseconds.
Refs #18164
This patch introduces two tests for `replica::rate_limit_exception`.
One test is for write/apply limit, the other one for read/query limit.
The tests check the number of rate limit errors reported and the
number of cpp exceptions reported. If somebody adds an exception
throw on the rate limit paths, this test will catch it and fail.
Refs #18164
This PR contains various improvements in the recovery procedure
tests, mostly `test_raft_recovery_user_data`:
- decreasing the running time,
- some simplifications,
- making sure group 0 majority is lost when expected.
These are not critical test changes, so no need to backport.
Closes scylladb/scylladb#26442
* github.com:scylladb/scylladb:
test: assert that majority is lost in some tests of the recovery procedure
test: rest_client: add timeout support for read_barrier
test: test_raft_recovery_user_data: lose majority when killing one dc
test: test_raft_recovery_user_data: shutdown driver sessions
test: test_raft_recovery_user_data: use a separate driver connection for the write workload
test: test_raft_recovery_user_data: send ALTER KEYSPACE to any node
test: test_raft_recovery_user_data: bring failure_detector_timeout_in_ms back to 20 s
test: test_raft_recovery_user_data: speed up replace operations
test: stop/start servers concurrently in the recovery procedure tests
This is a minor refactoring aimed at reducing cognitive complexity of
`update_item_operation::apply`. The logic remains unchanged.
Closes scylladb/scylladb#25887
Clarified and expanded the documentation for the nodetool getendpoints command,
including detailed explanations of the --key and --key-components options.
Added examples demonstrating usage with simple and composite partition keys.
Closes scylladb/scylladb#26529
The test proceeds like this:
- run a long dns refresh process
- request hostname resolution with a short abort_source timer - the result
should be an empty list, because the request was aborted
The test sometimes finishes the long dns refresh before the abort_source fires, and the
result list is not empty.
There are two issues. First, as.reset() changes the abort_source timeout. The
patch adds a get() method to the abort_source_timeout class, so there is no
change in the abort_source timeout. Second, a sleep can be unreliable. The
patch changes the long sleep inside the dns refresh lambda into
condition_variable handling, to properly signal the end of the dns refresh
process.
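A plain-std analogy of that change (the actual test uses Seastar primitives, so names and roles here are illustrative assumptions): the refresh lambda blocks on a condition variable instead of a long sleep, and the end of the refresh is signalled at a well-defined point.

#include <condition_variable>
#include <mutex>
#include <thread>

std::mutex m;
std::condition_variable cv;
bool finish_refresh = false;

// Stand-in for the dns refresh lambda: instead of a long sleep, it stays
// "in progress" until it is explicitly told to finish.
void dns_refresh_lambda() {
    std::unique_lock<std::mutex> lock(m);
    cv.wait(lock, [] { return finish_refresh; });
}

int main() {
    std::thread refresh(dns_refresh_lambda);
    // ... the aborted resolve would be exercised here, while the refresh is still running ...
    {
        std::lock_guard<std::mutex> lock(m);
        finish_refresh = true;
    }
    cv.notify_all();  // end the refresh deterministically instead of after a timed sleep
    refresh.join();
}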
Fixes: #26561
Fixes: VECTOR-268
It needs to be backported to 2025.4
Closes scylladb/scylladb#26566
In the new API the biggest change is that the only overload to implement is
data_sink_impl::put(span<temporary_buffer>).
Encrypted file impl and sstables compress sink use fallback_put() helper
that generates a chain of continuations each holding a buffer.
The counting_data_sink in transport had mostly been patched to a correct
implementation by the previous patch; the change here is to replace
the vector argument with a span.
Most other sinks just re-implement their put(vector<temporary_buffer>)
overload by iterating over span and non-preemptively grabbing buffers
from it.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The former helper is implemented like this:
future<> invoke_with_counting(fn) {
    if (not_needed)
        return fn();
    return futurize_invoke(something).then([fn] {
        return fn();
    }).finally(something_else);
}
and all put() overloads are like
future<> put(arg) {
    return invoke_with_counting([this, arg] {
        return lower_sink.put(arg);
    });
}
The problem is that with seastar API level 9, the put() overload will
have to move the passed buffers into stable storage before preempting.
In its current implementation, when counting is needed the
invoke_with_counting will link lower_sink.put() invocation to the
futurize_invoke(something) future. Although "something" is
non-preempting, and futurize_invoke() on it returns a ready future, in
debug mode ready_future.then() does preempt, and the API level 9 put()
contract would be violated.
To facilitate the switch to the new API level, this patch rewrites one of the
put() overloads to look like
future<> put(arg) {
    if (not_needed) {
        return lower_sink.put(arg);
    }
    something;
    return lower_sink.put(arg).finally(something_else);
}
Other put()-s will be removed by next patch anyway, but this put() will
be patched and will call lower_sink.put() without preemption.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The API to put scattered_message into output_stream() is gone in seastar
API level 9; transport is the only place in Scylla that still uses it.
The change is to put the response as a sequence of temporary_buffer-s.
This preserves the zero-copy-ness of the reply, but there are a few things to
take care of.
First, the response header frame needs to be put as a zero-copy buffer
too. Although output_stream() supports a semi-mixed mode, where zero-copy
buffers can follow buffered writes, it won't apply here. The socket
is flushed() in batched mode, so even if the first reply populates the
stream with data and flushes it, the next response may happen to start
putting the header frame before the delayed flush takes place.
Second, because the socket is flushed in the batch-flush poller, the temporary
buffers that are put into it must hold the foreign_ptr with the response
object. With scattered message this was implemented with the help of a
deleter that was attached to the message; now the deleter is shared
between all buffers.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It's going to be removed by next-after-next patch, but the next one
needs this overload implemented properly, so here it is.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The series adds an experimental flag for strongly consistent tables and extends "CREATE KEYSPACE" ddl with `consistency` option that allows specifying the consistency mode for the keyspace.
Closes scylladb/scylladb#26116
* github.com:scylladb/scylladb:
schema: Allow configuring consistency setting for a keyspace
db: experimental consistent-tablets option
This patchset improves the atomicity and clarity of schema application in
the presence of token metadata updates during schema changes. The primary
focus is to ensure that changes to tablet metadata are applied atomically
as part of the schema commit phase, rather than being replicated to all
cores afterward, which previously violated atomicity guarantees.
Key changes:
- Introduced pending_token_metadata to unify handling of new and existing metadata.
- Split token metadata replication into prepare and commit steps.
- Abstracted schema dependencies in storage_service to support pending schema visibility.
- Applied tablet metadata updates atomically within schema commit phase.
Backport: no, it's a new feature
Fixes: https://github.com/scylladb/scylladb/issues/24414
Closes scylladb/scylladb#25302
* github.com:scylladb/scylladb:
db: schema_applier: update tablet metadata atomically
db: replica: move tables_metadata locking to commit
storage_service: abstract schema dependecies during token metadata update
storage_service: split replicate_to_all_cores to steps
db: schema_applier: unify token_metadata loading
replica: schema_applier: obtain copy of token_metadata at the beginning of schema merge
service: fix dependencies during migration_manager startup
db: schema_applier: move pending_token_metadata to locator
db: always use _tablet_hint as condition for tablet metadata change
db: refactor new_token_metadata into pending_token_metadata
db: rename new_token_metadata to pending_token_metadata
db: schema_applier: move types storage init to merge_types func
db: schema_applier: make merge functions non-static members
db: remove unused proxy from create_keyspace_metadata
This commit bundle introduces metrics on item sizes for Alternator operations.
The new metrics are:
- `operation_size_kib op=UpdateItem`: Tracks the size of an `UpdateItem`
operation. This is calculated as the sum of the existing item's size
plus the estimated size of the updated fields.
- `operation_size_kib op=BatchWriteItem`: Tracks the total size of items
within a `BatchWriteItem` request, aggregated on a per-table basis. If
an item already exists, the logged size is the maximum of the old and
the new item size.
NOTE: Both metrics rely on read-before-write, so if the
`alternator_force_read_before_write` option is disabled, these metrics
may be incomplete and report inaccurate sizes.
This commit bundle introduces metrics on item sizes for Alternator
operations. Specifically, this commit adds `operation_size_kb`
histograms for sizes of items created or replaced by the `PutItem`
operation, and sizes of items deleted by `DeleteItem` requests. The
latter needs a read-before-write, so the metrics may be incomplete if
`alternator_force_read_before_write` is disabled.
This commit bundle introduces metrics on item sizes for Alternator
operations. Specifically, this commit adds a `operation_size_kb`
per-table histogram, which contains item sizes in BatchGetItem requests.
A size of a BatchGetItem is the sum of the sizes of all items in the
operation grouped by table. In other words, a single BatchGetItem, and
BatchWriteItem for that matter, updates the histograms for each table
that it has items in.
This commit bundle introduces metrics on item sizes for Alternator
operations. Specifically, this commit adds a per-table
`operation_size_kb` histogram, recording the sizes of the items
contained in GetItem responses.
Schema pulls should always be disabled when group 0 is used. However,
`migration_manager::disable_schema_pulls()` is never called during
a restart with `recovery_leader` set in the Raft-based recovery
procedure, which causes schema pulls to be re-enabled on all live nodes
(excluding the nodes replacing the dead nodes). Moreover, schema pulls
remain enabled on each node until the node is restarted, which could
be a very long time.
We fix this issue and add a regression test in this PR.
Fixes #26569
This is an important bug fix, so it should be backported to all branches
with the Raft-based recovery procedure (2025.2 and newer branches).
Closes scylladb/scylladb#26572
* github.com:scylladb/scylladb:
test: test_raft_recovery_entry_loss: fix the typo in the test case name
test: verify that schema pulls are disabled in the Raft-based recovery procedure
raft topology: disable schema pulls in the Raft-based recovery procedure
Refactor `make_request` to use a single core implementation that
handles authentication and issues the HTTP request. All overloads now
delegate to this unified method.
The test test_mv_tablets_replace verifies that merging tablets of both a
view and its base table is allowed if rf-rack-valid-keyspaces option is
enabled (and it is enabled by default in the test suite).
The `allow_tablet_merge_with_views` error injection was previously used
to allow merging tablets in a table which has materialized views
attached to it. Now, the error injection is not needed because this is
allowed under the rf-rack-valid condition, which is enabled by default
in tests.
Remove the error injection from the code and adjust the tests not to use
it.
Tablet merge of base tables is only safe if there is at most one replica
in each rack. For more details on why it is the case please see
scylladb/scylladb#17265. If the rf-rack-valid-keyspaces is turned on,
this condition is satisfied, so allow it in that case.
Fixes: scylladb/scylladb#26273
We want to add strongly consistent tables as an option. We will have
two kinds of strongly consistent tables: globally consistent and locally
consistent. The former means that requests from all DCs will be globally
linearisable, while the latter means that only requests to the same DC will be
linearisable. To allow configuring all the possibilities, the patch
adds a new parameter to a keyspace definition, "consistency", that can be
configured to be `eventual`, `global` or `local`. A non-eventual setting
is supported for tablets-enabled keyspaces only. Since we want to start
with implementing local consistency, configuring global consistency will
result in an error for now.
The C++ test `test_indexing_paging_and_aggregation` is one of the slowest tests in test/boost. The reason for its slowness is that it needs a table with more rows than SELECT's "DEFAULT_COUNT_PAGE_SIZE" which was hard-coded to 10,000, so the test needed to write and read tens of thousands of rows, and did it multiple times.
It turns out the code actually had an ad-hoc mechanism to override DEFAULT_COUNT_PAGE_SIZE in a C++ test, but both this mechanism and the test itself were so opaque I didn't find it until I fixed it in a different way: What I ended up doing in this pull request is the following (each step in a separate patch):
1. Rewrite this test in Python, in the test/cqlpy framework. This was straightforward, as this test only used CQL and not internal interfaces. The reason why this test wasn't written in Python in the first place is that it was written in 2019, a year before cqlpy existed. I added extensive comments to the new tests, and I finally understood what it was doing :-)
2. I replaced the ad-hoc C++-test-only mechanism of overriding DEFAULT_COUNT_PAGE_SIZE by a bona-fide configuration parameter, `select_internal_page_size`.
3. Finally, the Python test can temporarily lower `select_internal_page_size` and use a table with much fewer rows.
After this series, the test `test_indexing_paging_and_aggregation` (which is now in Python instead of C++) takes around half a second, 20 times faster than before. I expect the speedup to be even more dramatic for the debug build.
Closes scylladb/scylladb#25368
* github.com:scylladb/scylladb:
cql: make SELECT's "internal page size" configurable
secondary index: translate test_indexing_paging_and_aggregation to Python
Before, the mutable_token_metadata_ptr containing tablet changes
was replicated to all cores in the post_commit phase, which violated
the atomicity guarantee of schema_applier; now it's incorporated into
the per-shard commit phase.
It uses service::schema_getter abstraction introduced in earlier
commit to inject "pending" schema which is not yet visible to the
whole system.
The functions prepare_token_metadata_change and commit_token_metadata_change depend
on the current schema through calls to the database service. However, during an
atomic schema change, the current schema does not yet include the pending changes.
Despite that, we want to apply token metadata changes to those pending schema
elements as well.
Currently, this is achieved by postponing token metadata changes until after the rest
of the schema is committed, but this breaks atomicity. To allow incorporating the
prepare and commit phases into schema_applier, we need to abstract the schema
dependency. This will make it possible to provide, in the following commits, an
implementation that includes visibility into pending changes, not just the currently
active schema.
Make use of the freshly introduced facility to disable
garbage-collection on a per-query basis for range scans. This is needed
so partitions that only contain garbage-collectible data are not missing
from the partition-list. When using SELECT * FROM MUTATION_FRAGMENTS(),
the user is expecting to see *all* data, even that which is dead and
garbage-collectible.
Include a test which reproduces the issue.
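A minimal sketch of the user-visible scenario (table and column names are illustrative; MUTATION_FRAGMENTS() is the statement named above):
```
from cassandra.cluster import Cluster

session = Cluster(["127.0.0.1"]).connect()

# The partition now contains only a tombstone; once that tombstone becomes
# garbage-collectible, a compacting scan could previously drop it entirely.
session.execute("DELETE FROM ks.tbl WHERE pk = 1")

# With per-query tombstone GC disabled, the dead partition still shows up here.
for row in session.execute("SELECT * FROM MUTATION_FRAGMENTS(ks.tbl)"):
    print(row)
```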
Allow disabling tombstone gc on a per-query basis for mutation queries.
This is achieved by a bool flag passed to mutation query variants like
`query_mutations_on_all_shards()` and `database::mutation_query()`,
which is then propagated down to compaction_mutation_state.
The future user (in the next patch) is the SELECT * FROM
MUTATION_FRAGMENTS() statement which wants to see dead partitions
(and rows) when scanning a table. Currently, due to garbage collections,
said statement can miss partitions which only contain
garbage-collectible tombstones.
This member is confusing: for query compaction it is initialized to `always_gc`,
while for sstable compaction it is initialized to a lambda calling into
`can_gc()`. This makes the purpose of the member hard to
understand.
The real use of this member is to bridge
mutation_partition::compact_and_expire() with can_gc(). This patch
ditches the member and creates the lambda near the call sites instead,
just like the other params to `compact_and_expire()` already are.
can_gc() now also respects _tombstone_gc.is_gc_enabled() instead of just
blindly returning true when in query mode.
With this patch, whether tombstones are collected or not in query mode
is now consistent and controlled by the tombstone_gc_state.
Currently, to disable tombstone-gc on-demand completely, one has to pass
down a bool flag along with the already required tombstone_gc_state to
the code which does the compacting.
This is redundant and confusing; the tombstone_gc_state is supposed to
encapsulate all tombstone-gc related logic in a transparent way.
Add dedicated factory methods for no-gc and gc-all, to allow creating a
tombstone_gc_state which transparently gcs for all or no tombstones.
This commit adds tests to `test_streams.py` (i.e. Alternator Streams)
checking the following cases:
* putting an item with BatchWriteItem shouldn't emit a log if the old
item and the new item are identical,
* deleting an item with BatchWriteItem shouldn't emit a log if the item
doesn't exist,
* UpdateItem shouldn't emit a log if the old item and the new item are
identical.
These cases haven't been tested until this commit.
Refs https://github.com/scylladb/scylladb/issues/6918Closesscylladb/scylladb#26396
* seastar 270476e7...bd74b3fa (20):
> memory: Decay large allocation warning threshold
> iotune: fix very long warm up duration on systems with high cpu count
> Add lib info to one line backtrace
> io: Count and export number of AIO retries
> io_queue: Destroy priority class data with scheduling group
> Merge 'Expell net::packet from output_stream API stack' from Pavel Emelyanov
code: Introduce new API level
iostream: Remove write()-s of packet/scattered_message from new API level
iostream: Convert output_stream::_zc_bufs to vector of buffers
code: Add data_sink_impl::put(std::span<temporary_buffer>) method
code: Prepare some data_sink_impl::do_put(temporary_buffer) methods
iostream: Introduce output_stream::write(span<temporary_buffer>) overload
packet: Add packet(std::span<temporary_buffer>) constructor
temporary_buffer: Add detach_front() helper
> cooking: update gnutls to 3.7.11
> file: Configure DMA alignment from block size
> util: adapt to fmt 12.0.0 API changes
> Merge 'Internalize reactor::posix_... API methods' from Pavel Emelyanov
reactor: Deprecate and internalize posix_connect()
reactor: Deprecate and internalize posix_listen()
> cooking: update fmt to modern version
> Merge 'Add prometheus bench, coroutinize prometheus' from Travis Downs
prometheus: coroutinize metrics writing
prometheus_test: add global label test
introduce metrics_perf bench
> operator co_await: use rvalue reference
> futurize::invoke: use std::invoke
> io_tester: Don't skip 0 position in sequential workflows
> io_queue: Use own logger for messages
> .clangd: tell the LSP about seastar's header style
> docker: Update to plucky
> Merge 'Convert timer test into seastar test (and a bit more)' from Pavel Emelyanov
test: Remove OK macro
test: Fix one failure check
test: Use boost checkers instead of BUG() macro
test: Fix indentation after previous patch
test: Convert timer_test into seastar test(s)
Closesscylladb/scylladb#26560
In some uses of SELECT, such as aggregation (sum() et al.), GROUP BY or
secondary index, Scylla needs to perform internal scans. These use an "internal
page size" which before this patch was always DEFAULT_COUNT_PAGE_SIZE = 10000.
There was an ad-hoc and undocumented way to override this default in C++
tests, using functions in test/lib/select_statement_utils.hh, but it
was so non-obvious that the test that most needed to override this
default - the very slow test test_indexing_paging_and_aggregation, which
would have been much faster with a lower setting - never used it.
So in this patch we replace the ad-hoc configuration functions by a
bona-fide Scylla configuration option named "select_internal_page_size".
The few C++ tests that used the old configuration functions were
modified to use the new configuration parameters. The slow test
test_indexing_paging_and_aggregation still doesn't use the new
configuration to become faster - we'll do this in the next patch.
Another benefit of having this "internal page size" as a configuration
option is that one day a user might realize that the default choice of
10,000 is bad for some reason (which I can't envision right now), so
having it configurable might come in handy.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
We do this at the end of `test_raft_recovery_entry_loss`. It's not worth
adding a separate regression test, as tests of the recovery procedure
are complicated and have a long running time. Also, we choose
`test_raft_recovery_entry_loss` out of all tests of the recovery
procedure because it does some schema changes.
Schema pulls should always be disabled when group 0 is used. However,
`migration_manager::disable_schema_pulls()` is never called during
a restart with `recovery_leader` set in the Raft-based recovery
procedure, which causes schema pulls to be re-enabled on all live nodes
(excluding the nodes replacing the dead nodes). Moreover, schema pulls
remain enabled on each node until the node is restarted, which could
be a very long time.
The old gossip-based recovery procedure doesn't have this problem
because we disable schema pulls after completing the upgrade-to-group0
procedure, which is a part of the old recovery procedure.
Fixes#26569
The Boost test test_indexing_paging_and_aggregation is one of the slowest
boost tests. But it's hard to understand why it needs to be so slow - the
C++ test code is opaque, and uncommented. The test didn't need to be in
C++ - it only uses CQL, not any internal interfaces - but it was written
in 2019, a year before test/cqlpy was created.
So before we can make this test faster, this patch translates it to
Python and adds a significant amount of comments. The new Python test is
functionally identical to the old C++ test - it is not (yet) made
smaller or faster. The new test takes a whopping 9 seconds to run on
my laptop (in dev build mode). We'll reduce that in the next patch.
As usual, the cqlpy test can also be tested on Cassandra, and
unsurprisingly, it passes.
Refs #16134 (which asks to translate more MV and SI tests to Python).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This patch adds a struct `per_request_options` used to communicate between CDC and upper abstraction layers. We need this for better compatibility with DynamoDB Streams in Alternator (https://github.com/scylladb/scylladb/issues/6918) to change operation types of log rows. This patch also adds a way to conditionally forward the item read by LWT to CDC and use it as a preimage. For now, only Alternator uses this feature.
The main changes are:
- add a struct `cdc::per_request_options` to pass information between CDC and upper abstraction layers,
- add the struct to `cas_request::apply`'s signature,
- add a possibility to provide a preimage fetched by an upper abstraction layer (to propagate a row read by Alternator to CDC's preimage). This reduces the number of reads-before-write by 1 for some **Alternator** requests and it is always safe. It's possible to use this feature also in CQL.
No backport, it's a feature.
Refs https://github.com/scylladb/scylladb/issues/6918
Refs https://github.com/scylladb/scylladb/pull/26121Closesscylladb/scylladb#26149
* github.com:scylladb/scylladb:
alternator, cdc: Re-use the row read by LWT as a CDC preimage
cdc: Support prefetched preimages
storage: Add cdc options to cas_request::apply
cdc, storage: Add a struct to pass per-mutation options to CDC
cdc: Move operations enum to the top of the namespace
Tiny code cleanup to improve readability without changing behavior.
Changes:
- remove unused variables and imports,
- remove redundant whitespaces, and a duplicated `public:` access
specifier,
- use the `is_aws` function to check if running in AWS in
test/alternator/test_metrics.py,
- other trivial changes.
Closesscylladb/scylladb#26423
Unfortunately, the test became flaky and is blocking promotion. The
cause of the flakiness is not known yet but is unrelated to other items
currently queued on the `next` branch. The investigation continues on
GitHub issue scylladb/scylladb#26534.
In the meantime, skip the test to unblock other work.
Refs: scylladb/scylladb#26534Closesscylladb/scylladb#26549
Into total and live. Currently only live partitions (those with live content) are
counted. Report live and total separately, just like we do for rows. This
allows deducing the count of dead partitions as well, which is
particularly interesting for scans.
Closesscylladb/scylladb#26548
We need to avoid reloading the schema early, as it goes via
schema_applier, which internally depends on storage_service
and on distributed_loader initializing all keyspaces.
Simply moving migration manager startup later in the code is not
easy as some services depend on it being initialized so we just
enable those feature listeners a bit later.
It never belonged to tables and views; its placement stems
from the location of the _tablet_hint handling code.
In the following commits we'll reference it in storage_service.cc.
It prepares pending_token_metadata to handle both new metadata and a copy
of existing metadata, for consistent usage in a later commit.
It also adds a shared_token_metadata getter so that we don't
need to get it from db.
This is a mechanical change which simplifies the code. The schema_applier
class is an object which holds intermediate schema merging state,
so it's fine that all schema merging functions have access to this state.
Introduce a counter metric to monitor instances where the background
filling fiber is blocked due to insufficient memory in the S3 client.
Closesscylladb/scylladb#26466
Propagates the row read by CAS to CDC's preimage to save one
read-before-write.
As of now, a preimage in Alternator Streams always contains the entire
item (see previous_item_read_command in executor.cc), so the resulting
preimage should stay the same. In other words, this change should be
transparent to users.
This commit adds support to pass a preimage selected by an upper layer
to CDC. The responsibility for the correctness of the preimage (i.e. the
selected columns, whether it's up to date, etc.) lies with the caller.
It may be improved in the future by validating the preimage, e.g. by
"slicing" the received preimage to the necessary columns.
The motivation behind this change was to reduce the number of
read-before-writes and avoid reading the row twice for Alternator
Streams in an increased compatibility mode with DynamoDB. This is to be
added in a following commit. Until then, this commit should be a no-op.
During Scylla startup, directories are created and verified in
`directories::do_verify_owner_and_mode()`. It is possible that while
retrieving file stats, a file might be removed, leading to Scylla
failing to boot.
This is particularly visible in `storage/test_out_of_space.py` tests,
which use FUSE to mount size-limited volumes. When a file that is open
by another process is removed, FUSE renames it to `.fuse_hidden*`.
In `directories::do_verify_owner_and_mode()`, the code performs a
`scan_dir` to list files and retrieves their stats to verify type, mode,
and ownership. If a file is removed while retrieving its stats, we see
errors such as:
```
Failed to get /scylladir/testlog/x86_64/dev/volumes/e0125c60-1e63-4330-bf6f-c0ea3e466919/scylla-0/hints/1/.fuse_hidden0000001800000005
```
This change makes `do_verify_owner_and_mode()` ignore files when
retrieving stats fails, avoiding spurious errors during verification.
Refs: https://github.com/scylladb/scylladb/issues/26314Closesscylladb/scylladb#26535
Not supported currently, as such tables have no memtables, cache or
sstables, so any select * from mutation_fragments() query will return an
empty result.
Detect virtual tables and return their content with a distinct
'virtual-table' mutation_source designation.
Add new --input-format command line argument. Possible values are json
(current) and cql (new -- added in this patch).
When --input-format=cql (new default), the input-file is expected to
contain CQL INSERT, UPDATE or DELETE statements, separated by semicolons.
The input file can contain any number of statements, in any order. The
statements will be executed and applied to a memtable, which is then
flushed to create an sstable with the content generated from the
statement. The memtable's size is capped at 1MiB; if it reaches this
size, it is flushed and recreated. Consequently, multiple sstables can
be created from a single scylla-sstable write --input-format=cql
operation.
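For illustration, driving the new mode could look roughly like this (a hedged sketch: --input-format and the semicolon-separated input file come from the description above, while the remaining flags and paths are assumptions about the tool's interface):
```
import pathlib
import subprocess
import tempfile

workdir = pathlib.Path(tempfile.mkdtemp())
input_file = workdir / "mutations.cql"
input_file.write_text("""
INSERT INTO ks.tbl (pk, ck, v) VALUES (1, 1, 'a');
UPDATE ks.tbl SET v = 'b' WHERE pk = 1 AND ck = 2;
DELETE FROM ks.tbl WHERE pk = 2;
""")

# Statements are applied to a memtable (capped at 1MiB) which is flushed into
# one or more sstables. Flag names other than --input-format are illustrative.
subprocess.run([
    "scylla", "sstable", "write",
    "--schema-file", "schema.cql",
    "--input-format", "cql",
    "--input-file", str(input_file),
    "--output-dir", str(workdir),
], check=True)
```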
We include more relevant information for debugging purposes:
the remaining bytes and the size. It might be useful to determine
where exactly an error occurred and help reason about it.
Closesscylladb/scylladb#26486
This patch makes three small mostly-cosmetic improvements to a test in
test/alternator/test_streams.py:
1. The test is renamed "test_streams_deleteitem_old_image_no_ck" to
emphasize its focus on the combination of deleteitem, old image,
and no ck. The "putitem" we had in the name was not relevant, and
the "old_image" was missing and important.
2. Moreover, using PutItem in this test just to set up the test scenario
mixed the bug which the test tries to reproduce with a different
only-recently-fixed bug (that PutItem also generated a spurious
"REMOVE" event). So I replaced the use of PutItem with UpdateItem,
to make this test independent of the other bug. Test independence is
important because it allows us - if we want - to backport a fix for
just one bug independently of the fix to the other bug.
3. Also improved the comment in front of the test to mention where we
already tested the with-ck case, and to mention issue 26382,
which this test reproduces (the xfail line also mentions it, but
the xfail line will be removed when the bug is fixed, while the
mention in the comment will remain - and should remain).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closesscylladb/scylladb#26526
This series of patches improves test vector_store_client_test stability. The primary issue with flaky connections was discovered while working on PR #26308.
Key Changes:
- Fixes premature connection closures in the mock server:
The mock HTTP server was not consuming request payloads, causing it to close connections immediately after a response. Subsequent tests attempting to reuse these closed connections would fail intermittently, leading to flakiness. The server has been updated to handle payloads correctly.
- Removes a retry workaround:
With the underlying connection issue resolved, the retry logic in the vector_store_client_test_ann_request test is no longer needed and has been removed.
- Mocks the DNS resolver in tests:
The vector_store_client_uri_update_to_invalid test has been corrected to mock DNS lookups, preventing it from making real network requests.
- Corrects request timeout handling:
A bug has been fixed where the request timeout was not being reset between consecutive requests.
- Unifies test timeouts:
Timeouts have been standardized across the test suite for consistency.
Fixes: #26468
It is recommended to backport this series to the 2025.4 branch. Since these changes only affect test code and do not alter any production logic, the backport is safe. Addressing this test flakiness will improve the stability of the CI pipeline and prevent it from blocking unrelated patches.
Closesscylladb/scylladb#26374
* github.com:scylladb/scylladb:
vector_search: Unify test timeouts
vector_search: Fix missing timeout reset
vector_search: Refactor ANN request test
vector_search: Fix flaky connection in tests
vector_search: Fix flaky test by mocking DNS queries
In the next patches a new input format will be introduced, which can
produce multiple output sstables. To prepare for this, consolidate the
code which produces an sstable into a reusable lambda function.
Moves code around, reduces churn in next patches. Indentation is left
broken for easier review.
Make error messages more generic, so they are not specific to select.
Make it a template on the type of cql statement for the final check. To
avoid templating the whole thing, the function is split into two.
Parametrize the name of the allowed statement types in said check.
Prepares the method to be shared between query operation and write
operation (future change).
While at it, also change query param type to std::string_view to avoid
some copies.
This transformation enables an existing schema to be created as a table
in cql_test_env, to be used to read/write sstables belonging to said
schema.
Extract this into a method, to be shared by a future operation which
will also want to do this.
Augments the object storage document with config options etc for
using GS instead of S3.
TODO: add proper gsutil command line examples for manual managing of
GCP storage.
Adds an optional memory semaphore to limit memory buffer usage in sink/source.
Note that we don't do exact bookkeeping, to avoid deadlock issues in higher layers.
In upload, we over-lease on the first buffer put to ensure we can fill at least
the desired 8M of buffers. We try to adjust when going over, and if we
fail, we fail, but at least the upload will be initiated -> memory is soon released.
On the next put, we try to grab multiples of 8M again, and so forth. This
potentially causes waiting for resources, but never leaves us without at least
one active sink uploading.
For download (source), we try to get a lease for as much as we want to read,
but if we fail, we adjust this down to 256k and download anyway. Since this
will typically be released immediately, we at least don't overrun for long,
and again, avoid fully stopping, throttling the rate instead.
Adds an `object_storage` fixture with parametrization to iterate through
's3' and 'gs' backends.
For the former, it will instantiate the `s3_server` backend (modified to better
handle being an actual temporary, function-level server).
For the latter, will either give back a frontend if env vars indicating
"real" GS buckets and endpoints are used, or launch a docker image for
fake-gcs-server on a free port.
Please read the comment in the code about the management of server output,
as this is less than optimal atm, but I can't figure out the issue with it.
All returned fixture objects will respond to `address`, `bucket` properties,
as well as be able to create endpoint config objects for scylla.
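In pytest terms, the shape of the fixture is roughly the following (a sketch only: helper details are simplified and all names besides the fixture's are illustrative):
```
import os
from dataclasses import dataclass

import pytest

@dataclass
class ObjectStorageBackend:
    kind: str      # 's3' or 'gs'
    address: str
    bucket: str

@pytest.fixture(params=["s3", "gs"])
def object_storage(request):
    if request.param == "s3":
        # Real fixture: instantiate the s3_server mock for this test function.
        backend = ObjectStorageBackend("s3", "127.0.0.1:9000", "test-bucket")
    elif os.environ.get("GS_ENDPOINT") and os.environ.get("GS_BUCKET"):
        # "Real" GS bucket and endpoint taken from the environment.
        backend = ObjectStorageBackend("gs", os.environ["GS_ENDPOINT"], os.environ["GS_BUCKET"])
    else:
        # Real fixture: launch a fake-gcs-server docker container on a free port.
        backend = ObjectStorageBackend("gs", "127.0.0.1:4443", "test-bucket")
    yield backend
```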
To avoid having to async wait for creating credentials, allow lazy
init (in actual token renew) of credentials. This is not super
pleasant, since it means any error will be late, but it is required
more or less for the code paths into which we intend to place this.
Since, given the nature of object storage APIs, it is no more complicated to
provide a reasonable implementation of a seekable, limited interface,
give this back, which in turn means upper layers can provide easy read-only file
interfaces. Hint hint.
Since both are bucket+prefix oriented, we can basically use the same
options for both, distinguished only by the actual protocol.
Abstract the types and the helper parse routines etc. to handle either.
Use "gs" as the term for GCS (Google Cloud Storage), since this is the
URL scheme used.
Because the concept of pushing a read range down does not work for the wrapping
we do (i.e. encryption), there is no point having it here. We need to do
said range handling higher up.
Also, we must allow multi-layered wrapping.
Extension of data_source, with the ability to
a.) Seek in any direction, i.e. move backwards. Thus not pure stream.
b.) Read a limited number of bytes.
The very transparent reason for the interface is to have a base
abstraction for providing a read-only file layer for networked
resources.
Moves the config wrapper to own file (to reduce recompilation for modifying)
and refactors to handle extending this parameter to non-s3 endpoint configs.
We add missing `const`-qualifiers wherever possible in the module.
A few smaller changes were included as a bonus.
Backport: not needed. This is a cleanup.
Closesscylladb/scylladb#26485
* github.com:scylladb/scylladb:
index/secondary_index_manager: Take std::span instead of std::vector
index/secondary_index_manager: Add missing const qualifier
index/vector_index: Add missing const qualifiers
cql3/statements/index_prop_defs.cc: Remove unused include
cql3/statements/index_prop_defs.cc: Mark function as TU-local
cql3/statements/index_prop_defs: Mark methods as const-qualified
ZSTD_CDict needs a big contiguous allocation and there's no way around that.
The only thing to do is relax the warning appropriately.
Closesscylladb/scylladb#25393
So tombstones can be purged correctly based on the tombstone gc mode.
Currently if repair-mode is used, tombstones are not purged at all,
which can lead to purged tombstone being re-replicated to replicas which
already purged them via read-repair.
This is not a correctness problem: tombstones are not included in the data
query result or digest; these purgeable tombstones are only a nuisance
for read repair, where they can create extra differences between
replicas. Note that for the read repair to trigger, some difference
other than in purgeable tombstones has to exist, because as mentioned
above, these are not included in digests.
Fixes: scylladb/scylladb#24332Closesscylladb/scylladb#26351
Batches that fail on the initial send are retried later, until they
succeed. These retries happen with CL=ALL, regardless of what the
original CL of the batch was. This is unnecessarily strict. We tried to
follow Cassandra here, but Cassandra has a big caveat in their use of
CL=ALL for batches. They accept saving just a hint for any/all of the
endpoints, so a batch which was just logged in hints is good enough for
them.
We do not plan on replicating this usage of hints at this time, so as a
middle ground, the CL is changed to EACH_QUORUM.
Fixes: scylladb/scylladb#25432Closesscylladb/scylladb#26304
It turns out that Boost assertions are thread-unsafe,
(and can't be used from multiple threads concurrently).
This causes the test to fail with cryptic log corruptions sometimes.
Fix that by switching to thread-safe checks.
Fixesscylladb/scylladb#24982Closesscylladb/scylladb#26472
The `describe_multi_item` function treated the last reference-captured
argument as the number of used RCU half units. The caller
`batch_get_item`, however, expected this parameter to hold an item size.
This RCU value was then passed to
`rcu_consumed_capacity_counter::get_half_units`, treating the
already-calculated RCU integer as if it were a size in bytes.
This caused a second conversion that undercounted the true RCU. During
conversion, the number of bytes is divided by `RCU_BLOCK_SIZE_LENGTH`
(=4KB), so the double conversion divided the number of bytes by 16 MB.
The fix removes the second conversion in `describe_multi_item` and
changes the API of `describe_multi_item`.
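A small worked example of the undercounting (constants follow the description above; the helper name is illustrative, not the actual implementation):
```
import math

RCU_BLOCK_SIZE_LENGTH = 4 * 1024  # 4KB per RCU half unit

def get_half_units(nbytes):
    # Illustrative version of rcu_consumed_capacity_counter::get_half_units().
    return max(1, math.ceil(nbytes / RCU_BLOCK_SIZE_LENGTH))

item_size = 100 * 1024                 # a 100 KiB item
correct = get_half_units(item_size)    # 25 half units
buggy = get_half_units(correct)        # 1 half unit: the RCU count is treated
                                       # as a byte size and converted again
```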
Fixes: https://github.com/scylladb/scylladb/pull/25847Closesscylladb/scylladb#25842
The `vector_store_client_test` could be flaky because the request timeout
was not consistently reset in all code paths. This could lead to a
timeout from a previous operation firing prematurely and failing the
test.
The fix ensures `abort_source_timeout` is reset before each request.
The implementation is also simplified by changing
`abort_source_timeout::reset` so that it combines the reset and arm
operations into a single invocation.
Refactor the `vector_store_client_test_ann_request` test to use the
`vs_mock_server` class, unifying the structure of the test cases.
This change also removes retry logic that waited for the server to be ready.
This is no longer necessary because the handler now exists for all index names
and consumes the entire request payload, preventing connection closures.
Previously, the server did not handle requests for unconfigured
indexes, which caused the connection to close. This could lead to a
race condition where the client would attempt to reuse a closed
connection.
The vector store mock server was not reading the ANN request body,
which could cause it to prematurely close the connection.
This could lead to a race condition where the client attempts to reuse a
closed connection from its pool, resulting in a flaky test.
The fix is to always read the request body in the mock server.
The `vector_store_client_uri_update_to_invalid` test was flaky because
it performed real DNS lookups, making it dependent on the network
environment.
This commit replaces the live DNS queries with a mock to make the test
hermetic and prevent intermittent failures.
`vector_search_metrics_test` test did not call configure{vs},
as a consequence the test did real DNS queries, which made the test
flaky.
The refreshes counter increment has been moved to before the call to the resolver.
In tests the resolver is mocked, so the increment in the production code was
never reached. Without this change, there is no way to test DNS counter increments.
The change also simplifies the test making it more readable.
Expecting the group 0 read barrier to succeed with a timeout of 1s, just
after restarting 3 out of 5 voters, turned out to be flaky. In some
unlikely scenarios, such as multiple vote splits, the Raft leader
election could finish after the read barrier times out.
To deflake the test, we increase the timeout of Raft operations back to
300s for read barriers we expect to succeed.
Fixes#26457Closesscylladb/scylladb#26489
Using the name regular as the incremental mode could be confusing, since
regular might be interpreted as the non-incremental repair. It is better
to use incremental directly.
Before:
- regular (standard incremental repair)
- full (full incremental repair)
- disabled (incremental repair disabled)
After:
- incremental (standard incremental repair)
- full (full incremental repair)
- disabled (incremental repair disabled)
Fixes#26503Closesscylladb/scylladb#26504
Using `driver_connect()` after a cluster restart isn't enough to ensure
full CQL availability, but the test assumes that it is.
Fix that by making the test wait for CQL availability via `get_ready_cql()`.
Also, replace some manual usages of wait_for_cql_and_get_hosts with
`get_ready_cql()` too.
Fixesscylladb/scylladb#25362Closesscylladb/scylladb#25366
db/view/view_building_worker: move discover_existing_staging_sstables() to the foreground
This patch moves `discover_existing_staging_sstables()` to be executed
from main level, instead of running it on the background fiber.
This method needs to be run only once during startup to collect
existing staging sstables, so there is no need to do it in the
background. This change will increase the debuggability of any further issues
related to it (like https://github.com/scylladb/scylladb/issues/26403).
Fixes https://github.com/scylladb/scylladb/issues/26417
The patch should be backported to 2025.4
Closesscylladb/scylladb#26446
* github.com:scylladb/scylladb:
db/view/view_building_worker: move discover_existing_staging_sstables() to the foreground
db/view/view_building_worker: futurize and rename `start_background_fibers()`
There was a race between loop in `view_building_worker::run_view_building_state_observer()`
and a moment when a batch was finishing its work (`.finally()` callback
in `view_building_worker::batch::start()`).
State observer waits on `_vb_state_machine.event` CV and when it's
awoken, it takes group0 read apply mutex and updates its state. While
updating the state, the observer looks at `batch::state` field and
reacts to it accordingly.
On the other hand, when a batch finishes its work, it sets `state` field
to `batch_state::finished` and does a broadcast on
`_vb_state_machine.event` CV.
So if the batch executes the callback in `.finally()` while the
observer is updating its state, the observer may miss the event on the
CV and never notice that the batch has finished.
This patch fixes this by adding a `some_batch_finished` flag. Even if
the worker misses an event on the CV, it will notice that the flag
was set and will do the next iteration.
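The lost-wakeup pattern and the flag-based fix can be illustrated with a small asyncio analogy (this is not the actual seastar/C++ code; names mirror the description above):
```
import asyncio

class VbStateMachine:
    def __init__(self):
        self.event = asyncio.Condition()
        self.some_batch_finished = False     # the flag added by this patch

async def batch_finish(vbsm: VbStateMachine):
    async with vbsm.event:
        vbsm.some_batch_finished = True      # recorded even if nobody is waiting
        vbsm.event.notify_all()              # analogous to the CV broadcast

async def state_observer(vbsm: VbStateMachine):
    while True:
        async with vbsm.event:
            # Without the flag, a broadcast issued while the observer is busy
            # (i.e. not waiting) would simply be lost and the loop would hang.
            await vbsm.event.wait_for(lambda: vbsm.some_batch_finished)
            vbsm.some_batch_finished = False
        # ... take the group0 read-apply mutex, look at batch::state, react ...
```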
Fixesscylladb/scylladb#26204Closesscylladb/scylladb#26289
In f828fe0d59 ("setup: add the lazytime XFS version") we added the
lazytime mount option to /var/lib/scylla, but it was quickly reverted
(8f5e80e61a) as it caused a regression on CentOS 7.
We reinstate it now with a kernel version check. This will avoid
the lazytime mount option on CentOS 7, which is unsupported anyway.
The lazytime option avoids marking the inode as dirty if it's only for the
purpose of updating mtime/ctime. This won't help much while writing sstables
(since the write also updates extent information), but may help a little
with commitlog writes, since those are pure overwrites.
It likely won't help with the RWF_NOWAIT violations seen in [1], since
those are likely due to in-memory locking, not flushing dirty inodes
to disk.
Tested with an install to Ubuntu 24.04 LTS followed by a scylla_setup run.
The lazytime option was added to the .mount file and showed up in
the live mount.
[1] https://github.com/scylladb/seastar/issues/2974
Closes scylladb/scylladb#26436
Fixes#26002
This will allow us to communicate with CDC from higher layers. We plan
to use it to reduce the number of read-before-writes with preimages by
passing the row selected in upper layers.
The test uses CQL tracing to check which files were read by a query.
This is flaky if the coordinator and the replica are different shards,
because the Python driver only waits for the coordinator, and not
for replicas, to finish writing their traces.
(So it might happen that the Python driver returns a result
with only coordinator events and no replica events).
Let's just dodge the issue by using --smp=1.
Fixesscylladb/scylladb#26432Closesscylladb/scylladb#26434
We noticed during work on scylladb/seastar#2802 that on i7i family
(later proved that it's valid for i4i family as well),
the disks report the physical sector size incorrectly
as 512 bytes, whilst we proved we can achieve much better write IOPS with
4096 bytes.
This is not the case on the AWS i3en family, where the reported 512-byte
physical sector size is also the size with which we achieve the best write IOPS.
This patch works around this issue by changing `scylla_io_setup` to parse
the instance type out of `/sys/devices/virtual/dmi/id/product_name`
and run iotune with the correct request size based on the instance type.
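A sketch of the workaround's logic (the instance-family mapping and any flag names are illustrative; the real script derives the iotune request size from the product name it reads):
```
from pathlib import Path

def iotune_request_size() -> int | None:
    # e.g. "i7i.4xlarge" -> family "i7i"
    product = Path("/sys/devices/virtual/dmi/id/product_name").read_text().strip()
    family = product.split(".")[0]
    if family in ("i4i", "i7i"):
        # Disks report 512-byte physical sectors, but 4096-byte writes give
        # much better write IOPS on these families.
        return 4096
    return None   # keep iotune's default behaviour (e.g. on i3en)

# scylla_io_setup would then pass the chosen request size to iotune
# (the exact flag is omitted here on purpose).
```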
Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>
Closesscylladb/scylladb#25315
Pass an appropriate query state for auth queries called from the service
level cache reload. We use the function qos_query_state to select a
query_state based on the caller context - for internal queries, we set a
very long timeout.
The service level cache reload is called from group0 reload. We want it
to have a long timeout instead of the default 5 seconds for auth
queries, because on the one hand we don't have a strict latency requirement,
and on the other hand a timeout exception is undesired in the
group0 reload logic and can break group0 on the node.
Fixes https://github.com/scylladb/scylladb/issues/25290
backport possible to improve stability
Closesscylladb/scylladb#26180
* github.com:scylladb/scylladb:
service/qos: set long timeout for auth queries on SL cache update
auth: add query_state parameter to query functions
auth: refactor query_all_directly_granted
This patch moves `discover_existing_staging_sstables()` to be executed
from main level, instead of running it on the background fiber.
This method needs to be run only once during startup to collect
existing staging sstables, so there is no need to do it in the
background. This change will increase the debuggability of any further issues
related to it (like scylladb/scylladb#26403).
Fixesscylladb/scylladb#26417
Next commit will move `discover_existing_staging_sstables()`
to the foreground, so to prepare for this we need to futurize
`start_background_fibers()` method and change its name to better reflect
its purpose.
`sl:driver` is expected to be used for new and control connections,
but other connections that run user load should not use it after
the user is authenticated.
Refs: scylladb/scylladb#24411
Before `sl:driver` was introduced, service levels were assigned as
follows:
1. New connections were processed in `main`.
2. After user authentication was completed, the connection's SL was
changed to the user's SL (or `sl:default` if the user had no SL).
This commit introduces `service_level_state` to `client_state` and
implements the following logic in `transport/server`:
1. If `sl:driver` is not present in the system (for example, it was
removed), service levels behave as described above.
2. If `sl:driver` is present, the flow is:
I. New connections use `sl:driver`.
II. After user authentication is completed, the connection's SL is
changed to the user's SL (or `sl:default`).
III. If a REGISTER (for events) request is handled, the client is
using this connection as the control connection. We mark the client_state
to permanently use `sl:driver`.
The aforementioned state `2.III` is represented by
`_control_connection` flag in `client_state`.
Fixes: scylladb/scylladb#24411
Before this change, unauthorized connections stayed in `main`
scheduling group. This is not ideal; in such a case `sl:default`
should be used instead, to have behavior consistent with the scenario
where a user is authenticated but has no service level assigned
to them.
This commit adds a call to `update_scheduling_group` at the end of
connection creation for an unauthenticated user, to make sure the
service level is switched to `sl:default`.
Fixes: scylladb/scylladb#26040
Before this change, new connections were handled in a default
scheduling group (`main`), because before the user is authenticated
we do not know which service level should be used. With the new
`sl:driver` service level, creation of new connections can be moved to
`sl:driver`.
We switch the service level as early as possible, in `do_accepts`.
There is a possibility that `sl:driver` will not exist yet, for
instance in specific upgrade cases, or if it was removed. Therefore,
we also switch to `sl:driver` after a connection is accepted.
Refs: scylladb/scylladb#24411
Driver service level is a special service level that is created
automatically by the system. Therefore, it requires special handling
in DESC SCHEMA WITH INTERNALS, and these tests verify that special
behavior.
Refs: scylladb/scylladb#24411
This commit:
- Increases the number of allowed scheduling groups to allow the
creation of `sl:driver`.
- Adds the `DRIVER_SERVICE_LEVEL` feature, which prevents creating
`sl:driver` until all nodes have increased the number of
scheduling groups.
- Starts using `get_create_driver_service_level_mutations`
to unconditionally create `sl:driver` on
`raft_initialize_discovery_leader`. The purpose of this code
path is ensuring existence of `sl:driver` in new system and tests.
- Starts using `migrate_to_driver_service_level` to create `sl:driver`
if it is not already present. The creation of `sl:driver` is
managed by `topology_coordinator`, similar to other system keyspace
updates, such as the `view_builder` migration. The purpose of this
code path is handling upgrades.
- Modifies related tests to pass after `sl:driver` is added.
Later in this patch series, `sl:driver` will be used by
`transport/server` to handle selected traffic, such as the driver's
schema and topology fetches.
Refs: scylladb/scylladb#24411
This commit implements `get_create_driver_service_level_mutations`
and `migrate_to_driver_service_level` in service_level_controller.
Both methods create `sl:driver` with shares=200 and store this fact
in `system.scylla_local`. Both methods will be used later in this
patch series for automatic creation of sl:driver.
Refs: scylladb/scylladb#24411
Later in this patch series, `sl:driver` will be added as a special
service level created automatically by the system. It needs special
handling in `DESC SCHEMA ...` to ensure that during backup restore:
1. CREATE SERVICE LEVEL does not fail if `sl:driver` already exists
2. If `sl:driver` exists, its configuration is fully restored (emit
ALTER SERVICE LEVEL).
3. If `sl:driver` was removed, the information is retained (emit
DROP SERVICE LEVEL instead of CREATE/ALTER).
Refs: scylladb/scylladb#24411
This adds a reference to sl_controller so that, later in this patch
series, topology_coordinator can manage creating `sl:driver` once
group0 is fully operational.
Refs: scylladb/scylladb#24411
This commit extends the system.scylla_local table with an additional
key/value pair that can be used later in this patch series to
record that `sl:driver` was already created. The purpose
of storing this information is to ensure that `sl:driver` is
not recreated after being intentionally removed.
A new mutation is included in `register_raft_pull_snapshot` to keep
`service_level_driver_created` in the state machine snapshot, which is
required for proper propagation of the value when a new node is added
to the cluster.
Refs: scylladb/scylladb#24411
Previously, tests used the hardcoded value 7 for the maximum number of
user service levels. This commit introduces a named variable that can
be shared across tests to avoid cases where this magic number goes
out of sync.
The voter handler caused `test_raft_recovery_user_data` to stop losing
group 0 majority when expected. We make sure this won't happen again
in this commit.
We don't change `test_raft_recovery_entry_loss` because it has some
checks that would fail with group 0 majority (schema versions would
match).
Note that it's possible to timeout the read barrier quickly without the
`timeout` parameter. See e.g. `test_cannot_add_new_node` in
`test_raft_no_quorum.py`. We don't take this approach here because we
don't want to change the default Raft parameters in the recovery
procedure tests.
After introducing the voter handler, the test stopped losing group 0
majority when expected because the killed dc contained 2 out of 5
voters. We fix it in this commit. The fix relies on the voter handler
not doing unnecessary work. The first dc should keep its voters and
majority.
The test was functional even though majority wasn't lost when expected.
Stopping the recovery leader before restarting it with `recovery_leader`
caused majority loss in the old group 0. Hence, there is no need to
backport this commit.
Shutting down `ccluster_all_nodes` in the previous commit is necessary
to avoid flakiness. It turns out that leaked driver sessions can impact
another run of the test case (with different parameterization). Here,
without shutting down `ccluster_all_nodes`, we could observe the DDL
requests from `start_writes` fail in the second test case run
(where `remove_dead_nodes_with == "replace"`) like this:
```
> await cql.run_async(f"USE {ks_name}")
E cassandra.cluster.NoHostAvailable: ('Unable to complete the
operation against any hosts', {<Host: 127.46.35.70:9042 dc1>:
ConnectionException('Host has been marked down or removed'),
<Host: 127.46.35.71:9042 dc1>: ConnectionException('Host has
been marked down or removed'), <Host: 127.46.35.3:9042 dc1>:
ConnectionException('Host has been marked down or removed'),
<Host: 127.46.35.25:9042>: ConnectionException('Host has
been marked down or removed')})
```
We could also see errors like this on the driver:
```
cassandra.InvalidRequest: Error from server: code=2200 [Invalid query]
message="Keyspace 'test_1759763911381_oktks' does not exist"
```
It turned out that `test_1759763911381_oktks` was created in the first
test case run (where `remove_dead_nodes_with == "remove"`), and somehow
the driver session created in the second test case run was still using
this keyspace in some way. The DDL requests were failing on the Scylla
side with the error above, and after some retries, the driver marked
nodes as down. I didn't try to investigate what exactly the driver was
doing.
In this commit, we shut down other driver sessions used in this test.
They didn't cause problems so far, but we'd better use the Python driver
correctly and be safe.
It's simpler than pausing the workload for the `cql` reconnection.
Moreover, the removed `start_writes` call required group 0 majority for
(redundant) CREATE KEYSPACE IF NOT EXISTS and CREATE TABLE IF NOT EXISTS
statements. The test shouldn't have group 0 majority at that point,
which is fixed in one of the following commits.
Using a separate driver connection also allows us to call
`finish_writes()` a bit later, after the `cql` reconnection.
It looks like decreasing `failure_detector_timeout_in_ms` doesn't make
the shutdown faster anymore.
We had some changes related to requests during shutdown like #24499
and #24714. They are probably the reason.
The current description is not accurate: the function doesn't throw
an exception if there's an invalid materialized view. Instead, it
simply logs the keyspaces that violate the requirement.
Furthermore, the experimental feature `views-with-tablets` is no longer
necessary for considering a materialized view as valid. It was dropped
in scylladb/scylladb@b409e85c20. The
replacement for it is the cluster feature `VIEWS_WITH_TABLETS`.
Fixesscylladb/scylladb#26420Closesscylladb/scylladb#26421
This patch adds tests for:
- tablet migration during view building
- tablet merge during view building.
Those tests were missing from the original testing plan.
We want to backport it to 2025.4 to ensure the release is bug-free.
Closesscylladb/scylladb#26414
* github.com:scylladb/scylladb:
test/cluster/test_view_building_coordinator: add test for tablet merge
test/cluster/test_view_building_coordinator: add test for tablet migration
Seastar httpd recommended that users stop using the contiguous request.content string and instead read the body they need from the request's input_stream. However, the "official" deprecation of request content was only made recently.
This PR patches the REST API server to turn this feature on and patches a few handlers that mess with request bodies to read them from the request stream.
Using newer seastar API, no need to backport
Closesscylladb/scylladb#26418
* github.com:scylladb/scylladb:
api: Switch to request content streaming
api: Fix indentation after previous patch
api: Coroutinize set_relabel_config handler
api: Coroutinize set_error_injection handler
This dependency reference is carried into the column_family handlers block to make the get_built_indexes handler work. However, the handler in question should live in the view_builder block, because it works with v.b. data. This PR moves the handler there, coroutinizes it while at it, and removes the no longer needed sys.ks. reference from column_family.
API dependencies cleanup work, no need to backport
Closesscylladb/scylladb#26381
* github.com:scylladb/scylladb:
api: Fix indentation after previous patch
api: Coroutinize get_built_indexes handler code
api: Remove system_keyspace ref from column_family API block
api: Move get_built_indexes from column_family to view_builder
If misused, the script says
error: unrecognized option: ..., see ./scripts/pull_github_pr.sh -h for usage
but using the suggested -h option prints just the same message.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closesscylladb/scylladb#26378
The PR #26154 dropped the `-fvisibility=hidden` compiler flag and
replaced it with `-fvisibility-inlines-hidden` as the former caused
issues in how the `noncopyable_function::operator bool` method executed,
leading to incorrect return values. Apply the same fix to cmake.
Fixes#26391Closesscylladb/scylladb#26431
There are three handlers that need to be patched all at once, with the
server itself being marked with set_content_streaming.
For two simple handlers, just get the content string with the
read_entire_stream_contiguous helper. This is what the httpd server did
anyway.
The "start_restore" handler used the contiguous contents to parse json
from, using the rjson utility. This handler is patched to use
read_entire_stream(), which returns a vector of temporary buffers. The
rjson parser has a helper to parse from that vector, so the change is
also an optimization.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Without the invoke_on_all lambda, for simplicity
Also keep indentation "broken" for the ease of review
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
In the Raft-based recovery procedure, we create a new group 0 and add
live nodes to it one by one. This means that for some time there are
nodes which belong to the topology, but not to the new group 0. The
voter handler running on the recovery leader incorrectly considers these
nodes while choosing voters.
The consequences:
- misleading logs, for example, "making servers {<ID of a non-member>}
voters", where the non-member won't become a voter anyway,
- increased chance of majority loss during the recovery procedure, for
example, all 3 nodes that first joined the new group 0 are in the same
dc and rack, but only one of them becomes a voter because the voter
handler tries to make non-members in other dcs/racks voters.
Fixes#26321Closesscylladb/scylladb#26327
Some code wants its TLS sockets to close immediately without sending BYE
message and waiting for the response. Recent seastar update changed the
way this functionality is requested (scylladb/seastar#2986)
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closesscylladb/scylladb#26253
In one of the constructors of `named_value`, the `allowed_values`
argument isn't used.
(This means that if some config entry uses this constructor,
the values aren't validated on the config layer,
and might give some lower layer a bad surprise).
Fix that.
Fixesscylladb/scylladb#26371Closesscylladb/scylladb#26196
BYPASS CACHE is implemented for `bti_index_reader` by
giving it its own private `cached_file` wrappers over
Partitions.db and Rows.db, instead of passing it
the shared `cached_file` owned by the sstable.
But due to an oversight, the private `cached_file`s aren't
constructed on top of the raw Partitions.db and Rows.db
files, but on top of `cached_file_impl` wrappers around
those files. Which means that BYPASS CACHE doesn't
actually do its job.
Tests based on `scylla_index_page_cache_*` metrics
and on CQL tracing still see the reads from the private
files as "cache misses", but those misses are served
from the shared cached files anyway, so the tests don't see
the problem. In this commit we extend `test_bti_index.py`
with a check that looks at reactor's `io_queue` metrics
instead, and catches the problem.
Fixesscylladb/scylladb#26372Closesscylladb/scylladb#26373
This change extends the CQL replication options syntax so the replication factor can be stated as a list of rack names.
For example: { 'mydatacenter': [ 'myrack1', 'myrack2', 'myrack4' ] }
Rack-list based RF can coexist with the old numerical RF, even in the same keyspace for different DCs.
Specifying the rack list also allows adding replicas on the specified racks (increasing the replication factor), or removing replicas from certain racks (by omitting them from the current datacenter's rack list). This will allow us to keep the keyspace rf-rack-valid, maintaining guarantees, while allowing adding/removing racks. In particular, this will allow us to add a new DC, which happens by incrementally increasing RF in that DC to cover existing racks.
Migration from numerical RF to rack-list is not supported yet. Migration from rack-list to numerical RF is not planned to be supported.
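A sketch of the extended syntax, following the example above (keyspace, DC and rack names are illustrative):
```
from cassandra.cluster import Cluster

session = Cluster(["127.0.0.1"]).connect()

# One replica per listed rack in 'mydatacenter'; 'otherdc' keeps the old
# numeric RF, so both forms coexist in the same keyspace.
session.execute("""
    CREATE KEYSPACE racks_ks WITH replication = {
        'class': 'NetworkTopologyStrategy',
        'mydatacenter': ['myrack1', 'myrack2', 'myrack4'],
        'otherdc': 3
    } AND tablets = {'enabled': true}
""")

# Adding a rack to the list later raises the RF in that DC while keeping the
# keyspace rf-rack-valid.
session.execute("""
    ALTER KEYSPACE racks_ks WITH replication = {
        'class': 'NetworkTopologyStrategy',
        'mydatacenter': ['myrack1', 'myrack2', 'myrack3', 'myrack4'],
        'otherdc': 3
    }
""")
```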
New feature, no backport required.
Co-authored with @bhalevy
Fixes https://github.com/scylladb/scylladb/issues/25269
Fixes https://github.com/scylladb/scylladb/issues/23525Closesscylladb/scylladb#26358
* github.com:scylladb/scylladb:
tablets: load_balancer: Recognize that tablets are confined to racks when computing desired tablet count
locator: Make hasher for endpoint_dc_rack globally accessible
test: tablets: Add test for replica allocation on rack list changes
test: lib: topology_builder: generate unique rack names
test: Add tests for rack list RF
doc: Document rack-list replication factor
topology_coordinator: Restore formatting
topology_coordinator: Cancel keyspace alter on broader set of errors
topology_coordinator: Make keyspace alter process options through as_ks_metadata_update()
cql3: ks_prop_defs: Preserve old options
cql3: ks_prop_defs: Introduce flattened()
locator: Recognize rack list RF as valid in assert_rf_rack_valid_keyspace()
tablet_allocator: Respect binding replicas to racks
locator: network_topology_strategy: Respect rack list when reallocating tablets
cql3: ks_prop_defs: Fail with more information when options are not in expected format
locator, cql3: Support rack lists in replication options
cql3: Fail early on vnode/tablet flavor alter
cql3: Extract convert_property_map() out of Cql.g
schema: Use definition from the header instead of open-coding it
locator: Abstract obtaining the number of replicas from replication_strategy_config_option
cql3, locator: Use type aliases for option maps
locator: Add debug logging
locator: Pass topology to replication strategy constructor
abstract_replication_strategy, network_topology_strategy: add replication_factor_data class
Materialized views are currently in the experimental phase and using them
in tablet-based keyspaces requires starting Scylla with an experimental feature,
`views-with-tablets`. Any attempts to create a materialized view or secondary
index when it's not enabled will fail with an appropriate error.
After considerable effort, we're drawing close to bringing views out of the
experimental phase, and the experimental feature will no longer be needed.
However, materialized views in tablet-based keyspaces will still be restricted,
and creating them will only be possible after enabling the configuration option
`rf_rack_valid_keyspaces`. That's what we do in this PR.
In this patch, we adjust existing tests in the tree to work with the new
restriction. That shouldn't have been necessary because we've already seemingly
adjusted all of them to work with the configuration option, but some tests hid
well. We fix that mistake now.
After that, we introduce the new restriction. What's more, when starting Scylla,
we verify that there is no materialized view that would violate the contract.
If there are some that do, we list them, notify the user, and refuse to start.
High-level implementation strategy:
1. Name the restrictions in form of a function.
2. Adjust existing tests.
3. Restrict materialized views by both the experimental feature
and the configuration option. Add validation test.
4. Drop the requirement for the experimental feature. Adjust the added test
and add a new one.
5. Update the user documentation.
Fixesscylladb/scylladb#23030
Backport: 2025.4, as we are aiming to support materialized views for tablets from that version.
Closesscylladb/scylladb#25802
* github.com:scylladb/scylladb:
view: Stop requiring experimental feature
db/view: Verify valid configuration for tablet-based views
db/view: Require rf_rack_valid_keyspaces when creating view
test/cluster/random_failures: Skip creating secondary indexes
test/cluster/mv: Mark test_mv_rf_change as skipped
test/cluster: Adjust MV tests to RF-rack-validity
test/boost/schema_loader_test.cc: Explicitly enable rf_rack_valid_keyspaces
db/view: Name requirement for views with tablets
The querier object is a confusing one. Based on its name it should be in the query/ module and it is already in the query namespace. The query namespace is used for symbols which span the coordinator and replica, or that are mostly coordinator side. The querier is mainly in this namespace due to its similar name and because at the time it was introduced, namespace replica didn't exist yet. But this is a mistake which confuses people.
The querier is actually a completely replica-side logic, implementing the caching of the readers on the replica. Move it to the replica module and namespace to make this more clear.
Code cleanup, no backport.
Closesscylladb/scylladb#26280
* github.com:scylladb/scylladb:
replica: move querier code to replica namespace
root,replica: mv querier to replica/
TemporaryHashes.db is a temporary sstable component used during ms
sstable writes. It's different from other sstable components in that
it's not included in the TOC. Because of this, it has a special case in
the logic that deletes unfinished sstables on boot.
(After Scylla dies in the middle of a sstable write).
But there's a bug in that special case,
which causes Scylla to forget to delete other components from the same unfinished sstable.
The code intends only to delete the TemporaryHashes.db file from the
`_state->generations_found` multimap, but it accidentally also deletes
the file's sibling components from the multimap. Fix that.
Also, extend a related test so that it would catch the problem before the fix.
Fixesscylladb/scylladb#26393
Bugfix, needs backport to 2025.4.
Closesscylladb/scylladb#26394
* github.com:scylladb/scylladb:
sstables/sstable_directory: don't forget to delete other components when deleting TemporaryHashes.db
test/boost/database_test: fix two no-op distributed loader tests
The reason for this seastar update is fixing #26190 - a service
level bug caused by a problem in the scheduling group implementation
in seastar (seastar#2992).
* ./seastar 9c07020a...270476e7 (10):
> core: restore seastar_logger namespace in try_systemwide_memory_barrier
> Merge 'coroutines: support coroutines that copy their captures into the coroutine frame' from Avi Kivity
coroutines: advertise lambda-capture-by-value and test it
future: invoke continuation functions as temporaries
future: handle lvalue references in future continuations early
> resource: Tune up some allocate_io_queues() arguments
> Merge 'Add perf test hooks' from Travis Downs
perf_tests:add tests to verify pre-run hooks
per_tests: add pre-run hooks
perf-tests.md: update on measurement overhead
perf_tests_perf: a few more test variations
remove vestigial register_test method
> Add `touch` command to `rl` file processing
> Merge 'execution_stage: update stage name on scheduling_group rename' from Andrzej Jackowski
test: add sg_rename_recreate_with_the_same_name
test: add test_renaming_execution_stage in metric_test
test: add test_execution_stage_rename
execution_stage: update stage name on scheduling_group rename
execution_stage: reorganize per_group_stage_type
execution_stage: add concrete_execution_stage_base
execution_stage: move metrics setup to a separate method
> iotune: Fix warmup calculation bug and botched rebase
> Add missing `#pragma once` to ascii.rl
> iotune: Ignore measurements during warmup period
Fixes: https://github.com/scylladb/scylladb/issues/26190Closesscylladb/scylladb#26388
TemporaryHashes.db is a temporary sstable component used during ms
sstable writes. It's different from other sstable components in that
it's not included in the TOC. Because of this, it has a special case in
the logic that deletes unfinished sstables on boot.
(After Scylla dies in the middle of a sstable write).
But there's a bug in that special case,
which causes Scylla to forget to delete other components from the same unfinished sstable.
The code intends only to delete the TemporaryHashes.db file from the
`_state->generations_found` multimap, but it accidentally also deletes
the file's sibling components from the multimap. Fix that.
Fixes scylladb/scylladb#26393
There are two tests which effectively check nothing.
They intend to check that the distributed loader removes "leftover" sstable
files. So they create some incomplete sstables, run the test env
on the directory, and check that the files disappeared.
But the test env completely clears the test directory before
the distributed loader looks at the files, so the tests succeed trivially.
Fix that by adding a config knob to the test env which instructs it
not to clear the directory before the test.
We adjust the documentation to include the new
VECTOR_SEARCH_INDEXING permission and its usage,
and also to reflect the changes in the maximum
number of service levels.
This commit adds tests to verify the expected
behavior of the VECTOR_SEARCH_INDEXING permission,
that is, allowing GRANTing this permission only on
ALL KEYSPACES and allowing SELECT queries only on tables
with vector indexes when the user has this permission.
This patch allows users with the VECTOR_SEARCH_INDEXING permission
to perform SELECT queries on tables that have a vector index.
This is needed for the Vector Store service, which
reads the vector-indexed tables, but does not require
the full SELECT permission.
This commit adds a new version of the command_desc struct
that contains a set of permissions instead of a single
permission. When this struct is passed to ensure/check_has_permission,
we check whether the user has any of the included permissions on the resource.
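A minimal sketch of the idea, with hypothetical type and function names standing in for the real auth types (not the actual ScyllaDB signatures):
```cpp
#include <set>
#include <string>

// Hypothetical stand-ins for the real auth types.
enum class permission { SELECT, MODIFY, VECTOR_SEARCH_INDEXING };
struct resource { std::string name; };

// New-style descriptor: holding any one of the listed permissions is sufficient.
struct command_desc {
    std::set<permission> permissions;
    resource target;
};

// Returns true if the user holds at least one of the requested permissions.
bool has_any_permission(const std::set<permission>& granted, const command_desc& cmd) {
    for (auto p : cmd.permissions) {
        if (granted.contains(p)) {
            return true;
        }
    }
    return false;
}
```
The old single-permission descriptor then becomes the special case of a one-element set.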
This patch adds a new permission: VECTOR_SEARCH_INDEXING,
that is grantable only for ALL KEYSPACES. It will allow selecting
from tables with vector search indexes. It is meant to be used
by the Vector Store service to allow it to build indexes without
having full SELECT permissions on the tables.
This commit extends the TABLE_LOAD_STATS RPC with data about the tablet
replica sizes and effective disk capacity.
The effective disk capacity of a node is computed as the sum of the sizes of
all tablet replicas on the node plus the available disk space.
This is the first change in the size based load balancing series.
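As a rough illustration of that computation (a sketch with made-up names, not the actual load-stats code):
```cpp
#include <cstdint>
#include <numeric>
#include <vector>

// Effective disk capacity = sum of tablet replica sizes on the node
//                         + currently available disk space.
uint64_t effective_disk_capacity(const std::vector<uint64_t>& tablet_replica_sizes,
                                 uint64_t available_disk_space) {
    uint64_t used_by_tablets = std::accumulate(tablet_replica_sizes.begin(),
                                               tablet_replica_sizes.end(), uint64_t{0});
    return used_by_tablets + available_disk_space;
}
```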
Closes scylladb/scylladb#26035
Some tool commands have links to online documentation in their help output. These links were left behind in the source-available change; they still point to the old open-source docs. Furthermore, the links in the scylla-sstable help output always point to the latest stable release's documentation, instead of the one appropriate for the branch the tool was built from. Fix both of these.
Fixes: scylladb/scylladb#26320
Broken documentation link fix for the tool help output, needs backport to all live source-available versions.
Closes scylladb/scylladb#26322
* github.com:scylladb/scylladb:
tools/scylla-sstable: fix doc links
release: adjust doc_link() for the post source-available world
tools/scylla-nodetool: remove trailing " from doc urls
This reference was only needed for the get_built_indexes handler
to work. Now it's gone, and the sys.ks. reference is no longer needed.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The handler effectively works with the view_builder and should be
registered in the block that has this service captured.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The old logic assumes that replicas are spread across the whole DC when
determining how many tablets we need in order to have at least 10 tablets per
shard. If replicas are actually confined to a subset of racks, that
will come up with too high a count and overshoot the actual per-shard
count in that rack.
A similar problem happens when scaling down the tablet count, when we try
to keep the per-shard tablet count below the goal. It should be tracked
per rack rather than per DC, since racks can differ in how loaded they
are by RF if it's a rack list.
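A purely illustrative sketch of why the rack matters (made-up names, not the allocator's actual sizing math): the shard count that feeds the per-shard goal should come from the racks that actually hold replicas, not from the whole DC.
```cpp
#include <algorithm>
#include <cstdint>

// Tablet count needed so that each shard that actually holds replicas ends up
// with roughly `tablets_per_shard_goal` tablet replicas.
uint64_t tablets_for_per_shard_goal(uint64_t shards_holding_replicas,
                                    uint64_t replicas_per_tablet,
                                    uint64_t tablets_per_shard_goal) {
    uint64_t needed_tablet_replicas = shards_holding_replicas * tablets_per_shard_goal;
    return std::max<uint64_t>(1, needed_tablet_replicas / std::max<uint64_t>(1, replicas_per_tablet));
}
```
Plugging in all shards of the DC instead of only the shards in the replica racks inflates the result, which is the overshoot described above.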
Encode the dc identifier into each rack name so each dc will have its
own unique racks.
Just for easier distinction in logs.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
There are several problems with how ALTER execution works with tablets.
1) Currently, option processing bypasses
ks_prop_defs::prepare_options() and passes the options directly to
keyspace_metadata. This deviates from the vnode path, causing a
discrepancy in logic. Also, some non-trivial options
post-processing will be added there - numeric RF will be replaced with a
rack list. We should preserve it in the tablet path which alters
the keyspace; otherwise it will fail when trying to construct
network_topology_strategy.
2) Option merging happens on the flat version of the map, which won't
work correctly with the extended map, which contains lists. We want
the new list to replace the old list or numeric RF, not have its items
merged. For example:
We want:
{'dc1': 3} + {'dc1': ['rack1', 'rack2']} = {'dc1': ['rack1', 'rack2']}
If we merge flattened options, we would get an incorrect flattened result:
{'dc1': 3,
 'dc1:0': 'rack1',
 'dc1:1': 'rack2'}
3) We lose atomicity of the update. Validation and merging, which happen on the CQL
coordinator, are done in a different group0 transaction context than the mutation
generation inside the topology coordinator later.
Fixes https://github.com/scylladb/scylladb/issues/25269
In 2d9b8f2, the semantics of ALTER were changed for tablet-based keyspaces,
making the "replication" assignment act like +=, where replication
options are merged with the old options.
This merging is currently performed at the CQL statement level on the
options map, before passing them to the topology coordinator. This will change
in a later commit, so move the merging here. Merging options at the flattened
level would not be correct because it doesn't recognize nested
collections, like rack lists.
We want:
{'dc1': 3} + {'dc1': ['rack1', 'rack2']} = {'dc1': ['rack1', 'rack2']}
If we merge flattened options, we would get an incorrect flattened result:
{'dc1': 3,
 'dc1:0': 'rack1',
 'dc1:1': 'rack2'}
which cannot be parsed back into ks_prop_defs on the topology coordinator.
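To illustrate the intended replace semantics, here is a simplified sketch (std::string in place of sstring, hypothetical helper name; not the actual ks_prop_defs code):
```cpp
#include <map>
#include <string>
#include <variant>
#include <vector>

using rack_list = std::vector<std::string>;
// Per-DC value: either a numeric RF (kept as a string) or a list of rack names.
using rep_option = std::variant<std::string, rack_list>;
using rep_options = std::map<std::string, rep_option>;

// ALTER acts like += on the *parsed* options: a new value for a DC replaces
// the old value wholesale; list items are never merged.
rep_options merge_replication_options(rep_options old_opts, const rep_options& new_opts) {
    for (const auto& [dc, value] : new_opts) {
        old_opts[dc] = value;
    }
    return old_opts;
}
```
With this, {'dc1': '3'} merged with {'dc1': ['rack1', 'rack2']} yields {'dc1': ['rack1', 'rack2']}, matching the example above.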
Refs https://github.com/scylladb/scylladb/pull/20208#issuecomment-3174728061
Refs #25549
Before, we would throw a vague sstring_out_of_range from substr() when
the name doesn't have a nested key separated with ":", e.g. "durable_writes"
instead of "durable_writes:durable_writes".
Allows the per-DC replication factor to be either a string holding a
numerical value, or a list of strings holding rack names.
The rack list is not yet respected by the tablet allocator; this is
achieved in a subsequent commit.
This changes the format of the options stored in the flattened map
in system_schema.keyspaces#replication. Values which are rack lists
are converted into multiple entries, with the list index appended to
the key with ':' as the separator:
For example, this extended map:
{
'dc1': '3',
'dc2': ['rack1', 'rack2']
}
is stored as a flattened map:
{
'dc1': '3',
'dc2:0': 'rack1',
'dc2:1': 'rack2'
}
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Signed-off-by: Tomasz Grabiec <tgrabiec@scylladb.com>
When walking the free-list of a pool or a span, the small-object code
casts the dereferenced `free_object*` to `void*`. This is unnecessary;
just use the `next` field of the `free_object` to look up the next free
object. I think this monkey business with `void*` was done to speed up
walking the free-list, but recently we've seen small-object --summarize
fail in CI, and it could be related.
Fixes: #25733
Closes scylladb/scylladb#26339
Before, `nodetool getendpoints` expected the key as one string separated by ":" (for example 1:val:ue). This caused errors if any part of the key had a colon, because it was unclear whether a colon was a separator or part of the key.
This change adds a new API endpoint, `/storage_service/natural_endpoints/v2/{keyspace}`, which accepts composite partition keys as multiple key_component query parameters (e.g., ?key_component=1&key_component=val:ue). The `nodetool getendpoints` command was updated to support a new `--key-components` option, allowing users to pass key components as an array. The client and test infrastructure were extended to support multiple values for a query parameter, and tests were added to verify correct behavior with composite keys.
The previous method of passing partition keys as colon-separated strings is preserved for backward compatibility.
Backport is not required, since this change relies on recent Seastar updates
Fixes #16596
Closes scylladb/scylladb#26169
* github.com:scylladb/scylladb:
docs: document --key-components option for getendpoints
test/nodetool/test_getendpoints: add coverage for --key-components param in getendpoints
nodetool: Introduce new option --key-components to specify compound partition keys as array
rest_api/test_storage_service: add v2 natural_endpoints test for composite key with multiple components
api/storage_service: add GET 'natural_endpoints' v2 to support composite keys with ':'
rest_api_mock: support duplicate query parameters
test/rest_api: support multiple query values per key in RestApiSession.send()
nodetool: add support of new seastar query_parameters_type to scylla_rest_client
In test_two_tablets_concurrent_repair_and_migration_repair_writer_level,
safe_rolling_restart returns a ready cql session. However, get_all_tablet_replicas
uses the cql reference from the manager, which isn't ready. Wait for cql.
Fixes: #26328
Closes scylladb/scylladb#26349
sstable::compute_shards_for_this_sstable() has a temporary of type
std::vector<dht::token_range> (aka dht::partition_range_vector), which
allocates a contiguous 300k when loading an sstable from disk. This
causes large allocation warnings (it doesn't really stress the allocator
since this typically happens during startup, but best to clear the warning
anyway).
Fix this by changing the container to a chunked_vector. It is passed
to dht::ring_position_range_vector_sharder, but since we're the only
user, we can change that class to accept the new type.
Fixes #24198.
Closes scylladb/scylladb#26353
Some tests expect this error. Later, prepare_options() will be changed
in a way which would fail to accept the new options in such a case before
the vnode/tablet flavor change is detected, tripping the tests.
It will become more complex when options contain rack lists.
It's a good change regardless, as it reduces duplication and makes
parsing uniform. We had already diverged into using stoi / stol / stoul.
The change in create_keyspace_statement.cc to add a catch clause is
needed because get_replication_factor() now throws
configuration_exception on parsing errors instead of
std::invalid_argument, so the existing catch clause in the outer scope
is not effective. That loop is trying to interpret all options as RF
to run some validations. Not all options are RF, and those are
supposed to be ignored.
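A hedged sketch of the parsing behavior described above, with a hypothetical helper and a placeholder exception type standing in for the real cql3 ones:
```cpp
#include <cstddef>
#include <stdexcept>
#include <string>

// Placeholder for the CQL-level configuration error type.
struct configuration_exception : std::runtime_error {
    using std::runtime_error::runtime_error;
};

// Parse a numeric replication factor, reporting failures as a configuration
// error instead of letting std::invalid_argument escape to the caller.
long parse_replication_factor(const std::string& value) {
    try {
        std::size_t pos = 0;
        long rf = std::stol(value, &pos);
        if (pos != value.size() || rf < 0) {
            throw std::invalid_argument(value);
        }
        return rf;
    } catch (const std::logic_error&) { // covers invalid_argument and out_of_range
        throw configuration_exception("Invalid replication factor: " + value);
    }
}
```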
In preparation for changing their structure.
1) std::map<sstring, sstring> -> replication_strategy_config_options
Parsed options. Values will become std::variant<sstring, rack_list>
2) std::map<sstring, sstring> -> property_definitions::map_type
Flattened map of options, as stored in system tables.
Adds a parameterized test to verify that multiple --key-components arguments
are handled correctly by nodetool's getendpoints command. Ensures the
constructed REST request includes all key_component values in the expected format.
Allows getendpoints to accept components of partition key using the --key-components option.
Key components are passed as an array and sent to the new /natural_endpoints/v2/{keyspace} endpoint.
Adds a test case for the `/storage_service/natural_endpoints/v2/{keyspace}` endpoint,
verifying that it correctly resolves natural endpoints for a composite partition key
passed as multiple `key_component` query parameters.
The original `/storage_service/natural_endpoints` endpoint uses colon-separated strings for composite keys,
which causes ambiguity when key components contain colons.
This commit adds a new `/storage_service/natural_endpoints/v2/{keyspace}` endpoint that accepts partition key components
via repeated `key_component` query parameters to avoid this issue.
Previously, only the last value of a repeated query parameter was captured,
which could cause inaccurate request matching in tests. This update ensures
that all values are preserved by storing duplicates as lists in the `params` dict.
Previously, the send() method in RestApiSession only supported one value per query parameter key.
This patch updates it to support passing lists of values, allowing the same key to appear multiple
times in the query string (e.g. ?key=value1&key=value2).
Applying lazy evaluation to the BTI encoding of clustering keys
was probably a bad default.
The possible benefits are dubious (because it's quite likely that the laziness
won't allow us to avoid that much work), but the overhead needed to
implement the laziness is large and immediate.
In this patch we get rid of the laziness.
We rewrite lazy_comparable_bytes_from_clustering_position
and lazy_comparable_bytes_from_ring_position
so that they perform the key translation eagerly,
converting all components into a single bytes_ostream in one synchronous call.
perf_bti_key_translation (microbenchmark added in this series, 1 iteration is 100 translations of a clustering key with 8 cells of int32_type):
```
Before:
test iterations median mad min max allocs tasks inst cycles
lcb_mismatch_test.lcb_mismatch 9233 109.930us 0.000ns 109.930us 109.930us 4356.000 0.000 2615394.3 614709.6
After:
test iterations median mad min max allocs tasks inst cycles
lcb_mismatch_test.lcb_mismatch 50952 19.487us 0.000ns 19.487us 19.487us 198.000 0.000 603120.1 109042.9
```
Enhancement, backport not required.
Closes scylladb/scylladb#26302
* github.com:scylladb/scylladb:
sstables/trie: BTI-translate the entire partition key at once
sstables/trie: avoid an unnecessary allocation of std::generator in last_block_offset()
sstables/trie: perform the BTI-encoding of position_in_partition eagerly
types/comparable_bytes: add comparable_bytes_from_compound
test/perf: add perf_bti_key_translation
We modify the requirements for using materialized views in tablet-based
keyspaces. Before, it was necessary to enable the configuration option
`rf_rack_valid_keyspaces`, have the cluster feature `VIEWS_WITH_TABLETS`
enabled, and use the experimental feature `views-with-tablets`.
We drop the last requirement.
We adjust code to that change and provide a new validation test.
We also update the user documentation to reflect the changes.
Fixes scylladb/scylladb#23030
Creating a materialized view or a secondary index in a tablet-based
keyspace requires that the user enable two options:
* experimental feature `views-with-tablets`,
* configuration option `rf_rack_valid_keyspaces`.
Because the latter has only become a necessity recently (in this series),
it's possible that there are already existing materialized views that
violate it.
We add a new check at start-up that iterates over existing views and
makes sure that is not the case. Otherwise, Scylla notifies the user
of the problem.
We extend the requirements for being able to create materialized views
and secondary indexes in tablet-based keyspaces. It's now necessary to
enable the configuration option `rf_rack_valid_keyspaces`. This is
a stepping stone towards bringing materialized views and secondary
indexes with tablets out of the experimental phase.
We add a validation test to verify the changes.
Refs scylladb/scylladb#23030
Materialized views are going to require the configuration option
`rf_rack_valid_keyspaces` when being created in tablet-based keyspaces.
Since random-failure tests still haven't been adjusted to work with it,
and because it's not trivial, we skip the cases when we end up creating
or dropping an index.
The test will not work with `rf_rack_valid_keyspaces`. Since the option
is going to become a requirement for using views with tablets, the test
will need to be rewritten to take that into consideration. Since that
adjustment doesn't seem trivial, we mark the test as skipped for the
time being.
Currently, replica::tablet_map_to_mutation generates a mutation with a row per tablet.
With enough tablets (tens of thousands) in the table, we observe reactor stalls when freezing / unfreezing such large mutations, as seen in https://github.com/scylladb/scylladb/pull/18095#issuecomment-2029246954, and I assume we would see similar stalls when converting those mutations into canonical_mutation and back, as they are similar to frozen_mutation and a bit more expensive, since they also save the column mappings.
This series takes a different approach than allowing freeze to yield.
`tablet_map_to_mutation` is changed to `tablet_map_to_mutations`, able to generate multiple split mutations that, when squashed together, are equivalent to the previously large mutation. Those mutations are fed into a `process_mutation` callback function, provided by the caller, which may add those mutations to a vector for further processing, and/or process them inline by freezing them or making a canonical mutation.
In addition, splitting the large mutations also prevents hitting the commitlog maximum mutation size limit.
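A minimal sketch of that callback pattern (placeholder types, not the actual ScyllaDB signatures):
```cpp
#include <functional>
#include <vector>

struct mutation {};          // placeholder for the real mutation type
struct frozen_mutation {};   // placeholder for the real frozen_mutation type
frozen_mutation freeze(const mutation&) { return {}; }

using process_mutation_fn = std::function<void(mutation)>;

// The producer emits split mutations one at a time; the caller decides what
// to do with each one (collect, freeze inline, convert to canonical, ...).
void tablet_map_to_mutations_sketch(int tablet_count, int max_rows_per_mutation,
                                    const process_mutation_fn& process_mutation) {
    for (int first = 0; first < tablet_count; first += max_rows_per_mutation) {
        mutation m; // would be filled with rows [first, first + max_rows_per_mutation)
        process_mutation(std::move(m));
    }
}

void example() {
    std::vector<frozen_mutation> frozen;
    tablet_map_to_mutations_sketch(100000, 1024, [&] (mutation m) {
        frozen.push_back(freeze(m)); // freeze each split mutation as it is produced
    });
}
```
The caller can just as well convert each mutation to a canonical mutation inside the callback instead of freezing it.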
Closes scylladb/scylladb#18162
* github.com:scylladb/scylladb:
schema_tables: convert_schema_to_mutations: simplify check for system keyspace
tablets: read_tablet_mutations: use unfreeze_and_split_gently
storage_service: merge_topology_snapshot: freeze snp.mutations gently
mutation: async_utils: add unfreeze_and_split_gently
mutation: add for_each_split_mutation
tablets: tablet_map_to_mutations: maybe split tablets mutation
tablets: tablet_map_to_mutations: accept process_func
perf-tablets: change default tables and tablets-per-table
perf-tablets: abort on unhandled exception
Some of the new tests covering materialized views explicitly disabled
the configuration option `rf_rack_valid_keyspaces`. It's going to become
a new requirement for views with tablets, so we adjust those tests and
enable the option. There is one exception, the test:
`cluster/mv/test_mv_topology_change.py::test_mv_rf_change`
We handle it separately in the following commit.
Currently, the function unfreezes each schema mutation partition
and then checks if it's for a system keyspace.
This isn't really needed since we can check the partition key
using the frozen_mutation and skip it if the partition is for a system keyspace.
Note that the constructed partition_key just copies
the frozen partition_key_view, without copying or deserializing the
actual key contents.
Also, reserve `results` capacity using the queried
partitions' size to prevent reallocations of the results
vector.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Split the tablets mutations by number of rows, based on
`min_tablets_in_mutation` (currently calibrated to 1024),
similar to the splitting done in
`storage_service::merge_topology_snapshot`.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
We don't need to store all snp.mutations in a vector
and then freeze the whole vector. They can be frozen
one at a time and collected into a vector, while
maybe yielding between each mutation to prevent
stalls.
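Roughly, the change has this shape (a sketch assuming Seastar coroutines and placeholder mutation types, not the actual merge_topology_snapshot code):
```cpp
#include <seastar/core/future.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <vector>

struct mutation {};           // placeholder
struct frozen_mutation {};    // placeholder
frozen_mutation freeze(const mutation&) { return {}; }

// Freeze the snapshot mutations one at a time, yielding between iterations
// so a long list does not stall the reactor.
seastar::future<std::vector<frozen_mutation>> freeze_gently(std::vector<mutation> muts) {
    std::vector<frozen_mutation> out;
    out.reserve(muts.size());
    for (const auto& m : muts) {
        out.push_back(freeze(m));
        co_await seastar::coroutine::maybe_yield();
    }
    co_return out;
}
```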
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Unfreeze the frozen_mutation, possibly splitting it
based on max_rows. The process_mutation function
is called for each split mutation.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Allows processing of the split mutations one at a time.
This can reduce memory footprint as the caller
won't have to store a vector of the split mutations
and then convert it (e.g. freeze the mutations
or convert them to canonical mutations).
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Split the generated tablets mutation if we run out of task
quota to prevent stalls, both when preparing the mutations
and later on when freezing/unfreezing them or converting
them to canonical_mutation and back.
Note that this will convert a large mutation into a long
vector of mutations. A followup change is being considered
to convert std::vectors of mutations to chunked_vector
to prevent large allocations.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Prepare for generating several mutations for the
tablet_map by calling process_func for each generated mutation.
This allows the caller to directly freeze those mutations
one at a time into a vector of frozen mutations, or similarly
convert them into canonical mutations.
The next patch will split large tablet mutations to prevent stalls.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
tablets-per-table must be a power of 2, so round up 10000 to 16K.
Also, reduce the number of tables to have a total of about 100K
tablets; otherwise we hit the maximum commitlog mutation size
limit in save_tablet_metadata.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
The directory utils/ is supposed to contain general-purpose utility
classes and functions, which are either already used across the project,
or are designed to be used across the project.
This patch moves 8 files out of utils/:
utils/advanced_rpc_compressor.hh
utils/advanced_rpc_compressor.cc
utils/advanced_rpc_compressor_protocol.hh
utils/stream_compressor.hh
utils/stream_compressor.cc
utils/dict_trainer.cc
utils/dict_trainer.hh
utils/shared_dict.hh
These 8 files together implement the compression feature of RPC.
None of them are used by any other Scylla component (e.g., sstables have
a different compression), or are ready to be used by another component,
so this patch moves all of them into message/, where RPC is implemented.
Theoretically, we may want in the future to use this cluster of classes
for some other component, but even then, we shouldn't just have these
files individually in utils/ - these are not useful stand-alone
utilities. One cannot use "shared_dict.hh" assuming it is some sort of
general-purpose shared hash table or something - it is completely
specific to compression and zstd, and specifically to its use in those
other classes.
Beyond moving these 8 files, this patch also contains changes to:
1. Fix includes to the 5 moved header files (.hh).
2. Fix configure.py, utils/CMakeLists.txt and message/CMakeLists.txt
for the three moved source files (.cc).
3. In the moved files, change from the "utils::" namespace, to the
"netw::" namespace used by RPC. Also needed to change a bunch
of callers for the new namespace. Also, had to add "utils::"
explicitly in several places which previously assumed the
current namespace is "utils::".
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#25149
This PR migrates limits tests from dtest to this repository.
One reason is that there is an ongoing effort to migrate tests from dtest to here.
Debug logs are enabled on `test_max_cells` for `lsa-timing` logger, to have more information about memory reclaim operation times and memory chunk sizes. This will allow analysis of their value distributions, which can be helpful with debugging if the issue reoccurs.
Also, scylladb keeps sql files with metrics which, with some modifications, can be used to track metrics over time for some tests. This would show whether there are pauses and spikes, or whether the test performance is more or less consistent over time.
scylla-dtest PR that removes migrated tests:
[limits_test.py: remove tests already ported to scylladb repo #6232](https://github.com/scylladb/scylla-dtest/pull/6232)
Fixes #25097
This is a migration of existing tests to this repository. No need for backport.
Closes scylladb/scylladb#26077
* github.com:scylladb/scylladb:
test: dtest: limits_test.py: test_max_cells log level
test: dtest: limits_test.py: make the tests work
test: dtest: test_limits.py: remove test that are not being migrated
test: dtest: copy unmodified limits_test.py
Corrected spelling mistakes, typos, and minor wording issues to improve
the developer documentation.
No backport: There is no functional change, and the doc is mostly
relevant to master, so it doesn't need to be backported.
Closes scylladb/scylladb#26332
The doc links in scylla-sstable help output are static, so they always
point to the documentation of the latest stable release, not to the
documentation of the release the tool binary is from. On top of that,
the links point to old open-source documentation, which is now EOL.
Fix both problems: point link at the new source-available documentation
pages and make them version aware.
The test cases in the file aren't run via an existing interface like
`do_with_cql_env`, but they rely on a more direct approach -- calling
one of the schema loader tools. Because of that, they manage the
`db::config` object on their own and don't enable the configuration
option `rf_rack_valid_keyspaces`.
That hasn't been a problem so far since the test doesn't attempt to
create RF-rack-invalid keyspaces anyway. However, in an upcoming commit,
we're going to further restrict views with tablets and require that the
option is enabled.
To prepare for that, we enable the option in all test cases. It's only
necessary in a small subset of them, but it won't hurt to enforce it
everywhere, so let's do that.
Refs scylladb/scylladb#23958
We add a named requirement, a function, for materialized views with tablets.
It decides whether we can create views and secondary indexes in a given
keyspace. It's a stepping stone towards modifying the requirements for it.
This way, we keep the code in one place, so it's not possible to forget
to modify it somewhere. It also makes it more organized and concise.
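A minimal sketch of such a single-point-of-truth predicate, using hypothetical names (the real requirement also consults the cluster feature and sits alongside the view/schema validation code):
```cpp
// Hypothetical predicate: the one place that decides whether materialized
// views and secondary indexes may be created in a given keyspace.
bool can_create_views_in_keyspace(bool keyspace_uses_tablets,
                                  bool rf_rack_valid_keyspaces_enabled,
                                  bool views_with_tablets_cluster_feature) {
    if (!keyspace_uses_tablets) {
        return true; // vnode-based keyspaces are unaffected by this requirement
    }
    return rf_rack_valid_keyspaces_enabled && views_with_tablets_cluster_feature;
}
```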
Set `lsa-timing` logger log level to `debug`. This will help with
the analysis of the whole spectrum of memory reclaim operation
times and memory sizes.
Refs #25097
Copy limits_test.py from scylla-dtest to test/cluster/dtest/limits_test.py.
Add license header.
Disable it for `debug`, `dev`, and `release` mode.
Refs #25097
The query namespace is used for symbols which span the coordinator and
replica, or that are mostly coordinator side. The querier is mainly in
this namespace due to its similar name, but this is a mistake which
confuses people. Now that the code was moved to replica/, also fix the
namespace to be namespace replica.
The querier object is a confusing one. Based on its name it should be in
the query/ module, and it is already in the query namespace. But this is
actually completely replica-side logic, implementing the caching of
the readers on the replica. Move it to the replica module to make this
more clear.
Delaying the BTI encoding of partition keys is a good idea,
because most of the time they don't have to be encoded.
Usually the token alone is enough for indexing purposes.
But for the translation of the `partition_key` part itself,
there's no good reason to make it lazy,
especially after we made the translation of clustering keys
eager in a previous commit. Let's get rid of the `std::generator`
and convert all cells of the partition key in one go.
Applying lazy evaluation to the BTI encoding of clustering keys
was probably a bad default.
The benefits are dubious (because it's quite likely that the laziness
won't allow us to avoid that much work), but the overhead needed to
implement the laziness is large and immediate.
In this patch we get rid of the laziness.
We rewrite lazy_comparable_bytes_from_clustering_position
so that it performs the translation eagerly,
converting all components into a single bytes_ostream.
Note: the name *lazy*_comparable_bytes_from_clustering_position
stays, because the interface is still lazy.
perf_bti_key_translation:
Before:
test iterations median mad min max allocs tasks inst cycles
lcb_mismatch_test.lcb_mismatch 9233 109.930us 0.000ns 109.930us 109.930us 4356.000 0.000 2615394.3 614709.6
After:
test iterations median mad min max allocs tasks inst cycles
lcb_mismatch_test.lcb_mismatch 50952 19.487us 0.000ns 19.487us 19.487us 198.000 0.000 603120.1 109042.9
Add a function which converts compound types (keys and key prefixes)
to BTI encoding.
It's almost the same as the existing `lazy_comparable_bytes_from_compound`
(in bti_key_translation.cc), except it eagerly serializes key components
to a bytes_ostream instead of lazily yielding them from a generator.
We will remove `lazy_comparable_bytes_from_compound` in a later commit.
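A rough sketch of the eager shape of such a helper (simplified types, and without the real BTI byte-comparable transform; purely illustrative):
```cpp
#include <cstdint>
#include <vector>

using bytes = std::vector<uint8_t>;

// Eagerly serialize all key components into one output buffer, instead of
// yielding them lazily from a std::generator. A real implementation applies
// the BTI byte-comparable transform to each component; here we only show the
// single-pass, eager structure.
bytes comparable_bytes_from_components(const std::vector<bytes>& components) {
    bytes out;
    for (const auto& component : components) {
        out.insert(out.end(), component.begin(), component.end());
        out.push_back(0x00); // placeholder component terminator
    }
    return out;
}
```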
Pass an appropriate query state for auth queries called from the service
level cache reload. We use the function qos_query_state to select a
query_state based on the caller context - for internal queries, we set a
very long timeout.
The service level cache reload is called from the group0 reload. We want it
to have a long timeout instead of the default 5 seconds for auth
queries, because we don't have strict latency requirements on the one
hand, and on the other hand a timeout exception is undesired in the
group0 reload logic and can break group0 on the node.
Fixes scylladb/scylladb#25290
Add a query_state parameter to several auth functions that execute
internal queries. Currently the queries use the
internal_distributed_query_state() query state, and we keep this as the
default, but we also want to be able to pass a query state from the
caller.
In particular, the auth queries currently use a timeout of 5 seconds,
and we will want to set a different timeout when executing in some
other context.
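A hedged sketch of that parameter threading (hypothetical names and simplified types; the real code passes a service::query_state through the auth service's internal query helpers):
```cpp
#include <chrono>
#include <string>

// Simplified stand-in for the real query state; here it only carries a timeout.
struct query_state {
    std::chrono::milliseconds timeout;
};

// Existing default: the usual internal query state with a 5-second timeout.
query_state internal_distributed_query_state() {
    return { std::chrono::seconds(5) };
}

// A state with a much longer timeout, for callers such as the group0-driven
// service level cache reload, where a timeout exception must be avoided.
query_state long_timeout_query_state() {
    return { std::chrono::hours(1) };
}

// The auth helper gains a query_state parameter; existing callers keep the default.
void run_internal_auth_query(const std::string& cql,
                             query_state qs = internal_distributed_query_state()) {
    // ... execute `cql` with the timeout taken from `qs` ...
    (void)cql;
    (void)qs;
}
```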
Rewrite query_all_directly_granted to use execute_internal instead of
query_internal, in a style that is more consistent with the rest of the
module.
This will also be useful for a later change because execute_internal
accepts an additional parameter of query_state.