Enable prometheus_allow_protobuf by default

Change the prometheus_allow_protobuf configuration to true by default. This allows ScyllaDB server to serve Prometheus protobuf format (enables native histogram support) if asked so by the monitoring server. Update config help text/docs to reflect protobuf support (drop “experimental” wording). Add cluster tests to validate the default is enabled, can be overridden, and /metrics returns protobuf when requested via Accept header (and falls back to text when disabled). Fixes #27817 co-Author: mykaul <mykaul@scylladb.com> Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2026-01-19 09:40:49 +02:00
4058 changed files with 15052 additions and 23162 deletions
--- a/.github/workflows/backport-pr-fixes-validation.yaml
+++ b/.github/workflows/backport-pr-fixes-validation.yaml
@@ -18,7 +18,7 @@ jobs:
            
            // Regular expression pattern to check for "Fixes" prefix
            // Adjusted to dynamically insert the repository full name
-            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|(?:https://scylladb\\.atlassian\\.net/browse/)?([A-Z]+-\\d+))`;
+            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|([A-Z]+-\\d+))`;
            const regex = new RegExp(pattern);
            
            if (!regex.test(body)) {
--- a/.github/workflows/call_backport_with_jira.yaml
+++ b/.github/workflows/call_backport_with_jira.yaml
@@ -1,53 +0,0 @@
-name: Backport with Jira Integration
-
-on:
-  push:
-    branches:
-      - master
-      - next-*.*
-      - branch-*.*
-  pull_request_target:
-    types: [labeled, closed]
-    branches: 
-      - master
-      - next
-      - next-*.*
-      - branch-*.*
-
-jobs:
-  backport-on-push:
-    if: github.event_name == 'push'
-    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
-    with:
-      event_type: 'push'
-      base_branch: ${{ github.ref }}
-      commits: ${{ github.event.before }}..${{ github.sha }}
-    secrets:
-      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
-      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
-  backport-on-label:
-    if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
-    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
-    with:
-      event_type: 'labeled'
-      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
-      pull_request_number: ${{ github.event.pull_request.number }}
-      head_commit: ${{ github.event.pull_request.base.sha }}
-      label_name: ${{ github.event.label.name }}
-      pr_state: ${{ github.event.pull_request.state }}
-    secrets:
-      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
-      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
-  backport-chain:
-    if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
-    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
-    with:
-      event_type: 'chain'
-      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
-      pull_request_number: ${{ github.event.pull_request.number }}
-      pr_body: ${{ github.event.pull_request.body }}
-    secrets:
-      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
-      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/call_jira_sync_pr_milestone.yml
+++ b/.github/workflows/call_jira_sync_pr_milestone.yml
@@ -1,22 +0,0 @@
-name: Sync Jira Based on PR Milestone Events
-
-on:
-  pull_request_target:
-    types: [milestoned, demilestoned]
-
-permissions:
-  contents: read
-  pull-requests: read
-
-jobs:
-  jira-sync-milestone-set:
-    if: github.event.action == 'milestoned'
-    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_set.yml@main
-    secrets:
-      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
-  jira-sync-milestone-removed:
-    if: github.event.action == 'demilestoned'
-    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_removed.yml@main
-    secrets:
-      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/call_sync_milestone_to_jira.yml
+++ b/.github/workflows/call_sync_milestone_to_jira.yml
@@ -1,4 +1,4 @@
-name: Call Jira release creation for new milestone
+name: Call Jira release creation for new milestone

 on:
  milestone:
@@ -9,6 +9,6 @@ jobs:
    uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
    with:
      # Comma-separated list of Jira project keys
-      jira_project_keys: "SCYLLADB,CUSTOMER,SMI"
+      jira_project_keys: "SCYLLADB,CUSTOMER"
    secrets:
      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/close_issue_for_scylla_associate.yml
+++ b/.github/workflows/close_issue_for_scylla_associate.yml
@@ -1,62 +0,0 @@
-name: Close issues created by Scylla associates
-
-on:
-  issues:
-    types: [opened, reopened]
-
-permissions:
-  issues: write
-
-jobs:
-  comment-and-close:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Comment and close if author email is scylladb.com
-        uses: actions/github-script@v7
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          script: |
-            const issue = context.payload.issue;
-            const actor = context.actor;
-
-            // Get user data (only public email is available)
-            const { data: user } = await github.rest.users.getByUsername({
-              username: actor,
-            });
-
-            const email = user.email || "";
-            console.log(`Actor: ${actor}, public email: ${email || "<none>"}`);
-
-            // Only continue if email exists and ends with @scylladb.com
-            if (!email || !email.toLowerCase().endsWith("@scylladb.com")) {
-              console.log("User is not a scylladb.com email (or email not public); skipping.");
-              return;
-            }
-
-            const owner = context.repo.owner;
-            const repo = context.repo.repo;
-            const issue_number = issue.number;
-
-            const body = "Issues in this repository are closed automatically. Scylla associates should use Jira to manage issues.\nPlease move this issue to Jira https://scylladb.atlassian.net/jira/software/c/projects/SCYLLADB/list";
-
-            // Add the comment
-            await github.rest.issues.createComment({
-              owner,
-              repo,
-              issue_number,
-              body,
-            });
-
-            console.log(`Comment added to #${issue_number}`);
-
-            // Close the issue
-            await github.rest.issues.update({
-              owner,
-              repo,
-              issue_number,
-              state: "closed",
-              state_reason: "not_planned"
-            });
-
-            console.log(`Issue #${issue_number} closed.`);
--- a/.github/workflows/iwyu.yaml
+++ b/.github/workflows/iwyu.yaml
@@ -14,8 +14,7 @@ env:
  CLEANER_DIRS: test/unit exceptions alternator api auth cdc compaction db dht gms index lang message mutation mutation_writer node_ops raft redis replica service
  SEASTAR_BAD_INCLUDE_OUTPUT_PATH: build/seastar-bad-include.log

-permissions:
-  contents: read
+permissions: {}

 # cancel the in-progress run upon a repush
 concurrency:
@@ -35,6 +34,8 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: true
+      - run: |
+          sudo dnf -y install clang-tools-extra
      - name: Generate compilation database
        run: |
          cmake                                         \
--- a/.github/workflows/trigger-scylla-ci.yaml
+++ b/.github/workflows/trigger-scylla-ci.yaml
@@ -9,52 +9,16 @@ on:

 jobs:
  trigger-jenkins:
-    if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
+    if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
    runs-on: ubuntu-latest
    steps:
-      - name: Verify Org Membership
-        id: verify_author
-        shell: bash
-        run: |
-          if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then
-            AUTHOR="${{ github.event.pull_request.user.login }}"
-            ASSOCIATION="${{ github.event.pull_request.author_association }}"
-          else
-            AUTHOR="${{ github.event.comment.user.login }}"
-            ASSOCIATION="${{ github.event.comment.author_association }}"
-          fi
-          if [[ "$ASSOCIATION" == "MEMBER" || "$ASSOCIATION" == "OWNER" ]]; then
-            echo "member=true" >> $GITHUB_OUTPUT
-          else
-            echo "::warning::${AUTHOR} is not a member of scylladb (association: ${ASSOCIATION}); skipping CI trigger."
-            echo "member=false" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Validate Comment Trigger
-        if: github.event_name == 'issue_comment'
-        id: verify_comment
-        shell: bash
-        run: |
-          BODY=$(cat << 'EOF'
-          ${{ github.event.comment.body }}
-          EOF
-          )
-          CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
-
-          if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
-            echo "trigger=true" >> $GITHUB_OUTPUT
-          else
-            echo "trigger=false" >> $GITHUB_OUTPUT
-          fi
-
      - name: Trigger Scylla-CI-Route Jenkins Job
-        if: steps.verify_author.outputs.member == 'true' && (github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true')
        env:
          JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
          JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
          JENKINS_URL: "https://jenkins.scylladb.com"
-          PR_NUMBER: "${{ github.event.issue.number || github.event.pull_request.number }}"
-          PR_REPO_NAME: "${{ github.event.repository.full_name }}"
        run: |
+          PR_NUMBER=${{ github.event.issue.number }}
+          PR_REPO_NAME=${{ github.event.repository.full_name }}
          curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
-            --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail
+          --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -300,6 +300,7 @@ add_subdirectory(locator)
 add_subdirectory(message)
 add_subdirectory(mutation)
 add_subdirectory(mutation_writer)
+add_subdirectory(node_ops)
 add_subdirectory(readers)
 add_subdirectory(replica)
 add_subdirectory(raft)
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ For further information, please see:

 [developer documentation]: HACKING.md
 [build documentation]: docs/dev/building.md
-[docker image build documentation]: dist/docker/redhat/README.md
+[docker image build documentation]: dist/docker/debian/README.md

 ## Running Scylla

--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2026.2.0-dev
+VERSION=2026.1.0-dev

 if test -f version
 then
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -618,7 +618,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
 // Check if the existing values of the item (previous_item) match the
 // conditions given by the Expected and ConditionalOperator parameters
 // (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
-// This function can throw a ValidationException API error if there
+// This function can throw an ValidationException API error if there
 // are errors in the format of the condition itself.
 bool verify_expected(const rjson::value& req, const rjson::value* previous_item) {
    const rjson::value* expected = rjson::find(req, "Expected");
--- a/alternator/consumed_capacity.cc
+++ b/alternator/consumed_capacity.cc
@@ -45,7 +45,7 @@ bool consumed_capacity_counter::should_add_capacity(const rjson::value& request)
 }

 void consumed_capacity_counter::add_consumed_capacity_to_response_if_needed(rjson::value& response) const noexcept {
-    if (_should_add_to_response) {
+    if (_should_add_to_reponse) {
        auto consumption = rjson::empty_object();
        rjson::add(consumption, "CapacityUnits", get_consumed_capacity_units());
        rjson::add(response, "ConsumedCapacity", std::move(consumption));
--- a/alternator/consumed_capacity.hh
+++ b/alternator/consumed_capacity.hh
@@ -28,9 +28,9 @@ namespace alternator {
 class consumed_capacity_counter {
 public:
    consumed_capacity_counter() = default;
-    consumed_capacity_counter(bool should_add_to_response) : _should_add_to_response(should_add_to_response){}
+    consumed_capacity_counter(bool should_add_to_reponse) : _should_add_to_reponse(should_add_to_reponse){}
    bool operator()() const noexcept {
-        return _should_add_to_response;
+        return _should_add_to_reponse;
    }

    consumed_capacity_counter& operator +=(uint64_t bytes);
@@ -44,7 +44,7 @@ public:
    uint64_t _total_bytes = 0;
    static bool should_add_capacity(const rjson::value& request);
 protected:
-    bool _should_add_to_response = false;
+    bool _should_add_to_reponse = false;
 };

 class rcu_consumed_capacity_counter : public consumed_capacity_counter {
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -17,7 +17,6 @@
 #include "auth/service.hh"
 #include "db/config.hh"
 #include "db/view/view_build_status.hh"
-#include "locator/tablets.hh"
 #include "mutation/tombstone.hh"
 #include "locator/abstract_replication_strategy.hh"
 #include "utils/log.hh"
@@ -63,7 +62,6 @@
 #include "types/types.hh"
 #include "db/system_keyspace.hh"
 #include "cql3/statements/ks_prop_defs.hh"
-#include "alternator/ttl_tag.hh"

 using namespace std::chrono_literals;

@@ -165,7 +163,7 @@ static map_type attrs_type() {

 static const column_definition& attrs_column(const schema& schema) {
    const column_definition* cdef = schema.get_column_definition(bytes(executor::ATTRS_COLUMN_NAME));
-    throwing_assert(cdef);
+    SCYLLA_ASSERT(cdef);
    return *cdef;
 }

@@ -238,7 +236,7 @@ static void validate_is_object(const rjson::value& value, const char* caller) {
 }

 // This function assumes the given value is an object and returns requested member value.
-// If it is not possible, an api_error::validation is thrown.
+// If it is not possible an api_error::validation is thrown.
 static const rjson::value& get_member(const rjson::value& obj, const char* member_name, const char* caller) {
    validate_is_object(obj, caller);
    const rjson::value* ret = rjson::find(obj, member_name);
@@ -250,7 +248,7 @@ static const rjson::value& get_member(const rjson::value& obj, const char* membe


 // This function assumes the given value is an object with a single member, and returns this member.
-// In case the requirements are not met, an api_error::validation is thrown.
+// In case the requirements are not met an api_error::validation is thrown.
 static const rjson::value::Member& get_single_member(const rjson::value& v, const char* caller) {
    if (!v.IsObject() || v.MemberCount() != 1) {
        throw api_error::validation(format("{}: expected an object with a single member.", caller));
@@ -683,7 +681,7 @@ static std::optional<int> get_int_attribute(const rjson::value& value, std::stri
 }

 // Sets a KeySchema object inside the given JSON parent describing the key
-// attributes of the given schema as being either HASH or RANGE keys.
+// attributes of the the given schema as being either HASH or RANGE keys.
 // Additionally, adds to a given map mappings between the key attribute
 // names and their type (as a DynamoDB type string).
 void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>* attribute_types, const std::map<sstring, sstring> *tags) {
@@ -917,7 +915,7 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
                sstring index_name = cf_name.substr(delim_it + 1);
                rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
                rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
-                // Add index's KeySchema and collect types for AttributeDefinitions:
+                // Add indexes's KeySchema and collect types for AttributeDefinitions:
                executor::describe_key_schema(view_entry, *vptr, key_attribute_types, db::get_tags_of_table(vptr));
                // Add projection type
                rjson::value projection = rjson::empty_object();
@@ -1650,7 +1648,7 @@ static future<> mark_view_schemas_as_built(utils::chunked_vector<mutation>& out,
 }

 future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode) {
-    throwing_assert(this_shard_id() == 0);
+    SCYLLA_ASSERT(this_shard_id() == 0);

    // We begin by parsing and validating the content of the CreateTable
    // command. We can't inspect the current database schema at this point
@@ -1877,34 +1875,23 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
        auto ts = group0_guard.write_timestamp();
        utils::chunked_vector<mutation> schema_mutations;
        auto ksm = create_keyspace_metadata(keyspace_name, _proxy, _gossiper, ts, tags_map, _proxy.features(), tablets_mode);
-        locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
-        const auto& topo = _proxy.local_db().get_token_metadata().get_topology();
-        auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
        // Alternator Streams doesn't yet work when the table uses tablets (#23838)
        if (stream_specification && stream_specification->IsObject()) {
            auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
            if (stream_enabled && stream_enabled->IsBool() && stream_enabled->GetBool()) {
+                locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
+                const auto& topo = _proxy.local_db().get_token_metadata().get_topology();
+                auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
                if (rs->uses_tablets()) {
                    co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
                    "If you want to use streams, create a table with vnodes by setting the tag 'system:initial_tablets' set to 'none'.");
                }
            }
        }
-        // Creating an index in tablets mode requires the keyspace to be RF-rack-valid.
-        // GSI and LSI indexes are based on materialized views which require RF-rack-validity to avoid consistency issues.
-        if (!view_builders.empty() || _proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
-            try {
-                locator::assert_rf_rack_valid_keyspace(keyspace_name, _proxy.local_db().get_token_metadata_ptr(), *rs);
-            } catch (const std::invalid_argument& ex) {
-                if (!view_builders.empty()) {
-                    co_return api_error::validation(fmt::format("GlobalSecondaryIndexes and LocalSecondaryIndexes on a table "
-                        "using tablets require the number of racks in the cluster to be either 1 or 3"));
-                } else {
-                    co_return api_error::validation(fmt::format("Cannot create table '{}' with tablets: the configuration "
-                        "option 'rf_rack_valid_keyspaces' is enabled, which enforces that tables using tablets can only be created in clusters "
-                        "that have either 1 or 3 racks", table_name));
-                }
-            }
+        // Creating an index in tablets mode requires the rf_rack_valid_keyspaces option to be enabled.
+        // GSI and LSI indexes are based on materialized views which require this option to avoid consistency issues.
+        if (!view_builders.empty() && ksm->uses_tablets() && !_proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
+            co_return api_error::validation("GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
        }
        try {
            schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
@@ -2127,12 +2114,9 @@ future<executor::request_return_type> executor::update_table(client_state& clien
                            co_return api_error::validation(fmt::format(
                                "LSI {} already exists in table {}, can't use same name for GSI", index_name, table_name));
                        }
-                        try {
-                            locator::assert_rf_rack_valid_keyspace(keyspace_name, p.local().local_db().get_token_metadata_ptr(),
-                                    p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy());
-                        } catch (const std::invalid_argument& ex) {
-                            co_return api_error::validation(fmt::format("GlobalSecondaryIndexes on a table "
-                                "using tablets require the number of racks in the cluster to be either 1 or 3"));
+                        if (p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy().uses_tablets() &&
+                                !p.local().data_dictionary().get_config().rf_rack_valid_keyspaces()) {
+                            co_return api_error::validation("GlobalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
                        }

                        elogger.trace("Adding GSI {}", index_name);
@@ -2436,7 +2420,7 @@ std::unordered_map<bytes, std::string> si_key_attributes(data_dictionary::table
 //   case, this function simply won't be called for this attribute.)
 //
 // This function checks if the given attribute update is an update to some
-// GSI's key, and if the value is unsuitable, an api_error::validation is
+// GSI's key, and if the value is unsuitable, a api_error::validation is
 // thrown. The checking here is similar to the checking done in
 // get_key_from_typed_value() for the base table's key columns.
 //
@@ -2838,12 +2822,14 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
        }
    } else if (_write_isolation != write_isolation::LWT_ALWAYS) {
        std::optional<mutation> m = apply(nullptr, api::new_timestamp(), cdc_opts);
-        throwing_assert(m); // !needs_read_before_write, so apply() did not check a condition
+        SCYLLA_ASSERT(m); // !needs_read_before_write, so apply() did not check a condition
        return proxy.mutate(utils::chunked_vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes, false, std::move(cdc_opts)).then([this, &wcu_total] () mutable {
            return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
        });
    }
-    throwing_assert(cas_shard);
+    if (!cas_shard) {
+        on_internal_error(elogger, "cas_shard is not set");
+    }
    // If we're still here, we need to do this write using LWT:
    global_stats.write_using_lwt++;
    per_table_stats.write_using_lwt++;
@@ -3547,7 +3533,7 @@ static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>
    return true;
 }

-// Add a path to an attribute_path_map. Throws a validation error if the path
+// Add a path to a attribute_path_map. Throws a validation error if the path
 // "overlaps" with one already in the filter (one is a sub-path of the other)
 // or "conflicts" with it (both a member and index is requested).
 template<typename T>
@@ -5412,7 +5398,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
 }

 static dht::token token_for_segment(int segment, int total_segments) {
-    throwing_assert(total_segments > 1 && segment >= 0 && segment < total_segments);
+    SCYLLA_ASSERT(total_segments > 1 && segment >= 0 && segment < total_segments);
    uint64_t delta = std::numeric_limits<uint64_t>::max() / total_segments;
    return dht::token::from_int64(std::numeric_limits<int64_t>::min() + delta * segment);
 }
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -50,7 +50,7 @@ public:
        _operators.emplace_back(i);
        check_depth_limit();
    }
-    void add_dot(std::string name) {
+    void add_dot(std::string(name)) {
        _operators.emplace_back(std::move(name));
        check_depth_limit();
    }
@@ -85,7 +85,7 @@ struct constant {
    }
 };

-// "value" is a value used in the right hand side of an assignment
+// "value" is is a value used in the right hand side of an assignment
 // expression, "SET a = ...". It can be a constant (a reference to a value
 // included in the request, e.g., ":val"), a path to an attribute from the
 // existing item (e.g., "a.b[3].c"), or a function of other such values.
@@ -205,7 +205,7 @@ public:
 // The supported primitive conditions are:
 // 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
 //    v1 and v2 are values - from the item (an attribute path), the query
-//    (a ":val" reference), or a function of the above (only the size()
+//    (a ":val" reference), or a function of the the above (only the size()
 //    function is supported).
 // 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
 // 3. N-ary operator - v1 IN ( v2, v3, ... )
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -55,7 +55,7 @@ partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
 clustering_key ck_from_json(const rjson::value& item, schema_ptr schema);
 position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema);

-// If v encodes a number (i.e., it is a {"N": [...]}), returns an object representing it.  Otherwise,
+// If v encodes a number (i.e., it is a {"N": [...]}, returns an object representing it.  Otherwise,
 // raises ValidationException with diagnostic.
 big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic);

--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -710,7 +710,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
        ++_executor._stats.requests_blocked_memory;
    }
    auto units = co_await std::move(units_fut);
-    throwing_assert(req->content_stream);
+    SCYLLA_ASSERT(req->content_stream);
    chunked_content content = co_await read_entire_stream(*req->content_stream, request_content_length_limit);
    // If the request had no Content-Length, we reserved too many units
    // so need to return some
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -491,7 +491,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

    if (!opts.enabled()) {
        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-        co_return rjson::print(std::move(ret));
+        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
    }

    // TODO: label
@@ -502,121 +502,123 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
    // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
    auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);

-    std::map<db_clock::time_point, cdc::streams_version> topologies = co_await _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners });
-    auto e = topologies.end();
-    auto prev = e;
-    auto shards = rjson::empty_array();
+    return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {

-    std::optional<shard_id> last;
+        auto e = topologies.end();
+        auto prev = e;
+        auto shards = rjson::empty_array();

-    auto i = topologies.begin();
-    // if we're a paged query, skip to the generation where we left of.
-    if (shard_start) {
-        i = topologies.find(shard_start->time);
-    }
+        std::optional<shard_id> last;

-    // for parent-child stuff we need id:s to be sorted by token
-    // (see explanation above) since we want to find closest
-    // token boundary when determining parent.
-    // #7346 - we processed and searched children/parents in
-    // stored order, which is not necessarily token order,
-    // so the finding of "closest" token boundary (using upper bound)
-    // could give somewhat weird results.
-    static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
-        return id1.token() < id2.token();
-    };
+        auto i = topologies.begin();
+        // if we're a paged query, skip to the generation where we left of.
+        if (shard_start) {
+            i = topologies.find(shard_start->time);
+        }

-    // #7409 - shards must be returned in lexicographical order,
-    // normal bytes compare is string_traits<int8_t>::compare.
-    // thus bytes 0x8000 is less than 0x0000. By doing unsigned
-    // compare instead we inadvertently will sort in string lexical.
-    static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
-        return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
-    };
-
-    // need a prev even if we are skipping stuff
-    if (i != topologies.begin()) {
-        prev = std::prev(i);
-    }
-
-    for (; limit > 0 && i != e; prev = i, ++i) {
-        auto& [ts, sv] = *i;
-
-        last = std::nullopt;
-
-        auto lo = sv.streams.begin();
-        auto end = sv.streams.end();
+        // for parent-child stuff we need id:s to be sorted by token
+        // (see explanation above) since we want to find closest
+        // token boundary when determining parent.
+        // #7346 - we processed and searched children/parents in
+        // stored order, which is not necessarily token order,
+        // so the finding of "closest" token boundary (using upper bound)
+        // could give somewhat weird results.
+        static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
+            return id1.token() < id2.token();
+        };

        // #7409 - shards must be returned in lexicographical order,
-        std::sort(lo, end, id_cmp);
+        // normal bytes compare is string_traits<int8_t>::compare.
+        // thus bytes 0x8000 is less than 0x0000. By doing unsigned
+        // compare instead we inadvertently will sort in string lexical.
+        static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
+            return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
+        };

-        if (shard_start) {
-            // find next shard position
-            lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
-            shard_start = std::nullopt;
+        // need a prev even if we are skipping stuff
+        if (i != topologies.begin()) {
+            prev = std::prev(i);
        }

-        if (lo != end && prev != e) {
-            // We want older stuff sorted in token order so we can find matching
-            // token range when determining parent shard.
-            std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
-        }
-
-        auto expired = [&]() -> std::optional<db_clock::time_point> {
-            auto j = std::next(i);
-            if (j == e) {
-                return std::nullopt;
-            }
-            // add this so we sort of match potential 
-            // sequence numbers in get_records result.
-            return j->first + confidence_interval(db);
-        }();
-
-        while (lo != end) {
-            auto& id = *lo++;
-
-            auto shard = rjson::empty_object();
-
-            if (prev != e) {
-                auto& pids = prev->second.streams;
-                auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
-                    return t < id.token();
-                });
-                if (pid != pids.begin()) {
-                    pid = std::prev(pid);
-                }
-                if (pid != pids.end()) {
-                    rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
-                }
-            }
-
-            last.emplace(ts, id);
-            rjson::add(shard, "ShardId", *last);
-            auto range = rjson::empty_object();
-            rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
-            if (expired) {
-                rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
-            }
-
-            rjson::add(shard, "SequenceNumberRange", std::move(range));
-            rjson::push_back(shards, std::move(shard));
-            
-            if (--limit == 0) {
-                break;
-            }
+        for (; limit > 0 && i != e; prev = i, ++i) {
+            auto& [ts, sv] = *i;

            last = std::nullopt;
+
+            auto lo = sv.streams.begin();
+            auto end = sv.streams.end();
+
+            // #7409 - shards must be returned in lexicographical order,
+            std::sort(lo, end, id_cmp);
+
+            if (shard_start) {
+                // find next shard position
+                lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
+                shard_start = std::nullopt;
+            }
+
+            if (lo != end && prev != e) {
+                // We want older stuff sorted in token order so we can find matching
+                // token range when determining parent shard.
+                std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
+            }
+
+            auto expired = [&]() -> std::optional<db_clock::time_point> {
+                auto j = std::next(i);
+                if (j == e) {
+                    return std::nullopt;
+                }
+                // add this so we sort of match potential 
+                // sequence numbers in get_records result.
+                return j->first + confidence_interval(db);
+            }();
+
+            while (lo != end) {
+                auto& id = *lo++;
+
+                auto shard = rjson::empty_object();
+
+                if (prev != e) {
+                    auto& pids = prev->second.streams;
+                    auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
+                        return t < id.token();
+                    });
+                    if (pid != pids.begin()) {
+                        pid = std::prev(pid);
+                    }
+                    if (pid != pids.end()) {
+                        rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
+                    }
+                }
+
+                last.emplace(ts, id);
+                rjson::add(shard, "ShardId", *last);
+                auto range = rjson::empty_object();
+                rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
+                if (expired) {
+                    rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
+                }
+
+                rjson::add(shard, "SequenceNumberRange", std::move(range));
+                rjson::push_back(shards, std::move(shard));
+                
+                if (--limit == 0) {
+                    break;
+                }
+
+                last = std::nullopt;
+            }
        }
-    }

-    if (last) {
-        rjson::add(stream_desc, "LastEvaluatedShardId", *last);
-    }
+        if (last) {
+            rjson::add(stream_desc, "LastEvaluatedShardId", *last);
+        }

-    rjson::add(stream_desc, "Shards", std::move(shards));
-    rjson::add(ret, "StreamDescription", std::move(stream_desc));
-        
-    co_return rjson::print(std::move(ret));
+        rjson::add(stream_desc, "Shards", std::move(shards));
+        rjson::add(ret, "StreamDescription", std::move(stream_desc));
+            
+        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+    });
 }

 enum class shard_iterator_type {
@@ -896,169 +898,172 @@ future<executor::request_return_type> executor::get_records(client_state& client
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
            query::tombstone_limit(_proxy.get_tombstone_limit()), query::row_limit(limit * mul));

-    service::storage_proxy::coordinator_query_result qr = co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state));
-    cql3::selection::result_set_builder builder(*selection, gc_clock::now());
-    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
+    co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
+            [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {       
+        cql3::selection::result_set_builder builder(*selection, gc_clock::now());
+        query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));

-    auto result_set = builder.build();
-    auto records = rjson::empty_array();
+        auto result_set = builder.build();
+        auto records = rjson::empty_array();

-    auto& metadata = result_set->get_metadata();
+        auto& metadata = result_set->get_metadata();

-    auto op_index = std::distance(metadata.get_names().begin(), 
-        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-            return cdef->name->name() == op_column_name;
-        })
-    );
-    auto ts_index = std::distance(metadata.get_names().begin(), 
-        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-            return cdef->name->name() == timestamp_column_name;
-        })
-    );
-    auto eor_index = std::distance(metadata.get_names().begin(), 
-        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-            return cdef->name->name() == eor_column_name;
-        })
-    );
+        auto op_index = std::distance(metadata.get_names().begin(), 
+            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+                return cdef->name->name() == op_column_name;
+            })
+        );
+        auto ts_index = std::distance(metadata.get_names().begin(), 
+            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+                return cdef->name->name() == timestamp_column_name;
+            })
+        );
+        auto eor_index = std::distance(metadata.get_names().begin(), 
+            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+                return cdef->name->name() == eor_column_name;
+            })
+        );

-    std::optional<utils::UUID> timestamp;
-    auto dynamodb = rjson::empty_object();
-    auto record = rjson::empty_object();
-    const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();
+        std::optional<utils::UUID> timestamp;
+        auto dynamodb = rjson::empty_object();
+        auto record = rjson::empty_object();
+        const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();

-    using op_utype = std::underlying_type_t<cdc::operation>;
+        using op_utype = std::underlying_type_t<cdc::operation>;

-    auto maybe_add_record = [&] {
-        if (!dynamodb.ObjectEmpty()) {
-            rjson::add(record, "dynamodb", std::move(dynamodb));
-            dynamodb = rjson::empty_object();
-        }
-        if (!record.ObjectEmpty()) {
-            rjson::add(record, "awsRegion", rjson::from_string(dc_name));
-            rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
-            rjson::add(record, "eventSource", "scylladb:alternator");
-            rjson::add(record, "eventVersion", "1.1");
-            rjson::push_back(records, std::move(record));
-            record = rjson::empty_object();
-            --limit;
-        }
-    };
+        auto maybe_add_record = [&] {
+            if (!dynamodb.ObjectEmpty()) {
+                rjson::add(record, "dynamodb", std::move(dynamodb));
+                dynamodb = rjson::empty_object();
+            }
+            if (!record.ObjectEmpty()) {
+                rjson::add(record, "awsRegion", rjson::from_string(dc_name));
+                rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
+                rjson::add(record, "eventSource", "scylladb:alternator");
+                rjson::add(record, "eventVersion", "1.1");
+                rjson::push_back(records, std::move(record));
+                record = rjson::empty_object();
+                --limit;
+            }
+        };

-    for (auto& row : result_set->rows()) {
-        auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
-        auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
-        auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;
+        for (auto& row : result_set->rows()) {
+            auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
+            auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
+            auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;

-        if (!dynamodb.HasMember("Keys")) {
-            auto keys = rjson::empty_object();
-            describe_single_item(*selection, row, key_names, keys);
-            rjson::add(dynamodb, "Keys", std::move(keys));
-            rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
-            rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
-            rjson::add(dynamodb, "StreamViewType", type);
-            // TODO: SizeBytes
-        }
+            if (!dynamodb.HasMember("Keys")) {
+                auto keys = rjson::empty_object();
+                describe_single_item(*selection, row, key_names, keys);
+                rjson::add(dynamodb, "Keys", std::move(keys));
+                rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
+                rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
+                rjson::add(dynamodb, "StreamViewType", type);
+                // TODO: SizeBytes
+            }

-        /**
-         * We merge rows with same timestamp into a single event.
-         * This is pretty much needed, because a CDC row typically
-         * encodes ~half the info of an alternator write. 
-         * 
-         * A big, big downside to how alternator records are written
-         * (i.e. CQL), is that the distinction between INSERT and UPDATE
-         * is somewhat lost/unmappable to actual eventName. 
-         * A write (currently) always looks like an insert+modify
-         * regardless whether we wrote existing record or not. 
-         * 
-         * Maybe RMW ops could be done slightly differently so 
-         * we can distinguish them here...
-         * 
-         * For now, all writes will become MODIFY.
-         * 
-         * Note: we do not check the current pre/post
-         * flags on CDC log, instead we use data to 
-         * drive what is returned. This is (afaict)
-         * consistent with dynamo streams
-         */
-        switch (op) {
-        case cdc::operation::pre_image:
-        case cdc::operation::post_image:
-        {
-            auto item = rjson::empty_object();
-            describe_single_item(*selection, row, attr_names, item, nullptr, true);
-            describe_single_item(*selection, row, key_names, item);
-            rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
-            break;
-        }
-        case cdc::operation::update:
-            rjson::add(record, "eventName", "MODIFY");
-            break;
-        case cdc::operation::insert:
-            rjson::add(record, "eventName", "INSERT");
-            break;
-        case cdc::operation::service_row_delete:
-        case cdc::operation::service_partition_delete:
-        {
-            auto user_identity = rjson::empty_object();
-            rjson::add(user_identity, "Type", "Service");
-            rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
-            rjson::add(record, "userIdentity", std::move(user_identity));
-            rjson::add(record, "eventName", "REMOVE");
-            break;
-        }
-        default:
-            rjson::add(record, "eventName", "REMOVE");
-            break;
-        }
-        if (eor) {
-            maybe_add_record();
-            timestamp = ts;
-            if (limit == 0) {
+            /**
+             * We merge rows with same timestamp into a single event.
+             * This is pretty much needed, because a CDC row typically
+             * encodes ~half the info of an alternator write. 
+             * 
+             * A big, big downside to how alternator records are written
+             * (i.e. CQL), is that the distinction between INSERT and UPDATE
+             * is somewhat lost/unmappable to actual eventName. 
+             * A write (currently) always looks like an insert+modify
+             * regardless whether we wrote existing record or not. 
+             * 
+             * Maybe RMW ops could be done slightly differently so 
+             * we can distinguish them here...
+             * 
+             * For now, all writes will become MODIFY.
+             * 
+             * Note: we do not check the current pre/post
+             * flags on CDC log, instead we use data to 
+             * drive what is returned. This is (afaict)
+             * consistent with dynamo streams
+             */
+            switch (op) {
+            case cdc::operation::pre_image:
+            case cdc::operation::post_image:
+            {
+                auto item = rjson::empty_object();
+                describe_single_item(*selection, row, attr_names, item, nullptr, true);
+                describe_single_item(*selection, row, key_names, item);
+                rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
                break;
            }
+            case cdc::operation::update:
+                rjson::add(record, "eventName", "MODIFY");
+                break;
+            case cdc::operation::insert:
+                rjson::add(record, "eventName", "INSERT");
+                break;
+            case cdc::operation::service_row_delete:
+            case cdc::operation::service_partition_delete:
+            {
+                auto user_identity = rjson::empty_object();
+                rjson::add(user_identity, "Type", "Service");
+                rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
+                rjson::add(record, "userIdentity", std::move(user_identity));
+                rjson::add(record, "eventName", "REMOVE");
+                break;
+            }
+            default:
+                rjson::add(record, "eventName", "REMOVE");
+                break;
+            }
+            if (eor) {
+                maybe_add_record();
+                timestamp = ts;
+                if (limit == 0) {
+                    break;
+                }
+            }
        }
-    }

-    auto ret = rjson::empty_object();
-    auto nrecords = records.Size();
-    rjson::add(ret, "Records", std::move(records));
+        auto ret = rjson::empty_object();
+        auto nrecords = records.Size();
+        rjson::add(ret, "Records", std::move(records));

-    if (nrecords != 0) {
-        // #9642. Set next iterators threshold to > last
-        shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
-        // Note that here we unconditionally return NextShardIterator,
-        // without checking if maybe we reached the end-of-shard. If the
-        // shard did end, then the next read will have nrecords == 0 and
-        // will notice end end of shard and not return NextShardIterator.
-        rjson::add(ret, "NextShardIterator", next_iter);
-        _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-        co_return rjson::print(std::move(ret));
-    }
+        if (nrecords != 0) {
+            // #9642. Set next iterators threshold to > last
+            shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
+            // Note that here we unconditionally return NextShardIterator,
+            // without checking if maybe we reached the end-of-shard. If the
+            // shard did end, then the next read will have nrecords == 0 and
+            // will notice end end of shard and not return NextShardIterator.
+            rjson::add(ret, "NextShardIterator", next_iter);
+            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
+            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        }

-    // ugh. figure out if we are and end-of-shard
-    auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
+        // ugh. figure out if we are and end-of-shard
+        auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();

-    db_clock::time_point ts = co_await _sdks.cdc_current_generation_timestamp({ normal_token_owners });
-    auto& shard = iter.shard;
+        return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
+            auto& shard = iter.shard;            

-    if (shard.time < ts && ts < high_ts) {
-        // The DynamoDB documentation states that when a shard is
-        // closed, reading it until the end has NextShardIterator
-        // "set to null". Our test test_streams_closed_read
-        // confirms that by "null" they meant not set at all.
-    } else {
-        // We could have return the same iterator again, but we did
-        // a search from it until high_ts and found nothing, so we
-        // can also start the next search from high_ts.
-        // TODO: but why? It's simpler just to leave the iterator be.
-        shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
-        rjson::add(ret, "NextShardIterator", iter);
-    }
-    _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-    if (is_big(ret)) {
-        co_return make_streamed(std::move(ret));
-    }
-    co_return rjson::print(std::move(ret));
+            if (shard.time < ts && ts < high_ts) {
+                // The DynamoDB documentation states that when a shard is
+                // closed, reading it until the end has NextShardIterator
+                // "set to null". Our test test_streams_closed_read
+                // confirms that by "null" they meant not set at all.
+            } else {
+                // We could have return the same iterator again, but we did
+                // a search from it until high_ts and found nothing, so we
+                // can also start the next search from high_ts.
+                // TODO: but why? It's simpler just to leave the iterator be.
+                shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
+                rjson::add(ret, "NextShardIterator", iter);
+            }
+            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
+            if (is_big(ret)) {
+                return make_ready_future<executor::request_return_type>(make_streamed(std::move(ret)));
+            }
+            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        });
+    });
 }

 bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -46,7 +46,6 @@
 #include "alternator/executor.hh"
 #include "alternator/controller.hh"
 #include "alternator/serialization.hh"
-#include "alternator/ttl_tag.hh"
 #include "dht/sharder.hh"
 #include "db/config.hh"
 #include "db/tags/utils.hh"
@@ -58,10 +57,19 @@ static logging::logger tlogger("alternator_ttl");

 namespace alternator {

+// We write the expiration-time attribute enabled on a table in a
+// tag TTL_TAG_KEY.
+// Currently, the *value* of this tag is simply the name of the attribute,
+// and the expiration scanner interprets it as an Alternator attribute name -
+// It can refer to a real column or if that doesn't exist, to a member of
+// the ":attrs" map column. Although this is designed for Alternator, it may
+// be good enough for CQL as well (there, the ":attrs" column won't exist).
+extern const sstring TTL_TAG_KEY;
+
 future<executor::request_return_type> executor::update_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.update_time_to_live++;
    if (!_proxy.features().alternator_ttl) {
-        co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Upgrade all nodes to a version that supports it.");
+        co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Experimental support is available if the 'alternator-ttl' experimental feature is enabled on all nodes.");
    }

    schema_ptr schema = get_table(_proxy, request);
@@ -133,7 +141,7 @@ future<executor::request_return_type> executor::describe_time_to_live(client_sta

 // expiration_service is a sharded service responsible for cleaning up expired
 // items in all tables with per-item expiration enabled. Currently, this means
-// Alternator tables with TTL configured via an UpdateTimeToLive request.
+// Alternator tables with TTL configured via a UpdateTimeToLive request.
 //
 // Here is a brief overview of how the expiration service works:
 //
@@ -316,7 +324,9 @@ static future<std::vector<std::pair<dht::token_range, locator::host_id>>> get_se
    const auto& tm = *erm->get_token_metadata_ptr();
    const auto& sorted_tokens = tm.sorted_tokens();
    std::vector<std::pair<dht::token_range, locator::host_id>> ret;
-    throwing_assert(!sorted_tokens.empty());
+    if (sorted_tokens.empty()) {
+        on_internal_error(tlogger, "Token metadata is empty");
+    }
    auto prev_tok = sorted_tokens.back();
    for (const auto& tok : sorted_tokens) {
        co_await coroutine::maybe_yield();
@@ -553,7 +563,7 @@ static future<> scan_table_ranges(
        expiration_service::stats& expiration_stats)
 {
    const schema_ptr& s = scan_ctx.s;
-    throwing_assert(partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
+    SCYLLA_ASSERT (partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
    auto p = service::pager::query_pagers::pager(proxy, s, scan_ctx.selection, *scan_ctx.query_state_ptr,
            *scan_ctx.query_options, scan_ctx.command, std::move(partition_ranges), nullptr);
    while (!p->is_exhausted()) {
@@ -583,7 +593,7 @@ static future<> scan_table_ranges(
            if (retries >= 10) {
                // Don't get stuck forever asking the same page, maybe there's
                // a bug or a real problem in several replicas. Give up on
-                // this scan and retry the scan from a random position later,
+                // this scan an retry the scan from a random position later,
                // in the next scan period.
                throw runtime_exception("scanner thread failed after too many timeouts for the same page");
            }
@@ -630,38 +640,13 @@ static future<> scan_table_ranges(
                }
            } else {
                // For a real column to contain an expiration time, it
-                // must be a numeric type. We currently support decimal
-                // (used by Alternator TTL) as well as bigint, int and
-                // timestamp (used by CQL per-row TTL).
-                switch (meta[*expiration_column]->type->get_kind()) {
-                    case abstract_type::kind::decimal:
-                        // Used by Alternator TTL for key columns not stored
-                        // in the map. The value is in seconds, fractional
-                        // part is ignored.
-                        expired = is_expired(value_cast<big_decimal>(v), now);
-                        break;
-                    case abstract_type::kind::long_kind:
-                        // Used by CQL per-row TTL. The value is in seconds.
-                        expired = is_expired(gc_clock::time_point(std::chrono::seconds(value_cast<int64_t>(v))), now);
-                        break;
-                    case abstract_type::kind::int32:
-                        // Used by CQL per-row TTL. The value is in seconds.
-                        // Using int type is not recommended because it will
-                        // overflow in 2038, but we support it to allow users
-                        // to use existing int columns for expiration.
-                        expired = is_expired(gc_clock::time_point(std::chrono::seconds(value_cast<int32_t>(v))), now);
-                        break;
-                    case abstract_type::kind::timestamp:
-                        // Used by CQL per-row TTL. The value is in milliseconds
-                        // but we truncate it to gc_clock's precision (whole seconds).
-                        expired = is_expired(gc_clock::time_point(std::chrono::duration_cast<gc_clock::duration>(value_cast<db_clock::time_point>(v).time_since_epoch())), now);
-                        break;
-                    default:
-                        // Should never happen - we verified the column's type
-                        // before starting the scan.
-                        [[unlikely]]
-                        on_internal_error(tlogger, format("expiration scanner value of unsupported type {} in column {}", meta[*expiration_column]->type->cql3_type_name(), scan_ctx.column_name) );
-                }
+                // must be a numeric type.
+                // FIXME: Currently we only support decimal_type (which is
+                // what Alternator uses), but other numeric types can be
+                // supported as well to make this feature more useful in CQL.
+                // Note that kind::decimal is also checked above.
+                big_decimal n = value_cast<big_decimal>(v);
+                expired = is_expired(n, now);
            }
            if (expired) {
                expiration_stats.items_deleted++;
@@ -723,12 +708,16 @@ static future<bool> scan_table(
        co_return false;
    }
    // attribute_name may be one of the schema's columns (in Alternator, this
-    // means a key column, in CQL it's a regular column), or an element in
-    // Alternator's attrs map encoded in Alternator's JSON encoding (which we
-    // decode). If attribute_name is a real column, in Alternator it will have
-    // the type decimal, counting seconds since the UNIX epoch, while in CQL
-    // it will one of the types bigint or int (counting seconds) or timestamp
-    // (counting milliseconds).
+    // means it's a key column), or an element in Alternator's attrs map
+    // encoded in Alternator's JSON encoding.
+    // FIXME: To make this less Alternators-specific, we should encode in the
+    // single key's value three things:
+    // 1. The name of a column
+    // 2. Optionally if column is a map, a member in the map
+    // 3. The deserializer for the value: CQL or Alternator (JSON).
+    // The deserializer can be guessed: If the given column or map item is
+    // numeric, it can be used directly. If it is a "bytes" type, it needs to
+    // be deserialized using Alternator's deserializer.
    bytes column_name = to_bytes(*attribute_name);
    const column_definition *cd = s->get_column_definition(column_name);
    std::optional<std::string> member;
@@ -747,14 +736,11 @@ static future<bool> scan_table(
    data_type column_type = cd->type;
    // Verify that the column has the right type: If "member" exists
    // the column must be a map, and if it doesn't, the column must
-    // be decimal_type (Alternator), bigint, int or timestamp (CQL).
-    // If the column has the wrong type nothing can get expired in
-    // this table, and it's pointless to scan it.
+    // (currently) be a decimal_type. If the column has the wrong type
+    // nothing can get expired in this table, and it's pointless to
+    // scan it.
    if ((member && column_type->get_kind() != abstract_type::kind::map) ||
-        (!member && column_type->get_kind() != abstract_type::kind::decimal &&
-         column_type->get_kind() != abstract_type::kind::long_kind &&
-         column_type->get_kind() != abstract_type::kind::int32 &&
-         column_type->get_kind() != abstract_type::kind::timestamp)) {
+        (!member && column_type->get_kind() != abstract_type::kind::decimal)) {
        tlogger.info("table {} TTL column has unsupported type, not scanning", s->cf_name());
        co_return false;
    }
@@ -781,7 +767,7 @@ static future<bool> scan_table(
                // by tasking another node to take over scanning of the dead node's primary
                // ranges. What we do here is that this node will also check expiration
                // on its *secondary* ranges - but only those whose primary owner is down.
-                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
+                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
                if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
                    if (!gossiper.is_alive(tablet_primary_replica.host)) {
                        co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
@@ -892,10 +878,12 @@ future<> expiration_service::run() {
 future<> expiration_service::start() {
    // Called by main() on each shard to start the expiration-service
    // thread. Just runs run() in the background and allows stop().
-    if (!shutting_down()) {
-        _end = run().handle_exception([] (std::exception_ptr ep) {
-            tlogger.error("expiration_service failed: {}", ep);
-        });
+    if (_db.features().alternator_ttl) {
+        if (!shutting_down()) {
+            _end = run().handle_exception([] (std::exception_ptr ep) {
+                tlogger.error("expiration_service failed: {}", ep);
+            });
+        }
    }
    return make_ready_future<>();
 }
--- a/alternator/ttl.hh
+++ b/alternator/ttl.hh
@@ -30,7 +30,7 @@ namespace alternator {

 // expiration_service is a sharded service responsible for cleaning up expired
 // items in all tables with per-item expiration enabled. Currently, this means
-// Alternator tables with TTL configured via an UpdateTimeToLive request.
+// Alternator tables with TTL configured via a UpdateTimeToLeave request.
 class expiration_service final : public seastar::peering_sharded_service<expiration_service> {
 public:
    // Object holding per-shard statistics related to the expiration service.
@@ -52,7 +52,7 @@ private:
    data_dictionary::database _db;
    service::storage_proxy& _proxy;
    gms::gossiper& _gossiper;
-    // _end is set by start(), and resolves when the background service
+    // _end is set by start(), and resolves when the the background service
    // started by it ends. To ask the background service to end, _abort_source
    // should be triggered. stop() below uses both _abort_source and _end.
    std::optional<future<>> _end;
--- a/alternator/ttl_tag.hh
+++ b/alternator/ttl_tag.hh
@@ -1,26 +0,0 @@
-/*
- * Copyright 2026-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#pragma once
-
-#include "seastarx.hh"
-#include <seastar/core/sstring.hh>
-
-namespace alternator {
-// We use the table tag TTL_TAG_KEY ("system:ttl_attribute") to remember
-// which attribute was chosen as the expiration-time attribute for
-// Alternator's TTL and CQL's per-row TTL features.
-// Currently, the *value* of this tag is simply the name of the attribute:
-// It can refer to a real column or if that doesn't exist, to a member of
-// the ":attrs" map column (which Alternator uses).
-extern const sstring TTL_TAG_KEY;
-} // namespace alternator
-
-// let users use TTL_TAG_KEY without the "alternator::" prefix,
-// to make it easier to move it to a different namespace later.
-using alternator::TTL_TAG_KEY;
--- a/api/api-doc/authorization_cache.json
+++ b/api/api-doc/authorization_cache.json
@@ -12,7 +12,7 @@
      "operations":[
        {
          "method":"POST",
-          "summary":"Resets authorized prepared statements cache",
+          "summary":"Reset cache",
          "type":"void",
          "nickname":"authorization_cache_reset",
          "produces":[
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -3051,7 +3051,7 @@
                  },
                  {
                     "name":"incremental_mode",
-                     "description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to incremental mode.",
+                     "description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to 'disabled' mode.",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
@@ -3085,48 +3085,6 @@
            }
         ]
      },
-
-      {
-         "path":"/storage_service/tablets/snapshots",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Takes the snapshot for the given keyspaces/tables. A snapshot name must be specified.",
-               "type":"void",
-               "nickname":"take_cluster_snapshot",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"tag",
-                     "description":"the tag given to the snapshot",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"keyspace",
-                     "description":"Keyspace(s) to snapshot. Multiple keyspaces can be provided using a comma-separated list. If omitted, snapshot all keyspaces.",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"table",
-                     "description":"Table(s) to snapshot. Multiple tables (in a single keyspace) can be provided using a comma-separated list. If omitted, snapshot all tables in the given keyspace(s).",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  }
-               ]
-            }
-         ]
-      },
-
      {
         "path":"/storage_service/quiesce_topology",
         "operations":[
--- a/api/api.hh
+++ b/api/api.hh
@@ -23,6 +23,31 @@

 namespace api {

+template<class T>
+std::vector<T> map_to_key_value(const std::map<sstring, sstring>& map) {
+    std::vector<T> res;
+    res.reserve(map.size());
+
+    for (const auto& [key, value] : map) {
+        res.push_back(T());
+        res.back().key = key;
+        res.back().value = value;
+    }
+    return res;
+}
+
+template<class T, class MAP>
+std::vector<T>& map_to_key_value(const MAP& map, std::vector<T>& res) {
+    res.reserve(res.size() + std::size(map));
+
+    for (const auto& [key, value] : map) {
+        T val;
+        val.key = fmt::to_string(key);
+        val.value = fmt::to_string(value);
+        res.push_back(val);
+    }
+    return res;
+}
 template <typename T, typename S = T>
 T map_sum(T&& dest, const S& src) {
    for (const auto& i : src) {
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -515,15 +515,6 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
        auto sstables = parsed.GetArray() |
            std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
            std::ranges::to<std::vector>();
-        apilog.info("Restore invoked with following parameters: keyspace={}, table={}, endpoint={}, bucket={}, prefix={}, sstables_count={}, scope={}, primary_replica_only={}",
-                    keyspace,
-                    table,
-                    endpoint,
-                    bucket,
-                    prefix,
-                    sstables.size(),
-                    scope,
-                    primary_replica_only);
        auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
        co_return json::json_return_type(fmt::to_string(task_id));
    });
@@ -536,15 +527,13 @@ void unset_sstables_loader(http_context& ctx, routes& r) {
 }

 void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g) {
-    ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+    ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) {
        auto keyspace = validate_keyspace(ctx, req);
        auto view = req->get_path_param("view");
-        co_return json::json_return_type(stream_range_as_array(co_await vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()), [] (const auto& i) {
-            storage_service_json::mapper res;
-            res.key = i.first;
-            res.value = i.second;
-            return res;
-        }));
+        return vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()).then([] (std::unordered_map<sstring, sstring> status) {
+            std::vector<storage_service_json::mapper> res;
+            return make_ready_future<json::json_return_type>(map_to_key_value(std::move(status), res));
+        });
    });

    cf::get_built_indexes.set(r, [&vb](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
@@ -582,16 +571,6 @@ static future<json::json_return_type> describe_ring_as_json_for_table(const shar
    co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring_for_table(keyspace, table), token_range_endpoints_to_json));
 }

-namespace {
-template <typename Key, typename Value>
-storage_service_json::mapper map_to_json(const std::pair<Key, Value>& i) {
-    storage_service_json::mapper val;
-    val.key = fmt::to_string(i.first);
-    val.value = fmt::to_string(i.second);
-    return val;
-}
-}
-
 static
 future<json::json_return_type>
 rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -609,7 +588,12 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
            throw bad_param_exception("Either provide both keyspace and table (for tablet table) or neither (for vnodes)");
        }

-        co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
+        co_return json::json_return_type(stream_range_as_array(token_endpoints, [](const auto& i) {
+            storage_service_json::mapper val;
+            val.key = fmt::to_string(i.first);
+            val.value = fmt::to_string(i.second);
+            return val;
+        }));
 }

 static
@@ -693,6 +677,7 @@ rest_get_range_to_endpoint_map(http_context& ctx, sharded<service::storage_servi
            table_id = validate_table(ctx.db.local(), keyspace, table);
        }

+        std::vector<ss::maplist_mapper> res;
        co_return stream_range_as_array(co_await ss.local().get_range_to_address_map(keyspace, table_id),
                [](const std::pair<dht::token_range, inet_address_vector_replica_set>& entry){
            ss::maplist_mapper m;
@@ -783,13 +768,17 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::

        apilog.info("cleanup_all global={}", global);

-        if (global) {
-            co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
-                co_return co_await ss.do_clusterwide_vnodes_cleanup();
-            });
+        auto done = !global ? false : co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
+            if (!ss.is_topology_coordinator_enabled()) {
+                co_return false;
+            }
+            co_await ss.do_clusterwide_vnodes_cleanup();
+            co_return true;
+        });
+        if (done) {
            co_return json::json_return_type(0);
        }
-        // fall back to the local cleanup if local cleanup is requested
+        // fall back to the local cleanup if topology coordinator is not enabled or local cleanup is requested
        auto& db = ctx.db;
        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<compaction::global_cleanup_compaction_task_impl>({}, db);
@@ -797,7 +786,9 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::

        // Mark this node as clean
        co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
-            co_await ss.reset_cleanup_needed();
+            if (ss.is_topology_coordinator_enabled()) {
+                co_await ss.reset_cleanup_needed();
+            }
        });

        co_return json::json_return_type(0);
@@ -808,6 +799,9 @@ future<json::json_return_type>
 rest_reset_cleanup_needed(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
        apilog.info("reset_cleanup_needed");
        co_await ss.invoke_on(0, [] (service::storage_service& ss) {
+            if (!ss.is_topology_coordinator_enabled()) {
+                throw std::runtime_error("mark_node_as_clean is only supported when topology over raft is enabled");
+            }
            return ss.reset_cleanup_needed();
        });
        co_return json_void();
@@ -1314,7 +1308,10 @@ rest_get_ownership(http_context& ctx, sharded<service::storage_service>& ss, std
            throw httpd::bad_param_exception("storage_service/ownership cannot be used when a keyspace uses tablets");
        }

-        co_return json::json_return_type(stream_range_as_array(co_await ss.local().get_ownership(), &map_to_json<gms::inet_address, float>));
+        return ss.local().get_ownership().then([] (auto&& ownership) {
+            std::vector<storage_service_json::mapper> res;
+            return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
+        });
 }

 static
@@ -1331,7 +1328,10 @@ rest_get_effective_ownership(http_context& ctx, sharded<service::storage_service
            }
        }

-        co_return json::json_return_type(stream_range_as_array(co_await ss.local().effective_ownership(keyspace_name, table_name), &map_to_json<gms::inet_address, float>));
+        return ss.local().effective_ownership(keyspace_name, table_name).then([] (auto&& ownership) {
+            std::vector<storage_service_json::mapper> res;
+            return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
+        });
 }

 static
@@ -1341,7 +1341,7 @@ rest_estimate_compression_ratios(http_context& ctx, sharded<service::storage_ser
        apilog.warn("estimate_compression_ratios: called before the cluster feature was enabled");
        throw std::runtime_error("estimate_compression_ratios requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
    }
-    auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
+    auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
    auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
    auto cf = api::req_param<sstring>(*req, "cf", {}).value;
    apilog.debug("estimate_compression_ratios: called with ks={} cf={}", ks, cf);
@@ -1407,7 +1407,7 @@ rest_retrain_dict(http_context& ctx, sharded<service::storage_service>& ss, serv
        apilog.warn("retrain_dict: called before the cluster feature was enabled");
        throw std::runtime_error("retrain_dict requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
    }
-    auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
+    auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
    auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
    auto cf = api::req_param<sstring>(*req, "cf", {}).value;
    apilog.debug("retrain_dict: called with ks={} cf={}", ks, cf);
@@ -1565,7 +1565,16 @@ rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::
 static
 future<json::json_return_type>
 rest_upgrade_to_raft_topology(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
-        apilog.info("Requested to schedule upgrade to raft topology, but this version does not need it since it uses raft topology by default.");
+        apilog.info("Requested to schedule upgrade to raft topology");
+        try {
+            co_await ss.invoke_on(0, [] (auto& ss) {
+                return ss.start_upgrade_to_raft_topology();
+            });
+        } catch (...) {
+            auto ex = std::current_exception();
+            apilog.error("Failed to schedule upgrade to raft topology: {}", ex);
+            std::rethrow_exception(std::move(ex));
+        }
        co_return json_void();
 }

@@ -2007,16 +2016,12 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        auto tag = req->get_query_param("tag");
        auto column_families = split(req->get_query_param("cf"), ",");
        auto sfopt = req->get_query_param("sf");
-        auto tcopt = req->get_query_param("tc");
-
-        db::snapshot_options opts = {
-            .skip_flush = strcasecmp(sfopt.c_str(), "true") == 0,
-        };
+        auto sf = db::snapshot_ctl::skip_flush(strcasecmp(sfopt.c_str(), "true") == 0);

        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
        try {
            if (column_families.empty()) {
-                co_await snap_ctl.local().take_snapshot(tag, keynames, opts);
+                co_await snap_ctl.local().take_snapshot(tag, keynames, sf);
            } else {
                if (keynames.empty()) {
                    throw httpd::bad_param_exception("The keyspace of column families must be specified");
@@ -2024,7 +2029,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
                if (keynames.size() > 1) {
                    throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
                }
-                co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, opts);
+                co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
            }
            co_return json_void();
        } catch (...) {
@@ -2033,27 +2038,6 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        }
    });

-    ss::take_cluster_snapshot.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        apilog.info("take_cluster_snapshot: {}", req->get_query_params());
-        auto tag = req->get_query_param("tag");
-        auto column_families = split(req->get_query_param("table"), ",");
-        // Note: not published/active. Retain as internal option, but...
-        auto sfopt = req->get_query_param("skip_flush");
-
-        db::snapshot_options opts = {
-            .skip_flush = strcasecmp(sfopt.c_str(), "true") == 0,
-        };
-
-        std::vector<sstring> keynames = split(req->get_query_param("keyspace"), ",");
-        try {
-            co_await snap_ctl.local().take_cluster_column_family_snapshot(keynames, column_families, tag, opts);
-            co_return json_void();
-        } catch (...) {
-            apilog.error("take_cluster_snapshot failed: {}", std::current_exception());
-            throw;
-        }
-    });
-
    ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        apilog.info("del_snapshot: {}", req->get_query_params());
        auto tag = req->get_query_param("tag");
@@ -2080,8 +2064,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        auto info = parse_scrub_options(ctx, std::move(req));

        if (!info.snapshot_tag.empty()) {
-            db::snapshot_options opts = {.skip_flush = false};
-            co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, opts);
+            co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, db::snapshot_ctl::skip_flush::no);
        }

        compaction::compaction_stats stats;
--- a/api/tasks.cc
+++ b/api/tasks.cc
@@ -146,8 +146,7 @@ void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::
        auto info = parse_scrub_options(ctx, std::move(req));

        if (!info.snapshot_tag.empty()) {
-            db::snapshot_options opts = {.skip_flush = false};
-            co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, opts);
+            co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, db::snapshot_ctl::skip_flush::no);
        }

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
--- a/audit/audit.cc
+++ b/audit/audit.cc
@@ -209,11 +209,15 @@ future<> audit::stop_audit() {
    });
 }

-audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch) {
+audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table) {
    if (!audit_instance().local_is_initialized()) {
        return nullptr;
    }
-    return std::make_unique<audit_info>(cat, keyspace, table, batch);
+    return std::make_unique<audit_info>(cat, keyspace, table);
+}
+
+audit_info_ptr audit::create_no_audit_info() {
+    return audit_info_ptr();
 }

 future<> audit::start(const db::config& cfg) {
@@ -263,21 +267,18 @@ future<> audit::log_login(const sstring& username, socket_address client_ip, boo
 }

 future<> inspect(shared_ptr<cql3::cql_statement> statement, service::query_state& query_state, const cql3::query_options& options, bool error) {
-    auto audit_info = statement->get_audit_info();
-    if (!audit_info) {
-        return make_ready_future<>();
-    }
-    if (audit_info->batch()) {
-        cql3::statements::batch_statement* batch = static_cast<cql3::statements::batch_statement*>(statement.get());
+    cql3::statements::batch_statement* batch = dynamic_cast<cql3::statements::batch_statement*>(statement.get());
+    if (batch != nullptr) {
        return do_for_each(batch->statements().begin(), batch->statements().end(), [&query_state, &options, error] (auto&& m) {
            return inspect(m.statement, query_state, options, error);
        });
    } else {
-        if (audit::local_audit_instance().should_log(audit_info)) {
+        auto audit_info = statement->get_audit_info();
+        if (bool(audit_info) && audit::local_audit_instance().should_log(audit_info)) {
            return audit::local_audit_instance().log(audit_info, query_state, options, error);
        }
-        return make_ready_future<>();
    }
+    return make_ready_future<>();
 }

 future<> inspect_login(const sstring& username, socket_address client_ip, bool error) {
--- a/audit/audit.hh
+++ b/audit/audit.hh
@@ -75,13 +75,11 @@ class audit_info final {
    sstring _keyspace;
    sstring _table;
    sstring _query;
-    bool _batch;
 public:
-    audit_info(statement_category cat, sstring keyspace, sstring table, bool batch)
+    audit_info(statement_category cat, sstring keyspace, sstring table)
        : _category(cat)
        , _keyspace(std::move(keyspace))
        , _table(std::move(table))
-        , _batch(batch)
    { }
    void set_query_string(const std::string_view& query_string) {
        _query = sstring(query_string);
@@ -91,7 +89,6 @@ public:
    const sstring& query() const { return _query; }
    sstring category_string() const;
    statement_category category() const { return _category; }
-    bool batch() const { return _batch; }
 };

 using audit_info_ptr = std::unique_ptr<audit_info>;
@@ -129,7 +126,8 @@ public:
    }
    static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
    static future<> stop_audit();
-    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
+    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table);
+    static audit_info_ptr create_no_audit_info();
    audit(locator::shared_token_metadata& stm,
          cql3::query_processor& qp,
          service::migration_manager& mm,
--- a/audit/audit_syslog_storage_helper.cc
+++ b/audit/audit_syslog_storage_helper.cc
@@ -53,10 +53,10 @@ static std::string json_escape(std::string_view str) {

 }

-future<> audit_syslog_storage_helper::syslog_send_helper(temporary_buffer<char> msg) {
+future<> audit_syslog_storage_helper::syslog_send_helper(const sstring& msg) {
    try {
        auto lock = co_await get_units(_semaphore, 1, std::chrono::hours(1));
-        co_await _sender.send(_syslog_address, std::span(&msg, 1));
+        co_await _sender.send(_syslog_address, net::packet{msg.data(), msg.size()});
    }
    catch (const std::exception& e) {
        auto error_msg = seastar::format(
@@ -90,7 +90,7 @@ future<> audit_syslog_storage_helper::start(const db::config& cfg) {
        co_return;
    }

-    co_await syslog_send_helper(temporary_buffer<char>::copy_of("Initializing syslog audit backend."));
+    co_await syslog_send_helper("Initializing syslog audit backend.");
 }

 future<> audit_syslog_storage_helper::stop() {
@@ -120,7 +120,7 @@ future<> audit_syslog_storage_helper::write(const audit_info* audit_info,
                                    audit_info->table(),
                                    username);

-    co_await syslog_send_helper(std::move(msg).release());
+    co_await syslog_send_helper(msg);
 }

 future<> audit_syslog_storage_helper::write_login(const sstring& username,
@@ -139,7 +139,7 @@ future<> audit_syslog_storage_helper::write_login(const sstring& username,
                                    client_ip,
                                    username);

-    co_await syslog_send_helper(std::move(msg).release());
+    co_await syslog_send_helper(msg.c_str());
 }

 }
--- a/audit/audit_syslog_storage_helper.hh
+++ b/audit/audit_syslog_storage_helper.hh
@@ -26,7 +26,7 @@ class audit_syslog_storage_helper : public storage_helper {
    net::datagram_channel _sender;
    seastar::semaphore _semaphore;

-    future<> syslog_send_helper(seastar::temporary_buffer<char> msg);
+    future<> syslog_send_helper(const sstring& msg);
 public:
    explicit audit_syslog_storage_helper(cql3::query_processor&, service::migration_manager&);
    virtual ~audit_syslog_storage_helper();
--- a/auth/CMakeLists.txt
+++ b/auth/CMakeLists.txt
@@ -17,6 +17,7 @@ target_sources(scylla_auth
    password_authenticator.cc
    passwords.cc
    permission.cc
+    permissions_cache.cc
    resource.cc
    role_or_anonymous.cc
    roles-metadata.cc
--- a/auth/cache.cc
+++ b/auth/cache.cc
@@ -8,7 +8,6 @@

 #include "auth/cache.hh"
 #include "auth/common.hh"
-#include "auth/role_or_anonymous.hh"
 #include "auth/roles-metadata.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/untyped_result_set.hh"
@@ -19,8 +18,6 @@
 #include <seastar/core/abort_source.hh>
 #include <seastar/coroutine/maybe_yield.hh>
 #include <seastar/core/format.hh>
-#include <seastar/core/metrics.hh>
-#include <seastar/core/do_with.hh>

 namespace auth {

@@ -30,21 +27,7 @@ cache::cache(cql3::query_processor& qp, abort_source& as) noexcept
    : _current_version(0)
    , _qp(qp)
    , _loading_sem(1)
-    , _as(as)
-    , _permission_loader(nullptr)
-    , _permission_loader_sem(8) {
-    namespace sm = seastar::metrics;
-    _metrics.add_group("auth_cache", {
-        sm::make_gauge("roles", [this] { return _roles.size(); },
-                sm::description("Number of roles currently cached")),
-        sm::make_gauge("permissions", [this] {
-            return _cached_permissions_count;
-        }, sm::description("Total number of permission sets currently cached across all roles"))
-    });
-}
-
-void cache::set_permission_loader(permission_loader_func loader) {
-    _permission_loader = std::move(loader);
+    , _as(as) {
 }

 lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
@@ -55,83 +38,6 @@ lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) cons
    return it->second;
 }

-future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
-    std::unordered_map<resource, permission_set>* perms_cache;
-    lw_shared_ptr<role_record> role_ptr;
-
-    if (is_anonymous(role)) {
-        perms_cache = &_anonymous_permissions;
-    } else {
-        const auto& role_name = *role.name;
-        auto role_it = _roles.find(role_name);
-        if (role_it == _roles.end()) {
-            // Role might have been deleted but there are some connections
-            // left which reference it. They should no longer have access to anything.
-            return make_ready_future<permission_set>(permissions::NONE);
-        }
-        role_ptr = role_it->second;
-        perms_cache = &role_ptr->cached_permissions;
-    }
-
-    if (auto it = perms_cache->find(r); it != perms_cache->end()) {
-        return make_ready_future<permission_set>(it->second);
-    }
-    // keep alive role_ptr as it holds perms_cache (except anonymous)
-    return do_with(std::move(role_ptr), [this, &role, &r, perms_cache] (auto& role_ptr) {
-        return load_permissions(role, r, perms_cache);
-    });
-}
-
-future<permission_set> cache::load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache) {
-    SCYLLA_ASSERT(_permission_loader);
-    auto units = co_await get_units(_permission_loader_sem, 1, _as);
-
-    // Check again, perhaps we were blocked and other call loaded
-    // the permissions already. This is a protection against misses storm.
-    if (auto it = perms_cache->find(r); it != perms_cache->end()) {
-        co_return it->second;
-    }
-    auto perms = co_await _permission_loader(role, r);
-    add_permissions(*perms_cache, r, perms);
-    co_return perms;
-}
-
-future<> cache::prune(const resource& r) {
-    auto units = co_await get_units(_loading_sem, 1, _as);
-    _anonymous_permissions.erase(r);
-    for (auto& it : _roles) {
-        // Prunning can run concurrently with other functions but it
-        // can only cause cached_permissions extra reload via get_permissions.
-        remove_permissions(it.second->cached_permissions, r);
-        co_await coroutine::maybe_yield();
-    }
-}
-
-future<> cache::reload_all_permissions() noexcept {
-    SCYLLA_ASSERT(_permission_loader);
-    auto units = co_await get_units(_loading_sem, 1, _as);
-    auto copy_keys = [] (const std::unordered_map<resource, permission_set>& m) {
-        std::vector<resource> keys;
-        keys.reserve(m.size());
-        for (const auto& [res, _] : m) {
-            keys.push_back(res);
-        }
-        return keys;
-    };
-    const role_or_anonymous anon;
-    for (const auto& res : copy_keys(_anonymous_permissions)) {
-        _anonymous_permissions[res] = co_await _permission_loader(anon, res);
-    }
-    for (auto& [role, entry] : _roles) {
-        auto& perms_cache = entry->cached_permissions;
-        auto r = role_or_anonymous(role);
-        for (const auto& res : copy_keys(perms_cache)) {
-            perms_cache[res] = co_await _permission_loader(r, res);
-        }
-    }
-    logger.debug("Reloaded auth cache with {} entries", _roles.size());
-}
-
 future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& role) const {
    auto rec = make_lw_shared<role_record>();
    rec->version = _current_version;
@@ -199,7 +105,7 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
 future<> cache::prune_all() noexcept {
    for (auto it = _roles.begin(); it != _roles.end(); ) {
        if (it->second->version != _current_version) {
-            remove_role(it++);
+            _roles.erase(it++);
            co_await coroutine::maybe_yield();
        } else {
            ++it;
@@ -223,7 +129,7 @@ future<> cache::load_all() {
        const auto name = r.get_as<sstring>("role");
        auto role = co_await fetch_role(name);
        if (role) {
-            add_role(name, role);
+            _roles[name] = role;
        }
        co_return stop_iteration::no;
    };
@@ -236,32 +142,11 @@ future<> cache::load_all() {
        co_await distribute_role(name, role);
    }
    co_await container().invoke_on_others([this](cache& c) -> future<> {
-        auto units = co_await get_units(c._loading_sem, 1, c._as);
        c._current_version = _current_version;
        co_await c.prune_all();
    });
 }

-future<> cache::gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name) {
-    if (!role) {
-        // Role might have been removed or not yet added, either way
-        // their members will be handled by another top call to this function.
-        co_return;
-    }
-    for (const auto& member_name : role->members) {
-        bool is_new = roles.insert(member_name).second;
-        if (!is_new) {
-            continue;
-        }
-        lw_shared_ptr<cache::role_record> member_role;
-        auto r = _roles.find(member_name);
-        if (r != _roles.end()) {
-            member_role = r->second;
-        }
-        co_await gather_inheriting_roles(roles, member_role, member_name);
-    }
-}
-
 future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
    if (legacy_mode(_qp)) {
        co_return;
@@ -269,41 +154,27 @@ future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
    SCYLLA_ASSERT(this_shard_id() == 0);
    auto units = co_await get_units(_loading_sem, 1, _as);

-    std::unordered_set<role_name_t> roles_to_clear_perms;
    for (const auto& name : roles) {
        logger.info("Loading role {}", name);
        auto role = co_await fetch_role(name);
         if (role) {
-            add_role(name, role);
-            co_await gather_inheriting_roles(roles_to_clear_perms, role, name);
+            _roles[name] = role;
        } else {
-            if (auto it = _roles.find(name); it != _roles.end()) {
-                auto old_role = it->second;
-                remove_role(it);
-                co_await gather_inheriting_roles(roles_to_clear_perms, old_role, name);
-            }
+            _roles.erase(name);
        }
        co_await distribute_role(name, role);
    }
-
-    co_await container().invoke_on_all([&roles_to_clear_perms] (cache& c) -> future<> {
-        for (const auto& name : roles_to_clear_perms) {
-            c.clear_role_permissions(name);
-            co_await coroutine::maybe_yield();
-        }
-    });
 }

 future<> cache::distribute_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
    auto role_ptr = role.get();
-    co_await container().invoke_on_others([&name, role_ptr](cache& c) -> future<> {
-        auto units = co_await get_units(c._loading_sem, 1, c._as);
+    co_await container().invoke_on_others([&name, role_ptr](cache& c) {
        if (!role_ptr) {
-            c.remove_role(name);
-            co_return;
+            c._roles.erase(name);
+            return;
        }
        auto role_copy = make_lw_shared<role_record>(*role_ptr);
-        c.add_role(name, std::move(role_copy));
+        c._roles[name] = std::move(role_copy);
    });
 }

@@ -314,40 +185,4 @@ bool cache::includes_table(const table_id& id) noexcept {
            || id == db::system_keyspace::role_permissions()->id();
 }

-void cache::add_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
-    if (auto it = _roles.find(name); it != _roles.end()) {
-        _cached_permissions_count -= it->second->cached_permissions.size();
-    }
-    _cached_permissions_count += role->cached_permissions.size();
-    _roles[name] = std::move(role);
-}
-
-void cache::remove_role(const role_name_t& name) {
-    if (auto it = _roles.find(name); it != _roles.end()) {
-        remove_role(it);
-    }
-}
-
-void cache::remove_role(roles_map::iterator it) {
-    _cached_permissions_count -= it->second->cached_permissions.size();
-    _roles.erase(it);
-}
-
-void cache::clear_role_permissions(const role_name_t& name) {
-    if (auto it = _roles.find(name); it != _roles.end()) {
-        _cached_permissions_count -= it->second->cached_permissions.size();
-        it->second->cached_permissions.clear();
-    }
-}
-
-void cache::add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms) {
-    if (cache.emplace(r, perms).second) {
-        ++_cached_permissions_count;
-    }
-}
-
-void cache::remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r) {
-    _cached_permissions_count -= cache.erase(r);
-}
-
 } // namespace auth
--- a/auth/cache.hh
+++ b/auth/cache.hh
@@ -17,14 +17,11 @@
 #include <seastar/core/sharded.hh>
 #include <seastar/core/shared_ptr.hh>
 #include <seastar/core/semaphore.hh>
-#include <seastar/core/metrics_registration.hh>

 #include <absl/container/flat_hash_map.h>

 #include "auth/permission.hh"
 #include "auth/common.hh"
-#include "auth/resource.hh"
-#include "auth/role_or_anonymous.hh"

 namespace cql3 { class query_processor; }

@@ -34,7 +31,6 @@ class cache : public peering_sharded_service<cache> {
 public:
    using role_name_t = sstring;
    using version_tag_t = char;
-    using permission_loader_func = std::function<future<permission_set>(const role_or_anonymous&, const resource&)>;

 	struct role_record {
        bool can_login = false;
@@ -44,19 +40,11 @@ public:
        sstring salted_hash;
        std::unordered_map<sstring, sstring> attributes;
        std::unordered_map<sstring, permission_set> permissions;
-    private:
-        friend cache;
-        // cached permissions include effects of role's inheritance
-        std::unordered_map<resource, permission_set> cached_permissions;
        version_tag_t version; // used for seamless cache reloads
    };

    explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
    lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
-    void set_permission_loader(permission_loader_func loader);
-    future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
-    future<> prune(const resource& r);
-    future<> reload_all_permissions() noexcept;
    future<> load_all();
    future<> load_roles(std::unordered_set<role_name_t> roles);
    static bool includes_table(const table_id&) noexcept;
@@ -64,31 +52,14 @@ public:
 private:
    using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
    roles_map _roles;
-    // anonymous permissions map exists mainly due to compatibility with
-    // higher layers which use role_or_anonymous to get permissions.
-    std::unordered_map<resource, permission_set> _anonymous_permissions;
    version_tag_t _current_version;
    cql3::query_processor& _qp;
-    semaphore _loading_sem; // protects iteration of _roles map
+    semaphore _loading_sem;
    abort_source& _as;
-    permission_loader_func _permission_loader;
-    semaphore _permission_loader_sem; // protects against reload storms on a single role change
-    metrics::metric_groups _metrics;
-    size_t _cached_permissions_count = 0;

    future<lw_shared_ptr<role_record>> fetch_role(const role_name_t& role) const;
    future<> prune_all() noexcept;
    future<> distribute_role(const role_name_t& name, const lw_shared_ptr<role_record> role);
-    future<> gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name);
-
-    void add_role(const role_name_t& name, lw_shared_ptr<role_record> role);
-    void remove_role(const role_name_t& name);
-    void remove_role(roles_map::iterator it);
-    void clear_role_permissions(const role_name_t& name);
-    void add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms);
-    void remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r);
-
-    future<permission_set> load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache);
 };

 } // namespace auth
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -88,16 +88,10 @@ static const class_registrator<

 ldap_role_manager::ldap_role_manager(
        std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
-        uint32_t permissions_update_interval_in_ms,
-        utils::observer<uint32_t>  permissions_update_interval_in_ms_observer,
        cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
        : _std_mgr(qp, rg0c, mm, cache), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
        , _bind_password(bind_password)
-        , _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
-        , _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
-        , _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
-        , _cache(cache)
-        , _cache_pruner(make_ready_future<>()) {
+        , _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this))) {
 }

 ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
@@ -106,8 +100,6 @@ ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_
            qp.db().get_config().ldap_attr_role(),
            qp.db().get_config().ldap_bind_dn(),
            qp.db().get_config().ldap_bind_passwd(),
-            qp.db().get_config().permissions_update_interval_in_ms(),
-            qp.db().get_config().permissions_update_interval_in_ms.observe([this] (const uint32_t& v) { _permissions_update_interval_in_ms = v; }),
            qp,
            rg0c,
            mm,
@@ -127,22 +119,6 @@ future<> ldap_role_manager::start() {
        return make_exception_future(
                std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
    }
-    _cache_pruner = futurize_invoke([this] () -> future<> {
-        while (true) {
-            try {
-                co_await seastar::sleep_abortable(std::chrono::milliseconds(_permissions_update_interval_in_ms), _as);
-            } catch (const seastar::sleep_aborted&) {
-                co_return; // ignore
-            }
-            co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
-                try {
-                    co_await c.reload_all_permissions();
-                } catch (...) {
-                    mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
-                }
-            });
-        }
-    });
    return _std_mgr.start();
 }

@@ -199,11 +175,7 @@ future<conn_ptr> ldap_role_manager::reconnect() {

 future<> ldap_role_manager::stop() {
    _as.request_abort();
-    return std::move(_cache_pruner).then([this] {
-        return _std_mgr.stop();
-    }).then([this] {
-        return _connection_factory.stop();
-    });
+    return _std_mgr.stop().then([this] { return _connection_factory.stop(); });
 }

 future<> ldap_role_manager::create(std::string_view name, const role_config& config, ::service::group0_batch& mc) {
--- a/auth/ldap_role_manager.hh
+++ b/auth/ldap_role_manager.hh
@@ -10,7 +10,6 @@
 #pragma once

 #include <seastar/core/abort_source.hh>
-#include <seastar/core/future.hh>
 #include <stdexcept>

 #include "ent/ldap/ldap_connection.hh"
@@ -35,22 +34,14 @@ class ldap_role_manager : public role_manager {
    seastar::sstring _target_attr; ///< LDAP entry attribute containing the Scylla role name.
    seastar::sstring _bind_name; ///< Username for LDAP simple bind.
    seastar::sstring _bind_password; ///< Password for LDAP simple bind.
-
-    uint32_t _permissions_update_interval_in_ms;
-    utils::observer<uint32_t> _permissions_update_interval_in_ms_observer;
-
    mutable ldap_reuser _connection_factory; // Potentially modified by query_granted().
    seastar::abort_source _as;
-    cache& _cache;
-    seastar::future<> _cache_pruner;
  public:
    ldap_role_manager(
            std::string_view query_template, ///< LDAP query template as described in Scylla documentation.
            std::string_view target_attr, ///< LDAP entry attribute containing the Scylla role name.
            std::string_view bind_name, ///< LDAP bind credentials.
            std::string_view bind_password, ///< LDAP bind credentials.
-            uint32_t permissions_update_interval_in_ms,
-            utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
            cql3::query_processor& qp, ///< Passed to standard_role_manager.
            ::service::raft_group0_client& rg0c, ///< Passed to standard_role_manager.
            ::service::migration_manager& mm, ///< Passed to standard_role_manager.
--- a/auth/permissions_cache.cc
+++ b/auth/permissions_cache.cc
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2017-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#include "auth/permissions_cache.hh"
+
+#include <fmt/ranges.h>
+#include "auth/authorizer.hh"
+#include "auth/service.hh"
+
+namespace auth {
+
+permissions_cache::permissions_cache(const utils::loading_cache_config& c, service& ser, logging::logger& log)
+        : _cache(c, log, [&ser, &log](const key_type& k) {
+              log.debug("Refreshing permissions for {}", k.first);
+              return ser.get_uncached_permissions(k.first, k.second);
+          }) {
+}
+
+bool permissions_cache::update_config(utils::loading_cache_config c) {
+    return _cache.update_config(std::move(c));
+}
+
+void permissions_cache::reset() {
+    _cache.reset();
+}
+
+future<permission_set> permissions_cache::get(const role_or_anonymous& maybe_role, const resource& r) {
+    return do_with(key_type(maybe_role, r), [this](const auto& k) {
+        return _cache.get(k);
+    });
+}
+
+}
--- a/auth/permissions_cache.hh
+++ b/auth/permissions_cache.hh
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2017-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <iostream>
+#include <utility>
+
+#include <fmt/core.h>
+#include <seastar/core/future.hh>
+
+#include "auth/permission.hh"
+#include "auth/resource.hh"
+#include "auth/role_or_anonymous.hh"
+#include "utils/log.hh"
+#include "utils/hash.hh"
+#include "utils/loading_cache.hh"
+
+namespace std {
+
+inline std::ostream& operator<<(std::ostream& os, const pair<auth::role_or_anonymous, auth::resource>& p) {
+    fmt::print(os, "{{role: {}, resource: {}}}", p.first, p.second);
+    return os;
+}
+
+}
+
+namespace db {
+class config;
+}
+
+namespace auth {
+
+class service;
+
+class permissions_cache final {
+    using cache_type = utils::loading_cache<
+            std::pair<role_or_anonymous, resource>,
+            permission_set,
+            1,
+            utils::loading_cache_reload_enabled::yes,
+            utils::simple_entry_size<permission_set>,
+            utils::tuple_hash>;
+
+    using key_type = typename cache_type::key_type;
+
+    cache_type _cache;
+
+public:
+    explicit permissions_cache(const utils::loading_cache_config&, service&, logging::logger&);
+
+    future <> stop() {
+        return _cache.stop();
+    }
+
+    bool update_config(utils::loading_cache_config);
+    void reset();
+    future<permission_set> get(const role_or_anonymous&, const resource&);
+};
+
+}
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -64,11 +64,11 @@ static const sstring superuser_col_name("super");
 static logging::logger log("auth_service");

 class auth_migration_listener final : public ::service::migration_listener {
-    service& _service;
+    authorizer& _authorizer;
    cql3::query_processor& _qp;

 public:
-    explicit auth_migration_listener(service& s, cql3::query_processor& qp) : _service(s),  _qp(qp) {
+    explicit auth_migration_listener(authorizer& a, cql3::query_processor& qp) : _authorizer(a),  _qp(qp) {
    }

 private:
@@ -92,14 +92,14 @@ private:
            return;
        }
        // Do it in the background.
-        (void)do_with(auth::make_data_resource(ks_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
-            return _service.revoke_all(r, mc);
+        (void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
+            return _authorizer.revoke_all(auth::make_data_resource(ks_name), mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped keyspace: {}", e);
        });

-        (void)do_with(auth::make_functions_resource(ks_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
-            return _service.revoke_all(r, mc);
+        (void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
+            return _authorizer.revoke_all(auth::make_functions_resource(ks_name), mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on functions in dropped keyspace: {}", e);
        });
@@ -111,8 +111,9 @@ private:
            return;
        }
        // Do it in the background.
-        (void)do_with(auth::make_data_resource(ks_name, cf_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
-            return _service.revoke_all(r, mc);
+        (void)do_with(::service::group0_batch::unused(), [this, &ks_name, &cf_name] (auto& mc) mutable {
+            return _authorizer.revoke_all(
+                    auth::make_data_resource(ks_name, cf_name), mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped table: {}", e);
        });
@@ -125,8 +126,9 @@ private:
            return;
        }
        // Do it in the background.
-        (void)do_with(auth::make_functions_resource(ks_name, function_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
-            return _service.revoke_all(r, mc);
+        (void)do_with(::service::group0_batch::unused(), [this, &ks_name, &function_name] (auto& mc) mutable {
+            return _authorizer.revoke_all(
+                    auth::make_functions_resource(ks_name, function_name), mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped function: {}", e);
        });
@@ -136,8 +138,9 @@ private:
            // in non legacy path revoke is part of schema change statement execution
            return;
        }
-        (void)do_with(auth::make_functions_resource(ks_name, aggregate_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
-            return _service.revoke_all(r, mc);
+        (void)do_with(::service::group0_batch::unused(), [this, &ks_name, &aggregate_name] (auto& mc) mutable {
+            return _authorizer.revoke_all(
+                    auth::make_functions_resource(ks_name, aggregate_name), mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped aggregate: {}", e);
        });
@@ -154,6 +157,7 @@ static future<> validate_role_exists(const service& ser, std::string_view role_n
 }

 service::service(
+        utils::loading_cache_config c,
        cache& cache,
        cql3::query_processor& qp,
        ::service::raft_group0_client& g0,
@@ -162,17 +166,25 @@ service::service(
        std::unique_ptr<authenticator> a,
        std::unique_ptr<role_manager> r,
        maintenance_socket_enabled used_by_maintenance_socket)
-            : _cache(cache)
+            : _loading_cache_config(std::move(c))
+            , _permissions_cache(nullptr)
+            , _cache(cache)
            , _qp(qp)
            , _group0_client(g0)
            , _mnotifier(mn)
            , _authorizer(std::move(z))
            , _authenticator(std::move(a))
            , _role_manager(std::move(r))
-            , _migration_listener(std::make_unique<auth_migration_listener>(*this, qp))
+            , _migration_listener(std::make_unique<auth_migration_listener>(*_authorizer, qp))
+            , _permissions_cache_cfg_cb([this] (uint32_t) { (void) _permissions_cache_config_action.trigger_later(); })
+            , _permissions_cache_config_action([this] { update_cache_config(); return make_ready_future<>(); })
+            , _permissions_cache_max_entries_observer(_qp.db().get_config().permissions_cache_max_entries.observe(_permissions_cache_cfg_cb))
+            , _permissions_cache_update_interval_in_ms_observer(_qp.db().get_config().permissions_update_interval_in_ms.observe(_permissions_cache_cfg_cb))
+            , _permissions_cache_validity_in_ms_observer(_qp.db().get_config().permissions_validity_in_ms.observe(_permissions_cache_cfg_cb))
            , _used_by_maintenance_socket(used_by_maintenance_socket) {}

 service::service(
+        utils::loading_cache_config c,
        cql3::query_processor& qp,
        ::service::raft_group0_client& g0,
        ::service::migration_notifier& mn,
@@ -181,6 +193,7 @@ service::service(
        maintenance_socket_enabled used_by_maintenance_socket,
        cache& cache)
            : service(
+                      std::move(c),
                      cache,
                      qp,
                      g0,
@@ -244,14 +257,7 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
        co_await _role_manager->ensure_superuser_is_created();
    }
    co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
-    if (!_used_by_maintenance_socket) {
-        // Maintenance socket mode can't cache permissions because it has
-        // different authorizer. We can't mix cached permissions, they could be
-        // different in normal mode.
-        _cache.set_permission_loader(std::bind(
-                &service::get_uncached_permissions,
-                this, std::placeholders::_1, std::placeholders::_2));
-    }
+    _permissions_cache = std::make_unique<permissions_cache>(_loading_cache_config, *this, log);
    co_await once_among_shards([this] {
        _mnotifier.register_listener(_migration_listener.get());
        return make_ready_future<>();
@@ -263,7 +269,9 @@ future<> service::stop() {
    // Only one of the shards has the listener registered, but let's try to
    // unregister on each one just to make sure.
    return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
-        _cache.set_permission_loader(nullptr);
+        if (_permissions_cache) {
+            return _permissions_cache->stop();
+        }
        return make_ready_future<>();
    }).then([this] {
        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
@@ -275,8 +283,21 @@ future<> service::ensure_superuser_is_created() {
    co_await _authenticator->ensure_superuser_is_created();
 }

+void service::update_cache_config() {
+    auto db = _qp.db();
+
+    utils::loading_cache_config perm_cache_config;
+    perm_cache_config.max_size = db.get_config().permissions_cache_max_entries();
+    perm_cache_config.expiry = std::chrono::milliseconds(db.get_config().permissions_validity_in_ms());
+    perm_cache_config.refresh = std::chrono::milliseconds(db.get_config().permissions_update_interval_in_ms());
+
+    if (!_permissions_cache->update_config(std::move(perm_cache_config))) {
+        log.error("Failed to apply permissions cache changes. Please read the documentation of these parameters");
+    }
+}

 void service::reset_authorization_cache() {
+    _permissions_cache->reset();
    _qp.reset_cache();
 }

@@ -301,10 +322,7 @@ service::get_uncached_permissions(const role_or_anonymous& maybe_role, const res
 }

 future<permission_set> service::get_permissions(const role_or_anonymous& maybe_role, const resource& r) const {
-    if (legacy_mode(_qp) || _used_by_maintenance_socket) {
-        return get_uncached_permissions(maybe_role, r);
-    }
-    return _cache.get_permissions(maybe_role, r);
+    return _permissions_cache->get(maybe_role, r);
 }

 future<bool> service::has_superuser(std::string_view role_name, const role_set& roles) const {
@@ -429,11 +447,6 @@ future<bool> service::exists(const resource& r) const {
    return make_ready_future<bool>(false);
 }

-future<> service::revoke_all(const resource& r, ::service::group0_batch& mc) const {
-    co_await _authorizer->revoke_all(r, mc);
-    co_await _cache.prune(r);
-}
-
 future<std::vector<cql3::description>> service::describe_roles(bool with_hashed_passwords) {
    std::vector<cql3::description> result{};

@@ -788,7 +801,7 @@ future<> revoke_permissions(
 }

 future<> revoke_all(const service& ser, const resource& r, ::service::group0_batch& mc) {
-    return ser.revoke_all(r, mc);
+    return ser.underlying_authorizer().revoke_all(r, mc);
 }

 future<std::vector<permission_details>> list_filtered_permissions(
@@ -863,6 +876,22 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
                continue; // some tables might not have been created if they were not used
            }

+            // use longer than usual timeout as we scan the whole table
+            // but not infinite or very long as we want to fail reasonably fast
+            const auto t = 5min;
+            const timeout_config tc{t, t, t, t, t, t, t};
+            ::service::client_state cs(::service::client_state::internal_tag{}, tc);
+            ::service::query_state qs(cs, empty_service_permit());
+
+            auto rows = co_await qp.execute_internal(
+                    seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, cf_name),
+                    db::consistency_level::ALL,
+                    qs,
+                    {},
+                    cql3::query_processor::cache_internal::no);
+            if (rows->empty()) {
+                continue;
+            }
            std::vector<sstring> col_names;
            for (const auto& col : schema->all_columns()) {
                col_names.push_back(col.name_as_cql_string());
@@ -871,51 +900,30 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
            for (size_t i = 1; i < col_names.size(); ++i) {
                val_binders_str += ", ?";
            }
-
-            std::vector<mutation> collected;
-            // use longer than usual timeout as we scan the whole table
-            // but not infinite or very long as we want to fail reasonably fast
-            const auto t = 5min;
-            const timeout_config tc{t, t, t, t, t, t, t};
-            ::service::client_state cs(::service::client_state::internal_tag{}, tc);
-            ::service::query_state qs(cs, empty_service_permit());
-
-            co_await qp.query_internal(
-                seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, cf_name),
-                db::consistency_level::ALL,
-                {},
-                1000,
-                [&qp, &cf_name, &col_names, &val_binders_str, &schema, ts, &collected] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
-                    std::vector<data_value_or_unset> values;
-                    for (const auto& col : schema->all_columns()) {
-                        if (row.has(col.name_as_text())) {
-                            values.push_back(
-                                    col.type->deserialize(row.get_blob_unfragmented(col.name_as_text())));
-                        } else {
-                            values.push_back(unset_value{});
-                        }
+            for (const auto& row : *rows) {
+                std::vector<data_value_or_unset> values;
+                for (const auto& col : schema->all_columns()) {
+                    if (row.has(col.name_as_text())) {
+                        values.push_back(
+                                col.type->deserialize(row.get_blob_unfragmented(col.name_as_text())));
+                    } else {
+                        values.push_back(unset_value{});
                    }
-                    auto muts = co_await qp.get_mutations_internal(
-                            seastar::format("INSERT INTO {}.{} ({}) VALUES ({})",
-                                    db::system_keyspace::NAME,
-                                    cf_name,
-                                    fmt::join(col_names, ", "),
-                                    val_binders_str),
-                            internal_distributed_query_state(),
-                            ts,
-                            std::move(values));
-                    if (muts.size() != 1) {
-                        on_internal_error(log,
-                                format("expecting single insert mutation, got {}", muts.size()));
-                    }
-
-                    collected.push_back(std::move(muts[0]));
-                    co_return stop_iteration::no;
-                },
-                std::move(qs));
-
-            for (auto& m : collected) {
-                co_yield std::move(m);
+                }
+                auto muts = co_await qp.get_mutations_internal(
+                        seastar::format("INSERT INTO {}.{} ({}) VALUES ({})",
+                                db::system_keyspace::NAME,
+                                cf_name,
+                                fmt::join(col_names, ", "),
+                                val_binders_str),
+                        internal_distributed_query_state(),
+                        ts,
+                        std::move(values));
+                if (muts.size() != 1) {
+                    on_internal_error(log,
+                            format("expecting single insert mutation, got {}", muts.size()));
+                }
+                co_yield std::move(muts[0]);
            }
        }
        co_yield co_await sys_ks.make_auth_version_mutation(ts,
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -20,6 +20,7 @@
 #include "auth/authenticator.hh"
 #include "auth/authorizer.hh"
 #include "auth/permission.hh"
+#include "auth/permissions_cache.hh"
 #include "auth/cache.hh"
 #include "auth/role_manager.hh"
 #include "auth/common.hh"
@@ -74,6 +75,8 @@ public:
 /// peering_sharded_service inheritance is needed to be able to access shard local authentication service
 /// given an object from another shard. Used for bouncing lwt requests to correct shard.
 class service final : public seastar::peering_sharded_service<service> {
+    utils::loading_cache_config _loading_cache_config;
+    std::unique_ptr<permissions_cache> _permissions_cache;
    cache& _cache;

    cql3::query_processor& _qp;
@@ -91,12 +94,20 @@ class service final : public seastar::peering_sharded_service<service> {
    // Only one of these should be registered, so we end up with some unused instances. Not the end of the world.
    std::unique_ptr<::service::migration_listener> _migration_listener;

+    std::function<void(uint32_t)> _permissions_cache_cfg_cb;
+    serialized_action _permissions_cache_config_action;
+
+    utils::observer<uint32_t> _permissions_cache_max_entries_observer;
+    utils::observer<uint32_t> _permissions_cache_update_interval_in_ms_observer;
+    utils::observer<uint32_t> _permissions_cache_validity_in_ms_observer;
+
    maintenance_socket_enabled _used_by_maintenance_socket;

    abort_source _as;

 public:
    service(
+            utils::loading_cache_config,
            cache& cache,
            cql3::query_processor&,
            ::service::raft_group0_client&,
@@ -112,6 +123,7 @@ public:
    /// of the instances themselves.
    ///
    service(
+            utils::loading_cache_config,
            cql3::query_processor&,
            ::service::raft_group0_client&,
            ::service::migration_notifier&,
@@ -126,6 +138,8 @@ public:

    future<> ensure_superuser_is_created();

+    void update_cache_config();
+
    void reset_authorization_cache();

    ///
@@ -167,13 +181,6 @@ public:

    future<bool> exists(const resource&) const;

-    ///
-    /// Revoke all permissions granted to any role for a particular resource.
-    ///
-    /// \throws \ref unsupported_authorization_operation if revoking permissions is not supported.
-    ///
-    future<> revoke_all(const resource&, ::service::group0_batch&) const;
-
    ///
    /// Produces descriptions that can be used to restore the state of auth. That encompasses
    /// roles, role grants, and permission grants.
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -52,6 +52,13 @@ static const class_registrator<
        ::service::migration_manager&,
        cache&> registration("org.apache.cassandra.auth.CassandraRoleManager");

+struct record final {
+    sstring name;
+    bool is_superuser;
+    bool can_login;
+    role_set member_of;
+};
+
 static db::consistency_level consistency_for_role(std::string_view role_name) noexcept {
    if (role_name == meta::DEFAULT_SUPERUSER_NAME) {
        return db::consistency_level::QUORUM;
@@ -60,13 +67,13 @@ static db::consistency_level consistency_for_role(std::string_view role_name) no
    return db::consistency_level::LOCAL_ONE;
 }

-future<std::optional<standard_role_manager::record>> standard_role_manager::legacy_find_record(std::string_view role_name) {
+static future<std::optional<record>> find_record(cql3::query_processor& qp, std::string_view role_name) {
    const sstring query = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
-            get_auth_ks_name(_qp),
+            get_auth_ks_name(qp),
            meta::roles_table::name,
            meta::roles_table::role_col_name);

-    const auto results = co_await _qp.execute_internal(
+    const auto results = co_await qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_query_state(),
@@ -86,25 +93,8 @@ future<std::optional<standard_role_manager::record>> standard_role_manager::lega
                        : role_set())});
 }

-future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
-    if (legacy_mode(_qp)) {
-        return legacy_find_record(role_name);
-    }
-    auto name = sstring(role_name);
-    auto role = _cache.get(name);
-    if (!role) {
-        return make_ready_future<std::optional<record>>(std::nullopt);
-    }
-    return make_ready_future<std::optional<record>>(std::make_optional(record{
-        .name = std::move(name),
-        .is_superuser = role->is_superuser,
-        .can_login = role->can_login,
-        .member_of = role->member_of
-    }));
-}
-
-future<standard_role_manager::record> standard_role_manager::require_record(std::string_view role_name) {
-    return find_record(role_name).then([role_name](std::optional<record> mr) {
+static future<record> require_record(cql3::query_processor& qp, std::string_view role_name) {
+    return find_record(qp, role_name).then([role_name](std::optional<record> mr) {
        if (!mr) {
            throw nonexistant_role(role_name);
        }
@@ -396,7 +386,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
        return fmt::to_string(fmt::join(assignments, ", "));
    };

-    return require_record(role_name).then([this, role_name, &u, &mc](record) {
+    return require_record(_qp, role_name).then([this, role_name, &u, &mc](record) {
        if (!u.is_superuser && !u.can_login) {
            return make_ready_future<>();
        }
@@ -630,17 +620,18 @@ standard_role_manager::revoke(std::string_view revokee_name, std::string_view ro
    });
 }

-future<> standard_role_manager::collect_roles(
+static future<> collect_roles(
+        cql3::query_processor& qp,
        std::string_view grantee_name,
        bool recurse,
        role_set& roles) {
-    return require_record(grantee_name).then([this, &roles, recurse](standard_role_manager::record r) {
-        return do_with(std::move(r.member_of), [this, &roles, recurse](const role_set& memberships) {
-            return do_for_each(memberships.begin(), memberships.end(), [this, &roles, recurse](const sstring& role_name) {
+    return require_record(qp, grantee_name).then([&qp, &roles, recurse](record r) {
+        return do_with(std::move(r.member_of), [&qp, &roles, recurse](const role_set& memberships) {
+            return do_for_each(memberships.begin(), memberships.end(), [&qp, &roles, recurse](const sstring& role_name) {
                roles.insert(role_name);

                if (recurse) {
-                    return collect_roles(role_name, true, roles);
+                    return collect_roles(qp, role_name, true, roles);
                }

                return make_ready_future<>();
@@ -655,7 +646,7 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
    return do_with(
            role_set{sstring(grantee_name)},
            [this, grantee_name, recurse](role_set& roles) {
-        return collect_roles(grantee_name, recurse, roles).then([&roles] { return roles; });
+        return collect_roles(_qp, grantee_name, recurse, roles).then([&roles] { return roles; });
    });
 }

@@ -715,21 +706,27 @@ future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
 }

 future<bool> standard_role_manager::exists(std::string_view role_name) {
-    return find_record(role_name).then([](std::optional<record> mr) {
+    return find_record(_qp, role_name).then([](std::optional<record> mr) {
        return static_cast<bool>(mr);
    });
 }

 future<bool> standard_role_manager::is_superuser(std::string_view role_name) {
-    return require_record(role_name).then([](record r) {
+    return require_record(_qp, role_name).then([](record r) {
        return r.is_superuser;
    });
 }

 future<bool> standard_role_manager::can_login(std::string_view role_name) {
-    return require_record(role_name).then([](record r) {
-        return r.can_login;
-    });
+    if (legacy_mode(_qp)) {
+       const auto r = co_await require_record(_qp, role_name);
+       co_return r.can_login;
+    }
+    auto role = _cache.get(sstring(role_name));
+    if (!role) {
+        throw nonexistant_role(role_name);
+    }
+    co_return role->can_login;
 }

 future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -90,12 +90,6 @@ public:

 private:
    enum class membership_change { add, remove };
-    struct record final {
-        sstring name;
-        bool is_superuser;
-        bool can_login;
-        role_set member_of;
-    };

    future<> create_legacy_metadata_tables_if_missing() const;

@@ -113,14 +107,6 @@ private:
    future<> legacy_modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change);

    future<> modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change, ::service::group0_batch& mc);
-
-    future<std::optional<record>> legacy_find_record(std::string_view role_name);
-    future<std::optional<record>> find_record(std::string_view role_name);
-    future<record> require_record(std::string_view role_name);
-    future<> collect_roles(
-            std::string_view grantee_name,
-            bool recurse,
-            role_set& roles);
 };

 } // namespace auth
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -814,7 +814,8 @@ generation_service::generation_service(
            config cfg, gms::gossiper& g, sharded<db::system_distributed_keyspace>& sys_dist_ks,
            sharded<db::system_keyspace>& sys_ks,
            abort_source& abort_src, const locator::shared_token_metadata& stm, gms::feature_service& f,
-            replica::database& db)
+            replica::database& db,
+            std::function<bool()> raft_topology_change_enabled)
        : _cfg(std::move(cfg))
        , _gossiper(g)
        , _sys_dist_ks(sys_dist_ks)
@@ -823,6 +824,7 @@ generation_service::generation_service(
        , _token_metadata(stm)
        , _feature_service(f)
        , _db(db)
+        , _raft_topology_change_enabled(std::move(raft_topology_change_enabled))
 {
 }

@@ -876,7 +878,16 @@ future<> generation_service::on_join(gms::inet_address ep, locator::host_id id,
 future<> generation_service::on_change(gms::inet_address ep, locator::host_id id, const gms::application_state_map& states, gms::permit_id pid) {
    assert_shard_zero(__PRETTY_FUNCTION__);

-    return make_ready_future<>();
+    if (_raft_topology_change_enabled()) {
+        return make_ready_future<>();
+    }
+
+    return on_application_state_change(ep, id, states, gms::application_state::CDC_GENERATION_ID, pid, [this] (gms::inet_address ep, locator::host_id id, const gms::versioned_value& v, gms::permit_id) {
+        auto gen_id = gms::versioned_value::cdc_generation_id_from_string(v.value());
+        cdc_log.debug("Endpoint: {}, CDC generation ID change: {}", ep, gen_id);
+
+        return legacy_handle_cdc_generation(gen_id);
+    });
 }

 future<> generation_service::check_and_repair_cdc_streams() {
--- a/cdc/generation_service.hh
+++ b/cdc/generation_service.hh
@@ -79,12 +79,17 @@ private:
    std::optional<cdc::generation_id> _gen_id;
    future<> _cdc_streams_rewrite_complete = make_ready_future<>();

+    /* Returns true if raft topology changes are enabled.
+     * Can only be called from shard 0.
+     */
+    std::function<bool()> _raft_topology_change_enabled;
 public:
    generation_service(config cfg, gms::gossiper&,
            sharded<db::system_distributed_keyspace>&,
            sharded<db::system_keyspace>& sys_ks,
            abort_source&, const locator::shared_token_metadata&,
-            gms::feature_service&, replica::database& db);
+            gms::feature_service&, replica::database& db,
+            std::function<bool()> raft_topology_change_enabled);

    future<> stop();
    ~generation_service();
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -1519,9 +1519,7 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
            | std::views::transform(std::mem_fn(&sstables::sstable::run_identifier))
            | std::ranges::to<std::unordered_set>());
    };
-    const auto injected_threshold = utils::get_local_injector().inject_parameter<size_t>("set_sstable_count_reduction_threshold");
-    const auto threshold = injected_threshold.value_or(size_t(std::max(schema->max_compaction_threshold(), 32)));
-
+    const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
    auto count = co_await num_runs_for_compaction();
    if (count <= threshold) {
        cmlog.trace("No need to wait for sstable count reduction in {}: {} <= {}",
@@ -1536,7 +1534,9 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
    auto& cstate = get_compaction_state(&t);
    try {
        while (can_perform_regular_compaction(t) && co_await num_runs_for_compaction() > threshold) {
-            co_await cstate.compaction_done.when();
+            co_await cstate.compaction_done.wait([this, &t] {
+                return !can_perform_regular_compaction(t);
+            });
        }
    } catch (const broken_condition_variable&) {
        co_return;
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -299,11 +299,13 @@ batch_size_fail_threshold_in_kb: 1024
 # max_hint_window_in_ms: 10800000 # 3 hours


-# Validity period for authorized statements cache. Defaults to 10000, set to 0 to disable.
+# Validity period for permissions cache (fetching permissions can be an
+# expensive operation depending on the authorizer, CassandraAuthorizer is
+# one example). Defaults to 10000, set to 0 to disable.
 # Will be disabled automatically for AllowAllAuthorizer.
 # permissions_validity_in_ms: 10000

-# Refresh interval for authorized statements cache.
+# Refresh interval for permissions cache (if enabled).
 # After this interval, cache entries become eligible for refresh. Upon next
 # access, an async reload is scheduled and the old value returned until it
 # completes. If permissions_validity_in_ms is non-zero, then this also must have
@@ -564,16 +566,15 @@ commitlog_total_space_in_mb: -1
 # prometheus_address: 1.2.3.4

 # audit settings
-# Table audit is enabled by default.
+# By default, Scylla does not audit anything.
 # 'audit' config option controls if and where to output audited events:
-#   - "none": auditing is disabled
-#   - "table": save audited events in audit.audit_log column family (default)
+#   - "none": auditing is disabled (default)
+#   - "table": save audited events in audit.audit_log column family
 #   - "syslog": send audited events via syslog (depends on OS, but usually to /dev/log)
 audit: "table"
 #
 # List of statement categories that should be audited.
-# Possible categories are: QUERY, DML, DCL, DDL, AUTH, ADMIN
-audit_categories: "DCL,AUTH,ADMIN"
+audit_categories: "DCL,DDL,AUTH,ADMIN"
 #
 # List of tables that should be audited.
 # audit_tables: "<keyspace_name>.<table_name>,<keyspace_name>.<table_name>"
@@ -874,16 +875,7 @@ maintenance_socket: ignore
 # The `tablets` option cannot be changed using `ALTER KEYSPACE`.
 tablets_mode_for_new_keyspaces: enabled

-# Require every tablet-enabled keyspace to be RF-rack-valid.
-#
-# A tablet-enabled keyspace is RF-rack-valid when, for each data center,
-# its replication factor (RF) is 0, 1, or exactly equal to the number of
-# racks in that data center. Setting the RF to the number of racks ensures
-# that a single rack failure never results in data unavailability.
-#
-# When set to true, CREATE KEYSPACE and ALTER KEYSPACE statements that
-# would produce an RF-rack-invalid keyspace are rejected.
-# When set to false, such statements are allowed but emit a warning.
+# Enforce RF-rack-valid keyspaces.
 rf_rack_valid_keyspaces: false

 #
--- a/configure.py
+++ b/configure.py
@@ -725,9 +725,29 @@ raft_tests = set([
 vector_search_tests = set([
    'test/vector_search/vector_store_client_test',
    'test/vector_search/load_balancer_test',
-    'test/vector_search/client_test',
-    'test/vector_search/filter_test',
-    'test/vector_search/rescoring_test'
+    'test/vector_search/client_test'
+])
+
+vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
+vector_search_validator_deps = set([
+    'test/vector_search_validator/build-validator',
+    'test/vector_search_validator/Cargo.toml',
+    'test/vector_search_validator/crates/validator/Cargo.toml',
+    'test/vector_search_validator/crates/validator/src/main.rs',
+    'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
+    'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
+    'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
+])
+
+vector_store_bin = 'vector-search-validator/bin/vector-store'
+vector_store_deps = set([
+    'test/vector_search_validator/build-env',
+    'test/vector_search_validator/build-vector-store',
+])
+
+vector_search_validator_bins = set([
+    vector_search_validator_bin,
+    vector_store_bin,
 ])

 wasms = set([
@@ -763,7 +783,7 @@ other = set([
    'iotune',
 ])

-all_artifacts = apps | cpp_apps | tests | other | wasms
+all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins

 arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -795,9 +815,6 @@ arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clan
                        help='C compiler path')
 arg_parser.add_argument('--compiler-cache', action='store', dest='compiler_cache', default='auto',
                        help='Compiler cache to use: auto (default, prefers sccache), sccache, ccache, none, or a path to a binary')
-# Workaround for https://github.com/mozilla/sccache/issues/2575
-arg_parser.add_argument('--sccache-rust', action=argparse.BooleanOptionalAction, default=False,
-                        help='Use sccache for rust code (if sccache is selected as compiler cache). Doesn\'t work with distributed builds.')
 add_tristate(arg_parser, name='dpdk', dest='dpdk', default=False,
                        help='Use dpdk (from seastar dpdk sources)')
 arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
@@ -928,7 +945,8 @@ scylla_core = (['message/messaging_service.cc',
                'utils/crypt_sha512.cc',
                'utils/logalloc.cc',
                'utils/large_bitset.cc',
-                'test/lib/limiting_data_source.cc',
+                'utils/buffer_input_stream.cc',
+                'utils/limiting_data_source.cc',
                'utils/updateable_value.cc',
                'message/dictionary_service.cc',
                'utils/directories.cc',
@@ -1016,9 +1034,6 @@ scylla_core = (['message/messaging_service.cc',
                'cql3/functions/aggregate_fcts.cc',
                'cql3/functions/castas_fcts.cc',
                'cql3/functions/error_injection_fcts.cc',
-                'cql3/statements/strong_consistency/modification_statement.cc',
-                'cql3/statements/strong_consistency/select_statement.cc',
-                'cql3/statements/strong_consistency/statement_helpers.cc',
                'cql3/functions/vector_similarity_fcts.cc',
                'cql3/statements/cf_prop_defs.cc',
                'cql3/statements/cf_statement.cc',
@@ -1044,8 +1059,8 @@ scylla_core = (['message/messaging_service.cc',
                'cql3/statements/raw/parsed_statement.cc',
                'cql3/statements/property_definitions.cc',
                'cql3/statements/update_statement.cc',
-                'cql3/statements/broadcast_modification_statement.cc',
-                'cql3/statements/broadcast_select_statement.cc',
+                'cql3/statements/strongly_consistent_modification_statement.cc',
+                'cql3/statements/strongly_consistent_select_statement.cc',
                'cql3/statements/delete_statement.cc',
                'cql3/statements/prune_materialized_view_statement.cc',
                'cql3/statements/batch_statement.cc',
@@ -1174,7 +1189,6 @@ scylla_core = (['message/messaging_service.cc',
                'utils/gz/crc_combine.cc',
                'utils/gz/crc_combine_table.cc',
                'utils/http.cc',
-                'utils/http_client_error_processing.cc',
                'utils/rest/client.cc',
                'utils/s3/aws_error.cc',
                'utils/s3/client.cc',
@@ -1192,7 +1206,6 @@ scylla_core = (['message/messaging_service.cc',
                'utils/azure/identity/default_credentials.cc',
                'utils/gcp/gcp_credentials.cc',
                'utils/gcp/object_storage.cc',
-                'utils/gcp/object_storage_retry_strategy.cc',
                'gms/version_generator.cc',
                'gms/versioned_value.cc',
                'gms/gossiper.cc',
@@ -1277,6 +1290,7 @@ scylla_core = (['message/messaging_service.cc',
                'auth/passwords.cc',
                'auth/password_authenticator.cc',
                'auth/permission.cc',
+                'auth/permissions_cache.cc',
                'auth/service.cc',
                'auth/standard_role_manager.cc',
                'auth/ldap_role_manager.cc',
@@ -1337,9 +1351,6 @@ scylla_core = (['message/messaging_service.cc',
                'lang/wasm.cc',
                'lang/wasm_alien_thread_runner.cc',
                'lang/wasm_instance_cache.cc',
-                'service/strong_consistency/groups_manager.cc',
-                'service/strong_consistency/coordinator.cc',
-                'service/strong_consistency/state_machine.cc',
                'service/raft/group0_state_id_handler.cc',
                'service/raft/group0_state_machine.cc',
                'service/raft/group0_state_machine_merger.cc',
@@ -1361,6 +1372,7 @@ scylla_core = (['message/messaging_service.cc',
                'service/topology_state_machine.cc',
                'service/topology_mutation.cc',
                'service/topology_coordinator.cc',
+                'node_ops/node_ops_ctl.cc',
                'node_ops/task_manager_module.cc',
                'reader_concurrency_semaphore_group.cc',
                'utils/disk_space_monitor.cc',
@@ -1368,7 +1380,6 @@ scylla_core = (['message/messaging_service.cc',
                'vector_search/dns.cc',
                'vector_search/client.cc',
                'vector_search/clients.cc',
-                'vector_search/filter.cc',
                'vector_search/truststore.cc'
                ] + [Antlr3Grammar('cql3/Cql.g')] \
                  + scylla_raft_core
@@ -1478,7 +1489,6 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/hinted_handoff.idl.hh',
        'idl/storage_proxy.idl.hh',
        'idl/sstables.idl.hh',
-        'idl/strong_consistency/state_machine.idl.hh',
        'idl/group0_state_machine.idl.hh',
        'idl/mapreduce_request.idl.hh',
        'idl/replica_exception.idl.hh',
@@ -1537,7 +1547,6 @@ scylla_perfs = ['test/perf/perf_alternator.cc',
                'test/perf/perf_fast_forward.cc',
                'test/perf/perf_row_cache_update.cc',
                'test/perf/perf_simple_query.cc',
-                'test/perf/perf_cql_raw.cc',
                'test/perf/perf_sstable.cc',
                'test/perf/perf_tablets.cc',
                'test/perf/tablet_load_balancing.cc',
@@ -1645,7 +1654,6 @@ for t in sorted(perf_tests):

 deps['test/boost/combined_tests'] += [
    'test/boost/aggregate_fcts_test.cc',
-    'test/boost/auth_cache_test.cc',
    'test/boost/auth_test.cc',
    'test/boost/batchlog_manager_test.cc',
    'test/boost/cache_algorithm_test.cc',
@@ -1776,8 +1784,6 @@ deps['test/raft/discovery_test'] =  ['test/raft/discovery_test.cc',
 deps['test/vector_search/vector_store_client_test'] =  ['test/vector_search/vector_store_client_test.cc'] + scylla_tests_dependencies
 deps['test/vector_search/load_balancer_test'] = ['test/vector_search/load_balancer_test.cc'] + scylla_tests_dependencies
 deps['test/vector_search/client_test'] = ['test/vector_search/client_test.cc'] + scylla_tests_dependencies
-deps['test/vector_search/filter_test'] = ['test/vector_search/filter_test.cc'] + scylla_tests_dependencies
-deps['test/vector_search/rescoring_test'] = ['test/vector_search/rescoring_test.cc'] + scylla_tests_dependencies

 boost_tests_prefixes = ["test/boost/", "test/vector_search/", "test/raft/", "test/manual/", "test/ldap/"]

@@ -2387,7 +2393,7 @@ def write_build_file(f,
    # If compiler cache is available, prefix the compiler with it
    cxx_with_cache = f'{compiler_cache} {args.cxx}' if compiler_cache else args.cxx
    # For Rust, sccache is used via RUSTC_WRAPPER environment variable
-    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache and args.sccache_rust else ''
+    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache else ''
    f.write(textwrap.dedent('''\
        configure_args = {configure_args}
        builddir = {outdir}
@@ -2564,10 +2570,11 @@ def write_build_file(f,
              description = RUST_LIB $out
            ''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
        f.write(
-            'build {mode}-build: phony {artifacts} {wasms}\n'.format(
+            'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
                mode=mode,
-                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
+                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
                wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
+                vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
            )
        )
        if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2597,7 +2604,7 @@ def write_build_file(f,
                continue
            profile_dep = modes[mode].get('profile_target', "")

-            if binary in other or binary in wasms:
+            if binary in other or binary in wasms or binary in vector_search_validator_bins:
                continue
            srcs = deps[binary]
            # 'scylla'
@@ -2708,10 +2715,11 @@ def write_build_file(f,
        )

        f.write(
-            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
+            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
                mode=mode,
                test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
                wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
+                vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
            )
        )
        f.write(
@@ -2879,6 +2887,19 @@ def write_build_file(f,
            'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
    )

+    f.write(textwrap.dedent(f'''\
+        rule build-vector-search-validator
+            command = test/vector_search_validator/build-validator $builddir
+        rule build-vector-store
+            command = test/vector_search_validator/build-vector-store $builddir
+        '''))
+    f.write(
+            'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
+    )
+    f.write(
+            'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
+    )
+
    f.write(textwrap.dedent(f'''\
        build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
        build dist-unified: phony dist-unified-tar
@@ -3116,7 +3137,7 @@ def configure_using_cmake(args):
        settings['CMAKE_CXX_COMPILER_LAUNCHER'] = compiler_cache
        settings['CMAKE_C_COMPILER_LAUNCHER'] = compiler_cache
        # For Rust, sccache is used via RUSTC_WRAPPER
-        if 'sccache' in compiler_cache and args.sccache_rust:
+        if 'sccache' in compiler_cache:
            settings['Scylla_RUSTC_WRAPPER'] = compiler_cache

    if args.date_stamp:
--- a/cql3/CMakeLists.txt
+++ b/cql3/CMakeLists.txt
@@ -47,9 +47,6 @@ target_sources(cql3
    functions/aggregate_fcts.cc
    functions/castas_fcts.cc
    functions/error_injection_fcts.cc
-    statements/strong_consistency/select_statement.cc
-    statements/strong_consistency/modification_statement.cc
-    statements/strong_consistency/statement_helpers.cc
    functions/vector_similarity_fcts.cc
    statements/cf_prop_defs.cc
    statements/cf_statement.cc
@@ -75,8 +72,8 @@ target_sources(cql3
    statements/raw/parsed_statement.cc
    statements/property_definitions.cc
    statements/update_statement.cc
-    statements/broadcast_modification_statement.cc
-    statements/broadcast_select_statement.cc
+    statements/strongly_consistent_modification_statement.cc
+    statements/strongly_consistent_select_statement.cc
    statements/delete_statement.cc
    statements/prune_materialized_view_statement.cc
    statements/batch_statement.cc
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -389,10 +389,8 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
        bool is_ann_ordering = false;
    }
    : K_SELECT (
-                ( (K_JSON K_DISTINCT)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
-                | (K_JSON selectClause K_FROM)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
-                )?
-                ( (K_DISTINCT selectClause K_FROM)=> K_DISTINCT { is_distinct = true; } )?
+                ( K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; } )?
+                ( K_DISTINCT { is_distinct = true; } )?
                sclause=selectClause
               )
      K_FROM (
@@ -427,13 +425,13 @@ selector returns [shared_ptr<raw_selector> s]

 unaliasedSelector returns [uexpression tmp]
    :  ( c=cident                                  { tmp = unresolved_identifier{std::move(c)}; }
-       | v=value                                   { tmp = std::move(v); }
       | K_COUNT '(' countArgument ')'             { tmp = make_count_rows_function_expression(); }
       | K_WRITETIME '(' c=cident ')'              { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::writetime,
                                                                                              unresolved_identifier{std::move(c)}}; }
       | K_TTL       '(' c=cident ')'              { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::ttl,
                                                                                              unresolved_identifier{std::move(c)}}; }
       | f=functionName args=selectionFunctionArgs { tmp = function_call{std::move(f), std::move(args)}; }
+       | f=similarityFunctionName args=vectorSimilarityArgs            { tmp = function_call{std::move(f), std::move(args)}; }
       | K_CAST      '(' arg=unaliasedSelector K_AS t=native_type ')'  { tmp = cast{.style = cast::cast_style::sql, .arg = std::move(arg), .type = std::move(t)}; }
       )
       ( '.' fi=cident { tmp = field_selection{std::move(tmp), std::move(fi)}; }
@@ -448,9 +446,23 @@ selectionFunctionArgs returns [std::vector<expression> a]
      ')'
    ;

+vectorSimilarityArgs returns [std::vector<expression> a]
+    : '(' ')'
+    | '(' v1=vectorSimilarityArg { a.push_back(std::move(v1)); }
+          ( ',' vn=vectorSimilarityArg { a.push_back(std::move(vn)); } )*
+      ')'
+    ;
+
+vectorSimilarityArg returns [uexpression a]
+    : s=unaliasedSelector { a = std::move(s); }
+    | v=value             { a = std::move(v); }
+    ;
+
 countArgument
    : '*'
-    /* COUNT(1) is also allowed, it is recognized via the general function(args) path */
+    | i=INTEGER { if (i->getText() != "1") {
+                    add_recognition_error("Only COUNT(1) is supported, got COUNT(" + i->getText() + ")");
+                } }
    ;

 whereClause returns [uexpression clause]
@@ -874,8 +886,8 @@ cfamDefinition[cql3::statements::create_table_statement::raw_statement& expr]
    ;

 cfamColumns[cql3::statements::create_table_statement::raw_statement& expr]
-    @init { bool is_static=false, is_ttl=false; }
-    : k=ident v=comparatorType (K_TTL {is_ttl = true;})? (K_STATIC {is_static = true;})? { $expr.add_definition(k, v, is_static, is_ttl); }
+    @init { bool is_static=false; }
+    : k=ident v=comparatorType (K_STATIC {is_static = true;})? { $expr.add_definition(k, v, is_static); }
        (K_PRIMARY K_KEY { $expr.add_key_aliases(std::vector<shared_ptr<cql3::column_identifier>>{k}); })?
    | K_PRIMARY K_KEY '(' pkDef[expr] (',' c=ident { $expr.add_column_alias(c); } )* ')'
    ;
@@ -1042,7 +1054,6 @@ alterTableStatement returns [std::unique_ptr<alter_table_statement::raw_statemen
        std::vector<alter_table_statement::column_change> column_changes;
        std::vector<std::pair<shared_ptr<cql3::column_identifier::raw>, shared_ptr<cql3::column_identifier::raw>>> renames;
        auto attrs = std::make_unique<cql3::attributes::raw>();
-        shared_ptr<cql3::column_identifier::raw> ttl_change;
    }
    : K_ALTER K_COLUMNFAMILY cf=columnFamilyName
          ( K_ALTER id=cident K_TYPE v=comparatorType { type = alter_table_statement::type::alter; column_changes.emplace_back(alter_table_statement::column_change{id, v}); }
@@ -1061,11 +1072,9 @@ alterTableStatement returns [std::unique_ptr<alter_table_statement::raw_statemen
          | K_RENAME                                  { type = alter_table_statement::type::rename; }
               id1=cident K_TO toId1=cident { renames.emplace_back(id1, toId1); }
               ( K_AND idn=cident K_TO toIdn=cident { renames.emplace_back(idn, toIdn); } )*
-          | K_TTL                                     { type = alter_table_statement::type::ttl; }
-               ( id=cident { ttl_change = id; } | K_NULL )
          )
    {
-        $expr = std::make_unique<alter_table_statement::raw_statement>(std::move(cf), type, std::move(column_changes), std::move(props), std::move(renames), std::move(attrs), std::move(ttl_change));
+        $expr = std::make_unique<alter_table_statement::raw_statement>(std::move(cf), type, std::move(column_changes), std::move(props), std::move(renames), std::move(attrs));
    }
    ;

@@ -1697,6 +1706,10 @@ functionName returns [cql3::functions::function_name s]
    : (ks=keyspaceName '.')? f=allowedFunctionName   { $s.keyspace = std::move(ks); $s.name = std::move(f); }
    ;

+similarityFunctionName returns [cql3::functions::function_name s]
+    : f=allowedSimilarityFunctionName { $s = cql3::functions::function_name::native_function(std::move(f)); }
+    ;
+
 allowedFunctionName returns [sstring s]
    : f=IDENT                       { $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
    | f=QUOTED_NAME                 { $s = $f.text; }
@@ -1705,6 +1718,11 @@ allowedFunctionName returns [sstring s]
    | K_COUNT                       { $s = "count"; }
    ;

+allowedSimilarityFunctionName returns [sstring s]
+    : f=(K_SIMILARITY_COSINE | K_SIMILARITY_EUCLIDEAN | K_SIMILARITY_DOT_PRODUCT)
+      { $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
+    ;
+
 functionArgs returns [std::vector<expression> a]
    : '(' ')'
    | '(' t1=term { a.push_back(std::move(t1)); }
@@ -2074,21 +2092,7 @@ vector_type returns [shared_ptr<cql3::cql3_type::raw> pt]
        {
            if ($d.text[0] == '-')
                throw exceptions::invalid_request_exception("Vectors must have a dimension greater than 0");
-            unsigned long parsed_dimension;
-            try {
-                parsed_dimension = std::stoul($d.text);
-            } catch (const std::exception& e) {
-                throw exceptions::invalid_request_exception(format("Invalid vector dimension: {}", $d.text));
-            }
-            static_assert(sizeof(unsigned long) >= sizeof(vector_dimension_t));
-            if (parsed_dimension == 0) {
-                throw exceptions::invalid_request_exception("Vectors must have a dimension greater than 0");
-            }
-            if (parsed_dimension > cql3::cql3_type::MAX_VECTOR_DIMENSION) {
-                throw exceptions::invalid_request_exception(
-                        format("Vectors must have a dimension less than or equal to {}", cql3::cql3_type::MAX_VECTOR_DIMENSION));
-            }
-            $pt = cql3::cql3_type::raw::vector(t, static_cast<vector_dimension_t>(parsed_dimension));
+            $pt = cql3::cql3_type::raw::vector(t, std::stoul($d.text));
        }
    ;

@@ -2415,6 +2419,10 @@ K_MUTATION_FRAGMENTS:    M U T A T I O N '_' F R A G M E N T S;

 K_VECTOR_SEARCH_INDEXING: V E C T O R '_' S E A R C H '_' I N D E X I N G;

+K_SIMILARITY_EUCLIDEAN:     S I M I L A R I T Y '_' E U C L I D E A N;
+K_SIMILARITY_COSINE:        S I M I L A R I T Y '_' C O S I N E;
+K_SIMILARITY_DOT_PRODUCT:   S I M I L A R I T Y '_' D O T '_' P R O D U C T;
+
 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
 fragment B: ('b'|'B');
--- a/cql3/assignment_testable.hh
+++ b/cql3/assignment_testable.hh
@@ -27,7 +27,7 @@ public:

    struct vector_test_result {
        test_result result;
-        std::optional<vector_dimension_t> dimension_opt;
+        std::optional<size_t> dimension_opt;
    };

    static bool is_assignable(test_result tr) {
--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -307,14 +307,17 @@ public:

 class cql3_type::raw_vector : public raw {
    shared_ptr<raw> _type;
-    vector_dimension_t _dimension;
+    size_t _dimension;
+
+    // This limitation is acquired from the maximum number of dimensions in OpenSearch. 
+    static constexpr size_t MAX_VECTOR_DIMENSION = 16000;

    virtual sstring to_string() const override {
        return seastar::format("vector<{}, {}>", _type, _dimension);
    }

 public:
-    raw_vector(shared_ptr<raw> type, vector_dimension_t dimension)
+    raw_vector(shared_ptr<raw> type, size_t dimension)
            : _type(std::move(type)), _dimension(dimension) {
    }

@@ -414,7 +417,7 @@ cql3_type::raw::tuple(std::vector<shared_ptr<raw>> ts) {
 }

 shared_ptr<cql3_type::raw>
-cql3_type::raw::vector(shared_ptr<raw> t, vector_dimension_t dimension) {
+cql3_type::raw::vector(shared_ptr<raw> t, size_t dimension) {
    return ::make_shared<raw_vector>(std::move(t), dimension);
 }

--- a/cql3/cql3_type.hh
+++ b/cql3/cql3_type.hh
@@ -39,9 +39,6 @@ public:
    data_type get_type() const { return _type; }
    const sstring& to_string() const { return _type->cql3_type_name(); }

-    // This limitation is acquired from the maximum number of dimensions in OpenSearch.
-    static constexpr vector_dimension_t MAX_VECTOR_DIMENSION = 16000;
-
    // For UserTypes, we need to know the current keyspace to resolve the
    // actual type used, so Raw is a "not yet prepared" CQL3Type.
    class raw {
@@ -67,7 +64,7 @@ public:
        static shared_ptr<raw> list(shared_ptr<raw> t);
        static shared_ptr<raw> set(shared_ptr<raw> t);
        static shared_ptr<raw> tuple(std::vector<shared_ptr<raw>> ts);
-        static shared_ptr<raw> vector(shared_ptr<raw> t, vector_dimension_t dimension);
+        static shared_ptr<raw> vector(shared_ptr<raw> t, size_t dimension);
        static shared_ptr<raw> frozen(shared_ptr<raw> t);
        friend sstring format_as(const raw& r) {
            return r.to_string();
--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -10,7 +10,6 @@
 #include "expr-utils.hh"
 #include "evaluate.hh"
 #include "cql3/functions/functions.hh"
-#include "cql3/functions/aggregate_fcts.hh"
 #include "cql3/functions/castas_fcts.hh"
 #include "cql3/functions/scalar_function.hh"
 #include "cql3/column_identifier.hh"
@@ -502,8 +501,8 @@ vector_validate_assignable_to(const collection_constructor& c, data_dictionary::
        throw exceptions::invalid_request_exception(format("Invalid vector type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }

-    vector_dimension_t expected_size = vt->get_dimension();
-    if (expected_size == 0) {
+    size_t expected_size = vt->get_dimension();
+    if (!expected_size) {
        throw exceptions::invalid_request_exception(format("Invalid vector type literal for {}: type {} expects at least one element",
                                                            *receiver.name, receiver.type->as_cql3_type()));
    }
@@ -1048,47 +1047,8 @@ prepare_function_args_for_type_inference(std::span<const expression> args, data_
    return partially_prepared_args;
 }

-// Special case for count(1) - recognize it as the countRows() function. Note it is quite
-// artificial and we might relax it to the more general count(expression) later.
-static
-std::optional<expression>
-try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
-    return std::visit(overloaded_functor{
-        [&] (const functions::function_name& name) -> std::optional<expression> {
-            auto native_name = name;
-            if (!native_name.has_keyspace()) {
-                native_name = name.as_native_function();
-            }
-            // Collapse count(1) into countRows()
-            if (native_name == functions::function_name::native_function("count")) {
-                if (fc.args.size() == 1) {
-                    if (auto uc_arg = expr::as_if<expr::untyped_constant>(&fc.args[0])) {
-                        if (uc_arg->partial_type == expr::untyped_constant::type_class::integer
-                                && uc_arg->raw_text == "1") {
-                            return expr::function_call{
-                                .func = functions::aggregate_fcts::make_count_rows_function(),
-                                .args = {},
-                            };
-                        } else {
-                            throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument", fc.args[0]));
-                        }
-                    }
-                }
-            }
-            return std::nullopt;
-        },
-        [] (const shared_ptr<functions::function>&) -> std::optional<expression> {
-            // Already prepared, nothing to do
-            return std::nullopt;
-        },
-    }, fc.func);
-}
-
 std::optional<expression>
 prepare_function_call(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
-    if (auto prepared = try_prepare_count_rows(fc, db, keyspace, schema_opt, receiver)) {
-        return prepared;
-    }
    // Try to extract a column family name from the available information.
    // Most functions can be prepared without information about the column family, usually just the keyspace is enough.
    // One exception is the token() function - in order to prepare system.token() we have to know the partition key of the table,
--- a/cql3/functions/vector_similarity_fcts.cc
+++ b/cql3/functions/vector_similarity_fcts.cc
@@ -10,41 +10,9 @@
 #include "types/types.hh"
 #include "types/vector.hh"
 #include "exceptions/exceptions.hh"
-#include <span>
-#include <bit>

 namespace cql3 {
 namespace functions {
-
-namespace detail {
-
-std::vector<float> extract_float_vector(const bytes_opt& param, vector_dimension_t dimension) {
-    if (!param) {
-        throw exceptions::invalid_request_exception("Cannot extract float vector from null parameter");
-    }
-
-    const size_t expected_size = dimension * sizeof(float);
-    if (param->size() != expected_size) {
-        throw exceptions::invalid_request_exception(
-            fmt::format("Invalid vector size: expected {} bytes for {} floats, got {} bytes",
-                       expected_size, dimension, param->size()));
-    }
-
-    std::vector<float> result;
-    result.reserve(dimension);
-
-    bytes_view view(*param);
-    for (size_t i = 0; i < dimension; ++i) {
-        // read_simple handles network byte order (big-endian) conversion
-        uint32_t raw = read_simple<uint32_t>(view);
-        result.push_back(std::bit_cast<float>(raw));
-    }
-
-    return result;
-}
-
-} // namespace detail
-
 namespace {

 // The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
@@ -54,14 +22,14 @@ namespace {

 // You should only use this function if you need to preserve the original vectors and cannot normalize
 // them in advance.
-float compute_cosine_similarity(std::span<const float> v1, std::span<const float> v2) {
+float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
    double dot_product = 0.0;
    double squared_norm_a = 0.0;
    double squared_norm_b = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        double a = value_cast<float>(v1[i]);
+        double b = value_cast<float>(v2[i]);

        dot_product += a * b;
        squared_norm_a += a * a;
@@ -69,7 +37,7 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
    }

    if (squared_norm_a == 0 || squared_norm_b == 0) {
-        return std::numeric_limits<float>::quiet_NaN();
+        throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
    }

    // The cosine similarity is in the range [-1, 1].
@@ -78,12 +46,12 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
    return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
 }

-float compute_euclidean_similarity(std::span<const float> v1, std::span<const float> v2) {
+float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
    double sum = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        double a = value_cast<float>(v1[i]);
+        double b = value_cast<float>(v2[i]);

        double diff = a - b;
        sum += diff * diff;
@@ -97,12 +65,12 @@ float compute_euclidean_similarity(std::span<const float> v1, std::span<const fl

 // Assumes that both vectors are L2-normalized.
 // This similarity is intended as an optimized way to perform cosine similarity calculation.
-float compute_dot_product_similarity(std::span<const float> v1, std::span<const float> v2) {
+float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
    double dot_product = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        double a = value_cast<float>(v1[i]);
+        double b = value_cast<float>(v2[i]);
        dot_product += a * b;
    }

@@ -156,7 +124,7 @@ std::vector<data_type> retrieve_vector_arg_types(const function_name& name, cons
        }
    }

-    vector_dimension_t dimension = first_dim_opt ? *first_dim_opt : *second_dim_opt;
+    size_t dimension = first_dim_opt ? *first_dim_opt : *second_dim_opt;
    auto type = vector_type_impl::get_instance(float_type, dimension);
    return {type, type};
 }
@@ -168,15 +136,13 @@ bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters)
        return std::nullopt;
    }

-    // Extract dimension from the vector type
-    const auto& type = static_cast<const vector_type_impl&>(*arg_types()[0]);
-    vector_dimension_t dimension = type.get_dimension();
+    const auto& type = arg_types()[0];
+    data_value v1 = type->deserialize(*parameters[0]);
+    data_value v2 = type->deserialize(*parameters[1]);
+    const auto& v1_elements = value_cast<std::vector<data_value>>(v1);
+    const auto& v2_elements = value_cast<std::vector<data_value>>(v2);

-    // Optimized path: extract floats directly from bytes, bypassing data_value overhead
-    std::vector<float> v1 = detail::extract_float_vector(parameters[0], dimension);
-    std::vector<float> v2 = detail::extract_float_vector(parameters[1], dimension);
-
-    float result = SIMILARITY_FUNCTIONS.at(_name)(v1, v2);
+    float result = SIMILARITY_FUNCTIONS.at(_name)(v1_elements, v2_elements);
    return float_type->decompose(result);
 }

--- a/cql3/functions/vector_similarity_fcts.hh
+++ b/cql3/functions/vector_similarity_fcts.hh
@@ -11,7 +11,6 @@
 #include "native_scalar_function.hh"
 #include "cql3/assignment_testable.hh"
 #include "cql3/functions/function_name.hh"
-#include <span>

 namespace cql3 {
 namespace functions {
@@ -20,7 +19,7 @@ static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::nati
 static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
 static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");

-using similarity_function_t = float (*)(std::span<const float>, std::span<const float>);
+using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
 extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;

 std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
@@ -34,14 +33,5 @@ public:
    virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
 };

-namespace detail {
-
-// Extract float vector directly from serialized bytes, bypassing data_value overhead.
-// This is an internal API exposed for testing purposes.
-// Vector<float, N> wire format: N floats as big-endian uint32_t values, 4 bytes each.
-std::vector<float> extract_float_vector(const bytes_opt& param, vector_dimension_t dimension);
-
-} // namespace detail
-
 } // namespace functions
 } // namespace cql3
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -48,10 +48,8 @@ const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono

 struct query_processor::remote {
    remote(service::migration_manager& mm, service::mapreduce_service& fwd,
-           service::storage_service& ss, service::raft_group0_client& group0_client,
-           service::strong_consistency::coordinator& _sc_coordinator)
+           service::storage_service& ss, service::raft_group0_client& group0_client)
            : mm(mm), mapreducer(fwd), ss(ss), group0_client(group0_client)
-            , sc_coordinator(_sc_coordinator)
            , gate("query_processor::remote")
    {}

@@ -59,7 +57,6 @@ struct query_processor::remote {
    service::mapreduce_service& mapreducer;
    service::storage_service& ss;
    service::raft_group0_client& group0_client;
-    service::strong_consistency::coordinator& sc_coordinator;

    seastar::named_gate gate;
 };
@@ -517,16 +514,9 @@ query_processor::~query_processor() {
    }
 }

-std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder>
-query_processor::acquire_strongly_consistent_coordinator() {
-    auto [remote_, holder] = remote();
-    return {remote_.get().sc_coordinator, std::move(holder)};
-}
-
 void query_processor::start_remote(service::migration_manager& mm, service::mapreduce_service& mapreducer,
-                                   service::storage_service& ss, service::raft_group0_client& group0_client,
-                                   service::strong_consistency::coordinator& sc_coordinator) {
-    _remote = std::make_unique<struct remote>(mm, mapreducer, ss, group0_client, sc_coordinator);
+                                   service::storage_service& ss, service::raft_group0_client& group0_client) {
+    _remote = std::make_unique<struct remote>(mm, mapreducer, ss, group0_client);
 }

 future<> query_processor::stop_remote() {
@@ -870,7 +860,6 @@ struct internal_query_state {
    sstring query_string;
    std::unique_ptr<query_options> opts;
    statements::prepared_statement::checked_weak_ptr p;
-    std::optional<service::query_state> qs;
    bool more_results = true;
 };

@@ -878,14 +867,10 @@ internal_query_state query_processor::create_paged_state(
        const sstring& query_string,
        db::consistency_level cl,
        const data_value_list& values,
-        int32_t page_size,
-        std::optional<service::query_state> qs) {
+        int32_t page_size) {
    auto p = prepare_internal(query_string);
    auto opts = make_internal_options(p, values, cl, page_size);
-    if (!qs) {
-        qs.emplace(query_state_for_internal_call());
-    }
-    return internal_query_state{query_string, std::make_unique<cql3::query_options>(std::move(opts)), std::move(p), std::move(qs), true};
+    return internal_query_state{query_string, std::make_unique<cql3::query_options>(std::move(opts)), std::move(p), true};
 }

 bool query_processor::has_more_results(cql3::internal_query_state& state) const {
@@ -908,8 +893,9 @@ future<> query_processor::for_each_cql_result(
 future<::shared_ptr<untyped_result_set>>
 query_processor::execute_paged_internal(internal_query_state& state) {
    state.p->statement->validate(*this, service::client_state::for_internal_calls());
+    auto qs = query_state_for_internal_call();
    ::shared_ptr<cql_transport::messages::result_message> msg =
-      co_await state.p->statement->execute(*this, *state.qs, *state.opts, std::nullopt);
+      co_await state.p->statement->execute(*this, qs, *state.opts, std::nullopt);

    class visitor : public result_message::visitor_base {
        internal_query_state& _state;
@@ -1216,9 +1202,8 @@ future<> query_processor::query_internal(
        db::consistency_level cl,
        const data_value_list& values,
        int32_t page_size,
-        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f,
-        std::optional<service::query_state> qs) {
-    auto query_state = create_paged_state(query_string, cl, values, page_size, std::move(qs));
+        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
+    auto query_state = create_paged_state(query_string, cl, values, page_size);
    co_return co_await for_each_cql_result(query_state, std::move(f));
 }

--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -44,10 +44,6 @@ class query_state;
 class mapreduce_service;
 class raft_group0_client;

-namespace strong_consistency {
-class coordinator;
-}
-
 namespace broadcast_tables {
 struct query;
 }
@@ -159,8 +155,7 @@ public:
    ~query_processor();

    void start_remote(service::migration_manager&, service::mapreduce_service&,
-                      service::storage_service& ss, service::raft_group0_client&,
-                      service::strong_consistency::coordinator&);
+                      service::storage_service& ss, service::raft_group0_client&);
    future<> stop_remote();

    data_dictionary::database db() {
@@ -179,9 +174,6 @@ public:
        return _proxy;
    }

-    std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder>
-    acquire_strongly_consistent_coordinator();
-
    cql_stats& get_cql_stats() {
        return _cql_stats;
    }
@@ -330,7 +322,6 @@ public:
     * page_size - maximum page size
     * f - a function to be run on each row of the query result,
     *     if the function returns stop_iteration::yes the iteration will stop
-     * qs - optional query state (default: std::nullopt)
     *
     * \note This function is optimized for convenience, not performance.
     */
@@ -339,8 +330,7 @@ public:
            db::consistency_level cl,
            const data_value_list& values,
            int32_t page_size,
-            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f,
-            std::optional<service::query_state> qs = std::nullopt);
+            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f);

    /*
     * \brief iterate over all cql results using paging
@@ -509,8 +499,7 @@ private:
            const sstring& query_string,
            db::consistency_level,
            const data_value_list& values,
-            int32_t page_size,
-            std::optional<service::query_state> qs = std::nullopt);
+            int32_t page_size);

    /*!
     * \brief run a query using paging
--- a/cql3/query_result_printer.hh
+++ b/cql3/query_result_printer.hh
@@ -1,20 +0,0 @@
-/*
- * Copyright 2025-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#pragma once
-
-#include <ostream>
-
-namespace cql3 {
-
-class result;
-
-void print_query_results_text(std::ostream& os, const result& result);
-void print_query_results_json(std::ostream& os, const result& result);
-
-} // namespace cql3
--- a/cql3/result_set.cc
+++ b/cql3/result_set.cc
@@ -9,10 +9,8 @@
 */

 #include <cstdint>
-#include "types/json_utils.hh"
 #include "utils/assert.hh"
 #include "utils/hashers.hh"
-#include "utils/rjson.hh"
 #include "cql3/result_set.hh"

 namespace cql3 {
@@ -48,13 +46,6 @@ void metadata::add_non_serialized_column(lw_shared_ptr<column_specification> nam
    _column_info->_names.emplace_back(std::move(name));
 }

-void metadata::hide_last_column() {
-    if (_column_info->_column_count == 0) {
-        utils::on_internal_error("Trying to hide a column when there are no columns visible.");
-    }
-    _column_info->_column_count--;
-}
-
 void metadata::set_paging_state(lw_shared_ptr<const service::pager::paging_state> paging_state) {
    _flags.set<flag::HAS_MORE_PAGES>();
    _paging_state = std::move(paging_state);
@@ -197,85 +188,4 @@ make_empty_metadata() {
    return empty_metadata_cache;
 }

-void print_query_results_text(std::ostream& os, const cql3::result& result) {
-    const auto& metadata = result.get_metadata();
-    const auto& column_metadata = metadata.get_names();
-
-    struct column_values {
-        size_t max_size{0};
-        sstring header_format;
-        sstring row_format;
-        std::vector<sstring> values;
-
-        void add(sstring value) {
-            max_size = std::max(max_size, value.size());
-            values.push_back(std::move(value));
-        }
-    };
-
-    std::vector<column_values> columns;
-    columns.resize(column_metadata.size());
-
-    for (size_t i = 0; i < column_metadata.size(); ++i) {
-        columns[i].add(column_metadata[i]->name->text());
-    }
-
-    for (const auto& row : result.result_set().rows()) {
-        for (size_t i = 0; i < row.size(); ++i) {
-            if (row[i]) {
-                columns[i].add(column_metadata[i]->type->to_string(linearized(managed_bytes_view(*row[i]))));
-            } else {
-                columns[i].add("");
-            }
-        }
-    }
-
-    std::vector<sstring> separators(columns.size(), sstring());
-    for (size_t i = 0; i < columns.size(); ++i) {
-        auto& col_values = columns[i];
-        col_values.header_format = seastar::format(" {{:<{}}} ", col_values.max_size);
-        col_values.row_format = seastar::format(" {{:>{}}} ", col_values.max_size);
-        for (size_t c = 0; c < col_values.max_size; ++c) {
-            separators[i] += "-";
-        }
-    }
-
-    for (size_t r = 0; r < result.result_set().rows().size() + 1; ++r) {
-        std::vector<sstring> row;
-        row.reserve(columns.size());
-        for (size_t i = 0; i < columns.size(); ++i) {
-            const auto& format = r == 0 ? columns[i].header_format : columns[i].row_format;
-            row.push_back(fmt::format(fmt::runtime(std::string_view(format)), columns[i].values[r]));
-        }
-        fmt::print(os, "{}\n", fmt::join(row, "|"));
-        if (!r) {
-            fmt::print(os, "-{}-\n", fmt::join(separators, "-+-"));
-        }
-    }
-}
-
-void print_query_results_json(std::ostream& os, const cql3::result& result) {
-    const auto& metadata = result.get_metadata();
-    const auto& column_metadata = metadata.get_names();
-
-    rjson::streaming_writer writer(os);
-
-    writer.StartArray();
-    for (const auto& row : result.result_set().rows()) {
-        writer.StartObject();
-        for (size_t i = 0; i < row.size(); ++i) {
-            writer.Key(column_metadata[i]->name->text());
-            if (!row[i] || row[i]->empty()) {
-                writer.Null();
-                continue;
-            }
-            const auto value = to_json_string(*column_metadata[i]->type, *row[i]);
-            const auto type = to_json_type(*column_metadata[i]->type, *row[i]);
-            writer.RawValue(value, type);
-        }
-        writer.EndObject();
-    }
-    writer.EndArray();
-}
-
 }
--- a/cql3/result_set.hh
+++ b/cql3/result_set.hh
@@ -73,7 +73,6 @@ public:
    uint32_t value_count() const;

    void add_non_serialized_column(lw_shared_ptr<column_specification> name);
-    void hide_last_column();

 public:
    void set_paging_state(lw_shared_ptr<const service::pager::paging_state> paging_state);
--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -225,9 +225,10 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
            //    The second hyphen is not really true because currently topological changes can
            //    disturb it (see scylladb/scylladb#23345), but we ignore that.
            locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
-        } catch (const std::invalid_argument& e) {
+        } catch (const std::exception& e) {
            if (replica::database::enforce_rf_rack_validity_for_keyspace(qp.db().get_config(), *ks_md)) {
-                // wrap the exception manually here in a type that can be passed to the user.
+                // There's no guarantee what the type of the exception will be, so we need to
+                // wrap it manually here in a type that can be passed to the user.
                throw exceptions::invalid_request_exception(e.what());
            } else {
                // Even when RF-rack-validity is not enforced for the keyspace, we'd
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -10,7 +10,6 @@

 #include "cdc/log.hh"
 #include "index/vector_index.hh"
-#include "types/types.hh"
 #include "utils/assert.hh"
 #include <seastar/core/coroutine.hh>
 #include "cql3/query_options.hh"
@@ -31,9 +30,6 @@
 #include "cql3/query_processor.hh"
 #include "cdc/cdc_extension.hh"
 #include "cdc/cdc_partitioner.hh"
-#include "db/tags/extension.hh"
-#include "db/tags/utils.hh"
-#include "alternator/ttl_tag.hh"

 namespace cql3 {

@@ -47,8 +43,7 @@ alter_table_statement::alter_table_statement(uint32_t bound_terms,
                                             std::vector<column_change> column_changes,
                                             std::optional<cf_prop_defs> properties,
                                             renames_type renames,
-                                             std::unique_ptr<attributes> attrs,
-                                             shared_ptr<column_identifier::raw> ttl_change)
+                                             std::unique_ptr<attributes> attrs)
    : schema_altering_statement(std::move(name))
    , _bound_terms(bound_terms)
    , _type(t)
@@ -56,7 +51,6 @@ alter_table_statement::alter_table_statement(uint32_t bound_terms,
    , _properties(std::move(properties))
    , _renames(std::move(renames))
    , _attrs(std::move(attrs))
-    , _ttl_change(std::move(ttl_change))
 {
 }

@@ -386,21 +380,6 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
            throw exceptions::invalid_request_exception("Cannot drop columns from a non-CQL3 table");
        }
        invoke_column_change_fn(std::mem_fn(&alter_table_statement::drop_column));
-
-        // If we dropped the column used for per-row TTL, we need to remove the tag.
-        if (std::optional<std::string> ttl_column = db::find_tag(*s, TTL_TAG_KEY)) {
-            for (auto& [raw_name, raw_validator, is_static] : _column_changes) {
-                if (*ttl_column == raw_name->text()) {
-                    const std::map<sstring, sstring>* tags_ptr = db::get_tags_of_table(s);
-                    if (tags_ptr) {
-                        std::map<sstring, sstring> tags_map = *tags_ptr;
-                        tags_map.erase(TTL_TAG_KEY);
-                        cfm.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(std::move(tags_map)));
-                    }
-                    break;
-                }
-            }
-        }
        break;

    case alter_table_statement::type::opts:
@@ -455,7 +434,6 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
        break;

    case alter_table_statement::type::rename:
-    {
        for (auto&& entry : _renames) {
            auto from = entry.first->prepare_column_identifier(*s);
            auto to = entry.second->prepare_column_identifier(*s);
@@ -492,53 +470,6 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
        }
        return make_pair(std::move(new_base_schema), std::move(view_updates));
    }
-    case alter_table_statement::type::ttl:
-        if (!db.features().cql_row_ttl) {
-            throw exceptions::invalid_request_exception("The CQL per-row TTL feature is not yet supported by this cluster. Upgrade all nodes to use it.");
-        }
-        if (_ttl_change) {
-            // Enable per-row TTL with chosen column for expiration time
-            const column_definition *cdef = 
-                s->get_column_definition(to_bytes(_ttl_change->text()));
-            if (!cdef) {
-                throw exceptions::invalid_request_exception(fmt::format("Column '{}' does not exist in table {}.{}", _ttl_change->text(), keyspace(), column_family()));
-            }
-            if (cdef->type != timestamp_type && cdef->type != long_type && cdef->type != int32_type) {
-                throw exceptions::invalid_request_exception(fmt::format("TTL column {} must be of type timestamp, bigint or int, can't be {}", _ttl_change->text(), cdef->type->as_cql3_type().to_string()));
-            }
-            if (cdef->is_primary_key()) {
-                throw exceptions::invalid_request_exception(fmt::format("Cannot use a primary key column {} as a TTL column", _ttl_change->text()));
-            }
-            if (cdef->is_static()) {
-                throw exceptions::invalid_request_exception(fmt::format("Cannot use a static column {} as a TTL column", _ttl_change->text()));
-            }
-            std::optional<std::string> old_ttl_column = db::find_tag(*s, TTL_TAG_KEY);
-            if (old_ttl_column) {
-                throw exceptions::invalid_request_exception(fmt::format("Cannot set TTL column, table {}.{} already has a TTL column defined: {}", keyspace(), column_family(), *old_ttl_column));
-            }
-            const std::map<sstring, sstring>* old_tags_ptr = db::get_tags_of_table(s);
-            std::map<sstring, sstring> tags_map;
-            if (old_tags_ptr) {
-                // tags_ptr is a constant pointer to schema data. To modify
-                // it, we must make a copy.
-                tags_map = *old_tags_ptr;
-            }
-            tags_map[TTL_TAG_KEY] = _ttl_change->text();
-            cfm.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(std::move(tags_map)));
-        } else {
-            // Disable per-row TTL
-            const std::map<sstring, sstring>* tags_ptr = db::get_tags_of_table(s);
-            if (!tags_ptr || tags_ptr->find(TTL_TAG_KEY) == tags_ptr->end()) {
-                throw exceptions::invalid_request_exception(fmt::format("Cannot unset TTL column, table {}.{} does not have a TTL column set", keyspace(), column_family()));
-            }
-            // tags_ptr is a constant pointer to schema data. To modify it, we
-            // must make a copy.
-            std::map<sstring, sstring> tags_map = *tags_ptr;
-            tags_map.erase(TTL_TAG_KEY);
-            cfm.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(std::move(tags_map)));
-        }
-        break;
-    }

    return make_pair(cfm.build(), std::move(view_updates));
 }
@@ -577,15 +508,13 @@ alter_table_statement::raw_statement::raw_statement(cf_name name,
                                                    std::vector<column_change> column_changes,
                                                    std::optional<cf_prop_defs> properties,
                                                    renames_type renames,
-                                                    std::unique_ptr<attributes::raw> attrs,
-                                                    shared_ptr<column_identifier::raw> ttl_change)
+                                                    std::unique_ptr<attributes::raw> attrs)
    : cf_statement(std::move(name))
    , _type(t)
    , _column_changes(std::move(column_changes))
    , _properties(std::move(properties))
    , _renames(std::move(renames))
    , _attrs(std::move(attrs))
-    , _ttl_change(std::move(ttl_change))
    {}

 std::unique_ptr<cql3::statements::prepared_statement>
@@ -610,8 +539,7 @@ alter_table_statement::raw_statement::prepare(data_dictionary::database db, cql_
                _column_changes,
                _properties,
                _renames,
-                std::move(prepared_attrs),
-                _ttl_change
+                std::move(prepared_attrs)
            ),
            ctx,
            // since alter table is `cql_statement_no_metadata` (it doesn't return any metadata when preparing)
--- a/cql3/statements/alter_table_statement.hh
+++ b/cql3/statements/alter_table_statement.hh
@@ -32,7 +32,6 @@ public:
        drop,
        opts,
        rename,
-        ttl,
    };
    using renames_type = std::vector<std::pair<shared_ptr<column_identifier::raw>,
                                               shared_ptr<column_identifier::raw>>>;
@@ -51,7 +50,6 @@ private:
    const std::optional<cf_prop_defs> _properties;
    const renames_type _renames;
    const std::unique_ptr<attributes> _attrs;
-    shared_ptr<column_identifier::raw> _ttl_change;
 public:
    alter_table_statement(uint32_t bound_terms,
                          cf_name name,
@@ -59,8 +57,7 @@ public:
                          std::vector<column_change> column_changes,
                          std::optional<cf_prop_defs> properties,
                          renames_type renames,
-                          std::unique_ptr<attributes> attrs,
-                          shared_ptr<column_identifier::raw> ttl_change);
+                          std::unique_ptr<attributes> attrs);

    virtual uint32_t get_bound_terms() const override;
    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
@@ -81,7 +78,6 @@ class alter_table_statement::raw_statement : public raw::cf_statement {
    const std::optional<cf_prop_defs> _properties;
    const alter_table_statement::renames_type _renames;
    const std::unique_ptr<attributes::raw> _attrs;
-    shared_ptr<column_identifier::raw> _ttl_change;

 public:
    raw_statement(cf_name name,
@@ -89,8 +85,7 @@ public:
                  std::vector<column_change> column_changes,
                  std::optional<cf_prop_defs> properties,
                  renames_type renames,
-                  std::unique_ptr<attributes::raw> attrs,
-                  shared_ptr<column_identifier::raw> ttl_change);
+                  std::unique_ptr<attributes::raw> attrs);
    
    virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;

--- a/cql3/statements/broadcast_modification_statement.cc
+++ b/cql3/statements/broadcast_modification_statement.cc
@@ -1,126 +0,0 @@
-/*
- * Copyright (C) 2022-present ScyllaDB
- *
- * Modified by ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
- */
-
-
-#include "cql3/statements/broadcast_modification_statement.hh"
-
-#include <optional>
-
-#include <seastar/core/future.hh>
-#include <seastar/util/variant_utils.hh>
-
-#include "bytes.hh"
-#include "cql3/attributes.hh"
-#include "cql3/expr/expression.hh"
-#include "cql3/expr/evaluate.hh"
-#include "cql3/query_processor.hh"
-#include "cql3/values.hh"
-#include "timeout_config.hh"
-#include "service/broadcast_tables/experimental/lang.hh"
-#include "db/system_keyspace.hh"
-
-namespace cql3 {
-
-static logging::logger logger("broadcast_modification_statement");
-
-namespace statements {
-
-broadcast_modification_statement::broadcast_modification_statement(
-    uint32_t bound_terms,
-    schema_ptr schema,
-    broadcast_tables::prepared_update query)
-    : cql_statement_opt_metadata{&timeout_config::write_timeout}
-    , _bound_terms{bound_terms}
-    , _schema{schema}
-    , _query{std::move(query)}
-{ }
-
-future<::shared_ptr<cql_transport::messages::result_message>>
-broadcast_modification_statement::execute(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
-    return execute_without_checking_exception_message(qp, qs, options, std::move(guard))
-            .then(cql_transport::messages::propagate_exception_as_future<shared_ptr<cql_transport::messages::result_message>>);
-}
-
-static
-service::broadcast_tables::update_query
-evaluate_prepared(
-    const broadcast_tables::prepared_update& query,
-    const query_options& options) {
-    return service::broadcast_tables::update_query{
-        .key = expr::evaluate(query.key, options).to_bytes(),
-        .new_value = expr::evaluate(query.new_value, options).to_bytes(),
-        .value_condition = query.value_condition
-            ? std::optional<bytes_opt>{expr::evaluate(*query.value_condition, options).to_bytes_opt()}
-            : std::nullopt
-    };
-}
-
-future<::shared_ptr<cql_transport::messages::result_message>>
-broadcast_modification_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
-    if (this_shard_id() != 0) {
-        co_return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(0, cql3::computed_function_values{});
-    }
-
-    auto result = co_await qp.execute_broadcast_table_query(
-        { evaluate_prepared(_query, options) }
-    );
-
-    co_return co_await std::visit(make_visitor(
-        [] (service::broadcast_tables::query_result_conditional_update& qr) -> future<::shared_ptr<cql_transport::messages::result_message>> {
-            auto result_set = std::make_unique<cql3::result_set>(std::vector{
-                make_lw_shared<cql3::column_specification>(
-                    db::system_keyspace::NAME,
-                    db::system_keyspace::BROADCAST_KV_STORE,
-                    ::make_shared<cql3::column_identifier>("[applied]", false),
-                    boolean_type
-                ),
-                make_lw_shared<cql3::column_specification>(
-                    db::system_keyspace::NAME,
-                    db::system_keyspace::BROADCAST_KV_STORE,
-                    ::make_shared<cql3::column_identifier>("value", true),
-                    utf8_type
-                )
-            });
-
-            result_set->add_row({ boolean_type->decompose(qr.is_applied), qr.previous_value });
-
-            return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>(
-                ::make_shared<cql_transport::messages::result_message::rows>(cql3::result{std::move(result_set)}));
-        },
-        [] (service::broadcast_tables::query_result_none&) -> future<::shared_ptr<cql_transport::messages::result_message>> {
-            return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>();
-        },
-        [] (service::broadcast_tables::query_result_select&) -> future<::shared_ptr<cql_transport::messages::result_message>> {
-            on_internal_error(logger, "incorrect query result ");
-        }
-    ), result);
-}
-
-uint32_t broadcast_modification_statement::get_bound_terms() const {
-    return _bound_terms;
-}
-
-future<> broadcast_modification_statement::check_access(query_processor& qp, const service::client_state& state) const {
-    auto f = state.has_column_family_access(_schema->ks_name(), _schema->cf_name(), auth::permission::MODIFY);
-    if (_query.value_condition.has_value()) {
-        f = f.then([this, &state] {
-           return state.has_column_family_access(_schema->ks_name(), _schema->cf_name(), auth::permission::SELECT);
-        });
-    }
-    return f;
-}
-
-bool broadcast_modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
-    return _schema->ks_name() == ks_name && (!cf_name || _schema->cf_name() == *cf_name);
-}
-
-}
-
-}
--- a/cql3/statements/broadcast_modification_statement.hh
+++ b/cql3/statements/broadcast_modification_statement.hh
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2022-present ScyllaDB
- *
- * Modified by ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
- */
-
-#pragma once
-
-#include "cql3/cql_statement.hh"
-#include "cql3/statements/modification_statement.hh"
-
-namespace cql3 {
-
-namespace statements {
-
-namespace broadcast_tables {
-
-struct prepared_update {
-    expr::expression key;
-    expr::expression new_value;
-    std::optional<expr::expression> value_condition;
-};
-
-}
-
-class broadcast_modification_statement : public cql_statement_opt_metadata {
-    const uint32_t _bound_terms;
-    const schema_ptr _schema;
-    const broadcast_tables::prepared_update _query;
-
-public:
-    broadcast_modification_statement(uint32_t bound_terms, schema_ptr schema, broadcast_tables::prepared_update query);
-
-    virtual future<::shared_ptr<cql_transport::messages::result_message>>
-    execute(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const override;
-
-    virtual future<::shared_ptr<cql_transport::messages::result_message>>
-    execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const override;
-
-    virtual uint32_t get_bound_terms() const override;
-
-    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
-
-    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
-};
-
-
-}
-
-}
--- a/cql3/statements/broadcast_select_statement.cc
+++ b/cql3/statements/broadcast_select_statement.cc
@@ -1,130 +0,0 @@
-/*
- * Copyright (C) 2022-present ScyllaDB
- *
- * Modified by ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
- */
-
-
-#include "cql3/statements/broadcast_select_statement.hh"
-
-#include <seastar/core/future.hh>
-#include <seastar/core/on_internal_error.hh>
-
-#include "cql3/restrictions/statement_restrictions.hh"
-#include "cql3/expr/evaluate.hh"
-#include "cql3/query_processor.hh"
-#include "service/broadcast_tables/experimental/lang.hh"
-#include "db/system_keyspace.hh"
-
-namespace cql3 {
-
-namespace statements {
-
-static logging::logger logger("broadcast_select_statement");
-
-static
-expr::expression get_key(const cql3::expr::expression& partition_key_restrictions) {
-    const auto* conjunction = cql3::expr::as_if<cql3::expr::conjunction>(&partition_key_restrictions);
-
-    if (!conjunction || conjunction->children.size() != 1) {
-        throw service::broadcast_tables::unsupported_operation_error(fmt::format(
-            "partition key restriction: {}", partition_key_restrictions));
-    }
-
-    const auto* key_restriction = cql3::expr::as_if<cql3::expr::binary_operator>(&conjunction->children[0]);
-
-    if (!key_restriction) {
-        throw service::broadcast_tables::unsupported_operation_error(fmt::format("partition key restriction: {}", *conjunction));
-    }
-
-    const auto* column = cql3::expr::as_if<cql3::expr::column_value>(&key_restriction->lhs);
-
-    if (!column || column->col->kind != column_kind::partition_key ||
-        key_restriction->op != cql3::expr::oper_t::EQ) {
-        throw service::broadcast_tables::unsupported_operation_error(fmt::format("key restriction: {}", *key_restriction));
-    }
-
-    return key_restriction->rhs;
-}
-
-static
-bool is_selecting_only_value(const cql3::selection::selection& selection) {
-    return selection.is_trivial() &&
-           selection.get_column_count() == 1 &&
-           selection.get_columns()[0]->name() == "value";
-}
-
-broadcast_select_statement::broadcast_select_statement(schema_ptr schema, uint32_t bound_terms,
-                                                                           lw_shared_ptr<const parameters> parameters,
-                                                                           ::shared_ptr<selection::selection> selection,
-                                                                           ::shared_ptr<const restrictions::statement_restrictions> restrictions,
-                                                                           ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
-                                                                           bool is_reversed,
-                                                                           ordering_comparator_type ordering_comparator,
-                                                                           std::optional<expr::expression> limit,
-                                                                           std::optional<expr::expression> per_partition_limit,
-                                                                           cql_stats &stats,
-                                                                           std::unique_ptr<attributes> attrs)
-    : select_statement{schema, bound_terms, parameters, selection, restrictions, group_by_cell_indices, is_reversed, ordering_comparator, std::move(limit), std::move(per_partition_limit), stats, std::move(attrs)},
-      _query{prepare_query()}
-{ }
-
-broadcast_tables::prepared_select broadcast_select_statement::prepare_query() const {
-    if (!is_selecting_only_value(*_selection)) {
-        throw service::broadcast_tables::unsupported_operation_error("only 'value' selector is allowed");
-    }
-
-    return {
-        .key = get_key(_restrictions->get_partition_key_restrictions())
-    };
-}
-
-static
-service::broadcast_tables::select_query
-evaluate_prepared(
-    const broadcast_tables::prepared_select& query,
-    const query_options& options) {
-    return service::broadcast_tables::select_query{
-        .key = expr::evaluate(query.key, options).to_bytes()
-    };
-}
-
-future<::shared_ptr<cql_transport::messages::result_message>>
-broadcast_select_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
-    if (this_shard_id() != 0) {
-        co_return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(0, cql3::computed_function_values{});
-    }
-
-    auto result = co_await qp.execute_broadcast_table_query(
-        { evaluate_prepared(_query, options) }
-    );
-
-    auto query_result = std::get_if<service::broadcast_tables::query_result_select>(&result);
-
-    if (!query_result) {
-        on_internal_error(logger, "incorrect query result ");
-    }
-
-    auto result_set = std::make_unique<cql3::result_set>(std::vector{
-        make_lw_shared<cql3::column_specification>(
-            db::system_keyspace::NAME,
-            db::system_keyspace::BROADCAST_KV_STORE,
-            ::make_shared<cql3::column_identifier>("value", true),
-            utf8_type
-        )
-    });
-
-    if (query_result->value) {
-        result_set->add_row({ managed_bytes_opt(query_result->value) });
-    }
-
-    co_return ::make_shared<cql_transport::messages::result_message::rows>(cql3::result{std::move(result_set)});
-}
-
-}
-
-}
--- a/cql3/statements/broadcast_select_statement.hh
+++ b/cql3/statements/broadcast_select_statement.hh
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2022-present ScyllaDB
- *
- * Modified by ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
- */
-
-#pragma once
-
-#include "cql3/expr/expression.hh"
-#include "cql3/statements/select_statement.hh"
-
-namespace cql3 {
-
-namespace statements {
-
-namespace broadcast_tables {
-
-struct prepared_select {
-    expr::expression key;
-};
-
-}
-
-class broadcast_select_statement : public select_statement {
-    const broadcast_tables::prepared_select _query;
-
-    broadcast_tables::prepared_select prepare_query() const;
-public:
-    broadcast_select_statement(schema_ptr schema,
-                     uint32_t bound_terms,
-                     lw_shared_ptr<const parameters> parameters,
-                     ::shared_ptr<selection::selection> selection,
-                     ::shared_ptr<const restrictions::statement_restrictions> restrictions,
-                     ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
-                     bool is_reversed,
-                     ordering_comparator_type ordering_comparator,
-                     std::optional<expr::expression> limit,
-                     std::optional<expr::expression> per_partition_limit,
-                     cql_stats &stats,
-                     std::unique_ptr<cql3::attributes> attrs);
-
-    virtual future<::shared_ptr<cql_transport::messages::result_message>>
-        execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const override;
-};
-
-}
-
-}
--- a/cql3/statements/create_keyspace_statement.cc
+++ b/cql3/statements/create_keyspace_statement.cc
@@ -123,9 +123,10 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, utils::chun
            // We hold a group0_guard, so it's correct to check this here.
            // The topology or schema cannot change while we're performing this query.
            locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
-        } catch (const std::invalid_argument& e) {
+        } catch (const std::exception& e) {
            if (replica::database::enforce_rf_rack_validity_for_keyspace(cfg, *ksm)) {
-                // wrap the exception in a type that can be passed to the user.
+                // There's no guarantee what the type of the exception will be, so we need to
+                // wrap it manually here in a type that can be passed to the user.
                throw exceptions::invalid_request_exception(e.what());
            } else {
                // Even when RF-rack-validity is not enforced for the keyspace, we'd
--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -30,9 +30,8 @@
 #include "service/storage_proxy.hh"
 #include "db/config.hh"
 #include "compaction/time_window_compaction_strategy.hh"
-#include "db/tags/extension.hh"
-#include "db/tags/utils.hh"
-#include "alternator/ttl_tag.hh"
+
+bool is_internal_keyspace(std::string_view name);

 namespace cql3 {

@@ -44,12 +43,10 @@ create_table_statement::create_table_statement(cf_name name,
                                               ::shared_ptr<cf_prop_defs> properties,
                                               bool if_not_exists,
                                               column_set_type static_columns,
-                                               ::shared_ptr<column_identifier> ttl_column,
                                               const std::optional<table_id>& id)
    : schema_altering_statement{name}
    , _use_compact_storage(false)
    , _static_columns{static_columns}
-    , _ttl_column{ttl_column}
    , _properties{properties}
    , _if_not_exists{if_not_exists}
    , _id(id)
@@ -127,14 +124,11 @@ void create_table_statement::apply_properties_to(schema_builder& builder, const
        addColumnMetadataFromAliases(cfmd, Collections.singletonList(valueAlias), defaultValidator, ColumnDefinition.Kind.COMPACT_VALUE);
 #endif

-    _properties->apply_to_builder(builder, _properties->make_schema_extensions(db.extensions()), db, keyspace(), true);
-    // Remembering which column was designated as the TTL column for row-based
-    // TTL column is done using a "tag" extension. If there is no TTL column,
-    // we don't need this extension at all.
-    if (_ttl_column) {
-        std::map<sstring, sstring> tags_map = {{TTL_TAG_KEY, _ttl_column->text()}};
-        builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(std::move(tags_map)));
+    if (!_properties->get_compression_options() && !is_internal_keyspace(keyspace())) {
+        builder.set_compressor_params(db.get_config().sstable_compression_user_table_options());
    }
+
+    _properties->apply_to_builder(builder, _properties->make_schema_extensions(db.extensions()), db, keyspace(), true);
 }

 void create_table_statement::add_column_metadata_from_aliases(schema_builder& builder, std::vector<bytes> aliases, const std::vector<data_type>& types, column_kind kind) const
@@ -210,7 +204,7 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
    }
    const bool has_default_ttl = _properties.properties()->get_default_time_to_live() > 0;

-    auto stmt = ::make_shared<create_table_statement>(*_cf_name, _properties.properties(), _if_not_exists, _static_columns, _ttl_column, _properties.properties()->get_id());
+    auto stmt = ::make_shared<create_table_statement>(*_cf_name, _properties.properties(), _if_not_exists, _static_columns, _properties.properties()->get_id());

    bool ks_uses_tablets;
    try {
@@ -415,27 +409,6 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
        }
    }

-    // If a TTL column is defined, it must be a regular column - not a static
-    // column or part of the primary key.
-    if (_ttl_column) {
-        if (!db.features().cql_row_ttl) {
-            throw exceptions::invalid_request_exception("The CQL per-row TTL feature is not yet supported by this cluster. Upgrade all nodes to use it.");
-        }
-        for (const auto& alias : key_aliases) {
-            if (alias->text() == _ttl_column->text()) {
-                throw exceptions::invalid_request_exception(format("TTL column {} cannot be part of the PRIMARY KEY", alias->text()));
-            }
-        }
-        for (const auto& alias : _column_aliases) {
-            if (alias->text() == _ttl_column->text()) {
-                throw exceptions::invalid_request_exception(format("TTL column {} cannot be part of the PRIMARY KEY", alias->text()));
-            }
-        }
-        if (_static_columns.contains(_ttl_column)) {
-            throw exceptions::invalid_request_exception(format("TTL column {} cannot be a static column", _ttl_column->text()));
-        }
-    }
-
    return std::make_unique<prepared_statement>(audit_info(), stmt, std::move(stmt_warnings));
 }

@@ -458,23 +431,12 @@ data_type create_table_statement::raw_statement::get_type_and_remove(column_map_
    return _properties.get_reversable_type(*t, type);
 }

-void create_table_statement::raw_statement::add_definition(::shared_ptr<column_identifier> def, ::shared_ptr<cql3_type::raw> type, bool is_static, bool is_ttl) {
+void create_table_statement::raw_statement::add_definition(::shared_ptr<column_identifier> def, ::shared_ptr<cql3_type::raw> type, bool is_static) {
    _defined_names.emplace(def);
    _definitions.emplace(def, type);
    if (is_static) {
        _static_columns.emplace(def);
    }
-    if (is_ttl) {
-        if (_ttl_column) {
-            throw exceptions::invalid_request_exception(fmt::format("Cannot have more than one TTL column in a table. Saw {} and {}", _ttl_column->text(), def->text()));
-        }
-        // FIXME: find a way to check cql3_type::raw without fmt::format
-        auto type_name = fmt::format("{}", type);
-        if (type_name != "timestamp" && type_name != "bigint" && type_name != "int") {
-            throw exceptions::invalid_request_exception(fmt::format("TTL column '{}' must be of type timestamp, bigint or int, can't be {}", def->text(), type_name));
-        }
-        _ttl_column = def;
-    }
 }

 void create_table_statement::raw_statement::add_key_aliases(const std::vector<::shared_ptr<column_identifier>> aliases) {
--- a/cql3/statements/create_table_statement.hh
+++ b/cql3/statements/create_table_statement.hh
@@ -57,7 +57,6 @@ class create_table_statement : public schema_altering_statement {
                           shared_ptr_equal_by_value<column_identifier>>;
    column_map_type _columns;
    column_set_type _static_columns;
-    ::shared_ptr<column_identifier> _ttl_column; // for row-based TTL
    const ::shared_ptr<cf_prop_defs> _properties;
    const bool _if_not_exists;
    std::optional<table_id> _id;
@@ -66,7 +65,6 @@ public:
                           ::shared_ptr<cf_prop_defs> properties,
                           bool if_not_exists,
                           column_set_type static_columns,
-                           ::shared_ptr<column_identifier> ttl_column,
                           const std::optional<table_id>& id);

    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
@@ -102,7 +100,6 @@ private:
    std::vector<std::vector<::shared_ptr<column_identifier>>> _key_aliases;
    std::vector<::shared_ptr<column_identifier>> _column_aliases;
    create_table_statement::column_set_type _static_columns;
-    ::shared_ptr<column_identifier> _ttl_column; // for row-based TTL

    std::multiset<::shared_ptr<column_identifier>,
            indirect_less<::shared_ptr<column_identifier>, column_identifier::text_comparator>> _defined_names;
@@ -119,7 +116,7 @@ public:

    data_type get_type_and_remove(column_map_type& columns, ::shared_ptr<column_identifier> t);

-    void add_definition(::shared_ptr<column_identifier> def, ::shared_ptr<cql3_type::raw> type, bool is_static, bool is_ttl);
+    void add_definition(::shared_ptr<column_identifier> def, ::shared_ptr<cql3_type::raw> type, bool is_static);

    void add_key_aliases(const std::vector<::shared_ptr<column_identifier>> aliases);

--- a/cql3/statements/describe_statement.cc
+++ b/cql3/statements/describe_statement.cc
@@ -23,7 +23,6 @@
 #include "index/vector_index.hh"
 #include "schema/schema.hh"
 #include "service/client_state.hh"
-#include "service/paxos/paxos_state.hh"
 #include "types/types.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/cql_statement.hh"
@@ -330,19 +329,6 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
                "*/",
                *table_desc.create_statement);

-        table_desc.create_statement = std::move(os).to_managed_string();
-    } else if (service::paxos::paxos_store::try_get_base_table(name)) {
-        // Paxos state table is internally managed by Scylla and it shouldn't be exposed to the user.
-        // The table is allowed to be described as a comment to ease administrative work but it's hidden from all listings.
-        fragmented_ostringstream os{};
-
-        fmt::format_to(os.to_iter(),
-                "/* Do NOT execute this statement! It's only for informational purposes.\n"
-                "   A paxos state table is created automatically when enabling LWT on a base table.\n"
-                "\n{}\n"
-                "*/",
-                *table_desc.create_statement);
-
        table_desc.create_statement = std::move(os).to_managed_string();
    }
    result.push_back(std::move(table_desc));
@@ -378,7 +364,7 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
 future<std::vector<description>> tables(const data_dictionary::database& db, const lw_shared_ptr<keyspace_metadata>& ks, std::optional<bool> with_internals = std::nullopt) {
    auto& replica_db = db.real_database();
    auto tables = ks->tables() | std::views::filter([&replica_db] (const schema_ptr& s) {
-        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name()) && !service::paxos::paxos_store::try_get_base_table(s->cf_name());
+        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name());
    }) | std::ranges::to<std::vector<schema_ptr>>();
    std::ranges::sort(tables, std::ranges::less(), std::mem_fn(&schema::cf_name));

--- a/cql3/statements/ks_prop_defs.cc
+++ b/cql3/statements/ks_prop_defs.cc
@@ -98,7 +98,6 @@ static locator::replication_strategy_config_options prepare_options(
        const sstring& strategy_class,
        const locator::token_metadata& tm,
        bool rf_rack_valid_keyspaces,
-        bool enforce_rack_list,
        locator::replication_strategy_config_options options,
        const locator::replication_strategy_config_options& old_options,
        bool rack_list_enabled,
@@ -108,7 +107,7 @@ static locator::replication_strategy_config_options prepare_options(
    auto is_nts = locator::abstract_replication_strategy::to_qualified_class_name(strategy_class) == "org.apache.cassandra.locator.NetworkTopologyStrategy";
    auto is_alter = !old_options.empty();
    const auto& all_dcs = tm.get_datacenter_racks_token_owners();
-    auto auto_expand_racks = uses_tablets && rack_list_enabled && (rf_rack_valid_keyspaces || enforce_rack_list);
+    auto auto_expand_racks = uses_tablets && rf_rack_valid_keyspaces && rack_list_enabled;

    logger.debug("prepare_options: {}: is_nts={} auto_expand_racks={} rack_list_enabled={} old_options={} new_options={} all_dcs={}",
                 strategy_class, is_nts, auto_expand_racks, rack_list_enabled, old_options, options, all_dcs);
@@ -418,7 +417,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(s
    auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
    bool uses_tablets = initial_tablets.has_value();
    bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
-    auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), cfg.enforce_rack_list(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
+    auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
    return data_dictionary::keyspace_metadata::new_keyspace(ks_name, sc,
            std::move(options), initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
 }
@@ -435,7 +434,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
    auto sc = get_replication_strategy_class();
    bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
    if (sc) {
-        options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), cfg.enforce_rack_list(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
+        options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
    } else {
        sc = old->strategy_name();
        options = old_options;
--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -11,7 +11,7 @@
 #include "utils/assert.hh"
 #include "cql3/cql_statement.hh"
 #include "cql3/statements/modification_statement.hh"
-#include "cql3/statements/broadcast_modification_statement.hh"
+#include "cql3/statements/strongly_consistent_modification_statement.hh"
 #include "cql3/statements/raw/modification_statement.hh"
 #include "cql3/statements/prepared_statement.hh"
 #include "cql3/expr/expr-utils.hh"
@@ -29,8 +29,6 @@
 #include "cql3/query_processor.hh"
 #include "service/storage_proxy.hh"
 #include "service/broadcast_tables/experimental/lang.hh"
-#include "cql3/statements/strong_consistency/modification_statement.hh"
-#include "cql3/statements/strong_consistency/statement_helpers.hh"

 #include <boost/lexical_cast.hpp>

@@ -548,7 +546,7 @@ modification_statement::process_where_clause(data_dictionary::database db, expr:
    }
 }

-::shared_ptr<broadcast_modification_statement>
+::shared_ptr<strongly_consistent_modification_statement>
 modification_statement::prepare_for_broadcast_tables() const {
    // FIXME: implement for every type of `modification_statement`.
    throw service::broadcast_tables::unsupported_operation_error{};
@@ -556,27 +554,24 @@ modification_statement::prepare_for_broadcast_tables() const {

 namespace raw {

+::shared_ptr<cql_statement_opt_metadata>
+modification_statement::prepare_statement(data_dictionary::database db, prepare_context& ctx, cql_stats& stats) {
+    ::shared_ptr<cql3::statements::modification_statement> statement = prepare(db, ctx, stats);
+
+    if (service::broadcast_tables::is_broadcast_table_statement(keyspace(), column_family())) {
+        return statement->prepare_for_broadcast_tables();
+    } else {
+        return statement;
+    }
+}
+
 std::unique_ptr<prepared_statement>
 modification_statement::prepare(data_dictionary::database db, cql_stats& stats) {
    schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
    auto meta = get_prepare_context();
-
-    auto statement = std::invoke([&] -> shared_ptr<cql_statement> {
-        auto result = prepare(db, meta, stats);
-
-        if (strong_consistency::is_strongly_consistent(db, schema->ks_name())) {
-            return ::make_shared<strong_consistency::modification_statement>(std::move(result));
-        }
-
-        if (service::broadcast_tables::is_broadcast_table_statement(keyspace(), column_family())) {
-            return result->prepare_for_broadcast_tables();
-        }
-        return result;
-    });
-
+    auto statement = prepare_statement(db, meta, stats);
    auto partition_key_bind_indices = meta.get_partition_key_bind_indexes(*schema);
-    return std::make_unique<prepared_statement>(audit_info(), std::move(statement), meta, 
-        std::move(partition_key_bind_indices));
+    return std::make_unique<prepared_statement>(audit_info(), std::move(statement), meta, std::move(partition_key_bind_indices));
 }

 ::shared_ptr<cql3::statements::modification_statement>
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -30,7 +30,7 @@ class operation;

 namespace statements {

-class broadcast_modification_statement;
+class strongly_consistent_modification_statement;

 namespace raw { class modification_statement; }

@@ -113,15 +113,15 @@ public:

    virtual void add_update_for_key(mutation& m, const query::clustering_range& range, const update_parameters& params, const json_cache_opt& json_cache) const = 0;

-    uint32_t get_bound_terms() const override;
+    virtual uint32_t get_bound_terms() const override;

-    const sstring& keyspace() const;
+    virtual const sstring& keyspace() const;

-    const sstring& column_family() const;
+    virtual const sstring& column_family() const;

-    bool is_counter() const;
+    virtual bool is_counter() const;

-    bool is_view() const;
+    virtual bool is_view() const;

    int64_t get_timestamp(int64_t now, const query_options& options) const;

@@ -129,12 +129,12 @@ public:

    std::optional<gc_clock::duration> get_time_to_live(const query_options& options) const;

-    future<> check_access(query_processor& qp, const service::client_state& state) const override;
+    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;

    // Validate before execute, using client state and current schema
    void validate(query_processor&, const service::client_state& state) const override;

-    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    void add_operation(::shared_ptr<operation> op);

@@ -256,9 +256,7 @@ public:

    virtual json_cache_opt maybe_prepare_json_cache(const query_options& options) const;

-    virtual ::shared_ptr<broadcast_modification_statement> prepare_for_broadcast_tables() const;
-
-    db::timeout_clock::duration get_timeout(const service::client_state& state, const query_options& options) const;
+    virtual ::shared_ptr<strongly_consistent_modification_statement> prepare_for_broadcast_tables() const;

 protected:
    /**
@@ -266,7 +264,9 @@ protected:
     * processed to check that they are compatible.
     * @throws InvalidRequestException
     */
-    void validate_where_clause_for_conditions() const;
+    virtual void validate_where_clause_for_conditions() const;
+
+    db::timeout_clock::duration get_timeout(const service::client_state& state, const query_options& options) const;

    friend class raw::modification_statement;
 };
--- a/cql3/statements/raw/batch_statement.hh
+++ b/cql3/statements/raw/batch_statement.hh
@@ -50,8 +50,8 @@ public:
 protected:
    virtual audit::statement_category category() const override;
    virtual audit::audit_info_ptr audit_info() const override {
-        constexpr bool batch = true;
-        return audit::audit::create_audit_info(category(), sstring(), sstring(), batch);
+        // We don't audit batch statements. Instead we audit statements that are inside the batch.
+        return audit::audit::create_no_audit_info();
    }
 };

--- a/cql3/statements/raw/modification_statement.hh
+++ b/cql3/statements/raw/modification_statement.hh
@@ -40,6 +40,7 @@ protected:

 public:
    virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
+    ::shared_ptr<cql_statement_opt_metadata> prepare_statement(data_dictionary::database db, prepare_context& ctx, cql_stats& stats);
    ::shared_ptr<cql3::statements::modification_statement> prepare(data_dictionary::database db, prepare_context& ctx, cql_stats& stats) const;
    void add_raw(sstring&& raw) { _raw_cql = std::move(raw); }
    const sstring& get_raw_cql() const { return _raw_cql; }
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -131,6 +131,8 @@ private:

    void verify_ordering_is_valid(const prepared_orderings_type&, const schema&, const restrictions::statement_restrictions& restrictions) const;

+    prepared_ann_ordering_type prepare_ann_ordering(const schema& schema, prepare_context& ctx, data_dictionary::database db) const;
+
    // Checks whether this ordering reverses all results.
    // We only allow leaving select results unchanged or reversing them.
    bool is_ordering_reversed(const prepared_orderings_type&) const;
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -8,8 +8,6 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

-#include "cql3/statements/strong_consistency/select_statement.hh"
-#include "cql3/statements/strong_consistency/statement_helpers.hh"
 #include "cql3/statements/select_statement.hh"
 #include "cql3/expr/expression.hh"
 #include "cql3/expr/evaluate.hh"
@@ -18,7 +16,7 @@
 #include "cql3/statements/raw/select_statement.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/statements/prune_materialized_view_statement.hh"
-#include "cql3/statements/broadcast_select_statement.hh"
+#include "cql3/statements/strongly_consistent_select_statement.hh"

 #include "exceptions/exceptions.hh"
 #include <seastar/core/future.hh>
@@ -27,14 +25,12 @@
 #include "service/broadcast_tables/experimental/lang.hh"
 #include "service/qos/qos_common.hh"
 #include "transport/messages/result_message.hh"
-#include "cql3/functions/functions.hh"
 #include "cql3/functions/as_json_function.hh"
 #include "cql3/selection/selection.hh"
 #include "cql3/util.hh"
 #include "cql3/restrictions/statement_restrictions.hh"
 #include "index/secondary_index.hh"
 #include "types/vector.hh"
-#include "vector_search/filter.hh"
 #include "validation.hh"
 #include "exceptions/unrecognized_entity_exception.hh"
 #include <optional>
@@ -259,9 +255,11 @@ uint32_t select_statement::get_bound_terms() const {

 future<> select_statement::check_access(query_processor& qp, const service::client_state& state) const {
    try {
-        auto cdc = qp.db().get_cdc_base_table(*_schema);
-        auto& cf_name = _schema->is_view()
-            ? _schema->view_info()->base_name()
+        const data_dictionary::database db = qp.db();
+        auto&& s = db.find_schema(keyspace(), column_family());
+        auto cdc = db.get_cdc_base_table(*s);
+        auto& cf_name = s->is_view()
+            ? s->view_info()->base_name()
            : (cdc ? cdc->cf_name() : column_family());
        const schema_ptr& base_schema = cdc ? cdc : _schema;
        bool is_vector_indexed = secondary_index::vector_index::has_vector_index(*base_schema);
@@ -370,9 +368,8 @@ uint64_t select_statement::get_inner_loop_limit(uint64_t limit, bool is_aggregat
 }

 bool select_statement::needs_post_query_ordering() const {
-    // We need post-query ordering for queries with IN on the partition key and an ORDER BY
-    // and ANN index queries with rescoring.
-    return static_cast<bool>(_ordering_comparator);
+    // We need post-query ordering only for queries with IN on the partition key and an ORDER BY.
+    return _restrictions->key_is_in_relation() && !_parameters->orderings().empty();
 }

 struct select_statement_executor {
@@ -1961,139 +1958,46 @@ mutation_fragments_select_statement::do_execute(query_processor& qp, service::qu
            }));
 }

-struct ann_ordering_info {
-    secondary_index::index _index;
-    raw::select_statement::prepared_ann_ordering_type _prepared_ann_ordering;
-    bool is_rescoring_enabled;
-};
-
-static std::optional<ann_ordering_info> get_ann_ordering_info(
-        data_dictionary::database db,
-        schema_ptr schema,
-        lw_shared_ptr<const raw::select_statement::parameters> parameters,
-        prepare_context& ctx) {
-
-    if (parameters->orderings().empty()) {
-        return std::nullopt;
-    }
-
-    auto [column_id, ordering] = parameters->orderings().front();
-    const auto& ann_vector = std::get_if<raw::select_statement::ann_vector>(&ordering);
-    if (!ann_vector) {
-        return std::nullopt;
-    }
-
-    ::shared_ptr<column_identifier> column = column_id->prepare_column_identifier(*schema);
-    const column_definition* def = schema->get_column_definition(column->name());
-    if (!def) {
-        throw exceptions::invalid_request_exception(
-                fmt::format("Undefined column name {}", column->text()));
-    }
-
-    if (!def->type->is_vector() || static_cast<const vector_type_impl*>(def->type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
-        throw exceptions::invalid_request_exception("ANN ordering is only supported on float vector indexes");
-    }
-
-    auto e =  expr::prepare_expression(*ann_vector, db, schema->ks_name(), nullptr, def->column_specification);
-    expr::fill_prepare_context(e, ctx);
-
-    raw::select_statement::prepared_ann_ordering_type prepared_ann_ordering = std::make_pair(std::move(def), std::move(e));
-
+::shared_ptr<cql3::statements::select_statement> vector_indexed_table_select_statement::prepare(data_dictionary::database db, schema_ptr schema,
+        uint32_t bound_terms, lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
+        ::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
+        ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
+        std::optional<expr::expression> per_partition_limit, cql_stats& stats, std::unique_ptr<attributes> attrs) {
    auto cf = db.find_column_family(schema);
    auto& sim = cf.get_index_manager();
+    auto [index_opt, _] = restrictions->find_idx(sim);

    auto indexes = sim.list_indexes();
    auto it = std::find_if(indexes.begin(), indexes.end(), [&prepared_ann_ordering](const auto& ind) {
-        return secondary_index::vector_index::is_vector_index_on_column(ind.metadata(), prepared_ann_ordering.first->name_as_text());
+        return (ind.metadata().options().contains(db::index::secondary_index::custom_class_option_name) &&
+                       ind.metadata().options().at(db::index::secondary_index::custom_class_option_name) == ANN_CUSTOM_INDEX_OPTION) &&
+               (ind.target_column() == prepared_ann_ordering.first->name_as_text());
    });

    if (it == indexes.end()) {
        throw exceptions::invalid_request_exception("ANN ordering by vector requires the column to be indexed using 'vector_index'");
    }
+    
+    index_opt = *it;

-    return ann_ordering_info{
-        *it,
-        std::move(prepared_ann_ordering),
-        secondary_index::vector_index::is_rescoring_enabled(it->metadata().options())
-    };
-}
-
-static uint32_t add_similarity_function_to_selectors(
-        std::vector<selection::prepared_selector>& prepared_selectors,
-        const ann_ordering_info& ann_ordering_info,
-        data_dictionary::database db,
-        schema_ptr schema) {
-    auto similarity_function_name = secondary_index::vector_index::get_cql_similarity_function_name(ann_ordering_info._index.metadata().options());
-    // Create the function name
-    auto func_name = functions::function_name::native_function(sstring(similarity_function_name));
-
-    // Create the function arguments
-    std::vector<expr::expression> args;
-    args.push_back(expr::column_value(ann_ordering_info._prepared_ann_ordering.first));
-    args.push_back(ann_ordering_info._prepared_ann_ordering.second);
-
-    // Get the function object
-    std::vector<shared_ptr<assignment_testable>> provided_args;
-    provided_args.push_back(expr::as_assignment_testable(args[0], expr::type_of(args[0])));
-    provided_args.push_back(expr::as_assignment_testable(args[1], expr::type_of(args[1])));
-
-    auto func = cql3::functions::instance().get(db, schema->ks_name(), func_name, provided_args, schema->ks_name(), schema->cf_name(), nullptr);
-
-    // Create the function call expression
-    expr::function_call similarity_func_call{
-        .func = func,
-        .args = std::move(args),
-    };
-
-    // Add the similarity function as a prepared selector (last)
-    prepared_selectors.push_back(selection::prepared_selector{
-        .expr = std::move(similarity_func_call),
-        .alias = nullptr,
-    });
-    return prepared_selectors.size() - 1;
-}
-
-static select_statement::ordering_comparator_type get_similarity_ordering_comparator(std::vector<selection::prepared_selector>& prepared_selectors, uint32_t similarity_column_index) {
-    auto type = expr::type_of(prepared_selectors[similarity_column_index].expr);
-    if (type->get_kind() != abstract_type::kind::float_kind) {
-        seastar::on_internal_error(logger, "Similarity function must return float type.");
+    if (!index_opt) {
+        throw std::runtime_error("No index found.");
    }
-    return [similarity_column_index, type] (const raw::select_statement::result_row_type& r1, const raw::select_statement::result_row_type& r2) {
-        auto& c1 = r1[similarity_column_index];
-        auto& c2 = r2[similarity_column_index];
-        auto f1 = c1 ? value_cast<float>(type->deserialize(*c1)) : std::numeric_limits<float>::quiet_NaN();
-        auto f2 = c2 ? value_cast<float>(type->deserialize(*c2)) : std::numeric_limits<float>::quiet_NaN();
-        if (std::isfinite(f1) && std::isfinite(f2)) {
-            return f1 > f2;
-        }
-        return std::isfinite(f1);
-    };
-}
-
-::shared_ptr<cql3::statements::select_statement> vector_indexed_table_select_statement::prepare(data_dictionary::database db, schema_ptr schema,
-        uint32_t bound_terms, lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
-        ::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
-        ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
-        std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<attributes> attrs) {
-
-    auto prepared_filter = vector_search::prepare_filter(*restrictions, parameters->allow_filtering());

    return ::make_shared<cql3::statements::vector_indexed_table_select_statement>(schema, bound_terms, parameters, std::move(selection), std::move(restrictions),
            std::move(group_by_cell_indices), is_reversed, std::move(ordering_comparator), std::move(prepared_ann_ordering), std::move(limit),
-            std::move(per_partition_limit), stats, index, std::move(prepared_filter), std::move(attrs));
+            std::move(per_partition_limit), stats, *index_opt, std::move(attrs));
 }

 vector_indexed_table_select_statement::vector_indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms, lw_shared_ptr<const parameters> parameters,
        ::shared_ptr<selection::selection> selection, ::shared_ptr<const restrictions::statement_restrictions> restrictions,
        ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed, ordering_comparator_type ordering_comparator,
        prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
-        std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index,
-        vector_search::prepared_filter prepared_filter, std::unique_ptr<attributes> attrs)
+        std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<attributes> attrs)
    : select_statement{schema, bound_terms, parameters, selection, restrictions, group_by_cell_indices, is_reversed, ordering_comparator, limit,
              per_partition_limit, stats, std::move(attrs)}
    , _index{index}
-    , _prepared_ann_ordering(std::move(prepared_ann_ordering))
-    , _prepared_filter(std::move(prepared_filter)) {
+    , _prepared_ann_ordering(std::move(prepared_ann_ordering)) {

    if (!limit.has_value()) {
        throw exceptions::invalid_request_exception("Vector ANN queries must have a limit specified");
@@ -2128,19 +2032,13 @@ future<shared_ptr<cql_transport::messages::result_message>> vector_indexed_table

        auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
        auto aoe = abort_on_expiry(timeout);
-        auto filter_json = _prepared_filter.to_json(options);
-        uint64_t fetch = static_cast<uint64_t>(std::ceil(limit * secondary_index::vector_index::get_oversampling(_index.metadata().options())));
        auto pkeys = co_await qp.vector_store_client().ann(
-                _schema->ks_name(), _index.metadata().name(), _schema, get_ann_ordering_vector(options), fetch, filter_json, aoe.abort_source());
+                _schema->ks_name(), _index.metadata().name(), _schema, get_ann_ordering_vector(options), limit, aoe.abort_source());
        if (!pkeys.has_value()) {
            co_await coroutine::return_exception(
                    exceptions::invalid_request_exception(std::visit(vector_search::vector_store_client::ann_error_visitor{}, pkeys.error())));
        }

-        if (pkeys->size() > limit && !secondary_index::vector_index::is_rescoring_enabled(_index.metadata().options())) {
-            pkeys->erase(pkeys->begin() + limit, pkeys->end());
-        }
-
        co_return co_await query_base_table(qp, state, options, pkeys.value(), timeout);
    });

@@ -2157,11 +2055,11 @@ void vector_indexed_table_select_statement::update_stats() const {
 }

 lw_shared_ptr<query::read_command> vector_indexed_table_select_statement::prepare_command_for_base_query(
-        query_processor& qp, service::query_state& state, const query_options& options, uint64_t fetch_limit) const {
+        query_processor& qp, service::query_state& state, const query_options& options) const {
    auto slice = make_partition_slice(options);
    return ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), std::move(slice), qp.proxy().get_max_result_size(slice),
            query::tombstone_limit(qp.proxy().get_tombstone_limit()),
-            query::row_limit(get_inner_loop_limit(fetch_limit, _selection->is_aggregate())), query::partition_limit(query::max_partitions),
+            query::row_limit(get_inner_loop_limit(get_limit(options, _limit), _selection->is_aggregate())), query::partition_limit(query::max_partitions),
            _query_start_time_point, tracing::make_trace_info(state.get_trace_state()), query_id::create_null_id(), query::is_first_page::no,
            options.get_timestamp(state));
 }
@@ -2179,7 +2077,7 @@ std::vector<float> vector_indexed_table_select_statement::get_ann_ordering_vecto
 future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::query_base_table(query_processor& qp,
        service::query_state& state, const query_options& options, const std::vector<vector_search::primary_key>& pkeys,
        lowres_clock::time_point timeout) const {
-    auto command = prepare_command_for_base_query(qp, state, options, pkeys.size());
+    auto command = prepare_command_for_base_query(qp, state, options);

    // For tables without clustering columns, we can optimize by querying
    // partition ranges instead of individual primary keys, since the
@@ -2218,7 +2116,6 @@ future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_tab
            query::result_merger{command->get_row_limit(), query::max_partitions});

    co_return co_await wrap_result_to_error_message([this, &command, &options](auto result) {
-        command->set_row_limit(get_limit(options, _limit));
        return process_results(std::move(result), command, options, _query_start_time_point);
    })(std::move(result));
 }
@@ -2232,7 +2129,6 @@ future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_tab
                    {timeout, state.get_permit(), state.get_client_state(), state.get_trace_state(), {}, {}, options.get_specific_options().node_local_only},
                    std::nullopt)
            .then(wrap_result_to_error_message([this, &options, command](service::storage_proxy::coordinator_query_result qr) {
-                command->set_row_limit(get_limit(options, _limit));
                return this->process_results(std::move(qr.query_result), command, options, _query_start_time_point);
            }));
 }
@@ -2327,41 +2223,32 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d

    prepared_selectors = maybe_jsonize_select_clause(std::move(prepared_selectors), db, schema);

-    std::optional<ann_ordering_info> ann_ordering_info_opt = get_ann_ordering_info(db, schema, _parameters, ctx);
-    bool is_ann_query = ann_ordering_info_opt.has_value();
+    auto aggregation_depth = 0u;

-    if (prepared_selectors.empty() && (!_group_by_columns.empty() || (is_ann_query && ann_ordering_info_opt->is_rescoring_enabled))) {
-        // We have a "SELECT * GROUP BY" or "SELECT * ORDER BY ANN" with rescoring enabled. If we leave prepared_selectors
-        // empty, below we choose selection::wildcard() for SELECT *, and either:
-        //  - forget to do the "levellize" trick needed for the GROUP BY. See #16531.
-        //  - forget to add the similarity function needed for ORDER BY ANN with rescoring. See below.
-        // So we need to set prepared_selectors. 
-        auto all_columns = selection::selection::wildcard_columns(schema);
-        std::vector<::shared_ptr<selection::raw_selector>> select_all;
-        select_all.reserve(all_columns.size());
-        for (const column_definition *cdef : all_columns) {
-            auto name = ::make_shared<cql3::column_identifier::raw>(cdef->name_as_text(), true);
-            select_all.push_back(::make_shared<selection::raw_selector>(
-                expr::unresolved_identifier(std::move(name)), nullptr));
+    // Force aggregation if GROUP BY is used. This will wrap every column x as first(x).
+    if (!_group_by_columns.empty()) {
+        aggregation_depth = std::max(aggregation_depth, 1u);
+        if (prepared_selectors.empty()) {
+            // We have a "SELECT * GROUP BY". If we leave prepared_selectors
+            // empty, below we choose selection::wildcard() for SELECT *, and
+            // forget to do the "levellize" trick needed for the GROUP BY.
+            // So we need to set prepared_selectors. See #16531.
+            auto all_columns = selection::selection::wildcard_columns(schema);
+            std::vector<::shared_ptr<selection::raw_selector>> select_all;
+            select_all.reserve(all_columns.size());
+            for (const column_definition *cdef : all_columns) {
+                auto name = ::make_shared<cql3::column_identifier::raw>(cdef->name_as_text(), true);
+                select_all.push_back(::make_shared<selection::raw_selector>(
+                    expr::unresolved_identifier(std::move(name)), nullptr));
+            }
+            prepared_selectors = selection::raw_selector::to_prepared_selectors(select_all, *schema, db, keyspace());
        }
-        prepared_selectors = selection::raw_selector::to_prepared_selectors(select_all, *schema, db, keyspace());
    }

    for (auto& ps : prepared_selectors) {
        expr::fill_prepare_context(ps.expr, ctx);
    }

-    // Force aggregation if GROUP BY is used. This will wrap every column x as first(x).
-    auto aggregation_depth = _group_by_columns.empty() ? 0u : 1u;
-
-    select_statement::ordering_comparator_type ordering_comparator;
-    bool hide_last_column = false;
-    if (is_ann_query && ann_ordering_info_opt->is_rescoring_enabled) {
-        uint32_t similarity_column_index = add_similarity_function_to_selectors(prepared_selectors, *ann_ordering_info_opt, db, schema);
-        hide_last_column = true;
-        ordering_comparator = get_similarity_ordering_comparator(prepared_selectors, similarity_column_index);
-    }
-
    for (auto& ps : prepared_selectors) {
        aggregation_depth = std::max(aggregation_depth, expr::aggregation_depth(ps.expr));
    }
@@ -2379,11 +2266,6 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
                     ? selection::selection::wildcard(schema)
                     : selection::selection::from_selectors(db, schema, keyspace(), levellized_prepared_selectors);

-    if (is_ann_query && hide_last_column) {
-        // Hide the similarity selector from the client by reducing column_count
-        selection->get_result_metadata()->hide_last_column();
-    }
-
    // Cassandra 5.0.2 disallows PER PARTITION LIMIT with aggregate queries
    // but only if GROUP BY is not used.
    // See #9879 for more details.
@@ -2391,6 +2273,8 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
        throw exceptions::invalid_request_exception("PER PARTITION LIMIT is not allowed with aggregate queries.");
    }

+    bool is_ann_query = !_parameters->orderings().empty() && std::holds_alternative<select_statement::ann_vector>(_parameters->orderings().front().second);
+
    auto restrictions = prepare_restrictions(db, schema, ctx, selection, for_view, _parameters->allow_filtering() || is_ann_query,
            restrictions::check_indexes(!_parameters->is_mutation_fragments()));

@@ -2398,14 +2282,19 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
        validate_distinct_selection(*schema, *selection, *restrictions);
    }

+    select_statement::ordering_comparator_type ordering_comparator;
    bool is_reversed_ = false;

+    std::optional<prepared_ann_ordering_type> prepared_ann_ordering;
+
    auto orderings = _parameters->orderings();

-    if (!orderings.empty() && !is_ann_query) {
+    if (!orderings.empty()) {
        std::visit([&](auto&& ordering) {
            using T = std::decay_t<decltype(ordering)>;
-            if constexpr (!std::is_same_v<T, select_statement::ann_vector>) {
+            if constexpr (std::is_same_v<T, select_statement::ann_vector>) {
+                prepared_ann_ordering = prepare_ann_ordering(*schema, ctx, db);
+            } else {
                SCYLLA_ASSERT(!for_view);
                verify_ordering_is_allowed(*_parameters, *restrictions);
                prepared_orderings_type prepared_orderings = prepare_orderings(*schema);
@@ -2418,7 +2307,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
    }

    std::vector<sstring> warnings;
-    if (!is_ann_query) {
+    if (!prepared_ann_ordering.has_value()) {
        check_needs_filtering(*restrictions, db.get_config().strict_allow_filtering(), warnings);
        ensure_filtering_columns_retrieval(db, *selection, *restrictions);
    }
@@ -2472,21 +2361,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
                && restrictions->partition_key_restrictions_size() == schema->partition_key_size());
    };

-    if (strong_consistency::is_strongly_consistent(db, schema->ks_name())) {
-        stmt = ::make_shared<strong_consistency::select_statement>(
-                schema,
-                ctx.bound_variables_size(),
-                _parameters,
-                std::move(selection),
-                std::move(restrictions),
-                std::move(group_by_cell_indices),
-                is_reversed_,
-                std::move(ordering_comparator),
-                prepare_limit(db, ctx, _limit),
-                prepare_limit(db, ctx, _per_partition_limit),
-                stats,
-                std::move(prepared_attrs));
-    } else if (_parameters->is_prune_materialized_view()) {
+    if (_parameters->is_prune_materialized_view()) {
        stmt = ::make_shared<cql3::statements::prune_materialized_view_statement>(
                schema,
                ctx.bound_variables_size(),
@@ -2515,10 +2390,10 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
                prepare_limit(db, ctx, _per_partition_limit),
                stats,
                std::move(prepared_attrs));
-    } else if (is_ann_query) {
+    } else if (prepared_ann_ordering) {
        stmt = vector_indexed_table_select_statement::prepare(db, schema, ctx.bound_variables_size(), _parameters, std::move(selection), std::move(restrictions),
-                std::move(group_by_cell_indices), is_reversed_, std::move(ordering_comparator), std::move(ann_ordering_info_opt->_prepared_ann_ordering),
-                prepare_limit(db, ctx, _limit), prepare_limit(db, ctx, _per_partition_limit), stats, ann_ordering_info_opt->_index, std::move(prepared_attrs));
+                std::move(group_by_cell_indices), is_reversed_, std::move(ordering_comparator), std::move(*prepared_ann_ordering),
+                prepare_limit(db, ctx, _limit), prepare_limit(db, ctx, _per_partition_limit), stats, std::move(prepared_attrs));
    } else if (restrictions->uses_secondary_indexing()) {
        stmt = view_indexed_table_select_statement::prepare(
                db,
@@ -2550,7 +2425,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
            std::move(prepared_attrs)
        );
    } else if (service::broadcast_tables::is_broadcast_table_statement(keyspace(), column_family())) {
-        stmt = ::make_shared<cql3::statements::broadcast_select_statement>(
+        stmt = ::make_shared<cql3::statements::strongly_consistent_select_statement>(
                schema,
                ctx.bound_variables_size(),
                _parameters,
@@ -2740,6 +2615,28 @@ void select_statement::verify_ordering_is_valid(const prepared_orderings_type& o
    }
 }

+select_statement::prepared_ann_ordering_type select_statement::prepare_ann_ordering(const schema& schema, prepare_context& ctx, data_dictionary::database db) const {
+    auto [column_id, ordering] = _parameters->orderings().front();
+    const auto& ann_vector = std::get_if<select_statement::ann_vector>(&ordering);
+    SCYLLA_ASSERT(ann_vector);
+
+    ::shared_ptr<column_identifier> column = column_id->prepare_column_identifier(schema);
+    const column_definition* def = schema.get_column_definition(column->name());
+    if (!def) {
+        throw exceptions::invalid_request_exception(
+                fmt::format("Undefined column name {}", column->text()));
+    }
+
+    if (!def->type->is_vector() || static_cast<const vector_type_impl*>(def->type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
+        throw exceptions::invalid_request_exception("ANN ordering is only supported on float vector indexes");
+    }
+
+    auto e =  expr::prepare_expression(*ann_vector, db, keyspace(), nullptr, def->column_specification);
+    expr::fill_prepare_context(e, ctx);
+
+    return std::make_pair(std::move(def), std::move(e));
+}
+
 select_statement::ordering_comparator_type select_statement::get_ordering_comparator(const prepared_orderings_type& orderings,
    selection::selection& selection,
    const restrictions::statement_restrictions& restrictions) {
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -22,7 +22,6 @@
 #include "locator/host_id.hh"
 #include "service/cas_shard.hh"
 #include "vector_search/vector_store_client.hh"
-#include "vector_search/filter.hh"

 namespace service {
    class client_state;
@@ -363,7 +362,6 @@ private:
 class vector_indexed_table_select_statement : public select_statement {
    secondary_index::index _index;
    prepared_ann_ordering_type _prepared_ann_ordering;
-    vector_search::prepared_filter _prepared_filter;
    mutable gc_clock::time_point _query_start_time_point;

 public:
@@ -373,13 +371,13 @@ public:
            lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
            ::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
            ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
-            std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<cql3::attributes> attrs);
+            std::optional<expr::expression> per_partition_limit, cql_stats& stats, std::unique_ptr<cql3::attributes> attrs);

    vector_indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms, lw_shared_ptr<const parameters> parameters,
            ::shared_ptr<selection::selection> selection, ::shared_ptr<const restrictions::statement_restrictions> restrictions,
            ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed, ordering_comparator_type ordering_comparator,
            prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit, std::optional<expr::expression> per_partition_limit,
-            cql_stats& stats, const secondary_index::index& index, vector_search::prepared_filter prepared_filter, std::unique_ptr<cql3::attributes> attrs);
+            cql_stats& stats, const secondary_index::index& index, std::unique_ptr<cql3::attributes> attrs);

 private:
    future<::shared_ptr<cql_transport::messages::result_message>> do_execute(
@@ -387,7 +385,7 @@ private:

    void update_stats() const;

-    lw_shared_ptr<query::read_command> prepare_command_for_base_query(query_processor& qp, service::query_state& state, const query_options& options, uint64_t fetch_limit) const;
+    lw_shared_ptr<query::read_command> prepare_command_for_base_query(query_processor& qp, service::query_state& state, const query_options& options) const;

    std::vector<float> get_ann_ordering_vector(const query_options& options) const;

--- a/cql3/statements/strong_consistency/modification_statement.cc
+++ b/cql3/statements/strong_consistency/modification_statement.cc
@@ -1,82 +0,0 @@
-/*
- * Copyright (C) 2025-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#include "modification_statement.hh"
-
-#include "transport/messages/result_message.hh"
-#include "cql3/query_processor.hh"
-#include "service/strong_consistency/coordinator.hh"
-#include "cql3/statements/strong_consistency/statement_helpers.hh"
-
-namespace cql3::statements::strong_consistency {
-static logging::logger logger("sc_modification_statement");
-
-modification_statement::modification_statement(shared_ptr<base_statement> statement)
-    : cql_statement_opt_metadata(&timeout_config::write_timeout)
-    , _statement(std::move(statement))
-{
-}
-
-using result_message = cql_transport::messages::result_message;
-
-future<shared_ptr<result_message>> modification_statement::execute(query_processor& qp, service::query_state& qs, 
-    const query_options& options, std::optional<service::group0_guard> guard) const
-{
-    return execute_without_checking_exception_message(qp, qs, options, std::move(guard))
-            .then(cql_transport::messages::propagate_exception_as_future<shared_ptr<result_message>>);
-}
-
-future<shared_ptr<result_message>> modification_statement::execute_without_checking_exception_message(
-        query_processor& qp, service::query_state& qs, const query_options& options,
-        std::optional<service::group0_guard> guard) const
-{
-    auto json_cache = base_statement::json_cache_opt{};
-    const auto keys = _statement->build_partition_keys(options, json_cache);
-    if (keys.size() != 1 || !query::is_single_partition(keys[0])) {
-        throw exceptions::invalid_request_exception("Strongly consistent queries can only target a single partition");
-    }
-    if (_statement->requires_read()) {
-        throw exceptions::invalid_request_exception("Strongly consistent updates don't support data prefetch");
-    }
-
-    auto [coordinator, holder] = qp.acquire_strongly_consistent_coordinator();
-    const auto mutate_result = co_await coordinator.get().mutate(_statement->s,
-        keys[0].start()->value().token(),
-        [&](api::timestamp_type ts) {
-            const auto prefetch_data = update_parameters::prefetch_data(_statement->s);
-            const auto ttl = _statement->get_time_to_live(options);
-            const auto params = update_parameters(_statement->s, options, ts, ttl, prefetch_data);
-            const auto ranges = _statement->create_clustering_ranges(options, json_cache);
-            auto muts = _statement->apply_updates(keys, ranges, params, json_cache);
-            if (muts.size() != 1) {
-                on_internal_error(logger, ::format("statement '{}' has unexpected number of mutations {}",
-                    raw_cql_statement, muts.size()));
-            }
-            return std::move(*muts.begin());
-        });
-
-    using namespace service::strong_consistency;
-    if (const auto* redirect = get_if<need_redirect>(&mutate_result)) {
-        co_return co_await redirect_statement(qp, options, redirect->target);
-    }
-
-    co_return seastar::make_shared<result_message::void_message>();
-}
-
-future<> modification_statement::check_access(query_processor& qp, const service::client_state& state) const {
-    return _statement->check_access(qp, state);
-}
-
-uint32_t modification_statement::get_bound_terms() const {
-    return _statement->get_bound_terms();
-}
-
-bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
-    return _statement->depends_on(ks_name, cf_name);
-}
-}
--- a/cql3/statements/strong_consistency/modification_statement.hh
+++ b/cql3/statements/strong_consistency/modification_statement.hh
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2025-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#pragma once
-
-#include "cql3/cql_statement.hh"
-#include "cql3/expr/expression.hh"
-#include "cql3/statements/modification_statement.hh"
-
-namespace cql3::statements::strong_consistency {
-
-class modification_statement : public cql_statement_opt_metadata {
-    using result_message = cql_transport::messages::result_message;
-    using base_statement = cql3::statements::modification_statement;
-
-    shared_ptr<base_statement> _statement;
-public:
-    modification_statement(shared_ptr<base_statement> statement);
-
-    future<shared_ptr<result_message>> execute(query_processor& qp, service::query_state& state,
-        const query_options& options, std::optional<service::group0_guard> guard) const override;
-
-    future<shared_ptr<result_message>> execute_without_checking_exception_message(query_processor& qp,
-        service::query_state& qs, const query_options& options,
-        std::optional<service::group0_guard> guard) const override;
-
-    future<> check_access(query_processor& qp, const service::client_state& state) const override;
-
-    uint32_t get_bound_terms() const override;
-
-    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
-};
-
-}
--- a/cql3/statements/strong_consistency/select_statement.cc
+++ b/cql3/statements/strong_consistency/select_statement.cc
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2025-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#include "select_statement.hh"
-
-#include "query/query-request.hh"
-#include "cql3/query_processor.hh"
-#include "service/strong_consistency/coordinator.hh"
-#include "cql3/statements/strong_consistency/statement_helpers.hh"
-
-namespace cql3::statements::strong_consistency {
-
-using result_message = cql_transport::messages::result_message;
-
-future<::shared_ptr<result_message>> select_statement::do_execute(query_processor& qp,
-        service::query_state& state, 
-        const query_options& options) const
-{
-    const auto key_ranges = _restrictions->get_partition_key_ranges(options);
-    if (key_ranges.size() != 1 || !query::is_single_partition(key_ranges[0])) {
-        throw exceptions::invalid_request_exception("Strongly consistent queries can only target a single partition");
-    }
-    const auto now = gc_clock::now();
-    auto read_command = make_lw_shared<query::read_command>(
-        _query_schema->id(),
-        _query_schema->version(),
-        make_partition_slice(options),
-        query::max_result_size(query::result_memory_limiter::maximum_result_size),
-        query::tombstone_limit(query::tombstone_limit::max),
-        query::row_limit(get_inner_loop_limit(get_limit(options, _limit), _selection->is_aggregate())),
-        query::partition_limit(query::max_partitions),
-        now,
-        tracing::make_trace_info(state.get_trace_state()),
-        query_id::create_null_id(),
-        query::is_first_page::no,
-        options.get_timestamp(state));
-    const auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
-    auto [coordinator, holder] = qp.acquire_strongly_consistent_coordinator();
-    auto query_result = co_await coordinator.get().query(_query_schema, *read_command,
-        key_ranges, state.get_trace_state(), timeout);
-
-    using namespace service::strong_consistency;
-    if (const auto* redirect = get_if<need_redirect>(&query_result)) {
-        co_return co_await redirect_statement(qp, options, redirect->target);
-    }
-
-    co_return co_await process_results(get<lw_shared_ptr<query::result>>(std::move(query_result)),
-        read_command, options, now);
-}
-
-}
--- a/cql3/statements/strong_consistency/select_statement.hh
+++ b/cql3/statements/strong_consistency/select_statement.hh
@@ -1,26 +0,0 @@
-/*
- * Copyright (C) 2025-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#pragma once
-
-#include "cql3/cql_statement.hh"
-#include "cql3/statements/select_statement.hh"
-
-namespace cql3::statements::strong_consistency {
-
-class select_statement : public cql3::statements::select_statement {
-    using result_message = cql_transport::messages::result_message;
-
-public:
-    using cql3::statements::select_statement::select_statement;
-
-    future<::shared_ptr<cql_transport::messages::result_message>> do_execute(query_processor& qp,
-        service::query_state& state, const query_options& options) const override;
-};
-
-}
--- a/cql3/statements/strong_consistency/statement_helpers.cc
+++ b/cql3/statements/strong_consistency/statement_helpers.cc
@@ -1,37 +0,0 @@
-/*
- * Copyright (C) 2025-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#include "statement_helpers.hh"
-
-#include "transport/messages/result_message_base.hh"
-#include "cql3/query_processor.hh"
-#include "replica/database.hh"
-#include "locator/tablet_replication_strategy.hh"
-
-namespace cql3::statements::strong_consistency {
-future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement(query_processor& qp,
-        const query_options& options,
-        const locator::tablet_replica& target)
-{
-    const auto my_host_id = qp.db().real_database().get_token_metadata().get_topology().my_host_id();
-    if (target.host != my_host_id) {
-        throw exceptions::invalid_request_exception(format(
-            "Strongly consistent writes can be executed only on the leader node, "
-            "leader id {}, current host id {}",
-            target.host, my_host_id));
-    }
-    auto&& func_values_cache = const_cast<cql3::query_options&>(options).take_cached_pk_function_calls();
-    co_return qp.bounce_to_shard(target.shard, std::move(func_values_cache));
-}
-
-bool is_strongly_consistent(data_dictionary::database db, std::string_view ks_name) {
-    const auto* tablet_aware_rs = db.find_keyspace(ks_name).get_replication_strategy().maybe_as_tablet_aware();
-    return tablet_aware_rs && tablet_aware_rs->get_consistency() != data_dictionary::consistency_config_option::eventual;
-}
-
-}
--- a/cql3/statements/strong_consistency/statement_helpers.hh
+++ b/cql3/statements/strong_consistency/statement_helpers.hh
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) 2025-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#pragma once
-
-#include "cql3/cql_statement.hh"
-#include "locator/tablets.hh"
-
-namespace cql3::statements::strong_consistency {
-
-future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement(
-    query_processor& qp,
-    const query_options& options,
-    const locator::tablet_replica& target);
-
-bool is_strongly_consistent(data_dictionary::database db, std::string_view ks_name);
-
-}
--- a/cql3/statements/strongly_consistent_modification_statement.cc
+++ b/cql3/statements/strongly_consistent_modification_statement.cc
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2022-present ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+
+#include "cql3/statements/strongly_consistent_modification_statement.hh"
+
+#include <optional>
+
+#include <seastar/core/future.hh>
+#include <seastar/util/variant_utils.hh>
+
+#include "bytes.hh"
+#include "cql3/attributes.hh"
+#include "cql3/expr/expression.hh"
+#include "cql3/expr/evaluate.hh"
+#include "cql3/query_processor.hh"
+#include "cql3/values.hh"
+#include "timeout_config.hh"
+#include "service/broadcast_tables/experimental/lang.hh"
+#include "db/system_keyspace.hh"
+
+namespace cql3 {
+
+static logging::logger logger("strongly_consistent_modification_statement");
+
+namespace statements {
+
+strongly_consistent_modification_statement::strongly_consistent_modification_statement(
+    uint32_t bound_terms,
+    schema_ptr schema,
+    broadcast_tables::prepared_update query)
+    : cql_statement_opt_metadata{&timeout_config::write_timeout}
+    , _bound_terms{bound_terms}
+    , _schema{schema}
+    , _query{std::move(query)}
+{ }
+
+future<::shared_ptr<cql_transport::messages::result_message>>
+strongly_consistent_modification_statement::execute(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
+    return execute_without_checking_exception_message(qp, qs, options, std::move(guard))
+            .then(cql_transport::messages::propagate_exception_as_future<shared_ptr<cql_transport::messages::result_message>>);
+}
+
+static
+service::broadcast_tables::update_query
+evaluate_prepared(
+    const broadcast_tables::prepared_update& query,
+    const query_options& options) {
+    return service::broadcast_tables::update_query{
+        .key = expr::evaluate(query.key, options).to_bytes(),
+        .new_value = expr::evaluate(query.new_value, options).to_bytes(),
+        .value_condition = query.value_condition
+            ? std::optional<bytes_opt>{expr::evaluate(*query.value_condition, options).to_bytes_opt()}
+            : std::nullopt
+    };
+}
+
+future<::shared_ptr<cql_transport::messages::result_message>>
+strongly_consistent_modification_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
+    if (this_shard_id() != 0) {
+        co_return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(0, cql3::computed_function_values{});
+    }
+
+    auto result = co_await qp.execute_broadcast_table_query(
+        { evaluate_prepared(_query, options) }
+    );
+
+    co_return co_await std::visit(make_visitor(
+        [] (service::broadcast_tables::query_result_conditional_update& qr) -> future<::shared_ptr<cql_transport::messages::result_message>> {
+            auto result_set = std::make_unique<cql3::result_set>(std::vector{
+                make_lw_shared<cql3::column_specification>(
+                    db::system_keyspace::NAME,
+                    db::system_keyspace::BROADCAST_KV_STORE,
+                    ::make_shared<cql3::column_identifier>("[applied]", false),
+                    boolean_type
+                ),
+                make_lw_shared<cql3::column_specification>(
+                    db::system_keyspace::NAME,
+                    db::system_keyspace::BROADCAST_KV_STORE,
+                    ::make_shared<cql3::column_identifier>("value", true),
+                    utf8_type
+                )
+            });
+
+            result_set->add_row({ boolean_type->decompose(qr.is_applied), qr.previous_value });
+
+            return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>(
+                ::make_shared<cql_transport::messages::result_message::rows>(cql3::result{std::move(result_set)}));
+        },
+        [] (service::broadcast_tables::query_result_none&) -> future<::shared_ptr<cql_transport::messages::result_message>> {
+            return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>();
+        },
+        [] (service::broadcast_tables::query_result_select&) -> future<::shared_ptr<cql_transport::messages::result_message>> {
+            on_internal_error(logger, "incorrect query result ");
+        }
+    ), result);
+}
+
+uint32_t strongly_consistent_modification_statement::get_bound_terms() const {
+    return _bound_terms;
+}
+
+future<> strongly_consistent_modification_statement::check_access(query_processor& qp, const service::client_state& state) const {
+    auto f = state.has_column_family_access(_schema->ks_name(), _schema->cf_name(), auth::permission::MODIFY);
+    if (_query.value_condition.has_value()) {
+        f = f.then([this, &state] {
+           return state.has_column_family_access(_schema->ks_name(), _schema->cf_name(), auth::permission::SELECT);
+        });
+    }
+    return f;
+}
+
+bool strongly_consistent_modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return _schema->ks_name() == ks_name && (!cf_name || _schema->cf_name() == *cf_name);
+}
+
+}
+
+}
--- a/cql3/statements/strongly_consistent_modification_statement.hh
+++ b/cql3/statements/strongly_consistent_modification_statement.hh
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2022-present ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+#pragma once
+
+#include "cql3/cql_statement.hh"
+#include "cql3/statements/modification_statement.hh"
+
+namespace cql3 {
+
+namespace statements {
+
+namespace broadcast_tables {
+
+struct prepared_update {
+    expr::expression key;
+    expr::expression new_value;
+    std::optional<expr::expression> value_condition;
+};
+
+}
+
+class strongly_consistent_modification_statement : public cql_statement_opt_metadata {
+    const uint32_t _bound_terms;
+    const schema_ptr _schema;
+    const broadcast_tables::prepared_update _query;
+
+public:
+    strongly_consistent_modification_statement(uint32_t bound_terms, schema_ptr schema, broadcast_tables::prepared_update query);
+
+    virtual future<::shared_ptr<cql_transport::messages::result_message>>
+    execute(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const override;
+
+    virtual future<::shared_ptr<cql_transport::messages::result_message>>
+    execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const override;
+
+    virtual uint32_t get_bound_terms() const override;
+
+    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
+
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
+};
+
+
+}
+
+}
--- a/cql3/statements/strongly_consistent_select_statement.cc
+++ b/cql3/statements/strongly_consistent_select_statement.cc
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2022-present ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+
+#include "cql3/statements/strongly_consistent_select_statement.hh"
+
+#include <seastar/core/future.hh>
+#include <seastar/core/on_internal_error.hh>
+
+#include "cql3/restrictions/statement_restrictions.hh"
+#include "cql3/expr/evaluate.hh"
+#include "cql3/query_processor.hh"
+#include "service/broadcast_tables/experimental/lang.hh"
+#include "db/system_keyspace.hh"
+
+namespace cql3 {
+
+namespace statements {
+
+static logging::logger logger("strongly_consistent_select_statement");
+
+static
+expr::expression get_key(const cql3::expr::expression& partition_key_restrictions) {
+    const auto* conjunction = cql3::expr::as_if<cql3::expr::conjunction>(&partition_key_restrictions);
+
+    if (!conjunction || conjunction->children.size() != 1) {
+        throw service::broadcast_tables::unsupported_operation_error(fmt::format(
+            "partition key restriction: {}", partition_key_restrictions));
+    }
+
+    const auto* key_restriction = cql3::expr::as_if<cql3::expr::binary_operator>(&conjunction->children[0]);
+
+    if (!key_restriction) {
+        throw service::broadcast_tables::unsupported_operation_error(fmt::format("partition key restriction: {}", *conjunction));
+    }
+
+    const auto* column = cql3::expr::as_if<cql3::expr::column_value>(&key_restriction->lhs);
+
+    if (!column || column->col->kind != column_kind::partition_key ||
+        key_restriction->op != cql3::expr::oper_t::EQ) {
+        throw service::broadcast_tables::unsupported_operation_error(fmt::format("key restriction: {}", *key_restriction));
+    }
+
+    return key_restriction->rhs;
+}
+
+static
+bool is_selecting_only_value(const cql3::selection::selection& selection) {
+    return selection.is_trivial() &&
+           selection.get_column_count() == 1 &&
+           selection.get_columns()[0]->name() == "value";
+}
+
+strongly_consistent_select_statement::strongly_consistent_select_statement(schema_ptr schema, uint32_t bound_terms,
+                                                                           lw_shared_ptr<const parameters> parameters,
+                                                                           ::shared_ptr<selection::selection> selection,
+                                                                           ::shared_ptr<const restrictions::statement_restrictions> restrictions,
+                                                                           ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
+                                                                           bool is_reversed,
+                                                                           ordering_comparator_type ordering_comparator,
+                                                                           std::optional<expr::expression> limit,
+                                                                           std::optional<expr::expression> per_partition_limit,
+                                                                           cql_stats &stats,
+                                                                           std::unique_ptr<attributes> attrs)
+    : select_statement{schema, bound_terms, parameters, selection, restrictions, group_by_cell_indices, is_reversed, ordering_comparator, std::move(limit), std::move(per_partition_limit), stats, std::move(attrs)},
+      _query{prepare_query()}
+{ }
+
+broadcast_tables::prepared_select strongly_consistent_select_statement::prepare_query() const {
+    if (!is_selecting_only_value(*_selection)) {
+        throw service::broadcast_tables::unsupported_operation_error("only 'value' selector is allowed");
+    }
+
+    return {
+        .key = get_key(_restrictions->get_partition_key_restrictions())
+    };
+}
+
+static
+service::broadcast_tables::select_query
+evaluate_prepared(
+    const broadcast_tables::prepared_select& query,
+    const query_options& options) {
+    return service::broadcast_tables::select_query{
+        .key = expr::evaluate(query.key, options).to_bytes()
+    };
+}
+
+future<::shared_ptr<cql_transport::messages::result_message>>
+strongly_consistent_select_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
+    if (this_shard_id() != 0) {
+        co_return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(0, cql3::computed_function_values{});
+    }
+
+    auto result = co_await qp.execute_broadcast_table_query(
+        { evaluate_prepared(_query, options) }
+    );
+
+    auto query_result = std::get_if<service::broadcast_tables::query_result_select>(&result);
+
+    if (!query_result) {
+        on_internal_error(logger, "incorrect query result ");
+    }
+
+    auto result_set = std::make_unique<cql3::result_set>(std::vector{
+        make_lw_shared<cql3::column_specification>(
+            db::system_keyspace::NAME,
+            db::system_keyspace::BROADCAST_KV_STORE,
+            ::make_shared<cql3::column_identifier>("value", true),
+            utf8_type
+        )
+    });
+
+    if (query_result->value) {
+        result_set->add_row({ managed_bytes_opt(query_result->value) });
+    }
+
+    co_return ::make_shared<cql_transport::messages::result_message::rows>(cql3::result{std::move(result_set)});
+}
+
+}
+
+}
--- a/cql3/statements/strongly_consistent_select_statement.hh
+++ b/cql3/statements/strongly_consistent_select_statement.hh
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2022-present ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+#pragma once
+
+#include "cql3/expr/expression.hh"
+#include "cql3/statements/select_statement.hh"
+
+namespace cql3 {
+
+namespace statements {
+
+namespace broadcast_tables {
+
+struct prepared_select {
+    expr::expression key;
+};
+
+}
+
+class strongly_consistent_select_statement : public select_statement {
+    const broadcast_tables::prepared_select _query;
+
+    broadcast_tables::prepared_select prepare_query() const;
+public:
+    strongly_consistent_select_statement(schema_ptr schema,
+                     uint32_t bound_terms,
+                     lw_shared_ptr<const parameters> parameters,
+                     ::shared_ptr<selection::selection> selection,
+                     ::shared_ptr<const restrictions::statement_restrictions> restrictions,
+                     ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
+                     bool is_reversed,
+                     ordering_comparator_type ordering_comparator,
+                     std::optional<expr::expression> limit,
+                     std::optional<expr::expression> per_partition_limit,
+                     cql_stats &stats,
+                     std::unique_ptr<cql3::attributes> attrs);
+
+    virtual future<::shared_ptr<cql_transport::messages::result_message>>
+        execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const override;
+};
+
+}
+
+}
--- a/cql3/statements/update_statement.cc
+++ b/cql3/statements/update_statement.cc
@@ -13,7 +13,7 @@
 #include "cql3/expr/expression.hh"
 #include "cql3/expr/evaluate.hh"
 #include "cql3/expr/expr-utils.hh"
-#include "cql3/statements/broadcast_modification_statement.hh"
+#include "cql3/statements/strongly_consistent_modification_statement.hh"
 #include "service/broadcast_tables/experimental/lang.hh"
 #include "raw/update_statement.hh"

@@ -333,7 +333,7 @@ std::optional<expr::expression> get_value_condition(const expr::expression& the_
    return binop->rhs;
 }

-::shared_ptr<broadcast_modification_statement>
+::shared_ptr<strongly_consistent_modification_statement>
 update_statement::prepare_for_broadcast_tables() const {
    if (attrs) {
        if (attrs->is_time_to_live_set()) {
@@ -359,7 +359,7 @@ update_statement::prepare_for_broadcast_tables() const {
        .value_condition = get_value_condition(_condition),
    };

-    return ::make_shared<broadcast_modification_statement>(
+    return ::make_shared<strongly_consistent_modification_statement>(
        get_bound_terms(),
        s,
        query
--- a/cql3/statements/update_statement.hh
+++ b/cql3/statements/update_statement.hh
@@ -45,7 +45,7 @@ private:
    virtual void execute_operations_for_key(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const json_cache_opt& json_cache) const;

 public:
-    virtual ::shared_ptr<broadcast_modification_statement> prepare_for_broadcast_tables() const override;
+    virtual ::shared_ptr<strongly_consistent_modification_statement> prepare_for_broadcast_tables() const override;
 };

 /*
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -55,21 +55,8 @@ int32_t batchlog_shard_of(db_clock::time_point written_at) {
    return hash & ((1ULL << batchlog_shard_bits) - 1);
 }

-bool is_batchlog_v1(const schema& schema) {
-    return schema.cf_name() == system_keyspace::BATCHLOG;
-}
-
 std::pair<partition_key, clustering_key>
 get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, int32_t batchlog_shard, db_clock::time_point written_at, std::optional<utils::UUID> id) {
-    if (is_batchlog_v1(schema)) {
-        if (!id) {
-            on_internal_error(blogger, "get_batchlog_key(): key for batchlog v1 requires batchlog id");
-        }
-        auto pkey = partition_key::from_single_value(schema, {serialized(*id)});
-        auto ckey = clustering_key::make_empty();
-        return std::pair(std::move(pkey), std::move(ckey));
-    }
-
    auto pkey = partition_key::from_exploded(schema, {serialized(version), serialized(int8_t(stage)), serialized(batchlog_shard)});

    std::vector<bytes> ckey_components;
@@ -98,14 +85,6 @@ mutation get_batchlog_mutation_for(schema_ptr schema, managed_bytes data, int32_
    auto cdef_data = schema->get_column_definition(to_bytes("data"));
    m.set_cell(ckey, *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));

-    if (is_batchlog_v1(*schema)) {
-        auto cdef_version = schema->get_column_definition(to_bytes("version"));
-        m.set_cell(ckey, *cdef_version, atomic_cell::make_live(*cdef_version->type, timestamp, serialized(version)));
-
-        auto cdef_written_at = schema->get_column_definition(to_bytes("written_at"));
-        m.set_cell(ckey, *cdef_written_at, atomic_cell::make_live(*cdef_written_at->type, timestamp, serialized(now)));
-    }
-
    return m;
 }

@@ -143,10 +122,9 @@ mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clo
 const std::chrono::seconds db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;

-db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config)
+db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
        : _qp(qp)
        , _sys_ks(sys_ks)
-        , _fs(fs)
        , _replay_timeout(config.replay_timeout)
        , _replay_rate(config.replay_rate)
        , _delay(config.delay)
@@ -322,206 +300,149 @@ future<> db::batchlog_manager::maybe_migrate_v1_to_v2() {
    });
 }

-namespace {
-
-using clock_type = db_clock::rep;
-
-struct replay_stats {
-    std::optional<db_clock::time_point> min_too_fresh;
-    bool need_cleanup = false;
-};
-
-} // anonymous namespace
-
-static future<db::all_batches_replayed> process_batch(
-        cql3::query_processor& qp,
-        db::batchlog_manager::stats& stats,
-        db::batchlog_manager::post_replay_cleanup cleanup,
-        utils::rate_limiter& limiter,
-        schema_ptr schema,
-        std::unordered_map<int32_t, replay_stats>& replay_stats_per_shard,
-        const db_clock::time_point now,
-        db_clock::duration replay_timeout,
-        std::chrono::seconds write_timeout,
-        const cql3::untyped_result_set::row& row) {
-    const bool is_v1 = db::is_batchlog_v1(*schema);
-    const auto stage = is_v1 ? db::batchlog_stage::initial : static_cast<db::batchlog_stage>(row.get_as<int8_t>("stage"));
-    const auto batch_shard = is_v1 ? 0 : row.get_as<int32_t>("shard");
-    auto written_at = row.get_as<db_clock::time_point>("written_at");
-    auto id = row.get_as<utils::UUID>("id");
-    // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
-    auto timeout = replay_timeout;
-
-    if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
-        blogger.debug("Skipping batch replay due to skip_batch_replay injection");
-        co_return db::all_batches_replayed::no;
-    }
-
-    auto data = row.get_blob_unfragmented("data");
-
-    blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
-
-    utils::chunked_vector<mutation> mutations;
-    bool send_failed = false;
-
-    auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
-
-    try {
-        utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
-        auto in = ser::as_input_stream(data);
-        while (in.size()) {
-            auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
-            const auto tbl = qp.db().try_find_table(fm.column_family_id());
-            if (!tbl) {
-                continue;
-            }
-            if (written_at <= tbl->get_truncation_time()) {
-                continue;
-            }
-            schema_ptr s = tbl->schema();
-            if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
-                timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
-            }
-            fms.emplace_back(std::move(fm), std::move(s));
-        }
-
-        if (now < written_at + timeout) {
-            blogger.debug("Skipping replay of {}, too fresh", id);
-
-            shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
-
-            co_return db::all_batches_replayed::no;
-        }
-
-        auto size = data.size();
-
-        for (const auto& [fm, s] : fms) {
-            mutations.emplace_back(fm.to_mutation(s));
-            co_await coroutine::maybe_yield();
-        }
-
-        if (!mutations.empty()) {
-            const auto ttl = [written_at]() -> clock_type {
-                /*
-                * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
-                * This ensures that deletes aren't "undone" by an old batch replay.
-                */
-                auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
-                warn(unimplemented::cause::HINT);
-#if 0
-                for (auto& m : *mutations) {
-                    unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
-                }
-#endif
-                return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
-            }();
-
-            if (ttl > 0) {
-                // Origin does the send manually, however I can't see a super great reason to do so.
-                // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
-                // in both cases.
-                // FIXME: verify that the above is reasonably true.
-                co_await limiter.reserve(size);
-                stats.write_attempts += mutations.size();
-                auto timeout = db::timeout_clock::now() + write_timeout;
-                if (cleanup) {
-                    co_await qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
-                } else {
-                    co_await qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
-                }
-            }
-        }
-    } catch (data_dictionary::no_such_keyspace& ex) {
-        // should probably ignore and drop the batch
-    } catch (const data_dictionary::no_such_column_family&) {
-        // As above -- we should drop the batch if the table doesn't exist anymore.
-    } catch (...) {
-        blogger.warn("Replay failed (will retry): {}", std::current_exception());
-        // timeout, overload etc.
-        // Do _not_ remove the batch, assuning we got a node write error.
-        // Since we don't have hints (which origin is satisfied with),
-        // we have to resort to keeping this batch to next lap.
-        if (is_v1 || !cleanup || stage == db::batchlog_stage::failed_replay) {
-            co_return db::all_batches_replayed::no;
-        }
-        send_failed = true;
-    }
-
-    auto& sp = qp.proxy();
-
-    if (send_failed) {
-        blogger.debug("Moving batch {} to stage failed_replay", id);
-        auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, db::batchlog_stage::failed_replay, written_at, id);
-        co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-    }
-
-    // delete batch
-    auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
-    co_await qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-
-    shard_written_at.need_cleanup = true;
-
-    co_return db::all_batches_replayed(!send_failed);
-}
-
-future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v1(post_replay_cleanup) {
-    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
-    // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
-    // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
-    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
-    utils::rate_limiter limiter(throttle);
-
-    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
-
-    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
-
-    // Use a stable `now` across all batches, so skip/replay decisions are the
-    // same across a while prefix of written_at (across all ids).
-    const auto now = db_clock::now();
-
-    auto batch = [this, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
-        all_replayed = all_replayed && co_await process_batch(_qp, _stats, post_replay_cleanup::no, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
-        co_return stop_iteration::no;
-    };
-
-    co_await with_gate(_gate, [this, &all_replayed, batch = std::move(batch)] () mutable -> future<> {
-        blogger.debug("Started replayAllFailedBatches");
-        co_await utils::get_local_injector().inject("add_delay_to_batch_replay", std::chrono::milliseconds(1000));
-
-        auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
-
-        co_await _qp.query_internal(
-                format("SELECT * FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
-                db::consistency_level::ONE,
-                {},
-                page_size,
-                batch);
-
-        blogger.debug("Finished replayAllFailedBatches with all_replayed: {}", all_replayed);
-    });
-
-    co_return all_replayed;
-}
-
-future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v2(post_replay_cleanup cleanup) {
+future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
    co_await maybe_migrate_v1_to_v2();

+    typedef db_clock::rep clock_type;
+
    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
    // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
    // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
-    utils::rate_limiter limiter(throttle);
+    auto limiter = make_lw_shared<utils::rate_limiter>(throttle);

    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);

+    struct replay_stats {
+        std::optional<db_clock::time_point> min_too_fresh;
+        bool need_cleanup = false;
+    };
+
    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;

    // Use a stable `now` across all batches, so skip/replay decisions are the
    // same across a while prefix of written_at (across all ids).
    const auto now = db_clock::now();

-    auto batch = [this, cleanup, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
-        all_replayed = all_replayed && co_await process_batch(_qp, _stats, cleanup, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
+    auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+        const auto stage = static_cast<batchlog_stage>(row.get_as<int8_t>("stage"));
+        const auto batch_shard = row.get_as<int32_t>("shard");
+        auto written_at = row.get_as<db_clock::time_point>("written_at");
+        auto id = row.get_as<utils::UUID>("id");
+        // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
+        auto timeout = _replay_timeout;
+
+        if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
+            blogger.debug("Skipping batch replay due to skip_batch_replay injection");
+            all_replayed = all_batches_replayed::no;
+            co_return stop_iteration::no;
+        }
+
+        auto data = row.get_blob_unfragmented("data");
+
+        blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
+
+        utils::chunked_vector<mutation> mutations;
+        bool send_failed = false;
+
+        auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
+
+        try {
+            utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
+            auto in = ser::as_input_stream(data);
+            while (in.size()) {
+                auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
+                const auto tbl = _qp.db().try_find_table(fm.column_family_id());
+                if (!tbl) {
+                    continue;
+                }
+                if (written_at <= tbl->get_truncation_time()) {
+                    continue;
+                }
+                schema_ptr s = tbl->schema();
+                if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
+                    timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
+                }
+                fms.emplace_back(std::move(fm), std::move(s));
+            }
+
+            if (now < written_at + timeout) {
+                blogger.debug("Skipping replay of {}, too fresh", id);
+
+                shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
+
+                co_return stop_iteration::no;
+            }
+
+            auto size = data.size();
+
+            for (const auto& [fm, s] : fms) {
+                mutations.emplace_back(fm.to_mutation(s));
+                co_await coroutine::maybe_yield();
+            }
+
+            if (!mutations.empty()) {
+                const auto ttl = [written_at]() -> clock_type {
+                    /*
+                    * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
+                    * This ensures that deletes aren't "undone" by an old batch replay.
+                    */
+                    auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
+                    warn(unimplemented::cause::HINT);
+#if 0
+                    for (auto& m : *mutations) {
+                        unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
+                    }
+#endif
+                    return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
+                }();
+
+                if (ttl > 0) {
+                    // Origin does the send manually, however I can't see a super great reason to do so.
+                    // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
+                    // in both cases.
+                    // FIXME: verify that the above is reasonably true.
+                    co_await limiter->reserve(size);
+                    _stats.write_attempts += mutations.size();
+                    auto timeout = db::timeout_clock::now() + write_timeout;
+                    if (cleanup) {
+                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
+                    } else {
+                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
+                    }
+                }
+            }
+        } catch (data_dictionary::no_such_keyspace& ex) {
+            // should probably ignore and drop the batch
+        } catch (const data_dictionary::no_such_column_family&) {
+            // As above -- we should drop the batch if the table doesn't exist anymore.
+        } catch (...) {
+            blogger.warn("Replay failed (will retry): {}", std::current_exception());
+            all_replayed = all_batches_replayed::no;
+            // timeout, overload etc.
+            // Do _not_ remove the batch, assuning we got a node write error.
+            // Since we don't have hints (which origin is satisfied with),
+            // we have to resort to keeping this batch to next lap.
+            if (!cleanup || stage == batchlog_stage::failed_replay) {
+                co_return stop_iteration::no;
+            }
+            send_failed = true;
+        }
+
+        auto& sp = _qp.proxy();
+
+        if (send_failed) {
+            blogger.debug("Moving batch {} to stage failed_replay", id);
+            auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, batchlog_stage::failed_replay, written_at, id);
+            co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+        }
+
+        // delete batch
+        auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
+        co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+
+        shard_written_at.need_cleanup = true;
+
        co_return stop_iteration::no;
    };

@@ -580,10 +501,3 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches

    co_return all_replayed;
 }
-
-future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
-    if (_fs.batchlog_v2) {
-        return replay_all_failed_batches_v2(cleanup);
-    }
-    return replay_all_failed_batches_v1(cleanup);
-}
--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -27,12 +27,6 @@ class query_processor;

 } // namespace cql3

-namespace gms {
-
-class feature_service;
-
-} // namespace gms
-
 namespace db {

 class system_keyspace;
@@ -55,11 +49,6 @@ class batchlog_manager : public peering_sharded_service<batchlog_manager> {
 public:
    using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;

-    struct stats {
-        uint64_t write_attempts = 0;
-    };
-
-
 private:
    static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
    static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
@@ -67,13 +56,14 @@ private:

    using clock_type = lowres_clock;

-    stats _stats;
+    struct stats {
+        uint64_t write_attempts = 0;
+    } _stats;

    seastar::metrics::metric_groups _metrics;

    cql3::query_processor& _qp;
    db::system_keyspace& _sys_ks;
-    gms::feature_service& _fs;
    db_clock::duration _replay_timeout;
    uint64_t _replay_rate;
    std::chrono::milliseconds _delay;
@@ -94,14 +84,12 @@ private:

    future<> maybe_migrate_v1_to_v2();

-    future<all_batches_replayed> replay_all_failed_batches_v1(post_replay_cleanup cleanup);
-    future<all_batches_replayed> replay_all_failed_batches_v2(post_replay_cleanup cleanup);
    future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
 public:
    // Takes a QP, not a distributes. Because this object is supposed
    // to be per shard and does no dispatching beyond delegating the the
    // shard qp (which is what you feed here).
-    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config);
+    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, batchlog_manager_config config);

    // abort the replay loop and return its future.
    future<> drain();
@@ -114,7 +102,7 @@ public:
        return _last_replay;
    }

-    const stats& get_stats() const {
+    const stats& stats() const {
        return _stats;
    }
 private:
--- a/db/cache_mutation_reader.hh
+++ b/db/cache_mutation_reader.hh
@@ -323,9 +323,6 @@ void cache_mutation_reader::touch_partition() {

 inline
 future<> cache_mutation_reader::fill_buffer() {
-    if (const auto& ex = get_abort_exception(); ex) {
-        return make_exception_future<>(ex);
-    }
    if (_state == state::before_static_row) {
        touch_partition();
        auto after_static_row = [this] {
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -1986,13 +1986,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
            }
            continue;
        } catch (shutdown_marker&) {
+            _reserve_segments.abort(std::current_exception());
            break;
        } catch (...) {
            clogger.warn("Exception in segment reservation: {}", std::current_exception());
        }
        co_await sleep(100ms);
    }
-    _reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
 }

 future<std::vector<db::commitlog::descriptor>>
--- a/db/config.cc
+++ b/db/config.cc
@@ -621,6 +621,25 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    * @GroupDescription: Provides an overview of the group.
    */
    /**
+    * @Group Ungrouped properties
+    */
+    , background_writer_scheduling_quota(this, "background_writer_scheduling_quota", value_status::Deprecated, 1.0,
+        "max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5.")
+    , auto_adjust_flush_quota(this, "auto_adjust_flush_quota", value_status::Deprecated, false,
+        "true: auto-adjust memtable shares for flush processes")
+    , memtable_flush_static_shares(this, "memtable_flush_static_shares", liveness::LiveUpdate, value_status::Used, 0,
+        "If set to higher than 0, ignore the controller's output and set the memtable shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
+    , compaction_static_shares(this, "compaction_static_shares", liveness::LiveUpdate, value_status::Used, 0,
+        "If set to higher than 0, ignore the controller's output and set the compaction shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
+    , compaction_max_shares(this, "compaction_max_shares", liveness::LiveUpdate, value_status::Used, default_compaction_maximum_shares,
+        "Set the maximum shares of regular compaction to the specific value. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
+    , compaction_enforce_min_threshold(this, "compaction_enforce_min_threshold", liveness::LiveUpdate, value_status::Used, false,
+        "If set to true, enforce the min_threshold option for compactions strictly. If false (default), Scylla may decide to compact even if below min_threshold.")
+    , compaction_flush_all_tables_before_major_seconds(this, "compaction_flush_all_tables_before_major_seconds", value_status::Used, 86400,
+        "Set the minimum interval in seconds between flushing all tables before each major compaction (default is 86400)."
+        "This option is useful for maximizing tombstone garbage collection by releasing all active commitlog segments."
+        "Set to 0 to disable automatic flushing all tables before major compaction.")
+    /**
    * @Group Initialization properties
    * @GroupDescription The minimal properties needed for configuring a cluster.
    */
@@ -1201,13 +1220,13 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "* org.apache.cassandra.auth.CassandraRoleManager: Stores role data in the system_auth keyspace;\n"
        "* com.scylladb.auth.LDAPRoleManager: Fetches role data from an LDAP server.")
    , permissions_validity_in_ms(this, "permissions_validity_in_ms", liveness::LiveUpdate, value_status::Used, 10000,
-        "How long authorized statements cache entries remain valid. The cached value is considered valid as long as both its value is not older than the permissions_validity_in_ms "
+        "How long permissions in cache remain valid. Depending on the authorizer, such as CassandraAuthorizer, fetching permissions can be resource intensive. Permissions caching is disabled when this property is set to 0 or when AllowAllAuthorizer is used. The cached value is considered valid as long as both its value is not older than the permissions_validity_in_ms "
        "and the cached value has been read at least once during the permissions_validity_in_ms time frame. If any of these two conditions doesn't hold the cached value is going to be evicted from the cache.\n"
        "\n"
        "Related information: Object permissions")
    , permissions_update_interval_in_ms(this, "permissions_update_interval_in_ms", liveness::LiveUpdate, value_status::Used, 2000,
-        "Refresh interval for authorized statements cache. After this interval, cache entries become eligible for refresh. An async reload is scheduled every permissions_update_interval_in_ms time period and the old value is returned until it completes. If permissions_validity_in_ms has a non-zero value, then this property must also have a non-zero value. It's recommended to set this value to be at least 3 times smaller than the permissions_validity_in_ms. This option additionally controls the permissions refresh interval for LDAP.")
-    , permissions_cache_max_entries(this, "permissions_cache_max_entries", liveness::LiveUpdate, value_status::Unused, 1000,
+        "Refresh interval for permissions cache (if enabled). After this interval, cache entries become eligible for refresh. An async reload is scheduled every permissions_update_interval_in_ms time period and the old value is returned until it completes. If permissions_validity_in_ms has a non-zero value, then this property must also have a non-zero value. It's recommended to set this value to be at least 3 times smaller than the permissions_validity_in_ms.")
+    , permissions_cache_max_entries(this, "permissions_cache_max_entries", liveness::LiveUpdate, value_status::Used, 1000,
        "Maximum cached permission entries. Must have a non-zero value if permissions caching is enabled (see a permissions_validity_in_ms description).")
    , server_encryption_options(this, "server_encryption_options", value_status::Used, {/*none*/},
        "Enable or disable inter-node encryption. You must also generate keys and provide the appropriate key and trust store locations and passwords. The available options are:\n"
@@ -1272,7 +1291,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , ignore_dead_nodes_for_replace(this, "ignore_dead_nodes_for_replace", value_status::Used, "", "List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
    , override_decommission(this, "override_decommission", value_status::Deprecated, false, "Set true to force a decommissioned node to join the cluster (cannot be set if consistent-cluster-management is enabled).")
    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based.")
-    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
+    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild,bootstrap,decommission", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
    , enable_compacting_data_for_streaming_and_repair(this, "enable_compacting_data_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, true, "Enable the compacting reader, which compacts the data for streaming and repair (load'n'stream included) before sending it to, or synchronizing it with peers. Can reduce the amount of data to be processed by removing dead data, but adds CPU overhead.")
    , enable_tombstone_gc_for_streaming_and_repair(this, "enable_tombstone_gc_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, false,
            "If the compacting reader is enabled for streaming and repair (see enable_compacting_data_for_streaming_and_repair), allow it to garbage-collect tombstones."
@@ -1292,14 +1311,14 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , fd_initial_value_ms(this, "fd_initial_value_ms", value_status::Used, 2 * 1000, "The initial failure_detector interval time in milliseconds.")
    , shutdown_announce_in_ms(this, "shutdown_announce_in_ms", value_status::Used, 2 * 1000, "Time a node waits after sending gossip shutdown message in milliseconds. Same as -Dcassandra.shutdown_announce_in_ms in cassandra.")
    , developer_mode(this, "developer_mode", value_status::Used, DEVELOPER_MODE_DEFAULT, "Relax environment checks. Setting to true can reduce performance and reliability significantly.")
-    , skip_wait_for_gossip_to_settle(this, "skip_wait_for_gossip_to_settle", value_status::Deprecated, -1, "An integer to configure the wait for gossip to settle. -1: wait normally, 0: do not wait at all, n: wait for at most n polls. Same as -Dcassandra.skip_wait_for_gossip_to_settle in cassandra.")
+    , skip_wait_for_gossip_to_settle(this, "skip_wait_for_gossip_to_settle", value_status::Used, -1, "An integer to configure the wait for gossip to settle. -1: wait normally, 0: do not wait at all, n: wait for at most n polls. Same as -Dcassandra.skip_wait_for_gossip_to_settle in cassandra.")
    , force_gossip_generation(this, "force_gossip_generation", liveness::LiveUpdate, value_status::Used, -1 , "Force gossip to use the generation number provided by user.")
    , experimental_features(this, "experimental_features", value_status::Used, {}, experimental_features_help_string())
    , lsa_reclamation_step(this, "lsa_reclamation_step", value_status::Used, 1, "Minimum number of segments to reclaim in a single step.")
    , prometheus_port(this, "prometheus_port", value_status::Used, 9180, "Prometheus port, set to zero to disable.")
    , prometheus_address(this, "prometheus_address", value_status::Used, {/* listen_address */}, "Prometheus listening address, defaulting to listen_address if not explicitly set.")
    , prometheus_prefix(this, "prometheus_prefix", value_status::Used, "scylla", "Set the prefix of the exported Prometheus metrics. Changing this will break Scylla's dashboard compatibility, do not change unless you know what you are doing.")
-    , prometheus_allow_protobuf(this, "prometheus_allow_protobuf", value_status::Used, false, "If set allows the experimental Prometheus protobuf with native histogram")
+    , prometheus_allow_protobuf(this, "prometheus_allow_protobuf", value_status::Used, true, "Enable Prometheus protobuf with native histogram. Set to false to force text exposition format.")
    , abort_on_lsa_bad_alloc(this, "abort_on_lsa_bad_alloc", value_status::Used, false, "Abort when allocation in LSA region fails.")
    , murmur3_partitioner_ignore_msb_bits(this, "murmur3_partitioner_ignore_msb_bits", value_status::Used, default_murmur3_partitioner_ignore_msb_bits, "Number of most significant token bits to ignore in murmur3 partitioner; increase for very large clusters.")
    , unspooled_dirty_soft_limit(this, "unspooled_dirty_soft_limit", value_status::Used, 0.6, "Soft limit of unspooled dirty memory expressed as a portion of the hard limit.")
@@ -1322,7 +1341,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , sstable_compression_user_table_options(this, "sstable_compression_user_table_options", value_status::Used, compression_parameters{compression_parameters::algorithm::lz4_with_dicts},
        "Server-global user table compression options. If enabled, all user tables"
        "will be compressed using the provided options, unless overridden"
-        "by compression options in the table schema. User tables are all tables in non-system keyspaces. The available options are:\n"
+        "by compression options in the table schema. The available options are:\n"
        "* sstable_compression: The compression algorithm to use. Supported values: LZ4Compressor, LZ4WithDictsCompressor (default), SnappyCompressor, DeflateCompressor, ZstdCompressor, ZstdWithDictsCompressor, '' (empty string; disables compression).\n"
        "* chunk_length_in_kb: (Default: 4) The size of chunks to compress in kilobytes. Allowed values are powers of two between 1 and 128.\n"
        "* crc_check_chance: (Default: 1.0) Not implemented (option value is ignored).\n"
@@ -1375,10 +1394,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
            "Start killing reads after their collective memory consumption goes above $normal_limit * $multiplier.")
    , reader_concurrency_semaphore_cpu_concurrency(this, "reader_concurrency_semaphore_cpu_concurrency", liveness::LiveUpdate, value_status::Used, 2,
            "Admit new reads while there are less than this number of requests that need CPU.")
-    , reader_concurrency_semaphore_preemptive_abort_factor(this, "reader_concurrency_semaphore_preemptive_abort_factor", liveness::LiveUpdate, value_status::Used, 0.3,
-            "Admit new reads while their remaining time is more than this factor times their timeout times when arrived to a semaphore. Its vale means\n"
-            "* <= 0.0 means new reads will never get rejected during admission\n"
-            "* >= 1.0 means new reads will always get rejected during admission\n")
    , view_update_reader_concurrency_semaphore_serialize_limit_multiplier(this, "view_update_reader_concurrency_semaphore_serialize_limit_multiplier", liveness::LiveUpdate, value_status::Used, 2,
            "Start serializing view update reads after their collective memory consumption goes above $normal_limit * $multiplier.")
    , view_update_reader_concurrency_semaphore_kill_limit_multiplier(this, "view_update_reader_concurrency_semaphore_kill_limit_multiplier", liveness::LiveUpdate, value_status::Used, 4,
@@ -1498,7 +1513,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , index_cache_fraction(this, "index_cache_fraction", liveness::LiveUpdate, value_status::Used, 0.2,
        "The maximum fraction of cache memory permitted for use by index cache. Clamped to the [0.0; 1.0] range. Must be small enough to not deprive the row cache of memory, but should be big enough to fit a large fraction of the index. The default value 0.2 means that at least 80\% of cache memory is reserved for the row cache, while at most 20\% is usable by the index cache.")
    , consistent_cluster_management(this, "consistent_cluster_management", value_status::Deprecated, true, "Use RAFT for cluster management and DDL.")
-    , force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Deprecated, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing.  Note: gossip topology changes are incompatible with tablets.")
+    , force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Used, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing.  Note: gossip topology changes are incompatible with tablets.")
    , recovery_leader(this, "recovery_leader", liveness::LiveUpdate, value_status::Used, utils::null_uuid(), "Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
    , wasm_cache_memory_fraction(this, "wasm_cache_memory_fraction", value_status::Used, 0.01, "Maximum total size of all WASM instances stored in the cache as fraction of total shard memory.")
    , wasm_cache_timeout_in_ms(this, "wasm_cache_timeout_in_ms", value_status::Used, 5000, "Time after which an instance is evicted from the cache.")
@@ -1527,21 +1542,17 @@ db::config::config(std::shared_ptr<db::extensions> exts)
         "Allows target tablet size to be configured. Defaults to 5G (in bytes). Maintaining tablets at reasonable sizes is important to be able to " \
         "redistribute load. A higher value means tablet migration throughput can be reduced. A lower value may cause number of tablets to increase significantly, " \
         "potentially resulting in performance drawbacks.")
-    , tablet_streaming_read_concurrency_per_shard(this, "tablet_streaming_read_concurrency_per_shard", liveness::LiveUpdate, value_status::Used, 2,
-         "Maximum number of tablets which may be leaving a shard at the same time. Effecting only on topology coordinator. Set to the same value on all nodes.")
-    , tablet_streaming_write_concurrency_per_shard(this, "tablet_streaming_write_concurrency_per_shard", liveness::LiveUpdate, value_status::Used, 2,
-         "Maximum number of tablets which may be pending on a shard at the same time. Effecting only on topology coordinator. Set to the same value on all nodes.")
    , replication_strategy_warn_list(this, "replication_strategy_warn_list", liveness::LiveUpdate, value_status::Used, {locator::replication_strategy_type::simple}, "Controls which replication strategies to warn about when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
    , replication_strategy_fail_list(this, "replication_strategy_fail_list", liveness::LiveUpdate, value_status::Used, {}, "Controls which replication strategies are disallowed to be used when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
    , service_levels_interval(this, "service_levels_interval_ms", liveness::LiveUpdate, value_status::Used, 10000, "Controls how often service levels module polls configuration table")

-    , audit(this, "audit", value_status::Used, "table",
+    , audit(this, "audit", value_status::Used, "none",
        "Controls the audit feature:\n"
        "\n"
        "\tnone   : No auditing enabled.\n"
        "\tsyslog : Audit messages sent to Syslog.\n"
        "\ttable  : Audit messages written to column family named audit.audit_log.\n")
-    , audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,AUTH,ADMIN", "Comma separated list of operation categories that should be audited.")
+    , audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,DDL,AUTH", "Comma separated list of operation categories that should be audited.")
    , audit_tables(this, "audit_tables", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of table names (<keyspace>.<table>) that will be audited.")
    , audit_keyspaces(this, "audit_keyspaces", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of keyspaces that will be audited. All tables in those keyspaces will be audited")
    , audit_unix_socket_path(this, "audit_unix_socket_path", value_status::Used, "/dev/log", "The path to the unix socket used for writing to syslog. Only applicable when audit is set to syslog.")
@@ -1573,14 +1584,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , enable_create_table_with_compact_storage(this, "enable_create_table_with_compact_storage", liveness::LiveUpdate, value_status::Used, false, "Enable the deprecated feature of CREATE TABLE WITH COMPACT STORAGE.  This feature will eventually be removed in a future version.")
    , rf_rack_valid_keyspaces(this, "rf_rack_valid_keyspaces", liveness::MustRestart, value_status::Used, false,
        "Enforce RF-rack-valid keyspaces. Additionally, if there are existing RF-rack-invalid "
-        "keyspaces, attempting to start a node with this option ON will fail. "
-        "DEPRECATED. Use enforce_rack_list instead.")
-    , enforce_rack_list(this, "enforce_rack_list", liveness::MustRestart, value_status::Used, false,
-            "Enforce rack list for tablet keyspaces. "
-            "When the option is on, CREATE STATEMENT expands numeric rfs to rack lists "
-            "and ALTER STATEMENT is allowed only when rack lists are used in all DCs."
-            "Additionally, if there are existing tablet keyspaces with numeric rf in any DC "
-            "attempting to start a node with this option ON will fail.")
+        "keyspaces, attempting to start a node with this option ON will fail.")
    // FIXME: make frequency per table in order to reduce work in each iteration.
    // Bigger tables will take longer to be resized. similar-sized tables can be batched into same iteration.
    , tablet_load_stats_refresh_interval_in_seconds(this, "tablet_load_stats_refresh_interval_in_seconds", liveness::LiveUpdate, value_status::Used, 60,
@@ -1591,25 +1595,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Sets the maximum difference in percentages between the most loaded and least loaded nodes, below which the load balancer considers nodes balanced.")
    , minimal_tablet_size_for_balancing(this, "minimal_tablet_size_for_balancing", liveness::LiveUpdate, value_status::Used, service::default_target_tablet_size / 100,
        "Sets the minimal tablet size for the load balancer. For any tablet smaller than this, the balancer will use this size instead of the actual tablet size.")
-    /**
-    * @Group Ungrouped properties
-    */
-    , background_writer_scheduling_quota(this, "background_writer_scheduling_quota", value_status::Deprecated, 1.0,
-        "max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5.")
-    , auto_adjust_flush_quota(this, "auto_adjust_flush_quota", value_status::Deprecated, false,
-        "true: auto-adjust memtable shares for flush processes")
-    , memtable_flush_static_shares(this, "memtable_flush_static_shares", liveness::LiveUpdate, value_status::Used, 0,
-        "If set to higher than 0, ignore the controller's output and set the memtable shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
-    , compaction_static_shares(this, "compaction_static_shares", liveness::LiveUpdate, value_status::Used, 0,
-        "If set to higher than 0, ignore the controller's output and set the compaction shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
-    , compaction_max_shares(this, "compaction_max_shares", liveness::LiveUpdate, value_status::Used, default_compaction_maximum_shares,
-        "Set the maximum shares of regular compaction to the specific value. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
-    , compaction_enforce_min_threshold(this, "compaction_enforce_min_threshold", liveness::LiveUpdate, value_status::Used, false,
-        "If set to true, enforce the min_threshold option for compactions strictly. If false (default), Scylla may decide to compact even if below min_threshold.")
-    , compaction_flush_all_tables_before_major_seconds(this, "compaction_flush_all_tables_before_major_seconds", value_status::Used, 86400,
-        "Set the minimum interval in seconds between flushing all tables before each major compaction (default is 86400)."
-        "This option is useful for maximizing tombstone garbage collection by releasing all active commitlog segments."
-        "Set to 0 to disable automatic flushing all tables before major compaction.")
    , default_log_level(this, "default_log_level", value_status::Used, seastar::log_level::info, "Default log level for log messages")
    , logger_log_level(this, "logger_log_level", value_status::Used, {}, "Map of logger name to log level. Valid log levels are 'error', 'warn', 'info', 'debug' and 'trace'")
    , log_to_stdout(this, "log_to_stdout", value_status::Used, true, "Send log output to stdout")
@@ -1800,21 +1785,6 @@ const db::extensions& db::config::extensions() const {
    return *_extensions;
 }

-compression_parameters db::config::get_sstable_compression_user_table_options(bool dicts_feature_enabled) const {
-    if (sstable_compression_user_table_options.is_set()
-            || dicts_feature_enabled
-            || !sstable_compression_user_table_options().uses_dictionary_compressor()) {
-        return sstable_compression_user_table_options();
-    } else {
-        // Fall back to non-dict if dictionary compression is not enabled cluster-wide.
-        auto options = sstable_compression_user_table_options();
-        auto params = options.get_options();
-        auto algo = compression_parameters::non_dict_equivalent(options.get_algorithm());
-        params[compression_parameters::SSTABLE_COMPRESSION] = sstring(compression_parameters::algorithm_to_name(algo));
-        return compression_parameters{params};
-    }
-}
-
 std::map<sstring, db::experimental_features_t::feature> db::experimental_features_t::map() {
    // We decided against using the construct-on-first-use idiom here:
    // https://github.com/scylladb/scylla/pull/5369#discussion_r353614807
--- a/db/config.hh
+++ b/db/config.hh
@@ -185,6 +185,13 @@ public:
     * All values and documentation taken from
     * http://docs.datastax.com/en/cassandra/2.1/cassandra/configuration/configCassandra_yaml_r.html
     */
+    named_value<double> background_writer_scheduling_quota;
+    named_value<bool> auto_adjust_flush_quota;
+    named_value<float> memtable_flush_static_shares;
+    named_value<float> compaction_static_shares;
+    named_value<float> compaction_max_shares;
+    named_value<bool> compaction_enforce_min_threshold;
+    named_value<uint32_t> compaction_flush_all_tables_before_major_seconds;
    named_value<sstring> cluster_name;
    named_value<sstring> listen_address;
    named_value<sstring> listen_interface;
@@ -412,13 +419,7 @@ public:
    named_value<bool> enable_sstables_mc_format;
    named_value<bool> enable_sstables_md_format;
    named_value<sstring> sstable_format;
-
-    // NOTE: Do not use this option directly.
-    // Use get_sstable_compression_user_table_options() instead.
    named_value<compression_parameters> sstable_compression_user_table_options;
-
-    compression_parameters get_sstable_compression_user_table_options(bool dicts_feature_enabled) const;
-
    named_value<bool> sstable_compression_dictionaries_allow_in_ddl;
    named_value<bool> sstable_compression_dictionaries_enable_writing;
    named_value<float> sstable_compression_dictionaries_memory_budget_fraction;
@@ -439,7 +440,6 @@ public:
    named_value<uint32_t> reader_concurrency_semaphore_serialize_limit_multiplier;
    named_value<uint32_t> reader_concurrency_semaphore_kill_limit_multiplier;
    named_value<uint32_t> reader_concurrency_semaphore_cpu_concurrency;
-    named_value<float> reader_concurrency_semaphore_preemptive_abort_factor;
    named_value<uint32_t> view_update_reader_concurrency_semaphore_serialize_limit_multiplier;
    named_value<uint32_t> view_update_reader_concurrency_semaphore_kill_limit_multiplier;
    named_value<uint32_t> view_update_reader_concurrency_semaphore_cpu_concurrency;
@@ -542,8 +542,6 @@ public:
    named_value<double> tablets_initial_scale_factor;
    named_value<unsigned> tablets_per_shard_goal;
    named_value<uint64_t> target_tablet_size_in_bytes;
-    named_value<unsigned> tablet_streaming_read_concurrency_per_shard;
-    named_value<unsigned> tablet_streaming_write_concurrency_per_shard;

    named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_warn_list;
    named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_fail_list;
@@ -601,21 +599,12 @@ public:
    named_value<bool> enable_create_table_with_compact_storage;

    named_value<bool> rf_rack_valid_keyspaces;
-    named_value<bool> enforce_rack_list;

    named_value<uint32_t> tablet_load_stats_refresh_interval_in_seconds;
    named_value<bool> force_capacity_based_balancing;
    named_value<float> size_based_balance_threshold_percentage;
    named_value<uint64_t> minimal_tablet_size_for_balancing;

-    named_value<double> background_writer_scheduling_quota;
-    named_value<bool> auto_adjust_flush_quota;
-    named_value<float> memtable_flush_static_shares;
-    named_value<float> compaction_static_shares;
-    named_value<float> compaction_max_shares;
-    named_value<bool> compaction_enforce_min_threshold;
-    named_value<uint32_t> compaction_flush_all_tables_before_major_seconds;
-
    static const sstring default_tls_priority;
 private:
    template<typename T>
--- a/db/consistency_level.cc
+++ b/db/consistency_level.cc
@@ -31,23 +31,19 @@ size_t quorum_for(const locator::effective_replication_map& erm) {
    return replication_factor ? (replication_factor / 2) + 1 : 0;
 }

-static size_t get_replication_factor_for_dc(const locator::effective_replication_map& erm, const sstring& dc) {
+size_t local_quorum_for(const locator::effective_replication_map& erm, const sstring& dc) {
    using namespace locator;

    const auto& rs = erm.get_replication_strategy();

    if (rs.get_type() == replication_strategy_type::network_topology) {
-        const network_topology_strategy* nts =
+        const network_topology_strategy* nrs =
            static_cast<const network_topology_strategy*>(&rs);
-        return nts->get_replication_factor(dc);
+        size_t replication_factor = nrs->get_replication_factor(dc);
+        return replication_factor ? (replication_factor / 2) + 1 : 0;
    }

-    return erm.get_replication_factor();
-}
-
-size_t local_quorum_for(const locator::effective_replication_map& erm, const sstring& dc) {
-    auto rf = get_replication_factor_for_dc(erm, dc);
-    return rf ? (rf / 2) + 1 : 0;
+    return quorum_for(erm);
 }

 size_t block_for_local_serial(const locator::effective_replication_map& erm) {
@@ -192,30 +188,18 @@ void assure_sufficient_live_nodes(
        return pending <= live ? live - pending : 0;
    };

-    auto make_rf_zero_error_msg = [cl] (const sstring& local_dc) {
-        return format("Cannot achieve consistency level {} in datacenter '{}' with replication factor 0. "
-                      "Ensure the keyspace is replicated to this datacenter or use a non-local consistency level.", cl, local_dc);
-    };
-
    const auto& topo = erm.get_topology();
-    const sstring& local_dc = topo.get_datacenter();

    switch (cl) {
    case consistency_level::ANY:
        // local hint is acceptable, and local node is always live
        break;
    case consistency_level::LOCAL_ONE:
-        if (size_t local_rf = get_replication_factor_for_dc(erm, local_dc); local_rf == 0) {
-            throw exceptions::unavailable_exception(make_rf_zero_error_msg(local_dc), cl, 1, 0);
-        }
        if (topo.count_local_endpoints(live_endpoints) < topo.count_local_endpoints(pending_endpoints) + 1) {
            throw exceptions::unavailable_exception(cl, 1, 0);
        }
        break;
    case consistency_level::LOCAL_QUORUM: {
-        if (size_t local_rf = get_replication_factor_for_dc(erm, local_dc); local_rf == 0) {
-            throw exceptions::unavailable_exception(make_rf_zero_error_msg(local_dc), cl, need, 0);
-        }
        size_t local_live = topo.count_local_endpoints(live_endpoints);
        size_t pending = topo.count_local_endpoints(pending_endpoints);
        if (local_live < need + pending) {
--- a/db/hints/internal/hint_endpoint_manager.cc
+++ b/db/hints/internal/hint_endpoint_manager.cc
@@ -158,7 +158,7 @@ void hint_endpoint_manager::cancel_draining() noexcept {
    _sender.cancel_draining();
 }

-hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager, scheduling_group send_sg)
+hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager)
    : _key(key)
    , _shard_manager(shard_manager)
    , _store_gate("hint_endpoint_manager")
@@ -169,7 +169,7 @@ hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hi
    // Approximate the position of the last written hint by using the same formula as for segment id calculation in commitlog
    // TODO: Should this logic be deduplicated with what is in the commitlog?
    , _last_written_rp(this_shard_id(), std::chrono::duration_cast<std::chrono::milliseconds>(runtime::get_boot_time().time_since_epoch()).count())
-    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper(), send_sg)
+    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
 {}

 hint_endpoint_manager::hint_endpoint_manager(hint_endpoint_manager&& other)
--- a/db/hints/internal/hint_endpoint_manager.hh
+++ b/db/hints/internal/hint_endpoint_manager.hh
@@ -63,7 +63,7 @@ private:
    hint_sender _sender;

 public:
-    hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager, scheduling_group send_sg);
+    hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager);
    hint_endpoint_manager(hint_endpoint_manager&&);
    ~hint_endpoint_manager();

--- a/db/hints/internal/hint_sender.cc
+++ b/db/hints/internal/hint_sender.cc
@@ -122,7 +122,7 @@ const column_mapping& hint_sender::get_column_mapping(lw_shared_ptr<send_one_fil
    return cm_it->second;
 }

-hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper, scheduling_group sg) noexcept
+hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper) noexcept
    : _stopped(make_ready_future<>())
    , _ep_key(parent.end_point_key())
    , _ep_manager(parent)
@@ -130,7 +130,7 @@ hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy&
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(local_storage_proxy)
    , _db(local_db)
-    , _hints_cpu_sched_group(sg)
+    , _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
    , _gossiper(local_gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
--- a/Show More
+++ b/Show More