test/alternator: fix delete_item_no_ts test, add LWT rejection tests for delete ops, simplify assertions, update docs

Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
alternator: add custom timestamp support to DeleteItem and BatchWriteItem DeleteRequest
2026-03-05 17:31:41 +00:00 · 2026-03-05 16:16:27 +00:00 · 2026-02-25 22:17:17 +00:00 · 2026-02-25 22:12:07 +00:00 · 2026-02-25 21:58:34 +00:00 · 2026-02-25 21:38:52 +00:00
612 changed files with 17288 additions and 7781 deletions
--- a/.github/workflows/backport-pr-fixes-validation.yaml
+++ b/.github/workflows/backport-pr-fixes-validation.yaml
@@ -18,7 +18,7 @@ jobs:
            
            // Regular expression pattern to check for "Fixes" prefix
            // Adjusted to dynamically insert the repository full name
-            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|([A-Z]+-\\d+))`;
+            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|(?:https://scylladb\\.atlassian\\.net/browse/)?([A-Z]+-\\d+))`;
            const regex = new RegExp(pattern);
            
            if (!regex.test(body)) {
--- a/.github/workflows/call_jira_sync_pr_milestone.yml
+++ b/.github/workflows/call_jira_sync_pr_milestone.yml
@@ -0,0 +1,22 @@
+name: Sync Jira Based on PR Milestone Events
+
+on:
+  pull_request_target:
+    types: [milestoned, demilestoned]
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  jira-sync-milestone-set:
+    if: github.event.action == 'milestoned'
+    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_set.yml@main
+    secrets:
+      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  jira-sync-milestone-removed:
+    if: github.event.action == 'demilestoned'
+    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_removed.yml@main
+    secrets:
+      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/call_sync_milestone_to_jira.yml
+++ b/.github/workflows/call_sync_milestone_to_jira.yml
@@ -1,4 +1,4 @@
-name: Call Jira release creation for new milestone
+name: Call Jira release creation for new milestone

 on:
  milestone:
@@ -9,6 +9,6 @@ jobs:
    uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
    with:
      # Comma-separated list of Jira project keys
-      jira_project_keys: "SCYLLADB,CUSTOMER"
+      jira_project_keys: "SCYLLADB,CUSTOMER,SMI"
    secrets:
      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/close_issue_for_scylla_associate.yml
+++ b/.github/workflows/close_issue_for_scylla_associate.yml
@@ -0,0 +1,62 @@
+name: Close issues created by Scylla associates
+
+on:
+  issues:
+    types: [opened, reopened]
+
+permissions:
+  issues: write
+
+jobs:
+  comment-and-close:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Comment and close if author email is scylladb.com
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const issue = context.payload.issue;
+            const author = issue.user.login;
+
+            // Get the issue author's data (only public email is available); context.actor would be the reopener on "reopened"
+            const { data: user } = await github.rest.users.getByUsername({
+              username: author,
+            });
+
+            const email = user.email || "";
+            console.log(`Author: ${author}, public email: ${email || "<none>"}`);
+
+            // Only continue if email exists and ends with @scylladb.com
+            if (!email || !email.toLowerCase().endsWith("@scylladb.com")) {
+              console.log("User is not a scylladb.com email (or email not public); skipping.");
+              return;
+            }
+
+            const owner = context.repo.owner;
+            const repo = context.repo.repo;
+            const issue_number = issue.number;
+
+            const body = "Issues in this repository are closed automatically. Scylla associates should use Jira to manage issues.\nPlease move this issue to Jira https://scylladb.atlassian.net/jira/software/c/projects/SCYLLADB/list";
+
+            // Add the comment
+            await github.rest.issues.createComment({
+              owner,
+              repo,
+              issue_number,
+              body,
+            });
+
+            console.log(`Comment added to #${issue_number}`);
+
+            // Close the issue
+            await github.rest.issues.update({
+              owner,
+              repo,
+              issue_number,
+              state: "closed",
+              state_reason: "not_planned"
+            });
+
+            console.log(`Issue #${issue_number} closed.`);
--- a/.github/workflows/iwyu.yaml
+++ b/.github/workflows/iwyu.yaml
@@ -14,7 +14,8 @@ env:
  CLEANER_DIRS: test/unit exceptions alternator api auth cdc compaction db dht gms index lang message mutation mutation_writer node_ops raft redis replica service
  SEASTAR_BAD_INCLUDE_OUTPUT_PATH: build/seastar-bad-include.log

-permissions: {}
+permissions:
+  contents: read

 # cancel the in-progress run upon a repush
 concurrency:
@@ -34,8 +35,6 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: true
-      - run: |
-          sudo dnf -y install clang-tools-extra
      - name: Generate compilation database
        run: |
          cmake                                         \
--- a/.github/workflows/trigger-scylla-ci.yaml
+++ b/.github/workflows/trigger-scylla-ci.yaml
@@ -9,16 +9,34 @@ on:

 jobs:
  trigger-jenkins:
-    if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
+    if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
    runs-on: ubuntu-latest
    steps:
+      - name: Validate Comment Trigger
+        if: github.event_name == 'issue_comment'
+        id: verify_comment
+        shell: bash
+        env:
+          COMMENT_BODY: ${{ github.event.comment.body }}
+        run: |
+          # The untrusted comment body is passed via env, never interpolated into the script text.
+          # Drop quoted ('>') lines; '|| true' keeps 'bash -eo pipefail' alive when every line is quoted.
+          CLEAN_BODY=$(printf '%s\n' "$COMMENT_BODY" | grep -v '^[[:space:]]*>' || true)
+
+          if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
+            echo "trigger=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "trigger=false" >> "$GITHUB_OUTPUT"
+          fi
+
      - name: Trigger Scylla-CI-Route Jenkins Job
+        if: github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true'
        env:
          JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
          JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
          JENKINS_URL: "https://jenkins.scylladb.com"
        run: |
-          PR_NUMBER=${{ github.event.issue.number }}
+          PR_NUMBER=${{ github.event.issue.number || github.event.pull_request.number }}
          PR_REPO_NAME=${{ github.event.repository.full_name }}
          curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
          --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ For further information, please see:

 [developer documentation]: HACKING.md
 [build documentation]: docs/dev/building.md
-[docker image build documentation]: dist/docker/debian/README.md
+[docker image build documentation]: dist/docker/redhat/README.md

 ## Running Scylla

--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2026.1.0-dev
+VERSION=2026.2.0-dev

 if test -f version
 then
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -618,7 +618,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
 // Check if the existing values of the item (previous_item) match the
 // conditions given by the Expected and ConditionalOperator parameters
 // (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
-// This function can throw an ValidationException API error if there
+// This function can throw a ValidationException API error if there
 // are errors in the format of the condition itself.
 bool verify_expected(const rjson::value& req, const rjson::value* previous_item) {
    const rjson::value* expected = rjson::find(req, "Expected");
--- a/alternator/consumed_capacity.cc
+++ b/alternator/consumed_capacity.cc
@@ -45,7 +45,7 @@ bool consumed_capacity_counter::should_add_capacity(const rjson::value& request)
 }

 void consumed_capacity_counter::add_consumed_capacity_to_response_if_needed(rjson::value& response) const noexcept {
-    if (_should_add_to_reponse) {
+    if (_should_add_to_response) {
        auto consumption = rjson::empty_object();
        rjson::add(consumption, "CapacityUnits", get_consumed_capacity_units());
        rjson::add(response, "ConsumedCapacity", std::move(consumption));
--- a/alternator/consumed_capacity.hh
+++ b/alternator/consumed_capacity.hh
@@ -28,9 +28,9 @@ namespace alternator {
 class consumed_capacity_counter {
 public:
    consumed_capacity_counter() = default;
-    consumed_capacity_counter(bool should_add_to_reponse) : _should_add_to_reponse(should_add_to_reponse){}
+    consumed_capacity_counter(bool should_add_to_response) : _should_add_to_response(should_add_to_response){}
    bool operator()() const noexcept {
-        return _should_add_to_reponse;
+        return _should_add_to_response;
    }

    consumed_capacity_counter& operator +=(uint64_t bytes);
@@ -44,7 +44,7 @@ public:
    uint64_t _total_bytes = 0;
    static bool should_add_capacity(const rjson::value& request);
 protected:
-    bool _should_add_to_reponse = false;
+    bool _should_add_to_response = false;
 };

 class rcu_consumed_capacity_counter : public consumed_capacity_counter {
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -7,6 +7,7 @@
 */

 #include <fmt/ranges.h>
+#include <cstdlib>
 #include <seastar/core/on_internal_error.hh>
 #include "alternator/executor.hh"
 #include "alternator/consumed_capacity.hh"
@@ -17,6 +18,7 @@
 #include "auth/service.hh"
 #include "db/config.hh"
 #include "db/view/view_build_status.hh"
+#include "locator/tablets.hh"
 #include "mutation/tombstone.hh"
 #include "locator/abstract_replication_strategy.hh"
 #include "utils/log.hh"
@@ -107,6 +109,16 @@ const sstring TABLE_CREATION_TIME_TAG_KEY("system:table_creation_time");
 // configured by UpdateTimeToLive to be the expiration-time attribute for
 // this table.
 extern const sstring TTL_TAG_KEY("system:ttl_attribute");
+// If this tag is present, it stores the name of an attribute whose numeric
+// value (in microseconds since the Unix epoch) is used as the write timestamp
+// for PutItem, UpdateItem and DeleteItem operations. When the named attribute
+// is present in the request (inside the "Key" object for DeleteItem), its value
+// is used as the timestamp of the write, and the attribute itself is NOT stored
+// in the item. This allows users to control write ordering for last-write-wins
+// semantics. Because LWT does not allow setting a custom write timestamp,
+// operations using this feature are incompatible with conditions (which require
+// LWT), and with the LWT_ALWAYS write isolation mode; such operations are rejected.
+static const sstring TIMESTAMP_TAG_KEY("system:timestamp_attribute");
 // This will be set to 1 in a case, where user DID NOT specify a range key.
 // The way GSI / LSI is implemented by Alternator assumes user specified keys will come first
 // in materialized view's key list. Then, if needed missing keys are added (current implementation
@@ -236,7 +248,7 @@ static void validate_is_object(const rjson::value& value, const char* caller) {
 }

 // This function assumes the given value is an object and returns requested member value.
-// If it is not possible an api_error::validation is thrown.
+// If it is not possible, an api_error::validation is thrown.
 static const rjson::value& get_member(const rjson::value& obj, const char* member_name, const char* caller) {
    validate_is_object(obj, caller);
    const rjson::value* ret = rjson::find(obj, member_name);
@@ -248,7 +260,7 @@ static const rjson::value& get_member(const rjson::value& obj, const char* membe


 // This function assumes the given value is an object with a single member, and returns this member.
-// In case the requirements are not met an api_error::validation is thrown.
+// In case the requirements are not met, an api_error::validation is thrown.
 static const rjson::value::Member& get_single_member(const rjson::value& v, const char* caller) {
    if (!v.IsObject() || v.MemberCount() != 1) {
        throw api_error::validation(format("{}: expected an object with a single member.", caller));
@@ -681,7 +693,7 @@ static std::optional<int> get_int_attribute(const rjson::value& value, std::stri
 }

 // Sets a KeySchema object inside the given JSON parent describing the key
-// attributes of the the given schema as being either HASH or RANGE keys.
+// attributes of the given schema as being either HASH or RANGE keys.
 // Additionally, adds to a given map mappings between the key attribute
 // names and their type (as a DynamoDB type string).
 void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>* attribute_types, const std::map<sstring, sstring> *tags) {
@@ -915,7 +927,7 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
                sstring index_name = cf_name.substr(delim_it + 1);
                rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
                rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
-                // Add indexes's KeySchema and collect types for AttributeDefinitions:
+                // Add index's KeySchema and collect types for AttributeDefinitions:
                executor::describe_key_schema(view_entry, *vptr, key_attribute_types, db::get_tags_of_table(vptr));
                // Add projection type
                rjson::value projection = rjson::empty_object();
@@ -1336,13 +1348,14 @@ void rmw_operation::set_default_write_isolation(std::string_view value) {
 // Alternator uses tags whose keys start with the "system:" prefix for
 // internal purposes. Those should not be readable by ListTagsOfResource,
 // nor writable with TagResource or UntagResource (see #24098).
-// Only a few specific system tags, currently only "system:write_isolation"
-// and "system:initial_tablets", are deliberately intended to be set and read
-// by the user, so are not considered "internal".
+// Only a few specific system tags, currently only "system:write_isolation",
+// "system:initial_tablets", and "system:timestamp_attribute", are deliberately
+// intended to be set and read by the user, so are not considered "internal".
 static bool tag_key_is_internal(std::string_view tag_key) {
    return tag_key.starts_with("system:")
        && tag_key != rmw_operation::WRITE_ISOLATION_TAG_KEY
-        && tag_key != INITIAL_TABLETS_TAG_KEY;
+        && tag_key != INITIAL_TABLETS_TAG_KEY
+        && tag_key != TIMESTAMP_TAG_KEY;
 }

 enum class update_tags_action { add_tags, delete_tags };
@@ -1875,23 +1888,34 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
        auto ts = group0_guard.write_timestamp();
        utils::chunked_vector<mutation> schema_mutations;
        auto ksm = create_keyspace_metadata(keyspace_name, _proxy, _gossiper, ts, tags_map, _proxy.features(), tablets_mode);
+        locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
+        const auto& topo = _proxy.local_db().get_token_metadata().get_topology();
+        auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
        // Alternator Streams doesn't yet work when the table uses tablets (#23838)
        if (stream_specification && stream_specification->IsObject()) {
            auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
            if (stream_enabled && stream_enabled->IsBool() && stream_enabled->GetBool()) {
-                locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
-                const auto& topo = _proxy.local_db().get_token_metadata().get_topology();
-                auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
                if (rs->uses_tablets()) {
                    co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
                    "If you want to use streams, create a table with vnodes by setting the tag 'system:initial_tablets' set to 'none'.");
                }
            }
        }
-        // Creating an index in tablets mode requires the rf_rack_valid_keyspaces option to be enabled.
-        // GSI and LSI indexes are based on materialized views which require this option to avoid consistency issues.
-        if (!view_builders.empty() && ksm->uses_tablets() && !_proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
-            co_return api_error::validation("GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
+        // Creating an index in tablets mode requires the keyspace to be RF-rack-valid.
+        // GSI and LSI indexes are based on materialized views which require RF-rack-validity to avoid consistency issues.
+        if (!view_builders.empty() || _proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
+            try {
+                locator::assert_rf_rack_valid_keyspace(keyspace_name, _proxy.local_db().get_token_metadata_ptr(), *rs);
+            } catch (const std::invalid_argument& ex) {
+                if (!view_builders.empty()) {
+                    co_return api_error::validation(fmt::format("GlobalSecondaryIndexes and LocalSecondaryIndexes on a table "
+                        "using tablets require the number of racks in the cluster to be either 1 or 3"));
+                } else {
+                    co_return api_error::validation(fmt::format("Cannot create table '{}' with tablets: the configuration "
+                        "option 'rf_rack_valid_keyspaces' is enabled, which enforces that tables using tablets can only be created in clusters "
+                        "that have either 1 or 3 racks", table_name));
+                }
+            }
        }
        try {
            schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
@@ -2114,9 +2138,12 @@ future<executor::request_return_type> executor::update_table(client_state& clien
                            co_return api_error::validation(fmt::format(
                                "LSI {} already exists in table {}, can't use same name for GSI", index_name, table_name));
                        }
-                        if (p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy().uses_tablets() &&
-                                !p.local().data_dictionary().get_config().rf_rack_valid_keyspaces()) {
-                            co_return api_error::validation("GlobalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
+                        try {
+                            locator::assert_rf_rack_valid_keyspace(keyspace_name, p.local().local_db().get_token_metadata_ptr(),
+                                    p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy());
+                        } catch (const std::invalid_argument& ex) {
+                            co_return api_error::validation(fmt::format("GlobalSecondaryIndexes on a table "
+                                "using tablets require the number of racks in the cluster to be either 1 or 3"));
                        }

                        elogger.trace("Adding GSI {}", index_name);
@@ -2283,8 +2310,11 @@ public:
 // After calling pk_from_json() and ck_from_json() to extract the pk and ck
 // components of a key, and if that succeeded, call check_key() to further
 // check that the key doesn't have any spurious components.
-static void check_key(const rjson::value& key, const schema_ptr& schema) {
-    if (key.MemberCount() != (schema->clustering_key_size() == 0 ? 1 : 2)) {
+// allow_extra_attribute: set to true when the key may contain one extra
+// non-key attribute (e.g., the timestamp pseudo-attribute for DeleteItem).
+static void check_key(const rjson::value& key, const schema_ptr& schema, bool allow_extra_attribute = false) {
+    const unsigned expected = (schema->clustering_key_size() == 0 ? 1 : 2) + (allow_extra_attribute ? 1 : 0);
+    if (key.MemberCount() != expected) {
        throw api_error::validation("Given key attribute not in schema");
    }
 }
@@ -2331,6 +2361,57 @@ void validate_value(const rjson::value& v, const char* caller) {
 // any writing happens (if one of the commands has an error, none of the
 // writes should be done). LWT makes it impossible for the parse step to
 // generate "mutation" objects, because the timestamp still isn't known.
+
+// Convert a DynamoDB number (big_decimal) to an api::timestamp_type
+// (microseconds since the Unix epoch). Fractional microseconds are truncated.
+// Returns nullopt if the value is negative or zero, is smaller than 1, or
+// does not fit in a signed 64-bit integer.
+static std::optional<api::timestamp_type> bigdecimal_to_timestamp(const big_decimal& bd) {
+    if (bd.unscaled_value() <= 0) {
+        return std::nullopt;
+    }
+    // big_decimal stores the value as unscaled_value * 10^(-scale). Adjust for
+    // the scale on the decimal digit string, so that arbitrarily large inputs
+    // can be range-checked before being converted to a 64-bit integer.
+    // The value is positive here, so str holds digits only (no sign).
+    auto str = bd.unscaled_value().str();
+    if (bd.scale() > 0) {
+        // Divide by 10^scale: drop the last `scale` digits (truncation).
+        const size_t dropped = static_cast<size_t>(bd.scale());
+        if (str.length() <= dropped) {
+            return std::nullopt;  // Number < 1
+        }
+        str.resize(str.length() - dropped);
+    } else if (bd.scale() < 0) {
+        if (bd.scale() < -18) {
+            // Too large to represent as int64_t
+            return std::nullopt;
+        }
+        // Multiply by 10^|scale|: append |scale| trailing zeros.
+        str.append(static_cast<size_t>(-bd.scale()), '0');
+    }
+    // Reject anything above INT64_MAX. strtoll() would otherwise silently
+    // saturate to LLONG_MAX. Equal-length digit strings compare numerically.
+    static constexpr std::string_view max_digits = "9223372036854775807";
+    if (str.length() > max_digits.length() ||
+            (str.length() == max_digits.length() && str.compare(max_digits) > 0)) {
+        return std::nullopt;
+    }
+    return static_cast<api::timestamp_type>(strtoll(str.c_str(), nullptr, 10));
+}
+
+// Extracts a write timestamp (microseconds since the Unix epoch) from a
+// DynamoDB-typed attribute value, which is expected to be a number
+// ({"N": "..."}). Yields nullopt when the value cannot be unwrapped as a
+// number, or when bigdecimal_to_timestamp() rejects that number.
+static std::optional<api::timestamp_type> try_get_timestamp(const rjson::value& attr_value) {
+    // An empty optional here means the value could not be unwrapped as a number.
+    if (const std::optional<big_decimal> number = try_unwrap_number(attr_value)) {
+        return bigdecimal_to_timestamp(*number);
+    }
+    return std::nullopt;
+}
+
 class put_or_delete_item {
 private:
    partition_key _pk;
@@ -2346,11 +2427,17 @@ private:
    // that length can have different meaning depends on the operation but the
    // the calculation of length in bytes to WCU is the same.
    uint64_t _length_in_bytes = 0;
+    // If the table has a system:timestamp_attribute tag, and the named
+    // attribute was found in the item with a valid numeric value, this holds
+    // the extracted timestamp. The attribute is not added to _cells.
+    std::optional<api::timestamp_type> _custom_timestamp;
 public:
    struct delete_item {};
    struct put_item {};
-    put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item);
-    put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes);
+    put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item,
+            const std::optional<bytes>& timestamp_attribute = std::nullopt);
+    put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes,
+            const std::optional<bytes>& timestamp_attribute = std::nullopt);
    // put_or_delete_item doesn't keep a reference to schema (so it can be
    // moved between shards for LWT) so it needs to be given again to build():
    mutation build(schema_ptr schema, api::timestamp_type ts) const;
@@ -2365,11 +2452,32 @@ public:
    bool is_put_item() noexcept {
        return _cells.has_value();
    }
+    // Returns the custom write timestamp extracted from the timestamp attribute,
+    // if any. If not set, the caller should use api::new_timestamp() instead.
+    std::optional<api::timestamp_type> custom_timestamp() const noexcept {
+        return _custom_timestamp;
+    }
 };

-put_or_delete_item::put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item)
+put_or_delete_item::put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item, const std::optional<bytes>& timestamp_attribute)
        : _pk(pk_from_json(key, schema)), _ck(ck_from_json(key, schema)) {
-    check_key(key, schema);
+    if (timestamp_attribute) {
+        // The timestamp attribute may be provided as a "pseudo-key": it is
+        // not a real key column, but can be included in the "Key" object to
+        // carry the custom write timestamp. If found, extract the timestamp
+        // and don't store it in the item.
+        const rjson::value* ts_val = rjson::find(key, to_string_view(*timestamp_attribute));
+        if (ts_val) {
+            if (auto t = try_get_timestamp(*ts_val)) {
+                _custom_timestamp = t;
+            } else {
+                throw api_error::validation(fmt::format(
+                    "The '{}' attribute used as a write timestamp must be a positive number (microseconds since epoch)",
+                    to_string_view(*timestamp_attribute)));
+            }
+        }
+    }
+    check_key(key, schema, _custom_timestamp.has_value());
 }

 // find_attribute() checks whether the named attribute is stored in the
@@ -2420,7 +2528,7 @@ std::unordered_map<bytes, std::string> si_key_attributes(data_dictionary::table
 //   case, this function simply won't be called for this attribute.)
 //
 // This function checks if the given attribute update is an update to some
-// GSI's key, and if the value is unsuitable, a api_error::validation is
+// GSI's key, and if the value is unsuitable, an api_error::validation is
 // thrown. The checking here is similar to the checking done in
 // get_key_from_typed_value() for the base table's key columns.
 //
@@ -2456,7 +2564,8 @@ static inline void validate_value_if_index_key(
    }
 }

-put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes)
+put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes,
+        const std::optional<bytes>& timestamp_attribute)
        : _pk(pk_from_json(item, schema)), _ck(ck_from_json(item, schema)) {
    _cells = std::vector<cell>();
    _cells->reserve(item.MemberCount());
@@ -2465,6 +2574,17 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
        validate_value(it->value, "PutItem");
        const column_definition* cdef = find_attribute(*schema, column_name);
        validate_attr_name_length("", column_name.size(), cdef && cdef->is_primary_key());
+        // If this is the timestamp attribute, it must be a valid numeric value
+        // (microseconds since epoch). Use it as the write timestamp and do not
+        // store it in the item data. Reject the write if the value is non-numeric.
+        if (timestamp_attribute && column_name == *timestamp_attribute) {
+            if (auto t = try_get_timestamp(it->value)) {
+                _custom_timestamp = t;
+                // The attribute is consumed as timestamp, not stored in _cells.
+                continue;
+            }
+            throw api_error::validation(fmt::format("The '{}' attribute used as a write timestamp must be a positive number (microseconds since epoch)", to_string_view(*timestamp_attribute)));
+        }
        _length_in_bytes += column_name.size();
        if (!cdef) {
            // This attribute may be a key column of one of the GSI or LSI,
@@ -2656,6 +2776,13 @@ rmw_operation::rmw_operation(service::storage_proxy& proxy, rjson::value&& reque
    // _pk and _ck will be assigned later, by the subclass's constructor
    // (each operation puts the key in a slightly different location in
    // the request).
+    const auto tags_ptr = db::get_tags_of_table(_schema);
+    if (tags_ptr) {
+        auto it = tags_ptr->find(TIMESTAMP_TAG_KEY);
+        if (it != tags_ptr->end() && !it->second.empty()) {
+            _timestamp_attribute = to_bytes(it->second);
+        }
+    }
 }

 std::optional<mutation> rmw_operation::apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) {
@@ -2800,6 +2927,21 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
        .alternator = true,
        .alternator_streams_increased_compatibility = schema()->cdc_options().enabled() && proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
    };
+    // If the operation uses a custom write timestamp (from the
+    // system:timestamp_attribute tag), LWT is incompatible because LWT
+    // requires the timestamp to be set by the Paxos protocol. Reject the
+    // operation if it would need to use LWT.
+    if (has_custom_timestamp()) {
+        bool would_use_lwt = _write_isolation == write_isolation::LWT_ALWAYS ||
+            (needs_read_before_write &&
+             _write_isolation != write_isolation::FORBID_RMW &&
+             _write_isolation != write_isolation::UNSAFE_RMW);
+        if (would_use_lwt) {
+            throw api_error::validation(
+                "Using the system:timestamp_attribute is not compatible with "
+                "conditional writes or the 'always' write isolation policy.");
+        }
+    }
    if (needs_read_before_write) {
        if (_write_isolation == write_isolation::FORBID_RMW) {
            throw api_error::validation("Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information.");
@@ -2898,7 +3040,8 @@ public:
    put_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& request)
        : rmw_operation(proxy, std::move(request))
        , _mutation_builder(rjson::get(_request, "Item"), schema(), put_or_delete_item::put_item{},
-            si_key_attributes(proxy.data_dictionary().find_table(schema()->ks_name(), schema()->cf_name()))) {
+            si_key_attributes(proxy.data_dictionary().find_table(schema()->ks_name(), schema()->cf_name())),
+            _timestamp_attribute) {
        _pk = _mutation_builder.pk();
        _ck = _mutation_builder.ck();
        if (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::ALL_OLD) {
@@ -2930,6 +3073,9 @@ public:
               check_needs_read_before_write(_condition_expression) ||
               _returnvalues == returnvalues::ALL_OLD;
    }
+    bool has_custom_timestamp() const noexcept override { // true iff the builder holds a custom write timestamp
+        return _mutation_builder.custom_timestamp().has_value();
+    }
    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override {
        if (!verify_expected(_request, previous_item.get()) ||
            !verify_condition_expression(_condition_expression, previous_item.get())) {
@@ -2947,7 +3093,10 @@ public:
        } else {
            _return_attributes = {};
        }
-        return _mutation_builder.build(_schema, ts);
+        // Use the custom timestamp from the timestamp attribute if available,
+        // otherwise use the provided timestamp.
+        api::timestamp_type effective_ts = _mutation_builder.custom_timestamp().value_or(ts);
+        return _mutation_builder.build(_schema, effective_ts);
    }
    virtual ~put_item_operation() = default;
 };
@@ -2999,7 +3148,7 @@ public:
    parsed::condition_expression _condition_expression;
    delete_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& request)
        : rmw_operation(proxy, std::move(request))
-        , _mutation_builder(rjson::get(_request, "Key"), schema(), put_or_delete_item::delete_item{}) {
+        , _mutation_builder(rjson::get(_request, "Key"), schema(), put_or_delete_item::delete_item{}, _timestamp_attribute) {
        _pk = _mutation_builder.pk();
        _ck = _mutation_builder.ck();
        if (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::ALL_OLD) {
@@ -3030,6 +3179,9 @@ public:
                check_needs_read_before_write(_condition_expression) ||
                _returnvalues == returnvalues::ALL_OLD;
    }
+    bool has_custom_timestamp() const noexcept override { // true iff the builder holds a custom write timestamp
+        return _mutation_builder.custom_timestamp().has_value();
+    }
    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override {
        if (!verify_expected(_request, previous_item.get()) ||
            !verify_condition_expression(_condition_expression, previous_item.get())) {
@@ -3050,7 +3202,10 @@ public:
        if (_consumed_capacity._total_bytes == 0) {
            _consumed_capacity._total_bytes = 1;
        }
-        return _mutation_builder.build(_schema, ts);
+        // Use the custom timestamp from the timestamp attribute if available,
+        // otherwise use the provided timestamp.
+        api::timestamp_type effective_ts = _mutation_builder.custom_timestamp().value_or(ts);
+        return _mutation_builder.build(_schema, effective_ts);
    }
    virtual ~delete_item_operation() = default;
 };
@@ -3237,10 +3392,13 @@ future<> executor::do_batch_write(
        // Do a normal write, without LWT:
        utils::chunked_vector<mutation> mutations;
        mutations.reserve(mutation_builders.size());
-        api::timestamp_type now = api::new_timestamp();
+        api::timestamp_type default_ts = api::new_timestamp();
        bool any_cdc_enabled = false;
        for (auto& b : mutation_builders) {
-            mutations.push_back(b.second.build(b.first, now));
+            // Use custom timestamp from the timestamp attribute if available,
+            // otherwise use the default timestamp for all items in this batch.
+            api::timestamp_type ts = b.second.custom_timestamp().value_or(default_ts);
+            mutations.push_back(b.second.build(b.first, ts));
            any_cdc_enabled |= b.first->cdc_options().enabled();
        }
        return _proxy.mutate(std::move(mutations),
@@ -3340,6 +3498,16 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c

        std::unordered_set<primary_key, primary_key_hash, primary_key_equal> used_keys(
                1, primary_key_hash{schema}, primary_key_equal{schema});
+        // Look up the timestamp attribute tag once per table (shared by all
+        // PutRequests and DeleteRequests for this table).
+        std::optional<bytes> ts_attr;
+        const auto tags_ptr = db::get_tags_of_table(schema);
+        if (tags_ptr) {
+            auto tag_it = tags_ptr->find(TIMESTAMP_TAG_KEY);
+            if (tag_it != tags_ptr->end() && !tag_it->second.empty()) {
+                ts_attr = to_bytes(tag_it->second);
+            }
+        }
        for (auto& request : it->value.GetArray()) {
            auto& r = get_single_member(request, "RequestItems element");
            const auto r_name = rjson::to_string_view(r.name);
@@ -3348,7 +3516,8 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
                validate_is_object(item, "Item in PutRequest");
                auto&& put_item = put_or_delete_item(
                        item, schema, put_or_delete_item::put_item{},
-                        si_key_attributes(_proxy.data_dictionary().find_table(schema->ks_name(), schema->cf_name())));
+                        si_key_attributes(_proxy.data_dictionary().find_table(schema->ks_name(), schema->cf_name())),
+                        ts_attr);
                mutation_builders.emplace_back(schema, std::move(put_item));
                auto mut_key = std::make_pair(mutation_builders.back().second.pk(), mutation_builders.back().second.ck());
                if (used_keys.contains(mut_key)) {
@@ -3359,7 +3528,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
                const rjson::value& key = get_member(r.value, "Key", "DeleteRequest");
                validate_is_object(key, "Key in DeleteRequest");
                mutation_builders.emplace_back(schema, put_or_delete_item(
-                        key, schema, put_or_delete_item::delete_item{}));
+                        key, schema, put_or_delete_item::delete_item{}, ts_attr));
                auto mut_key = std::make_pair(mutation_builders.back().second.pk(),
                        mutation_builders.back().second.ck());
                if (used_keys.contains(mut_key)) {
@@ -3533,7 +3702,7 @@ static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>
    return true;
 }

-// Add a path to a attribute_path_map. Throws a validation error if the path
+// Add a path to an attribute_path_map. Throws a validation error if the path
 // "overlaps" with one already in the filter (one is a sub-path of the other)
 // or "conflicts" with it (both a member and index is requested).
 template<typename T>
@@ -3968,6 +4137,10 @@ public:
    virtual ~update_item_operation() = default;
    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override;
    bool needs_read_before_write() const;
+    // Returns true if this update sets the timestamp attribute, either via an
+    // AttributeUpdates PUT whose value try_get_timestamp() accepts, or via any
+    // UpdateExpression SET action; i.e. a custom write timestamp will be used.
+    bool has_custom_timestamp() const noexcept override;

 private:
    void delete_attribute(bytes&& column_name, const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
@@ -4102,6 +4275,44 @@ update_item_operation::needs_read_before_write() const {
           (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::UPDATED_NEW);
 }

+bool
+update_item_operation::has_custom_timestamp() const noexcept {
+    if (!_timestamp_attribute) { // no timestamp attribute name set for this operation
+        return false;
+    }
+    // Check if the timestamp attribute is being set via AttributeUpdates PUT
+    // with a value that try_get_timestamp() accepts.
+    if (_attribute_updates) {
+        std::string_view ts_attr = to_string_view(*_timestamp_attribute);
+        for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
+            if (rjson::to_string_view(it->name) == ts_attr) {
+                const rjson::value* action = rjson::find(it->value, "Action");
+                if (action && rjson::to_string_view(*action) == "PUT" && it->value.HasMember("Value")) {
+                    // Only consider it a custom timestamp if the value is numeric
+                    if (try_get_timestamp((it->value)["Value"])) {
+                        return true;
+                    }
+                }
+                break; // only the first entry with a matching name is examined
+            }
+        }
+    }
+    // Check if the timestamp attribute is being set via UpdateExpression SET.
+    // The value's type can't be checked here without resolving the expression
+    // (which requires previous_item), so conservatively return true for any
+    // SET action on the attribute. A non-numeric result is rejected later, in
+    // apply_update_expression(), with a ValidationException.
+    if (!_update_expression.empty()) {
+        std::string ts_attr(to_string_view(*_timestamp_attribute)); // NOTE(review): may allocate inside a noexcept function
+        auto it = _update_expression.find(ts_attr);
+        if (it != _update_expression.end() && it->second.has_value()) {
+            const auto& action = it->second.get_value();
+            return std::holds_alternative<parsed::update_expression::action::set>(action._action);
+        }
+    }
+    return false;
+}
+
 // action_result() returns the result of applying an UpdateItem action -
 // this result is either a JSON object or an unset optional which indicates
 // the action was a deletion. The caller (update_item_operation::apply()
@@ -4377,6 +4588,17 @@ inline void update_item_operation::apply_attribute_updates(const std::unique_ptr
            throw api_error::validation(format("UpdateItem cannot update key column {}", rjson::to_string_view(it->name)));
        }
        std::string action = rjson::to_string((it->value)["Action"]);
+        // If this is the timestamp attribute being PUT, it must be a valid
+        // numeric value (microseconds since epoch). Use it as the write
+        // timestamp and skip storing it. Reject if the value is non-numeric.
+        if (_timestamp_attribute && column_name == *_timestamp_attribute && action == "PUT") {
+            if (it->value.HasMember("Value")) {
+                if (try_get_timestamp((it->value)["Value"])) {
+                    continue;
+                }
+                throw api_error::validation(fmt::format("The '{}' attribute used as a write timestamp must be a positive number (microseconds since epoch)", to_string_view(*_timestamp_attribute)));
+            }
+        }
        if (action == "DELETE") {
            // The DELETE operation can do two unrelated tasks. Without a
            // "Value" option, it is used to delete an attribute. With a
@@ -4480,6 +4702,20 @@ inline void update_item_operation::apply_update_expression(const std::unique_ptr
        if (cdef && cdef->is_primary_key()) {
            throw api_error::validation(fmt::format("UpdateItem cannot update key column {}", column_name));
        }
+        // If this is the timestamp attribute being set via UpdateExpression SET,
+        // it must be a valid numeric value (microseconds since epoch). Use it as
+        // the write timestamp and skip storing it. Reject if non-numeric.
+        if (_timestamp_attribute && to_bytes(column_name) == *_timestamp_attribute &&
+                actions.second.has_value() &&
+                std::holds_alternative<parsed::update_expression::action::set>(actions.second.get_value()._action)) {
+            std::optional<rjson::value> result = action_result(actions.second.get_value(), previous_item.get());
+            if (result) {
+                if (try_get_timestamp(*result)) {
+                    continue;  // Skip - already used as timestamp
+                }
+                throw api_error::validation(fmt::format("The '{}' attribute used as a write timestamp must be a positive number (microseconds since epoch)", to_string_view(*_timestamp_attribute)));
+            }
+        }
        if (actions.second.has_value()) {
            // An action on a top-level attribute column_name. The single
            // action is actions.second.get_value(). We can simply invoke
@@ -4528,6 +4764,44 @@ std::optional<mutation> update_item_operation::apply(std::unique_ptr<rjson::valu
        return {};
    }

+    // If the table has a timestamp attribute, look for it in the update
+    // (AttributeUpdates PUT or UpdateExpression SET). If found with a valid
+    // numeric value, use it as the write timestamp instead of the provided ts.
+    api::timestamp_type effective_ts = ts;
+    if (_timestamp_attribute) {
+        bool found_ts = false;
+        if (_attribute_updates) {
+            std::string_view ts_attr = to_string_view(*_timestamp_attribute);
+            for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
+                if (rjson::to_string_view(it->name) == ts_attr) {
+                    const rjson::value* action = rjson::find(it->value, "Action");
+                    if (action && rjson::to_string_view(*action) == "PUT" && it->value.HasMember("Value")) {
+                        if (auto t = try_get_timestamp((it->value)["Value"])) {
+                            effective_ts = *t;
+                            found_ts = true;
+                        }
+                    }
+                    break;
+                }
+            }
+        }
+        if (!found_ts && !_update_expression.empty()) {
+            std::string ts_attr(to_string_view(*_timestamp_attribute));
+            auto it = _update_expression.find(ts_attr);
+            if (it != _update_expression.end() && it->second.has_value()) {
+                const auto& action = it->second.get_value();
+                if (std::holds_alternative<parsed::update_expression::action::set>(action._action)) {
+                    std::optional<rjson::value> result = action_result(action, previous_item.get());
+                    if (result) {
+                        if (auto t = try_get_timestamp(*result)) {
+                            effective_ts = *t;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
    // In the ReturnValues=ALL_NEW case, we make a copy of previous_item into
    // _return_attributes and parts of it will be overwritten by the new
    // updates (in do_update() and do_delete()). We need to make a copy and
@@ -4556,10 +4830,10 @@ std::optional<mutation> update_item_operation::apply(std::unique_ptr<rjson::valu
    auto& row = m.partition().clustered_row(*_schema, _ck);
    auto modified_attrs = attribute_collector();
    if (!_update_expression.empty()) {
-        apply_update_expression(previous_item, ts, row, modified_attrs, any_updates, any_deletes);
+        apply_update_expression(previous_item, effective_ts, row, modified_attrs, any_updates, any_deletes);
    }
    if (_attribute_updates) {
-        apply_attribute_updates(previous_item, ts, row, modified_attrs, any_updates, any_deletes);
+        apply_attribute_updates(previous_item, effective_ts, row, modified_attrs, any_updates, any_deletes);
    }
    if (!modified_attrs.empty()) {
        auto serialized_map = modified_attrs.to_mut().serialize(*attrs_type());
@@ -4570,7 +4844,7 @@ std::optional<mutation> update_item_operation::apply(std::unique_ptr<rjson::valu
    // marker. An update with only DELETE operations must not add a row marker
    // (this was issue #5862) but any other update, even an empty one, should.
    if (any_updates || !any_deletes) {
-        row.apply(row_marker(ts));
+        row.apply(row_marker(effective_ts));
    } else if (_returnvalues == returnvalues::ALL_NEW && !previous_item) {
        // There was no pre-existing item, and we're not creating one, so
        // don't report the new item in the returned Attributes.
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -50,7 +50,7 @@ public:
        _operators.emplace_back(i);
        check_depth_limit();
    }
-    void add_dot(std::string(name)) {
+    void add_dot(std::string name) {
        _operators.emplace_back(std::move(name));
        check_depth_limit();
    }
@@ -85,7 +85,7 @@ struct constant {
    }
 };

-// "value" is is a value used in the right hand side of an assignment
+// "value" is a value used in the right hand side of an assignment
 // expression, "SET a = ...". It can be a constant (a reference to a value
 // included in the request, e.g., ":val"), a path to an attribute from the
 // existing item (e.g., "a.b[3].c"), or a function of other such values.
@@ -205,7 +205,7 @@ public:
 // The supported primitive conditions are:
 // 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
 //    v1 and v2 are values - from the item (an attribute path), the query
-//    (a ":val" reference), or a function of the the above (only the size()
+//    (a ":val" reference), or a function of the above (only the size()
 //    function is supported).
 // 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
 // 3. N-ary operator - v1 IN ( v2, v3, ... )
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -18,6 +18,7 @@
 #include "executor.hh"
 #include "tracing/trace_state.hh"
 #include "keys/keys.hh"
+#include "bytes.hh"

 namespace alternator {

@@ -72,6 +73,11 @@ protected:
    clustering_key _ck = clustering_key::make_empty();
    write_isolation _write_isolation;
    mutable wcu_consumed_capacity_counter _consumed_capacity;
+    // If the table has a "system:timestamp_attribute" tag, this holds the
+    // name of the attribute (converted to bytes) whose numeric value should
+    // be used as the write timestamp instead of the current time. The
+    // attribute itself is NOT stored in the item data.
+    std::optional<bytes> _timestamp_attribute;
    // All RMW operations can have a ReturnValues parameter from the following
    // choices. But note that only UpdateItem actually supports all of them:
    enum class returnvalues {
@@ -113,6 +119,9 @@ public:
    // Convert the above apply() into the signature needed by cas_request:
    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) override;
    virtual ~rmw_operation() = default;
+    // Returns true if the operation will use a custom write timestamp (from the
+    // system:timestamp_attribute tag). Subclasses override this as needed.
+    virtual bool has_custom_timestamp() const noexcept { return false; }
    const wcu_consumed_capacity_counter& consumed_capacity() const noexcept { return _consumed_capacity; }
    schema_ptr schema() const { return _schema; }
    const rjson::value& request() const { return _request; }
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -55,7 +55,7 @@ partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
 clustering_key ck_from_json(const rjson::value& item, schema_ptr schema);
 position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema);

-// If v encodes a number (i.e., it is a {"N": [...]}, returns an object representing it.  Otherwise,
+// If v encodes a number (i.e., it is a {"N": [...]}), returns an object representing it.  Otherwise,
 // raises ValidationException with diagnostic.
 big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic);

--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -491,7 +491,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

    if (!opts.enabled()) {
        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        co_return rjson::print(std::move(ret));
    }

    // TODO: label
@@ -502,123 +502,121 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
    // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
    auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);

-    return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {
+    std::map<db_clock::time_point, cdc::streams_version> topologies = co_await _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners });
+    auto e = topologies.end();
+    auto prev = e;
+    auto shards = rjson::empty_array();

-        auto e = topologies.end();
-        auto prev = e;
-        auto shards = rjson::empty_array();
+    std::optional<shard_id> last;

-        std::optional<shard_id> last;
+    auto i = topologies.begin();
+    // if we're a paged query, skip to the generation where we left off.
+    if (shard_start) {
+        i = topologies.find(shard_start->time);
+    }

-        auto i = topologies.begin();
-        // if we're a paged query, skip to the generation where we left of.
-        if (shard_start) {
-            i = topologies.find(shard_start->time);
-        }
+    // for parent-child stuff we need id:s to be sorted by token
+    // (see explanation above) since we want to find closest
+    // token boundary when determining parent.
+    // #7346 - we processed and searched children/parents in
+    // stored order, which is not necessarily token order,
+    // so the finding of "closest" token boundary (using upper bound)
+    // could give somewhat weird results.
+    static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
+        return id1.token() < id2.token();
+    };

-        // for parent-child stuff we need id:s to be sorted by token
-        // (see explanation above) since we want to find closest
-        // token boundary when determining parent.
-        // #7346 - we processed and searched children/parents in
-        // stored order, which is not necessarily token order,
-        // so the finding of "closest" token boundary (using upper bound)
-        // could give somewhat weird results.
-        static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
-            return id1.token() < id2.token();
-        };
+    // #7409 - shards must be returned in lexicographical order,
+    // normal bytes compare is string_traits<int8_t>::compare.
+    // thus bytes 0x8000 is less than 0x0000. By doing unsigned
+    // compare instead we inadvertently will sort in string lexical.
+    static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
+        return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
+    };
+
+    // need a prev even if we are skipping stuff
+    if (i != topologies.begin()) {
+        prev = std::prev(i);
+    }
+
+    for (; limit > 0 && i != e; prev = i, ++i) {
+        auto& [ts, sv] = *i;
+
+        last = std::nullopt;
+
+        auto lo = sv.streams.begin();
+        auto end = sv.streams.end();

        // #7409 - shards must be returned in lexicographical order,
-        // normal bytes compare is string_traits<int8_t>::compare.
-        // thus bytes 0x8000 is less than 0x0000. By doing unsigned
-        // compare instead we inadvertently will sort in string lexical.
-        static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
-            return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
-        };
+        std::sort(lo, end, id_cmp);

-        // need a prev even if we are skipping stuff
-        if (i != topologies.begin()) {
-            prev = std::prev(i);
+        if (shard_start) {
+            // find next shard position
+            lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
+            shard_start = std::nullopt;
        }

-        for (; limit > 0 && i != e; prev = i, ++i) {
-            auto& [ts, sv] = *i;
+        if (lo != end && prev != e) {
+            // We want older stuff sorted in token order so we can find matching
+            // token range when determining parent shard.
+            std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
+        }
+
+        auto expired = [&]() -> std::optional<db_clock::time_point> {
+            auto j = std::next(i);
+            if (j == e) {
+                return std::nullopt;
+            }
+            // add this so we sort of match potential 
+            // sequence numbers in get_records result.
+            return j->first + confidence_interval(db);
+        }();
+
+        while (lo != end) {
+            auto& id = *lo++;
+
+            auto shard = rjson::empty_object();
+
+            if (prev != e) {
+                auto& pids = prev->second.streams;
+                auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
+                    return t < id.token();
+                });
+                if (pid != pids.begin()) {
+                    pid = std::prev(pid);
+                }
+                if (pid != pids.end()) {
+                    rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
+                }
+            }
+
+            last.emplace(ts, id);
+            rjson::add(shard, "ShardId", *last);
+            auto range = rjson::empty_object();
+            rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
+            if (expired) {
+                rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
+            }
+
+            rjson::add(shard, "SequenceNumberRange", std::move(range));
+            rjson::push_back(shards, std::move(shard));
+            
+            if (--limit == 0) {
+                break;
+            }

            last = std::nullopt;
-
-            auto lo = sv.streams.begin();
-            auto end = sv.streams.end();
-
-            // #7409 - shards must be returned in lexicographical order,
-            std::sort(lo, end, id_cmp);
-
-            if (shard_start) {
-                // find next shard position
-                lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
-                shard_start = std::nullopt;
-            }
-
-            if (lo != end && prev != e) {
-                // We want older stuff sorted in token order so we can find matching
-                // token range when determining parent shard.
-                std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
-            }
-
-            auto expired = [&]() -> std::optional<db_clock::time_point> {
-                auto j = std::next(i);
-                if (j == e) {
-                    return std::nullopt;
-                }
-                // add this so we sort of match potential 
-                // sequence numbers in get_records result.
-                return j->first + confidence_interval(db);
-            }();
-
-            while (lo != end) {
-                auto& id = *lo++;
-
-                auto shard = rjson::empty_object();
-
-                if (prev != e) {
-                    auto& pids = prev->second.streams;
-                    auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
-                        return t < id.token();
-                    });
-                    if (pid != pids.begin()) {
-                        pid = std::prev(pid);
-                    }
-                    if (pid != pids.end()) {
-                        rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
-                    }
-                }
-
-                last.emplace(ts, id);
-                rjson::add(shard, "ShardId", *last);
-                auto range = rjson::empty_object();
-                rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
-                if (expired) {
-                    rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
-                }
-
-                rjson::add(shard, "SequenceNumberRange", std::move(range));
-                rjson::push_back(shards, std::move(shard));
-                
-                if (--limit == 0) {
-                    break;
-                }
-
-                last = std::nullopt;
-            }
        }
+    }

-        if (last) {
-            rjson::add(stream_desc, "LastEvaluatedShardId", *last);
-        }
+    if (last) {
+        rjson::add(stream_desc, "LastEvaluatedShardId", *last);
+    }

-        rjson::add(stream_desc, "Shards", std::move(shards));
-        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-            
-        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
-    });
+    rjson::add(stream_desc, "Shards", std::move(shards));
+    rjson::add(ret, "StreamDescription", std::move(stream_desc));
+        
+    co_return rjson::print(std::move(ret));
 }

 enum class shard_iterator_type {
@@ -898,172 +896,169 @@ future<executor::request_return_type> executor::get_records(client_state& client
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
            query::tombstone_limit(_proxy.get_tombstone_limit()), query::row_limit(limit * mul));

-    co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
-            [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {       
-        cql3::selection::result_set_builder builder(*selection, gc_clock::now());
-        query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
+    service::storage_proxy::coordinator_query_result qr = co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state));
+    cql3::selection::result_set_builder builder(*selection, gc_clock::now());
+    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));

-        auto result_set = builder.build();
-        auto records = rjson::empty_array();
+    auto result_set = builder.build();
+    auto records = rjson::empty_array();

-        auto& metadata = result_set->get_metadata();
+    auto& metadata = result_set->get_metadata();

-        auto op_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == op_column_name;
-            })
-        );
-        auto ts_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == timestamp_column_name;
-            })
-        );
-        auto eor_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == eor_column_name;
-            })
-        );
+    auto op_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == op_column_name;
+        })
+    );
+    auto ts_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == timestamp_column_name;
+        })
+    );
+    auto eor_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == eor_column_name;
+        })
+    );

-        std::optional<utils::UUID> timestamp;
-        auto dynamodb = rjson::empty_object();
-        auto record = rjson::empty_object();
-        const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();
+    std::optional<utils::UUID> timestamp;
+    auto dynamodb = rjson::empty_object();
+    auto record = rjson::empty_object();
+    const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();

-        using op_utype = std::underlying_type_t<cdc::operation>;
+    using op_utype = std::underlying_type_t<cdc::operation>;

-        auto maybe_add_record = [&] {
-            if (!dynamodb.ObjectEmpty()) {
-                rjson::add(record, "dynamodb", std::move(dynamodb));
-                dynamodb = rjson::empty_object();
-            }
-            if (!record.ObjectEmpty()) {
-                rjson::add(record, "awsRegion", rjson::from_string(dc_name));
-                rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
-                rjson::add(record, "eventSource", "scylladb:alternator");
-                rjson::add(record, "eventVersion", "1.1");
-                rjson::push_back(records, std::move(record));
-                record = rjson::empty_object();
-                --limit;
-            }
-        };
+    auto maybe_add_record = [&] {
+        if (!dynamodb.ObjectEmpty()) {
+            rjson::add(record, "dynamodb", std::move(dynamodb));
+            dynamodb = rjson::empty_object();
+        }
+        if (!record.ObjectEmpty()) {
+            rjson::add(record, "awsRegion", rjson::from_string(dc_name));
+            rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
+            rjson::add(record, "eventSource", "scylladb:alternator");
+            rjson::add(record, "eventVersion", "1.1");
+            rjson::push_back(records, std::move(record));
+            record = rjson::empty_object();
+            --limit;
+        }
+    };

-        for (auto& row : result_set->rows()) {
-            auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
-            auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
-            auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;
+    for (auto& row : result_set->rows()) {
+        auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
+        auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
+        auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;

-            if (!dynamodb.HasMember("Keys")) {
-                auto keys = rjson::empty_object();
-                describe_single_item(*selection, row, key_names, keys);
-                rjson::add(dynamodb, "Keys", std::move(keys));
-                rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
-                rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
-                rjson::add(dynamodb, "StreamViewType", type);
-                // TODO: SizeBytes
-            }
-
-            /**
-             * We merge rows with same timestamp into a single event.
-             * This is pretty much needed, because a CDC row typically
-             * encodes ~half the info of an alternator write. 
-             * 
-             * A big, big downside to how alternator records are written
-             * (i.e. CQL), is that the distinction between INSERT and UPDATE
-             * is somewhat lost/unmappable to actual eventName. 
-             * A write (currently) always looks like an insert+modify
-             * regardless whether we wrote existing record or not. 
-             * 
-             * Maybe RMW ops could be done slightly differently so 
-             * we can distinguish them here...
-             * 
-             * For now, all writes will become MODIFY.
-             * 
-             * Note: we do not check the current pre/post
-             * flags on CDC log, instead we use data to 
-             * drive what is returned. This is (afaict)
-             * consistent with dynamo streams
-             */
-            switch (op) {
-            case cdc::operation::pre_image:
-            case cdc::operation::post_image:
-            {
-                auto item = rjson::empty_object();
-                describe_single_item(*selection, row, attr_names, item, nullptr, true);
-                describe_single_item(*selection, row, key_names, item);
-                rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
-                break;
-            }
-            case cdc::operation::update:
-                rjson::add(record, "eventName", "MODIFY");
-                break;
-            case cdc::operation::insert:
-                rjson::add(record, "eventName", "INSERT");
-                break;
-            case cdc::operation::service_row_delete:
-            case cdc::operation::service_partition_delete:
-            {
-                auto user_identity = rjson::empty_object();
-                rjson::add(user_identity, "Type", "Service");
-                rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
-                rjson::add(record, "userIdentity", std::move(user_identity));
-                rjson::add(record, "eventName", "REMOVE");
-                break;
-            }
-            default:
-                rjson::add(record, "eventName", "REMOVE");
-                break;
-            }
-            if (eor) {
-                maybe_add_record();
-                timestamp = ts;
-                if (limit == 0) {
-                    break;
-                }
-            }
+        if (!dynamodb.HasMember("Keys")) {
+            auto keys = rjson::empty_object();
+            describe_single_item(*selection, row, key_names, keys);
+            rjson::add(dynamodb, "Keys", std::move(keys));
+            rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
+            rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
+            rjson::add(dynamodb, "StreamViewType", type);
+            // TODO: SizeBytes
        }

-        auto ret = rjson::empty_object();
-        auto nrecords = records.Size();
-        rjson::add(ret, "Records", std::move(records));
-
-        if (nrecords != 0) {
-            // #9642. Set next iterators threshold to > last
-            shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
-            // Note that here we unconditionally return NextShardIterator,
-            // without checking if maybe we reached the end-of-shard. If the
-            // shard did end, then the next read will have nrecords == 0 and
-            // will notice end end of shard and not return NextShardIterator.
-            rjson::add(ret, "NextShardIterator", next_iter);
-            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        /**
+         * We merge rows with same timestamp into a single event.
+         * This is pretty much needed, because a CDC row typically
+         * encodes ~half the info of an alternator write. 
+         * 
+         * A big, big downside to how alternator records are written
+         * (i.e. CQL), is that the distinction between INSERT and UPDATE
+         * is somewhat lost/unmappable to actual eventName. 
+         * A write (currently) always looks like an insert+modify
+         * regardless whether we wrote existing record or not. 
+         * 
+         * Maybe RMW ops could be done slightly differently so 
+         * we can distinguish them here...
+         * 
+         * For now, all writes will become MODIFY.
+         * 
+         * Note: we do not check the current pre/post
+         * flags on CDC log, instead we use data to 
+         * drive what is returned. This is (afaict)
+         * consistent with dynamo streams
+         */
+        switch (op) {
+        case cdc::operation::pre_image:
+        case cdc::operation::post_image:
+        {
+            auto item = rjson::empty_object();
+            describe_single_item(*selection, row, attr_names, item, nullptr, true);
+            describe_single_item(*selection, row, key_names, item);
+            rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
+            break;
        }
-
-        // ugh. figure out if we are and end-of-shard
-        auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
-
-        return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
-            auto& shard = iter.shard;            
-
-            if (shard.time < ts && ts < high_ts) {
-                // The DynamoDB documentation states that when a shard is
-                // closed, reading it until the end has NextShardIterator
-                // "set to null". Our test test_streams_closed_read
-                // confirms that by "null" they meant not set at all.
-            } else {
-                // We could have return the same iterator again, but we did
-                // a search from it until high_ts and found nothing, so we
-                // can also start the next search from high_ts.
-                // TODO: but why? It's simpler just to leave the iterator be.
-                shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
-                rjson::add(ret, "NextShardIterator", iter);
+        case cdc::operation::update:
+            rjson::add(record, "eventName", "MODIFY");
+            break;
+        case cdc::operation::insert:
+            rjson::add(record, "eventName", "INSERT");
+            break;
+        case cdc::operation::service_row_delete:
+        case cdc::operation::service_partition_delete:
+        {
+            auto user_identity = rjson::empty_object();
+            rjson::add(user_identity, "Type", "Service");
+            rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
+            rjson::add(record, "userIdentity", std::move(user_identity));
+            rjson::add(record, "eventName", "REMOVE");
+            break;
+        }
+        default:
+            rjson::add(record, "eventName", "REMOVE");
+            break;
+        }
+        if (eor) {
+            maybe_add_record();
+            timestamp = ts;
+            if (limit == 0) {
+                break;
            }
-            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-            if (is_big(ret)) {
-                return make_ready_future<executor::request_return_type>(make_streamed(std::move(ret)));
-            }
-            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
-        });
-    });
+        }
+    }
+
+    auto ret = rjson::empty_object();
+    auto nrecords = records.Size();
+    rjson::add(ret, "Records", std::move(records));
+
+    if (nrecords != 0) {
+        // #9642. Set next iterator's threshold to > last
+        shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
+        // Note that here we unconditionally return NextShardIterator,
+        // without checking if maybe we reached the end-of-shard. If the
+        // shard did end, then the next read will have nrecords == 0 and
+        // will notice the end of shard and not return NextShardIterator.
+        rjson::add(ret, "NextShardIterator", next_iter);
+        _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
+        co_return rjson::print(std::move(ret));
+    }
+
+    // ugh. figure out if we are at end-of-shard
+    auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
+
+    db_clock::time_point ts = co_await _sdks.cdc_current_generation_timestamp({ normal_token_owners });
+    auto& shard = iter.shard;
+
+    if (shard.time < ts && ts < high_ts) {
+        // The DynamoDB documentation states that when a shard is
+        // closed, reading it until the end has NextShardIterator
+        // "set to null". Our test test_streams_closed_read
+        // confirms that by "null" they meant not set at all.
+    } else {
+        // We could have returned the same iterator again, but we did
+        // a search from it until high_ts and found nothing, so we
+        // can also start the next search from high_ts.
+        // TODO: but why? It's simpler just to leave the iterator be.
+        shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
+        rjson::add(ret, "NextShardIterator", iter);
+    }
+    _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
+    if (is_big(ret)) {
+        co_return make_streamed(std::move(ret));
+    }
+    co_return rjson::print(std::move(ret));
 }

 bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -141,7 +141,7 @@ future<executor::request_return_type> executor::describe_time_to_live(client_sta

 // expiration_service is a sharded service responsible for cleaning up expired
 // items in all tables with per-item expiration enabled. Currently, this means
-// Alternator tables with TTL configured via a UpdateTimeToLive request.
+// Alternator tables with TTL configured via an UpdateTimeToLive request.
 //
 // Here is a brief overview of how the expiration service works:
 //
@@ -593,7 +593,7 @@ static future<> scan_table_ranges(
            if (retries >= 10) {
                // Don't get stuck forever asking the same page, maybe there's
                // a bug or a real problem in several replicas. Give up on
-                // this scan an retry the scan from a random position later,
+                // this scan and retry the scan from a random position later,
                // in the next scan period.
                throw runtime_exception("scanner thread failed after too many timeouts for the same page");
            }
@@ -767,7 +767,7 @@ static future<bool> scan_table(
                // by tasking another node to take over scanning of the dead node's primary
                // ranges. What we do here is that this node will also check expiration
                // on its *secondary* ranges - but only those whose primary owner is down.
-                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
+                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
                if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
                    if (!gossiper.is_alive(tablet_primary_replica.host)) {
                        co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
--- a/alternator/ttl.hh
+++ b/alternator/ttl.hh
@@ -30,7 +30,7 @@ namespace alternator {

 // expiration_service is a sharded service responsible for cleaning up expired
 // items in all tables with per-item expiration enabled. Currently, this means
-// Alternator tables with TTL configured via a UpdateTimeToLeave request.
+// Alternator tables with TTL configured via an UpdateTimeToLive request.
 class expiration_service final : public seastar::peering_sharded_service<expiration_service> {
 public:
    // Object holding per-shard statistics related to the expiration service.
@@ -52,7 +52,7 @@ private:
    data_dictionary::database _db;
    service::storage_proxy& _proxy;
    gms::gossiper& _gossiper;
-    // _end is set by start(), and resolves when the the background service
+    // _end is set by start(), and resolves when the background service
    // started by it ends. To ask the background service to end, _abort_source
    // should be triggered. stop() below uses both _abort_source and _end.
    std::optional<future<>> _end;
--- a/api/api-doc/authorization_cache.json
+++ b/api/api-doc/authorization_cache.json
@@ -12,7 +12,7 @@
      "operations":[
        {
          "method":"POST",
-          "summary":"Reset cache",
+          "summary":"Resets authorized prepared statements cache",
          "type":"void",
          "nickname":"authorization_cache_reset",
          "produces":[
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -3051,7 +3051,7 @@
                  },
                  {
                     "name":"incremental_mode",
-                     "description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to 'disabled' mode.",
+                     "description":"Set the incremental repair mode. Can be 'disabled', 'incremental', or 'full'. 'incremental': The incremental repair logic is enabled. Unrepaired sstables will be included for repair. Repaired sstables will be skipped. The incremental repair states will be updated after repair. 'full': The incremental repair logic is enabled. Both repaired and unrepaired sstables will be included for repair. The incremental repair states will be updated after repair. 'disabled': The incremental repair logic is disabled completely. The incremental repair states, e.g., repaired_at in sstables and sstables_repaired_at in the system.tablets table, will not be updated after repair. When the option is not provided, it defaults to incremental mode.",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
--- a/api/api.hh
+++ b/api/api.hh
@@ -23,31 +23,6 @@

 namespace api {

-template<class T>
-std::vector<T> map_to_key_value(const std::map<sstring, sstring>& map) {
-    std::vector<T> res;
-    res.reserve(map.size());
-
-    for (const auto& [key, value] : map) {
-        res.push_back(T());
-        res.back().key = key;
-        res.back().value = value;
-    }
-    return res;
-}
-
-template<class T, class MAP>
-std::vector<T>& map_to_key_value(const MAP& map, std::vector<T>& res) {
-    res.reserve(res.size() + std::size(map));
-
-    for (const auto& [key, value] : map) {
-        T val;
-        val.key = fmt::to_string(key);
-        val.value = fmt::to_string(value);
-        res.push_back(val);
-    }
-    return res;
-}
 template <typename T, typename S = T>
 T map_sum(T&& dest, const S& src) {
    for (const auto& i : src) {
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -515,6 +515,15 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
        auto sstables = parsed.GetArray() |
            std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
            std::ranges::to<std::vector>();
+        apilog.info("Restore invoked with following parameters: keyspace={}, table={}, endpoint={}, bucket={}, prefix={}, sstables_count={}, scope={}, primary_replica_only={}",
+                    keyspace,
+                    table,
+                    endpoint,
+                    bucket,
+                    prefix,
+                    sstables.size(),
+                    scope,
+                    primary_replica_only);
        auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
        co_return json::json_return_type(fmt::to_string(task_id));
    });
@@ -527,13 +536,15 @@ void unset_sstables_loader(http_context& ctx, routes& r) {
 }

 void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g) {
-    ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) {
+    ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto keyspace = validate_keyspace(ctx, req);
        auto view = req->get_path_param("view");
-        return vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()).then([] (std::unordered_map<sstring, sstring> status) {
-            std::vector<storage_service_json::mapper> res;
-            return make_ready_future<json::json_return_type>(map_to_key_value(std::move(status), res));
-        });
+        co_return json::json_return_type(stream_range_as_array(co_await vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()), [] (const auto& i) {
+            storage_service_json::mapper res;
+            res.key = i.first;
+            res.value = i.second;
+            return res;
+        }));
    });

    cf::get_built_indexes.set(r, [&vb](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
@@ -571,6 +582,16 @@ static future<json::json_return_type> describe_ring_as_json_for_table(const shar
    co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring_for_table(keyspace, table), token_range_endpoints_to_json));
 }

+namespace {
+template <typename Key, typename Value>
+storage_service_json::mapper map_to_json(const std::pair<Key, Value>& i) {
+    storage_service_json::mapper val;
+    val.key = fmt::to_string(i.first);
+    val.value = fmt::to_string(i.second);
+    return val;
+}
+}
+
 static
 future<json::json_return_type>
 rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -588,12 +609,7 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
            throw bad_param_exception("Either provide both keyspace and table (for tablet table) or neither (for vnodes)");
        }

-        co_return json::json_return_type(stream_range_as_array(token_endpoints, [](const auto& i) {
-            storage_service_json::mapper val;
-            val.key = fmt::to_string(i.first);
-            val.value = fmt::to_string(i.second);
-            return val;
-        }));
+        co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
 }

 static
@@ -677,7 +693,6 @@ rest_get_range_to_endpoint_map(http_context& ctx, sharded<service::storage_servi
            table_id = validate_table(ctx.db.local(), keyspace, table);
        }

-        std::vector<ss::maplist_mapper> res;
        co_return stream_range_as_array(co_await ss.local().get_range_to_address_map(keyspace, table_id),
                [](const std::pair<dht::token_range, inet_address_vector_replica_set>& entry){
            ss::maplist_mapper m;
@@ -1308,10 +1323,7 @@ rest_get_ownership(http_context& ctx, sharded<service::storage_service>& ss, std
            throw httpd::bad_param_exception("storage_service/ownership cannot be used when a keyspace uses tablets");
        }

-        return ss.local().get_ownership().then([] (auto&& ownership) {
-            std::vector<storage_service_json::mapper> res;
-            return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
-        });
+        co_return json::json_return_type(stream_range_as_array(co_await ss.local().get_ownership(), &map_to_json<gms::inet_address, float>));
 }

 static
@@ -1328,10 +1340,7 @@ rest_get_effective_ownership(http_context& ctx, sharded<service::storage_service
            }
        }

-        return ss.local().effective_ownership(keyspace_name, table_name).then([] (auto&& ownership) {
-            std::vector<storage_service_json::mapper> res;
-            return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
-        });
+        co_return json::json_return_type(stream_range_as_array(co_await ss.local().effective_ownership(keyspace_name, table_name), &map_to_json<gms::inet_address, float>));
 }

 static
@@ -1341,7 +1350,7 @@ rest_estimate_compression_ratios(http_context& ctx, sharded<service::storage_ser
        apilog.warn("estimate_compression_ratios: called before the cluster feature was enabled");
        throw std::runtime_error("estimate_compression_ratios requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
    }
-    auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
+    auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
    auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
    auto cf = api::req_param<sstring>(*req, "cf", {}).value;
    apilog.debug("estimate_compression_ratios: called with ks={} cf={}", ks, cf);
@@ -1407,7 +1416,7 @@ rest_retrain_dict(http_context& ctx, sharded<service::storage_service>& ss, serv
        apilog.warn("retrain_dict: called before the cluster feature was enabled");
        throw std::runtime_error("retrain_dict requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
    }
-    auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
+    auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
    auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
    auto cf = api::req_param<sstring>(*req, "cf", {}).value;
    apilog.debug("retrain_dict: called with ks={} cf={}", ks, cf);
@@ -2016,12 +2025,14 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        auto tag = req->get_query_param("tag");
        auto column_families = split(req->get_query_param("cf"), ",");
        auto sfopt = req->get_query_param("sf");
-        auto sf = db::snapshot_ctl::skip_flush(strcasecmp(sfopt.c_str(), "true") == 0);
+        db::snapshot_options opts = {
+            .skip_flush = strcasecmp(sfopt.c_str(), "true") == 0,
+        };

        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
        try {
            if (column_families.empty()) {
-                co_await snap_ctl.local().take_snapshot(tag, keynames, sf);
+                co_await snap_ctl.local().take_snapshot(tag, keynames, opts);
            } else {
                if (keynames.empty()) {
                    throw httpd::bad_param_exception("The keyspace of column families must be specified");
@@ -2029,7 +2040,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
                if (keynames.size() > 1) {
                    throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
                }
-                co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
+                co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, opts);
            }
            co_return json_void();
        } catch (...) {
@@ -2064,7 +2075,8 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        auto info = parse_scrub_options(ctx, std::move(req));

        if (!info.snapshot_tag.empty()) {
-            co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, db::snapshot_ctl::skip_flush::no);
+            db::snapshot_options opts = {.skip_flush = false};
+            co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, opts);
        }

        compaction::compaction_stats stats;
--- a/api/tasks.cc
+++ b/api/tasks.cc
@@ -146,7 +146,8 @@ void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::
        auto info = parse_scrub_options(ctx, std::move(req));

        if (!info.snapshot_tag.empty()) {
-            co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, db::snapshot_ctl::skip_flush::no);
+            db::snapshot_options opts = {.skip_flush = false};
+            co_await snap_ctl.local().take_column_family_snapshot(info.keyspace, info.column_families, info.snapshot_tag, opts);
        }

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
--- a/audit/audit.cc
+++ b/audit/audit.cc
@@ -209,15 +209,11 @@ future<> audit::stop_audit() {
    });
 }

-audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table) {
+audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch) {
    if (!audit_instance().local_is_initialized()) {
        return nullptr;
    }
-    return std::make_unique<audit_info>(cat, keyspace, table);
-}
-
-audit_info_ptr audit::create_no_audit_info() {
-    return audit_info_ptr();
+    return std::make_unique<audit_info>(cat, keyspace, table, batch);
 }

 future<> audit::start(const db::config& cfg) {
@@ -267,18 +263,21 @@ future<> audit::log_login(const sstring& username, socket_address client_ip, boo
 }

 future<> inspect(shared_ptr<cql3::cql_statement> statement, service::query_state& query_state, const cql3::query_options& options, bool error) {
-    cql3::statements::batch_statement* batch = dynamic_cast<cql3::statements::batch_statement*>(statement.get());
-    if (batch != nullptr) {
+    auto audit_info = statement->get_audit_info();
+    if (!audit_info) {
+        return make_ready_future<>();
+    }
+    if (audit_info->batch()) {
+        cql3::statements::batch_statement* batch = static_cast<cql3::statements::batch_statement*>(statement.get());
        return do_for_each(batch->statements().begin(), batch->statements().end(), [&query_state, &options, error] (auto&& m) {
            return inspect(m.statement, query_state, options, error);
        });
    } else {
-        auto audit_info = statement->get_audit_info();
-        if (bool(audit_info) && audit::local_audit_instance().should_log(audit_info)) {
+        if (audit::local_audit_instance().should_log(audit_info)) {
            return audit::local_audit_instance().log(audit_info, query_state, options, error);
        }
+        return make_ready_future<>();
    }
-    return make_ready_future<>();
 }

 future<> inspect_login(const sstring& username, socket_address client_ip, bool error) {
--- a/audit/audit.hh
+++ b/audit/audit.hh
@@ -75,11 +75,13 @@ class audit_info final {
    sstring _keyspace;
    sstring _table;
    sstring _query;
+    bool _batch;
 public:
-    audit_info(statement_category cat, sstring keyspace, sstring table)
+    audit_info(statement_category cat, sstring keyspace, sstring table, bool batch)
        : _category(cat)
        , _keyspace(std::move(keyspace))
        , _table(std::move(table))
+        , _batch(batch)
    { }
    void set_query_string(const std::string_view& query_string) {
        _query = sstring(query_string);
@@ -89,6 +91,7 @@ public:
    const sstring& query() const { return _query; }
    sstring category_string() const;
    statement_category category() const { return _category; }
+    bool batch() const { return _batch; }
 };

 using audit_info_ptr = std::unique_ptr<audit_info>;
@@ -126,8 +129,7 @@ public:
    }
    static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
    static future<> stop_audit();
-    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table);
-    static audit_info_ptr create_no_audit_info();
+    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
    audit(locator::shared_token_metadata& stm,
          cql3::query_processor& qp,
          service::migration_manager& mm,
--- a/audit/audit_syslog_storage_helper.cc
+++ b/audit/audit_syslog_storage_helper.cc
@@ -53,10 +53,10 @@ static std::string json_escape(std::string_view str) {

 }

-future<> audit_syslog_storage_helper::syslog_send_helper(const sstring& msg) {
+future<> audit_syslog_storage_helper::syslog_send_helper(temporary_buffer<char> msg) {
    try {
        auto lock = co_await get_units(_semaphore, 1, std::chrono::hours(1));
-        co_await _sender.send(_syslog_address, net::packet{msg.data(), msg.size()});
+        co_await _sender.send(_syslog_address, std::span(&msg, 1));
    }
    catch (const std::exception& e) {
        auto error_msg = seastar::format(
@@ -90,7 +90,7 @@ future<> audit_syslog_storage_helper::start(const db::config& cfg) {
        co_return;
    }

-    co_await syslog_send_helper("Initializing syslog audit backend.");
+    co_await syslog_send_helper(temporary_buffer<char>::copy_of("Initializing syslog audit backend."));
 }

 future<> audit_syslog_storage_helper::stop() {
@@ -120,7 +120,7 @@ future<> audit_syslog_storage_helper::write(const audit_info* audit_info,
                                    audit_info->table(),
                                    username);

-    co_await syslog_send_helper(msg);
+    co_await syslog_send_helper(std::move(msg).release());
 }

 future<> audit_syslog_storage_helper::write_login(const sstring& username,
@@ -139,7 +139,7 @@ future<> audit_syslog_storage_helper::write_login(const sstring& username,
                                    client_ip,
                                    username);

-    co_await syslog_send_helper(msg.c_str());
+    co_await syslog_send_helper(std::move(msg).release());
 }

 }
--- a/audit/audit_syslog_storage_helper.hh
+++ b/audit/audit_syslog_storage_helper.hh
@@ -26,7 +26,7 @@ class audit_syslog_storage_helper : public storage_helper {
    net::datagram_channel _sender;
    seastar::semaphore _semaphore;

-    future<> syslog_send_helper(const sstring& msg);
+    future<> syslog_send_helper(seastar::temporary_buffer<char> msg);
 public:
    explicit audit_syslog_storage_helper(cql3::query_processor&, service::migration_manager&);
    virtual ~audit_syslog_storage_helper();
--- a/auth/CMakeLists.txt
+++ b/auth/CMakeLists.txt
@@ -17,7 +17,6 @@ target_sources(scylla_auth
    password_authenticator.cc
    passwords.cc
    permission.cc
-    permissions_cache.cc
    resource.cc
    role_or_anonymous.cc
    roles-metadata.cc
--- a/auth/cache.cc
+++ b/auth/cache.cc
@@ -8,6 +8,7 @@

 #include "auth/cache.hh"
 #include "auth/common.hh"
+#include "auth/role_or_anonymous.hh"
 #include "auth/roles-metadata.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/untyped_result_set.hh"
@@ -18,6 +19,8 @@
 #include <seastar/core/abort_source.hh>
 #include <seastar/coroutine/maybe_yield.hh>
 #include <seastar/core/format.hh>
+#include <seastar/core/metrics.hh>
+#include <seastar/core/do_with.hh>

 namespace auth {

@@ -27,7 +30,21 @@ cache::cache(cql3::query_processor& qp, abort_source& as) noexcept
    : _current_version(0)
    , _qp(qp)
    , _loading_sem(1)
-    , _as(as) {
+    , _as(as)
+    , _permission_loader(nullptr)
+    , _permission_loader_sem(8) {
+    namespace sm = seastar::metrics;
+    _metrics.add_group("auth_cache", {
+        sm::make_gauge("roles", [this] { return _roles.size(); },
+                sm::description("Number of roles currently cached")),
+        sm::make_gauge("permissions", [this] {
+            return _cached_permissions_count;
+        }, sm::description("Total number of permission sets currently cached across all roles"))
+    });
+}
+
+void cache::set_permission_loader(permission_loader_func loader) {
+    _permission_loader = std::move(loader);
 }

 lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
@@ -38,6 +55,83 @@ lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) cons
    return it->second;
 }

+future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
+    std::unordered_map<resource, permission_set>* perms_cache;
+    lw_shared_ptr<role_record> role_ptr;
+
+    if (is_anonymous(role)) {
+        perms_cache = &_anonymous_permissions;
+    } else {
+        const auto& role_name = *role.name;
+        auto role_it = _roles.find(role_name);
+        if (role_it == _roles.end()) {
+            // Role might have been deleted but there are some connections
+            // left which reference it. They should no longer have access to anything.
+            return make_ready_future<permission_set>(permissions::NONE);
+        }
+        role_ptr = role_it->second;
+        perms_cache = &role_ptr->cached_permissions;
+    }
+
+    if (auto it = perms_cache->find(r); it != perms_cache->end()) {
+        return make_ready_future<permission_set>(it->second);
+    }
+    // Keep role_ptr alive, since it owns perms_cache (except for anonymous, whose map is a cache member)
+    return do_with(std::move(role_ptr), [this, &role, &r, perms_cache] (auto& role_ptr) {
+        return load_permissions(role, r, perms_cache);
+    });
+}
+
+future<permission_set> cache::load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache) {
+    SCYLLA_ASSERT(_permission_loader);
+    auto units = co_await get_units(_permission_loader_sem, 1, _as);
+
+    // Check again: we may have been blocked while another call loaded
+    // the permissions already. This protects against a storm of cache misses.
+    if (auto it = perms_cache->find(r); it != perms_cache->end()) {
+        co_return it->second;
+    }
+    auto perms = co_await _permission_loader(role, r);
+    add_permissions(*perms_cache, r, perms);
+    co_return perms;
+}
+
+future<> cache::prune(const resource& r) {
+    auto units = co_await get_units(_loading_sem, 1, _as);
+    _anonymous_permissions.erase(r);
+    for (auto& it : _roles) {
+        // Pruning can run concurrently with other functions, but at worst
+        // it causes an extra reload of cached_permissions via get_permissions.
+        remove_permissions(it.second->cached_permissions, r);
+        co_await coroutine::maybe_yield();
+    }
+}
+
+future<> cache::reload_all_permissions() noexcept {
+    SCYLLA_ASSERT(_permission_loader);
+    auto units = co_await get_units(_loading_sem, 1, _as);
+    auto copy_keys = [] (const std::unordered_map<resource, permission_set>& m) {
+        std::vector<resource> keys;
+        keys.reserve(m.size());
+        for (const auto& [res, _] : m) {
+            keys.push_back(res);
+        }
+        return keys;
+    };
+    const role_or_anonymous anon;
+    for (const auto& res : copy_keys(_anonymous_permissions)) {
+        _anonymous_permissions[res] = co_await _permission_loader(anon, res);
+    }
+    for (auto& [role, entry] : _roles) {
+        auto& perms_cache = entry->cached_permissions;
+        auto r = role_or_anonymous(role);
+        for (const auto& res : copy_keys(perms_cache)) {
+            perms_cache[res] = co_await _permission_loader(r, res);
+        }
+    }
+    logger.debug("Reloaded auth cache with {} entries", _roles.size());
+}
+
 future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& role) const {
    auto rec = make_lw_shared<role_record>();
    rec->version = _current_version;
@@ -105,7 +199,7 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
 future<> cache::prune_all() noexcept {
    for (auto it = _roles.begin(); it != _roles.end(); ) {
        if (it->second->version != _current_version) {
-            _roles.erase(it++);
+            remove_role(it++);
            co_await coroutine::maybe_yield();
        } else {
            ++it;
@@ -129,7 +223,7 @@ future<> cache::load_all() {
        const auto name = r.get_as<sstring>("role");
        auto role = co_await fetch_role(name);
        if (role) {
-            _roles[name] = role;
+            add_role(name, role);
        }
        co_return stop_iteration::no;
    };
@@ -142,11 +236,32 @@ future<> cache::load_all() {
        co_await distribute_role(name, role);
    }
    co_await container().invoke_on_others([this](cache& c) -> future<> {
+        auto units = co_await get_units(c._loading_sem, 1, c._as);
        c._current_version = _current_version;
        co_await c.prune_all();
    });
 }

+future<> cache::gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name) {
+    if (!role) {
+        // The role might have been removed or not yet added; either way
+        // its members will be handled by another top-level call to this function.
+        co_return;
+    }
+    for (const auto& member_name : role->members) {
+        bool is_new = roles.insert(member_name).second;
+        if (!is_new) {
+            continue;
+        }
+        lw_shared_ptr<cache::role_record> member_role;
+        auto r = _roles.find(member_name);
+        if (r != _roles.end()) {
+            member_role = r->second;
+        }
+        co_await gather_inheriting_roles(roles, member_role, member_name);
+    }
+}
+
 future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
    if (legacy_mode(_qp)) {
        co_return;
@@ -154,27 +269,41 @@ future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
    SCYLLA_ASSERT(this_shard_id() == 0);
    auto units = co_await get_units(_loading_sem, 1, _as);

+    std::unordered_set<role_name_t> roles_to_clear_perms;
    for (const auto& name : roles) {
        logger.info("Loading role {}", name);
        auto role = co_await fetch_role(name);
         if (role) {
-            _roles[name] = role;
+            add_role(name, role);
+            co_await gather_inheriting_roles(roles_to_clear_perms, role, name);
        } else {
-            _roles.erase(name);
+            if (auto it = _roles.find(name); it != _roles.end()) {
+                auto old_role = it->second;
+                remove_role(it);
+                co_await gather_inheriting_roles(roles_to_clear_perms, old_role, name);
+            }
        }
        co_await distribute_role(name, role);
    }
+
+    co_await container().invoke_on_all([&roles_to_clear_perms] (cache& c) -> future<> {
+        for (const auto& name : roles_to_clear_perms) {
+            c.clear_role_permissions(name);
+            co_await coroutine::maybe_yield();
+        }
+    });
 }

 future<> cache::distribute_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
    auto role_ptr = role.get();
-    co_await container().invoke_on_others([&name, role_ptr](cache& c) {
+    co_await container().invoke_on_others([&name, role_ptr](cache& c) -> future<> {
+        auto units = co_await get_units(c._loading_sem, 1, c._as);
        if (!role_ptr) {
-            c._roles.erase(name);
-            return;
+            c.remove_role(name);
+            co_return;
        }
        auto role_copy = make_lw_shared<role_record>(*role_ptr);
-        c._roles[name] = std::move(role_copy);
+        c.add_role(name, std::move(role_copy));
    });
 }

@@ -185,4 +314,40 @@ bool cache::includes_table(const table_id& id) noexcept {
            || id == db::system_keyspace::role_permissions()->id();
 }

+void cache::add_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
+    if (auto it = _roles.find(name); it != _roles.end()) {
+        _cached_permissions_count -= it->second->cached_permissions.size();
+    }
+    _cached_permissions_count += role->cached_permissions.size();
+    _roles[name] = std::move(role);
+}
+
+void cache::remove_role(const role_name_t& name) {
+    if (auto it = _roles.find(name); it != _roles.end()) {
+        remove_role(it);
+    }
+}
+
+void cache::remove_role(roles_map::iterator it) {
+    _cached_permissions_count -= it->second->cached_permissions.size();
+    _roles.erase(it);
+}
+
+void cache::clear_role_permissions(const role_name_t& name) {
+    if (auto it = _roles.find(name); it != _roles.end()) {
+        _cached_permissions_count -= it->second->cached_permissions.size();
+        it->second->cached_permissions.clear();
+    }
+}
+
+void cache::add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms) {
+    if (cache.emplace(r, perms).second) {
+        ++_cached_permissions_count;
+    }
+}
+
+void cache::remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r) {
+    _cached_permissions_count -= cache.erase(r);
+}
+
 } // namespace auth
--- a/auth/cache.hh
+++ b/auth/cache.hh
@@ -17,11 +17,14 @@
 #include <seastar/core/sharded.hh>
 #include <seastar/core/shared_ptr.hh>
 #include <seastar/core/semaphore.hh>
+#include <seastar/core/metrics_registration.hh>

 #include <absl/container/flat_hash_map.h>

 #include "auth/permission.hh"
 #include "auth/common.hh"
+#include "auth/resource.hh"
+#include "auth/role_or_anonymous.hh"

 namespace cql3 { class query_processor; }

@@ -31,6 +34,7 @@ class cache : public peering_sharded_service<cache> {
 public:
    using role_name_t = sstring;
    using version_tag_t = char;
+    using permission_loader_func = std::function<future<permission_set>(const role_or_anonymous&, const resource&)>;

 	struct role_record {
        bool can_login = false;
@@ -40,11 +44,19 @@ public:
        sstring salted_hash;
        std::unordered_map<sstring, sstring> attributes;
        std::unordered_map<sstring, permission_set> permissions;
+    private:
+        friend cache;
+        // cached permissions include effects of role's inheritance
+        std::unordered_map<resource, permission_set> cached_permissions;
        version_tag_t version; // used for seamless cache reloads
    };

    explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
    lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
+    void set_permission_loader(permission_loader_func loader);
+    future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
+    future<> prune(const resource& r);
+    future<> reload_all_permissions() noexcept;
    future<> load_all();
    future<> load_roles(std::unordered_set<role_name_t> roles);
    static bool includes_table(const table_id&) noexcept;
@@ -52,14 +64,31 @@ public:
 private:
    using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
    roles_map _roles;
+    // The anonymous permissions map exists mainly for compatibility with
+    // higher layers, which use role_or_anonymous to get permissions.
+    std::unordered_map<resource, permission_set> _anonymous_permissions;
    version_tag_t _current_version;
    cql3::query_processor& _qp;
-    semaphore _loading_sem;
+    semaphore _loading_sem; // protects iteration of _roles map
    abort_source& _as;
+    permission_loader_func _permission_loader;
+    semaphore _permission_loader_sem; // protects against reload storms on a single role change
+    metrics::metric_groups _metrics;
+    size_t _cached_permissions_count = 0;

    future<lw_shared_ptr<role_record>> fetch_role(const role_name_t& role) const;
    future<> prune_all() noexcept;
    future<> distribute_role(const role_name_t& name, const lw_shared_ptr<role_record> role);
+    future<> gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name);
+
+    void add_role(const role_name_t& name, lw_shared_ptr<role_record> role);
+    void remove_role(const role_name_t& name);
+    void remove_role(roles_map::iterator it);
+    void clear_role_permissions(const role_name_t& name);
+    void add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms);
+    void remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r);
+
+    future<permission_set> load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache);
 };

 } // namespace auth
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -88,10 +88,16 @@ static const class_registrator<

 ldap_role_manager::ldap_role_manager(
        std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
+        uint32_t permissions_update_interval_in_ms,
+        utils::observer<uint32_t>  permissions_update_interval_in_ms_observer,
        cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
        : _std_mgr(qp, rg0c, mm, cache), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
        , _bind_password(bind_password)
-        , _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this))) {
+        , _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
+        , _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
+        , _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
+        , _cache(cache)
+        , _cache_pruner(make_ready_future<>()) {
 }

 ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
@@ -100,6 +106,8 @@ ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_
            qp.db().get_config().ldap_attr_role(),
            qp.db().get_config().ldap_bind_dn(),
            qp.db().get_config().ldap_bind_passwd(),
+            qp.db().get_config().permissions_update_interval_in_ms(),
+            qp.db().get_config().permissions_update_interval_in_ms.observe([this] (const uint32_t& v) { _permissions_update_interval_in_ms = v; }),
            qp,
            rg0c,
            mm,
@@ -119,6 +127,22 @@ future<> ldap_role_manager::start() {
        return make_exception_future(
                std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
    }
+    _cache_pruner = futurize_invoke([this] () -> future<> {
+        while (true) {
+            try {
+                co_await seastar::sleep_abortable(std::chrono::milliseconds(_permissions_update_interval_in_ms), _as);
+            } catch (const seastar::sleep_aborted&) {
+                co_return; // ignore
+            }
+            co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
+                try {
+                    co_await c.reload_all_permissions();
+                } catch (...) {
+                    mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
+                }
+            });
+        }
+    });
    return _std_mgr.start();
 }

@@ -175,7 +199,11 @@ future<conn_ptr> ldap_role_manager::reconnect() {

 future<> ldap_role_manager::stop() {
    _as.request_abort();
-    return _std_mgr.stop().then([this] { return _connection_factory.stop(); });
+    return std::move(_cache_pruner).then([this] {
+        return _std_mgr.stop();
+    }).then([this] {
+        return _connection_factory.stop();
+    });
 }

 future<> ldap_role_manager::create(std::string_view name, const role_config& config, ::service::group0_batch& mc) {
--- a/auth/ldap_role_manager.hh
+++ b/auth/ldap_role_manager.hh
@@ -10,6 +10,7 @@
 #pragma once

 #include <seastar/core/abort_source.hh>
+#include <seastar/core/future.hh>
 #include <stdexcept>

 #include "ent/ldap/ldap_connection.hh"
@@ -34,14 +35,22 @@ class ldap_role_manager : public role_manager {
    seastar::sstring _target_attr; ///< LDAP entry attribute containing the Scylla role name.
    seastar::sstring _bind_name; ///< Username for LDAP simple bind.
    seastar::sstring _bind_password; ///< Password for LDAP simple bind.
+
+    uint32_t _permissions_update_interval_in_ms;
+    utils::observer<uint32_t> _permissions_update_interval_in_ms_observer;
+
    mutable ldap_reuser _connection_factory; // Potentially modified by query_granted().
    seastar::abort_source _as;
+    cache& _cache;
+    seastar::future<> _cache_pruner;
  public:
    ldap_role_manager(
            std::string_view query_template, ///< LDAP query template as described in Scylla documentation.
            std::string_view target_attr, ///< LDAP entry attribute containing the Scylla role name.
            std::string_view bind_name, ///< LDAP bind credentials.
            std::string_view bind_password, ///< LDAP bind credentials.
+            uint32_t permissions_update_interval_in_ms,
+            utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
            cql3::query_processor& qp, ///< Passed to standard_role_manager.
            ::service::raft_group0_client& rg0c, ///< Passed to standard_role_manager.
            ::service::migration_manager& mm, ///< Passed to standard_role_manager.
--- a/auth/permissions_cache.cc
+++ b/auth/permissions_cache.cc
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2017-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#include "auth/permissions_cache.hh"
-
-#include <fmt/ranges.h>
-#include "auth/authorizer.hh"
-#include "auth/service.hh"
-
-namespace auth {
-
-permissions_cache::permissions_cache(const utils::loading_cache_config& c, service& ser, logging::logger& log)
-        : _cache(c, log, [&ser, &log](const key_type& k) {
-              log.debug("Refreshing permissions for {}", k.first);
-              return ser.get_uncached_permissions(k.first, k.second);
-          }) {
-}
-
-bool permissions_cache::update_config(utils::loading_cache_config c) {
-    return _cache.update_config(std::move(c));
-}
-
-void permissions_cache::reset() {
-    _cache.reset();
-}
-
-future<permission_set> permissions_cache::get(const role_or_anonymous& maybe_role, const resource& r) {
-    return do_with(key_type(maybe_role, r), [this](const auto& k) {
-        return _cache.get(k);
-    });
-}
-
-}
--- a/auth/permissions_cache.hh
+++ b/auth/permissions_cache.hh
@@ -1,66 +0,0 @@
-/*
- * Copyright (C) 2017-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#pragma once
-
-#include <iostream>
-#include <utility>
-
-#include <fmt/core.h>
-#include <seastar/core/future.hh>
-
-#include "auth/permission.hh"
-#include "auth/resource.hh"
-#include "auth/role_or_anonymous.hh"
-#include "utils/log.hh"
-#include "utils/hash.hh"
-#include "utils/loading_cache.hh"
-
-namespace std {
-
-inline std::ostream& operator<<(std::ostream& os, const pair<auth::role_or_anonymous, auth::resource>& p) {
-    fmt::print(os, "{{role: {}, resource: {}}}", p.first, p.second);
-    return os;
-}
-
-}
-
-namespace db {
-class config;
-}
-
-namespace auth {
-
-class service;
-
-class permissions_cache final {
-    using cache_type = utils::loading_cache<
-            std::pair<role_or_anonymous, resource>,
-            permission_set,
-            1,
-            utils::loading_cache_reload_enabled::yes,
-            utils::simple_entry_size<permission_set>,
-            utils::tuple_hash>;
-
-    using key_type = typename cache_type::key_type;
-
-    cache_type _cache;
-
-public:
-    explicit permissions_cache(const utils::loading_cache_config&, service&, logging::logger&);
-
-    future <> stop() {
-        return _cache.stop();
-    }
-
-    bool update_config(utils::loading_cache_config);
-    void reset();
-    future<permission_set> get(const role_or_anonymous&, const resource&);
-};
-
-}
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -64,11 +64,11 @@ static const sstring superuser_col_name("super");
 static logging::logger log("auth_service");

 class auth_migration_listener final : public ::service::migration_listener {
-    authorizer& _authorizer;
+    service& _service;
    cql3::query_processor& _qp;

 public:
-    explicit auth_migration_listener(authorizer& a, cql3::query_processor& qp) : _authorizer(a),  _qp(qp) {
+    explicit auth_migration_listener(service& s, cql3::query_processor& qp) : _service(s),  _qp(qp) {
    }

 private:
@@ -92,14 +92,14 @@ private:
            return;
        }
        // Do it in the background.
-        (void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
-            return _authorizer.revoke_all(auth::make_data_resource(ks_name), mc);
+        (void)do_with(auth::make_data_resource(ks_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
+            return _service.revoke_all(r, mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped keyspace: {}", e);
        });

-        (void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
-            return _authorizer.revoke_all(auth::make_functions_resource(ks_name), mc);
+        (void)do_with(auth::make_functions_resource(ks_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
+            return _service.revoke_all(r, mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on functions in dropped keyspace: {}", e);
        });
@@ -111,9 +111,8 @@ private:
            return;
        }
        // Do it in the background.
-        (void)do_with(::service::group0_batch::unused(), [this, &ks_name, &cf_name] (auto& mc) mutable {
-            return _authorizer.revoke_all(
-                    auth::make_data_resource(ks_name, cf_name), mc);
+        (void)do_with(auth::make_data_resource(ks_name, cf_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
+            return _service.revoke_all(r, mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped table: {}", e);
        });
@@ -126,9 +125,8 @@ private:
            return;
        }
        // Do it in the background.
-        (void)do_with(::service::group0_batch::unused(), [this, &ks_name, &function_name] (auto& mc) mutable {
-            return _authorizer.revoke_all(
-                    auth::make_functions_resource(ks_name, function_name), mc);
+        (void)do_with(auth::make_functions_resource(ks_name, function_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
+            return _service.revoke_all(r, mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped function: {}", e);
        });
@@ -138,9 +136,8 @@ private:
            // in non legacy path revoke is part of schema change statement execution
            return;
        }
-        (void)do_with(::service::group0_batch::unused(), [this, &ks_name, &aggregate_name] (auto& mc) mutable {
-            return _authorizer.revoke_all(
-                    auth::make_functions_resource(ks_name, aggregate_name), mc);
+        (void)do_with(auth::make_functions_resource(ks_name, aggregate_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
+            return _service.revoke_all(r, mc);
        }).handle_exception([] (std::exception_ptr e) {
            log.error("Unexpected exception while revoking all permissions on dropped aggregate: {}", e);
        });
@@ -157,7 +154,6 @@ static future<> validate_role_exists(const service& ser, std::string_view role_n
 }

 service::service(
-        utils::loading_cache_config c,
        cache& cache,
        cql3::query_processor& qp,
        ::service::raft_group0_client& g0,
@@ -166,25 +162,17 @@ service::service(
        std::unique_ptr<authenticator> a,
        std::unique_ptr<role_manager> r,
        maintenance_socket_enabled used_by_maintenance_socket)
-            : _loading_cache_config(std::move(c))
-            , _permissions_cache(nullptr)
-            , _cache(cache)
+            : _cache(cache)
            , _qp(qp)
            , _group0_client(g0)
            , _mnotifier(mn)
            , _authorizer(std::move(z))
            , _authenticator(std::move(a))
            , _role_manager(std::move(r))
-            , _migration_listener(std::make_unique<auth_migration_listener>(*_authorizer, qp))
-            , _permissions_cache_cfg_cb([this] (uint32_t) { (void) _permissions_cache_config_action.trigger_later(); })
-            , _permissions_cache_config_action([this] { update_cache_config(); return make_ready_future<>(); })
-            , _permissions_cache_max_entries_observer(_qp.db().get_config().permissions_cache_max_entries.observe(_permissions_cache_cfg_cb))
-            , _permissions_cache_update_interval_in_ms_observer(_qp.db().get_config().permissions_update_interval_in_ms.observe(_permissions_cache_cfg_cb))
-            , _permissions_cache_validity_in_ms_observer(_qp.db().get_config().permissions_validity_in_ms.observe(_permissions_cache_cfg_cb))
+            , _migration_listener(std::make_unique<auth_migration_listener>(*this, qp))
            , _used_by_maintenance_socket(used_by_maintenance_socket) {}

 service::service(
-        utils::loading_cache_config c,
        cql3::query_processor& qp,
        ::service::raft_group0_client& g0,
        ::service::migration_notifier& mn,
@@ -193,7 +181,6 @@ service::service(
        maintenance_socket_enabled used_by_maintenance_socket,
        cache& cache)
            : service(
-                      std::move(c),
                      cache,
                      qp,
                      g0,
@@ -257,7 +244,14 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
        co_await _role_manager->ensure_superuser_is_created();
    }
    co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
-    _permissions_cache = std::make_unique<permissions_cache>(_loading_cache_config, *this, log);
+    if (!_used_by_maintenance_socket) {
+        // Maintenance socket mode can't cache permissions because it uses a
+        // different authorizer. We must not mix in cached permissions, since
+        // they could differ from the ones used in normal mode.
+        _cache.set_permission_loader(std::bind(
+                &service::get_uncached_permissions,
+                this, std::placeholders::_1, std::placeholders::_2));
+    }
    co_await once_among_shards([this] {
        _mnotifier.register_listener(_migration_listener.get());
        return make_ready_future<>();
@@ -269,9 +263,7 @@ future<> service::stop() {
    // Only one of the shards has the listener registered, but let's try to
    // unregister on each one just to make sure.
    return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
-        if (_permissions_cache) {
-            return _permissions_cache->stop();
-        }
+        _cache.set_permission_loader(nullptr);
        return make_ready_future<>();
    }).then([this] {
        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
@@ -283,21 +275,8 @@ future<> service::ensure_superuser_is_created() {
    co_await _authenticator->ensure_superuser_is_created();
 }

-void service::update_cache_config() {
-    auto db = _qp.db();
-
-    utils::loading_cache_config perm_cache_config;
-    perm_cache_config.max_size = db.get_config().permissions_cache_max_entries();
-    perm_cache_config.expiry = std::chrono::milliseconds(db.get_config().permissions_validity_in_ms());
-    perm_cache_config.refresh = std::chrono::milliseconds(db.get_config().permissions_update_interval_in_ms());
-
-    if (!_permissions_cache->update_config(std::move(perm_cache_config))) {
-        log.error("Failed to apply permissions cache changes. Please read the documentation of these parameters");
-    }
-}

 void service::reset_authorization_cache() {
-    _permissions_cache->reset();
    _qp.reset_cache();
 }

@@ -322,7 +301,10 @@ service::get_uncached_permissions(const role_or_anonymous& maybe_role, const res
 }

 future<permission_set> service::get_permissions(const role_or_anonymous& maybe_role, const resource& r) const {
-    return _permissions_cache->get(maybe_role, r);
+    if (legacy_mode(_qp) || _used_by_maintenance_socket) {
+        return get_uncached_permissions(maybe_role, r);
+    }
+    return _cache.get_permissions(maybe_role, r);
 }

 future<bool> service::has_superuser(std::string_view role_name, const role_set& roles) const {
@@ -447,6 +429,11 @@ future<bool> service::exists(const resource& r) const {
    return make_ready_future<bool>(false);
 }

+future<> service::revoke_all(const resource& r, ::service::group0_batch& mc) const {
+    co_await _authorizer->revoke_all(r, mc);
+    co_await _cache.prune(r);
+}
+
 future<std::vector<cql3::description>> service::describe_roles(bool with_hashed_passwords) {
    std::vector<cql3::description> result{};

@@ -801,7 +788,7 @@ future<> revoke_permissions(
 }

 future<> revoke_all(const service& ser, const resource& r, ::service::group0_batch& mc) {
-    return ser.underlying_authorizer().revoke_all(r, mc);
+    return ser.revoke_all(r, mc);
 }

 future<std::vector<permission_details>> list_filtered_permissions(
@@ -876,22 +863,6 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
                continue; // some tables might not have been created if they were not used
            }

-            // use longer than usual timeout as we scan the whole table
-            // but not infinite or very long as we want to fail reasonably fast
-            const auto t = 5min;
-            const timeout_config tc{t, t, t, t, t, t, t};
-            ::service::client_state cs(::service::client_state::internal_tag{}, tc);
-            ::service::query_state qs(cs, empty_service_permit());
-
-            auto rows = co_await qp.execute_internal(
-                    seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, cf_name),
-                    db::consistency_level::ALL,
-                    qs,
-                    {},
-                    cql3::query_processor::cache_internal::no);
-            if (rows->empty()) {
-                continue;
-            }
            std::vector<sstring> col_names;
            for (const auto& col : schema->all_columns()) {
                col_names.push_back(col.name_as_cql_string());
@@ -900,30 +871,51 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
            for (size_t i = 1; i < col_names.size(); ++i) {
                val_binders_str += ", ?";
            }
-            for (const auto& row : *rows) {
-                std::vector<data_value_or_unset> values;
-                for (const auto& col : schema->all_columns()) {
-                    if (row.has(col.name_as_text())) {
-                        values.push_back(
-                                col.type->deserialize(row.get_blob_unfragmented(col.name_as_text())));
-                    } else {
-                        values.push_back(unset_value{});
+
+            std::vector<mutation> collected;
+            // use longer than usual timeout as we scan the whole table
+            // but not infinite or very long as we want to fail reasonably fast
+            const auto t = 5min;
+            const timeout_config tc{t, t, t, t, t, t, t};
+            ::service::client_state cs(::service::client_state::internal_tag{}, tc);
+            ::service::query_state qs(cs, empty_service_permit());
+
+            co_await qp.query_internal(
+                seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, cf_name),
+                db::consistency_level::ALL,
+                {},
+                1000,
+                [&qp, &cf_name, &col_names, &val_binders_str, &schema, ts, &collected] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+                    std::vector<data_value_or_unset> values;
+                    for (const auto& col : schema->all_columns()) {
+                        if (row.has(col.name_as_text())) {
+                            values.push_back(
+                                    col.type->deserialize(row.get_blob_unfragmented(col.name_as_text())));
+                        } else {
+                            values.push_back(unset_value{});
+                        }
                    }
-                }
-                auto muts = co_await qp.get_mutations_internal(
-                        seastar::format("INSERT INTO {}.{} ({}) VALUES ({})",
-                                db::system_keyspace::NAME,
-                                cf_name,
-                                fmt::join(col_names, ", "),
-                                val_binders_str),
-                        internal_distributed_query_state(),
-                        ts,
-                        std::move(values));
-                if (muts.size() != 1) {
-                    on_internal_error(log,
-                            format("expecting single insert mutation, got {}", muts.size()));
-                }
-                co_yield std::move(muts[0]);
+                    auto muts = co_await qp.get_mutations_internal(
+                            seastar::format("INSERT INTO {}.{} ({}) VALUES ({})",
+                                    db::system_keyspace::NAME,
+                                    cf_name,
+                                    fmt::join(col_names, ", "),
+                                    val_binders_str),
+                            internal_distributed_query_state(),
+                            ts,
+                            std::move(values));
+                    if (muts.size() != 1) {
+                        on_internal_error(log,
+                                format("expecting single insert mutation, got {}", muts.size()));
+                    }
+
+                    collected.push_back(std::move(muts[0]));
+                    co_return stop_iteration::no;
+                },
+                std::move(qs));
+
+            for (auto& m : collected) {
+                co_yield std::move(m);
            }
        }
        co_yield co_await sys_ks.make_auth_version_mutation(ts,
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -20,7 +20,6 @@
 #include "auth/authenticator.hh"
 #include "auth/authorizer.hh"
 #include "auth/permission.hh"
-#include "auth/permissions_cache.hh"
 #include "auth/cache.hh"
 #include "auth/role_manager.hh"
 #include "auth/common.hh"
@@ -75,8 +74,6 @@ public:
 /// peering_sharded_service inheritance is needed to be able to access shard local authentication service
 /// given an object from another shard. Used for bouncing lwt requests to correct shard.
 class service final : public seastar::peering_sharded_service<service> {
-    utils::loading_cache_config _loading_cache_config;
-    std::unique_ptr<permissions_cache> _permissions_cache;
    cache& _cache;

    cql3::query_processor& _qp;
@@ -94,20 +91,12 @@ class service final : public seastar::peering_sharded_service<service> {
    // Only one of these should be registered, so we end up with some unused instances. Not the end of the world.
    std::unique_ptr<::service::migration_listener> _migration_listener;

-    std::function<void(uint32_t)> _permissions_cache_cfg_cb;
-    serialized_action _permissions_cache_config_action;
-
-    utils::observer<uint32_t> _permissions_cache_max_entries_observer;
-    utils::observer<uint32_t> _permissions_cache_update_interval_in_ms_observer;
-    utils::observer<uint32_t> _permissions_cache_validity_in_ms_observer;
-
    maintenance_socket_enabled _used_by_maintenance_socket;

    abort_source _as;

 public:
    service(
-            utils::loading_cache_config,
            cache& cache,
            cql3::query_processor&,
            ::service::raft_group0_client&,
@@ -123,7 +112,6 @@ public:
    /// of the instances themselves.
    ///
    service(
-            utils::loading_cache_config,
            cql3::query_processor&,
            ::service::raft_group0_client&,
            ::service::migration_notifier&,
@@ -138,8 +126,6 @@ public:

    future<> ensure_superuser_is_created();

-    void update_cache_config();
-
    void reset_authorization_cache();

    ///
@@ -181,6 +167,13 @@ public:

    future<bool> exists(const resource&) const;

+    ///
+    /// Revoke all permissions granted to any role for a particular resource.
+    ///
+    /// \throws \ref unsupported_authorization_operation if revoking permissions is not supported.
+    ///
+    future<> revoke_all(const resource&, ::service::group0_batch&) const;
+
    ///
    /// Produces descriptions that can be used to restore the state of auth. That encompasses
    /// roles, role grants, and permission grants.
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -52,13 +52,6 @@ static const class_registrator<
        ::service::migration_manager&,
        cache&> registration("org.apache.cassandra.auth.CassandraRoleManager");

-struct record final {
-    sstring name;
-    bool is_superuser;
-    bool can_login;
-    role_set member_of;
-};
-
 static db::consistency_level consistency_for_role(std::string_view role_name) noexcept {
    if (role_name == meta::DEFAULT_SUPERUSER_NAME) {
        return db::consistency_level::QUORUM;
@@ -67,13 +60,13 @@ static db::consistency_level consistency_for_role(std::string_view role_name) no
    return db::consistency_level::LOCAL_ONE;
 }

-static future<std::optional<record>> find_record(cql3::query_processor& qp, std::string_view role_name) {
+future<std::optional<standard_role_manager::record>> standard_role_manager::legacy_find_record(std::string_view role_name) {
    const sstring query = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
-            get_auth_ks_name(qp),
+            get_auth_ks_name(_qp),
            meta::roles_table::name,
            meta::roles_table::role_col_name);

-    const auto results = co_await qp.execute_internal(
+    const auto results = co_await _qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_query_state(),
@@ -93,8 +86,25 @@ static future<std::optional<record>> find_record(cql3::query_processor& qp, std:
                        : role_set())});
 }

-static future<record> require_record(cql3::query_processor& qp, std::string_view role_name) {
-    return find_record(qp, role_name).then([role_name](std::optional<record> mr) {
+future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
+    if (legacy_mode(_qp)) {
+        return legacy_find_record(role_name);
+    }
+    auto name = sstring(role_name);
+    auto role = _cache.get(name);
+    if (!role) {
+        return make_ready_future<std::optional<record>>(std::nullopt);
+    }
+    return make_ready_future<std::optional<record>>(std::make_optional(record{
+        .name = std::move(name),
+        .is_superuser = role->is_superuser,
+        .can_login = role->can_login,
+        .member_of = role->member_of
+    }));
+}
+
+future<standard_role_manager::record> standard_role_manager::require_record(std::string_view role_name) {
+    return find_record(role_name).then([role_name](std::optional<record> mr) {
        if (!mr) {
            throw nonexistant_role(role_name);
        }
@@ -386,7 +396,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
        return fmt::to_string(fmt::join(assignments, ", "));
    };

-    return require_record(_qp, role_name).then([this, role_name, &u, &mc](record) {
+    return require_record(role_name).then([this, role_name, &u, &mc](record) {
        if (!u.is_superuser && !u.can_login) {
            return make_ready_future<>();
        }
@@ -620,18 +630,17 @@ standard_role_manager::revoke(std::string_view revokee_name, std::string_view ro
    });
 }

-static future<> collect_roles(
-        cql3::query_processor& qp,
+future<> standard_role_manager::collect_roles(
        std::string_view grantee_name,
        bool recurse,
        role_set& roles) {
-    return require_record(qp, grantee_name).then([&qp, &roles, recurse](record r) {
-        return do_with(std::move(r.member_of), [&qp, &roles, recurse](const role_set& memberships) {
-            return do_for_each(memberships.begin(), memberships.end(), [&qp, &roles, recurse](const sstring& role_name) {
+    return require_record(grantee_name).then([this, &roles, recurse](standard_role_manager::record r) {
+        return do_with(std::move(r.member_of), [this, &roles, recurse](const role_set& memberships) {
+            return do_for_each(memberships.begin(), memberships.end(), [this, &roles, recurse](const sstring& role_name) {
                roles.insert(role_name);

                if (recurse) {
-                    return collect_roles(qp, role_name, true, roles);
+                    return collect_roles(role_name, true, roles);
                }

                return make_ready_future<>();
@@ -646,7 +655,7 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
    return do_with(
            role_set{sstring(grantee_name)},
            [this, grantee_name, recurse](role_set& roles) {
-        return collect_roles(_qp, grantee_name, recurse, roles).then([&roles] { return roles; });
+        return collect_roles(grantee_name, recurse, roles).then([&roles] { return roles; });
    });
 }

@@ -706,27 +715,21 @@ future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
 }

 future<bool> standard_role_manager::exists(std::string_view role_name) {
-    return find_record(_qp, role_name).then([](std::optional<record> mr) {
+    return find_record(role_name).then([](std::optional<record> mr) {
        return static_cast<bool>(mr);
    });
 }

 future<bool> standard_role_manager::is_superuser(std::string_view role_name) {
-    return require_record(_qp, role_name).then([](record r) {
+    return require_record(role_name).then([](record r) {
        return r.is_superuser;
    });
 }

 future<bool> standard_role_manager::can_login(std::string_view role_name) {
-    if (legacy_mode(_qp)) {
-       const auto r = co_await require_record(_qp, role_name);
-       co_return r.can_login;
-    }
-    auto role = _cache.get(sstring(role_name));
-    if (!role) {
-        throw nonexistant_role(role_name);
-    }
-    co_return role->can_login;
+    return require_record(role_name).then([](record r) {
+        return r.can_login;
+    });
 }

 future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -90,6 +90,12 @@ public:

 private:
    enum class membership_change { add, remove };
+    struct record final {
+        sstring name;
+        bool is_superuser;
+        bool can_login;
+        role_set member_of;
+    };

    future<> create_legacy_metadata_tables_if_missing() const;

@@ -107,6 +113,14 @@ private:
    future<> legacy_modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change);

    future<> modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change, ::service::group0_batch& mc);
+
+    future<std::optional<record>> legacy_find_record(std::string_view role_name);
+    future<std::optional<record>> find_record(std::string_view role_name);
+    future<record> require_record(std::string_view role_name);
+    future<> collect_roles(
+            std::string_view grantee_name,
+            bool recurse,
+            role_set& roles);
 };

 } // namespace auth
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -814,8 +814,7 @@ generation_service::generation_service(
            config cfg, gms::gossiper& g, sharded<db::system_distributed_keyspace>& sys_dist_ks,
            sharded<db::system_keyspace>& sys_ks,
            abort_source& abort_src, const locator::shared_token_metadata& stm, gms::feature_service& f,
-            replica::database& db,
-            std::function<bool()> raft_topology_change_enabled)
+            replica::database& db)
        : _cfg(std::move(cfg))
        , _gossiper(g)
        , _sys_dist_ks(sys_dist_ks)
@@ -824,7 +823,6 @@ generation_service::generation_service(
        , _token_metadata(stm)
        , _feature_service(f)
        , _db(db)
-        , _raft_topology_change_enabled(std::move(raft_topology_change_enabled))
 {
 }

@@ -878,16 +876,7 @@ future<> generation_service::on_join(gms::inet_address ep, locator::host_id id,
 future<> generation_service::on_change(gms::inet_address ep, locator::host_id id, const gms::application_state_map& states, gms::permit_id pid) {
    assert_shard_zero(__PRETTY_FUNCTION__);

-    if (_raft_topology_change_enabled()) {
-        return make_ready_future<>();
-    }
-
-    return on_application_state_change(ep, id, states, gms::application_state::CDC_GENERATION_ID, pid, [this] (gms::inet_address ep, locator::host_id id, const gms::versioned_value& v, gms::permit_id) {
-        auto gen_id = gms::versioned_value::cdc_generation_id_from_string(v.value());
-        cdc_log.debug("Endpoint: {}, CDC generation ID change: {}", ep, gen_id);
-
-        return legacy_handle_cdc_generation(gen_id);
-    });
+    return make_ready_future<>();
 }

 future<> generation_service::check_and_repair_cdc_streams() {
--- a/cdc/generation_service.hh
+++ b/cdc/generation_service.hh
@@ -79,17 +79,12 @@ private:
    std::optional<cdc::generation_id> _gen_id;
    future<> _cdc_streams_rewrite_complete = make_ready_future<>();

-    /* Returns true if raft topology changes are enabled.
-     * Can only be called from shard 0.
-     */
-    std::function<bool()> _raft_topology_change_enabled;
 public:
    generation_service(config cfg, gms::gossiper&,
            sharded<db::system_distributed_keyspace>&,
            sharded<db::system_keyspace>& sys_ks,
            abort_source&, const locator::shared_token_metadata&,
-            gms::feature_service&, replica::database& db,
-            std::function<bool()> raft_topology_change_enabled);
+            gms::feature_service&, replica::database& db);

    future<> stop();
    ~generation_service();
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -299,13 +299,11 @@ batch_size_fail_threshold_in_kb: 1024
 # max_hint_window_in_ms: 10800000 # 3 hours


-# Validity period for permissions cache (fetching permissions can be an
-# expensive operation depending on the authorizer, CassandraAuthorizer is
-# one example). Defaults to 10000, set to 0 to disable.
+# Validity period for the authorized statements cache. Defaults to 10000; set to 0 to disable.
 # Will be disabled automatically for AllowAllAuthorizer.
 # permissions_validity_in_ms: 10000

-# Refresh interval for permissions cache (if enabled).
+# Refresh interval for the authorized statements cache.
 # After this interval, cache entries become eligible for refresh. Upon next
 # access, an async reload is scheduled and the old value returned until it
 # completes. If permissions_validity_in_ms is non-zero, then this also must have
@@ -566,15 +564,16 @@ commitlog_total_space_in_mb: -1
 # prometheus_address: 1.2.3.4

 # audit settings
-# By default, Scylla does not audit anything.
+# Table audit is enabled by default.
 # 'audit' config option controls if and where to output audited events:
-#   - "none": auditing is disabled (default)
-#   - "table": save audited events in audit.audit_log column family
+#   - "none": auditing is disabled
+#   - "table": save audited events in audit.audit_log column family (default)
 #   - "syslog": send audited events via syslog (depends on OS, but usually to /dev/log)
 audit: "table"
 #
 # List of statement categories that should be audited.
-audit_categories: "DCL,DDL,AUTH,ADMIN"
+# Possible categories are: QUERY, DML, DCL, DDL, AUTH, ADMIN
+audit_categories: "DCL,AUTH,ADMIN"
 #
 # List of tables that should be audited.
 # audit_tables: "<keyspace_name>.<table_name>,<keyspace_name>.<table_name>"
--- a/configure.py
+++ b/configure.py
@@ -725,29 +725,9 @@ raft_tests = set([
 vector_search_tests = set([
    'test/vector_search/vector_store_client_test',
    'test/vector_search/load_balancer_test',
-    'test/vector_search/client_test'
-])
-
-vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
-vector_search_validator_deps = set([
-    'test/vector_search_validator/build-validator',
-    'test/vector_search_validator/Cargo.toml',
-    'test/vector_search_validator/crates/validator/Cargo.toml',
-    'test/vector_search_validator/crates/validator/src/main.rs',
-    'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
-    'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
-    'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
-])
-
-vector_store_bin = 'vector-search-validator/bin/vector-store'
-vector_store_deps = set([
-    'test/vector_search_validator/build-env',
-    'test/vector_search_validator/build-vector-store',
-])
-
-vector_search_validator_bins = set([
-    vector_search_validator_bin,
-    vector_store_bin,
+    'test/vector_search/client_test',
+    'test/vector_search/filter_test',
+    'test/vector_search/rescoring_test'
 ])

 wasms = set([
@@ -783,7 +763,7 @@ other = set([
    'iotune',
 ])

-all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins
+all_artifacts = apps | cpp_apps | tests | other | wasms

 arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -815,6 +795,9 @@ arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clan
                        help='C compiler path')
 arg_parser.add_argument('--compiler-cache', action='store', dest='compiler_cache', default='auto',
                        help='Compiler cache to use: auto (default, prefers sccache), sccache, ccache, none, or a path to a binary')
+# Workaround for https://github.com/mozilla/sccache/issues/2575
+arg_parser.add_argument('--sccache-rust', action=argparse.BooleanOptionalAction, default=False,
+                        help='Use sccache for rust code (if sccache is selected as compiler cache). Doesn\'t work with distributed builds.')
 add_tristate(arg_parser, name='dpdk', dest='dpdk', default=False,
                        help='Use dpdk (from seastar dpdk sources)')
 arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
@@ -945,8 +928,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/crypt_sha512.cc',
                'utils/logalloc.cc',
                'utils/large_bitset.cc',
-                'utils/buffer_input_stream.cc',
-                'utils/limiting_data_source.cc',
+                'test/lib/limiting_data_source.cc',
                'utils/updateable_value.cc',
                'message/dictionary_service.cc',
                'utils/directories.cc',
@@ -1034,6 +1016,9 @@ scylla_core = (['message/messaging_service.cc',
                'cql3/functions/aggregate_fcts.cc',
                'cql3/functions/castas_fcts.cc',
                'cql3/functions/error_injection_fcts.cc',
+                'cql3/statements/strong_consistency/modification_statement.cc',
+                'cql3/statements/strong_consistency/select_statement.cc',
+                'cql3/statements/strong_consistency/statement_helpers.cc',
                'cql3/functions/vector_similarity_fcts.cc',
                'cql3/statements/cf_prop_defs.cc',
                'cql3/statements/cf_statement.cc',
@@ -1059,8 +1044,8 @@ scylla_core = (['message/messaging_service.cc',
                'cql3/statements/raw/parsed_statement.cc',
                'cql3/statements/property_definitions.cc',
                'cql3/statements/update_statement.cc',
-                'cql3/statements/strongly_consistent_modification_statement.cc',
-                'cql3/statements/strongly_consistent_select_statement.cc',
+                'cql3/statements/broadcast_modification_statement.cc',
+                'cql3/statements/broadcast_select_statement.cc',
                'cql3/statements/delete_statement.cc',
                'cql3/statements/prune_materialized_view_statement.cc',
                'cql3/statements/batch_statement.cc',
@@ -1189,6 +1174,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/gz/crc_combine.cc',
                'utils/gz/crc_combine_table.cc',
                'utils/http.cc',
+                'utils/http_client_error_processing.cc',
                'utils/rest/client.cc',
                'utils/s3/aws_error.cc',
                'utils/s3/client.cc',
@@ -1206,6 +1192,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/azure/identity/default_credentials.cc',
                'utils/gcp/gcp_credentials.cc',
                'utils/gcp/object_storage.cc',
+                'utils/gcp/object_storage_retry_strategy.cc',
                'gms/version_generator.cc',
                'gms/versioned_value.cc',
                'gms/gossiper.cc',
@@ -1290,7 +1277,6 @@ scylla_core = (['message/messaging_service.cc',
                'auth/passwords.cc',
                'auth/password_authenticator.cc',
                'auth/permission.cc',
-                'auth/permissions_cache.cc',
                'auth/service.cc',
                'auth/standard_role_manager.cc',
                'auth/ldap_role_manager.cc',
@@ -1351,6 +1337,9 @@ scylla_core = (['message/messaging_service.cc',
                'lang/wasm.cc',
                'lang/wasm_alien_thread_runner.cc',
                'lang/wasm_instance_cache.cc',
+                'service/strong_consistency/groups_manager.cc',
+                'service/strong_consistency/coordinator.cc',
+                'service/strong_consistency/state_machine.cc',
                'service/raft/group0_state_id_handler.cc',
                'service/raft/group0_state_machine.cc',
                'service/raft/group0_state_machine_merger.cc',
@@ -1380,6 +1369,7 @@ scylla_core = (['message/messaging_service.cc',
                'vector_search/dns.cc',
                'vector_search/client.cc',
                'vector_search/clients.cc',
+                'vector_search/filter.cc',
                'vector_search/truststore.cc'
                ] + [Antlr3Grammar('cql3/Cql.g')] \
                  + scylla_raft_core
@@ -1489,6 +1479,7 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/hinted_handoff.idl.hh',
        'idl/storage_proxy.idl.hh',
        'idl/sstables.idl.hh',
+        'idl/strong_consistency/state_machine.idl.hh',
        'idl/group0_state_machine.idl.hh',
        'idl/mapreduce_request.idl.hh',
        'idl/replica_exception.idl.hh',
@@ -1547,6 +1538,7 @@ scylla_perfs = ['test/perf/perf_alternator.cc',
                'test/perf/perf_fast_forward.cc',
                'test/perf/perf_row_cache_update.cc',
                'test/perf/perf_simple_query.cc',
+                'test/perf/perf_cql_raw.cc',
                'test/perf/perf_sstable.cc',
                'test/perf/perf_tablets.cc',
                'test/perf/tablet_load_balancing.cc',
@@ -1654,6 +1646,7 @@ for t in sorted(perf_tests):

 deps['test/boost/combined_tests'] += [
    'test/boost/aggregate_fcts_test.cc',
+    'test/boost/auth_cache_test.cc',
    'test/boost/auth_test.cc',
    'test/boost/batchlog_manager_test.cc',
    'test/boost/cache_algorithm_test.cc',
@@ -1784,6 +1777,8 @@ deps['test/raft/discovery_test'] =  ['test/raft/discovery_test.cc',
 deps['test/vector_search/vector_store_client_test'] =  ['test/vector_search/vector_store_client_test.cc'] + scylla_tests_dependencies
 deps['test/vector_search/load_balancer_test'] = ['test/vector_search/load_balancer_test.cc'] + scylla_tests_dependencies
 deps['test/vector_search/client_test'] = ['test/vector_search/client_test.cc'] + scylla_tests_dependencies
+deps['test/vector_search/filter_test'] = ['test/vector_search/filter_test.cc'] + scylla_tests_dependencies
+deps['test/vector_search/rescoring_test'] = ['test/vector_search/rescoring_test.cc'] + scylla_tests_dependencies

 boost_tests_prefixes = ["test/boost/", "test/vector_search/", "test/raft/", "test/manual/", "test/ldap/"]

@@ -2393,7 +2388,7 @@ def write_build_file(f,
    # If compiler cache is available, prefix the compiler with it
    cxx_with_cache = f'{compiler_cache} {args.cxx}' if compiler_cache else args.cxx
    # For Rust, sccache is used via RUSTC_WRAPPER environment variable
-    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache else ''
+    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache and args.sccache_rust else ''
    f.write(textwrap.dedent('''\
        configure_args = {configure_args}
        builddir = {outdir}
@@ -2570,11 +2565,10 @@ def write_build_file(f,
              description = RUST_LIB $out
            ''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
        f.write(
-            'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
+            'build {mode}-build: phony {artifacts} {wasms}\n'.format(
                mode=mode,
-                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
+                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
                wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
-                vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
            )
        )
        if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2604,7 +2598,7 @@ def write_build_file(f,
                continue
            profile_dep = modes[mode].get('profile_target', "")

-            if binary in other or binary in wasms or binary in vector_search_validator_bins:
+            if binary in other or binary in wasms:
                continue
            srcs = deps[binary]
            # 'scylla'
@@ -2715,11 +2709,10 @@ def write_build_file(f,
        )

        f.write(
-            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
+            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
                mode=mode,
                test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
                wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
-                vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
            )
        )
        f.write(
@@ -2887,19 +2880,6 @@ def write_build_file(f,
            'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
    )

-    f.write(textwrap.dedent(f'''\
-        rule build-vector-search-validator
-            command = test/vector_search_validator/build-validator $builddir
-        rule build-vector-store
-            command = test/vector_search_validator/build-vector-store $builddir
-        '''))
-    f.write(
-            'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
-    )
-    f.write(
-            'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
-    )
-
    f.write(textwrap.dedent(f'''\
        build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
        build dist-unified: phony dist-unified-tar
@@ -3137,7 +3117,7 @@ def configure_using_cmake(args):
        settings['CMAKE_CXX_COMPILER_LAUNCHER'] = compiler_cache
        settings['CMAKE_C_COMPILER_LAUNCHER'] = compiler_cache
        # For Rust, sccache is used via RUSTC_WRAPPER
-        if 'sccache' in compiler_cache:
+        if 'sccache' in compiler_cache and args.sccache_rust:
            settings['Scylla_RUSTC_WRAPPER'] = compiler_cache

    if args.date_stamp:
--- a/cql3/CMakeLists.txt
+++ b/cql3/CMakeLists.txt
@@ -47,6 +47,9 @@ target_sources(cql3
    functions/aggregate_fcts.cc
    functions/castas_fcts.cc
    functions/error_injection_fcts.cc
+    statements/strong_consistency/select_statement.cc
+    statements/strong_consistency/modification_statement.cc
+    statements/strong_consistency/statement_helpers.cc
    functions/vector_similarity_fcts.cc
    statements/cf_prop_defs.cc
    statements/cf_statement.cc
@@ -72,8 +75,8 @@ target_sources(cql3
    statements/raw/parsed_statement.cc
    statements/property_definitions.cc
    statements/update_statement.cc
-    statements/strongly_consistent_modification_statement.cc
-    statements/strongly_consistent_select_statement.cc
+    statements/broadcast_modification_statement.cc
+    statements/broadcast_select_statement.cc
    statements/delete_statement.cc
    statements/prune_materialized_view_statement.cc
    statements/batch_statement.cc
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -389,8 +389,10 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
        bool is_ann_ordering = false;
    }
    : K_SELECT (
-                ( K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; } )?
-                ( K_DISTINCT { is_distinct = true; } )?
+                ( (K_JSON K_DISTINCT)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
+                | (K_JSON selectClause K_FROM)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
+                )?
+                ( (K_DISTINCT selectClause K_FROM)=> K_DISTINCT { is_distinct = true; } )?
                sclause=selectClause
               )
      K_FROM (
@@ -425,13 +427,13 @@ selector returns [shared_ptr<raw_selector> s]

 unaliasedSelector returns [uexpression tmp]
    :  ( c=cident                                  { tmp = unresolved_identifier{std::move(c)}; }
+       | v=value                                   { tmp = std::move(v); }
       | K_COUNT '(' countArgument ')'             { tmp = make_count_rows_function_expression(); }
       | K_WRITETIME '(' c=cident ')'              { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::writetime,
                                                                                              unresolved_identifier{std::move(c)}}; }
       | K_TTL       '(' c=cident ')'              { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::ttl,
                                                                                              unresolved_identifier{std::move(c)}}; }
       | f=functionName args=selectionFunctionArgs { tmp = function_call{std::move(f), std::move(args)}; }
-       | f=similarityFunctionName args=vectorSimilarityArgs            { tmp = function_call{std::move(f), std::move(args)}; }
       | K_CAST      '(' arg=unaliasedSelector K_AS t=native_type ')'  { tmp = cast{.style = cast::cast_style::sql, .arg = std::move(arg), .type = std::move(t)}; }
       )
       ( '.' fi=cident { tmp = field_selection{std::move(tmp), std::move(fi)}; }
@@ -446,23 +448,9 @@ selectionFunctionArgs returns [std::vector<expression> a]
      ')'
    ;

-vectorSimilarityArgs returns [std::vector<expression> a]
-    : '(' ')'
-    | '(' v1=vectorSimilarityArg { a.push_back(std::move(v1)); }
-          ( ',' vn=vectorSimilarityArg { a.push_back(std::move(vn)); } )*
-      ')'
-    ;
-
-vectorSimilarityArg returns [uexpression a]
-    : s=unaliasedSelector { a = std::move(s); }
-    | v=value             { a = std::move(v); }
-    ;
-
 countArgument
    : '*'
-    | i=INTEGER { if (i->getText() != "1") {
-                    add_recognition_error("Only COUNT(1) is supported, got COUNT(" + i->getText() + ")");
-                } }
+    /* COUNT(1) is also allowed; it is recognized via the general function(args) path */
    ;

 whereClause returns [uexpression clause]
@@ -1706,10 +1694,6 @@ functionName returns [cql3::functions::function_name s]
    : (ks=keyspaceName '.')? f=allowedFunctionName   { $s.keyspace = std::move(ks); $s.name = std::move(f); }
    ;

-similarityFunctionName returns [cql3::functions::function_name s]
-    : f=allowedSimilarityFunctionName { $s = cql3::functions::function_name::native_function(std::move(f)); }
-    ;
-
 allowedFunctionName returns [sstring s]
    : f=IDENT                       { $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
    | f=QUOTED_NAME                 { $s = $f.text; }
@@ -1718,11 +1702,6 @@ allowedFunctionName returns [sstring s]
    | K_COUNT                       { $s = "count"; }
    ;

-allowedSimilarityFunctionName returns [sstring s]
-    : f=(K_SIMILARITY_COSINE | K_SIMILARITY_EUCLIDEAN | K_SIMILARITY_DOT_PRODUCT)
-      { $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
-    ;
-
 functionArgs returns [std::vector<expression> a]
    : '(' ')'
    | '(' t1=term { a.push_back(std::move(t1)); }
@@ -2419,10 +2398,6 @@ K_MUTATION_FRAGMENTS:    M U T A T I O N '_' F R A G M E N T S;

 K_VECTOR_SEARCH_INDEXING: V E C T O R '_' S E A R C H '_' I N D E X I N G;

-K_SIMILARITY_EUCLIDEAN:     S I M I L A R I T Y '_' E U C L I D E A N;
-K_SIMILARITY_COSINE:        S I M I L A R I T Y '_' C O S I N E;
-K_SIMILARITY_DOT_PRODUCT:   S I M I L A R I T Y '_' D O T '_' P R O D U C T;
-
 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
 fragment B: ('b'|'B');
--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -10,6 +10,7 @@
 #include "expr-utils.hh"
 #include "evaluate.hh"
 #include "cql3/functions/functions.hh"
+#include "cql3/functions/aggregate_fcts.hh"
 #include "cql3/functions/castas_fcts.hh"
 #include "cql3/functions/scalar_function.hh"
 #include "cql3/column_identifier.hh"
@@ -1047,8 +1048,47 @@ prepare_function_args_for_type_inference(std::span<const expression> args, data_
    return partially_prepared_args;
 }

+// Special case for count(1): rewrite the call into the countRows() aggregate. It is quite
+// artificial and might later be relaxed to the more general count(expression). Returns
+// std::nullopt when fc is not count(<untyped literal>); throws for any literal other than 1.
+static
+std::optional<expression>
+try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
+    return std::visit(overloaded_functor{
+        [&] (const functions::function_name& name) -> std::optional<expression> {
+            auto native_name = name;
+            if (!native_name.has_keyspace()) {
+                native_name = name.as_native_function();
+            }
+            if (!(native_name == functions::function_name::native_function("count")) || fc.args.size() != 1) {
+                return std::nullopt;
+            }
+            // Only an untyped literal argument is handled here; anything else is left to the caller.
+            auto uc_arg = expr::as_if<expr::untyped_constant>(&fc.args[0]);
+            if (!uc_arg) {
+                return std::nullopt;
+            }
+            if (!(uc_arg->partial_type == expr::untyped_constant::type_class::integer && uc_arg->raw_text == "1")) {
+                throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument, got {}", uc_arg->raw_text));
+            }
+            // Collapse count(1) into countRows()
+            return expr::function_call{
+                .func = functions::aggregate_fcts::make_count_rows_function(),
+                .args = {},
+            };
+        },
+        [] (const shared_ptr<functions::function>&) -> std::optional<expression> {
+            // Already prepared, nothing to do
+            return std::nullopt;
+        },
+    }, fc.func);
+}
+
 std::optional<expression>
 prepare_function_call(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
+    if (auto prepared = try_prepare_count_rows(fc, db, keyspace, schema_opt, receiver)) {
+        return prepared;
+    }
    // Try to extract a column family name from the available information.
    // Most functions can be prepared without information about the column family, usually just the keyspace is enough.
    // One exception is the token() function - in order to prepare system.token() we have to know the partition key of the table,
--- a/cql3/functions/vector_similarity_fcts.cc
+++ b/cql3/functions/vector_similarity_fcts.cc
@@ -10,9 +10,41 @@
 #include "types/types.hh"
 #include "types/vector.hh"
 #include "exceptions/exceptions.hh"
+#include <span>
+#include <bit>

 namespace cql3 {
 namespace functions {
+
+namespace detail {
+
+std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension) {
+    // A null (absent) parameter cannot be decoded into a vector.
+    if (!param) {
+        throw exceptions::invalid_request_exception("Cannot extract float vector from null parameter");
+    }
+    // The payload must be exactly `dimension` floats wide.
+    const auto actual_bytes = param->size();
+    const size_t wanted_bytes = dimension * sizeof(float);
+    if (actual_bytes != wanted_bytes) {
+        throw exceptions::invalid_request_exception(
+            fmt::format("Invalid vector size: expected {} bytes for {} floats, got {} bytes",
+                       wanted_bytes, dimension, actual_bytes));
+    }
+
+    std::vector<float> floats;
+    floats.reserve(dimension);
+    // read_simple handles network byte order (big-endian) conversion; the resulting
+    // 32-bit pattern is then reinterpreted as a float via std::bit_cast.
+    bytes_view remaining(*param);
+    while (floats.size() < dimension) {
+        floats.push_back(std::bit_cast<float>(read_simple<uint32_t>(remaining)));
+    }
+    return floats;
+}
+
+} // namespace detail
+
 namespace {

 // The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
@@ -22,14 +54,14 @@ namespace {

 // You should only use this function if you need to preserve the original vectors and cannot normalize
 // them in advance.
-float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
+float compute_cosine_similarity(std::span<const float> v1, std::span<const float> v2) {
    double dot_product = 0.0;
    double squared_norm_a = 0.0;
    double squared_norm_b = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        double a = v1[i];
+        double b = v2[i];

        dot_product += a * b;
        squared_norm_a += a * a;
@@ -37,7 +69,7 @@ float compute_cosine_similarity(const std::vector<data_value>& v1, const std::ve
    }

    if (squared_norm_a == 0 || squared_norm_b == 0) {
-        throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
+        return std::numeric_limits<float>::quiet_NaN();
    }

    // The cosine similarity is in the range [-1, 1].
@@ -46,12 +78,12 @@ float compute_cosine_similarity(const std::vector<data_value>& v1, const std::ve
    return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
 }

-float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
+float compute_euclidean_similarity(std::span<const float> v1, std::span<const float> v2) {
    double sum = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        double a = v1[i];
+        double b = v2[i];

        double diff = a - b;
        sum += diff * diff;
@@ -65,12 +97,12 @@ float compute_euclidean_similarity(const std::vector<data_value>& v1, const std:

 // Assumes that both vectors are L2-normalized.
 // This similarity is intended as an optimized way to perform cosine similarity calculation.
-float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
+float compute_dot_product_similarity(std::span<const float> v1, std::span<const float> v2) {
    double dot_product = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        double a = v1[i];
+        double b = v2[i];
        dot_product += a * b;
    }

@@ -136,13 +168,15 @@ bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters)
        return std::nullopt;
    }

-    const auto& type = arg_types()[0];
-    data_value v1 = type->deserialize(*parameters[0]);
-    data_value v2 = type->deserialize(*parameters[1]);
-    const auto& v1_elements = value_cast<std::vector<data_value>>(v1);
-    const auto& v2_elements = value_cast<std::vector<data_value>>(v2);
+    // Extract dimension from the vector type
+    const auto& type = static_cast<const vector_type_impl&>(*arg_types()[0]);
+    size_t dimension = type.get_dimension();

-    float result = SIMILARITY_FUNCTIONS.at(_name)(v1_elements, v2_elements);
+    // Optimized path: extract floats directly from bytes, bypassing data_value overhead
+    std::vector<float> v1 = detail::extract_float_vector(parameters[0], dimension);
+    std::vector<float> v2 = detail::extract_float_vector(parameters[1], dimension);
+
+    float result = SIMILARITY_FUNCTIONS.at(_name)(v1, v2);
    return float_type->decompose(result);
 }

--- a/cql3/functions/vector_similarity_fcts.hh
+++ b/cql3/functions/vector_similarity_fcts.hh
@@ -11,6 +11,7 @@
 #include "native_scalar_function.hh"
 #include "cql3/assignment_testable.hh"
 #include "cql3/functions/function_name.hh"
+#include <span>

 namespace cql3 {
 namespace functions {
@@ -19,7 +20,7 @@ static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::nati
 static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
 static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");

-using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
+using similarity_function_t = float (*)(std::span<const float>, std::span<const float>);
 extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;

 std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
@@ -33,5 +34,14 @@ public:
    virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
 };

+namespace detail {
+
+// Extract float vector directly from serialized bytes, bypassing data_value overhead.
+// This is an internal API exposed for testing purposes.
+// Vector<float, N> wire format: N floats as big-endian uint32_t values, 4 bytes each.
+std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension);
+
+} // namespace detail
+
 } // namespace functions
 } // namespace cql3
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -48,8 +48,10 @@ const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono

 struct query_processor::remote {
    remote(service::migration_manager& mm, service::mapreduce_service& fwd,
-           service::storage_service& ss, service::raft_group0_client& group0_client)
+           service::storage_service& ss, service::raft_group0_client& group0_client,
+           service::strong_consistency::coordinator& _sc_coordinator)
            : mm(mm), mapreducer(fwd), ss(ss), group0_client(group0_client)
+            , sc_coordinator(_sc_coordinator)
            , gate("query_processor::remote")
    {}

@@ -57,6 +59,7 @@ struct query_processor::remote {
    service::mapreduce_service& mapreducer;
    service::storage_service& ss;
    service::raft_group0_client& group0_client;
+    service::strong_consistency::coordinator& sc_coordinator;

    seastar::named_gate gate;
 };
@@ -514,9 +517,16 @@ query_processor::~query_processor() {
    }
 }

+std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder>
+query_processor::acquire_strongly_consistent_coordinator() {
+    auto [remote_ref, gate_holder] = remote(); // remote state plus the gate holder returned with it
+    return {remote_ref.get().sc_coordinator, std::move(gate_holder)}; // the holder is handed on to the caller
+}
+
 void query_processor::start_remote(service::migration_manager& mm, service::mapreduce_service& mapreducer,
-                                   service::storage_service& ss, service::raft_group0_client& group0_client) {
-    _remote = std::make_unique<struct remote>(mm, mapreducer, ss, group0_client);
+                                   service::storage_service& ss, service::raft_group0_client& group0_client,
+                                   service::strong_consistency::coordinator& sc_coordinator) {
+    _remote = std::make_unique<struct remote>(mm, mapreducer, ss, group0_client, sc_coordinator);
 }

 future<> query_processor::stop_remote() {
@@ -860,6 +870,7 @@ struct internal_query_state {
    sstring query_string;
    std::unique_ptr<query_options> opts;
    statements::prepared_statement::checked_weak_ptr p;
+    std::optional<service::query_state> qs;
    bool more_results = true;
 };

@@ -867,10 +878,14 @@ internal_query_state query_processor::create_paged_state(
        const sstring& query_string,
        db::consistency_level cl,
        const data_value_list& values,
-        int32_t page_size) {
+        int32_t page_size,
+        std::optional<service::query_state> qs) {
    auto p = prepare_internal(query_string);
    auto opts = make_internal_options(p, values, cl, page_size);
-    return internal_query_state{query_string, std::make_unique<cql3::query_options>(std::move(opts)), std::move(p), true};
+    if (!qs) {
+        qs.emplace(query_state_for_internal_call());
+    }
+    return internal_query_state{query_string, std::make_unique<cql3::query_options>(std::move(opts)), std::move(p), std::move(qs), true};
 }

 bool query_processor::has_more_results(cql3::internal_query_state& state) const {
@@ -893,9 +908,8 @@ future<> query_processor::for_each_cql_result(
 future<::shared_ptr<untyped_result_set>>
 query_processor::execute_paged_internal(internal_query_state& state) {
    state.p->statement->validate(*this, service::client_state::for_internal_calls());
-    auto qs = query_state_for_internal_call();
    ::shared_ptr<cql_transport::messages::result_message> msg =
-      co_await state.p->statement->execute(*this, qs, *state.opts, std::nullopt);
+      co_await state.p->statement->execute(*this, *state.qs, *state.opts, std::nullopt);

    class visitor : public result_message::visitor_base {
        internal_query_state& _state;
@@ -1202,8 +1216,9 @@ future<> query_processor::query_internal(
        db::consistency_level cl,
        const data_value_list& values,
        int32_t page_size,
-        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f) {
-    auto query_state = create_paged_state(query_string, cl, values, page_size);
+        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f,
+        std::optional<service::query_state> qs) {
+    auto query_state = create_paged_state(query_string, cl, values, page_size, std::move(qs));
    co_return co_await for_each_cql_result(query_state, std::move(f));
 }

--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -44,6 +44,10 @@ class query_state;
 class mapreduce_service;
 class raft_group0_client;

+namespace strong_consistency {
+class coordinator;
+}
+
 namespace broadcast_tables {
 struct query;
 }
@@ -155,7 +159,8 @@ public:
    ~query_processor();

    void start_remote(service::migration_manager&, service::mapreduce_service&,
-                      service::storage_service& ss, service::raft_group0_client&);
+                      service::storage_service& ss, service::raft_group0_client&,
+                      service::strong_consistency::coordinator&);
    future<> stop_remote();

    data_dictionary::database db() {
@@ -174,6 +179,9 @@ public:
        return _proxy;
    }

+    std::pair<std::reference_wrapper<service::strong_consistency::coordinator>, gate::holder>
+    acquire_strongly_consistent_coordinator();
+
    cql_stats& get_cql_stats() {
        return _cql_stats;
    }
@@ -322,6 +330,7 @@ public:
     * page_size - maximum page size
     * f - a function to be run on each row of the query result,
     *     if the function returns stop_iteration::yes the iteration will stop
+     * qs - optional query state; when std::nullopt (the default), a query state for internal calls is used
     *
     * \note This function is optimized for convenience, not performance.
     */
@@ -330,7 +339,8 @@ public:
            db::consistency_level cl,
            const data_value_list& values,
            int32_t page_size,
-            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f);
+            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)> f,
+            std::optional<service::query_state> qs = std::nullopt);

    /*
     * \brief iterate over all cql results using paging
@@ -499,7 +509,8 @@ private:
            const sstring& query_string,
            db::consistency_level,
            const data_value_list& values,
-            int32_t page_size);
+            int32_t page_size,
+            std::optional<service::query_state> qs = std::nullopt);

    /*!
     * \brief run a query using paging
--- a/cql3/result_set.cc
+++ b/cql3/result_set.cc
@@ -46,6 +46,13 @@ void metadata::add_non_serialized_column(lw_shared_ptr<column_specification> nam
    _column_info->_names.emplace_back(std::move(name));
 }

+void metadata::hide_last_column() {
+    if (!_column_info->_column_count) { // no counted column left to hide: report an internal error
+        utils::on_internal_error("Trying to hide a column when there are no columns visible.");
+    }
+    --_column_info->_column_count; // drop the last column from the counted columns
+}
+
 void metadata::set_paging_state(lw_shared_ptr<const service::pager::paging_state> paging_state) {
    _flags.set<flag::HAS_MORE_PAGES>();
    _paging_state = std::move(paging_state);
--- a/cql3/result_set.hh
+++ b/cql3/result_set.hh
@@ -73,6 +73,7 @@ public:
    uint32_t value_count() const;

    void add_non_serialized_column(lw_shared_ptr<column_specification> name);
+    void hide_last_column();

 public:
    void set_paging_state(lw_shared_ptr<const service::pager::paging_state> paging_state);
--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -225,10 +225,9 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
            //    The second hyphen is not really true because currently topological changes can
            //    disturb it (see scylladb/scylladb#23345), but we ignore that.
            locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
-        } catch (const std::exception& e) {
+        } catch (const std::invalid_argument& e) {
            if (replica::database::enforce_rf_rack_validity_for_keyspace(qp.db().get_config(), *ks_md)) {
-                // There's no guarantee what the type of the exception will be, so we need to
-                // wrap it manually here in a type that can be passed to the user.
+                // Wrap the exception manually here in a type that can be passed to the user.
                throw exceptions::invalid_request_exception(e.what());
            } else {
                // Even when RF-rack-validity is not enforced for the keyspace, we'd
--- a/cql3/statements/strongly_consistent_modification_statement.cc
+++ b/cql3/statements/strongly_consistent_modification_statement.cc
@@ -9,7 +9,7 @@
 */


-#include "cql3/statements/strongly_consistent_modification_statement.hh"
+#include "cql3/statements/broadcast_modification_statement.hh"

 #include <optional>

@@ -28,11 +28,11 @@

 namespace cql3 {

-static logging::logger logger("strongly_consistent_modification_statement");
+static logging::logger logger("broadcast_modification_statement");

 namespace statements {

-strongly_consistent_modification_statement::strongly_consistent_modification_statement(
+broadcast_modification_statement::broadcast_modification_statement(
    uint32_t bound_terms,
    schema_ptr schema,
    broadcast_tables::prepared_update query)
@@ -43,7 +43,7 @@ strongly_consistent_modification_statement::strongly_consistent_modification_sta
 { }

 future<::shared_ptr<cql_transport::messages::result_message>>
-strongly_consistent_modification_statement::execute(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
+broadcast_modification_statement::execute(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
    return execute_without_checking_exception_message(qp, qs, options, std::move(guard))
            .then(cql_transport::messages::propagate_exception_as_future<shared_ptr<cql_transport::messages::result_message>>);
 }
@@ -63,7 +63,7 @@ evaluate_prepared(
 }

 future<::shared_ptr<cql_transport::messages::result_message>>
-strongly_consistent_modification_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
+broadcast_modification_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
    if (this_shard_id() != 0) {
        co_return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(0, cql3::computed_function_values{});
    }
@@ -103,11 +103,11 @@ strongly_consistent_modification_statement::execute_without_checking_exception_m
    ), result);
 }

-uint32_t strongly_consistent_modification_statement::get_bound_terms() const {
+uint32_t broadcast_modification_statement::get_bound_terms() const {
    return _bound_terms;
 }

-future<> strongly_consistent_modification_statement::check_access(query_processor& qp, const service::client_state& state) const {
+future<> broadcast_modification_statement::check_access(query_processor& qp, const service::client_state& state) const {
    auto f = state.has_column_family_access(_schema->ks_name(), _schema->cf_name(), auth::permission::MODIFY);
    if (_query.value_condition.has_value()) {
        f = f.then([this, &state] {
@@ -117,7 +117,7 @@ future<> strongly_consistent_modification_statement::check_access(query_processo
    return f;
 }

-bool strongly_consistent_modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+bool broadcast_modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return _schema->ks_name() == ks_name && (!cf_name || _schema->cf_name() == *cf_name);
 }

--- a/cql3/statements/strongly_consistent_modification_statement.hh
+++ b/cql3/statements/strongly_consistent_modification_statement.hh
@@ -27,13 +27,13 @@ struct prepared_update {

 }

-class strongly_consistent_modification_statement : public cql_statement_opt_metadata {
+class broadcast_modification_statement : public cql_statement_opt_metadata {
    const uint32_t _bound_terms;
    const schema_ptr _schema;
    const broadcast_tables::prepared_update _query;

 public:
-    strongly_consistent_modification_statement(uint32_t bound_terms, schema_ptr schema, broadcast_tables::prepared_update query);
+    broadcast_modification_statement(uint32_t bound_terms, schema_ptr schema, broadcast_tables::prepared_update query);

    virtual future<::shared_ptr<cql_transport::messages::result_message>>
    execute(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const override;
--- a/cql3/statements/strongly_consistent_select_statement.cc
+++ b/cql3/statements/strongly_consistent_select_statement.cc
@@ -9,7 +9,7 @@
 */


-#include "cql3/statements/strongly_consistent_select_statement.hh"
+#include "cql3/statements/broadcast_select_statement.hh"

 #include <seastar/core/future.hh>
 #include <seastar/core/on_internal_error.hh>
@@ -24,7 +24,7 @@ namespace cql3 {

 namespace statements {

-static logging::logger logger("strongly_consistent_select_statement");
+static logging::logger logger("broadcast_select_statement");

 static
 expr::expression get_key(const cql3::expr::expression& partition_key_restrictions) {
@@ -58,7 +58,7 @@ bool is_selecting_only_value(const cql3::selection::selection& selection) {
           selection.get_columns()[0]->name() == "value";
 }

-strongly_consistent_select_statement::strongly_consistent_select_statement(schema_ptr schema, uint32_t bound_terms,
+broadcast_select_statement::broadcast_select_statement(schema_ptr schema, uint32_t bound_terms,
                                                                           lw_shared_ptr<const parameters> parameters,
                                                                           ::shared_ptr<selection::selection> selection,
                                                                           ::shared_ptr<const restrictions::statement_restrictions> restrictions,
@@ -73,7 +73,7 @@ strongly_consistent_select_statement::strongly_consistent_select_statement(schem
      _query{prepare_query()}
 { }

-broadcast_tables::prepared_select strongly_consistent_select_statement::prepare_query() const {
+broadcast_tables::prepared_select broadcast_select_statement::prepare_query() const {
    if (!is_selecting_only_value(*_selection)) {
        throw service::broadcast_tables::unsupported_operation_error("only 'value' selector is allowed");
    }
@@ -94,7 +94,7 @@ evaluate_prepared(
 }

 future<::shared_ptr<cql_transport::messages::result_message>>
-strongly_consistent_select_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
+broadcast_select_statement::execute_without_checking_exception_message(query_processor& qp, service::query_state& qs, const query_options& options, std::optional<service::group0_guard> guard) const {
    if (this_shard_id() != 0) {
        co_return ::make_shared<cql_transport::messages::result_message::bounce_to_shard>(0, cql3::computed_function_values{});
    }
--- a/cql3/statements/strongly_consistent_select_statement.hh
+++ b/cql3/statements/strongly_consistent_select_statement.hh
@@ -25,12 +25,12 @@ struct prepared_select {

 }

-class strongly_consistent_select_statement : public select_statement {
+class broadcast_select_statement : public select_statement {
    const broadcast_tables::prepared_select _query;

    broadcast_tables::prepared_select prepare_query() const;
 public:
-    strongly_consistent_select_statement(schema_ptr schema,
+    broadcast_select_statement(schema_ptr schema,
                     uint32_t bound_terms,
                     lw_shared_ptr<const parameters> parameters,
                     ::shared_ptr<selection::selection> selection,
--- a/cql3/statements/create_keyspace_statement.cc
+++ b/cql3/statements/create_keyspace_statement.cc
@@ -123,10 +123,9 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, utils::chun
            // We hold a group0_guard, so it's correct to check this here.
            // The topology or schema cannot change while we're performing this query.
            locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
-        } catch (const std::exception& e) {
+        } catch (const std::invalid_argument& e) {
            if (replica::database::enforce_rf_rack_validity_for_keyspace(cfg, *ksm)) {
-                // There's no guarantee what the type of the exception will be, so we need to
-                // wrap it manually here in a type that can be passed to the user.
+                // Wrap the exception in a type that can be passed to the user.
                throw exceptions::invalid_request_exception(e.what());
            } else {
                // Even when RF-rack-validity is not enforced for the keyspace, we'd
--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -31,8 +31,6 @@
 #include "db/config.hh"
 #include "compaction/time_window_compaction_strategy.hh"

-bool is_internal_keyspace(std::string_view name);
-
 namespace cql3 {

 namespace statements {
@@ -124,10 +122,6 @@ void create_table_statement::apply_properties_to(schema_builder& builder, const
        addColumnMetadataFromAliases(cfmd, Collections.singletonList(valueAlias), defaultValidator, ColumnDefinition.Kind.COMPACT_VALUE);
 #endif

-    if (!_properties->get_compression_options() && !is_internal_keyspace(keyspace())) {
-        builder.set_compressor_params(db.get_config().sstable_compression_user_table_options());
-    }
-
    _properties->apply_to_builder(builder, _properties->make_schema_extensions(db.extensions()), db, keyspace(), true);
 }

--- a/cql3/statements/describe_statement.cc
+++ b/cql3/statements/describe_statement.cc
@@ -23,6 +23,7 @@
 #include "index/vector_index.hh"
 #include "schema/schema.hh"
 #include "service/client_state.hh"
+#include "service/paxos/paxos_state.hh"
 #include "types/types.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/cql_statement.hh"
@@ -329,6 +330,19 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
                "*/",
                *table_desc.create_statement);

+        table_desc.create_statement = std::move(os).to_managed_string();
+    } else if (service::paxos::paxos_store::try_get_base_table(name)) {
+        // A Paxos state table is managed internally by Scylla and shouldn't be exposed to the user.
+        // It may still be described, as a comment, to ease administrative work, but it is hidden from all listings.
+        fragmented_ostringstream os{};
+
+        fmt::format_to(os.to_iter(),
+                "/* Do NOT execute this statement! It's only for informational purposes.\n"
+                "   A paxos state table is created automatically when enabling LWT on a base table.\n"
+                "\n{}\n"
+                "*/",
+                *table_desc.create_statement);
+
        table_desc.create_statement = std::move(os).to_managed_string();
    }
    result.push_back(std::move(table_desc));
@@ -364,7 +378,7 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
 future<std::vector<description>> tables(const data_dictionary::database& db, const lw_shared_ptr<keyspace_metadata>& ks, std::optional<bool> with_internals = std::nullopt) {
    auto& replica_db = db.real_database();
    auto tables = ks->tables() | std::views::filter([&replica_db] (const schema_ptr& s) {
-        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name());
+        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name()) && !service::paxos::paxos_store::try_get_base_table(s->cf_name());
    }) | std::ranges::to<std::vector<schema_ptr>>();
    std::ranges::sort(tables, std::ranges::less(), std::mem_fn(&schema::cf_name));

--- a/cql3/statements/ks_prop_defs.cc
+++ b/cql3/statements/ks_prop_defs.cc
@@ -98,6 +98,7 @@ static locator::replication_strategy_config_options prepare_options(
        const sstring& strategy_class,
        const locator::token_metadata& tm,
        bool rf_rack_valid_keyspaces,
+        bool enforce_rack_list,
        locator::replication_strategy_config_options options,
        const locator::replication_strategy_config_options& old_options,
        bool rack_list_enabled,
@@ -107,7 +108,7 @@ static locator::replication_strategy_config_options prepare_options(
    auto is_nts = locator::abstract_replication_strategy::to_qualified_class_name(strategy_class) == "org.apache.cassandra.locator.NetworkTopologyStrategy";
    auto is_alter = !old_options.empty();
    const auto& all_dcs = tm.get_datacenter_racks_token_owners();
-    auto auto_expand_racks = uses_tablets && rf_rack_valid_keyspaces && rack_list_enabled;
+    auto auto_expand_racks = uses_tablets && rack_list_enabled && (rf_rack_valid_keyspaces || enforce_rack_list);

    logger.debug("prepare_options: {}: is_nts={} auto_expand_racks={} rack_list_enabled={} old_options={} new_options={} all_dcs={}",
                 strategy_class, is_nts, auto_expand_racks, rack_list_enabled, old_options, options, all_dcs);
@@ -417,7 +418,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(s
    auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
    bool uses_tablets = initial_tablets.has_value();
    bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
-    auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
+    auto options = prepare_options(sc, tm, cfg.rf_rack_valid_keyspaces(), cfg.enforce_rack_list(), get_replication_options(), {}, rack_list_enabled, uses_tablets);
    return data_dictionary::keyspace_metadata::new_keyspace(ks_name, sc,
            std::move(options), initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
 }
@@ -434,7 +435,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
    auto sc = get_replication_strategy_class();
    bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
    if (sc) {
-        options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
+        options = prepare_options(*sc, tm, cfg.rf_rack_valid_keyspaces(), cfg.enforce_rack_list(), get_replication_options(), old_options, rack_list_enabled, uses_tablets);
    } else {
        sc = old->strategy_name();
        options = old_options;
--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -11,7 +11,7 @@
 #include "utils/assert.hh"
 #include "cql3/cql_statement.hh"
 #include "cql3/statements/modification_statement.hh"
-#include "cql3/statements/strongly_consistent_modification_statement.hh"
+#include "cql3/statements/broadcast_modification_statement.hh"
 #include "cql3/statements/raw/modification_statement.hh"
 #include "cql3/statements/prepared_statement.hh"
 #include "cql3/expr/expr-utils.hh"
@@ -29,6 +29,8 @@
 #include "cql3/query_processor.hh"
 #include "service/storage_proxy.hh"
 #include "service/broadcast_tables/experimental/lang.hh"
+#include "cql3/statements/strong_consistency/modification_statement.hh"
+#include "cql3/statements/strong_consistency/statement_helpers.hh"

 #include <boost/lexical_cast.hpp>

@@ -546,7 +548,7 @@ modification_statement::process_where_clause(data_dictionary::database db, expr:
    }
 }

-::shared_ptr<strongly_consistent_modification_statement>
+::shared_ptr<broadcast_modification_statement>
 modification_statement::prepare_for_broadcast_tables() const {
    // FIXME: implement for every type of `modification_statement`.
    throw service::broadcast_tables::unsupported_operation_error{};
@@ -554,24 +556,27 @@ modification_statement::prepare_for_broadcast_tables() const {

 namespace raw {

-::shared_ptr<cql_statement_opt_metadata>
-modification_statement::prepare_statement(data_dictionary::database db, prepare_context& ctx, cql_stats& stats) {
-    ::shared_ptr<cql3::statements::modification_statement> statement = prepare(db, ctx, stats);
-
-    if (service::broadcast_tables::is_broadcast_table_statement(keyspace(), column_family())) {
-        return statement->prepare_for_broadcast_tables();
-    } else {
-        return statement;
-    }
-}
-
 std::unique_ptr<prepared_statement>
 modification_statement::prepare(data_dictionary::database db, cql_stats& stats) {
    schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
    auto meta = get_prepare_context();
-    auto statement = prepare_statement(db, meta, stats);
+
+    auto statement = std::invoke([&] -> shared_ptr<cql_statement> {
+        auto result = prepare(db, meta, stats);
+
+        if (strong_consistency::is_strongly_consistent(db, schema->ks_name())) {
+            return ::make_shared<strong_consistency::modification_statement>(std::move(result));
+        }
+
+        if (service::broadcast_tables::is_broadcast_table_statement(keyspace(), column_family())) {
+            return result->prepare_for_broadcast_tables();
+        }
+        return result;
+    });
+
    auto partition_key_bind_indices = meta.get_partition_key_bind_indexes(*schema);
-    return std::make_unique<prepared_statement>(audit_info(), std::move(statement), meta, std::move(partition_key_bind_indices));
+    return std::make_unique<prepared_statement>(audit_info(), std::move(statement), meta, 
+        std::move(partition_key_bind_indices));
 }

 ::shared_ptr<cql3::statements::modification_statement>
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -30,7 +30,7 @@ class operation;

 namespace statements {

-class strongly_consistent_modification_statement;
+class broadcast_modification_statement;

 namespace raw { class modification_statement; }

@@ -113,15 +113,15 @@ public:

    virtual void add_update_for_key(mutation& m, const query::clustering_range& range, const update_parameters& params, const json_cache_opt& json_cache) const = 0;

-    virtual uint32_t get_bound_terms() const override;
+    uint32_t get_bound_terms() const override;

-    virtual const sstring& keyspace() const;
+    const sstring& keyspace() const;

-    virtual const sstring& column_family() const;
+    const sstring& column_family() const;

-    virtual bool is_counter() const;
+    bool is_counter() const;

-    virtual bool is_view() const;
+    bool is_view() const;

    int64_t get_timestamp(int64_t now, const query_options& options) const;

@@ -129,12 +129,12 @@ public:

    std::optional<gc_clock::duration> get_time_to_live(const query_options& options) const;

-    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
+    future<> check_access(query_processor& qp, const service::client_state& state) const override;

    // Validate before execute, using client state and current schema
    void validate(query_processor&, const service::client_state& state) const override;

-    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    void add_operation(::shared_ptr<operation> op);

@@ -256,7 +256,9 @@ public:

    virtual json_cache_opt maybe_prepare_json_cache(const query_options& options) const;

-    virtual ::shared_ptr<strongly_consistent_modification_statement> prepare_for_broadcast_tables() const;
+    virtual ::shared_ptr<broadcast_modification_statement> prepare_for_broadcast_tables() const;
+
+    db::timeout_clock::duration get_timeout(const service::client_state& state, const query_options& options) const;

 protected:
    /**
@@ -264,9 +266,7 @@ protected:
     * processed to check that they are compatible.
     * @throws InvalidRequestException
     */
-    virtual void validate_where_clause_for_conditions() const;
-
-    db::timeout_clock::duration get_timeout(const service::client_state& state, const query_options& options) const;
+    void validate_where_clause_for_conditions() const;

    friend class raw::modification_statement;
 };
--- a/cql3/statements/raw/batch_statement.hh
+++ b/cql3/statements/raw/batch_statement.hh
@@ -50,8 +50,8 @@ public:
 protected:
    virtual audit::statement_category category() const override;
    virtual audit::audit_info_ptr audit_info() const override {
-        // We don't audit batch statements. Instead we audit statements that are inside the batch.
-        return audit::audit::create_no_audit_info();
+        constexpr bool batch = true;
+        return audit::audit::create_audit_info(category(), sstring(), sstring(), batch);
    }
 };

--- a/cql3/statements/raw/modification_statement.hh
+++ b/cql3/statements/raw/modification_statement.hh
@@ -40,7 +40,6 @@ protected:

 public:
    virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
-    ::shared_ptr<cql_statement_opt_metadata> prepare_statement(data_dictionary::database db, prepare_context& ctx, cql_stats& stats);
    ::shared_ptr<cql3::statements::modification_statement> prepare(data_dictionary::database db, prepare_context& ctx, cql_stats& stats) const;
    void add_raw(sstring&& raw) { _raw_cql = std::move(raw); }
    const sstring& get_raw_cql() const { return _raw_cql; }
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -131,8 +131,6 @@ private:

    void verify_ordering_is_valid(const prepared_orderings_type&, const schema&, const restrictions::statement_restrictions& restrictions) const;

-    prepared_ann_ordering_type prepare_ann_ordering(const schema& schema, prepare_context& ctx, data_dictionary::database db) const;
-
    // Checks whether this ordering reverses all results.
    // We only allow leaving select results unchanged or reversing them.
    bool is_ordering_reversed(const prepared_orderings_type&) const;
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -8,6 +8,8 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

+#include "cql3/statements/strong_consistency/select_statement.hh"
+#include "cql3/statements/strong_consistency/statement_helpers.hh"
 #include "cql3/statements/select_statement.hh"
 #include "cql3/expr/expression.hh"
 #include "cql3/expr/evaluate.hh"
@@ -16,7 +18,7 @@
 #include "cql3/statements/raw/select_statement.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/statements/prune_materialized_view_statement.hh"
-#include "cql3/statements/strongly_consistent_select_statement.hh"
+#include "cql3/statements/broadcast_select_statement.hh"

 #include "exceptions/exceptions.hh"
 #include <seastar/core/future.hh>
@@ -25,12 +27,14 @@
 #include "service/broadcast_tables/experimental/lang.hh"
 #include "service/qos/qos_common.hh"
 #include "transport/messages/result_message.hh"
+#include "cql3/functions/functions.hh"
 #include "cql3/functions/as_json_function.hh"
 #include "cql3/selection/selection.hh"
 #include "cql3/util.hh"
 #include "cql3/restrictions/statement_restrictions.hh"
 #include "index/secondary_index.hh"
 #include "types/vector.hh"
+#include "vector_search/filter.hh"
 #include "validation.hh"
 #include "exceptions/unrecognized_entity_exception.hh"
 #include <optional>
@@ -255,11 +259,9 @@ uint32_t select_statement::get_bound_terms() const {

 future<> select_statement::check_access(query_processor& qp, const service::client_state& state) const {
    try {
-        const data_dictionary::database db = qp.db();
-        auto&& s = db.find_schema(keyspace(), column_family());
-        auto cdc = db.get_cdc_base_table(*s);
-        auto& cf_name = s->is_view()
-            ? s->view_info()->base_name()
+        auto cdc = qp.db().get_cdc_base_table(*_schema);
+        auto& cf_name = _schema->is_view()
+            ? _schema->view_info()->base_name()
            : (cdc ? cdc->cf_name() : column_family());
        const schema_ptr& base_schema = cdc ? cdc : _schema;
        bool is_vector_indexed = secondary_index::vector_index::has_vector_index(*base_schema);
@@ -368,8 +370,9 @@ uint64_t select_statement::get_inner_loop_limit(uint64_t limit, bool is_aggregat
 }

 bool select_statement::needs_post_query_ordering() const {
-    // We need post-query ordering only for queries with IN on the partition key and an ORDER BY.
-    return _restrictions->key_is_in_relation() && !_parameters->orderings().empty();
+    // We need post-query ordering for queries with IN on the partition key and an ORDER BY,
+    // and for ANN index queries with rescoring.
+    return static_cast<bool>(_ordering_comparator);
 }

 struct select_statement_executor {
@@ -1958,14 +1961,46 @@ mutation_fragments_select_statement::do_execute(query_processor& qp, service::qu
            }));
 }

-::shared_ptr<cql3::statements::select_statement> vector_indexed_table_select_statement::prepare(data_dictionary::database db, schema_ptr schema,
-        uint32_t bound_terms, lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
-        ::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
-        ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
-        std::optional<expr::expression> per_partition_limit, cql_stats& stats, std::unique_ptr<attributes> attrs) {
+struct ann_ordering_info { // result type of get_ann_ordering_info()
+    secondary_index::index _index; // index picked from the table's index list for this ordering
+    raw::select_statement::prepared_ann_ordering_type _prepared_ann_ordering; // (column definition, prepared ANN vector expression)
+    bool is_rescoring_enabled; // vector_index::is_rescoring_enabled() evaluated on the index options
+};
+
+static std::optional<ann_ordering_info> get_ann_ordering_info(
+        data_dictionary::database db,
+        schema_ptr schema,
+        lw_shared_ptr<const raw::select_statement::parameters> parameters,
+        prepare_context& ctx) {
+
+    if (parameters->orderings().empty()) {
+        return std::nullopt;
+    }
+
+    auto [column_id, ordering] = parameters->orderings().front();
+    const auto& ann_vector = std::get_if<raw::select_statement::ann_vector>(&ordering);
+    if (!ann_vector) {
+        return std::nullopt;
+    }
+
+    ::shared_ptr<column_identifier> column = column_id->prepare_column_identifier(*schema);
+    const column_definition* def = schema->get_column_definition(column->name());
+    if (!def) {
+        throw exceptions::invalid_request_exception(
+                fmt::format("Undefined column name {}", column->text()));
+    }
+
+    if (!def->type->is_vector() || static_cast<const vector_type_impl*>(def->type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
+        throw exceptions::invalid_request_exception("ANN ordering is only supported on float vector indexes");
+    }
+
+    auto e =  expr::prepare_expression(*ann_vector, db, schema->ks_name(), nullptr, def->column_specification);
+    expr::fill_prepare_context(e, ctx);
+
+    raw::select_statement::prepared_ann_ordering_type prepared_ann_ordering = std::make_pair(std::move(def), std::move(e));
+
    auto cf = db.find_column_family(schema);
    auto& sim = cf.get_index_manager();
-    auto [index_opt, _] = restrictions->find_idx(sim);

    auto indexes = sim.list_indexes();
    auto it = std::find_if(indexes.begin(), indexes.end(), [&prepared_ann_ordering](const auto& ind) {
@@ -1977,27 +2012,90 @@ mutation_fragments_select_statement::do_execute(query_processor& qp, service::qu
    if (it == indexes.end()) {
        throw exceptions::invalid_request_exception("ANN ordering by vector requires the column to be indexed using 'vector_index'");
    }
-    
-    index_opt = *it;

-    if (!index_opt) {
-        throw std::runtime_error("No index found.");
+    return ann_ordering_info{
+        *it,
+        std::move(prepared_ann_ordering),
+        secondary_index::vector_index::is_rescoring_enabled(it->metadata().options())
+    };
+}
+
+static uint32_t add_similarity_function_to_selectors(
+        std::vector<selection::prepared_selector>& prepared_selectors,
+        const ann_ordering_info& ann_ordering_info,
+        data_dictionary::database db,
+        schema_ptr schema) { // appends a similarity-function call selector and returns its index
+    auto similarity_function_name = secondary_index::vector_index::get_cql_similarity_function_name(ann_ordering_info._index.metadata().options());
+    // Build the native function name for the similarity function named by the index options
+    auto func_name = functions::function_name::native_function(sstring(similarity_function_name));
+
+    // Function arguments: the ANN-ordered column's value, then the prepared ANN vector expression
+    std::vector<expr::expression> args;
+    args.push_back(expr::column_value(ann_ordering_info._prepared_ann_ordering.first));
+    args.push_back(ann_ordering_info._prepared_ann_ordering.second);
+
+    // Look up the function object, presenting both arguments as assignment-testable values of their own types
+    std::vector<shared_ptr<assignment_testable>> provided_args;
+    provided_args.push_back(expr::as_assignment_testable(args[0], expr::type_of(args[0])));
+    provided_args.push_back(expr::as_assignment_testable(args[1], expr::type_of(args[1])));
+
+    auto func = cql3::functions::instance().get(db, schema->ks_name(), func_name, provided_args, schema->ks_name(), schema->cf_name(), nullptr);
+
+    // Create the function call expression; args is moved from here and must not be used afterwards
+    expr::function_call similarity_func_call{
+        .func = func,
+        .args = std::move(args),
+    };
+
+    // Append the call as the last prepared selector, without an alias
+    prepared_selectors.push_back(selection::prepared_selector{
+        .expr = std::move(similarity_func_call),
+        .alias = nullptr,
+    });
+    return prepared_selectors.size() - 1; // index of the selector appended above
+}
+
+static select_statement::ordering_comparator_type get_similarity_ordering_comparator(std::vector<selection::prepared_selector>& prepared_selectors, uint32_t similarity_column_index) {
+    auto type = expr::type_of(prepared_selectors[similarity_column_index].expr); // type of the selector at the given index
+    if (type->get_kind() != abstract_type::kind::float_kind) {
+        seastar::on_internal_error(logger, "Similarity function must return float type.");
     }
+    return [similarity_column_index, type] (const raw::select_statement::result_row_type& r1, const raw::select_statement::result_row_type& r2) { // "r1 sorts before r2" predicate
+        auto& c1 = r1[similarity_column_index];
+        auto& c2 = r2[similarity_column_index];
+        auto f1 = c1 ? value_cast<float>(type->deserialize(*c1)) : std::numeric_limits<float>::quiet_NaN(); // absent cell is treated as NaN
+        auto f2 = c2 ? value_cast<float>(type->deserialize(*c2)) : std::numeric_limits<float>::quiet_NaN(); // same for the second row
+        if (std::isfinite(f1) && std::isfinite(f2)) {
+            return f1 > f2; // descending: the larger similarity value sorts first
+        }
+        return std::isfinite(f1); // a finite value sorts before a non-finite one; two non-finite values compare equivalent
+    };
+}
+
+::shared_ptr<cql3::statements::select_statement> vector_indexed_table_select_statement::prepare(data_dictionary::database db, schema_ptr schema,
+        uint32_t bound_terms, lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
+        ::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
+        ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
+        std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<attributes> attrs) {
+
+    auto prepared_filter = vector_search::prepare_filter(*restrictions, parameters->allow_filtering());

    return ::make_shared<cql3::statements::vector_indexed_table_select_statement>(schema, bound_terms, parameters, std::move(selection), std::move(restrictions),
            std::move(group_by_cell_indices), is_reversed, std::move(ordering_comparator), std::move(prepared_ann_ordering), std::move(limit),
-            std::move(per_partition_limit), stats, *index_opt, std::move(attrs));
+            std::move(per_partition_limit), stats, index, std::move(prepared_filter), std::move(attrs));
 }

 vector_indexed_table_select_statement::vector_indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms, lw_shared_ptr<const parameters> parameters,
        ::shared_ptr<selection::selection> selection, ::shared_ptr<const restrictions::statement_restrictions> restrictions,
        ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed, ordering_comparator_type ordering_comparator,
        prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
-        std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<attributes> attrs)
+        std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index,
+        vector_search::prepared_filter prepared_filter, std::unique_ptr<attributes> attrs)
    : select_statement{schema, bound_terms, parameters, selection, restrictions, group_by_cell_indices, is_reversed, ordering_comparator, limit,
              per_partition_limit, stats, std::move(attrs)}
    , _index{index}
-    , _prepared_ann_ordering(std::move(prepared_ann_ordering)) {
+    , _prepared_ann_ordering(std::move(prepared_ann_ordering))
+    , _prepared_filter(std::move(prepared_filter)) {

    if (!limit.has_value()) {
        throw exceptions::invalid_request_exception("Vector ANN queries must have a limit specified");
@@ -2032,13 +2130,19 @@ future<shared_ptr<cql_transport::messages::result_message>> vector_indexed_table

        auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
        auto aoe = abort_on_expiry(timeout);
+        auto filter_json = _prepared_filter.to_json(options);
+        uint64_t fetch = static_cast<uint64_t>(std::ceil(limit * secondary_index::vector_index::get_oversampling(_index.metadata().options())));
        auto pkeys = co_await qp.vector_store_client().ann(
-                _schema->ks_name(), _index.metadata().name(), _schema, get_ann_ordering_vector(options), limit, aoe.abort_source());
+                _schema->ks_name(), _index.metadata().name(), _schema, get_ann_ordering_vector(options), fetch, filter_json, aoe.abort_source());
        if (!pkeys.has_value()) {
            co_await coroutine::return_exception(
                    exceptions::invalid_request_exception(std::visit(vector_search::vector_store_client::ann_error_visitor{}, pkeys.error())));
        }

+        if (pkeys->size() > limit && !secondary_index::vector_index::is_rescoring_enabled(_index.metadata().options())) {
+            pkeys->erase(pkeys->begin() + limit, pkeys->end());
+        }
+
        co_return co_await query_base_table(qp, state, options, pkeys.value(), timeout);
    });

@@ -2055,11 +2159,11 @@ void vector_indexed_table_select_statement::update_stats() const {
 }

 lw_shared_ptr<query::read_command> vector_indexed_table_select_statement::prepare_command_for_base_query(
-        query_processor& qp, service::query_state& state, const query_options& options) const {
+        query_processor& qp, service::query_state& state, const query_options& options, uint64_t fetch_limit) const {
    auto slice = make_partition_slice(options);
    return ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), std::move(slice), qp.proxy().get_max_result_size(slice),
            query::tombstone_limit(qp.proxy().get_tombstone_limit()),
-            query::row_limit(get_inner_loop_limit(get_limit(options, _limit), _selection->is_aggregate())), query::partition_limit(query::max_partitions),
+            query::row_limit(get_inner_loop_limit(fetch_limit, _selection->is_aggregate())), query::partition_limit(query::max_partitions),
            _query_start_time_point, tracing::make_trace_info(state.get_trace_state()), query_id::create_null_id(), query::is_first_page::no,
            options.get_timestamp(state));
 }
@@ -2077,7 +2181,7 @@ std::vector<float> vector_indexed_table_select_statement::get_ann_ordering_vecto
 future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_table_select_statement::query_base_table(query_processor& qp,
        service::query_state& state, const query_options& options, const std::vector<vector_search::primary_key>& pkeys,
        lowres_clock::time_point timeout) const {
-    auto command = prepare_command_for_base_query(qp, state, options);
+    auto command = prepare_command_for_base_query(qp, state, options, pkeys.size());

    // For tables without clustering columns, we can optimize by querying
    // partition ranges instead of individual primary keys, since the
@@ -2116,6 +2220,7 @@ future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_tab
            query::result_merger{command->get_row_limit(), query::max_partitions});

    co_return co_await wrap_result_to_error_message([this, &command, &options](auto result) {
+        command->set_row_limit(get_limit(options, _limit));
        return process_results(std::move(result), command, options, _query_start_time_point);
    })(std::move(result));
 }
@@ -2129,6 +2234,7 @@ future<::shared_ptr<cql_transport::messages::result_message>> vector_indexed_tab
                    {timeout, state.get_permit(), state.get_client_state(), state.get_trace_state(), {}, {}, options.get_specific_options().node_local_only},
                    std::nullopt)
            .then(wrap_result_to_error_message([this, &options, command](service::storage_proxy::coordinator_query_result qr) {
+                command->set_row_limit(get_limit(options, _limit));
                return this->process_results(std::move(qr.query_result), command, options, _query_start_time_point);
            }));
 }
@@ -2223,32 +2329,41 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d

    prepared_selectors = maybe_jsonize_select_clause(std::move(prepared_selectors), db, schema);

-    auto aggregation_depth = 0u;
+    std::optional<ann_ordering_info> ann_ordering_info_opt = get_ann_ordering_info(db, schema, _parameters, ctx);
+    bool is_ann_query = ann_ordering_info_opt.has_value();

-    // Force aggregation if GROUP BY is used. This will wrap every column x as first(x).
-    if (!_group_by_columns.empty()) {
-        aggregation_depth = std::max(aggregation_depth, 1u);
-        if (prepared_selectors.empty()) {
-            // We have a "SELECT * GROUP BY". If we leave prepared_selectors
-            // empty, below we choose selection::wildcard() for SELECT *, and
-            // forget to do the "levellize" trick needed for the GROUP BY.
-            // So we need to set prepared_selectors. See #16531.
-            auto all_columns = selection::selection::wildcard_columns(schema);
-            std::vector<::shared_ptr<selection::raw_selector>> select_all;
-            select_all.reserve(all_columns.size());
-            for (const column_definition *cdef : all_columns) {
-                auto name = ::make_shared<cql3::column_identifier::raw>(cdef->name_as_text(), true);
-                select_all.push_back(::make_shared<selection::raw_selector>(
-                    expr::unresolved_identifier(std::move(name)), nullptr));
-            }
-            prepared_selectors = selection::raw_selector::to_prepared_selectors(select_all, *schema, db, keyspace());
+    if (prepared_selectors.empty() && (!_group_by_columns.empty() || (is_ann_query && ann_ordering_info_opt->is_rescoring_enabled))) {
+        // We have a "SELECT * GROUP BY" or a "SELECT * ORDER BY ANN" with rescoring enabled. If we leave
+        // prepared_selectors empty, below we choose selection::wildcard() for SELECT *, and then either:
+        //  - forget to do the "levellize" trick needed for the GROUP BY (see #16531), or
+        //  - forget to add the similarity function needed for ORDER BY ANN with rescoring (see below).
+        // So we need to set prepared_selectors explicitly to the full list of columns.
+        auto all_columns = selection::selection::wildcard_columns(schema);
+        std::vector<::shared_ptr<selection::raw_selector>> select_all;
+        select_all.reserve(all_columns.size());
+        for (const column_definition *cdef : all_columns) {
+            auto name = ::make_shared<cql3::column_identifier::raw>(cdef->name_as_text(), true);
+            select_all.push_back(::make_shared<selection::raw_selector>(
+                expr::unresolved_identifier(std::move(name)), nullptr));
        }
+        prepared_selectors = selection::raw_selector::to_prepared_selectors(select_all, *schema, db, keyspace());
    }

    for (auto& ps : prepared_selectors) {
        expr::fill_prepare_context(ps.expr, ctx);
    }

+    // Force aggregation if GROUP BY is used. This will wrap every column x as first(x).
+    auto aggregation_depth = _group_by_columns.empty() ? 0u : 1u;
+
+    select_statement::ordering_comparator_type ordering_comparator;
+    bool hide_last_column = false;
+    if (is_ann_query && ann_ordering_info_opt->is_rescoring_enabled) {
+        uint32_t similarity_column_index = add_similarity_function_to_selectors(prepared_selectors, *ann_ordering_info_opt, db, schema);
+        hide_last_column = true;
+        ordering_comparator = get_similarity_ordering_comparator(prepared_selectors, similarity_column_index);
+    }
+
    for (auto& ps : prepared_selectors) {
        aggregation_depth = std::max(aggregation_depth, expr::aggregation_depth(ps.expr));
    }
@@ -2266,6 +2381,11 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
                     ? selection::selection::wildcard(schema)
                     : selection::selection::from_selectors(db, schema, keyspace(), levellized_prepared_selectors);

+    if (is_ann_query && hide_last_column) {
+        // Hide the similarity selector from the client by reducing column_count
+        selection->get_result_metadata()->hide_last_column();
+    }
+
    // Cassandra 5.0.2 disallows PER PARTITION LIMIT with aggregate queries
    // but only if GROUP BY is not used.
    // See #9879 for more details.
@@ -2273,8 +2393,6 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
        throw exceptions::invalid_request_exception("PER PARTITION LIMIT is not allowed with aggregate queries.");
    }

-    bool is_ann_query = !_parameters->orderings().empty() && std::holds_alternative<select_statement::ann_vector>(_parameters->orderings().front().second);
-
    auto restrictions = prepare_restrictions(db, schema, ctx, selection, for_view, _parameters->allow_filtering() || is_ann_query,
            restrictions::check_indexes(!_parameters->is_mutation_fragments()));

@@ -2282,19 +2400,14 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
        validate_distinct_selection(*schema, *selection, *restrictions);
    }

-    select_statement::ordering_comparator_type ordering_comparator;
    bool is_reversed_ = false;

-    std::optional<prepared_ann_ordering_type> prepared_ann_ordering;
-
    auto orderings = _parameters->orderings();

-    if (!orderings.empty()) {
+    if (!orderings.empty() && !is_ann_query) {
        std::visit([&](auto&& ordering) {
            using T = std::decay_t<decltype(ordering)>;
-            if constexpr (std::is_same_v<T, select_statement::ann_vector>) {
-                prepared_ann_ordering = prepare_ann_ordering(*schema, ctx, db);
-            } else {
+            if constexpr (!std::is_same_v<T, select_statement::ann_vector>) {
                SCYLLA_ASSERT(!for_view);
                verify_ordering_is_allowed(*_parameters, *restrictions);
                prepared_orderings_type prepared_orderings = prepare_orderings(*schema);
@@ -2307,7 +2420,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
    }

    std::vector<sstring> warnings;
-    if (!prepared_ann_ordering.has_value()) {
+    if (!is_ann_query) {
        check_needs_filtering(*restrictions, db.get_config().strict_allow_filtering(), warnings);
        ensure_filtering_columns_retrieval(db, *selection, *restrictions);
    }
@@ -2361,7 +2474,21 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
                && restrictions->partition_key_restrictions_size() == schema->partition_key_size());
    };

-    if (_parameters->is_prune_materialized_view()) {
+    if (strong_consistency::is_strongly_consistent(db, schema->ks_name())) {
+        stmt = ::make_shared<strong_consistency::select_statement>(
+                schema,
+                ctx.bound_variables_size(),
+                _parameters,
+                std::move(selection),
+                std::move(restrictions),
+                std::move(group_by_cell_indices),
+                is_reversed_,
+                std::move(ordering_comparator),
+                prepare_limit(db, ctx, _limit),
+                prepare_limit(db, ctx, _per_partition_limit),
+                stats,
+                std::move(prepared_attrs));
+    } else if (_parameters->is_prune_materialized_view()) {
        stmt = ::make_shared<cql3::statements::prune_materialized_view_statement>(
                schema,
                ctx.bound_variables_size(),
@@ -2390,10 +2517,10 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
                prepare_limit(db, ctx, _per_partition_limit),
                stats,
                std::move(prepared_attrs));
-    } else if (prepared_ann_ordering) {
+    } else if (is_ann_query) {
        stmt = vector_indexed_table_select_statement::prepare(db, schema, ctx.bound_variables_size(), _parameters, std::move(selection), std::move(restrictions),
-                std::move(group_by_cell_indices), is_reversed_, std::move(ordering_comparator), std::move(*prepared_ann_ordering),
-                prepare_limit(db, ctx, _limit), prepare_limit(db, ctx, _per_partition_limit), stats, std::move(prepared_attrs));
+                std::move(group_by_cell_indices), is_reversed_, std::move(ordering_comparator), std::move(ann_ordering_info_opt->_prepared_ann_ordering),
+                prepare_limit(db, ctx, _limit), prepare_limit(db, ctx, _per_partition_limit), stats, ann_ordering_info_opt->_index, std::move(prepared_attrs));
    } else if (restrictions->uses_secondary_indexing()) {
        stmt = view_indexed_table_select_statement::prepare(
                db,
@@ -2425,7 +2552,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
            std::move(prepared_attrs)
        );
    } else if (service::broadcast_tables::is_broadcast_table_statement(keyspace(), column_family())) {
-        stmt = ::make_shared<cql3::statements::strongly_consistent_select_statement>(
+        stmt = ::make_shared<cql3::statements::broadcast_select_statement>(
                schema,
                ctx.bound_variables_size(),
                _parameters,
@@ -2615,28 +2742,6 @@ void select_statement::verify_ordering_is_valid(const prepared_orderings_type& o
    }
 }

-select_statement::prepared_ann_ordering_type select_statement::prepare_ann_ordering(const schema& schema, prepare_context& ctx, data_dictionary::database db) const {
-    auto [column_id, ordering] = _parameters->orderings().front();
-    const auto& ann_vector = std::get_if<select_statement::ann_vector>(&ordering);
-    SCYLLA_ASSERT(ann_vector);
-
-    ::shared_ptr<column_identifier> column = column_id->prepare_column_identifier(schema);
-    const column_definition* def = schema.get_column_definition(column->name());
-    if (!def) {
-        throw exceptions::invalid_request_exception(
-                fmt::format("Undefined column name {}", column->text()));
-    }
-
-    if (!def->type->is_vector() || static_cast<const vector_type_impl*>(def->type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
-        throw exceptions::invalid_request_exception("ANN ordering is only supported on float vector indexes");
-    }
-
-    auto e =  expr::prepare_expression(*ann_vector, db, keyspace(), nullptr, def->column_specification);
-    expr::fill_prepare_context(e, ctx);
-
-    return std::make_pair(std::move(def), std::move(e));
-}
-
 select_statement::ordering_comparator_type select_statement::get_ordering_comparator(const prepared_orderings_type& orderings,
    selection::selection& selection,
    const restrictions::statement_restrictions& restrictions) {
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -22,6 +22,7 @@
 #include "locator/host_id.hh"
 #include "service/cas_shard.hh"
 #include "vector_search/vector_store_client.hh"
+#include "vector_search/filter.hh"

 namespace service {
    class client_state;
@@ -362,6 +363,7 @@ private:
 class vector_indexed_table_select_statement : public select_statement {
    secondary_index::index _index;
    prepared_ann_ordering_type _prepared_ann_ordering;
+    vector_search::prepared_filter _prepared_filter;
    mutable gc_clock::time_point _query_start_time_point;

 public:
@@ -371,13 +373,13 @@ public:
            lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
            ::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
            ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
-            std::optional<expr::expression> per_partition_limit, cql_stats& stats, std::unique_ptr<cql3::attributes> attrs);
+            std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<cql3::attributes> attrs);

    vector_indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms, lw_shared_ptr<const parameters> parameters,
            ::shared_ptr<selection::selection> selection, ::shared_ptr<const restrictions::statement_restrictions> restrictions,
            ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed, ordering_comparator_type ordering_comparator,
            prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit, std::optional<expr::expression> per_partition_limit,
-            cql_stats& stats, const secondary_index::index& index, std::unique_ptr<cql3::attributes> attrs);
+            cql_stats& stats, const secondary_index::index& index, vector_search::prepared_filter prepared_filter, std::unique_ptr<cql3::attributes> attrs);

 private:
    future<::shared_ptr<cql_transport::messages::result_message>> do_execute(
@@ -385,7 +387,7 @@ private:

    void update_stats() const;

-    lw_shared_ptr<query::read_command> prepare_command_for_base_query(query_processor& qp, service::query_state& state, const query_options& options) const;
+    lw_shared_ptr<query::read_command> prepare_command_for_base_query(query_processor& qp, service::query_state& state, const query_options& options, uint64_t fetch_limit) const;

    std::vector<float> get_ann_ordering_vector(const query_options& options) const;

--- a/cql3/statements/strong_consistency/modification_statement.cc
+++ b/cql3/statements/strong_consistency/modification_statement.cc
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#include "modification_statement.hh"
+
+#include "transport/messages/result_message.hh"
+#include "cql3/query_processor.hh"
+#include "service/strong_consistency/coordinator.hh"
+#include "cql3/statements/strong_consistency/statement_helpers.hh"
+
+namespace cql3::statements::strong_consistency {
+static logging::logger logger("sc_modification_statement");
+
+modification_statement::modification_statement(shared_ptr<base_statement> statement) // Stores the wrapped base modification statement in _statement.
+    : cql_statement_opt_metadata(&timeout_config::write_timeout) // the timeout_config member handed to the base class is the write timeout
+    , _statement(std::move(statement))
+{
+}
+
+using result_message = cql_transport::messages::result_message;
+
+future<shared_ptr<result_message>> modification_statement::execute(query_processor& qp, service::query_state& qs, 
+    const query_options& options, std::optional<service::group0_guard> guard) const
+{
+    return execute_without_checking_exception_message(qp, qs, options, std::move(guard)) // all real work happens in the unchecked variant
+            .then(cql_transport::messages::propagate_exception_as_future<shared_ptr<result_message>>); // NOTE(review): presumably turns an exception-carrying result message into a failed future — confirm
+}
+
+future<shared_ptr<result_message>> modification_statement::execute_without_checking_exception_message( // Coroutine: applies the wrapped statement through the strongly consistent coordinator.
+        query_processor& qp, service::query_state& qs, const query_options& options,
+        std::optional<service::group0_guard> guard) const // `qs` and `guard` are accepted but not used in this body
+{
+    auto json_cache = base_statement::json_cache_opt{}; // default-constructed; shared by the base-statement helpers called below
+    const auto keys = _statement->build_partition_keys(options, json_cache);
+    if (keys.size() != 1 || !query::is_single_partition(keys[0])) { // must be exactly one range, and that range must be a single partition
+        throw exceptions::invalid_request_exception("Strongly consistent queries can only target a single partition");
+    }
+    if (_statement->requires_read()) { // the callback below builds prefetch_data from the schema alone, so statements needing a read are refused
+        throw exceptions::invalid_request_exception("Strongly consistent updates don't support data prefetch");
+    }
+
+    auto [coordinator, holder] = qp.acquire_strongly_consistent_coordinator(); // NOTE(review): `holder` is not referenced again; presumably held to keep the coordinator usable — confirm
+    const auto mutate_result = co_await coordinator.get().mutate(_statement->s, // mutate() receives the schema, the partition's token and a mutation-building callback
+        keys[0].start()->value().token(),
+        [&](api::timestamp_type ts) { // `ts` becomes the timestamp given to update_parameters
+            const auto prefetch_data = update_parameters::prefetch_data(_statement->s);
+            const auto ttl = _statement->get_time_to_live(options);
+            const auto params = update_parameters(_statement->s, options, ts, ttl, prefetch_data);
+            const auto ranges = _statement->create_clustering_ranges(options, json_cache);
+            auto muts = _statement->apply_updates(keys, ranges, params, json_cache);
+            if (muts.size() != 1) { // anything other than exactly one mutation is reported via on_internal_error
+                on_internal_error(logger, ::format("statement '{}' has unexpected number of mutations {}",
+                    raw_cql_statement, muts.size()));
+            }
+            return std::move(*muts.begin()); // hand the single mutation back to mutate()
+        });
+
+    using namespace service::strong_consistency;
+    if (const auto* redirect = get_if<need_redirect>(&mutate_result)) { // the result holds a need_redirect: delegate to redirect_statement()
+        co_return co_await redirect_statement(qp, options, redirect->target);
+    }
+
+    co_return seastar::make_shared<result_message::void_message>(); // no redirect requested: reply with a void result message
+}
+
+future<> modification_statement::check_access(query_processor& qp, const service::client_state& state) const { // Access checks are those of the wrapped statement.
+    return _statement->check_access(qp, state);
+}
+
+uint32_t modification_statement::get_bound_terms() const { // Reports the wrapped statement's bound-term count.
+    return _statement->get_bound_terms();
+}
+
+bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const { // Dependency check is forwarded unchanged to the wrapped statement.
+    return _statement->depends_on(ks_name, cf_name);
+}
+}
--- a/cql3/statements/strong_consistency/modification_statement.hh
+++ b/cql3/statements/strong_consistency/modification_statement.hh
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include "cql3/cql_statement.hh"
+#include "cql3/expr/expression.hh"
+#include "cql3/statements/modification_statement.hh"
+
+namespace cql3::statements::strong_consistency {
+
+class modification_statement : public cql_statement_opt_metadata { // Wrapper that owns a regular cql3 modification_statement and overrides the hooks declared below.
+    using result_message = cql_transport::messages::result_message;
+    using base_statement = cql3::statements::modification_statement; // the non-strongly-consistent statement type being wrapped
+
+    shared_ptr<base_statement> _statement; // the wrapped statement
+public:
+    modification_statement(shared_ptr<base_statement> statement);
+
+    future<shared_ptr<result_message>> execute(query_processor& qp, service::query_state& state,
+        const query_options& options, std::optional<service::group0_guard> guard) const override;
+
+    future<shared_ptr<result_message>> execute_without_checking_exception_message(query_processor& qp,
+        service::query_state& qs, const query_options& options,
+        std::optional<service::group0_guard> guard) const override;
+
+    future<> check_access(query_processor& qp, const service::client_state& state) const override;
+
+    uint32_t get_bound_terms() const override;
+
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
+};
+
+}
--- a/cql3/statements/strong_consistency/select_statement.cc
+++ b/cql3/statements/strong_consistency/select_statement.cc
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#include "select_statement.hh"
+
+#include "query/query-request.hh"
+#include "cql3/query_processor.hh"
+#include "service/strong_consistency/coordinator.hh"
+#include "cql3/statements/strong_consistency/statement_helpers.hh"
+
+namespace cql3::statements::strong_consistency {
+
+using result_message = cql_transport::messages::result_message;
+
+future<::shared_ptr<result_message>> select_statement::do_execute(query_processor& qp, // Coroutine: runs a single-partition read through the strongly consistent coordinator.
+        service::query_state& state, 
+        const query_options& options) const
+{
+    const auto key_ranges = _restrictions->get_partition_key_ranges(options);
+    if (key_ranges.size() != 1 || !query::is_single_partition(key_ranges[0])) { // must be exactly one range, and that range must be a single partition
+        throw exceptions::invalid_request_exception("Strongly consistent queries can only target a single partition");
+    }
+    const auto now = gc_clock::now(); // the same time point goes into the read command and into process_results()
+    auto read_command = make_lw_shared<query::read_command>(
+        _query_schema->id(),
+        _query_schema->version(),
+        make_partition_slice(options),
+        query::max_result_size(query::result_memory_limiter::maximum_result_size), // fixed maximum, not a value obtained from qp
+        query::tombstone_limit(query::tombstone_limit::max), // fixed maximum, not a value obtained from qp
+        query::row_limit(get_inner_loop_limit(get_limit(options, _limit), _selection->is_aggregate())),
+        query::partition_limit(query::max_partitions),
+        now,
+        tracing::make_trace_info(state.get_trace_state()),
+        query_id::create_null_id(),
+        query::is_first_page::no,
+        options.get_timestamp(state));
+    const auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options); // absolute deadline handed to the coordinator
+    auto [coordinator, holder] = qp.acquire_strongly_consistent_coordinator(); // NOTE(review): `holder` is not referenced again; presumably held to keep the coordinator usable — confirm
+    auto query_result = co_await coordinator.get().query(_query_schema, *read_command,
+        key_ranges, state.get_trace_state(), timeout);
+
+    using namespace service::strong_consistency;
+    if (const auto* redirect = get_if<need_redirect>(&query_result)) { // the result holds a need_redirect: delegate to redirect_statement()
+        co_return co_await redirect_statement(qp, options, redirect->target);
+    }
+
+    co_return co_await process_results(get<lw_shared_ptr<query::result>>(std::move(query_result)), // otherwise the variant holds the query result itself
+        read_command, options, now);
+}
+
+}
--- a/cql3/statements/strong_consistency/select_statement.hh
+++ b/cql3/statements/strong_consistency/select_statement.hh
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include "cql3/cql_statement.hh"
+#include "cql3/statements/select_statement.hh"
+
+namespace cql3::statements::strong_consistency {
+
+class select_statement : public cql3::statements::select_statement { // Regular select_statement with do_execute() replaced.
+    using result_message = cql_transport::messages::result_message;
+
+public:
+    using cql3::statements::select_statement::select_statement; // inherit the base class constructors unchanged
+
+    future<::shared_ptr<cql_transport::messages::result_message>> do_execute(query_processor& qp,
+        service::query_state& state, const query_options& options) const override;
+};
+
+}
--- a/cql3/statements/strong_consistency/statement_helpers.cc
+++ b/cql3/statements/strong_consistency/statement_helpers.cc
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#include "statement_helpers.hh"
+
+#include "transport/messages/result_message_base.hh"
+#include "cql3/query_processor.hh"
+#include "replica/database.hh"
+#include "locator/tablet_replication_strategy.hh"
+
+namespace cql3::statements::strong_consistency {
+future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement(query_processor& qp, // Coroutine: bounce the statement to target.shard, or fail if target is another host.
+        const query_options& options,
+        const locator::tablet_replica& target)
+{
+    const auto my_host_id = qp.db().real_database().get_token_metadata().get_topology().my_host_id();
+    if (target.host != my_host_id) { // only a redirect to a shard of this host is handled; a different host is an error
+        throw exceptions::invalid_request_exception(format( // NOTE(review): the text says "writes", but select_statement::do_execute() also calls this function
+            "Strongly consistent writes can be executed only on the leader node, "
+            "leader id {}, current host id {}",
+            target.host, my_host_id));
+    }
+    auto&& func_values_cache = const_cast<cql3::query_options&>(options).take_cached_pk_function_calls(); // const_cast: taking the cache modifies `options`, which is const here
+    co_return qp.bounce_to_shard(target.shard, std::move(func_values_cache)); // the taken cache travels with the bounce message
+}
+
+bool is_strongly_consistent(data_dictionary::database db, std::string_view ks_name) { // True iff the keyspace's replication strategy is tablet-aware and its consistency option is not `eventual`.
+    const auto* tablet_aware_rs = db.find_keyspace(ks_name).get_replication_strategy().maybe_as_tablet_aware(); // null when the strategy is not tablet-aware
+    return tablet_aware_rs && tablet_aware_rs->get_consistency() != data_dictionary::consistency_config_option::eventual;
+}
+
+}
--- a/cql3/statements/strong_consistency/statement_helpers.hh
+++ b/cql3/statements/strong_consistency/statement_helpers.hh
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include "cql3/cql_statement.hh"
+#include "locator/tablets.hh"
+
+namespace cql3::statements::strong_consistency {
+
+future<::shared_ptr<cql_transport::messages::result_message>> redirect_statement( // Bounces the statement to target.shard; throws invalid_request_exception if target.host is not this host.
+    query_processor& qp,
+    const query_options& options, // cached partition-key function calls are taken out of this object
+    const locator::tablet_replica& target);
+
+bool is_strongly_consistent(data_dictionary::database db, std::string_view ks_name); // True iff keyspace `ks_name` uses a tablet-aware replication strategy with a consistency option other than `eventual`.
+
+}
--- a/cql3/statements/update_statement.cc
+++ b/cql3/statements/update_statement.cc
@@ -13,7 +13,7 @@
 #include "cql3/expr/expression.hh"
 #include "cql3/expr/evaluate.hh"
 #include "cql3/expr/expr-utils.hh"
-#include "cql3/statements/strongly_consistent_modification_statement.hh"
+#include "cql3/statements/broadcast_modification_statement.hh"
 #include "service/broadcast_tables/experimental/lang.hh"
 #include "raw/update_statement.hh"

@@ -333,7 +333,7 @@ std::optional<expr::expression> get_value_condition(const expr::expression& the_
    return binop->rhs;
 }

-::shared_ptr<strongly_consistent_modification_statement>
+::shared_ptr<broadcast_modification_statement>
 update_statement::prepare_for_broadcast_tables() const {
    if (attrs) {
        if (attrs->is_time_to_live_set()) {
@@ -359,7 +359,7 @@ update_statement::prepare_for_broadcast_tables() const {
        .value_condition = get_value_condition(_condition),
    };

-    return ::make_shared<strongly_consistent_modification_statement>(
+    return ::make_shared<broadcast_modification_statement>(
        get_bound_terms(),
        s,
        query
--- a/cql3/statements/update_statement.hh
+++ b/cql3/statements/update_statement.hh
@@ -45,7 +45,7 @@ private:
    virtual void execute_operations_for_key(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const json_cache_opt& json_cache) const;

 public:
-    virtual ::shared_ptr<strongly_consistent_modification_statement> prepare_for_broadcast_tables() const override;
+    virtual ::shared_ptr<broadcast_modification_statement> prepare_for_broadcast_tables() const override;
 };

 /*
--- a/db/cache_mutation_reader.hh
+++ b/db/cache_mutation_reader.hh
@@ -323,6 +323,9 @@ void cache_mutation_reader::touch_partition() {

 inline
 future<> cache_mutation_reader::fill_buffer() {
+    if (const auto& ex = get_abort_exception(); ex) {
+        return make_exception_future<>(ex);
+    }
    if (_state == state::before_static_row) {
        touch_partition();
        auto after_static_row = [this] {
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -1986,13 +1986,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
            }
            continue;
        } catch (shutdown_marker&) {
-            _reserve_segments.abort(std::current_exception());
            break;
        } catch (...) {
            clogger.warn("Exception in segment reservation: {}", std::current_exception());
        }
        co_await sleep(100ms);
    }
+    _reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
 }

 future<std::vector<db::commitlog::descriptor>>
--- a/db/config.cc
+++ b/db/config.cc
@@ -621,25 +621,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    * @GroupDescription: Provides an overview of the group.
    */
    /**
-    * @Group Ungrouped properties
-    */
-    , background_writer_scheduling_quota(this, "background_writer_scheduling_quota", value_status::Deprecated, 1.0,
-        "max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5.")
-    , auto_adjust_flush_quota(this, "auto_adjust_flush_quota", value_status::Deprecated, false,
-        "true: auto-adjust memtable shares for flush processes")
-    , memtable_flush_static_shares(this, "memtable_flush_static_shares", liveness::LiveUpdate, value_status::Used, 0,
-        "If set to higher than 0, ignore the controller's output and set the memtable shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
-    , compaction_static_shares(this, "compaction_static_shares", liveness::LiveUpdate, value_status::Used, 0,
-        "If set to higher than 0, ignore the controller's output and set the compaction shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
-    , compaction_max_shares(this, "compaction_max_shares", liveness::LiveUpdate, value_status::Used, default_compaction_maximum_shares,
-        "Set the maximum shares of regular compaction to the specific value. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
-    , compaction_enforce_min_threshold(this, "compaction_enforce_min_threshold", liveness::LiveUpdate, value_status::Used, false,
-        "If set to true, enforce the min_threshold option for compactions strictly. If false (default), Scylla may decide to compact even if below min_threshold.")
-    , compaction_flush_all_tables_before_major_seconds(this, "compaction_flush_all_tables_before_major_seconds", value_status::Used, 86400,
-        "Set the minimum interval in seconds between flushing all tables before each major compaction (default is 86400)."
-        "This option is useful for maximizing tombstone garbage collection by releasing all active commitlog segments."
-        "Set to 0 to disable automatic flushing all tables before major compaction.")
-    /**
    * @Group Initialization properties
    * @GroupDescription The minimal properties needed for configuring a cluster.
    */
@@ -1220,13 +1201,13 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "* org.apache.cassandra.auth.CassandraRoleManager: Stores role data in the system_auth keyspace;\n"
        "* com.scylladb.auth.LDAPRoleManager: Fetches role data from an LDAP server.")
    , permissions_validity_in_ms(this, "permissions_validity_in_ms", liveness::LiveUpdate, value_status::Used, 10000,
-        "How long permissions in cache remain valid. Depending on the authorizer, such as CassandraAuthorizer, fetching permissions can be resource intensive. Permissions caching is disabled when this property is set to 0 or when AllowAllAuthorizer is used. The cached value is considered valid as long as both its value is not older than the permissions_validity_in_ms "
+        "How long authorized statements cache entries remain valid. The cached value is considered valid as long as both its value is not older than the permissions_validity_in_ms "
        "and the cached value has been read at least once during the permissions_validity_in_ms time frame. If any of these two conditions doesn't hold the cached value is going to be evicted from the cache.\n"
        "\n"
        "Related information: Object permissions")
    , permissions_update_interval_in_ms(this, "permissions_update_interval_in_ms", liveness::LiveUpdate, value_status::Used, 2000,
-        "Refresh interval for permissions cache (if enabled). After this interval, cache entries become eligible for refresh. An async reload is scheduled every permissions_update_interval_in_ms time period and the old value is returned until it completes. If permissions_validity_in_ms has a non-zero value, then this property must also have a non-zero value. It's recommended to set this value to be at least 3 times smaller than the permissions_validity_in_ms.")
-    , permissions_cache_max_entries(this, "permissions_cache_max_entries", liveness::LiveUpdate, value_status::Used, 1000,
+        "Refresh interval for authorized statements cache. After this interval, cache entries become eligible for refresh. An async reload is scheduled every permissions_update_interval_in_ms time period and the old value is returned until it completes. If permissions_validity_in_ms has a non-zero value, then this property must also have a non-zero value. It's recommended to set this value to be at least 3 times smaller than the permissions_validity_in_ms. This option additionally controls the permissions refresh interval for LDAP.")
+    , permissions_cache_max_entries(this, "permissions_cache_max_entries", liveness::LiveUpdate, value_status::Unused, 1000,
        "Maximum cached permission entries. Must have a non-zero value if permissions caching is enabled (see a permissions_validity_in_ms description).")
    , server_encryption_options(this, "server_encryption_options", value_status::Used, {/*none*/},
        "Enable or disable inter-node encryption. You must also generate keys and provide the appropriate key and trust store locations and passwords. The available options are:\n"
@@ -1291,7 +1272,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , ignore_dead_nodes_for_replace(this, "ignore_dead_nodes_for_replace", value_status::Used, "", "List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
    , override_decommission(this, "override_decommission", value_status::Deprecated, false, "Set true to force a decommissioned node to join the cluster (cannot be set if consistent-cluster-management is enabled).")
    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based.")
-    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild,bootstrap,decommission", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
+    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
    , enable_compacting_data_for_streaming_and_repair(this, "enable_compacting_data_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, true, "Enable the compacting reader, which compacts the data for streaming and repair (load'n'stream included) before sending it to, or synchronizing it with peers. Can reduce the amount of data to be processed by removing dead data, but adds CPU overhead.")
    , enable_tombstone_gc_for_streaming_and_repair(this, "enable_tombstone_gc_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, false,
            "If the compacting reader is enabled for streaming and repair (see enable_compacting_data_for_streaming_and_repair), allow it to garbage-collect tombstones."
@@ -1341,7 +1322,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , sstable_compression_user_table_options(this, "sstable_compression_user_table_options", value_status::Used, compression_parameters{compression_parameters::algorithm::lz4_with_dicts},
        "Server-global user table compression options. If enabled, all user tables"
        "will be compressed using the provided options, unless overridden"
-        "by compression options in the table schema. The available options are:\n"
+        "by compression options in the table schema. User tables are all tables in non-system keyspaces. The available options are:\n"
        "* sstable_compression: The compression algorithm to use. Supported values: LZ4Compressor, LZ4WithDictsCompressor (default), SnappyCompressor, DeflateCompressor, ZstdCompressor, ZstdWithDictsCompressor, '' (empty string; disables compression).\n"
        "* chunk_length_in_kb: (Default: 4) The size of chunks to compress in kilobytes. Allowed values are powers of two between 1 and 128.\n"
        "* crc_check_chance: (Default: 1.0) Not implemented (option value is ignored).\n"
@@ -1394,6 +1375,10 @@ db::config::config(std::shared_ptr<db::extensions> exts)
            "Start killing reads after their collective memory consumption goes above $normal_limit * $multiplier.")
    , reader_concurrency_semaphore_cpu_concurrency(this, "reader_concurrency_semaphore_cpu_concurrency", liveness::LiveUpdate, value_status::Used, 2,
            "Admit new reads while there are less than this number of requests that need CPU.")
+    , reader_concurrency_semaphore_preemptive_abort_factor(this, "reader_concurrency_semaphore_preemptive_abort_factor", liveness::LiveUpdate, value_status::Used, 0.3,
+            "Admit new reads only while their remaining time, when they arrive at the semaphore, is more than this factor times their timeout. Its value means\n"
+            "* <= 0.0 means new reads will never get rejected during admission\n"
+            "* >= 1.0 means new reads will always get rejected during admission\n")
    , view_update_reader_concurrency_semaphore_serialize_limit_multiplier(this, "view_update_reader_concurrency_semaphore_serialize_limit_multiplier", liveness::LiveUpdate, value_status::Used, 2,
            "Start serializing view update reads after their collective memory consumption goes above $normal_limit * $multiplier.")
    , view_update_reader_concurrency_semaphore_kill_limit_multiplier(this, "view_update_reader_concurrency_semaphore_kill_limit_multiplier", liveness::LiveUpdate, value_status::Used, 4,
@@ -1513,7 +1498,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , index_cache_fraction(this, "index_cache_fraction", liveness::LiveUpdate, value_status::Used, 0.2,
        "The maximum fraction of cache memory permitted for use by index cache. Clamped to the [0.0; 1.0] range. Must be small enough to not deprive the row cache of memory, but should be big enough to fit a large fraction of the index. The default value 0.2 means that at least 80\% of cache memory is reserved for the row cache, while at most 20\% is usable by the index cache.")
    , consistent_cluster_management(this, "consistent_cluster_management", value_status::Deprecated, true, "Use RAFT for cluster management and DDL.")
-    , force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Used, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing.  Note: gossip topology changes are incompatible with tablets.")
+    , force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Deprecated, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing.  Note: gossip topology changes are incompatible with tablets.")
    , recovery_leader(this, "recovery_leader", liveness::LiveUpdate, value_status::Used, utils::null_uuid(), "Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
    , wasm_cache_memory_fraction(this, "wasm_cache_memory_fraction", value_status::Used, 0.01, "Maximum total size of all WASM instances stored in the cache as fraction of total shard memory.")
    , wasm_cache_timeout_in_ms(this, "wasm_cache_timeout_in_ms", value_status::Used, 5000, "Time after which an instance is evicted from the cache.")
@@ -1542,17 +1527,21 @@ db::config::config(std::shared_ptr<db::extensions> exts)
         "Allows target tablet size to be configured. Defaults to 5G (in bytes). Maintaining tablets at reasonable sizes is important to be able to " \
         "redistribute load. A higher value means tablet migration throughput can be reduced. A lower value may cause number of tablets to increase significantly, " \
         "potentially resulting in performance drawbacks.")
+    , tablet_streaming_read_concurrency_per_shard(this, "tablet_streaming_read_concurrency_per_shard", liveness::LiveUpdate, value_status::Used, 2,
+         "Maximum number of tablets which may be leaving a shard at the same time. Takes effect only on the topology coordinator. Set to the same value on all nodes.")
+    , tablet_streaming_write_concurrency_per_shard(this, "tablet_streaming_write_concurrency_per_shard", liveness::LiveUpdate, value_status::Used, 2,
+         "Maximum number of tablets which may be pending on a shard at the same time. Takes effect only on the topology coordinator. Set to the same value on all nodes.")
    , replication_strategy_warn_list(this, "replication_strategy_warn_list", liveness::LiveUpdate, value_status::Used, {locator::replication_strategy_type::simple}, "Controls which replication strategies to warn about when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
    , replication_strategy_fail_list(this, "replication_strategy_fail_list", liveness::LiveUpdate, value_status::Used, {}, "Controls which replication strategies are disallowed to be used when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
    , service_levels_interval(this, "service_levels_interval_ms", liveness::LiveUpdate, value_status::Used, 10000, "Controls how often service levels module polls configuration table")

-    , audit(this, "audit", value_status::Used, "none",
+    , audit(this, "audit", value_status::Used, "table",
        "Controls the audit feature:\n"
        "\n"
        "\tnone   : No auditing enabled.\n"
        "\tsyslog : Audit messages sent to Syslog.\n"
        "\ttable  : Audit messages written to column family named audit.audit_log.\n")
-    , audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,DDL,AUTH", "Comma separated list of operation categories that should be audited.")
+    , audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,AUTH,ADMIN", "Comma separated list of operation categories that should be audited.")
    , audit_tables(this, "audit_tables", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of table names (<keyspace>.<table>) that will be audited.")
    , audit_keyspaces(this, "audit_keyspaces", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of keyspaces that will be audited. All tables in those keyspaces will be audited")
    , audit_unix_socket_path(this, "audit_unix_socket_path", value_status::Used, "/dev/log", "The path to the unix socket used for writing to syslog. Only applicable when audit is set to syslog.")
@@ -1584,7 +1573,14 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , enable_create_table_with_compact_storage(this, "enable_create_table_with_compact_storage", liveness::LiveUpdate, value_status::Used, false, "Enable the deprecated feature of CREATE TABLE WITH COMPACT STORAGE.  This feature will eventually be removed in a future version.")
    , rf_rack_valid_keyspaces(this, "rf_rack_valid_keyspaces", liveness::MustRestart, value_status::Used, false,
        "Enforce RF-rack-valid keyspaces. Additionally, if there are existing RF-rack-invalid "
-        "keyspaces, attempting to start a node with this option ON will fail.")
+        "keyspaces, attempting to start a node with this option ON will fail. "
+        "DEPRECATED. Use enforce_rack_list instead.")
+    , enforce_rack_list(this, "enforce_rack_list", liveness::MustRestart, value_status::Used, false,
+            "Enforce rack list for tablet keyspaces. "
+            "When the option is on, CREATE STATEMENT expands numeric rfs to rack lists "
+            "and ALTER STATEMENT is allowed only when rack lists are used in all DCs. "
+            "Additionally, if there are existing tablet keyspaces with numeric rf in any DC "
+            "attempting to start a node with this option ON will fail.")
    // FIXME: make frequency per table in order to reduce work in each iteration.
    // Bigger tables will take longer to be resized. similar-sized tables can be batched into same iteration.
    , tablet_load_stats_refresh_interval_in_seconds(this, "tablet_load_stats_refresh_interval_in_seconds", liveness::LiveUpdate, value_status::Used, 60,
@@ -1595,6 +1591,25 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Sets the maximum difference in percentages between the most loaded and least loaded nodes, below which the load balancer considers nodes balanced.")
    , minimal_tablet_size_for_balancing(this, "minimal_tablet_size_for_balancing", liveness::LiveUpdate, value_status::Used, service::default_target_tablet_size / 100,
        "Sets the minimal tablet size for the load balancer. For any tablet smaller than this, the balancer will use this size instead of the actual tablet size.")
+    /**
+    * @Group Ungrouped properties
+    */
+    , background_writer_scheduling_quota(this, "background_writer_scheduling_quota", value_status::Deprecated, 1.0,
+        "max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5.")
+    , auto_adjust_flush_quota(this, "auto_adjust_flush_quota", value_status::Deprecated, false,
+        "true: auto-adjust memtable shares for flush processes")
+    , memtable_flush_static_shares(this, "memtable_flush_static_shares", liveness::LiveUpdate, value_status::Used, 0,
+        "If set to higher than 0, ignore the controller's output and set the memtable shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
+    , compaction_static_shares(this, "compaction_static_shares", liveness::LiveUpdate, value_status::Used, 0,
+        "If set to higher than 0, ignore the controller's output and set the compaction shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
+    , compaction_max_shares(this, "compaction_max_shares", liveness::LiveUpdate, value_status::Used, default_compaction_maximum_shares,
+        "Set the maximum shares of regular compaction to the specific value. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
+    , compaction_enforce_min_threshold(this, "compaction_enforce_min_threshold", liveness::LiveUpdate, value_status::Used, false,
+        "If set to true, enforce the min_threshold option for compactions strictly. If false (default), Scylla may decide to compact even if below min_threshold.")
+    , compaction_flush_all_tables_before_major_seconds(this, "compaction_flush_all_tables_before_major_seconds", value_status::Used, 86400,
+        "Set the minimum interval in seconds between flushing all tables before each major compaction (default is 86400). "
+        "This option is useful for maximizing tombstone garbage collection by releasing all active commitlog segments. "
+        "Set to 0 to disable automatic flushing all tables before major compaction.")
    , default_log_level(this, "default_log_level", value_status::Used, seastar::log_level::info, "Default log level for log messages")
    , logger_log_level(this, "logger_log_level", value_status::Used, {}, "Map of logger name to log level. Valid log levels are 'error', 'warn', 'info', 'debug' and 'trace'")
    , log_to_stdout(this, "log_to_stdout", value_status::Used, true, "Send log output to stdout")
@@ -1785,6 +1800,21 @@ const db::extensions& db::config::extensions() const {
    return *_extensions;
 }

+compression_parameters db::config::get_sstable_compression_user_table_options(bool dicts_feature_enabled) const {
+    if (sstable_compression_user_table_options.is_set()
+            || dicts_feature_enabled
+            || !sstable_compression_user_table_options().uses_dictionary_compressor()) {
+        return sstable_compression_user_table_options();
+    } else {
+        // Fall back to non-dict if dictionary compression is not enabled cluster-wide.
+        auto options = sstable_compression_user_table_options();
+        auto params = options.get_options();
+        auto algo = compression_parameters::non_dict_equivalent(options.get_algorithm());
+        params[compression_parameters::SSTABLE_COMPRESSION] = sstring(compression_parameters::algorithm_to_name(algo));
+        return compression_parameters{params};
+    }
+}
+
 std::map<sstring, db::experimental_features_t::feature> db::experimental_features_t::map() {
    // We decided against using the construct-on-first-use idiom here:
    // https://github.com/scylladb/scylla/pull/5369#discussion_r353614807
--- a/db/config.hh
+++ b/db/config.hh
@@ -185,13 +185,6 @@ public:
     * All values and documentation taken from
     * http://docs.datastax.com/en/cassandra/2.1/cassandra/configuration/configCassandra_yaml_r.html
     */
-    named_value<double> background_writer_scheduling_quota;
-    named_value<bool> auto_adjust_flush_quota;
-    named_value<float> memtable_flush_static_shares;
-    named_value<float> compaction_static_shares;
-    named_value<float> compaction_max_shares;
-    named_value<bool> compaction_enforce_min_threshold;
-    named_value<uint32_t> compaction_flush_all_tables_before_major_seconds;
    named_value<sstring> cluster_name;
    named_value<sstring> listen_address;
    named_value<sstring> listen_interface;
@@ -419,7 +412,13 @@ public:
    named_value<bool> enable_sstables_mc_format;
    named_value<bool> enable_sstables_md_format;
    named_value<sstring> sstable_format;
+
+    // NOTE: Do not use this option directly.
+    // Use get_sstable_compression_user_table_options() instead.
    named_value<compression_parameters> sstable_compression_user_table_options;
+
+    compression_parameters get_sstable_compression_user_table_options(bool dicts_feature_enabled) const;
+
    named_value<bool> sstable_compression_dictionaries_allow_in_ddl;
    named_value<bool> sstable_compression_dictionaries_enable_writing;
    named_value<float> sstable_compression_dictionaries_memory_budget_fraction;
@@ -440,6 +439,7 @@ public:
    named_value<uint32_t> reader_concurrency_semaphore_serialize_limit_multiplier;
    named_value<uint32_t> reader_concurrency_semaphore_kill_limit_multiplier;
    named_value<uint32_t> reader_concurrency_semaphore_cpu_concurrency;
+    named_value<float> reader_concurrency_semaphore_preemptive_abort_factor;
    named_value<uint32_t> view_update_reader_concurrency_semaphore_serialize_limit_multiplier;
    named_value<uint32_t> view_update_reader_concurrency_semaphore_kill_limit_multiplier;
    named_value<uint32_t> view_update_reader_concurrency_semaphore_cpu_concurrency;
@@ -542,6 +542,8 @@ public:
    named_value<double> tablets_initial_scale_factor;
    named_value<unsigned> tablets_per_shard_goal;
    named_value<uint64_t> target_tablet_size_in_bytes;
+    named_value<unsigned> tablet_streaming_read_concurrency_per_shard;
+    named_value<unsigned> tablet_streaming_write_concurrency_per_shard;

    named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_warn_list;
    named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_fail_list;
@@ -599,12 +601,21 @@ public:
    named_value<bool> enable_create_table_with_compact_storage;

    named_value<bool> rf_rack_valid_keyspaces;
+    named_value<bool> enforce_rack_list;

    named_value<uint32_t> tablet_load_stats_refresh_interval_in_seconds;
    named_value<bool> force_capacity_based_balancing;
    named_value<float> size_based_balance_threshold_percentage;
    named_value<uint64_t> minimal_tablet_size_for_balancing;

+    named_value<double> background_writer_scheduling_quota;
+    named_value<bool> auto_adjust_flush_quota;
+    named_value<float> memtable_flush_static_shares;
+    named_value<float> compaction_static_shares;
+    named_value<float> compaction_max_shares;
+    named_value<bool> compaction_enforce_min_threshold;
+    named_value<uint32_t> compaction_flush_all_tables_before_major_seconds;
+
    static const sstring default_tls_priority;
 private:
    template<typename T>
--- a/db/consistency_level.cc
+++ b/db/consistency_level.cc
@@ -31,19 +31,23 @@ size_t quorum_for(const locator::effective_replication_map& erm) {
    return replication_factor ? (replication_factor / 2) + 1 : 0;
 }

-size_t local_quorum_for(const locator::effective_replication_map& erm, const sstring& dc) {
+static size_t get_replication_factor_for_dc(const locator::effective_replication_map& erm, const sstring& dc) {
    using namespace locator;

    const auto& rs = erm.get_replication_strategy();

    if (rs.get_type() == replication_strategy_type::network_topology) {
-        const network_topology_strategy* nrs =
+        const network_topology_strategy* nts =
            static_cast<const network_topology_strategy*>(&rs);
-        size_t replication_factor = nrs->get_replication_factor(dc);
-        return replication_factor ? (replication_factor / 2) + 1 : 0;
+        return nts->get_replication_factor(dc);
    }

-    return quorum_for(erm);
+    return erm.get_replication_factor();
+}
+
+size_t local_quorum_for(const locator::effective_replication_map& erm, const sstring& dc) {
+    auto rf = get_replication_factor_for_dc(erm, dc);
+    return rf ? (rf / 2) + 1 : 0;
 }

 size_t block_for_local_serial(const locator::effective_replication_map& erm) {
@@ -188,18 +192,30 @@ void assure_sufficient_live_nodes(
        return pending <= live ? live - pending : 0;
    };

+    auto make_rf_zero_error_msg = [cl] (const sstring& local_dc) {
+        return format("Cannot achieve consistency level {} in datacenter '{}' with replication factor 0. "
+                      "Ensure the keyspace is replicated to this datacenter or use a non-local consistency level.", cl, local_dc);
+    };
+
    const auto& topo = erm.get_topology();
+    const sstring& local_dc = topo.get_datacenter();

    switch (cl) {
    case consistency_level::ANY:
        // local hint is acceptable, and local node is always live
        break;
    case consistency_level::LOCAL_ONE:
+        if (size_t local_rf = get_replication_factor_for_dc(erm, local_dc); local_rf == 0) {
+            throw exceptions::unavailable_exception(make_rf_zero_error_msg(local_dc), cl, 1, 0);
+        }
        if (topo.count_local_endpoints(live_endpoints) < topo.count_local_endpoints(pending_endpoints) + 1) {
            throw exceptions::unavailable_exception(cl, 1, 0);
        }
        break;
    case consistency_level::LOCAL_QUORUM: {
+        if (size_t local_rf = get_replication_factor_for_dc(erm, local_dc); local_rf == 0) {
+            throw exceptions::unavailable_exception(make_rf_zero_error_msg(local_dc), cl, need, 0);
+        }
        size_t local_live = topo.count_local_endpoints(live_endpoints);
        size_t pending = topo.count_local_endpoints(pending_endpoints);
        if (local_live < need + pending) {
--- a/db/hints/internal/hint_endpoint_manager.cc
+++ b/db/hints/internal/hint_endpoint_manager.cc
@@ -158,7 +158,7 @@ void hint_endpoint_manager::cancel_draining() noexcept {
    _sender.cancel_draining();
 }

-hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager)
+hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager, scheduling_group send_sg)
    : _key(key)
    , _shard_manager(shard_manager)
    , _store_gate("hint_endpoint_manager")
@@ -169,7 +169,7 @@ hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hi
    // Approximate the position of the last written hint by using the same formula as for segment id calculation in commitlog
    // TODO: Should this logic be deduplicated with what is in the commitlog?
    , _last_written_rp(this_shard_id(), std::chrono::duration_cast<std::chrono::milliseconds>(runtime::get_boot_time().time_since_epoch()).count())
-    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
+    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper(), send_sg)
 {}

 hint_endpoint_manager::hint_endpoint_manager(hint_endpoint_manager&& other)
--- a/db/hints/internal/hint_endpoint_manager.hh
+++ b/db/hints/internal/hint_endpoint_manager.hh
@@ -63,7 +63,7 @@ private:
    hint_sender _sender;

 public:
-    hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager);
+    hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager, scheduling_group send_sg);
    hint_endpoint_manager(hint_endpoint_manager&&);
    ~hint_endpoint_manager();

--- a/db/hints/internal/hint_sender.cc
+++ b/db/hints/internal/hint_sender.cc
@@ -122,7 +122,7 @@ const column_mapping& hint_sender::get_column_mapping(lw_shared_ptr<send_one_fil
    return cm_it->second;
 }

-hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper) noexcept
+hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper, scheduling_group sg) noexcept
    : _stopped(make_ready_future<>())
    , _ep_key(parent.end_point_key())
    , _ep_manager(parent)
@@ -130,7 +130,7 @@ hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy&
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(local_storage_proxy)
    , _db(local_db)
-    , _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
+    , _hints_cpu_sched_group(sg)
    , _gossiper(local_gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
--- a/db/hints/internal/hint_sender.hh
+++ b/db/hints/internal/hint_sender.hh
@@ -120,7 +120,7 @@ private:
    std::multimap<db::replay_position, lw_shared_ptr<std::optional<promise<>>>> _replay_waiters;

 public:
-    hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy, replica::database& local_db, const gms::gossiper& local_gossiper) noexcept;
+    hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy, replica::database& local_db, const gms::gossiper& local_gossiper, scheduling_group sg) noexcept;
    ~hint_sender();

    /// \brief A constructor that should be called from the copy/move-constructor of hint_endpoint_manager.
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -142,7 +142,7 @@ future<> directory_initializer::ensure_rebalanced() {
 }

 manager::manager(service::storage_proxy& proxy, sstring hints_directory, host_filter filter, int64_t max_hint_window_ms,
-        resource_manager& res_manager, sharded<replica::database>& db)
+        resource_manager& res_manager, sharded<replica::database>& db, scheduling_group sg)
    : _hints_dir(fs::path(hints_directory) / fmt::to_string(this_shard_id()))
    , _host_filter(std::move(filter))
    , _proxy(proxy)
@@ -150,6 +150,7 @@ manager::manager(service::storage_proxy& proxy, sstring hints_directory, host_fi
    , _local_db(db.local())
    , _draining_eps_gate(seastar::format("hints::manager::{}", _hints_dir.native()))
    , _resource_manager(res_manager)
+    , _hints_sending_sched_group(sg)
 {
    if (utils::get_local_injector().enter("decrease_hints_flush_period")) {
        hints_flush_period = std::chrono::seconds{1};
@@ -415,7 +416,7 @@ hint_endpoint_manager& manager::get_ep_manager(const endpoint_id& host_id, const

    try {
        std::filesystem::path hint_directory = hints_dir() / (_uses_host_id ? fmt::to_string(host_id) : fmt::to_string(ip));
-        auto [it, _] = _ep_managers.emplace(host_id, hint_endpoint_manager{host_id, std::move(hint_directory), *this});
+        auto [it, _] = _ep_managers.emplace(host_id, hint_endpoint_manager{host_id, std::move(hint_directory), *this, _hints_sending_sched_group});
        hint_endpoint_manager& ep_man = it->second;

        manager_logger.trace("Created an endpoint manager for {}", host_id);
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -133,6 +133,7 @@ private:

    hint_stats _stats;
    seastar::metrics::metric_groups _metrics;
+    scheduling_group _hints_sending_sched_group;

    // We need to keep a variant here. Before migrating hinted handoff to using host ID, hint directories will
    // still represent IP addresses. But after the migration, they will start representing host IDs.
@@ -155,7 +156,7 @@ private:

 public:
    manager(service::storage_proxy& proxy, sstring hints_directory, host_filter filter,
-            int64_t max_hint_window_ms, resource_manager& res_manager, sharded<replica::database>& db);
+            int64_t max_hint_window_ms, resource_manager& res_manager, sharded<replica::database>& db, scheduling_group sg);

    manager(const manager&) = delete;
    manager& operator=(const manager&) = delete;
--- a/db/row_cache.cc
+++ b/db/row_cache.cc
@@ -24,7 +24,7 @@
 #include "readers/forwardable.hh"
 #include "readers/nonforwardable.hh"
 #include "cache_mutation_reader.hh"
-#include "partition_snapshot_reader.hh"
+#include "replica/partition_snapshot_reader.hh"
 #include "keys/clustering_key_filter.hh"
 #include "utils/assert.hh"
 #include "utils/updateable_value.hh"
@@ -845,12 +845,12 @@ mutation_reader row_cache::make_nonpopulating_reader(schema_ptr schema, reader_p
            cache_entry& e = *i;
            upgrade_entry(e);
            tracing::trace(ts, "Reading partition {} from cache", pos);
-            return make_partition_snapshot_flat_reader<false, dummy_accounter>(
+            return replica::make_partition_snapshot_reader<false, dummy_accounter>(
                    schema,
                    std::move(permit),
                    e.key(),
                    query::clustering_key_filter_ranges(slice.row_ranges(*schema, e.key().key())),
-                    e.partition().read(_tracker.region(), _tracker.memtable_cleaner(), nullptr, phase_of(pos)),
+                    e.partition().read(_tracker.region(), _tracker.memtable_cleaner(), &_tracker, phase_of(pos)),
                    false,
                    _tracker.region(),
                    _read_section,
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -96,16 +96,16 @@ static logging::logger diff_logger("schema_diff");
 /** system.schema_* tables used to store keyspace/table/type attributes prior to C* 3.0 */
 namespace db {
 namespace {
-    const auto set_use_schema_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
-        if (ks_name == schema_tables::NAME) {
-            props.enable_schema_commitlog();
+    const auto set_use_schema_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) { // Schema initializer: enables the schema commitlog on builders whose keyspace is schema_tables::NAME.
+        if (builder.ks_name() == schema_tables::NAME) { // Only the keyspace is checked, so every table built in that keyspace matches.
+            builder.enable_schema_commitlog();
        }
    });
    const auto set_group0_table_options =
-        schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
-            if (ks_name == schema_tables::NAME) {
+        schema_builder::register_schema_initializer([](schema_builder& builder) { // Schema initializer: flags builders in the schema_tables::NAME keyspace as group0 tables.
+            if (builder.ks_name() == schema_tables::NAME) { // Keyspace-only match; the table name is not consulted.
                // all schema tables are group0 tables
-                props.is_group0_table = true;
+                builder.set_is_group0_table(true);
            }
        });
 }
--- a/db/snapshot-ctl.cc
+++ b/db/snapshot-ctl.cc
@@ -65,7 +65,7 @@ future<> snapshot_ctl::run_snapshot_modify_operation(noncopyable_function<future
    });
 }

-future<> snapshot_ctl::take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf) {
+future<> snapshot_ctl::take_snapshot(sstring tag, std::vector<sstring> keyspace_names, snapshot_options opts) {
    if (tag.empty()) {
        throw std::runtime_error("You must supply a snapshot name.");
    }
@@ -74,21 +74,21 @@ future<> snapshot_ctl::take_snapshot(sstring tag, std::vector<sstring> keyspace_
        std::ranges::copy(_db.local().get_keyspaces() | std::views::keys, std::back_inserter(keyspace_names));
    };

-    return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] () mutable {
-        return do_take_snapshot(std::move(tag), std::move(keyspace_names), sf);
+    return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), opts, this] () mutable {
+        return do_take_snapshot(std::move(tag), std::move(keyspace_names), opts);
    });
 }

-future<> snapshot_ctl::do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf) {
+future<> snapshot_ctl::do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, snapshot_options opts) { // Two passes over keyspace_names: check_snapshot_not_exist for each, then snapshot_keyspace_on_all_shards for each.
    co_await coroutine::parallel_for_each(keyspace_names, [tag, this] (const auto& ks_name) {
        return check_snapshot_not_exist(ks_name, tag);
    });
-    co_await coroutine::parallel_for_each(keyspace_names, [this, tag = std::move(tag), sf] (const auto& ks_name) {
-        return replica::database::snapshot_keyspace_on_all_shards(_db, ks_name, tag, bool(sf));
+    co_await coroutine::parallel_for_each(keyspace_names, [this, tag = std::move(tag), opts] (const auto& ks_name) { // tag is moved into the closure here (its last use); opts is copied.
+        return replica::database::snapshot_keyspace_on_all_shards(_db, ks_name, tag, opts); // The same opts value is handed to every per-keyspace call.
    });
 }

-future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
+future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts) {
    if (ks_name.empty()) {
        throw std::runtime_error("You must supply a keyspace name");
    }
@@ -99,14 +99,14 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
        throw std::runtime_error("You must supply a snapshot name.");
    }

-    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf] () mutable {
-        return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf);
+    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), opts] () mutable {
+        return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), opts);
    });
 }

-future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
+future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts) { // Runs check_snapshot_not_exist for (ks_name, tag, tables), then snapshots those tables on all shards.
    co_await check_snapshot_not_exist(ks_name, tag, tables);
-    co_await replica::database::snapshot_tables_on_all_shards(_db, ks_name, std::move(tables), std::move(tag), bool(sf));
+    co_await replica::database::snapshot_tables_on_all_shards(_db, ks_name, std::move(tables), std::move(tag), opts); // tables and tag are moved (last use); opts is forwarded unchanged.
 }

 future<> snapshot_ctl::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names, sstring cf_name) {
--- a/db/snapshot-ctl.hh
+++ b/db/snapshot-ctl.hh
@@ -38,10 +38,14 @@ class backup_task_impl;

 } // snapshot namespace

+struct snapshot_options { // Per-request snapshot settings, passed by value through snapshot_ctl's take_* and do_take_* functions.
+    bool skip_flush = false; // Off by default. NOTE(review): presumably true means "snapshot without flushing first" - confirm against replica::database.
+    gc_clock::time_point created_at = gc_clock::now(); // Defaults to the time this options object is constructed.
+    std::optional<gc_clock::time_point> expires_at; // Empty by default. NOTE(review): presumably empty means "no expiry" - confirm with the consumer.
+};
+
 class snapshot_ctl : public peering_sharded_service<snapshot_ctl> {
 public:
-    using skip_flush = bool_class<class skip_flush_tag>;
-
    struct table_snapshot_details {
        int64_t total;
        int64_t live;
@@ -70,8 +74,8 @@ public:
     *
     * @param tag the tag given to the snapshot; may not be null or empty
     */
-    future<> take_snapshot(sstring tag, skip_flush sf = skip_flush::no) {
-        return take_snapshot(tag, {}, sf);
+    future<> take_snapshot(sstring tag, snapshot_options opts = {}) { // opts defaults to a value-initialized snapshot_options.
+        return take_snapshot(tag, {}, opts); // Delegates to the keyspace-list overload; the empty list selects all keyspaces.
    }

    /**
@@ -80,7 +84,7 @@ public:
     * @param tag the tag given to the snapshot; may not be null or empty
     * @param keyspace_names the names of the keyspaces to snapshot; empty means "all"
     */
-    future<> take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf = skip_flush::no);
+    future<> take_snapshot(sstring tag, std::vector<sstring> keyspace_names, snapshot_options opts = {});

    /**
     * Takes the snapshot of multiple tables. A snapshot name must be specified.
@@ -89,7 +93,7 @@ public:
     * @param tables a vector of tables names to snapshot
     * @param tag the tag given to the snapshot; may not be null or empty
     */
-    future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
+    future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts = {});

    /**
     * Remove the snapshot with the given name from the given keyspaces.
@@ -127,8 +131,8 @@ private:

    friend class snapshot::backup_task_impl;

-    future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf = skip_flush::no);
-    future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
+    future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, snapshot_options opts = {}); // Worker behind take_snapshot(); called from inside run_snapshot_modify_operation().
+    future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, snapshot_options opts = {}); // Worker behind take_column_family_snapshot(); called from inside run_snapshot_modify_operation().
 };

 }
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -42,11 +42,11 @@ extern logging::logger cdc_log;

 namespace db {
 namespace {
-    const auto set_wait_for_sync_to_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
-        if ((ks_name == system_distributed_keyspace::NAME_EVERYWHERE && cf_name == system_distributed_keyspace::CDC_GENERATIONS_V2) ||
-            (ks_name == system_distributed_keyspace::NAME && cf_name == system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION))
+    const auto set_wait_for_sync_to_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) { // Schema initializer: turns on wait_for_sync_to_commitlog for exactly two (keyspace, table) pairs.
+        if ((builder.ks_name() == system_distributed_keyspace::NAME_EVERYWHERE && builder.cf_name() == system_distributed_keyspace::CDC_GENERATIONS_V2) || // Pair 1: NAME_EVERYWHERE / CDC_GENERATIONS_V2.
+            (builder.ks_name() == system_distributed_keyspace::NAME && builder.cf_name() == system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION)) // Pair 2: NAME / CDC_TOPOLOGY_DESCRIPTION.
        {
-            props.wait_for_sync_to_commitlog = true;
+            builder.set_wait_for_sync_to_commitlog(true);
        }
    });
 }
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -66,24 +66,24 @@ static thread_local auto sstableinfo_type = user_type_impl::get_instance(

 namespace db {
 namespace {
-    const auto set_null_sharder = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
+    const auto set_null_sharder = schema_builder::register_schema_initializer([](schema_builder& builder) { // Schema initializer: selects the null sharder for listed tables of the system keyspace.
        // tables in the "system" keyspace which need to use null sharder
        static const std::unordered_set<sstring> tables = {
                // empty
        };
-        if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
-            props.use_null_sharder = true;
+        if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) { // The set above is empty, so this branch currently never fires.
+            builder.set_use_null_sharder(true);
        }
    });
-    const auto set_wait_for_sync_to_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
+    const auto set_wait_for_sync_to_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) { // Schema initializer: turns on wait_for_sync_to_commitlog for the system-keyspace tables listed below.
        static const std::unordered_set<sstring> tables = {
            system_keyspace::PAXOS,
        };
-        if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
-            props.wait_for_sync_to_commitlog = true;
+        if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) { // Both the keyspace and the table name must match.
+            builder.set_wait_for_sync_to_commitlog(true);
        }
    });
-    const auto set_use_schema_commitlog = schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
+    const auto set_use_schema_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) {
        static const std::unordered_set<sstring> tables = {
            schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
            system_keyspace::BROADCAST_KV_STORE,
@@ -108,18 +108,18 @@ namespace {
            system_keyspace::ROLE_MEMBERS,
            system_keyspace::ROLE_ATTRIBUTES,
            system_keyspace::ROLE_PERMISSIONS,
-            system_keyspace::v3::CDC_LOCAL,
+            system_keyspace::CDC_LOCAL,
            system_keyspace::DICTS,
            system_keyspace::VIEW_BUILDING_TASKS,
            system_keyspace::CLIENT_ROUTES,
        };
-        if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
-            props.enable_schema_commitlog();
+        if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
+            builder.enable_schema_commitlog();
        }
    });

    const auto set_group0_table_options =
-        schema_builder::register_static_configurator([](const sstring& ks_name, const sstring& cf_name, schema_static_props& props) {
+        schema_builder::register_schema_initializer([](schema_builder& builder) {
            static const std::unordered_set<sstring> tables = {
                // scylla_local may store a replicated tombstone related to schema
                // (see `make_group0_schema_version_mutation`), so we include it in the group0 tables list.
@@ -142,8 +142,8 @@ namespace {
                system_keyspace::CLIENT_ROUTES,
                system_keyspace::REPAIR_TASKS,
            };
-            if (ks_name == system_keyspace::NAME && tables.contains(cf_name)) {
-                props.is_group0_table = true;
+            if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
+                builder.set_is_group0_table(true);
            }
        });
 }
@@ -918,7 +918,7 @@ schema_ptr system_keyspace::corrupt_data() {
    return scylla_local;
 }

-schema_ptr system_keyspace::v3::batches() {
+schema_ptr system_keyspace::batches() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, BATCHES), NAME, BATCHES,
        // partition key
@@ -946,53 +946,7 @@ schema_ptr system_keyspace::v3::batches() {
    return schema;
 }

-schema_ptr system_keyspace::v3::built_indexes() {
-    // identical to ours, but ours otoh is a mix-in of the 3.x series cassandra one
-    return db::system_keyspace::built_indexes();
-}
-
-schema_ptr system_keyspace::v3::local() {
-    static thread_local auto schema = [] {
-        schema_builder builder(generate_legacy_id(NAME, LOCAL), NAME, LOCAL,
-        // partition key
-        {{"key", utf8_type}},
-        // clustering key
-        {},
-        // regular columns
-        {
-                {"bootstrapped", utf8_type},
-                {"broadcast_address", inet_addr_type},
-                {"cluster_name", utf8_type},
-                {"cql_version", utf8_type},
-                {"data_center", utf8_type},
-                {"gossip_generation", int32_type},
-                {"host_id", uuid_type},
-                {"listen_address", inet_addr_type},
-                {"native_protocol_version", utf8_type},
-                {"partitioner", utf8_type},
-                {"rack", utf8_type},
-                {"release_version", utf8_type},
-                {"rpc_address", inet_addr_type},
-                {"schema_version", uuid_type},
-                {"thrift_version", utf8_type},
-                {"tokens", set_type_impl::get_instance(utf8_type, true)},
-                {"truncated_at", map_type_impl::get_instance(uuid_type, bytes_type, true)},
-        },
-        // static columns
-        {},
-        // regular column name type
-        utf8_type,
-        // comment
-        "information about the local node"
-       );
-       builder.set_gc_grace_seconds(0);
-       builder.with_hash_version();
-       return builder.build(schema_builder::compact_storage::no);
-    }();
-    return schema;
-}
-
-schema_ptr system_keyspace::v3::truncated() {
+schema_ptr system_keyspace::truncated() {
    static thread_local auto local = [] {
        schema_builder builder(generate_legacy_id(NAME, TRUNCATED), NAME, TRUNCATED,
        // partition key
@@ -1022,7 +976,7 @@ schema_ptr system_keyspace::v3::truncated() {

 thread_local data_type replay_position_type = tuple_type_impl::get_instance({long_type, int32_type});

-schema_ptr system_keyspace::v3::commitlog_cleanups() {
+schema_ptr system_keyspace::commitlog_cleanups() {
    static thread_local auto local = [] {
        schema_builder builder(generate_legacy_id(NAME, COMMITLOG_CLEANUPS), NAME, COMMITLOG_CLEANUPS,
        // partition key
@@ -1049,47 +1003,7 @@ schema_ptr system_keyspace::v3::commitlog_cleanups() {
    return local;
 }

-schema_ptr system_keyspace::v3::peers() {
-    // identical
-    return db::system_keyspace::peers();
-}
-
-schema_ptr system_keyspace::v3::peer_events() {
-    // identical
-    return db::system_keyspace::peer_events();
-}
-
-schema_ptr system_keyspace::v3::range_xfers() {
-    // identical
-    return db::system_keyspace::range_xfers();
-}
-
-schema_ptr system_keyspace::v3::compaction_history() {
-    // identical
-    return db::system_keyspace::compaction_history();
-}
-
-schema_ptr system_keyspace::v3::sstable_activity() {
-    // identical
-    return db::system_keyspace::sstable_activity();
-}
-
-schema_ptr system_keyspace::v3::size_estimates() {
-    // identical
-    return db::system_keyspace::size_estimates();
-}
-
-schema_ptr system_keyspace::v3::large_partitions() {
-    // identical
-    return db::system_keyspace::large_partitions();
-}
-
-schema_ptr system_keyspace::v3::scylla_local() {
-    // identical
-    return db::system_keyspace::scylla_local();
-}
-
-schema_ptr system_keyspace::v3::available_ranges() {
+schema_ptr system_keyspace::available_ranges() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, AVAILABLE_RANGES), NAME, AVAILABLE_RANGES,
        // partition key
@@ -1112,7 +1026,7 @@ schema_ptr system_keyspace::v3::available_ranges() {
    return schema;
 }

-schema_ptr system_keyspace::v3::views_builds_in_progress() {
+schema_ptr system_keyspace::views_builds_in_progress() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, VIEWS_BUILDS_IN_PROGRESS), NAME, VIEWS_BUILDS_IN_PROGRESS,
        // partition key
@@ -1135,7 +1049,7 @@ schema_ptr system_keyspace::v3::views_builds_in_progress() {
    return schema;
 }

-schema_ptr system_keyspace::v3::built_views() {
+schema_ptr system_keyspace::built_views() {
    static thread_local auto schema = [] {
        schema_builder builder(generate_legacy_id(NAME, BUILT_VIEWS), NAME, BUILT_VIEWS,
        // partition key
@@ -1158,7 +1072,7 @@ schema_ptr system_keyspace::v3::built_views() {
    return schema;
 }

-schema_ptr system_keyspace::v3::scylla_views_builds_in_progress() {
+schema_ptr system_keyspace::scylla_views_builds_in_progress() {
    static thread_local auto schema = [] {
        auto id = generate_legacy_id(NAME, SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
        return schema_builder(NAME, SCYLLA_VIEWS_BUILDS_IN_PROGRESS, std::make_optional(id))
@@ -1174,7 +1088,7 @@ schema_ptr system_keyspace::v3::scylla_views_builds_in_progress() {
    return schema;
 }

-/*static*/ schema_ptr system_keyspace::v3::cdc_local() {
+/*static*/ schema_ptr system_keyspace::cdc_local() {
    static thread_local auto cdc_local = [] {
        schema_builder builder(generate_legacy_id(NAME, CDC_LOCAL), NAME, CDC_LOCAL,
        // partition key
@@ -1800,7 +1714,9 @@ std::unordered_set<dht::token> decode_tokens(const set_type_impl::native_type& t
    std::unordered_set<dht::token> tset;
    for (auto& t: tokens) {
        auto str = value_cast<sstring>(t);
-        SCYLLA_ASSERT(str == dht::token::from_sstring(str).to_sstring());
+        if (str != dht::token::from_sstring(str).to_sstring()) {
+            on_internal_error(slogger, format("decode_tokens: invalid token string '{}'", str));
+        }
        tset.insert(dht::token::from_sstring(str));
    }
    return tset;
@@ -2180,21 +2096,21 @@ future<> system_keyspace::update_cdc_generation_id(cdc::generation_id gen_id) {
    co_await std::visit(make_visitor(
    [this] (cdc::generation_id_v1 id) -> future<> {
        co_await execute_cql(
-                format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", v3::CDC_LOCAL),
-                sstring(v3::CDC_LOCAL), id.ts);
+                format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", CDC_LOCAL),
+                sstring(CDC_LOCAL), id.ts);
    },
    [this] (cdc::generation_id_v2 id) -> future<> {
        co_await execute_cql(
-                format("INSERT INTO system.{} (key, streams_timestamp, uuid) VALUES (?, ?, ?)", v3::CDC_LOCAL),
-                sstring(v3::CDC_LOCAL), id.ts, id.id);
+                format("INSERT INTO system.{} (key, streams_timestamp, uuid) VALUES (?, ?, ?)", CDC_LOCAL),
+                sstring(CDC_LOCAL), id.ts, id.id);
    }
    ), gen_id);
 }

 future<std::optional<cdc::generation_id>> system_keyspace::get_cdc_generation_id() {
    auto msg = co_await execute_cql(
-            format("SELECT streams_timestamp, uuid FROM system.{} WHERE key = ?", v3::CDC_LOCAL),
-            sstring(v3::CDC_LOCAL));
+            format("SELECT streams_timestamp, uuid FROM system.{} WHERE key = ?", CDC_LOCAL),
+            sstring(CDC_LOCAL));

    if (msg->empty()) {
        co_return std::nullopt;
@@ -2220,19 +2136,19 @@ static const sstring CDC_REWRITTEN_KEY = "rewritten";
 future<> system_keyspace::cdc_set_rewritten(std::optional<cdc::generation_id_v1> gen_id) {
    if (gen_id) {
        return execute_cql(
-                format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", v3::CDC_LOCAL),
+                format("INSERT INTO system.{} (key, streams_timestamp) VALUES (?, ?)", CDC_LOCAL), // With a generation id: the row keyed by CDC_REWRITTEN_KEY also stores gen_id->ts as streams_timestamp.
                CDC_REWRITTEN_KEY, gen_id->ts).discard_result();
    } else {
        // Insert just the row marker.
        return execute_cql(
-                format("INSERT INTO system.{} (key) VALUES (?)", v3::CDC_LOCAL),
+                format("INSERT INTO system.{} (key) VALUES (?)", CDC_LOCAL), // Without a generation id: only the key column is written.
                CDC_REWRITTEN_KEY).discard_result();
    }
 }

 future<bool> system_keyspace::cdc_is_rewritten() {
    // We don't care about the actual timestamp; it's additional information for debugging purposes.
-    return execute_cql(format("SELECT key FROM system.{} WHERE key = ?", v3::CDC_LOCAL), CDC_REWRITTEN_KEY)
+    return execute_cql(format("SELECT key FROM system.{} WHERE key = ?", CDC_LOCAL), CDC_REWRITTEN_KEY)
            .then([] (::shared_ptr<cql3::untyped_result_set> msg) {
        return !msg->empty();
    });
@@ -2376,11 +2292,11 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
                    scylla_local(), db::schema_tables::scylla_table_schema_history(),
                    repair_history(),
                    repair_tasks(),
-                    v3::views_builds_in_progress(), v3::built_views(),
-                    v3::scylla_views_builds_in_progress(),
-                    v3::truncated(),
-                    v3::commitlog_cleanups(),
-                    v3::cdc_local(),
+                    views_builds_in_progress(), built_views(),
+                    scylla_views_builds_in_progress(),
+                    truncated(),
+                    commitlog_cleanups(),
+                    cdc_local(),
                    raft(), raft_snapshots(), raft_snapshot_config(), group0_history(), discovery(),
                    topology(), cdc_generations_v3(), topology_requests(), service_levels_v2(), view_build_status_v2(),
                    dicts(), view_building_tasks(), client_routes(), cdc_streams_state(), cdc_streams_history()
@@ -2403,7 +2319,7 @@ static bool maybe_write_in_user_memory(schema_ptr s) {
    return (s.get() == system_keyspace::batchlog().get())
            || (s.get() == system_keyspace::batchlog_v2().get())
            || (s.get() == system_keyspace::paxos().get())
-            || s == system_keyspace::v3::scylla_views_builds_in_progress();
+            || s == system_keyspace::scylla_views_builds_in_progress();
 }

 future<> system_keyspace::make(
@@ -2689,7 +2605,7 @@ mutation system_keyspace::make_size_estimates_mutation(const sstring& ks, std::v

 future<> system_keyspace::register_view_for_building(sstring ks_name, sstring view_name, const dht::token& token) {
    sstring req = format("INSERT INTO system.{} (keyspace_name, view_name, generation_number, cpu_id, first_token) VALUES (?, ?, ?, ?, ?)",
-            v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
+            SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
    return execute_cql(
            std::move(req),
            std::move(ks_name),
@@ -2705,7 +2621,7 @@ future<> system_keyspace::register_view_for_building_for_all_shards(sstring ks_n
    // before all shards are registered.
    // if another shard has already registered, this won't overwrite its status. if it hasn't registered, we insert
    // a status with first_token=null and next_token=null, indicating it hasn't made progress.
-    auto&& schema = db::system_keyspace::v3::scylla_views_builds_in_progress();
+    auto&& schema = db::system_keyspace::scylla_views_builds_in_progress();
    auto timestamp = api::new_timestamp();
    mutation m{schema, partition_key::from_single_value(*schema, utf8_type->decompose(ks_name))};

@@ -2723,7 +2639,7 @@ future<> system_keyspace::register_view_for_building_for_all_shards(sstring ks_n

 future<> system_keyspace::update_view_build_progress(sstring ks_name, sstring view_name, const dht::token& token) {
    sstring req = format("INSERT INTO system.{} (keyspace_name, view_name, next_token, cpu_id) VALUES (?, ?, ?, ?)",
-            v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
+            SCYLLA_VIEWS_BUILDS_IN_PROGRESS);
    return execute_cql(
            std::move(req),
            std::move(ks_name),
@@ -2734,14 +2650,14 @@ future<> system_keyspace::update_view_build_progress(sstring ks_name, sstring vi

 future<> system_keyspace::remove_view_build_progress_across_all_shards(sstring ks_name, sstring view_name) {
    return execute_cql(
-            format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ?", v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
+            format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ?", SCYLLA_VIEWS_BUILDS_IN_PROGRESS), // No cpu_id predicate, so the delete is not restricted to one shard's row.
            std::move(ks_name),
            std::move(view_name)).discard_result();
 }

 future<> system_keyspace::remove_view_build_progress(sstring ks_name, sstring view_name) {
    return execute_cql(
-            format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ? AND cpu_id = ?", v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
+            format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ? AND cpu_id = ?", SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
            std::move(ks_name),
            std::move(view_name),
            int32_t(this_shard_id())).discard_result();
@@ -2749,20 +2665,20 @@ future<> system_keyspace::remove_view_build_progress(sstring ks_name, sstring vi

 future<> system_keyspace::mark_view_as_built(sstring ks_name, sstring view_name) {
    return execute_cql(
-            format("INSERT INTO system.{} (keyspace_name, view_name) VALUES (?, ?)", v3::BUILT_VIEWS),
+            format("INSERT INTO system.{} (keyspace_name, view_name) VALUES (?, ?)", BUILT_VIEWS), // BUILT_VIEWS fills the {} table-name placeholder; the query result is discarded.
            std::move(ks_name),
            std::move(view_name)).discard_result();
 }

 future<> system_keyspace::remove_built_view(sstring ks_name, sstring view_name) {
    return execute_cql(
-            format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ?", v3::BUILT_VIEWS),
+            format("DELETE FROM system.{} WHERE keyspace_name = ? AND view_name = ?", BUILT_VIEWS), // Deletes from the same BUILT_VIEWS table that mark_view_as_built() inserts into.
            std::move(ks_name),
            std::move(view_name)).discard_result();
 }

 future<std::vector<system_keyspace::view_name>> system_keyspace::load_built_views() {
-    return execute_cql(format("SELECT * FROM system.{}", v3::BUILT_VIEWS)).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
+    return execute_cql(format("SELECT * FROM system.{}", BUILT_VIEWS)).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
        return *cql_result
                | std::views::transform([] (const cql3::untyped_result_set::row& row) {
            auto ks_name = row.get_as<sstring>("keyspace_name");
@@ -2774,7 +2690,7 @@ future<std::vector<system_keyspace::view_name>> system_keyspace::load_built_view

 future<std::vector<system_keyspace::view_build_progress>> system_keyspace::load_view_build_progress() {
    return execute_cql(format("SELECT keyspace_name, view_name, first_token, next_token, cpu_id FROM system.{}",
-            v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS)).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
+            SCYLLA_VIEWS_BUILDS_IN_PROGRESS)).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
        std::vector<view_build_progress> progress;
        for (auto& row : *cql_result) {
            auto ks_name = row.get_as<sstring>("keyspace_name");
@@ -3227,6 +3143,8 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
        co_return ret;
    }

+    const bool strongly_consistent_tables = _db.features().strongly_consistent_tables;
+
    for (auto& row : *rs) {
        if (!row.has("host_id")) {
            // There are no clustering rows, only the static row.
@@ -3275,7 +3193,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
                    };
                }
            } else if (must_have_tokens(nstate)) {
-                on_fatal_internal_error(slogger, format(
+                on_internal_error(slogger, format(
                        "load_topology_state: node {} in {} state but missing ring slice", host_id, nstate));
            }
        }
@@ -3357,7 +3275,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
            // Currently, at most one node at a time can be in transitioning state.
            if (!map->empty()) {
                const auto& [other_id, other_rs] = *map->begin();
-                on_fatal_internal_error(slogger, format(
+                on_internal_error(slogger, format(
                    "load_topology_state: found two nodes in transitioning state: {} in {} state and {} in {} state",
                    other_id, other_rs.state, host_id, nstate));
            }
@@ -3415,8 +3333,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
                format("SELECT count(range_end) as cnt FROM {}.{} WHERE key = '{}' AND id = ?",
                        NAME, CDC_GENERATIONS_V3, cdc::CDC_GENERATIONS_V3_KEY),
                gen_id.id);
-            SCYLLA_ASSERT(gen_rows);
-            if (gen_rows->empty()) {
+            if (!gen_rows || gen_rows->empty()) {
                on_internal_error(slogger, format(
                    "load_topology_state: last committed CDC generation time UUID ({}) present, but data missing", gen_id.id));
            }
@@ -3463,7 +3380,9 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
            ret.session = service::session_id(some_row.get_as<utils::UUID>("session"));
        }

-        if (some_row.has("tablet_balancing_enabled")) {
+        if (strongly_consistent_tables) {
+            ret.tablet_balancing_enabled = false;
+        } else if (some_row.has("tablet_balancing_enabled")) {
            ret.tablet_balancing_enabled = some_row.get_as<bool>("tablet_balancing_enabled");
        } else {
            ret.tablet_balancing_enabled = true;
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -127,6 +127,8 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>

    static schema_ptr raft_snapshot_config();
    static schema_ptr local();
+    static schema_ptr truncated();
+    static schema_ptr commitlog_cleanups();
    static schema_ptr peers();
    static schema_ptr peer_events();
    static schema_ptr range_xfers();
@@ -137,7 +139,10 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>
    static schema_ptr large_rows();
    static schema_ptr large_cells();
    static schema_ptr corrupt_data();
-    static schema_ptr scylla_local();
+    static schema_ptr batches();
+    static schema_ptr available_ranges();
+    static schema_ptr built_views();
+    static schema_ptr cdc_local();
    future<> force_blocking_flush(sstring cfname);
    // This function is called when the system.peers table is read,
    // and it fixes some types of inconsistencies that can occur
@@ -204,6 +209,14 @@ public:
    static constexpr auto VIEW_BUILDING_TASKS = "view_building_tasks";
    static constexpr auto CLIENT_ROUTES = "client_routes";
    static constexpr auto VERSIONS = "versions";
+    static constexpr auto BATCHES = "batches";
+    static constexpr auto AVAILABLE_RANGES = "available_ranges";
+    static constexpr auto VIEWS_BUILDS_IN_PROGRESS = "views_builds_in_progress";
+    static constexpr auto BUILT_VIEWS = "built_views";
+    static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
+    static constexpr auto CDC_LOCAL = "cdc_local";
+    static constexpr auto CDC_TIMESTAMPS = "cdc_timestamps";
+    static constexpr auto CDC_STREAMS = "cdc_streams";

    // auth
    static constexpr auto ROLES = "roles";
@@ -211,42 +224,6 @@ public:
    static constexpr auto ROLE_ATTRIBUTES = "role_attributes";
    static constexpr auto ROLE_PERMISSIONS = "role_permissions";

-    struct v3 {
-        static constexpr auto BATCHES = "batches";
-        static constexpr auto PAXOS = "paxos";
-        static constexpr auto BUILT_INDEXES = "IndexInfo";
-        static constexpr auto LOCAL = "local";
-        static constexpr auto PEERS = "peers";
-        static constexpr auto PEER_EVENTS = "peer_events";
-        static constexpr auto RANGE_XFERS = "range_xfers";
-        static constexpr auto COMPACTION_HISTORY = "compaction_history";
-        static constexpr auto SSTABLE_ACTIVITY = "sstable_activity";
-        static constexpr auto SIZE_ESTIMATES = "size_estimates";
-        static constexpr auto AVAILABLE_RANGES = "available_ranges";
-        static constexpr auto VIEWS_BUILDS_IN_PROGRESS = "views_builds_in_progress";
-        static constexpr auto BUILT_VIEWS = "built_views";
-        static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
-        static constexpr auto CDC_LOCAL = "cdc_local";
-        static schema_ptr batches();
-        static schema_ptr built_indexes();
-        static schema_ptr local();
-        static schema_ptr truncated();
-        static schema_ptr commitlog_cleanups();
-        static schema_ptr peers();
-        static schema_ptr peer_events();
-        static schema_ptr range_xfers();
-        static schema_ptr compaction_history();
-        static schema_ptr sstable_activity();
-        static schema_ptr size_estimates();
-        static schema_ptr large_partitions();
-        static schema_ptr scylla_local();
-        static schema_ptr available_ranges();
-        static schema_ptr views_builds_in_progress();
-        static schema_ptr built_views();
-        static schema_ptr scylla_views_builds_in_progress();
-        static schema_ptr cdc_local();
-    };
-
    // Partition estimates for a given range of tokens.
    struct range_estimates {
        schema_ptr schema;
@@ -264,6 +241,7 @@ public:
    static schema_ptr batchlog_v2();
    static schema_ptr paxos();
    static schema_ptr built_indexes(); // TODO (from Cassandra): make private
+    static schema_ptr scylla_local();
    static schema_ptr raft();
    static schema_ptr raft_snapshots();
    static schema_ptr repair_history();
@@ -283,6 +261,8 @@ public:
    static schema_ptr dicts();
    static schema_ptr view_building_tasks();
    static schema_ptr client_routes();
+    static schema_ptr views_builds_in_progress();
+    static schema_ptr scylla_views_builds_in_progress();

    // auth
    static schema_ptr roles();
--- a/db/view/build_progress_virtual_reader.hh
+++ b/db/view/build_progress_virtual_reader.hh
@@ -195,7 +195,7 @@ public:
        return mutation_reader(std::make_unique<build_progress_reader>(
                s,
                std::move(permit),
-                _db.find_column_family(s->ks_name(), system_keyspace::v3::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
+                _db.find_column_family(s->ks_name(), system_keyspace::SCYLLA_VIEWS_BUILDS_IN_PROGRESS),
                range,
                slice,
                std::move(trace_state),
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -23,6 +23,7 @@

 #include <seastar/core/future-util.hh>
 #include <seastar/core/coroutine.hh>
+#include <seastar/coroutine/all.hh>
 #include <seastar/coroutine/maybe_yield.hh>
 #include <flat_map>

@@ -65,6 +66,7 @@
 #include "mutation/timestamp.hh"
 #include "utils/assert.hh"
 #include "utils/small_vector.hh"
+#include "view_builder.hh"
 #include "view_info.hh"
 #include "view_update_checks.hh"
 #include "types/list.hh"
@@ -2238,12 +2240,20 @@ void view_builder::setup_metrics() {
 }

 future<> view_builder::start_in_background(service::migration_manager& mm, utils::cross_shard_barrier barrier) {
+    auto step_fiber = make_ready_future<>();
    try {
        view_builder_init_state vbi;
        auto fail = defer([&barrier] mutable { barrier.abort(); });
-        // Guard the whole startup routine with a semaphore,
-        // so that it's not intercepted by `on_drop_view`, `on_create_view`
-        // or `on_update_view` events.
+        // Semaphore usage invariants:
+        // - One unit of _sem serializes all per-shard bookkeeping that mutates view-builder state
+        //   (_base_to_build_step, _built_views, build_status, reader resets).
+        // - The unit is held for the whole operation, including the async chain, until the state
+        //   is stable for the next operation on that shard.
+        // - Cross-shard operations acquire _sem on shard 0 in the dispatcher, before broadcasting.
+        //   Other shards acquire their own _sem only around their local handling; shard 0's local
+        //   handler adopts the dispatcher's unit instead and releases it when that handler returns.
+        // Guard the whole startup routine with a semaphore so that it's not intercepted by
+        // `on_drop_view`, `on_create_view`, or `on_update_view` events.
        auto units = co_await get_units(_sem, view_builder_semaphore_units);
        // Wait for schema agreement even if we're a seed node.
        co_await mm.wait_for_schema_agreement(_db, db::timeout_clock::time_point::max(), &_as);
@@ -2264,8 +2274,10 @@ future<> view_builder::start_in_background(service::migration_manager& mm, utils
        _mnotifier.register_listener(this);
        co_await calculate_shard_build_step(vbi);
        _current_step = _base_to_build_step.begin();
-        // Waited on indirectly in stop().
-        (void)_build_step.trigger();
+
+        // If the preparation above fails, run_in_background() is never invoked;
+        // start_in_background() just logs the failure below and resolves.
+        step_fiber = run_in_background();
    } catch (...) {
        auto ex = std::current_exception();
        auto ll = log_level::error;
@@ -2280,10 +2292,12 @@ future<> view_builder::start_in_background(service::migration_manager& mm, utils
        }
        vlogger.log(ll, "start aborted: {}", ex);
    }
+
+    co_await std::move(step_fiber);
 }

 future<> view_builder::start(service::migration_manager& mm, utils::cross_shard_barrier barrier) {
-    _started = start_in_background(mm, std::move(barrier));
+    _step_fiber = start_in_background(mm, std::move(barrier));
    return make_ready_future<>();
 }

@@ -2293,12 +2307,13 @@ future<> view_builder::drain() {
    }
    vlogger.info("Draining view builder");
    _as.request_abort();
-    co_await std::move(_started);
    co_await _mnotifier.unregister_listener(this);
+    co_await _ops_gate.close();
    co_await _vug.drain();
    co_await _sem.wait();
    _sem.broken();
-    co_await _build_step.join();
+    _build_step.broken();
+    co_await std::move(_step_fiber);
    co_await coroutine::parallel_for_each(_base_to_build_step, [] (std::pair<const table_id, build_step>& p) {
        return p.second.reader.close();
    });
@@ -2667,63 +2682,59 @@ static bool should_ignore_tablet_keyspace(const replica::database& db, const sst
    return db.features().view_building_coordinator && db.has_keyspace(ks_name) && db.find_keyspace(ks_name).uses_tablets();
 }

-future<> view_builder::dispatch_create_view(sstring ks_name, sstring view_name) {
-    if (should_ignore_tablet_keyspace(_db, ks_name)) {
-        return make_ready_future<>();
-    }
-    return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-        // This runs on shard 0 only; seed the global rows before broadcasting.
-        return handle_seed_view_build_progress(ks_name, view_name).then([this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return container().invoke_on_all([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable {
-                return vb.handle_create_view_local(std::move(ks_name), std::move(view_name));
-            });
-        });
-    });
+future<view_builder::view_builder_units> view_builder::get_or_adopt_view_builder_lock(view_builder_units_opt units) {
+    co_return units ? std::move(*units) : co_await get_units(_sem, view_builder_semaphore_units);
 }

-future<> view_builder::handle_seed_view_build_progress(sstring ks_name, sstring view_name) {
+future<> view_builder::dispatch_create_view(sstring ks_name, sstring view_name) {
+    if (should_ignore_tablet_keyspace(_db, ks_name)) {
+        co_return;
+    }
+    // Take the dispatcher's unit first, then seed the build-progress rows for all shards before broadcasting.
+    auto units = co_await get_or_adopt_view_builder_lock(std::nullopt);
+    co_await handle_seed_view_build_progress(ks_name, view_name);
+    // The local handler adopts (and releases on return) the unit taken above; other shards acquire their own in handle_create_view_local().
+    co_await coroutine::all(
+        [this, ks_name, view_name, units = std::move(units)] mutable -> future<> {
+            co_await handle_create_view_local(ks_name, view_name, std::move(units)); },
+        [this, ks_name, view_name] mutable -> future<> {
+            co_await container().invoke_on_others([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable -> future<> {
+                return vb.handle_create_view_local(ks_name, view_name, std::nullopt); }); });
+}
+
+future<> view_builder::handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name) {
    auto view = view_ptr(_db.find_schema(ks_name, view_name));
    auto& step = get_or_create_build_step(view->view_info()->base_id());
    return _sys_ks.register_view_for_building_for_all_shards(view->ks_name(), view->cf_name(), step.current_token());
 }

-future<> view_builder::handle_create_view_local(sstring ks_name, sstring view_name){
-    if (this_shard_id() == 0) { 
-        return handle_create_view_local_impl(std::move(ks_name), std::move(view_name));
-    } else {
-        return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return handle_create_view_local_impl(std::move(ks_name), std::move(view_name));
-        });
-    }
-}
-
-future<> view_builder::handle_create_view_local_impl(sstring ks_name, sstring view_name) {
+future<> view_builder::handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units) {
+    [[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::move(units));
    auto view = view_ptr(_db.find_schema(ks_name, view_name));
    auto& step = get_or_create_build_step(view->view_info()->base_id());
-    return when_all(step.base->await_pending_writes(), step.base->await_pending_streams()).discard_result().then([this, &step] {
-        return flush_base(step.base, _as);
-    }).then([this, view, &step] () {
+    try {
+        co_await coroutine::all(
+            [&step] -> future<> {
+                co_await step.base->await_pending_writes(); },
+            [&step] -> future<> {
+                co_await step.base->await_pending_streams(); });
+        co_await flush_base(step.base, _as);
+    
        // This resets the build step to the current token. It may result in views currently
        // being built to receive duplicate updates, but it simplifies things as we don't have
        // to keep around a list of new views to build the next time the reader crosses a token
        // threshold.
-        return initialize_reader_at_current_token(step).then([this, view, &step] () mutable {
-            return add_new_view(view, step);
-        }).then_wrapped([this, view] (future<>&& f) {
-            try {
-                f.get();
-            } catch (abort_requested_exception&) {
-                vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
-            } catch (raft::request_aborted&) {
-                vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
-            } catch (...) {
-                vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
-            }
+        co_await initialize_reader_at_current_token(step);
+        co_await add_new_view(view, step);
+    } catch (abort_requested_exception&) {
+        vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+    } catch (raft::request_aborted&) {
+        vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+    } catch (...) {
+        vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
+    }

-            // Waited on indirectly in stop().
-            static_cast<void>(_build_step.trigger());
-        });
-    });
+    _build_step.signal();
 }

 void view_builder::on_create_view(const sstring& ks_name, const sstring& view_name) {
@@ -2732,90 +2743,101 @@ void view_builder::on_create_view(const sstring& ks_name, const sstring& view_na
    }

    // Do it in the background, serialized and broadcast from shard 0.
-    static_cast<void>(dispatch_create_view(ks_name, view_name).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
+    static_cast<void>(with_gate(_ops_gate, [this, ks_name = ks_name, view_name = view_name] () mutable {
+        return dispatch_create_view(std::move(ks_name), std::move(view_name));
+    }).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
        vlogger.warn("Failed to dispatch view creation {}.{}: {}", ks_name, view_name, ep);
    }));
 }

-void view_builder::on_update_view(const sstring& ks_name, const sstring& view_name, bool) {
+future<> view_builder::dispatch_update_view(sstring ks_name, sstring view_name) {
    if (should_ignore_tablet_keyspace(_db, ks_name)) {
-        return;
+        co_return;
    }

+    [[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::nullopt);
+
+    auto view = view_ptr(_db.find_schema(ks_name, view_name));
+    auto step_it = _base_to_build_step.find(view->view_info()->base_id());
+    if (step_it == _base_to_build_step.end()) {
+        co_return; // In case all the views for this CF have finished building already.
+    }
+    auto status_it = std::ranges::find_if(step_it->second.build_status, [view] (const view_build_status& bs) {
+        return bs.view->id() == view->id();
+    });
+    if (status_it != step_it->second.build_status.end()) {
+        status_it->view = std::move(view);
+    }
+}
+
+void view_builder::on_update_view(const sstring& ks_name, const sstring& view_name, bool) {
    // Do it in the background, serialized.
-    (void)with_semaphore(_sem, view_builder_semaphore_units, [ks_name, view_name, this] {
-        auto view = view_ptr(_db.find_schema(ks_name, view_name));
-        auto step_it = _base_to_build_step.find(view->view_info()->base_id());
-        if (step_it == _base_to_build_step.end()) {
-            return;// In case all the views for this CF have finished building already.
+    static_cast<void>(with_gate(_ops_gate, [this, ks_name = ks_name, view_name = view_name] () mutable {
+        return dispatch_update_view(std::move(ks_name), std::move(view_name));
+    }).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
+        try {
+            std::rethrow_exception(ep);
+        } catch (const seastar::gate_closed_exception&) {
+            vlogger.warn("Ignoring gate_closed_exception during view update {}.{}", ks_name, view_name);
+        } catch (const seastar::broken_named_semaphore&) {
+            vlogger.warn("Ignoring broken_named_semaphore during view update {}.{}", ks_name, view_name);
+        } catch (const replica::no_such_column_family&) {
+            vlogger.warn("Ignoring no_such_column_family during view update {}.{}", ks_name, view_name);
        }
-        auto status_it = std::ranges::find_if(step_it->second.build_status, [view] (const view_build_status& bs) {
-            return bs.view->id() == view->id();
-        });
-        if (status_it != step_it->second.build_status.end()) {
-            status_it->view = std::move(view);
-        }
-    }).handle_exception_type([] (replica::no_such_column_family&) { });
+    }));
 }

 future<> view_builder::dispatch_drop_view(sstring ks_name, sstring view_name) {
    if (should_ignore_tablet_keyspace(_db, ks_name)) {
-        return make_ready_future<>();
+        co_return;
    }

-    return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-        // This runs on shard 0 only; broadcast local cleanup before global cleanup.
-        return container().invoke_on_all([ks_name, view_name] (view_builder& vb) mutable {
-            return vb.handle_drop_view_local(std::move(ks_name), std::move(view_name));
-        }).then([this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return handle_drop_view_global_cleanup(std::move(ks_name), std::move(view_name));
-        });
-    });
+    auto units = co_await get_or_adopt_view_builder_lock(std::nullopt);
+    // NOTE(review): the unit moves into the local handler and is dropped when it returns, i.e. before the global cleanup below (the old with_semaphore() held it throughout) - confirm intended.
+    co_await coroutine::all(
+        [this, ks_name, view_name, units = std::move(units)] mutable -> future<> {
+            co_await handle_drop_view_local(ks_name, view_name, std::move(units)); },
+        [this, ks_name, view_name] mutable -> future<> {
+            co_await container().invoke_on_others([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable -> future<> {
+                return vb.handle_drop_view_local(ks_name, view_name, std::nullopt); });});
+    co_await handle_drop_view_global_cleanup(ks_name, view_name);
 }

-future<> view_builder::handle_drop_view_local(sstring ks_name, sstring view_name) {
-    if (this_shard_id() == 0) { 
-        return handle_drop_view_local_impl(std::move(ks_name), std::move(view_name));
-    } else {
-        return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return handle_drop_view_local_impl(std::move(ks_name), std::move(view_name));
-        });
-    }
-}
-
-future<> view_builder::handle_drop_view_local_impl(sstring ks_name, sstring view_name) {
+future<> view_builder::handle_drop_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units) {
+    [[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::move(units));
    vlogger.info0("Stopping to build view {}.{}", ks_name, view_name);
-    // The view is absent from the database at this point, so find it by brute force.
-    ([&, this] {
-        for (auto& [_, step] : _base_to_build_step) {
-            if (step.build_status.empty() || step.build_status.front().view->ks_name() != ks_name) {
-                continue;
-            }
-            for (auto it = step.build_status.begin(); it != step.build_status.end(); ++it) {
-                if (it->view->cf_name() == view_name) {
-                    _built_views.erase(it->view->id());
-                    step.build_status.erase(it);
-                    return;
-                }
+
+    for (auto& [_, step] : _base_to_build_step) {
+        if (step.build_status.empty() || step.build_status.front().view->ks_name() != ks_name) {
+            continue;
+        }
+        for (auto it = step.build_status.begin(); it != step.build_status.end(); ++it) {
+            if (it->view->cf_name() == view_name) {
+                _built_views.erase(it->view->id());
+                step.build_status.erase(it);
+                co_return;
            }
        }
-    })();
-    return make_ready_future<>();  
+    }
 }

-future<> view_builder::handle_drop_view_global_cleanup(sstring ks_name, sstring view_name) {
+future<> view_builder::handle_drop_view_global_cleanup(const sstring& ks_name, const sstring& view_name) {
    if (this_shard_id() != 0) {
-        return make_ready_future<>();
+        co_return;
    }
    vlogger.info0("Starting view global cleanup {}.{}", ks_name, view_name);
-    return when_all_succeed(
-                _sys_ks.remove_view_build_progress_across_all_shards(ks_name, view_name),
-                _sys_ks.remove_built_view(ks_name, view_name),
-                remove_view_build_status(ks_name, view_name))
-                    .discard_result()
-                    .handle_exception([ks_name, view_name] (std::exception_ptr ep) {
-        vlogger.warn("Failed to cleanup view {}.{}: {}", ks_name, view_name, ep);
-    });
+    
+    try {
+        co_await coroutine::all(
+            [this, &ks_name, &view_name] -> future<>  {
+                co_await _sys_ks.remove_view_build_progress_across_all_shards(ks_name, view_name); },
+            [this, &ks_name, &view_name] -> future<>  {
+                co_await _sys_ks.remove_built_view(ks_name, view_name); },
+            [this, &ks_name, &view_name] -> future<>  {
+                co_await remove_view_build_status(ks_name, view_name); });
+    } catch (...) {
+        vlogger.warn("Failed to cleanup view {}.{}: {}", ks_name, view_name, std::current_exception());
+    }
 }

 void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name) {
@@ -2824,19 +2846,22 @@ void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name
    }

    // Do it in the background, serialized and broadcast from shard 0.
-    static_cast<void>(dispatch_drop_view(ks_name, view_name).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
+    static_cast<void>(with_gate(_ops_gate, [this, ks_name = ks_name, view_name = view_name] () mutable {
+        return dispatch_drop_view(std::move(ks_name), std::move(view_name));
+    }).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
        vlogger.warn("Failed to dispatch view drop {}.{}: {}", ks_name, view_name, ep);
    }));
 }

-future<> view_builder::do_build_step() {
-    // Run the view building in the streaming scheduling group
-    // so that it doesn't impact other tasks with higher priority.
-    seastar::thread_attributes attr;
-    attr.sched_group = _db.get_streaming_scheduling_group();
-    return seastar::async(std::move(attr), [this] {
+future<> view_builder::run_in_background() {
+    return seastar::async([this] {
        exponential_backoff_retry r(1s, 1min);
-        while (!_base_to_build_step.empty() && !_as.abort_requested()) {
+        while (!_as.abort_requested()) {
+            try {
+                _build_step.wait([this] { return !_base_to_build_step.empty(); }).get();
+            } catch (const seastar::broken_condition_variable&) {
+                return;
+            }
            auto units = get_units(_sem, view_builder_semaphore_units).get();
            ++_stats.steps_performed;
            try {
@@ -3707,7 +3732,7 @@ void validate_view_keyspace(const data_dictionary::database& db, std::string_vie

    try {
        locator::assert_rf_rack_valid_keyspace(keyspace_name, tmptr, rs);
-    } catch (const std::exception& e) {
+    } catch (const std::invalid_argument& e) {
        throw std::logic_error(fmt::format(
            "Materialized views and secondary indexes are not supported on the keyspace '{}': {}",
            keyspace_name, e.what()));
--- a/db/view/view_builder.hh
+++ b/db/view/view_builder.hh
@@ -11,13 +11,14 @@
 #include "query/query-request.hh"
 #include "service/migration_listener.hh"
 #include "service/raft/raft_group0_client.hh"
-#include "utils/serialized_action.hh"
 #include "utils/cross-shard-barrier.hh"
 #include "replica/database.hh"

 #include <seastar/core/abort_source.hh>
 #include <seastar/core/future.hh>
+#include <seastar/core/gate.hh>
 #include <seastar/core/semaphore.hh>
+#include <seastar/core/condition-variable.hh>
 #include <seastar/core/sharded.hh>
 #include <seastar/core/shared_future.hh>
 #include <seastar/core/shared_ptr.hh>
@@ -104,6 +105,12 @@ class view_update_generator;
 *            redo the missing step, for simplicity.
 */
 class view_builder final : public service::migration_listener::only_view_notifications, public seastar::peering_sharded_service<view_builder> {
+    // Alias for the semaphore units type used throughout the class.
+    using view_builder_units = semaphore_units<named_semaphore_exception_factory>;
+
+    // Optional units: engaged when the caller hands over a unit it already holds, empty when the callee must acquire one.
+    using view_builder_units_opt = std::optional<view_builder_units>;
+
    /**
     * Keeps track of the build progress for a particular view.
     * When the view is built, next_token == first_token.
@@ -168,14 +175,25 @@ class view_builder final : public service::migration_listener::only_view_notific
    reader_permit _permit;
    base_to_build_step_type _base_to_build_step;
    base_to_build_step_type::iterator _current_step = _base_to_build_step.end();
-    serialized_action _build_step{std::bind(&view_builder::do_build_step, this)};
+    condition_variable _build_step;
    static constexpr size_t view_builder_semaphore_units = 1;
    // Ensures bookkeeping operations are serialized, meaning that while we execute
    // a build step we don't consider newly added or removed views. This simplifies
    // the algorithms. Also synchronizes an operation wrt. a call to stop().
+    // Semaphore usage invariants:
+    // - One unit of _sem serializes all per-shard bookkeeping that mutates view-builder state
+    //   (_base_to_build_step, _built_views, build_status, reader resets).
+    // - The unit is held for the whole operation, including the async chain, until the state
+    //   is stable for the next operation on that shard.
+    // - Cross-shard operations acquire _sem on shard 0 in the dispatcher, before broadcasting.
+    //   Other shards acquire their own _sem only around their local handling; shard 0's local
+    //   handler adopts the dispatcher's unit instead and releases it when that handler returns.
+    // - start_in_background() holds the unit while it prepares the initial build state, so that
+    //   it's not intercepted by `on_drop_view`, `on_create_view`, or `on_update_view` events.
    seastar::named_semaphore _sem{view_builder_semaphore_units, named_semaphore_exception_factory{"view builder"}};
+    seastar::gate _ops_gate;
    seastar::abort_source _as;
-    future<> _started = make_ready_future<>();
+    future<> _step_fiber = make_ready_future<>();
    // Used to coordinate between shards the conclusion of the build process for a particular view.
    std::unordered_set<table_id> _built_views;
    // Used for testing.
@@ -262,19 +280,19 @@ private:
    void setup_shard_build_step(view_builder_init_state& vbi, std::vector<system_keyspace_view_name>, std::vector<system_keyspace_view_build_progress>);
    future<> calculate_shard_build_step(view_builder_init_state& vbi);
    future<> add_new_view(view_ptr, build_step&);
-    future<> do_build_step();
+    future<> run_in_background();
    void execute(build_step&, exponential_backoff_retry);
    future<> maybe_mark_view_as_built(view_ptr, dht::token);
    future<> mark_as_built(view_ptr);
    void setup_metrics();
    future<> dispatch_create_view(sstring ks_name, sstring view_name);
+    future<> dispatch_update_view(sstring ks_name, sstring view_name);
    future<> dispatch_drop_view(sstring ks_name, sstring view_name);
-    future<> handle_seed_view_build_progress(sstring ks_name, sstring view_name);
-    future<> handle_create_view_local(sstring ks_name, sstring view_name);
-    future<> handle_drop_view_local(sstring ks_name, sstring view_name);
-    future<> handle_create_view_local_impl(sstring ks_name, sstring view_name);
-    future<> handle_drop_view_local_impl(sstring ks_name, sstring view_name);
-    future<> handle_drop_view_global_cleanup(sstring ks_name, sstring view_name);
+    future<> handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name);
+    future<> handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
+    future<> handle_drop_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
+    future<> handle_drop_view_global_cleanup(const sstring& ks_name, const sstring& view_name);
+    future<view_builder_units> get_or_adopt_view_builder_lock(view_builder_units_opt units);

    template <typename Func1, typename Func2>
    future<> write_view_build_status(Func1&& fn_group0, Func2&& fn_sys_dist) {
--- a/db/view/view_building_worker.cc
+++ b/db/view/view_building_worker.cc
@@ -242,7 +242,7 @@ future<> view_building_worker::create_staging_sstable_tasks() {
                utils::UUID_gen::get_time_UUID(), view_building_task::task_type::process_staging, false,
                table_id, ::table_id{}, {my_host_id, sst_info.shard}, sst_info.last_token
            };
-            auto mut = co_await _group0.client().sys_ks().make_view_building_task_mutation(guard.write_timestamp(), task);
+            auto mut = co_await _sys_ks.make_view_building_task_mutation(guard.write_timestamp(), task);
            cmuts.emplace_back(std::move(mut));
        }
    }
@@ -386,7 +386,6 @@ future<> view_building_worker::update_built_views() {
        auto schema = _db.find_schema(table_id);
        return std::make_pair(schema->ks_name(), schema->cf_name());
    };
-    auto& sys_ks = _group0.client().sys_ks();

    std::set<std::pair<sstring, sstring>> built_views;
    for (auto& [id, statuses]: _vb_state_machine.views_state.status_map) {
@@ -395,22 +394,22 @@ future<> view_building_worker::update_built_views() {
        }
    }

-    auto local_built = co_await sys_ks.load_built_views() | std::views::filter([&] (auto& v) {
+    auto local_built = co_await _sys_ks.load_built_views() | std::views::filter([&] (auto& v) {
        return !_db.has_keyspace(v.first) || _db.find_keyspace(v.first).uses_tablets();
    }) | std::ranges::to<std::set>();

    // Remove dead entries
    for (auto& view: local_built) {
        if (!built_views.contains(view)) {
-            co_await sys_ks.remove_built_view(view.first, view.second);
+            co_await _sys_ks.remove_built_view(view.first, view.second);
        }
    }

    // Add new entries
    for (auto& view: built_views) {
        if (!local_built.contains(view)) {
-            co_await sys_ks.mark_view_as_built(view.first, view.second);
-            co_await sys_ks.remove_view_build_progress_across_all_shards(view.first, view.second);
+            co_await _sys_ks.mark_view_as_built(view.first, view.second);
+            co_await _sys_ks.remove_view_build_progress_across_all_shards(view.first, view.second);
        }
    }
 }
@@ -589,11 +588,7 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
    utils::get_local_injector().inject("do_build_range_fail",
            [] { throw std::runtime_error("do_build_range failed due to error injection"); });

-    // Run the view building in the streaming scheduling group
-    // so that it doesn't impact other tasks with higher priority.
-    seastar::thread_attributes attr;
-    attr.sched_group = _db.get_streaming_scheduling_group();
-    return seastar::async(std::move(attr), [this, base_id, views_ids = std::move(views_ids), last_token, &as] {
+    return seastar::async([this, base_id, views_ids = std::move(views_ids), last_token, &as] {
        gc_clock::time_point now = gc_clock::now();
        auto base_cf = _db.find_column_family(base_id).shared_from_this();
        reader_permit permit = _db.get_reader_concurrency_semaphore().make_tracking_only_permit(nullptr, "build_views_range", db::no_timeout, {});
--- a/db/virtual_tables.cc
+++ b/db/virtual_tables.cc
@@ -67,7 +67,10 @@ public:
        return schema_builder(system_keyspace::NAME, "cluster_status", std::make_optional(id))
            .with_column("peer", inet_addr_type, column_kind::partition_key)
            .with_column("dc", utf8_type)
+            .with_column("rack", utf8_type)
            .with_column("up", boolean_type)
+            .with_column("draining", boolean_type)
+            .with_column("excluded", boolean_type)
            .with_column("status", utf8_type)
            .with_column("load", utf8_type)
            .with_column("tokens", int32_type)
@@ -107,8 +110,13 @@ public:

                if (tm.get_topology().has_node(hostid)) {
                    // Not all entries in gossiper are present in the topology
-                    sstring dc = tm.get_topology().get_location(hostid).dc;
+                    auto& node = tm.get_topology().get_node(hostid);
+                    sstring dc = node.dc_rack().dc;
+                    sstring rack = node.dc_rack().rack;
                    set_cell(cr, "dc", dc);
+                    set_cell(cr, "rack", rack);
+                    set_cell(cr, "draining", node.is_draining());
+                    set_cell(cr, "excluded", node.is_excluded());
                }

                if (ownership.contains(eps.get_ip())) {
@@ -1134,6 +1142,8 @@ public:
            set_cell(r.cells(), "dc", node.dc());
            set_cell(r.cells(), "rack", node.rack());
            set_cell(r.cells(), "up", _gossiper.local().is_alive(host));
+            set_cell(r.cells(), "draining", node.is_draining());
+            set_cell(r.cells(), "excluded", node.is_excluded());
            if (auto ip = _gossiper.local().get_address_map().find(host)) {
                set_cell(r.cells(), "ip", data_value(inet_address(*ip)));
            }
@@ -1144,6 +1154,9 @@ public:
            if (stats && stats->capacity.contains(host)) {
                auto capacity = stats->capacity.at(host);
                set_cell(r.cells(), "storage_capacity", data_value(int64_t(capacity)));
+                if (auto ts_iter = stats->tablet_stats.find(host); ts_iter != stats->tablet_stats.end()) {
+                    set_cell(r.cells(), "effective_capacity", data_value(int64_t(ts_iter->second.effective_capacity)));
+                }

                if (auto utilization = load.get_allocated_utilization(host)) {
                    set_cell(r.cells(), "storage_allocated_utilization", data_value(double(*utilization)));
@@ -1168,9 +1181,12 @@ private:
            .with_column("rack", utf8_type)
            .with_column("ip", inet_addr_type)
            .with_column("up", boolean_type)
+            .with_column("draining", boolean_type)
+            .with_column("excluded", boolean_type)
            .with_column("tablets_allocated", long_type)
            .with_column("tablets_allocated_per_shard", double_type)
            .with_column("storage_capacity", long_type)
+            .with_column("effective_capacity", long_type)
            .with_column("storage_allocated_load", long_type)
            .with_column("storage_allocated_utilization", double_type)
            .with_column("storage_load", long_type)
@@ -1332,8 +1348,8 @@ public:

 private:
    static schema_ptr build_schema() {
-        auto id = generate_legacy_id(system_keyspace::NAME, "cdc_timestamps");
-        return schema_builder(system_keyspace::NAME, "cdc_timestamps", std::make_optional(id))
+        auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS);
+        return schema_builder(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS, std::make_optional(id))
            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
            .with_column("table_name", utf8_type, column_kind::partition_key)
            .with_column("timestamp", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
@@ -1415,8 +1431,8 @@ public:
    }
 private:
    static schema_ptr build_schema() {
-        auto id = generate_legacy_id(system_keyspace::NAME, "cdc_streams");
-        return schema_builder(system_keyspace::NAME, "cdc_streams", std::make_optional(id))
+        auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_STREAMS);
+        return schema_builder(system_keyspace::NAME, system_keyspace::CDC_STREAMS, std::make_optional(id))
            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
            .with_column("table_name", utf8_type, column_kind::partition_key)
            .with_column("timestamp", timestamp_type, column_kind::clustering_key)
@@ -1484,7 +1500,7 @@ future<> initialize_virtual_tables(
    co_await add_table(std::make_unique<cdc_streams_table>(db, ss));

    db.find_column_family(system_keyspace::size_estimates()).set_virtual_reader(mutation_source(db::size_estimates::virtual_reader(db, sys_ks.local())));
-    db.find_column_family(system_keyspace::v3::views_builds_in_progress()).set_virtual_reader(mutation_source(db::view::build_progress_virtual_reader(db)));
+    db.find_column_family(system_keyspace::views_builds_in_progress()).set_virtual_reader(mutation_source(db::view::build_progress_virtual_reader(db)));
    db.find_column_family(system_keyspace::built_indexes()).set_virtual_reader(mutation_source(db::index::built_indexes_virtual_reader(db)));
 }

--- a/debug.cc
+++ b/debug.cc
@@ -11,5 +11,7 @@
 namespace debug {

 seastar::sharded<replica::database>* volatile the_database = nullptr;
+seastar::scheduling_group streaming_scheduling_group;
+seastar::scheduling_group gossip_scheduling_group;

 }
--- a/debug.hh
+++ b/debug.hh
@@ -17,7 +17,8 @@ class database;
 namespace debug {

 extern seastar::sharded<replica::database>* volatile the_database;
-
+extern seastar::scheduling_group streaming_scheduling_group;
+extern seastar::scheduling_group gossip_scheduling_group;

 }

--- a/Show More
+++ b/Show More