- Restore test_table_ts_lwt fixture with system:write_isolation=always to
explicitly test that the timestamp attribute is rejected in LWT_ALWAYS mode
- Add test_timestamp_attribute_lwt_always_rejected which verifies that even
a plain PutItem with a timestamp is rejected when always_use_lwt is set
- Keep test_timestamp_attribute_with_condition_rejected using test_table_ts
(with the test runner's default only_rmw_uses_lwt isolation) to test
that a ConditionExpression triggers LWT rejection
- Update docs: fix item 4 (non-numeric now rejected), improve Limitations
section to clearly state always_use_lwt is incompatible with the feature
and recommend system:write_isolation=only_rmw_uses_lwt
Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
- Use scope="module" instead of testpy_test_fixture_scope in fixtures
- Rename test_table_ts_sc to test_table_ts_ss (ss = string+string keys)
- Remove test_table_ts_lwt; use test_table_ts for LWT-rejection test
(the test server runs with only_rmw_uses_lwt, so conditions trigger LWT)
- Add comment that fixtures make tests implicitly Scylla-only
- Change non-numeric timestamp attribute behavior: reject with
ValidationException instead of silently storing (test + C++ implementation)
- Add test_timestamp_attribute_microseconds: verifies the timestamp unit
is microseconds and tests interaction with default server timestamps
- Add import time for the new microseconds test
Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
Recently we suffered a regression in how Alternator TTL behaves when a node goes down while tablets are in use.
Usually, expiration of data in a particular tablet is handled by that tablet's "primary replica". However, if that node is down, we want another node to perform these expirations until the primary replica comes back online. We created a function `tablet_map::get_secondary_replica()` to select that "other node". We don't care too much what the "secondary replica" means, but we do care that it's different from the primary replica - if it's the same, the expiration of that tablet will never be done.
It turns out that recently, in commits 817fdad and d88036d, the implementation of get_primary_replica() changed without a corresponding change to get_secondary_replica(). After those changes, the two functions are mismatched, and sometimes return the same node for both primary and secondary replica.
Unfortunately, although we had a dtest for the handling of a dead node in Alternator TTL, it failed to reproduce this bug, so this regression was missed - nothing else besides Alternator TTL ever used the get_secondary_replica() function.
So in this series, in addition to fixing the bug, we add two tests that reproduce it (they fail before the fix and pass with it):
1. A unit test that checks that get_secondary_replica() always returns a different node from get_primary_replica()
2. A cluster test based on the original dtest, which does reproduce this bug in Alternator TTL where some of the data was never expired (but only failed in release build, for an unknown reason).
Fixes SCYLLADB-777.
Closes scylladb/scylladb#28771
* github.com:scylladb/scylladb:
test: add unit test for tablet_map::get_secondary_replica()
test, alternator: add test for TTL expiration with a node down
locator: fix get_secondary_replica() to match get_primary_replica()
The current way of checking the Boost test's stdout has a race
condition: pytest may try to read the file before it has actually been
flushed. This PR eliminates that possibility.
Closes scylladb/scylladb#28783
PR #28703 was merged into master but not with the latest version of the
changes. This patch is an incremental fix for this.
Currently, the elements of the tablet_sizes_per_shard vector are
incremented in separate shards. This is prone to false sharing of cache
lines, and ping-pong of memory, which leads to reduced performance.
In this patch, in order to avoid cache line collisions while updating
the sum of tablet sizes per shard, we align the counter to 64 bytes.
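The aligned-counter layout can be sketched in isolation. This is a hedged illustration, not the actual ScyllaDB code; `padded_counter` and `fold_tablet_sizes` are made-up names:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Padding each shard's counter out to its own 64-byte cache line means
// increments on different shards never touch the same line, avoiding the
// false sharing / cache-line ping-pong described above.
struct alignas(64) padded_counter {
    uint64_t value = 0;
};

static_assert(sizeof(padded_counter) == 64, "one full cache line per counter");
static_assert(alignof(padded_counter) == 64, "line-aligned");

// Each shard bumps only its own slot; the totals are folded together once,
// after all shards have finished.
inline uint64_t fold_tablet_sizes(const std::vector<padded_counter>& per_shard) {
    uint64_t total = 0;
    for (const auto& c : per_shard) {
        total += c.value;
    }
    return total;
}
```

The cost is 56 wasted bytes per shard, which is negligible next to the cross-CPU cache-line traffic it avoids.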
Fixes: SCYLLADB-678
Closes scylladb/scylladb#28757
This PR awaits each of the tasks, waiting for the MV schema to be added successfully
before starting the server shutdown.
With this change we will no longer get the shutdown races.
Closes scylladb/scylladb#28774
This commit removes the information that Alternator doesn't support tablets.
The limitation is no longer valid.
Fixes SCYLLADB-778
Closes scylladb/scylladb#28781
`test_autoretrain_dict` sporadically fails because the default
compression algorithm was changed after the test was written.
`9ffa62a986815709d0a09c705d2d0caf64776249` was an attempt to fix it by
changing the compression configuration during node startup. However,
the configuration change had an incorrect YAML format and was
ignored by ScyllaDB. This commit fixes it.
Fixes: scylladb/scylladb#28204
Closes scylladb/scylladb#28746
For a while, we have seen coroutine related tests (those that use the
coroutine_task fixture) fail occasionally, because no coroutine frame is
found. Multiple attempts were made to make this problem self-diagnosing
and dump enough information to be able to debug this post-mortem. To no
avail so far. A lot of time was invested into this benign issue:
See the long discussion at https://github.com/scylladb/scylladb/issues/22501.
It is not known if the bug is in gdb, or the gdb script trying to find
the coroutine frame. In any case, both are only used for debugging, so
we can tolerate occasional failures -- we are forced to do so when
working with gdb anyway.
Instead of piling on more effort there, just skip these tests when the
problem occurs. This solves the CI flakiness.
Fixes: #22501
Closes scylladb/scylladb#28745
Add --continue-after-error true to perf-cql-raw and perf-alternator
tests, and --stop-on-error false to perf-simple-query test, so that
tests don't abort on the first error.
Reason for this is that tests are flaky with example failure:
Perf test failed: std::runtime_error (server returned ERROR to EXECUTE)
When CPU is starved on CI we can return timeouts and/or other errors.
The change should make the tests more robust at the expense of a smaller
test scope. But those tests were written mostly to test the startup
sequence, as it differs from Scylla's startup.
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-759
Closes scylladb/scylladb#28767
Lua doesn't have separate integer and floating point numbers,
so we check if a number can fit in an integer and if so convert
it to an integer.
The conversion routine invokes undefined behavior (and even
acknowledges it!). More recent compilers changed their behavior
when casting infinities, breaking test_user_function_double_return
which tests this conversion.
Fix by tightening the conversion to not invoke undefined behavior.
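A tightened conversion along these lines might look like the following standalone sketch. It is illustrative only; `lua_number_to_int64` and its exact policy are assumptions, not the actual Lua glue code:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>
#include <optional>

// Casting a double outside int64_t's range -- including infinities and
// NaN -- is undefined behavior, so we range-check first. 2^63 is exactly
// representable as a double, so both bounds below are exact.
inline std::optional<int64_t> lua_number_to_int64(double d) {
    constexpr double lo = -9223372036854775808.0; // -2^63, exact
    constexpr double hi = 9223372036854775808.0;  //  2^63, exact
    if (std::isnan(d) || d < lo || d >= hi) {
        return std::nullopt; // NaN, infinity, or out of range: keep as double
    }
    if (d != std::trunc(d)) {
        return std::nullopt; // fractional value: not convertible to integer
    }
    return static_cast<int64_t>(d); // in range and integral: cast is defined
}
```

Note the asymmetric bounds: `d >= hi` must be strict-exclusive because `2^63` itself is not representable in `int64_t`, while `-2^63` is.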
Closes scylladb/scylladb#28503
This patchset:
- ensures the loading semaphore is acquired in cross-shard callbacks
- fixes iterator invalidation problem when reloading all cached permissions
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-780
Backport: no, affected code not released yet
Closes scylladb/scylladb#28766
* github.com:scylladb/scylladb:
auth: cache: fix permissions iterator invalidation in reload_all_permissions
auth/cache: acquire _loading_sem in cross-shard callbacks
The hostent::addr_list field is deprecated in favor of the address_entry::addr
field, which contains the very same addresses.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28565
- add an overload to the rest http client to accept a retry strategy instance as an argument
- remove hand-rolled error handling from the object storage client and replace it with common machinery that supports handling and retrying when appropriate
No backport needed since it is only refactoring
Closes scylladb/scylladb#28161
* github.com:scylladb/scylladb:
object_storage: add retryable machinery to object storage
rest_client: add `simple_send` overload
This patch adds a unit test for tablet_map::get_secondary_replica().
It was never officially defined how the "primary" and "secondary"
replicas were chosen, and their implementation changed over time,
but the one invariant that this test verifies is that the secondary
replica and the primary replica must be a different node.
This test reproduces issue SCYLLADB-777, where we discovered that
the get_primary_replica() changed without a corresponding change to
get_secondary_replica(). So before the previous patch, this test failed,
and after the previous patch - it passes.
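The invariant can be illustrated with a toy sketch. The selection rules below are hypothetical, not tablet_map's actual logic; the point is only that, however the two are chosen, they must differ (assuming distinct replicas and RF >= 2):

```cpp
#include <cassert>
#include <vector>

using host_id = int; // stand-in for the real host identifier type

inline host_id get_primary_replica(const std::vector<host_id>& replicas) {
    return replicas[0]; // hypothetical: the first replica
}

inline host_id get_secondary_replica(const std::vector<host_id>& replicas) {
    // Hypothetical: the replica right after the primary. Any rule works,
    // as long as it can never coincide with the primary.
    return replicas[1 % replicas.size()];
}
```

A unit test over many replica sets then simply asserts `get_primary_replica(rs) != get_secondary_replica(rs)` for each, which is exactly the property SCYLLADB-777 violated.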
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
We have many single-node functional tests for Alternator TTL in
test/alternator/test_ttl.py. This patch adds a multi-node test in
test/cluster/test_alternator.py. The new test verifies that:
1. Even though Alternator TTL splits the work of scanning and expiring
items between nodes, all the items get correctly expired.
2. When one node is down, all the items still expire because the
"secondary" owner of each token range takes over expiring the
items in this range while the "primary" owner is down.
This new test is actually a port of a test we already had in dtest
(alternator_ttl_tests.py::test_multinode_expiration). This port is
faster and smaller than the original (fewer nodes, fewer rows), but it
still found a regression (SCYLLADB-777) that dtest missed - the new test
failed when running with tablets and in release build mode.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The function tablet_map::get_secondary_replica() is used by Alternator
TTL to choose a node different from get_primary_replica(). Unfortunately,
recently (commits 817fdad and d88037d) the implementation of the latter
function changed, without changing the former. So this patch changes
the former to match.
The next two patches will have two tests that fail before this patch,
and pass with it:
1. A unit test that checks that get_secondary_replica() returns a
different node than get_primary_replica().
2. An Alternator TTL test that checks that when a node is down,
expirations still happen because the secondary replica takes over
the primary replica's work.
Fixes SCYLLADB-777
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Move the storage test suite from test/storage/ to test/cluster/storage/
to consolidate related cluster-based tests. This removes the standalone
test/storage/suite.yaml as the tests will use the cluster's test configuration.
Initially these tests were in test/cluster/, but in the first iteration they
were moved out in order to use unshare. Now that they handle volumes in
another way, without unshare, they should be back in test/cluster/.
Closes scylladb/scylladb#28634
ANN vector queries with all-zero vectors are allowed even on vector indexes with the similarity function set to cosine.
When enabling the rescoring option, those queries would fail because the rescoring calls the `similarity_cosine` function underneath, causing an `InvalidRequest` exception, as all-zero vectors were not allowed, matching Cassandra's behaviour.
To eliminate the discrepancy we want the all-zero vector `similarity_cosine` calls to pass, but return NaN, as the cosine similarity of zero vectors is mathematically undefined. We decided not to use arbitrary values, contrary to USearch, for which the distance (not to be confused with similarity) is defined as cos(0, 0) = 0, cos(0, x) = 1, while supporting the range of values [0, 2].
If we wanted to convert that to a similarity, that would mean sim_cos(0, x) = 0.5, which lacks any mathematical justification for why that would be more similar than, for example, vectors forming obtuse angles.
It's safe to assume that all-zero vectors shouldn't make any impact on cosine similarity, therefore we return NaN and eliminate them from the best results.
Adjusted the tests accordingly to check both the proper Cassandra behaviour and Scylla's.
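A minimal sketch of the intended behavior (not the actual `similarity_cosine` implementation, and ignoring any remapping of the raw cosine to a similarity range):

```cpp
#include <cassert>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

// For an all-zero vector the norm is 0 and the cosine is undefined, so we
// return NaN instead of picking an arbitrary value; NaN then naturally
// drops such vectors out of the best results.
inline float similarity_cosine(const std::vector<float>& a, const std::vector<float>& b) {
    float dot = 0, norm_a = 0, norm_b = 0;
    for (size_t i = 0; i < a.size(); ++i) {
        dot += a[i] * b[i];
        norm_a += a[i] * a[i];
        norm_b += b[i] * b[i];
    }
    if (norm_a == 0 || norm_b == 0) {
        return std::numeric_limits<float>::quiet_NaN(); // undefined: all-zero input
    }
    return dot / (std::sqrt(norm_a) * std::sqrt(norm_b));
}
```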
Fixes: SCYLLADB-456
Backport to 2026.1 needed, as it fixes the bug for ANN vector queries using rescoring introduced there.
Closes scylladb/scylladb#28609
* github.com:scylladb/scylladb:
test/vector_search: add reproducer for rescoring with zero vectors
vector_search: return NaN for similarity_cosine with all-zero vectors
This patch series moves `test/cluster/dtest/guardrails_test.py`
to `test/cluster/test_guardrails.py`, and migrates it from `cluster/dtest/`
to `cluster/` framework.
There are two motivations for moving the test:
- Execution time reduction (from 12s to 9s in 'dev' in my env)
- Facilitate adding new tests to the `guardrails_test.py` file
No backport, `dtest/guardrails_test.py` is only on master
Closes scylladb/scylladb#28737
* github.com:scylladb/scylladb:
test: move dtest/guardrails_test.py to test_guardrails.py
test: prepare guardrails_test.py to be moved to test/cluster/
The inner loops in reload_all_permissions iterate role's permissions
and _anonymous_permissions maps across yield points. Concurrent
load_permissions calls (which don't hold _loading_sem) can emplace
into those same maps during a yield, potentially triggering a rehash
that invalidates the active iterator.
We want to avoid adding semaphore acquire in load_permissions
because it's on a common path (get_permissions).
Fix by snapshotting the keys into a vector before iterating with
yields, so no long-lived map iterator is held across suspension
points.
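The snapshot-the-keys pattern can be sketched with a plain map. This is illustrative, not the auth cache code; `reload_one` stands in for the per-entry work done across yields:

```cpp
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

// Instead of holding a map iterator across suspension points (where a
// concurrent emplace can trigger a rehash and invalidate it), snapshot
// the keys up front -- with no yields, so that short-lived iterator is
// safe -- then look each key up again.
template <typename Map, typename Fn>
void reload_all(Map& map, Fn reload_one) {
    std::vector<typename Map::key_type> keys;
    keys.reserve(map.size());
    for (const auto& entry : map) { // no yields here: iterator stays valid
        keys.push_back(entry.first);
    }
    for (const auto& k : keys) {
        auto it = map.find(k);      // re-lookup: the entry may be gone by now
        if (it != map.end()) {
            reload_one(it->second); // yields (and concurrent inserts) may happen here
        }
    }
}
```

Note that values inserted concurrently are simply not reloaded this round, which is the same semantics a racing insert had before.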
This series hardens MV shutdown behavior by fixing lifecycle tracking for detached view-builder callbacks and aligning update handling with the same async dispatch style used by create/drop.
Patch 1 refactors on_update_view to use a dedicated coroutine dispatcher (dispatch_update_view), keeping update logic serialized under the existing view-builder lock and consistent with the callback architecture already used for create/drop paths.
Patch 2 adds explicit callback lifetime coordination in view_builder:
- introduce a seastar::gate member
- acquire _ops_gate.hold() when launching detached create/update/drop dispatch futures
- keep the hold alive until each detached future resolves
- close the gate during view_builder::drain() so shutdown waits for in-flight callback work before final teardown
Together, these changes reduce shutdown race exposure in MV event handling while preserving existing behavior for normal operation.
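The gate pattern can be illustrated with a single-threaded toy analogue. Seastar's real gate is asynchronous and `close()` waits; `toy_gate` below is a made-up stand-in that only counts holds:

```cpp
#include <cassert>
#include <stdexcept>

// Each detached callback takes a holder before launching; shutdown may
// only proceed once every holder has been released.
class toy_gate {
    int _held = 0;
    bool _closed = false;
public:
    class holder {
        toy_gate* _g;
    public:
        explicit holder(toy_gate& g) : _g(&g) {
            if (g._closed) {
                throw std::runtime_error("gate closed"); // no new work after shutdown
            }
            ++g._held;
        }
        ~holder() { --_g->_held; }
        holder(const holder&) = delete;
        holder& operator=(const holder&) = delete;
    };
    holder hold() { return holder(*this); }
    // In seastar, close() waits for in-flight holds; this toy just reports
    // whether closing is possible yet.
    bool try_close() {
        if (_held > 0) {
            return false; // detached work still in flight: must wait
        }
        _closed = true;
        return true;
    }
    bool closed() const { return _closed; }
};
```

This mirrors the shape of the fix: `_ops_gate.hold()` around each detached dispatch future, and closing the gate in `drain()` before final teardown.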
Testing:
- pytest --test-py-init test/cluster/mv (47 passed, 7 skipped)
backport: not required, started happening in master
fixes: SCYLLADB-687
Closes scylladb/scylladb#28648
* github.com:scylladb/scylladb:
db/view: gate detached view-builder callbacks during shutdown
db:view: refactor on_update_view to use coroutine dispatcher
distribute_role() modifies _roles on non-zero shards via
invoke_on_others() without holding _loading_sem. Similarly, load_all()'s
invoke_on_others() callback calls prune_all() without the semaphore.
When these run concurrently with reload_all_permissions(), which
iterates _roles across yield points, an insertion can trigger
absl::flat_hash_map::resize(), freeing the backing storage while
an iterator still references it.
Fix by acquiring _loading_sem on the target shard in both
distribute_role()'s and load_all()'s invoke_on_others callbacks,
serializing all _roles mutations with coroutines that iterate
the map.
Remove hand-rolled error handling from the object storage client
and replace it with common machinery that supports exception
handling and retrying when appropriate.
The test uses create_ks_and_cf helper duplicating the existing code that does the same. This PR patches basic tests to use standard facilities. Also it prepares the ground for testing keyspace storage options with rf=3
Cleaning tests, not backporting
Closes scylladb/scylladb#28600
* https://github.com/scylladb/scylladb:
test/object_store: Remove create_ks_and_cf() helper
test/object_store: Replace create_ks_and_cf() usage with standard methods
test/object_store: Shift indentation right for test cases
Currently, test_secondary_index.py::test_indexing_paging_and_aggregation
is very slow, and the slowest test in the test/cqlpy framework: It takes
around 13 seconds on dev build, and because it is CPU-bound (doesn't sleep),
it is much slower on debug builds. The reason for this slowness is that it
needs to set up and read over 10,000 rows which is the default
select_internal_page_size.
But after the patches in pull request #25368, we can configure
select_internal_page_size, so in this patch we change the test to
temporarily reduce this option to just 50, and then the test can reach
the same code paths with just 142 rows instead of 20120 rows before this
patch.
As a result, the test should now be 140 times faster than it was before.
In practice, because of some fixed overheads (the test creates several
tables and indexes), in dev build mode the test run speedup is "only"
26-fold (to around half a second).
I verified that removing the code added in bb08af7 indeed makes the new
shorter test fail - and this is the only test in test_secondary_index.py
that starts to fail besides test_index_paging_group_by which is also
related (so my revert didn't just break secondary indexing completely).
So the shorter test is still a good regression test.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#28268
The future toolchain did not build the sanitizers, so debug
executables did not link. Fix by not disabling the sanitizers.
Closes scylladb/scylladb#28733
The test_restore_with_streaming_scopes test, among other things, checks how data streams flow while restoring. Whether or not to check the streams is decided based on the min tablet count value, which is compared with a hardcoded 512. This value of 512 matched the tablet count used by this test until it was "optimized" by #27839, where the number changed to 5 and the streaming checks were effectively turned off.
The good news is that the very same checks are still performed by test_refresh_with_streaming_scopes. But it's better to have a working restoration test anyway.
Minor test fix, not backporting
Closes scylladb/scylladb#28607
* github.com:scylladb/scylladb:
test: Fix the condition for streaming directions validation
test: Split test_backup.py::check_data_is_back() into two
Currently, the test assumes that when
'topology_coordinator_pause_before_processing_backlog: waiting' is
logged, the task for decommission must be there. This was based on the
assumption that topology coordinator is idle and decommission request
wakes it up. But if the server is slow enough, it may still be running
the load balancer in reaction to table creation, and block on that
injection point before the decommission request was added.
Fix by waiting for the task to appear rather than the injection.
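The wait-for-the-task approach can be sketched as a generic condition wait with exponential backoff (a hedged C++ analogue; the actual helpers, wait_task_appears() and pylib's wait_for, live in the Python test infrastructure):

```cpp
#include <cassert>
#include <chrono>
#include <thread>

// Poll a condition until it holds or the deadline passes, doubling the
// delay between polls (exponential backoff) so slow servers are given
// time without hammering them.
template <typename Cond>
bool wait_for(Cond cond, std::chrono::milliseconds timeout,
              std::chrono::milliseconds first_delay = std::chrono::milliseconds(1)) {
    auto deadline = std::chrono::steady_clock::now() + timeout;
    auto delay = first_delay;
    while (!cond()) {
        if (std::chrono::steady_clock::now() >= deadline) {
            return false; // e.g. the decommission task never appeared
        }
        std::this_thread::sleep_for(delay);
        delay *= 2; // back off: poll less aggressively as time goes on
    }
    return true;
}
```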
Fixes SCYLLADB-715
Only 2026.1 is affected.
Closes scylladb/scylladb#28688
* github.com:scylladb/scylladb:
test_tablets_parallel_decommission: Fix flakiness due to delayed task appearance
test: cluster: task_manager_client: Introduce wait_task_appears()
tests: pylib: util: Add exponential backoff to wait_for
There's a bunch of incremental repair tests that want to call the scylla
sstable command. For that, they try to find the scylla binary by
scanning the /proc directory (see the local_process_id and get_scylla_path
helpers).
There's a shorter way -- just call manager.get_server_exe().
Same for backup-restore test.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28676
There are three tests and a function with a pair of boolean parameters
called by them. It's less code if the function itself becomes a test with
parameters.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28677
The test_backup_simple creates a ks/cf, takes a snapshot, backs it up,
then checks that the files were uploaded. The test_backup_move does the
same, but also plays with 'move_files' parameter to be true/false.
In fact, the "move" test was a copy of the "simple" one that dropped the
check for the scheduling group being "streaming" (backup with --move-files
can check the same thing, that's fine), and the check that the destination
bucket contains the needed files (same here -- checking that files arrived
in the bucket after --move-files is good).
At the end of the day, after the change the backup test is run two times
instead of three, and performs extra checks for the --move-files case.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28606
https://github.com/scylladb/scylladb/pull/25746 added a new column to `system.clients`: `client_options frozen<map<text, text>>`. This column stores all options sent by the client in the `STARTUP` message.
This PR also added `CLIENT_OPTIONS` to the list of values sent in `SUPPORTED` message, and documented that drivers can send their configuration (as JSON) in `STARTUP` under this key.
Documentation for the new column was not added to the description of `system.clients` table, and documentation about the new `STARTUP` key was added in `protocol-extensions.md`, but in the section about shard awareness extension.
This PR adds missing `system.clients` column description, moves the documentation of `CLIENT_OPTIONS` into its own section, and expands it a bit.
Backport: none, because this fixes internal documentation.
Closes scylladb/scylladb#28126
* github.com:scylladb/scylladb:
protocol-extensions.md: Fix client_options docs
system_keyspace.md: Add client_options column
system_keyspace.md: Fix order in system.clients
Doing it with format("{}", foo) is correct, but to_string is
a bit more lightweight.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28630
The `try-catch` expression is pretty much useless in its current form. If we return the future, the awaiting will only be performed by the caller, completely circumventing the exception handling.
As a result, instead of handling `raft::request_aborted` with a proper error message, the user will face `seastar::abort_requested_exception` whose message is cryptic at best. It doesn't even point to the root of the problem.
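The difference can be reproduced with a toy, non-seastar stand-in for a future (`lazy_future` is made up; in the actual fix the await is a `co_await` on the raft server's future):

```cpp
#include <cassert>
#include <functional>
#include <stdexcept>
#include <string>

// The wrapped work runs only when get() is called, so exceptions surface
// at the await point, not at construction -- the same property that makes
// `try { return some_future(); }` useless.
struct lazy_future {
    std::function<int()> task;
    int get() { return task(); } // "awaiting" the future
};

inline lazy_future failing_op() {
    return lazy_future{[]() -> int { throw std::runtime_error("boom"); }};
}

// Broken pattern: returning the future from inside `try` never runs the
// handler; the caller later hits the raw exception with a cryptic message.
inline lazy_future wait_broken() {
    try {
        return failing_op();
    } catch (const std::exception&) {
        // dead code: nothing has thrown yet at this point
        throw std::runtime_error("wait aborted (never reached)");
    }
}

// Fixed pattern: await inside `try` (co_await in the real coroutine), so
// the handler actually runs and can rethrow with a proper error message.
inline int wait_fixed() {
    try {
        return failing_op().get();
    } catch (const std::exception& e) {
        throw std::runtime_error(std::string("wait aborted: ") + e.what());
    }
}
```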
Fixes SCYLLADB-665
Backport: This is a small improvement and may help when debugging, so let's backport it to all supported versions.
Closes scylladb/scylladb#28624
* https://github.com/scylladb/scylladb:
test: raft: Add test_aborting_wait_for_state_change
raft: Describe exception types for wait_for_state_change and wait_for_leader
raft: Await instead of returning future in wait_for_state_change
This commit moves `guardrails_test.py`, prepared in the previous
commit of this patch series, to `test/cluster/test_guardrails.py`.
It also cleans up `suite.yaml`.
Disable `test/cluster/dtest/guardrails_test.py` in `suite.yaml` and
make it compatible with the `test/cluster/` framework. This will
allow moving this file from `test/cluster/dtest/` to `test/cluster/`
in the next commit of this patch series.
There are two motivations for moving the test:
- Execution time reduction (from 12s to 9s in 'dev' in my env)
- Facilitate adding new tests to the `guardrails_test.py` file
There are 3 metrics (that go into every compaction_history entry):
total_tombstone_purge_attempt
total_tombstone_purge_failure_due_to_overlapping_with_memtable
total_tombstone_purge_failure_due_to_overlapping_with_uncompacting_sstable
When a tombstone is not expired (e.g. it doesn't satisfy "gc_before" or
the grace period), it can currently be accounted as a failure due to
overlapping with either a memtable or an uncompacting sstable.
So those 2 last metrics have noise from *unexpired* tombstones.
What we should do is only account for expired tombstones in all
those 3 metrics. We lose the information about the number of tombstones
processed by compaction; now we'll only know about the expired ones.
But those metrics were primarily added for explaining why expired
tombstones cannot be removed.
We could have alternatively added a new field,
purge_failure_due_to_being_unexpired or something, but that
would require adding a new field to compaction_history.
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-737.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes scylladb/scylladb#28669
Links were pointing to the `debian` subdirectory. However, the docker build was refactored to use `redhat`: 1abf981a73, see https://github.com/scylladb/scylladb/pull/22910
No backport, just README link fixes.
Closes scylladb/scylladb#28699
* github.com:scylladb/scylladb:
docs: fix path to the build_docker.sh which was moved from debian to redhat subdirectory
docs: fix link to docker build README.MD
The patchset fixes the abort_source implementation for perf-alternator and perf-cql-raw. It moves
the run_standalone function to common code in perf.hh with the necessary templating.
We also add extensive testing so that it's more difficult to break the tooling in the future.
Fixes SCYLLADB-560
Backport: no, internal tooling improvement
Closes scylladb/scylladb#28541
* github.com:scylladb/scylladb:
test: cluster: add tests for perf tools
test: perf: fix port race condition on startup in connect workload
test: perf: prepare benchmarks to bind to custom host
test: perf: make perf-alterantor remote port configurable
test: perf: fix ASAN leak warnings in perf-alternator
Reapply "main: test: add future and abort_source to after_init_func"
Some assertions in the Raft-based topology are likely to cause crashes of
multiple nodes due to the consistent nature of the Raft-based code. If the
failing assertion is executed in the code run by each follower (e.g., the code
reloading the in-memory topology state machine), then all nodes can crash. If
the failing assertion is executed only by the leader (e.g., the topology
coordinator fiber), then multiple consecutive group0 leaders will chain-crash
until there is no group0 majority.
Crashing multiple nodes is much more severe than necessary. It's enough to
prevent the topology state machine from making more progress. This will
naturally happen after throwing a runtime error. The problematic fiber will be
killed or will keep failing in a loop. Note that it should be safe to block
the topology state machine, but not the whole group0, as the topology state
machine is mostly isolated from the rest of group0.
We replace some occurrences of `on_fatal_internal_error` and `SCYLLA_ASSERT`
with `on_internal_error`. These are not all occurrences, as some fatal
assertions make sense, for example, in the bootstrap procedure.
We also raise an internal error to prevent a segmentation fault in a few places.
Fixes #27987
Backporting this PR is not required, but we can consider it at least for 2026.1
because:
- it is LTS,
- the changes are low-risk,
- there shouldn't be many conflicts.
Closes scylladb/scylladb#28558
* github.com:scylladb/scylladb:
raft topology: prevent accessing nullptr returned by topology::find
raft topology: make some assertions non-crashing
In https://github.com/scylladb/scylladb/pull/27262 table audit has been
re-enabled by default in `scylla.yaml`, logging certain categories to a table,
which should make new Scylla deployments have audit enabled.
Now, in the next release, we also want to enable audit in `db/config.cc`,
which should enable audit for all deployments that don't explicitly configure
audit otherwise in `scylla.yaml` (or via the command line).
BTW. Because this commit aligns audit's default config values in `db/config.cc`
to those of `scylla.yaml`, `docs/reference/configuration-parameters.rst`, which
is based on `db/config.cc` will start showing that table audit is the default.
Refs: https://github.com/scylladb/scylladb/issues/28355
Refs: https://scylladb.atlassian.net/browse/SCYLLADB-222
No backport: table audit has been enabled in 2026.1 in `scylla.yaml`,
and should be always on starting from the next release,
which is the release we're currently merging to (2026.2).
Closes scylladb/scylladb#28376
* github.com:scylladb/scylladb:
docs: decommission: note audit ks may require ALTERing
docs: mention table audit enabled by default
audit: disable DDL by default
db/config: enable table audit by default
test/cluster: fix `test_table_desc_read_barrier` assertion
test/cluster: adjust audit in tests involving decommissioning its ks
audit_test: fix incorrect config in `test_audit_type_none`
Compaction and statement groups are carried over in those configs, but
are in fact unused. Drop both.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28540
There are four tests that check how restore with the primary-replica-only option works in various scopes and topologies. Cases that check same-racks and same-datacenters are very similar, and so are those that check different-racks and different-datacenters. Parametrizing and merging them saves lots of code (+30 lines, -116 lines).
It's probably worth merging the resulting same-domain with different-domain tests, because the similarity is still large in both, but the result becomes too if-y, so not done here. Maybe later.
Improving tests, not backporting
Closes scylladb/scylladb#28569
* https://github.com/scylladb/scylladb:
test: Merge test_restore_primary_replica_different_... tests
test: Merge test_restore_primary_replica_same_... tests
test: Don't specify expected_replicas in test_restore_primary_replica_different_dc_scope_all
test: Remove local r_servers variable from test_restore_primary_replica_different_dc_scope_all
Fix the build of the test and the upload operation flow
No need to backport since it is only a test we barely use
Closes scylladb/scylladb#28595
* github.com:scylladb/scylladb:
s3_perf: fix upload operation flow
s3_perf: fix the CMake build
Tablet migration keeps an sstable snapshot during streaming, which may
cause a temporary increase in disk utilization if compaction is running
concurrently. SSTables compacted away are kept on disk until streaming
is done with them. The more tablets we allow to migrate concurrently,
the higher disk space can rise. When the target tablet size is
configured correctly, every tablet should own about 1% of disk
space. So concurrency of 4 shouldn't put us at risk. But target tablet
size is not chosen dynamically yet, and it may not be aligned with
disk capacity.
Also, tablet sizes can temporarily grow above the target, up to 2x
before the split starts, and some more because splits take a while to
complete.
To reduce the impact from this, reduce concurrency of
migration. Concurrency of 2 should still be enough to saturate
resources on the leaving shard.
Also, reducing concurrency means that load balancing is more
responsive to preemption. There will be less bandwidth sharing, so
scheduled migrations complete faster. This is important for scale-out,
where we bootstrap a node and want to start migrations to that new
node as soon as possible.
Refs scylladb/siren#15317
Closes scylladb/scylladb#28563
* github.com:scylladb/scylladb:
tablets, config: Reduce migration concurrency to 2
tablets: load_balancer: Always accept migration if the load is 0
config, tablets: Make tablet migration concurrency configurable
The methods of `raft::server` are abortable and if the passed
`abort_source` is triggered, they throw `raft::request_aborted`.
We document that.
Although `raft::server` is an interface, this is consistent with
the descriptions of its other methods.
The `try-catch` expression is pretty much useless in its current form.
If we return the future, the awaiting will only be performed by the
caller, completely circumventing the exception handling.
As a result, instead of handling `raft::request_aborted` with a proper
error message, the user will face `seastar::abort_requested_exception`
whose message is cryptic at best. It doesn't even point to the root
of the problem.
Fixes SCYLLADB-665
Due to a lack of checks in process_execute_internal in
transport/server.cc, the needs_authorization bool was always set to true,
doing some extra work (check_access()) for each request.
In this patch we mirror that logic in the test env which perf-simple-query
uses. This can also potentially improve the runtime of unit tests (marginally).
Note that bug is only in perf tool not scylla itself, the fix
decreases insns/op by around 10%:
Before: 41065 insns/op
After: 37452 insns/op
Command: ./build/release/scylla perf-simple-query --duration 5 --smp 1
Fixes https://github.com/scylladb/scylladb/issues/27941
Closes scylladb/scylladb#28704
Using an outdated image can cause problems when `microdnf update`
runs, if the distribution doesn't maintain good update hygiene.
That said, I suspect that when update failures happen, they're really
caused by propagation delay of packages to mirrors.
Fix by using --pull=always to get a fresh image.
Ref https://scylladb.atlassian.net/browse/SCYLLADB-714
Closes scylladb/scylladb#28680
In storage_service::load_stats_for_tablet_based_tables(), we are passing
a reference to sum_tablet_sizes to the lambda which increments this value
on each shard via map_reduce0(). This means we could have a race
condition because this is executed on separate threads/CPUs.
This patch fixes the problem by collecting the sums by shard into a
vector, then summing those up.
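The fix described above can be sketched as a Python toy model (not the actual Seastar map_reduce0 code):

```python
# Toy model (Python, not the actual Seastar code) of the fix: each shard
# computes its own partial sum, and the caller reduces the per-shard
# results afterwards, instead of shards racing on one shared accumulator.
per_shard_sizes = [[10, 20], [5], [7, 8]]  # tablet sizes observed on each shard

# one partial result per shard -- no shared mutable state between "shards"
partials = [sum(sizes) for sizes in per_shard_sizes]

# single-threaded reduce step, free of races
total = sum(partials)
print(total)  # 50
```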
Refs: SCYLLADB-678
Closes scylladb/scylladb#28703
interval_data's move constructor is conditionally noexcept. It
contains a throw statement for the case where the underlying type's
move constructor can throw; that throw statement is never executed
if we're in the noexcept branch. Clang 23, however, doesn't understand
that, and warns about throwing in a noexcept function.
Fix that by rewriting the logic using seastar::defer(). In the
noexcept case, the optimizer should eliminate it as dead code.
Closes scylladb/scylladb#28710
Correct the upload operation logic. The previous flow incorrectly
checked for the test file on S3 even when performing operations that do
not download the file, such as uploads.
Remove bootstrap and decommission from allowed_repair_based_node_ops.
Using RBNO over streaming for these operations has no benefits, as they
are not exposed to the out-of-date replica problem that replace,
removenode and rebuild are.
On top of that, RBNO is known to have problems with empty user tables.
Using streaming for bootstrap and decommission is safe and faster
than RBNO in all conditions, especially when the table is small.
One test needs adjustment as it relies on RBNO being used for all node
ops.
Fixes: SCYLLADB-105
Closes scylladb/scylladb#28080
It checks that all workloads can be properly
executed with successful startup and teardown.
Especially testing alternator in remote mode is important
because it's invoked like this during pgo training in pgo.py.
Test runtime:
Release - 24s
Debug - 1m 15s
Test time consists mostly of Scylla startup in various modes.
Other workloads at startup call prepopulate() which connects
with retry loop therefore it waits until cql port is open.
This commit adds a single place where we will wait for port
for all workloads.
Timeout is set to 5 minutes so that even slowest machines
are able to start.
There are a handful of places in the code related to dictionary
compression that call get_units to acquire semaphore units but never
await the returned future, seemingly by mistake. The result of
get_units is assigned to a variable - which is reasonable at a glance
because the semaphore units need to be assigned to a variable in order
to control their scope - but at the same time if co_await is mistakenly
omitted, like here, doing so will silence the nodiscard check of
seastar::future and, effectively, the get_units call will be nearly
useless. Unfortunately, this is an easy mistake to make.
Fix the places in the code that acquire semaphore units via get_units
but never await the future returned by it. I found them by manual code
inspection, so I hope that I didn't miss any.
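The mistake is easiest to see in a Python asyncio analog (not the actual Seastar/C++ code): creating the acquire awaitable without awaiting it leaves the semaphore count untouched, just like an un-awaited get_units() future.

```python
import asyncio

async def forgot_await(sem):
    pending = sem.acquire()  # BUG analog: missing "await" -- no unit is taken
    pending.close()          # silence the "coroutine was never awaited" warning
    return sem._value        # peek at the internal counter for illustration

async def did_await(sem):
    await sem.acquire()      # the unit is actually taken
    return sem._value

sem = asyncio.Semaphore(2)
print(asyncio.run(forgot_await(sem)))  # 2 -- nothing was acquired
print(asyncio.run(did_await(sem)))     # 1 -- one unit is now held
```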
Closes scylladb/scylladb#28581
With audit feature enabled, it's not immediately obvious that its
pseudo-system keyspace `audit` may require adjusting its RF across DCs
before decommissioning a node, and this should be documented.
DDL audit category doesn't make sense if it's enabled by default on its
own, as no DDL statements are going to be audited if audit_keyspaces/audit_tables
setting is empty. This may be counter-intuitive to our users, who may
expect to actually see these statements logged if we're enabling this by
default. Also, it doesn't make sense to enable a setting by default if
it has no effect.
Additionally, listed all possible audit categories for user's
convenience.
In https://github.com/scylladb/scylladb/pull/27262 table audit has been
re-enabled by default in `scylla.yaml`, logging certain categories to a table,
which should make new Scylla deployments have audit enabled.
Now, in the next release, we also want to enable audit in `db/config.cc`,
which should enable audit for all deployments that don't explicitly configure
audit otherwise in `scylla.yaml` (or via cmd line).
BTW. Because this commit aligns audit's default config values in `db/config.cc`
to those of `scylla.yaml`, `docs/reference/configuration-parameters.rst`, which
is based on `db/config.cc` will start showing that table audit is the default.
Refs: https://github.com/scylladb/scylladb/issues/28355
Refs: https://scylladb.atlassian.net/browse/SCYLLADB-222
The test `assertion desc_schema[0] == desc_schema[1]` does a direct
list comparison, which is order-sensitive. Before enabling audit by default,
both nodes would return only the test keyspace/table, so the order
didn't matter. With audit enabled, there will be multiple keyspaces,
and they can be returned in different order by different nodes.
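The needed change can be sketched with hypothetical data (not the actual test): compare the per-node schema descriptions order-insensitively.

```python
# With audit enabled, nodes can return keyspaces in different orders,
# so the schema descriptions must be compared order-insensitively.
desc_schema = [
    ["audit", "test_ks"],   # node 1
    ["test_ks", "audit"],   # node 2: same content, different order
]
# direct list comparison is order-sensitive and spuriously fails:
print(desc_schema[0] == desc_schema[1])                  # False
# comparing sorted copies is robust to per-node ordering:
print(sorted(desc_schema[0]) == sorted(desc_schema[1]))  # True
```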
When table audit is enabled, Scylla creates the "audit" ks with
NetworkTopologyStrategy and RF=3. During node decommission, streaming can fail
for the audit ks with "zero replica after the removal" when all nodes from a DC
are removed, so we have to ALTER the audit ks to either zero the number of its
replicas, allowing a clean decommission, or place them in the 2nd DC.
BTW. https://github.com/scylladb/scylladb/issues/27395 is the same change, but
in dtests repository.
Passing Python `None` to setup is incorrect, because config updates are sent
as a dict and `None` is treated as "unset" - meaning: use Scylla's default.
Using the explicit string "none" to guarantee that audit is disabled.
There is no point running repair for tables with RF=1. Row-level
repair will skip it, but the auto repair scheduler will keep scheduling
such repairs since repair_time cannot be updated.
Skip such repairs at the scheduler level for auto repair.
If the request is issued by a user, we still have to schedule such a
repair; otherwise the user request would never finish.
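The scheduler-level rule can be sketched as follows (names invented for illustration, not the actual code): auto repairs for RF=1 tables are skipped, while user-issued repairs are still scheduled.

```python
def should_schedule_repair(rf: int, user_requested: bool) -> bool:
    if rf <= 1 and not user_requested:
        return False  # nothing to repair against; skip at scheduler level
    return True

print(should_schedule_repair(1, False))  # False -- auto repair skipped
print(should_schedule_repair(1, True))   # True  -- user request must finish
print(should_schedule_repair(3, False))  # True  -- normal auto repair
```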
Fixes SCYLLADB-561
Closes scylladb/scylladb#28640
This commit introduces four changes:
- In the `table` example, singular forms (node, partition) are changed to
plural forms (nodes, partitions). Currently, the default `table`
audit configuration is RF=3 and writes use CL=ONE. Therefore,
a `table` audit log write failure should not be caused by a single
node unavailability, and plural forms are more adequate.
- In the `table` example, unreachability due to network issues is
mentioned because with RF=3, audit failure due to network problems
is more likely to happen than a simultaneous failure of three
nodes (such network failures happened in SCYLLADB-706).
- In the `syslog` example, a slash `/` is changed to `or`, so `table`
and `syslog` examples have similar structure.
- As the `syslog` line is already being changed, I also change `unix`
to `Unix`, as the capitalized form is the correct one.
Refs SCYLLADB-706
Closes scylladb/scylladb#28702
The connection's `cpu_concurrency_t` struct tracks the state of a connection
to manage the admission of new requests and prevent CPU overload during
connection storms. When a connection holds units (allowed only 0 or 1), it is
considered to be in the "CPU state" and contributes to the concurrency limits
used when accepting new connections.
The bug stems from the fact that `counted_data_source_impl::get` and
`counted_data_sink_impl::put` calls can interleave during execution. This
occurs because of `should_parallelize` and `_ready_to_respond`, the latter being
a future chain that can run in the background while requests are being read.
Consequently, while reading request (N), the system may concurrently be
writing the response for request (N-1) on the same connection.
This interleaving allows `return_all()` to be called twice before the
subsequent `consume_units()` is invoked. While the second `return_all()` call
correctly returns 0 units, the matching `consume_units()` call would
mistakenly take an extra unit from the semaphore. Over time, a connection
blocked on a read operation could end up holding an unreturned semaphore
unit. If this pattern repeats across multiple connections, the semaphore
units are eventually depleted, preventing the server from accepting any
new connections.
The fix ensures that we always consume the exact number of units that were
previously returned. With this change, interleaved operations behave as
follows:
get() return_all — returns 1 unit
put() return_all — returns 0 units
get() consume_units — takes back 1 unit
put() consume_units — takes back 0 units
Logically, the networking phase ends when the first network operation
concludes. But more importantly, when a network operation
starts, we no longer hold any units.
Other solutions are possible but the chosen one seems to be the
simplest and safest to backport.
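The balanced accounting above can be modeled with a Python toy class (not Scylla's C++): each consume_units() reclaims exactly what its matching return_all() released, so interleaved get()/put() calls cannot leak a unit.

```python
class UnitTracker:
    def __init__(self):
        self.held = 1       # connection starts holding one unit
        self.available = 0  # units currently back in the semaphore

    def return_all(self):
        released = self.held  # 0 on the second interleaved call
        self.available += released
        self.held = 0
        return released

    def consume_units(self, released):
        self.available -= released  # take back only what we released
        self.held += released

t = UnitTracker()
r_get = t.return_all()   # get():  returns 1 unit
r_put = t.return_all()   # put():  returns 0 units
t.consume_units(r_get)   # get():  takes back 1 unit
t.consume_units(r_put)   # put():  takes back 0 units
print(t.held, t.available)  # 1 0 -- balanced, no unit leaked
```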
Fixes SCYLLADB-485
Backport: all supported affected versions, bug introduced with initial feature implementation in ed3e4f33fd
Closes scylladb/scylladb#28530
* github.com:scylladb/scylladb:
test: auth_cluster: add test for hanged AUTHENTICATING connections
transport: fix connection code to consume only initially taken semaphore units
This patchset replaces permissions cache based on loading_cache with a new unified (permissions and roles), full, coherent auth cache.
Reason for the change is that we want to improve scenarios under stress and simplify operation manuals. The new cache doesn't require any tweaking, and it behaves particularly better in scenarios with lots of schema entities (e.g. tables) combined with unprepared queries. The old cache can generate a few thousand extra internal tps due to cache refresh.
Benchmark of unprepared statements (just to populate the cache) with 1000 tables shows 3k tps of internal reads reduction and 9.1% reduction of median instructions per op. So many tables were used to show resource impact, cache could be filled with other resource types to show the same improvement.
Backport: no, it's a new feature.
Fixes https://github.com/scylladb/scylladb/issues/7397
Fixes https://github.com/scylladb/scylladb/issues/3693
Fixes https://github.com/scylladb/scylladb/issues/2589
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-147
Closes scylladb/scylladb#28078
* github.com:scylladb/scylladb:
test: boost: add auth cache tests
auth: add cache size metrics
docs: conf: update permissions cache documentation
auth: remove old permissions cache
auth: use unified cache for permissions
auth: ldap: add permissions reload to unified cache
auth: add permissions cache to auth/cache
auth: add service::revoke_all as main entry point
auth: explicitly life-extend resource in auth_migration_listener
The hostent::addr_list is deprecated in favor of address_entry::addr
field that contains the very same addresses.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28566
All users of it had been updated to get the streaming group elsewhere,
so this getter is no longer needed.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28527
sccache combines the functions of ccache and distcc, and
promises to support C++20 modules in the future. Switch
to sccache in anticipation of modules support.
The documentation is adjusted since the cache will be
persistent for sccache without further work.
Closes scylladb/scylladb#28524
There are some places that get `map<foo, bar>` and return it to the caller as `"key": string(foo), "value": string(bar)` json. For that there's `map_to_key_value()` helper in api.hh that re-formats the map into a vector of json elements and returns it, letting seastar json-ize that vector.
Recently in seastar there appeared a stream_range_as_array() helper that helps stream any range without converting it into an intermediate collection. Some of the hottest users of `map_to_key_value()` had already been converted; this PR converts a few remainders and removes the helper in question to encourage further usage of stream_range_as_array().
Code cleanup, not backporting
Closes scylladb/scylladb#28491
* github.com:scylladb/scylladb:
api: Remove map_to_key_value() helpers
api: Streamify view_build_statuses handler
api: Streamify few more storage_service/ handlers
api: Add map_to_json() helper
api: Coroutinize view_build_statuses handler
The "--primary-replica-only" ("-pro") flag was previously ignored by
the `restore` operation. This patch ensures the argument is parsed and
applied correctly.
Closes scylladb/scylladb#28490
Detached migration callbacks (on_create_view, on_update_view, on_drop_view)
can race with view_builder::drain() teardown.
Add a lifetime gate to view_builder and wire callback launches through
_ops_gate.hold() so each detached dispatch future is tracked until it
completes (finally keeps the hold alive). During shutdown, drain()
now waits for all tracked callback work with _ops_gate.close().
This ensures drain does not proceed past callback lifetime while shutdown is in
progress, and ignores only gate_closed_exception at callback entry as the
expected shutdown path.
Fixes parsing of comma-separated seed lists in "init.cc" and "cql_test_env.cc" to use the standard `split_comma_separated_list` utility, avoiding manual `npos` arithmetic. The previous code relied on `npos` being `uint32_t(-1)`: incrementing it in a `uint64_t` index does not wrap, so the loop exited as expected. With Seastar's upcoming change to make `npos` `size_t(-1)`, the increment would wrap around to zero and cause an infinite loop.
Switch to the standardized `split_comma_separated_list` tokenization that is also used in other places in the code. Empty tokens are handled as before. This prevents startup hangs and test failures when Seastar is updated.
The other commit also removes the unnecessary creation of temporary `gms::inet_address()` objects when calling `std::set<gms::inet_address>::emplace()`.
Refs: https://github.com/scylladb/seastar/pull/3236
No backport: The problem will only appear in master after Seastar is upgraded. The old code works with Seastar before https://github.com/scylladb/seastar/pull/3236 (although by accident, because of different integer bit sizes).
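The wrap can be demonstrated with a Python toy model (integers masked to 64 bits, not the real C++), where the loop advances with start = pos + 1 after a failed find():

```python
M64 = 2**64 - 1
npos32 = 2**32 - 1  # old Seastar sstring::npos (uint32_t(-1))
npos64 = 2**64 - 1  # new npos (size_t(-1))

print((npos32 + 1) & M64)  # 4294967296 -- past any string length, loop exits
print((npos64 + 1) & M64)  # 0 -- restarts the scan: infinite loop
```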
Closes scylladb/scylladb#28573
* github.com:scylladb/scylladb:
init: fix infinite loop on npos wrap with updated Seastar
init: remove unnecessary object creation in emplace calls
test_node_ops_tasks.py::test_get_children fails due to timeout of
tasks_vt_get_children injection in debug mode. Compared to a successful
run, no clear root cause stands out.
Extend the message timeout of tasks_vt_get_children from 10s to 60s.
Fixes: #28295.
Closes scylladb/scylladb#28599
What changed
Updated .github/workflows/call_sync_milestone_to_jira.yml to include SMI in jira_project_keys
Why (Requirements Summary)
Adding SMI to create releases in the SMI Jira project based on new milestones from scylladb.git.
This will create a new release in the SMI Jira project when a milestone is added to scylladb.git.
Fixes: PM-190
Closes scylladb/scylladb#28585
Right now the slowest test in the test/cqlpy directory is
cassandra_tests/validation/entities/collections_test.py::
testMapWithLargePartition
This test (translated from Cassandra's unit test), just wants to verify
that we can write and flush a partition with a single large map - with
200 items totalling around 2MB in size.
200 items totalling 2MB is large, but not huge, and is not the reason
why this test was so slow (around 9 seconds). It turns out that most
of the test time was spent in Python code, preparing a 2MB random string
the slowest possible way. But there is no need for this string to be
random at all - we only care about the large size of the value, not the
specific characters in it!
Making the characters written in this text constant instead of random
made it 20 times faster - it now takes less than half a second.
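As an illustration (not the actual test code), building a 2MB value from a constant character is far cheaper than assembling it from two million per-character random choices, while the size under test is identical:

```python
import random, string, timeit

N = 2_000_000

def random_value():
    # the slow way: one random.choice call per character
    return ''.join(random.choice(string.ascii_letters) for _ in range(N))

def constant_value():
    # the fast way: a single string-repeat allocation
    return 'x' * N

assert len(random_value()) == len(constant_value()) == N
print(timeit.timeit(constant_value, number=1)
      < timeit.timeit(random_value, number=1))  # True
```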
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#28271
File streaming only releases the file descriptors of a tablet being
streamed at the very end of streaming. This means that if the streaming
tablet has compaction on the largest tier finish after streaming
started, there will always be ~2x space amplification for that
single tablet. Since there can be up to 4 tablets being migrated
away, it can add up to a significant amount, since nodes are pushed
to a substantial usage of available space (~90%).
We want to optimize this by dropping the reference to an sstable after
it is fully streamed. This way, we reduce the chances of hitting
2x space amplification for a given tablet.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Closes scylladb/scylladb#28505
Fedora 45 tightened the default installation checks [1]. As a result
the cassandra-stress rpm we provide no longer installs.
Install it with --no-gpgchecks as a workaround. It's our own package
so we trust it. Later we'll sign it properly.
We install its dependencies via the normal methods so they're still
checked.
[1] https://fedoraproject.org/wiki/Changes/Enforcing_signature_checking_by_default
Closes scylladb/scylladb#28687
Today the S3 client has a well-established and well-tested (hopefully) http request retry strategy; in the rest of the clients it looks like we are trying to achieve the same by writing the same code over and over again, and of course missing corner cases that have already been addressed in the S3 client.
This PR aims to extract code that could assist other clients in detecting the retryability of an error originating from the http client, to reuse the built-in seastar http client retryability, and to minimize the boilerplate of http client exception handling.
No backport needed since it is only refactoring of the existing code
Closes scylladb/scylladb#28250
* github.com:scylladb/scylladb:
exceptions: add helper to build a chain of error handlers
http: extract error classification code
aws_error: extract `retryable` from aws_error
- Correct `calc_part_size` function since it could return more than 10k parts
- Add tests
- Add more checks in `calc_part_size` to comply with S3 limits
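A part-size calculation that respects these limits can be sketched as follows (a hypothetical sketch; the constants follow S3's documented limits, the function name mirrors the one in the message but the body is illustrative):

```python
MIN_PART_SIZE = 5 * 1024 * 1024  # S3 minimum part size (except the last part)
MAX_PARTS = 10_000               # S3 maximum number of parts

def calc_part_size(total_size: int) -> int:
    # ceil division guarantees part_size * MAX_PARTS >= total_size,
    # so the upload can never need more than MAX_PARTS parts
    return max(MIN_PART_SIZE, (total_size + MAX_PARTS - 1) // MAX_PARTS)

size = 60 * 1024**3  # a 60 GiB object
part = calc_part_size(size)
parts = (size + part - 1) // part
print(parts <= MAX_PARTS)  # True
```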
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-640
Must be ported back to 2025.3/4 and 2026.1 since we may encounter this bug in production clusters
Closes scylladb/scylladb#28592
* github.com:scylladb/scylladb:
s3_client: add more constraints to the calc_part_size
s3_client: add tests for calc_part_size
s3_client: correct multipart part-size logic to respect 10k limit
Currently, the test assumes that when
'topology_coordinator_pause_before_processing_backlog: waiting' is
logged, the task for decommission must be there. This was based on the
assumption that topology coordinator is idle and decommission request
wakes it up. But if the server is slow enough, it may still be running
the load balancer in reaction to table creation, and block on that
injection point before decommission request was added.
Fix by waiting for the task to appear rather than the injection.
Fixes SCYLLADB-715
Improves performance of deserialization of vector data for calculating similarity functions.
Instead of deserializing vector data into a std::vector<data_value>, we deserialize directly into a std::vector<float>
and then pass it to similarity functions as a std::span<const float>.
This avoids overhead of data_value allocations and conversions.
Example QPS of `SELECT id, similarity_cosine({vector<float, 1536>}, {vector<float, 1536>}) ...`:
client concurrency 1: before: ~135 QPS, after: ~1005 QPS
client concurrency 20: before: ~280 QPS, after: ~2097 QPS
Measured using https://github.com/zilliztech/VectorDBBench (modified to call above query without ANN search)
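A Python analog (not the actual C++ code) of the optimization: decode a serialized float vector in one bulk call instead of materializing a boxed object per element.

```python
import struct

raw = struct.pack('>4f', 1.0, 2.0, 3.0, 4.0)  # big-endian float32 vector

def per_element(raw):
    # one unpack (and one Python object) per element, analogous to
    # building a std::vector<data_value>
    return [struct.unpack_from('>f', raw, i * 4)[0] for i in range(len(raw) // 4)]

def bulk(raw):
    # single bulk decode, analogous to deserializing straight into
    # std::vector<float> and passing it on as std::span<const float>
    return list(struct.unpack(f'>{len(raw) // 4}f', raw))

print(per_element(raw) == bulk(raw))  # True
```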
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-471
Closes scylladb/scylladb#28615
Fixes #28678
If the replenish loop exits the sleep condition with an empty queue
when "_shutdown" is already set, a waiter might get stuck unsignalled,
waiting for segments even though we are exiting.
Simply move the queue abort so it is always done on loop exit.
Closes scylladb/scylladb#28679
The cache is already covered by general auth
dtests, but some cases are trickier and easier
to express directly as calls to the cache class.
For such tests a boost test file was added.
The LDAP server may change role-chain assignments without notifying
Scylla. As a result, effective permissions can change, so some form of
polling is required.
Currently, this is handled via cache expiration. However, the unified
cache is designed to be consistent and does not support expiration.
To provide an equivalent mechanism for LDAP, we will periodically
reload the permissions portion of the new cache at intervals matching
the previously configured expiration time.
We want to get rid of the loading cache because its periodic
refresh logic generates a lot of internal load when there
are many entries. Also, our operation procedures involve tweaking
the config, while the new unified cache is supposed to work out
of the box.
In the following commit we'll need to add some
cache related logic (removing resource permissions).
This logic doesn't depend on authorizer so it should
be managed by the service itself.
Added .github/workflows/close_issue_for_scylla_employee.yml workflow file to automatically close issues opened by ScyllaDB associates
We want to allow external users to open issues in the scylladb repo, but ScyllaDB associates should open issues in Jira instead. If a ScyllaDB associate opens an issue in the scylladb.git repo by mistake, the issue will be closed automatically with an appropriate comment explaining that the issue should be opened in Jira.
This is a new github action, and does not require any code backport.
Fixes: PM-64
Closes scylladb/scylladb#28212
What changed
Added new workflow file .github/workflows/call_jira_sync_pr_milestone.yml
Why (Requirements Summary)
Adds a GitHub Action that will be triggered when a milestone is set or removed from a PR
When milestone is added (milestoned event), calls main_jira_sync_pr_milestone_set.yml from github-automation.git, which will add the version to the 'Fix Versions' field in the relevant linked Jira issue
When milestone is removed (demilestoned event), calls main_jira_sync_pr_milestone_removed.yml from github-automation.git, which will remove the version from the 'Fix Versions' field in the relevant linked Jira issue
Testing was performed in staging.git and the STAG Jira project.
Fixes: PM-177
Closes scylladb/scylladb#28575
Fixes #28398
Fixes #28399
When used as path elements in google storage paths, the object names need to be URL encoded. Due to
a.) tests not really using prefixes including non-url valid chars (i.e. / etc)
and
b.) the mock server used for most testing not enforcing this particular aspect,
this was missed.
Modified unit tests to use prefixing for all names, so when running real GS, any errors like this will show.
"Real" GCS also behaves a bit differently when listing with a pager, compared to the mock;
the former will not give a pager token for the last page, only the penultimate one.
Adds handling for this.
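The encoding requirement can be sketched with Python's standard library (an illustration with a made-up object name, not the actual client code): object names used as path elements must be percent-encoded, including '/' characters that belong to the name itself.

```python
from urllib.parse import quote

name = "backups/2024/snapshot 1.sst"
# safe="" forces '/' to be encoded too: here it is part of the
# object name, not a URL path separator
encoded = quote(name, safe="")
print(encoded)  # backups%2F2024%2Fsnapshot%201.sst
```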
Needs backport to the releases that have (though might not really use) the feature, as it is technically possible to use google storage for backup and whatnot there, and it should work as expected.
Closes scylladb/scylladb#28400
* github.com:scylladb/scylladb:
utils/gcp/object_storage: URL-encode object names in URL:s
utils::gcp::object_storage: Fix list object pager end condition detection
The current manager flow has a flaw. It triggers pytest.fail when
it finds errors on teardown, regardless of whether the test already
failed. This creates an additional record in the JUnit report with the
same name, and Jenkins is not able to show the logs correctly. To avoid
this, this PR changes the logic slightly.
Now the manager checks whether the test failed, to avoid two failures
for the same test in the report.
If the test passed, the manager checks the cluster status and fails if
something is wrong with it. There is no need to check the
cluster status in case of a test failure.
If the test passed and the cluster status is OK, but there are unexpected
errors in the logs, the test fails as well. This check gathers
all information about the errors and potential stacktraces, and only
fails the test if it hasn't failed yet, to avoid a double entry in the report.
Closes scylladb/scylladb#28633
The test was marked with xfail in #28383, as it needed to be updated to
work with the Raft-based topology. We are doing that in this patch.
With the Raft-based topology, there is no reason to check that nodes with
different group0 IDs cannot merge their topology/token_metadata. That is
clearly impossible, as doing any topology change requires being in the
same group0. So, the original regression test doesn't make sense.
We can still test that nodes with different group0 IDs cannot gossip with
each other, so we keep the test. It's very fast anyway.
No backport, test update.
Closes scylladb/scylladb#28571
* github.com:scylladb/scylladb:
test: run test_different_group0_ids in all modes
test: make test_different_group0_ids work with the Raft-based topology
Currently, the load balancing simulator computes node, shard and tablet load based on tablet count.
This patch changes the load balancing simulator to be tablet size aware. It generates random tablet sizes with a normal distribution, and a mean value of `default_target_tablet_size`, and reports the computed load for nodes and tables based on tablet size sum, instead of tablet count.
This is the last PR in the Size Based Load Balancing series:
- First part for tablet size collection via load_stats: scylladb/scylladb#26035
- Second part reconcile load_stats: scylladb/scylladb#26152
- The third part for load_sketch changes: scylladb/scylladb#26153
- The fourth part which performs tablet load balancing based on tablet size: scylladb/scylladb#26254
- The fifth part changes the load balancing simulator: scylladb/scylladb#26438
This is a new feature and backport is not needed.
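A hypothetical sketch of the simulator change (names and target size are assumptions, not taken from the actual simulator): tablet sizes are drawn from a normal distribution around a target size, and load is the sum of sizes rather than a tablet count.

```python
import random

DEFAULT_TARGET_TABLET_SIZE = 5 * 1024**3  # assumed target size, in bytes

def generate_tablet_sizes(n, mean=DEFAULT_TARGET_TABLET_SIZE, stddev=None):
    stddev = stddev if stddev is not None else mean * 0.25
    # clamp at zero: a tablet cannot have a negative size
    return [max(0, int(random.gauss(mean, stddev))) for _ in range(n)]

def node_load(tablet_sizes):
    return sum(tablet_sizes)  # size-based load, not len(tablet_sizes)

sizes = generate_tablet_sizes(16)
print(len(sizes) == 16 and node_load(sizes) >= 0)  # True
```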
Closes scylladb/scylladb#26438
* github.com:scylladb/scylladb:
test, simulator: compute load based on tablet size instead of count
test, simulator: generate tablet sizes and update load_stats
test, simulator: postpone creation of load_stats_ptr
In b03d520aff ("cql3: introduce similarity functions syntax") we
added vector similarity functions to the grammar. The grammar had to
be modified because we wanted to support literals as vector similarity
function arguments, and the general function syntax in selectors
did not allow that.
In cc03f5c89d ("cql3: support literals and bind variables in
selectors") we extended the selector function call grammar to allow
literals as function arguments.
Here, we remove the special case for vector similarity functions as
the general case in function calls covers all the possibilities the
special case does.
As a side effect, the vector similarity function names are no longer
reserved.
Note: the grammar change fixes an inconsistency with how the vector
similarity functions were evaluated: typically, when a USE statement
is in effect, an unqualified function is first matched against functions
in the keyspace, and only if there is no match is the system keyspace
checked. But with the previous implementation vector similarity functions
ignored the USE keyspace and always matched only the system keyspace.
This small inconsistency doesn't matter in practice because user defined
functions are still experimental, and no one would name a UDF to conflict
with a system function, but it is still good to fix it.
Closes scylladb/scylladb#28481
Currently, if a test fails, pytest will output only some basic information
about the fail. With this change, it will output the last 300 lines of the
boost/seastar test output.
Also add capturing the output of the failed tests to JUnit report, so it
will be present in the report on Jenkins.
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-449
Closes scylladb/scylladb#28535
In ebda2fd4db ("test: cql_test_env: increase file descriptor limit"),
we raised the open file limit for cql_test_env. Here, we raise it for sstables::test_env
as well, to fix a couple of twcs resharding tests failing outside dbuild. These tests
open 256 sstables, and with 2 files/sstable + resharding work it is understandable
that they overflow the 1024 limit.
No backport: this is a quality of life improvement for developers running outside dbuild, but they can use dbuild for branches.
Closes scylladb/scylladb#28646
* github.com:scylladb/scylladb:
test: sstables::test_env: adjust file open limit
test: extract cql_test_env's adjust_rlimit() for reuse
* seastar f55dc7eb...d2953d2a (13):
> io_tester: Revive IO bandwidth configuration
> Merge 'io_tester: add vectorized I/O support' from Travis Downs
doc: add vectorized I/O options to io-tester.md
io_tester: add vectorized I/O support
> Merge 'Remove global scheduling group ID bitmap' from Pavel Emelyanov
reactor: Drop sched group IDs bitmap
reactor: Allocate scheduling group on shard-0 first
reactor: Detach init_scheduling_group_specific_data()
reactor: Coroutinize create_scheduling_group()
> set_iterator: increase compatibility with C++ ranges
> test: fix race condition in test_connection_statistics
> Add Claude Code project instructions
> reactor: Unfriend pollable_fd via pollable_fd_state::make()
> Merge 'rpc_tester: introduce rpc_streaming job based on streaming API' from Jakub Czyszczoń
apps: rpc_tester: Add STREAM_UNIDIRECTIONAL job. We introduce unidirectional streaming to the rpc_streaming job.
apps: rpc_tester: Add STREAM_BIDIRECTIONAL job This commit extends the rpc_tester with rpc_streaming job that uses rpc::sink<> and rpc::source<> to stream data between the client and the server.
> treewide: remove remnants of SEASTAR_MODULE
> test: Tune abort-accept test to use more readable async()
> build: support sccache as a compiler cache (#3205)
> posix-stack: Reuse parent class _reuseport from child
> Merge 'reactor_backend: Fix another busy spin bug in the epoll backend' from Stephan Dollberg
tests: Add unit test for epoll busy spin bug
reactor_backend: Fix another busy spin bug in epoll
Closes scylladb/scylladb#28513
Previous implementation of Scylla lifecycle brought flakiness to the test.
This change leaves lifecycle management up to PythonTest.run_ctx,
which implements more stability logic for setup/teardown.
Replace pexpect-driven GDB interaction with GDB batch mode:
- Avoids DeprecationWarning: "This process is multi-threaded, use of forkpty()
may lead to deadlocks in the child.", which ultimately caused CI deadlocks.
- Removes timeout-driven flakiness on slow systems - no interactive waits/timeouts.
- Produces cleaner, more direct assertions around command execution and output.
- Trade-off: batch mode adds ~10s per command per test,
but with --dist=worksteal this is ~10% overall runtime increase across the suite.
Closesscylladb/scylladb#28484
After PR https://github.com/scylladb/scylladb/pull/28396 reduced
the test volumes to 20MiB to speed up test_out_of_space_prevention.py,
keeping the original 0.8 critical disk utilization threshold can make
the tests flaky: transient disk usage (e.g. commitlog segment churn)
can push the node into ENOSPC during the run.
These tests do not write much data, so reduce the critical disk
utilization threshold to 0.5. With 20MiB volumes this leaves ~10MiB
of headroom for temporary growth during the test.
Fixes: https://github.com/scylladb/scylladb/issues/28463
Closes scylladb/scylladb#28593
test_maintenance_socket is flaky with the new way of running. It looks
like the driver tries to reconnect with an old maintenance socket from a
previous driver and fails. This PR adds a whitelist for connections,
which stabilizes the test.
test_no_removed_node_event_on_ip_change was flaky on CI, while the issue
never reproduced locally. The assumption is that under load there is a race
condition and we try to check the logs before the message has arrived. A small
retry loop is added to avoid this situation.
Closes scylladb/scylladb#28635
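Such a retry loop could look roughly like this (a sketch; `read_log` and the parameters are hypothetical stand-ins for the test's actual log accessor):

```python
import time

def wait_for_log_message(read_log, needle: str, attempts: int = 10,
                         delay: float = 1.0) -> bool:
    """Poll the log a few times instead of checking once, so a message
    that is still in flight under load gets a chance to arrive."""
    for _ in range(attempts):
        if needle in read_log():
            return True
        time.sleep(delay)
    return False
```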
The test can currently fail like this:
```
> await cql.run_async(f"ALTER TABLE {ks}.test WITH tablets = {{'min_tablet_count': 1}}")
E cassandra.cluster.NoHostAvailable: ('Unable to complete the operation against any hosts', {<Host: 127.158.27.9:9042 datacenter1>: <Error from server: code=0000 [Server error] message="Failed to apply group 0 change due to concurrent modification">})
```
The following happens:
- node A is restarted and becomes the group0 leader,
- the driver sends the ALTER TABLE request to node B,
- the request hits group 0 concurrent modification error 10 times and fails
because node A performs tablet migrations at the same time.
What is unexpected is that even though the driver session uses the default
retry policy, the driver doesn't retry the request on node A. The request
is guaranteed to succeed on node A because it's the only node adding group0
entries.
The driver doesn't retry the request on node A because of a missing
`wait_for_cql_and_get_hosts` call. We add it in this commit. We also reconnect
the driver just in case to prevent hitting scylladb/python-driver#295.
Moreover, we can revert the workaround from
4c9efc08d8, as the fix from this commit also
prevents DROP KEYSPACE failures.
The commit has been tested in byo with `_concurrent_ddl_retries{0}` to
verify that node A really can't hit group 0 concurrent modification error
and always receives the ALTER TABLE request from the driver. All 300 runs in
each build mode passed.
Fixes #25938
Closes scylladb/scylladb#28632
When `test_autoretrain_dict` was originally written, the default
`sstable_compression_user_table_options` was `LZ4Compressor`. The
test assumed (correctly) that initially the compression doesn't use
a trained dictionary, and later in the test scenario, it changed
the algorithm to one with a dictionary.
However, the default `sstable_compression_user_table_options` is now
`LZ4WithDictsCompressor`, so the old assumption is no longer correct.
As a result, the assertion that data is initially not compressed well
may or may not fail depending on dictionary training timing.
To fix this, this commit explicitly sets `ZstdCompressor`
as the initial `sstable_compression_user_table_options`, ensuring that
the assumption that initial compression is without a dictionary
is always met.
Note: `ZstdCompressor` differs from the former default `LZ4Compressor`.
However, it's a better choice — the test aims to show the benefit of
using a dictionary, not the benefit of Zstd over LZ4 (and the test uses
ZstdWithDictsCompressor as the algorithm with the dictionary).
Fixes: https://github.com/scylladb/scylladb/issues/28204
Backport: 2025.4, as the test already failed there (and also backport to 2026.1 to keep everything consistent).
Closes scylladb/scylladb#28625
* github.com:scylladb/scylladb:
test: explicitly set compression algorithm in test_autoretrain_dict
test: remove unneeded semicolons from python test
Harden get_scylla_2025_1_executable() by improving error reporting when subprocesses fail,
increasing curl's retry count for more resilient downloads, and enabling --retry-all-errors to retry on all failures.
Fixes https://github.com/scylladb/scylladb/issues/27745
Backport: no, it's not a bug fix
Closes scylladb/scylladb#28628
* github.com:scylladb/scylladb:
test: pylib: retry on all errors in get_scylla_2025_1_executable curl's call
test: pylib: increase curl's number of retries when downloading scylla
test: pylib: improve error reporting in get_scylla_2025_1_executable
Previously, global_tablet_token_metadata_barrier() could proceed with
fencing even if some nodes did not acknowledge the barrier_and_drain.
This could cause problems:
* In scylladb/scylladb#26864, replica locks did not provide mutual
exclusion, because “fenced out” requests from old topology versions
could run in parallel with requests using newer versions.
* In scylladb/scylladb#26375, the barrier could succeed even though we
did not wait for closed sessions to become unused. This could leave
aborted repair or streaming tasks running concurrently after a tablet
transition was aborted, and thus running concurrently with the next
transition.
In this commit we add a parameter drain_all_nodes: bool to
the global_token_metadata_barrier function. If this parameter is set,
the barrier waits for all nodes to acknowledge the barrier_and_drain
round of RPCs. If any of the nodes are not accessible or throw an error,
such errors are rethrown to the caller. We set this parameter only in
global_tablet_token_metadata_barrier since for topology migrations
the old behavior should be preserved. In case of errors, the tablet
migration is blocked until the problem goes away by itself or the
problematic node is added to the ignore_nodes list.
The test_fenced_out_on_tablet_migration_while_handling_paxos_verb is
removed: with tablets, we now drain all nodes, so after a successful
barrier_and_drain round there can be no coordinators with an old
topology version. The fence_token check after executing a request on
a replica is therefore unnecessary for tablets, but still required for
vnodes, where topology changes do not wait for all nodes.
Topology fencing is covered by test_fence_lwt_during_bootstrap.
Fixes scylladb/scylladb#26864
Fixes scylladb/scylladb#26375
Add explicit erm-holding variables in all replica-side RPC handlers.
This is required to ensure that tablet migration waits for in-flight
replica requests even if a non-replica coordinator has been fenced out.
Holding erms on the replica side may increase the global-barrier wait
time, since the barrier must drain these requests. We believe this
is acceptable because:
* We already hold erms during replica-side request execution, but in
an ad-hoc, non-systemic way in lower layers of storage_proxy
(e.g. in sp::mutate_locally and do_query_tablets).
* Replica requests are bounded by replica-side timeouts, so the
global-barrier wait time cannot exceed the maximum of these timeouts.
For Paxos verbs, we use token_metadata_guard, which wraps the ERM and
automatically refreshes it when tablet migration does not affect the
current token; see the token_metadata_guard comments for details.
We use this guard only for Paxos verbs because regular reads and writes
already hold raw erms in storage_proxy and on the coordinators.
The erms must be held in all RPC handlers that support fencing — that
is, those with a fencing_token parameter in storage_proxy.idl.
Counter updates already hold erms in
mutate_counter_on_leader_and_replicate.
Fix test_tablets2::test_timed_out_reader_after_cleanup: the tablets
barrier now waits for all nodes. As a result, the replica read
is expected to finish, rather than fail due to the tablet having
moved as it did previously. The test is renamed to
test_tablets_barrier_waits_for_replica_erms to better reflect its
purpose.
Refs scylladb/scylladb#26864
Before waiting on stale_versions_in_use(), we log the stale versions
the barrier_and_drain handler will wait for, along with the number of
token_metadata references representing each version.
To achieve this, we store a pointer to token_metadata in
version_tracker, traverse the _trackers list, and output all items
with a version smaller than the latest. Since token_metadata
contains the version_tracker instance, it is guaranteed to remain
alive during traversal. To count references, token_metadata now
inherits from enable_lw_shared_from_this.
This helps diagnose tablet migration stalls and allows more
deterministic tests: when a barrier is expected to block, we can
verify that the log contains the expected stale versions rather
than checking that the barrier_and_drain is blocked on
stale_versions_in_use() for a fixed amount of time.
One of the tests checks that the number of partition keys is more than 2, but
the method that creates the table can return one with fewer keys. This leads
to flakiness; to avoid it, this PR ensures that the table has at
least 3 partition keys.
Closes scylladb/scylladb#28636
on_update_view() currently runs its serialized logic inline via with_semaphore()
from a detached callback path, while create/drop already use dedicated async
dispatchers.
Refactor update handling to follow the same pattern:
- add dispatch_update_view(sstring ks_name, sstring view_name)
- move update logic into that coroutine
- acquire the existing view-builder lock via get_or_adopt_view_builder_lock()
- keep existing behavior for missing base/view state
- keep background invocation semantics from on_update_view()
This aligns the update/create/drop flow, keeps async lifecycle handling, and is a first step toward fixing the shutdown issue.
The twcs compaction tests open more than 1024 files (not
so good), and will fail in a user session with the default
soft limit (1024).
Attempt to raise the limit so the tests pass. On a modern
systemd installation the hard limit is >500,000, so this
will work.
There's no problem in dbuild since it raises the file limit
globally.
Most likely, the root cause of the flaky test was that the TLS handshake hung for an extended period (60s). This caused
the test case to fail because the ANN request duration exceeded the test case timeout.
The PR introduces two changes:
* Mitigation of the hanging TLS handshake: This issue likely occurred because the test performed certificate rewrites
simultaneously with ANN requests that utilize those certificates.
* Production code fix: This addresses a bug where the TLS handshake itself was not covered by the connection timeout.
Since tls::connect does not perform the handshake immediately, the handshake only occurs during the first write
operation, potentially bypassing the connect timeout.
Fixes: #28012
Backport to 2026.01 and 2025.04 is needed, as these branches are also affected and may experience CI flakiness due to this test.
Closes scylladb/scylladb#28617
* github.com:scylladb/scylladb:
vector_search: Fix missing timeout on TLS handshake
vector_search: test: Fix flaky cert rewrite test
CI currently fails in release and debug modes if the PR only changes
a test run only in dev mode. There is no reason to wait for the CI fix,
as there is no reason to run this test only in dev mode in the first
place. The test is very fast.
The test was marked with xfail in #28383, as it needed to be updated to
work with the Raft-based topology. We are doing that in this patch.
With the Raft-based topology, there is no reason to check that nodes with
different group0 IDs cannot merge their topology/token_metadata. That is
clearly impossible, as doing any topology change requires being in the
same group0. So, the original regression test doesn't make sense.
We can still test that nodes with different group0 IDs cannot gossip with
each other, so we keep the test. It's very fast anyway.
It's difficult to say whether our download backend always returns a
transient error correctly so that curl can retry. Instead, it's
more robust to always retry on error.
By default curl does exponential backoff, and we want to keep that,
but there is a time cap of 10 minutes, so with 40 retries we'd wait a
long time; instead we set the cap to 60 seconds.
Total waiting time (excluding receiving request time):
before - 17m
after - 35m
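The arithmetic behind those numbers can be checked with a small sketch, assuming curl-style backoff that starts at 1s and doubles per retry, with each individual wait capped:

```python
def total_backoff_seconds(retries: int, cap_s: int) -> int:
    """Worst-case total wait for exponential backoff starting at 1s,
    doubling each retry, with each individual wait capped at cap_s."""
    return sum(min(2 ** i, cap_s) for i in range(retries))

# With a 60s cap and 40 retries: 1+2+4+8+16+32 = 63s for the first six
# waits, then 34 waits of 60s each -- about 35 minutes in total.
```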
When `test_autoretrain_dict` was originally written, the default
`sstable_compression_user_table_options` was `LZ4Compressor`. The
test assumed (correctly) that initially the compression doesn't use
a trained dictionary, and later in the test scenario, it changed
the algorithm to one with a dictionary.
However, the default `sstable_compression_user_table_options` is now
`LZ4WithDictsCompressor`, so the old assumption is no longer correct.
As a result, the assertion that data is initially not compressed well
may or may not fail depending on dictionary training timing.
To fix this, this commit explicitly sets `ZstdCompressor`
as the initial `sstable_compression_user_table_options`, ensuring that
the assumption that initial compression is without a dictionary
is always met.
Note: `ZstdCompressor` differs from the former default `LZ4Compressor`.
However, it's a better choice — the test aims to show the benefit of
using a dictionary, not the benefit of Zstd over LZ4 (and the test uses
ZstdWithDictsCompressor as the algorithm with the dictionary).
Fixes: scylladb/scylladb#28204
Some assertions in the Raft-based topology are likely to cause crashes of
multiple nodes due to the consistent nature of the Raft-based code. If the
failing assertion is executed in the code run by each follower (e.g., the code
reloading the in-memory topology state machine), then all nodes can crash. If
the failing assertion is executed only by the leader (e.g., the topology
coordinator fiber), then multiple consecutive group0 leaders will chain-crash
until there is no group0 majority.
Crashing multiple nodes is much more severe than necessary. It's enough to
prevent the topology state machine from making more progress. This will
naturally happen after throwing a runtime error. The problematic fiber will be
killed or will keep failing in a loop. Note that it should be safe to block
the topology state machine, but not the whole group0, as the topology state
machine is mostly isolated from the rest of group0.
We replace some occurrences of `on_fatal_internal_error` and `SCYLLA_ASSERT`
with `on_internal_error`. These are not all occurrences, as some fatal
assertions make sense, for example, in the bootstrap procedure.
This patch changes the load balancing simulator so that it computes
table load based on tablet sizes instead of tablet count.
best_shard_overcommit measured the minimal allowed overcommit in cases
where the number of tablets cannot be evenly distributed across
all the available shards. This is still the case, but instead of
computing it as an integer div_ceil() of the average shard load,
it is now computed by allocating the tablet sizes using the
largest-tablet-first method. From these, we can get the lowest
overcommit for the given set of nodes, shards and tablet sizes.
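A toy version of the largest-tablet-first allocation (a sketch of the idea, not the simulator's actual code):

```python
def lowest_overcommit(tablet_sizes: list, n_shards: int) -> float:
    """Place tablets largest-first onto the currently least-loaded shard,
    then report the busiest shard's load relative to the ideal average."""
    shards = [0] * n_shards
    for size in sorted(tablet_sizes, reverse=True):
        shards[shards.index(min(shards))] += size
    average = sum(tablet_sizes) / n_shards
    return max(shards) / average if average else 1.0
```

With equal tablet sizes this degenerates to the old count-based div_ceil() behaviour.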
This change adds a random tablet size generator. The tablet sizes are
created in load_stats.
Further changes to the load balance simulator:
- apply_plan() updates the load_stats after a migration plan is issued by the
load balancer,
- adds the option to set a command line option which controls the tablet size
deviation factor.
With size based load balancing, we will have to move the tablet size in
load_stats after each internode migration issued by balance_tablets().
This will be done in a subsequent commit in apply_plan() which is
called from rebalance_tablets().
Currently, rebalance_tablets() is passed a load_stats_ptr which is
defined as:
using load_stats_ptr = lw_shared_ptr<const load_stats>;
Because this is a pointer to const, apply_plan() can't modify it.
So, we pass a reference to load_stats to rebalance_tablets() and create
a load_stats_ptr from it for each call to balance_tablets().
test_remove_node_violating_rf_rack_with_rack_list creates a cluster
with four nodes. One of the nodes is excluded, then another one is
stopped, excluded, and removed. If the two stopped nodes were both
voters, the majority is lost and the cluster loses its raft leader.
As a result, the node cannot be removed and the operation times out.
Add the 5th node to the cluster. This way the majority is always up.
Fixes: https://github.com/scylladb/scylladb/issues/28596.
Closes scylladb/scylladb#28610
The test creates a single node cluster, then creates 3 tables which
remain empty. Then it adds another node with half the disk capacity of
the first one, and then it waits for the balancer to migrate tablets to
the newly added node by calling the quiesce topology API. The number of
tablets on the smaller node should be exactly half the number of tablets
on the larger node.
After waiting for quiesce topology, we could have a situation where we
query the number of tablets from the node which still hasn't processed
the last tablet migrations and updated system.tablets.
This patch adds a read barrier so that both nodes see the same tablets
metadata before we query the number of tablets.
Fixes: SCYLLADB-603
Closes scylladb/scylladb#28598
Currently the TLS handshake in the vector search client does not have a timeout.
This is because tls::connect does not perform handshake itself; the handshake
is deferred until the first read/write operation is performed. This can lead to long
hangs on ANN requests.
This commit calls tls::check_session_is_resumed() after tls::connect
to force the handshake to happen immediately and to run under with_timeout.
The test is flaky most likely because when TLS certificate rewrite
happens simultaneously with an ANN request, the handshake can hang for a
long time (~60s). This leads to a timeout in the test case.
This change introduces a checkpoint in the test so that it will
wait for the certificate rewrite to happen before sending an ANN request,
which should prevent the handshake from hanging and make the test more reliable.
Fixes: #28012
The test `test_sync_point` had a few shortcomings that made it flaky
or simply wrong:
1. We were verifying that hints were written by checking the size of
in-flight hints. However, that could potentially lead to problems
in rare situations.
For instance, if all of the hints failed to be written to disk, the
size of in-flight hints would drop to zero, but creating a sync point
would correspond to the empty state.
In such a situation, we should fail immediately and indicate what
the cause was.
2. A sync point corresponds to the hints that have already been written
to disk. The number of those is tracked by the metric `written`.
It's a much more reliable way to make sure that hints have been
written to the commitlog. That ensures that the sync point we'll
create will really correspond to those hints.
3. The auxiliary function `wait_for` used in the test works like this:
it executes the passed callback and looks at the result. If it's
`None`, it retries it. Otherwise, the callback is deemed to have
finished its execution and no further retries will be attempted.
Before this commit, we simply returned a bool, and so the code was
wrong. We improve it.
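The retry-on-None contract described above could be sketched like this (hypothetical code, not the actual test helper):

```python
import asyncio

async def wait_for(check, timeout: float, period: float = 0.1):
    """Retry `check` until it returns a non-None value; a None result
    means 'not ready yet', anything else ends the wait."""
    deadline = asyncio.get_running_loop().time() + timeout
    while True:
        result = await check()
        if result is not None:
            return result
        if asyncio.get_running_loop().time() + period > deadline:
            raise TimeoutError("wait_for: condition not met in time")
        await asyncio.sleep(period)
```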
---
Note that this fixes scylladb/scylladb#28203, which was a manifestation
of scylladb/scylladb#25879. We created a sync point that corresponded
to the empty state, and so it immediately resolved, even when node 3
was still dead.
As a bonus, we rewrite the auxiliary code responsible for fetching
metrics and manipulating sync points. Now it's asynchronous and
uses the existing standard mechanisms available to developers.
Furthermore, we reduce the time needed for executing
`test_sync_point` by 27 seconds.
---
The total difference in time needed to execute the whole test file
(on my local machine, in dev mode):
Before:
CPU utilization: 0.9%
real 2m7.811s
user 0m25.446s
sys 0m16.733s
After:
CPU utilization: 1.1%
real 1m40.288s
user 0m25.218s
sys 0m16.566s
---
Refs scylladb/scylladb#25879
Fixes scylladb/scylladb#28203
Backport: This improves the stability of our CI, so let's
backport it to all supported versions.
Closes scylladb/scylladb#28602
* github.com:scylladb/scylladb:
test: cluster: Reduce wait time in test_sync_point
test: cluster: Fix test_sync_point
test: cluster: Await sync points asynchronously
test: cluster: Create sync points asynchronously
test: cluster: Fetch hint metrics asynchronously
The ANN vector queries with all-zero vectors are allowed even on vector
indexes with similarity function set to cosine.
When enabling the rescoring option, those queries would fail as the rescoring
calls `similarity_cosine` function underneath, causing an `InvalidRequest` exception
as all-zero vectors were not allowed, matching Cassandra's behaviour.
To eliminate the discrepancy, we want the all-zero vector `similarity_cosine` calls to pass,
but return NaN, as cosine similarity is mathematically undefined for zero vectors.
We decided not to use arbitrary values contrary to USearch, for which the distance
(not to be confused with similarity) is defined as cos(0, 0) = 0, cos(0, x) = 1 while
supporting the range of values [0, 2].
If we wanted to convert that to similarity, that would mean sim_cos(0, x) = 0.5,
which does not support mathematical reasoning why that would be more similar than
for example vectors marking obtuse angles.
It's safe to assume that all-zero vectors for cosine similarity shouldn't make any impact,
therefore we return NaN and eliminate them from best results.
Adjusted the tests accordingly to check both proper Cassandra and Scylla's behaviour.
Fixes: SCYLLADB-456
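The chosen behaviour, sketched in Python (the C++ implementation differs, but the contract is the same: NaN for any zero vector):

```python
import math

def similarity_cosine(a, b):
    """Cosine similarity of two vectors; NaN when either vector is
    all-zero, so such vectors are eliminated from best ANN results."""
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    if norm_a == 0.0 or norm_b == 0.0:
        return math.nan
    return sum(x * y for x, y in zip(a, b)) / (norm_a * norm_b)
```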
Commit ea8a661119 tried to reduce the dataset for restoration tests.
While doing so, it effectively disabled part of itself -- the checks for
streaming directions never ran after this change. The thing is that
this check only runs if the restored tablet count matches a hardcoded value
of 512. This was the real dataset size of the test before the
aforementioned commit, but afterwards it changed to other values, and
the comparison with 512 became always False.
Fix it with a local variable to prevent such mistakes in the future.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This method does two things -- checks that the data is indeed back, and
validates streaming directions. The latter is not quite about "data is
back", so better to have it as explicit dedicated method.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
If everything is OK, the sync point will not resolve with node 3 dead.
As a result, the waiting will use all of the time we allocate for it,
i.e. 30 seconds. That's a lot of time.
There's no easy way to verify that the sync point will NOT resolve, but
let's at least reduce the waiting to 3 seconds. If there's a bug, it
should be enough to trigger it at some point, while reducing the average
time needed for CI.
The test had a few shortcomings that made it flaky or simply wrong:
1. We were verifying that hints were written by checking the size of
in-flight hints. However, that could potentially lead to problems
in rare situations.
For instance, if all of the hints failed to be written to disk, the
size of in-flight hints would drop to zero, but creating a sync point
would correspond to the empty state.
In such a situation, we should fail immediately and indicate what
the cause was.
2. A sync point corresponds to the hints that have already been written
to disk. The number of those is tracked by the metric `written`.
It's a much more reliable way to make sure that hints have been
written to the commitlog. That ensures that the sync point we'll
create will really correspond to those hints.
3. The auxiliary function `wait_for` used in the test works like this:
it executes the passed callback and looks at the result. If it's
`None`, it retries it. Otherwise, the callback is deemed to have
finished its execution and no further retries will be attempted.
Before this commit, we simply returned a bool, and so the code was
wrong. We improve it.
Note that this fixes scylladb/scylladb#28203, which was a manifestation
of scylladb/scylladb#25879. We created a sync point that corresponded
to the empty state, and so it immediately resolved, even when node 3
was still dead.
Refs scylladb/scylladb#25879
Fixes scylladb/scylladb#28203
To create a keyspace, there's the new_test_keyspace helper.
The table is created with a single cql.run_async with an explicit schema.
The dataset is populated with a single parallel INSERT as well.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This is preparational patch. Next will need to replace
foo()
bar()
with
with something() as s:
foo()
bar()
Effectively -- only add the `with something()` line. To avoid shifting the
whole file right together with that future change, do the indentation here.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The previous calculation could produce more than 10,000 parts for large
uploads because we mixed values in bytes and MiB when determining the
part size. This could result in selecting a part size that still
exceeded the AWS multipart upload limit. The updated logic now ensures
the number of parts never exceeds the allowed maximum.
This change also aligns the implementation with the code comment: we
prefer a 50 MiB part size because it provides the best performance, and
we use it whenever it fits within the 10,000-part limit. If it does not,
we increase the part size (in bytes, aligned to MiB) to stay within the
limit.
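The corrected logic can be sketched as follows (constants and names are illustrative; the point is that all arithmetic stays in bytes, with the part size aligned up to a whole MiB):

```python
MIB = 1024 * 1024
MAX_PARTS = 10_000              # AWS multipart upload part-count limit
PREFERRED_PART_SIZE = 50 * MIB  # best-performing part size

def choose_part_size(total_bytes: int) -> int:
    """Return a part size in bytes such that the upload needs at most
    MAX_PARTS parts, preferring 50 MiB whenever it fits the limit."""
    if total_bytes <= PREFERRED_PART_SIZE * MAX_PARTS:
        return PREFERRED_PART_SIZE
    # Smallest part size that keeps us within MAX_PARTS, rounded up to MiB.
    min_part = (total_bytes + MAX_PARTS - 1) // MAX_PARTS
    return (min_part + MIB - 1) // MIB * MIB
```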
Generalize error handling by creating an exception dispatcher, which allows writing error handlers by sequentially applying handlers, the same way one would write `catch ()` blocks.
Similarly to previous patch, the handler can stream the map of build
statuses. Unlike previous patch, it doesn't need to fmt::format() key
and value, as these are strings already.
It could be a map_to_json<string, string> partial specialization, but
there's so far only one caller, so probably not worth it yet.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Like the get_token_endpoint handler, which streams the map it got from the storage
service, get_ownership and get_effective_ownership can do the same.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The get_token_endpoint handler converts iterator of std::map into
generated maplist_mapper type. Next patch will do the same for more
handlers, so it's good to have a helper converter for it.
As a nice side effect, it's possible to avoid multiline lambda argument
to stream_range_as_array().
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This reverts commit bcd1758911, reversing
changes made to b2c2a99741.
There is a design decision not to introduce an additional test
orchestration tool for scylladb.git (see comments for #27499). One
commit has already been reverted in 55c7bc7. The last CI runs made the validator
test flaky, so it is time to remove all remaining validator tests.
It needs a backport to 2026.1 to remove remaining validator tests from there.
Fixes: VECTOR-497
Closes scylladb/scylladb#28568
When running a gdb command, we check that the string 'Error'
does not appear within the output. However, if the command output
includes the string 'Error' as part of its normal operation, this
generates a false positive. In fact the task_histogram can include
the string 'error::Error' from the Rust core::error module.
Allow for that and only match 'Error' that isn't 'error::Error'.
Fixes #28516.
Closes scylladb/scylladb#28574
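One way to express "'Error' that isn't 'error::Error'" is a negative lookbehind (a sketch of the check, not necessarily the exact pattern used):

```python
import re

# Match 'Error' unless it is immediately preceded by 'error::', which
# covers the Rust core::error::Error symbols in task_histogram output.
REAL_ERROR = re.compile(r"(?<!error::)Error")

def has_real_error(output: str) -> bool:
    return REAL_ERROR.search(output) is not None
```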
The difference is very tiny:
@@ -1,12 +1,12 @@
@pytest.mark.asyncio
async def test_restore_primary_replica_same_...(manager: ManagerClient, object_storage):
''' comment '''
- topology = topo(rf = 4, nodes = 8, racks = 2, dcs = 1)
- scope = "rack"
+ topology = topo(rf = 4, nodes = 8, racks = 2, dcs = 2)
+ scope = "dc"
ks = 'ks'
cf = 'cf'
@@ -42,7 +42,7 @@ async def test_restore_primary_replica_s
for r in res:
nodes_by_operation[r[1].group(1)].append(r[1].group(2))
- scope_nodes = set([ str(host_ids[s.server_id]) for s in servers if s.rack == servers[i].rack ])
+ scope_nodes = set([ str(host_ids[s.server_id]) for s in servers if s.datacenter == servers[i].datacenter ])
for op, nodes in nodes_by_operation.items():
logger.info(f'Operation {op} streamed to nodes {nodes}')
assert len(nodes) == 1, "Each streaming operation should stream to exactly one primary replica"
The (removed in the above example) test description comments differ only
in their usage of "rack" and "dc" words.
Squashing them into one parametrized test makes perfect sense.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
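After squashing, the test can key its replica grouping off a single scope parameter; a sketch (names are illustrative, not the actual test code):

```python
from collections import namedtuple

Server = namedtuple("Server", "server_id rack datacenter")

# With @pytest.mark.parametrize("scope", ["rack", "dc"]) the only
# per-variant difference left is which attribute groups the replicas.
def scope_nodes(servers, i, scope):
    key = (lambda s: s.rack) if scope == "rack" else (lambda s: s.datacenter)
    return {s.server_id for s in servers if key(s) == key(servers[i])}
```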
Tablet migration keeps an sstable snapshot during streaming, which may
cause a temporary increase in disk utilization if compaction is running
concurrently. SSTables compacted away are kept on disk until streaming
is done with them. The more tablets we allow to migrate concurrently,
the higher disk usage can rise. When the target tablet size is
configured correctly, every tablet should own about 1% of disk
space, so a concurrency of 4 shouldn't put us at risk. But the target tablet
size is not chosen dynamically yet, and it may not be aligned with
disk capacity.
Also, tablet sizes can temporarily grow above the target, up to 2x
before the split starts, and some more because splits take a while to
complete.
To reduce the impact of this, reduce the concurrency of
migration. A concurrency of 2 should still be enough to saturate
resources on the leaving shard.
Also, reducing concurrency means that load balancing is more
responsive to preemption. There will be less bandwidth sharing, so
scheduled migrations complete faster. This is important for scale-out,
where we bootstrap a node and want to start migrations to that new
node as soon as possible.
Refs scylladb/siren#15317
Different transitions have different weights, and limits are
configurable. We don't want a situation where a high-cost migration
is cut off by limits and the system can make no progress.
For example, repair uses weight 2 for read concurrency. Migrating
co-located tablets scales the cost by the number of co-located
tablets.
The test `test_size_based_load_balancing.py::test_balance_empty_tablets`
waits for tablet load stats to be refreshed and uses the
`short_tablet_stats_refresh_interval` injection to speed up the refresh
interval.
This injection has no effect; it was replaced by the
`tablet_load_stats_refresh_interval_in_seconds` config option (patch: 1d6808aec4),
so the test currently waits for 60 seconds (default refresh interval).
Use the config option. This reduces the execution time to ~8 seconds.
Fixes SCYLLADB-556.
Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
Closes scylladb/scylladb#28536
This field is only used to initialize the following _memtable_controller
one. It's simpler just to do the initialization with whatever value the
field itself is initialized and drop the field itself.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28539
This patch fixes the few remaining cases of XPASS in test/cqlpy and test/alternator.
These are tests which, when written, reproduced a bug and therefore were marked "xfail", but some time later the bug was fixed and we either did not notice it was ever fixed, or just forgot to remove the xfail marker.
Removing the no-longer-needed xfail markers is good for test hygiene, but more importantly is needed to avoid regressions in those already-fixed areas (if a test is already marked xfail, it can start to fail in a new way and we wouldn't notice).
Backport not needed, xpass doesn't bother anyone.
Closes scylladb/scylladb#28441
* github.com:scylladb/scylladb:
test/cqlpy: remove xfail from tests for fixed issue 7972
test/cqlpy: remove xfail from tests for fixed issue 10358
test/cqlpy: remove xfail from passing test testInvalidNonFrozenUDTRelation
test/alternator: remove xfail from passing test_update_item_increases_metrics_for_new_item_size_only
The goal of this small pull request is to reproduce issue #28439, which found a bug in the Alternator Streams output when BatchWriteItem is called to write multiple items in the same partition, and always_use_lwt write isolation mode is used.
* The first patch reproduces this specific bug in Alternator Streams.
* The second patch adds missing (Fixes #28171) tests for BatchWriteItem in different write modes, and shows that BatchWriteItem itself works correctly - the bug is just in Alternator Streams' reporting of this write.
Closes scylladb/scylladb#28528
* github.com:scylladb/scylladb:
test/alternator: add test for BatchWriteItem with different write isolations
test/alternator: reproducer for Alternator Streams bug
It turns out that the cdc driver requires permissions to two additional system tables. This patch adds them to VECTOR_SEARCH_INDEXING and modifies the unit tests. The integration with vector store was tested manually, integration tests will be added in vector-store repository in a follow up PR.
Fixes: SCYLLADB-522
Closes scylladb/scylladb#28519
Alternator's various write operations have different code paths for the
different write isolation modes. Because most of the test suite runs in
only a single write mode (currently - only_rmw_uses_lwt), we already
introduced a test file test/alternator/test_write_isolation.py for
checking the different write operations in *all* four write isolation
modes.
But we missed testing one write operation - BatchWriteItem. This
operation isn't very "interesting" because it doesn't support *any*
read-modify-write option (it doesn't support UpdateExpression,
ConditionExpression or ReturnValues), but even without those, the
pure write code still has different code paths with and without LWT,
and should be tested. So we add the missing test here - and it passes.
In issue #28439 we discovered a bug that can be seen in Alternator
Streams in the case of BatchWriteItem with multiple writes to the
same partition and always_use_lwt mode. The fact that the test added
here passes shows that the bug is NOT in BatchWriteItem itself, which
works correctly in this case - but only in the Alternator Streams layer.
Fixes #28171
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This patch adds a reproducer for an Alternator Streams bug described in
issue #28439, where the stream returns the wrong events (and fewer of
them) in the following specific combination of circumstances:
1. A BatchWriteItem operation writing multiple items to the *same*
partition.
2. The "always_use_lwt" write isolation mode is used. (the bug doesn't
occur in other write isolation modes).
We didn't catch this bug earlier because the Alternator Streams test
we had for BatchWriteItem had multiple items in multiple partitions,
and we missed the multiple-items-in-one-partition case. Moreover,
today we run all the tests in only_rmw_uses_lwt mode (in the past,
we did use always_use_lwt, but changed recently in commit e7257b1393
following commit 76a766c that changed test.py).
As issue #28439 explains, the underlying cause of the bug is that the
always_use_lwt causes the multiple items to be written with the same
timestamp, which confused the Alternator Streams code reading the CDC
log. The bug is not in BatchWriteItem itself, or in ScyllaDB CDC, but
just in the Alternator Streams layer.
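The failure mode can be illustrated with a minimal pure-Python sketch (hypothetical simplification; names and structure are illustrative only, not Scylla's actual stream-reading code): a reader that keys events by (partition, timestamp), assuming each write got a unique timestamp, silently collapses two same-timestamp writes into one event.

```python
# Hypothetical sketch of why same-timestamp writes can confuse a
# timestamp-keyed stream reader. Not Scylla's actual implementation.
def read_events(cdc_rows):
    events = {}
    for row in cdc_rows:
        # Rows from distinct writes that share (pk, ts) overwrite each
        # other and collapse into a single event.
        events[(row["pk"], row["ts"])] = row
    return list(events.values())

# only_rmw_uses_lwt: each write gets its own timestamp -> two events.
distinct_ts = [{"pk": "p", "ts": 1, "item": "a"},
               {"pk": "p", "ts": 2, "item": "b"}]
# always_use_lwt: one LWT round gives both writes the same timestamp,
# so the reader reports only one event, mirroring the bug in #28439.
shared_ts = [{"pk": "p", "ts": 1, "item": "a"},
             {"pk": "p", "ts": 1, "item": "b"}]
```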
The test in this patch is parameterized to run on each of the four
write isolation modes, and currently fails (and so is marked xfail) just
for the one mode 'always_use_lwt'. The test is scylla_only, as its
purpose is to check the different write isolation modes - which don't
exist in AWS DynamoDB.
Refs #28439
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Improve event printing when a test in test_streams.py fails.
The new code prints both expected and received events (keys, previous
image, new image and type).
The new code explicitly marks at which output event the comparison failed.
Fixes #28455
Closes scylladb/scylladb#28476
The usual Scylla shutdown in a cluster test takes ~2.1s. 2s come from
```
co_await sleep(std::chrono::milliseconds(_gcfg.shutdown_announce_ms));
```
as the default value of `shutdown_announce_in_ms` is 2000. This sleep
makes every `server_stop_gracefully` call 2s slower. There are ~300 such
calls in cluster tests (note that some come from `rolling_restart`). So,
it looks like this sleep makes cluster tests 300 * 2s = 10min slower.
Indeed, `./test.py --mode=dev cluster` takes 61min instead of 71min
on the potwor machine (the one in the Warsaw office) without it.
We set `shutdown_announce_in_ms` to 0 for all cluster tests to make them
faster.
The sleep is completely unnecessary in tests. Removing it could introduce
flakiness, but if that's the case, then the test for which it happens is
incorrect in the first place. Tests shouldn't assume that all nodes
receive and handle the shutdown message in 2s. They should use functions
like `server_not_sees_other_server` instead, which are faster and more
reliable.
Improvement of the tests running time, so no backport. The fix of
`test_tablets_parallel_decommission` may have to be backported to
2026.1, but it can be done manually.
Closes scylladb/scylladb#28464
* github.com:scylladb/scylladb:
test: pylib: scylla_cluster: set shutdown_announce_in_ms to 0
test: test_tablets_parallel_decommission: prevent group0 majority loss
test: delete test_service_levels_work_during_recovery
The handler appeared back in c9e710dca3. In this commit it performed the
"core" part of the task -- the do_build_range() method -- inside the
streaming sched group. The setup code seemingly was copied from the
view_builder::do_build_step() method and got the explicit switch of the
scheduling group.
The switch looks both justified and not. On one hand, it makes it
explicit that the activity runs in the streaming scheduling group. On the
other hand, the verb already uses RPC index 1, which is negotiated to
run in the streaming group anyway. On the "third hand", even though
explicit, the switch happens too late, as there is a lot of other
activity performed by the handler that seems to also belong to the
same scheduling group, but which is not switched into explicitly.
By and large, it seems better to avoid the explicit switch and rely on
the RPC-level negotiation-based sched group switching.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28397
This is the continuation of #28363, this time about getting the gossiper scheduling group via database.
Several places that do it already have the gossiper at hand and should better get the group from it.
Eventually, this will allow getting rid of database::get_gossip_scheduling_group().
Refining inter-components API, not backporting
Closes scylladb/scylladb#28412
* github.com:scylladb/scylladb:
gossiper: Export its scheduling group for those who need it
migration_manager: Reorder members
Recently we had a question whether key columns can have any supported
type. I actually knew that they can't - key columns can have only
the types S(tring), B(inary) or N(umber), and that is all. But it turns
out we never had a test that confirms this understanding is true.
We did have a test for it for GSI key types already,
test_gsi.py::test_gsi_invalid_key_types, but we didn't have one for the
base table. So in this patch we add this missing test, and confirm that,
indeed, both DynamoDB and Alternator refuse a key attribute with any
type other than S, B or N.
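The rule being tested can be sketched in a few lines of Python (an illustration of the documented DynamoDB rule only; the helper name is made up and this is not Alternator's implementation):

```python
# Key attributes in DynamoDB (and Alternator) may only be of the scalar
# types S (string), B (binary) or N (number). Illustrative helper.
ALLOWED_KEY_TYPES = {"S", "B", "N"}

def validate_key_attribute(attribute_type: str) -> None:
    if attribute_type not in ALLOWED_KEY_TYPES:
        raise ValueError(
            f"ValidationException: key attribute type {attribute_type!r} "
            "must be one of S, B or N")

validate_key_attribute("S")       # accepted
# validate_key_attribute("BOOL")  # would raise: BOOL not allowed for keys
```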
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#28479
The current way checks for failures only during the test phase, and it
misses cases when the failure happens in another phase. This PR eliminates
this, so every phase has a modified node reporter to enrich the
JUnit XML report with the custom attribute function_path.
Closes scylladb/scylladb#28462
The current way always assumed that the error happened in the test file,
but that is not always true. This PR shows the error from the boost
logger where the error actually happened.
Closes scylladb/scylladb#28429
1. fmt::localtime is deprecated.
2. We should really print times in UTC, especially on the cloud.
3. The current log message does not print the timezone, so it'd be unclear
to anyone reading the log message whether the expiration time is in the
local timezone or in GMT/UTC.
Fixes the following warning:
```
gms/gossiper.cc:2428:28: warning: 'localtime' is deprecated [-Wdeprecated-declarations]
2428 | endpoint, fmt::localtime(clk::to_time_t(expire_time)), expire_time.time_since_epoch().count(),
| ^
/usr/include/fmt/chrono.h:538:1: note: 'localtime' has been explicitly marked deprecated here
538 | FMT_DEPRECATED inline auto localtime(std::time_t time) -> std::tm {
| ^
/usr/include/fmt/base.h:207:28: note: expanded from macro 'FMT_DEPRECATED'
207 | # define FMT_DEPRECATED [[deprecated]]
| ^
```
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes scylladb/scylladb#28434
Some storage_service rpc verbs may check that a handler is executed
inside the gossiper scheduling group. For that, the expected group is
grabbed from the database.
This patch puts the gossiper sched group into debug namespace and makes
this check use it from there. It removes one more place that uses
database as config provider.
Refs #28410
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28427
The test test_to_json_double used to fail due to #7972, but this issue
was already fixed in Scylla 5.1 and we didn't notice.
So remove the xfail marker from this test, and also update another test
which still xfails but no longer due to this issue.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The tests testWithUnsetValues and testFilteringWithoutIndices used to fail
due to #10358, but this issue was already fixed three years ago, when the
UNSET-checking code was cleaned up, and the test is now passing.
So remove the xfail marker from these tests.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The test testInvalidNonFrozenUDTRelation used to fail due to #10632
(an incorrectly-printed column name in an error message) and was marked
"xfail". But this issue has already been fixed two years ago, and
the test is now passing. So remove the xfail marker.
The test test_metrics.py::test_update_item_increases_metrics_for_new_item_size_only
tests whether the Alternator metrics report the exactly-DynamoDB-compatible
WCU number. It is parameterized with two cases - one that uses
alternator_force_read_before_write and one which doesn't.
The case that uses alternator_force_read_before_write is expected to
measure the "accurate" WCU, and currently it doesn't, so the test
rightly xfails.
But the case that doesn't use alternator_force_read_before_write is not
expected to measure the "accurate" WCU and has a different expectation,
so this case actually passes. But because the entire test is marked
xfail, it is reported as "XPASS" - unexpected pass.
Fix this by marking only the "True" case with xfail, while the "False"
case is not marked. After this patch, the True case continues to XFAIL
and the False case passes normally, instead of XPASS.
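The per-case marking described above uses a standard pytest idiom; a generic sketch (the test name, body, and reason string are placeholders, not the actual test_metrics.py code):

```python
import pytest

# Mark only the True parametrization as xfail; the False case is
# expected to pass normally.
@pytest.mark.parametrize("force_read_before_write", [
    pytest.param(True, marks=pytest.mark.xfail(
        reason="accurate WCU not measured yet")),
    False,
])
def test_wcu_metrics(force_read_before_write):
    # Placeholder body: the real test measures Alternator's WCU metrics.
    ...
```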
Also removed a sentence promising that the failing case will be solved
"by the next PR". Clearly this didn't happen. Maybe we even have such
a PR open (?), but it won't be "the next PR" even if merged today.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Paxos state tables are internal tables fully managed by Scylla;
they shouldn't be exposed to the user, nor should they be backed up.
This commit hides these kinds of tables from all listings, and if such a table
is directly described with `DESC ks."tbl$paxos"`, the description is generated
within a comment and a note for the user is added.
Fixes https://github.com/scylladb/scylladb/issues/28183
LWT on tablets and paxos state tables are present in 2025.4, so the patch should be backported to this version.
Closes scylladb/scylladb#28230
* github.com:scylladb/scylladb:
test/cqlpy: add reproducer for hidden Paxos table being shown by DESC
cql3/statements/describe_statement: hide paxos state tables
Copilot found these typos in comments and variable name in alternator/,
so might as well fix them.
There are no functional changes in this patch.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#28447
This patch series copies `guardrails_test.py` from scylla-dtest, fixes it, and enables it.
The motivation is to unify the test execution of guardrails test, as some tests (`cqlpy/test_guardrail_...`) were already in scylladb repo, and some were in `scylla-dtest`.
Fixes: SCYLLADB-255
No backport, just test migration
Closes scylladb/scylladb#28454
* github.com:scylladb/scylladb:
test: refactor test_all_rf_limits in guardrails_test.py
test: specify exceptions being caught in guardrails_test.py
test: enable guardrails_test.py
test: add wait_other_notice to test_default_rf in guardrails_test.py
test: copy guardrails_test.py from scylla-dtest
Next Fedora will likely not have toxiproxy packaged [1]. Adapt
by installing it directly. To avoid changing the current toolchain,
add a ./install-dependencies --future option. This will allow us
to easily go back to the packages if the Fedora bug is fixed.
[1] https://bugzilla.redhat.com/show_bug.cgi?id=2426954
Closes scylladb/scylladb#28444
Modern toxiproxy interprets `-h` as help and requires the subcommand
subject (e.g. the proxy name) to be after the subcommand switches.
Arrange the command line in the way it likes, and spell out the
subcommands to be more comprehensible.
Closes scylladb/scylladb#28442
related PR: https://github.com/scylladb/scylladb/pull/27527
This PR changes test.py logic of parsing boost test cases to use -- --list_json_content
and pass boost labels as pytest markers.
Using -- --list_json_content is not ideal and currently requires implementing several [workarounds](https://github.com/scylladb/scylladb/pull/27527#issuecomment-3765499812), but having the ability to support boost labels in pytest is worth it, because now we can apply the tiering mechanism for the boost tests as well.
Fixes SCYLLADB-246
Closes scylladb/scylladb#28232
* github.com:scylladb/scylladb:
test: add nightly label
test.py: support boost labels in test.py
Hints destined for some other node can only be drained after the other node is no longer a replica of any vnode or tablet. In case when tablets are present, a node might still technically be a replica of some tablets after it moved to left state. When it no longer is a replica of any tablet, it becomes "released" and storage service generates a notification about it. Hinted handoff listens to this notification and kicks off draining hints after getting it.
The current implementation of the "released" notification would trigger every time raft topology state is reloaded and a left node without any tokens is present in the raft topology. Although draining hints is idempotent, generating duplicate notifications is wasteful, and recently became very noisy after the verbosity of the draining-related log messages was increased in 44de563. The verbosity increase itself makes sense, as draining is supposed to be a rare operation, but the duplicate notification bug now needs to be addressed.
Fix the duplicate notification problem by passing the list of previously released nodes to the `storage_service::raft_topology_update_ip` function and filtering based on it. If this function processes the topology state for the first time, it will not produce any notifications. This is fine as hinted handoff is prepared to detect "released" nodes during the startup sequence in main.cc and start draining the hints there, if needed.
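The filtering can be sketched as a simple set difference (illustrative Python with made-up names; the real fix lives in the C++ `storage_service::raft_topology_update_ip`):

```python
# Illustrative sketch of deduplicating "released" notifications.
# prev_released is None the first time the topology state is processed;
# in that case no notifications are generated, since hinted handoff
# detects released nodes itself during the startup sequence.
def nodes_to_notify(prev_released, currently_released):
    if prev_released is None:
        return set()
    return set(currently_released) - set(prev_released)
```

A node already present in the previous snapshot yields no new notification, which is exactly the dedup behavior described above.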
Fixes: scylladb/scylladb#28301
Refs: scylladb/scylladb#25031
The log messages added in 44de563 cause a lot of noise during topology operations and tablet migrations, so the fix should be backported to all affected versions (2025.4 and 2026.1).
Closes scylladb/scylladb#28367
* github.com:scylladb/scylladb:
storage_service: fix indentation after previous patch
raft topology: generate notification about released nodes only once
raft topology: extract "released" nodes calculation to external function
We currently make the local node the only token owner (that owns the
whole ring) in maintenance mode, but we don't update the topology properly.
The node is present in the topology, but in the `none` state. That's how
it's inserted by `tm.get_topology().set_host_id_cfg(host_id);` in
`scylla_main`. As a result, the node started in maintenance mode crashes
in the following way in the presence of a vnodes-based keyspace with the
NetworkTopologyStrategy:
```
scylla: locator/network_topology_strategy.cc:207:
locator::natural_endpoints_tracker::natural_endpoints_tracker(
const token_metadata &, const network_topology_strategy::dc_rep_factor_map &):
Assertion `!_token_owners.empty() && !_racks.empty()' failed.
```
Both `_token_owners` and `_racks` are empty. The reason is that
`_tm.get_datacenter_token_owners()` and
`_tm.get_datacenter_racks_token_owners()` called above filter out nodes
in the `none` state.
This bug basically made maintenance mode unusable in customer clusters.
We fix it by changing the node state to `normal`.
We also extend `test_maintenance_mode` to provide a reproducer for #27988.
Fixes #27988
This PR must be backported to all branches, as maintenance mode is
currently unusable everywhere.
Closes scylladb/scylladb#28322
* github.com:scylladb/scylladb:
test: test_maintenance_mode: enable maintenance mode properly
test: test_maintenance_mode: shutdown cluster connections
test: test_maintenance_mode: run with different keyspace options
test: test_maintenance_mode: check that group0 is disabled by creating a keyspace
test: test_maintenance_mode: get rid of the conditional skip
test: test_maintenance_mode: remove the redundant value from the query result
storage_proxy: skip validate_read_replica in maintenance mode
storage_service: set up topology properly in maintenance mode
Before this commit, `test_all_rf_limits` was implemented in a
repetitive manner, making it harder to understand how the guardrails
were tested. This commit refactors the test to reduce code redundancy
and verify the guardrails more explicitly.
Before this commit, the test caught a broad `Exception`. This change
specifies the expected exceptions to avoid a situation where the product
or test is broken and it goes undetected.
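The change can be sketched generically in Python (made-up guardrail and exception, not the actual dtest code): a broad `except Exception` also swallows unrelated bugs in the product or the test, while asserting the specific type and message does not.

```python
# Hypothetical guardrail check used only for illustration.
def set_rf(rf, max_rf=3):
    if rf > max_rf:
        raise ValueError(f"replication factor {rf} exceeds maximum {max_rf}")

# Before: `try: set_rf(5)` / `except Exception: pass` would also hide
# a typo in the test or a crash in the product.
# After: assert the specific exception type and message.
def check_guardrail():
    try:
        set_rf(5)
    except ValueError as e:
        assert "exceeds maximum" in str(e)
    else:
        raise AssertionError("guardrail did not fire")
```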
The usual Scylla shutdown in a cluster test takes ~2.1s. 2s come from
```
co_await sleep(std::chrono::milliseconds(_gcfg.shutdown_announce_ms));
```
as the default value of `shutdown_announce_in_ms` is 2000. This sleep
makes every `server_stop_gracefully` call 2s slower. There are ~300 such
calls in cluster tests (note that some come from `rolling_restart`). So,
it looks like this sleep makes cluster tests 300 * 2s = 10min slower.
Indeed, `./test.py --mode=dev cluster` takes 61min instead of 71min
on the potwor machine (the one in the Warsaw office) without it.
We set `shutdown_announce_in_ms` to 0 for all cluster tests to make them
faster.
The sleep is completely unnecessary in tests. Removing it could introduce
flakiness, but if that's the case, then the test for which it happens is
incorrect in the first place. Tests shouldn't assume that all nodes
receive and handle the shutdown message in 2s. They should use functions
like `server_not_sees_other_server` instead, which are faster and more
reliable.
Both of the changed test cases stop two out of four nodes when there are
three group0 voters in the cluster. If one of the two live nodes is
a non-voter (node 1, specifically, as node 0 is the leader), a temporary
majority loss occurs, which can cause the following operations to fail.
In the case of `test_tablets_are_rebuilt_in_parallel`, the `exclude_node`
API can fail. In the case of `test_remove_is_canceled_if_there_is_node_down`,
removenode can fail with an unexpected error message:
```
"service::raft_operation_timeout_error (group
[46dd9cf1-fe21-11f0-baa0-03429f562ff5] raft operation [read_barrier] timed out)"
```
Somehow, these test cases are currently not flaky, but they become flaky in
the following commit.
We can consider backporting this commit to 2026.1 to prevent flakiness.
The test becomes flaky in one of the following commits. However, there is
no need to fix it, as we should delete it anyway. We are in the process of
removing the gossip-based topology from the code base, which includes the
recovery mode. We don't have to rewrite the test to use the new Raft-based
recovery procedure, as there is nothing interesting to test (no regression
to legacy service levels).
Add nightly label for the test
test_foreign_reader_as_mutation_source
as an example of using boost labels as pytest markers.
Command to test:
./tools/toolchain/dbuild pytest --test-py-init --collect-only -q -m=nightly test/boost
output:
boost/mutation_reader_test.cc::test_foreign_reader_as_mutation_source.debug.1
boost/mutation_reader_test.cc::test_foreign_reader_as_mutation_source.release.1
boost/mutation_reader_test.cc::test_foreign_reader_as_mutation_source.dev.1
The patch marks force-gossip-topology-changes as deprecated and removes
tests that use it. There is one test (test_different_group0_ids) which
is marked as xfail instead, since it looks like gossiper mode was used
there as a way to easily achieve a certain state; more investigation
is needed to see whether the test can be fixed to use raft mode instead.
Closes scylladb/scylladb#28383
This reverts commit 7bf7ff785a. The commit
tried to add clean shutdown to `scylla perf` paths, but forgot at least
`scylla perf-alternator --workload wr` which now crashes on uninitialized
`c.as`.
Fixes #28473
Closes scylladb/scylladb#28478
Add support for literals in the SELECT clause. This allows
SELECT fn(column, 4) or SELECT fn(column, ?).
Note, "SELECT 7 FROM tab" becomes valid in the grammar, but is still
not accepted because of failed type inference - we cannot infer the
type of 7, and don't have a favored type for literals (like C favors
int). We might relax this later.
In the WHERE clause (and, in Cassandra, also in the SELECT clause), type
hints can also resolve type ambiguity: (bigint)7 or (text)?. But this is
deferred to a later patch.
A few changes to the grammar are needed on top of adding a `value`
alternative to `unaliasedSelector`:
- vectorSimilarityArg gained access to `value` via `unaliasedSelector`,
so it loses that alternate to avoid ambiguity. We may drop
`vectorSimilarityArg` later.
- COUNT(1) became ambiguous via the general function path (since
function arguments can now be literals), so we remove this case
from the COUNT special cases, remaining with count(*).
- SELECT JSON and SELECT DISTINCT became "ambiguous enough" for
ANTLR to complain, though as far as I can tell `value` does not
add real ambiguity. The solution is to commit early (via "=>") to
a parsing path.
Due to the loss of count(1) recognition in the parser, we have to
special-case it in prepare. We may relax it to count any expression
later, like modern Cassandra and SQL.
Testing is awkward because of the type inference problem at top level.
We test via the set_intersection() function and via lua functions.
Example:
```
cqlsh> CREATE FUNCTION ks.sum(a int, b int) RETURNS NULL ON NULL INPUT RETURNS int LANGUAGE LUA AS 'return a + b';
cqlsh> SELECT ks.sum(1, 2) FROM system.local;
ks.sum(1, 2)
--------------
3
(1 rows)
cqlsh>
```
(There are no suitable system functions!)
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-296
Closes scylladb/scylladb#28256
`system.cluster_status` is missing the rack info compared to `nodetool status`,
which is supposed to be equivalent. It was probably an omission.
Closes scylladb/scylladb#28457
The handler of raft_topology_cmd::command::stream_ranges switches to the
streaming scheduling group to perform data streaming in it. It grabs the
group from database db_config, which is not great. There's a streaming
manager at hand in storage service handlers; since the handler uses its
functionality, it should use _its_ scheduling group.
This will help splitting the streaming scheduling group into more
elaborate groups under the maintenance supergroup: SCYLLADB-351
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Closes scylladb/scylladb#28363
Before, we could observe two identical
"starting auth service" messages in the log:
one from checkpoint(), the other from notify().
We remove the second one to stay consistent
with other services.
Closes scylladb/scylladb#28349
Adds --json-result option to perf-cql-raw and perf-alternator, the same as perf-simple-query has.
It is useful for automating test runs.
Related: https://scylladb.atlassian.net/browse/SCYLLADB-434
Backport: no, the original benchmark is not backported
Closes scylladb/scylladb#28451
* github.com:scylladb/scylladb:
test: perf: add example commands to perf-alternator and perf-cql-raw
test: perf: add option to write results to json in perf-cql-raw
test: perf: add option to write results to json in perf-alternator
test: perf: move write_json_result to a common file
When the topology coordinator refreshes load_stats, it caches load_stats for every node. In case the node becomes unresponsive, and fresh load_stats can not be read from the node, the cached version of load_stats will be used. This is to allow the load balancer to have at least some information about the table sizes and disk capacities of the host.
During load_stats refresh, we aggregate the table sizes from all the nodes. This procedure calls db.find_column_family() for each table_id found in load_stats. This function will throw if the table is not found. This will cause load_stats refresh to fail.
It is also possible for a table to have been dropped between the time load_stats has been prepared on the host, and the time it is processed on the topology coordinator. This would also cause an exception in the refresh procedure.
This is fixed by checking if the table still exists.
Fixes: #28359
Closes scylladb/scylladb#28440
* github.com:scylladb/scylladb:
test: add test and reproducer for load_stats refresh exception
load_stats: handle dropped tables when refreshing load_stats
This patch adds a test and reproducer for the issue where the load_stats
refresh procedure throws exceptions if any of the tables have been
dropped since load_stats was produced.
We extend the test to provide a reproducer for #27988 and to avoid
similar bugs in the future.
The test slows down from ~14s to ~19s on my local machine in dev
mode. It seems reasonable.
In the following commit, we make the test run with multiple keyspaces,
and the old check becomes inconvenient. We also move it below, to the
part of the code that won't be executed for each keyspace.
Additionally, we check if the error message is as expected.
This skip has already caused trouble.
After 0668c642a2, the skip was always hit, and
the test was silently doing nothing. This made us miss #26816 for a long
time. The test was fixed in 222eab45f8, but we
should get rid of the skip anyway.
We increase the number of writes from 256 to 1000 to make the chance of not
finding the key on server A even lower. If that still happens, it must be
due to a bug, so we fail the test. We also make the test insert rows until
server A is a replica of one row. The expected number of inserted rows is
a small constant, so it should, in theory, make the test faster and cleaner
(we need one row on server A, so we insert exactly one such row).
It's possible to make the test fully deterministic, by e.g., hardcoding
the key and tokens of all nodes via `initial_token`, but I'm afraid it would
make the test "too deterministic" and could hide a bug.
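The insert-until-replica loop described above can be sketched as follows (hypothetical helpers `insert_row` and `server_a_owns` stand in for the real test's driver calls; the limit mirrors the 1000-write bound):

```python
import itertools

# Illustrative sketch: keep inserting rows until one lands on server A.
# If no key is found within the limit, it must be due to a bug, so fail.
def insert_until_replica_of(insert_row, server_a_owns, limit=1000):
    for key in itertools.count():
        if key >= limit:
            raise AssertionError("no key found on server A - likely a bug")
        insert_row(key)
        if server_a_owns(key):
            return key
```

The expected number of inserted rows is a small constant, so the loop terminates quickly in practice.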
In maintenance mode, the local node adds only itself to the topology. However,
the effective replication map of a keyspace with tablets enabled contains all
tablet replicas. It gets them from the tablets map, not the topology. Hence,
`network_topology_strategy::sanity_check_read_replicas` hits
```
throw std::runtime_error(format("Requested location for node {} not in topology. backtrace {}", id, lazy_backtrace()));
```
for tablet replicas other than the local node.
As a result, all requests to a keyspace with tablets enabled and RF > 1 fail
in debug mode (`validate_read_replica` does nothing in other modes). We don't
want to skip maintenance mode tests in debug mode, so we skip the check in
maintenance mode.
We move the `is_debug_build()` check because:
- `validate_read_replicas` is a static function with no access to the config,
- we want the `!_db.local().get_config().maintenance_mode()` check to be
dropped by the compiler in non-debug builds.
We also suppress `-Wunneeded-internal-declaration` with `[[maybe_unused]]`.
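The check ordering described above can be sketched in Python (illustration only; the real code is C++, where the debug-build condition lets the compiler drop the rest of the check in non-debug builds):

```python
def validate_read_replica(is_debug_build, maintenance_mode,
                          replica, topology):
    # Check is_debug_build first so that in non-debug builds the whole
    # body (including the maintenance_mode config read) is skipped.
    if not is_debug_build:
        return
    if maintenance_mode:
        # In maintenance mode only the local node is in the topology,
        # so tablet replicas on other nodes would spuriously fail here.
        return
    if replica not in topology:
        raise RuntimeError(
            f"Requested location for node {replica} not in topology")
```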
We currently make the local node the only token owner (that owns the
whole ring) in maintenance mode, but we don't update the topology properly.
The node is present in the topology, but in the `none` state. That's how
it's inserted by `tm.get_topology().set_host_id_cfg(host_id);` in
`scylla_main`. As a result, the node started in maintenance mode crashes
in the following way in the presence of a vnodes-based keyspace with the
NetworkTopologyStrategy:
```
scylla: locator/network_topology_strategy.cc:207:
locator::natural_endpoints_tracker::natural_endpoints_tracker(
const token_metadata &, const network_topology_strategy::dc_rep_factor_map &):
Assertion `!_token_owners.empty() && !_racks.empty()' failed.
```
Both `_token_owners` and `_racks` are empty. The reason is that
`_tm.get_datacenter_token_owners()` and
`_tm.get_datacenter_racks_token_owners()` called above filter out nodes
in the `none` state.
This bug basically made maintenance mode unusable in customer clusters.
We fix it by changing the node state to `normal`. We also update its
rack, datacenter, and shards count. Rack and datacenter are present in the
topology somehow, but there is nothing wrong with updating them again.
The shard count is also missing, so we better update it to avoid other
issues.
Fixes #27988
After guardrails_test.py has been migrated to test.py and fixed in
previous commits of this patch series, it can finally be enabled.
Fixes: SCYLLADB-255
This commit adds `wait_other_notice=True` to `cluster.populate` in
`guardrails_test.py`. Without this, `test_default_rf` sometimes fails
because setting `NetworkTopologyStrategy` fails before
the node knows about all other DCs.
Refs: SCYLLADB-255
This commit copies guardrails_test.py from the dtest repository and
(temporarily) disables it, as it requires improvements in the following
commits of this patch series before being enabled.
Refs: SCYLLADB-255
The schema is already a member of the select statement; avoiding
the call saves around 400 CPU instructions on the select
request hot path.
Closes scylladb/scylladb#28328
When the topology coordinator refreshes load_stats, it caches load_stats
for every node. In case a node becomes unresponsive and fresh
load_stats cannot be read from it, the cached version of
load_stats will be used. This allows the load balancer to
have at least some information about the table sizes and disk capacities
of the host.
During load_stats refresh, we aggregate the table sizes from all the
nodes. This procedure calls db.find_column_family() for each table_id
found in load_stats. This function will throw if the table is not found.
This will cause load_stats refresh to fail.
It is also possible for a table to have been dropped between the time
load_stats has been prepared on the host, and the time it is processed
on the topology coordinator. This would also cause an exception in the
refresh procedure.
This patch fixes this problem by checking if the table still exists.
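The refresh path described above can be sketched in a few lines (a hedged Python illustration with hypothetical names; the real code is C++ and calls `db.find_column_family()`): aggregate per-table sizes from every node's load_stats, skipping any table_id that no longer exists instead of throwing.

```python
def aggregate_table_sizes(load_stats_per_node, existing_tables):
    """load_stats_per_node: {node: {table_id: size_in_bytes}};
    existing_tables: set of table_ids known to the current schema."""
    totals = {}
    for stats in load_stats_per_node.values():
        for table_id, size in stats.items():
            if table_id not in existing_tables:
                continue  # table dropped since the stats were prepared; skip
            totals[table_id] = totals.get(table_id, 0) + size
    return totals
```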
The Vector Search feature needs to support creating vector indexes with additional
filtering columns. There will be two types of indexes: global, which indexes
vectors per table, and local, which indexes vectors per partition key. The new
syntaxes are based on ScyllaDB's Global Secondary Index and Local Secondary
Index. Vector indexes don't use secondary index functionality in any way;
all indexing, filtering, and data processing is done on the Vector Store side.
This patch allows creating vector indexes using this CQL syntax:
```
CREATE TABLE IF NOT EXISTS cycling.comments_vs (
commenter text,
comment text,
comment_vector VECTOR <FLOAT, 5>,
created_at timestamp,
discussion_board_id int,
country text,
lang text,
PRIMARY KEY ((commenter, discussion_board_id), created_at)
);
CREATE CUSTOM INDEX IF NOT EXISTS global_ann_index
ON cycling.comments_vs(comment_vector, country, lang) USING 'vector_index'
WITH OPTIONS = { 'similarity_function': 'DOT_PRODUCT' };
CREATE CUSTOM INDEX IF NOT EXISTS local_ann_index
ON cycling.comments_vs((commenter, discussion_board_id), comment_vector, country, lang)
USING 'vector_index'
WITH OPTIONS = { 'similarity_function': 'DOT_PRODUCT' };
```
Currently, if we run these queries to create the indexes, we receive errors such as:
```
InvalidRequest: Error from server: code=2200 [Invalid query] message="Vector index can only be created on a single column"
InvalidRequest: Error from server: code=2200 [Invalid query] message="Local index definition must contain full partition key only. Redundant column: XYZ"
```
This commit refactors `vector_index::check_target` to correctly validate the
columns that build the index. Vector Store currently supports filtering only by
native types, so the column types are checked. The first column in the list must
be a vector (the index is built from these vectors), so this is also checked.
The allowed column types are native types, excluding counter (it is not possible
to create a table with both a counter and a vector) and duration (durations
cannot be compared correctly; this type is not even allowed in secondary
indexes).
This commit adds a cqlpy test to check the errors raised while creating indexes.
Fixes: SCYLLADB-298
This needs to be backported to version 2026.1 as this is a fix for filtering support.
Closes scylladb/scylladb#28366
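The validation rules above can be sketched as follows (a hedged Python illustration with hypothetical type names and an abridged type list; the real check lives in the C++ `vector_index::check_target`):

```python
# Abridged set of native types accepted as filtering columns (illustrative).
NATIVE_TYPES = {"text", "int", "bigint", "float", "double", "timestamp", "boolean", "uuid"}
# Types rejected for filtering: counters can't coexist with vectors,
# and durations can't be compared correctly.
DISALLOWED = {"counter", "duration"}

def check_vector_index_columns(column_types):
    # The first column must be a vector: the index is built from these vectors.
    if not column_types or not column_types[0].startswith("vector"):
        raise ValueError("Vector index must be created on a vector column")
    # The remaining (filtering) columns must be allowed native types.
    for t in column_types[1:]:
        if t in DISALLOWED or t not in NATIVE_TYPES:
            raise ValueError(f"Type '{t}' is not allowed as a filtering column")
```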
These were marked xfail due to #8077 (the column name was wrong),
but it was fixed long ago for 5.4 (exact commit not known).
Remove the xfail markers to prevent regressions.
Closes scylladb/scylladb#28432
There are several places in the code that need to explicitly switch into
the gossiper scheduling group. For that they currently call the database to
provide the group, but it's better to get the gossiper scheduling group from
the gossiper itself, especially since all those places have the gossiper at hand.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This is to initialize dependency references, in particular gossiper&,
before _group0_barrier. The latter will need to access this->_gossiper
in the next patch.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Fixes #28398
When used as path elements in Google Storage paths, object names
need to be URL encoded. This was missed because a) the tests did not really
use prefixes containing non-URL-valid chars (e.g. /), and b) the mock server
used for most testing does not enforce this particular aspect.
Modified unit tests to use prefixing for all names, so that when run
against real GS, any errors like this will show up.
Fixes #28399
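The fix amounts to percent-encoding object names before embedding them as path elements. A minimal sketch using Python's standard library (the actual client is C++, and the URL layout shown is an assumption for illustration):

```python
from urllib.parse import quote

def gs_object_path(bucket, object_name):
    # safe="" also encodes '/', so a prefixed object name like "pfx/name"
    # stays a single path element instead of splitting the URL path.
    return f"/storage/v1/b/{bucket}/o/{quote(object_name, safe='')}"
```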
When iterating with a pager, the mock server and real GCS behave differently:
the latter will not return a pager token for the last page, only for the
penultimate one. This needs to be handled.
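The listing loop therefore has to treat a missing token as end-of-iteration. A small sketch with a hypothetical `fetch_page(token) -> (items, next_token)` helper (not the actual C++ API):

```python
def list_all(fetch_page):
    items, token = [], None
    while True:
        page, token = fetch_page(token)
        items.extend(page)
        if not token:  # real GCS: no pager token on the last page
            break
    return items
```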
Hints destined for some other node can only be drained after the other
node is no longer a replica of any vnode or tablet. In case when tablets
are present, a node might still technically be a replica of some tablets
after it moved to left state. When it no longer is a replica of any
tablet, it becomes "released" and storage service generates a
notification about it. Hinted handoff listens to this notification and
kicks off draining hints after getting it.
The current implementation of the "released" notification would trigger
every time raft topology state is reloaded and a left node without any
tokens is present in the raft topology. Although draining hints is
idempotent, generating duplicate notifications is wasteful, and it recently
became very noisy after the verbosity of the draining-related log
messages was increased in 44de563. The verbosity increase itself makes sense,
as draining is supposed to be a rare operation, but the duplicate
notification bug now needs to be addressed.
Fix the duplicate notification problem by passing the list of previously
released nodes to the `storage_service::raft_topology_update_ip`
function and filtering based on it. If this function processes the
topology state for the first time, it will not produce any
notifications. This is fine as hinted handoff is prepared to detect
"released" nodes during the startup sequence in main.cc and start
draining the hints there, if needed.
Fixes: #28301
Refs: #25031
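The filtering described above boils down to a set difference between the previously and currently released nodes; a sketch with hypothetical names (the real logic is in the C++ `storage_service::raft_topology_update_ip`):

```python
def newly_released(previously_released, currently_released, first_reload):
    if first_reload:
        # First topology reload: emit no notifications; the startup
        # sequence in main.cc already detects released nodes and starts
        # draining hints if needed.
        return set()
    return set(currently_released) - set(previously_released)
```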
In the following commits we will need to compare the set of released
nodes before and after reload of raft topology state. Moving the logic
that calculates such a set to a separate function will make it easier to
do.
This patch adds a reproducer test showing issue #28183 - that when LWT
is used, hidden tables "...$paxos" are created but they are unexpectedly
shown by DESC TABLES, DESC SCHEMA and DESC KEYSPACE.
The new test was failing (in three places) on Scylla, as those internal
(and illegally-named) tables are listed, and passes on Cassandra
(which doesn't add hidden tables for LWT).
The commit also contains another test, which verifies that a direct
description of a paxos state table is wrapped in a comment.
Refs #28183.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Paxos state tables are internal tables fully managed by Scylla;
they shouldn't be exposed to the user, nor should they be backed up.
This commit hides such tables from all listings, and if such a table
is directly described with `DESC ks."tbl$paxos"`, the description is generated
within a comment and a note for the user is added.
Fixes scylladb/scylladb#28183
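The listing behavior can be illustrated with a tiny sketch (hypothetical helper name; the real filtering is implemented in Scylla's C++ description code):

```python
def visible_tables(table_names):
    # Hide internal LWT state tables ("...$paxos") from DESC TABLES /
    # DESC SCHEMA / DESC KEYSPACE output, matching Cassandra, which has
    # no such hidden tables at all.
    return [t for t in table_names if not t.endswith("$paxos")]
```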
When this column and relevant SUPPORTED key were added, the
documentation was mistakenly put in the section about shard awareness
extension. This commit moves the documentation into a dedicated section.
I also expanded it to describe both the new column and the new SUPPORTED
key.
- name: Comment and close if author email is scylladb.com
  uses: actions/github-script@v7
  with:
    github-token: ${{ secrets.GITHUB_TOKEN }}
    script: |
const issue = context.payload.issue;
const actor = context.actor;
// Get user data (only public email is available)
const { data: user } = await github.rest.users.getByUsername({
username: actor,
});
const email = user.email || "";
console.log(`Actor: ${actor}, public email: ${email || "<none>"}`);
// Only continue if email exists and ends with @scylladb.com
if (!email || !email.toLowerCase().endsWith("@scylladb.com")) {
console.log("User is not a scylladb.com email (or email not public); skipping.");
return;
}
const owner = context.repo.owner;
const repo = context.repo.repo;
const issue_number = issue.number;
const body = "Issues in this repository are closed automatically. Scylla associates should use Jira to manage issues.\nPlease move this issue to Jira https://scylladb.atlassian.net/jira/software/c/projects/SCYLLADB/list";
// The attribute is consumed as timestamp, not stored in _cells.
continue;
}
throw api_error::validation(fmt::format("The '{}' attribute used as a write timestamp must be a positive number (microseconds since epoch)", to_string_view(*timestamp_attribute)));
}
_length_in_bytes += column_name.size();
if (!cdef) {
// This attribute may be a key column of one of the GSI or LSI,
throw api_error::validation("Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information.");
throw api_error::validation(fmt::format("The '{}' attribute used as a write timestamp must be a positive number (microseconds since epoch)", to_string_view(*_timestamp_attribute)));
}
}
if (action == "DELETE") {
// The DELETE operation can do two unrelated tasks. Without a
// "Value" option, it is used to delete an attribute. With a
throw api_error::validation(fmt::format("The '{}' attribute used as a write timestamp must be a positive number (microseconds since epoch)", to_string_view(*_timestamp_attribute)));
}
}
if (actions.second.has_value()) {
// An action on a top-level attribute column_name. The single
// action is actions.second.get_value(). We can simply invoke
"How long permissions in cache remain valid. Depending on the authorizer, such as CassandraAuthorizer, fetching permissions can be resource intensive. Permissions caching is disabled when this property is set to 0 or when AllowAllAuthorizer is used. The cached value is considered valid as long as both its value is not older than the permissions_validity_in_ms "
"How long authorized statements cache entries remain valid. The cached value is considered valid as long as both its value is not older than the permissions_validity_in_ms "
"and the cached value has been read at least once during the permissions_validity_in_ms time frame. If any of these two conditions doesn't hold the cached value is going to be evicted from the cache.\n"
"Refresh interval for permissions cache (if enabled). After this interval, cache entries become eligible for refresh. An async reload is scheduled every permissions_update_interval_in_ms time period and the old value is returned until it completes. If permissions_validity_in_ms has a non-zero value, then this property must also have a non-zero value. It's recommended to set this value to be at least 3 times smaller than the permissions_validity_in_ms.")
"Refresh interval for authorized statements cache. After this interval, cache entries become eligible for refresh. An async reload is scheduled every permissions_update_interval_in_ms time period and the old value is returned until it completes. If permissions_validity_in_ms has a non-zero value, then this property must also have a non-zero value. It's recommended to set this value to be at least 3 times smaller than the permissions_validity_in_ms. This option additionally controls the permissions refresh interval for LDAP.")
"Enable or disable inter-node encryption. You must also generate keys and provide the appropriate key and trust store locations and passwords. The available options are:\n"
,ignore_dead_nodes_for_replace(this,"ignore_dead_nodes_for_replace",value_status::Used,"","List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
,override_decommission(this,"override_decommission",value_status::Deprecated,false,"Set true to force a decommissioned node to join the cluster (cannot be set if consistent-cluster-management is enabled).")
,enable_repair_based_node_ops(this,"enable_repair_based_node_ops",liveness::LiveUpdate,value_status::Used,true,"Set true to enable repair based node operations instead of streaming based.")
,allowed_repair_based_node_ops(this,"allowed_repair_based_node_ops",liveness::LiveUpdate,value_status::Used,"replace,removenode,rebuild,bootstrap,decommission","A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
,allowed_repair_based_node_ops(this,"allowed_repair_based_node_ops",liveness::LiveUpdate,value_status::Used,"replace,removenode,rebuild","A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
,enable_compacting_data_for_streaming_and_repair(this,"enable_compacting_data_for_streaming_and_repair",liveness::LiveUpdate,value_status::Used,true,"Enable the compacting reader, which compacts the data for streaming and repair (load'n'stream included) before sending it to, or synchronizing it with peers. Can reduce the amount of data to be processed by removing dead data, but adds CPU overhead.")
"If the compacting reader is enabled for streaming and repair (see enable_compacting_data_for_streaming_and_repair), allow it to garbage-collect tombstones."
"The maximum fraction of cache memory permitted for use by index cache. Clamped to the [0.0; 1.0] range. Must be small enough to not deprive the row cache of memory, but should be big enough to fit a large fraction of the index. The default value 0.2 means that at least 80\% of cache memory is reserved for the row cache, while at most 20\% is usable by the index cache.")
,consistent_cluster_management(this,"consistent_cluster_management",value_status::Deprecated,true,"Use RAFT for cluster management and DDL.")
,force_gossip_topology_changes(this,"force_gossip_topology_changes",value_status::Used,false,"Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing. Note: gossip topology changes are incompatible with tablets.")
,force_gossip_topology_changes(this,"force_gossip_topology_changes",value_status::Deprecated,false,"Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing. Note: gossip topology changes are incompatible with tablets.")
,recovery_leader(this,"recovery_leader",liveness::LiveUpdate,value_status::Used,utils::null_uuid(),"Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
,wasm_cache_memory_fraction(this,"wasm_cache_memory_fraction",value_status::Used,0.01,"Maximum total size of all WASM instances stored in the cache as fraction of total shard memory.")
,wasm_cache_timeout_in_ms(this,"wasm_cache_timeout_in_ms",value_status::Used,5000,"Time after which an instance is evicted from the cache.")
"Allows target tablet size to be configured. Defaults to 5G (in bytes). Maintaining tablets at reasonable sizes is important to be able to " \
"redistribute load. A higher value means tablet migration throughput can be reduced. A lower value may cause number of tablets to increase significantly, " \
"potentially resulting in performance drawbacks.")
"Maximum number of tablets which may be leaving a shard at the same time. Effective only on the topology coordinator. Set to the same value on all nodes.")
"Maximum number of tablets which may be pending on a shard at the same time. Effective only on the topology coordinator. Set to the same value on all nodes.")
,replication_strategy_warn_list(this,"replication_strategy_warn_list",liveness::LiveUpdate,value_status::Used,{locator::replication_strategy_type::simple},"Controls which replication strategies to warn about when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
,replication_strategy_fail_list(this,"replication_strategy_fail_list",liveness::LiveUpdate,value_status::Used,{},"Controls which replication strategies are disallowed to be used when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
,service_levels_interval(this,"service_levels_interval_ms",liveness::LiveUpdate,value_status::Used,10000,"Controls how often service levels module polls configuration table")
,audit(this,"audit",value_status::Used,"none",
,audit(this,"audit",value_status::Used,"table",
"Controls the audit feature:\n"
"\n"
"\tnone : No auditing enabled.\n"
"\tsyslog : Audit messages sent to Syslog.\n"
"\ttable : Audit messages written to column family named audit.audit_log.\n")
,audit_categories(this,"audit_categories",liveness::LiveUpdate,value_status::Used,"DCL,DDL,AUTH","Comma separated list of operation categories that should be audited.")
,audit_categories(this,"audit_categories",liveness::LiveUpdate,value_status::Used,"DCL,AUTH,ADMIN","Comma separated list of operation categories that should be audited.")
,audit_tables(this,"audit_tables",liveness::LiveUpdate,value_status::Used,"","Comma separated list of table names (<keyspace>.<table>) that will be audited.")
,audit_keyspaces(this,"audit_keyspaces",liveness::LiveUpdate,value_status::Used,"","Comma separated list of keyspaces that will be audited. All tables in those keyspaces will be audited")
,audit_unix_socket_path(this,"audit_unix_socket_path",value_status::Used,"/dev/log","The path to the unix socket used for writing to syslog. Only applicable when audit is set to syslog.")
@@ -200,8 +200,6 @@ for two cases. One is setting replication factor to 0, in which case the number
The other is when the numeric replication factor is equal to the current number of replicas
for a given datacenter, in which case the current rack list is preserved.
Altering from a numeric replication factor to a rack list is not supported yet.
Note that when ``ALTER`` ing keyspaces and supplying ``replication_factor``,
auto-expansion will only *add* new datacenters for safety, it will not alter
existing datacenters or remove any even if they are no longer in the cluster.
@@ -424,6 +422,21 @@ Altering from a rack list to a numeric replication factor is not supported.
Keyspaces which use rack lists are :term:`RF-rack-valid <RF-rack-valid keyspace>` if each rack in the rack list contains at least one node (excluding :doc:`zero-token nodes </architecture/zero-token-nodes>`).
.. _conversion-to-rack-list-rf:
Conversion to rack-list replication factor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To migrate a keyspace from a numeric replication factor to a rack-list replication factor, provide the rack-list replication factor explicitly in ALTER KEYSPACE statement. The number of racks in the list must be equal to the numeric replication factor. The replication factor can be converted in any number of DCs at once. In a statement that converts replication factor, no replication factor updates (increase or decrease) are allowed in any DC.
* On a regular system or VM (running Ubuntu, CentOS, or RedHat Enterprise): :code:`$ scylla --version`
Check the :doc:`Operating System Support Guide </getting-started/os-support>` for a list of supported operating systems and versions.
Check the `Operating System Support Guide <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_ for a list of supported operating systems and versions.
* On a docker node: :code:`$ docker exec -it Node_Z scylla --version`
Traditionally, launching `repairs </operating-scylla/procedures/maintenance/repair>`_ in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.
Traditionally, launching :doc:`repairs </operating-scylla/procedures/maintenance/repair>` in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.
Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the tablet `tablet </architecture/tablets>`_ automatically.
Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the :doc:`tablet table</architecture/tablets>` automatically.
Repairs are spread over time and among nodes and shards, to avoid load spikes or any adverse effects on user workloads.
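The scheduling rule in miniature (a hedged sketch with hypothetical names, not the actual implementation): a tablet is due for repair when the time since its last repair exceeds the configured interval.

```python
def repair_due(last_repair_ts, now, repair_interval):
    # All values in the same time unit, e.g. seconds since epoch.
    return (now - last_repair_ts) > repair_interval
```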
To enable automatic repair, add this to the configuration (``scylla.yaml``):
@@ -20,4 +20,4 @@ More featureful configuration methods will be implemented in the future.
To disable, set ``auto_repair_enabled_default: false``.
Automatic repair relies on `Incremental Repair </features/incremental-repair>`_ and as such it only works with `tablet </architecture/tablets>`_ tables.
Automatic repair relies on :doc:`Incremental Repair </features/incremental-repair>` and as such it only works with :doc:`tablet </architecture/tablets>` tables.
ScyllaDB's standard `repair </operating-scylla/procedures/maintenance/repair>`_ process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
ScyllaDB's standard :doc:`repair </operating-scylla/procedures/maintenance/repair>` process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
The core idea of incremental repair is to repair only the data that has been written or changed since the last repair was run. It intelligently skips data that has already been verified, dramatically reducing the time, I/O, and CPU resources required for the repair operation.
@@ -51,7 +51,7 @@ Benefits of Incremental Repair
* **Reduced Resource Usage:** Consumes significantly less CPU, I/O, and network bandwidth compared to a full repair.
* **More Frequent Repairs:** The efficiency of incremental repair allows you to run it more frequently, ensuring a higher level of data consistency across your cluster at all times.
Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with `Automatic Repair </features/automatic-repair>`_.
Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with :doc:`Automatic Repair </features/automatic-repair>`.
All releases are available as Docker containers, EC2 AMIs, and GCP and Azure images.
.. _os-support-definition:
By *supported*, it is meant that:
- A binary installation package is available.
- The download and install procedures are tested as part of the ScyllaDB release process for each version.
- An automated install is included from :doc:`ScyllaDB Web Installer for Linux tool </getting-started/installation-common/scylla-web-installer>` (for the latest versions).
You can `build ScyllaDB from source <https://github.com/scylladb/scylladb#build-prerequisites>`_
on other x86_64 or aarch64 platforms, without any guarantees.
**Topic: Mitigate non-paged queries coming from connection authentications**
**Audience: ScyllaDB administrators**
Issue
-----
If you create lots of roles and give them lots of permissions, your nodes might spike with non-paged queries.
Root Cause
----------
``permissions_cache_max_entries`` is set to 1000 by default. This setting may not be high enough for bigger deployments with lots of tables, users, and roles with permissions.
Solution
--------
Open the scylla.yaml configuration for editing and adjust the following parameters:
``permissions_cache_max_entries`` - increase this value to suit your needs. See the example below.
``permissions_update_interval_in_ms``
``permissions_validity_in_ms``
.. note:: ``permissions_update_interval_in_ms`` and ``permissions_validity_in_ms`` can also be set to make the authentication records come from the cache instead of lookups, which generate non-paged queries.
Example
-------
Note that ``permissions_cache_max_entries`` has no maximum value; it is limited only by your memory.
The cache consumes memory as it caches all records from the list of users and their associated roles (similar to a Cartesian product).
Every user, role, and permission (7 types) is cached on a per-table basis.
If, for example, you have 1 user with 1 role and 1 table, the table will have 7 permission types and 7 entries: 1 * 1 * 1 * 7 = 7.
When expanded to 5 users, 5 roles, and 10 tables, this becomes 5 * 5 * 10 * 7 = 1750 entries, which is above the default cache value of 1000. The entries that go over the maximum (750 entries) will result in non-paged queries for every new connection from the client (and clients tend to reconnect often).
In cases like this, you may want to consider trading memory for not stressing the entire cluster with ``auth`` queries.
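The sizing arithmetic from this example is simply users x roles x tables x 7 permission types; as a quick sanity check:

```python
def permissions_cache_entries(users, roles, tables, permission_types=7):
    # Each (user, role, table) combination is cached once per permission type.
    return users * roles * tables * permission_types
```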
* :doc:`If a query does not reveal enough results </kb/cqlsh-results>`
* :doc:`How to Change gc_grace_seconds for a Table </kb/gc-grace-seconds>` - How to change the ``gc_grace_seconds`` parameter and prevent data resurrection.
* :doc:`How to flush old tombstones from a table </kb/tombstones-flush>` - How to remove old tombstones from SSTables.
* :doc:`Increase Cache to Avoid Non-paged Queries </kb/increase-permission-cache>` - How to increase the ``permissions_cache_max_entries`` setting.
* :doc:`How to Safely Increase the Replication Factor </kb/rf-increase>`
* :doc:`Facts about TTL, Compaction, and gc_grace_seconds <ttl-facts>`
* :doc:`Efficient Tombstone Garbage Collection in ICS <garbage-collection-ics>`
If ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use rack list replication factor, so that a new DC (rack) can be added. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to add a datacenter:
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
#. If any vnode keyspace was altered, run ``nodetool rebuild`` on each node in the new datacenter, specifying the existing datacenter name in the rebuild command.
For example:
@@ -198,7 +239,7 @@ Add New DC
The rebuild ensures that the new nodes that were just added to the cluster will recognize the existing datacenters in the cluster.
#. Run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_
#. If any vnode keyspace was altered, run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_
#. If you are using ScyllaDB Monitoring, update the `monitoring stack <https://monitoring.docs.scylladb.com/stable/install/monitoring_stack.html#configure-scylla-nodes-from-files>`_ to monitor it. If you are using ScyllaDB Manager, make sure you install the `Manager Agent <https://manager.docs.scylladb.com/stable/install-scylla-manager-agent.html>`_ and Manager can access the new DC.
#. Run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.
#. If there are vnode keyspaces in this DC, run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.
For example:
If the ASIA-DC cluster is to be removed, then, run the ``nodetool repair -pr`` command on all the nodes in the ASIA-DC
#. If there are tablet keyspaces in this DC, run the ``nodetool cluster repair`` on an arbitrary node. The reason for running repair is to ensure that any updates stored only on the about-to-be-decommissioned replicas are propagated to the other replicas, before the replicas on the decommissioned datacenter are dropped.
#. ALTER every cluster KEYSPACE, so that the keyspaces will no longer replicate data to the decommissioned data-center.
For example:
@@ -73,6 +75,44 @@ Procedure
cqlsh> ALTER KEYSPACE nba WITH REPLICATION={'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3};
For tablet keyspaces, update the replication factor one by one:
.. code-block:: shell
cqlsh> DESCRIBE nba2
cqlsh> CREATE KEYSPACE nba2 WITH REPLICATION={'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 2, 'EUROPE-DC' : 3} AND tablets={'enabled': true};
.. code-block:: shell
cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION={'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 1, 'EUROPE-DC' : 3} AND tablets={'enabled': true};
cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION={'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3} AND tablets={'enabled': true};
.. note::
If ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use rack list replication factor, so that the DC can be removed. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to remove a datacenter:
cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION={'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets={'enabled': true};
cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION={'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets={'enabled': true};
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
.. note::
If table audit is enabled, the ``audit`` keyspace is automatically created with ``NetworkTopologyStrategy``.
You must also alter the ``audit`` keyspace to remove replicas from the decommissioned data-center. For example:
.. code-block:: shell
cqlsh> ALTER KEYSPACE audit WITH REPLICATION={'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3};
Failure to do so will result in decommission errors such as "zero replica after the removal".
#. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.
*`Blog: ScyllaDB Open Source 3.1: Efficiently Maintaining Consistency with Row-Level Repair <https://www.scylladb.com/2019/08/13/scylla-open-source-3-1-efficiently-maintaining-consistency-with-row-level-repair/>`_
See also the `ScyllaDB Manager documentation <https://manager.docs.scylladb.com/>`_.
Incremental Repair
------------------
Built on top of :ref:`Row-level Repair <row-level-repair>` and :doc:`Tablets </architecture/tablets>`, Incremental Repair enables frequent and quick repairs. For more details, see :doc:`Incremental Repair </features/incremental-repair>`.
Automatic Repair
----------------
Built on top of :doc:`Incremental Repair </features/incremental-repair>`, :doc:`Automatic Repair </features/automatic-repair>` offers repair scheduling and execution directly in ScyllaDB, without external processes.
.. code-block:: shell

   # 'audit' config option controls if and where to output audited events:
   audit: "syslog"
   #
   # List of statement categories that should be audited.
For example:

.. code-block:: shell

   # 'audit' config option controls if and where to output audited events:
   audit: "table"
   #
   # List of statement categories that should be audited.
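As a hedged illustration only, the category list that follows this comment in ``scylla.yaml`` might look like the fragment below; the ``audit_categories`` option and its values are shown as an example, so check the default configuration shipped with your version:

.. code-block:: shell

   # List of statement categories that should be audited.
   audit_categories: "DCL,DDL,AUTH,ADMIN"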
Handling Audit Failures
-----------------------

In some cases, auditing may not be possible, for example, when:

* A table is used as the audit’s backend, and the partitions where the audit rows are saved are unavailable because the nodes holding those partitions are down or unreachable due to network issues.
* Syslog is used as the audit’s backend, and the Syslog sink (a regular Unix socket) is unresponsive or unavailable.

If the audit fails and audit messages are not stored in the configured audit’s backend, you can still review the audit log in the regular ScyllaDB logs.
throw std::runtime_error("Tablets cannot be enabled with gossip topology changes. Use either --tablets-mode-for-new-keyspaces=enabled|enforced or --force-gossip-topology-changes, but not both.");
}
startlog.warn("The tablets feature is disabled due to forced gossip topology changes");
on_internal_error(tlogger, format("shared_token_metadata: must not set non-increasing ring_version: {} -> {}", _shared->get_ring_version(), tmptr->get_ring_version()));
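The internal-error check above guards a monotonicity invariant: a replacement token metadata must carry a strictly increasing ring version. A minimal standalone sketch of that invariant, with illustrative names (`token_metadata_stub`, `check_ring_version` are hypothetical, not ScyllaDB's actual API):

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for the token metadata object; only the
// version counter matters for this invariant.
struct token_metadata_stub {
    uint64_t ring_version;
};

// Reject any replacement whose ring_version does not strictly increase,
// mirroring the non-increasing-version check in the surrounding code.
void check_ring_version(const token_metadata_stub& current,
                        const token_metadata_stub& next) {
    if (next.ring_version <= current.ring_version) {
        throw std::runtime_error(
            "must not set non-increasing ring_version: " +
            std::to_string(current.ring_version) + " -> " +
            std::to_string(next.ring_version));
    }
}
```

A caller would invoke `check_ring_version(current, candidate)` before installing the new metadata; any equal or lower version is treated as a programming error.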
// In `chosen_chunks`, the sorted array of chosen chunk offsets (in the "global chunk list"),
// find the range of offsets which belongs to us.
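Because `chosen_chunks` is sorted, the subrange of offsets falling inside a shard's window can be found with two binary searches. A hedged sketch under that assumption (the name `our_chunk_range` and the `[begin, end)` window parameters are illustrative, not the actual implementation):

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Given a sorted array of chosen chunk offsets, return the half-open
// index range [first, last) of offsets that fall inside our window
// [begin, end). Two lower_bound searches suffice since the input is sorted.
std::pair<std::size_t, std::size_t>
our_chunk_range(const std::vector<uint64_t>& chosen_chunks,
                uint64_t begin, uint64_t end) {
    auto first = std::lower_bound(chosen_chunks.begin(), chosen_chunks.end(), begin);
    auto last = std::lower_bound(first, chosen_chunks.end(), end);
    return {static_cast<std::size_t>(first - chosen_chunks.begin()),
            static_cast<std::size_t>(last - chosen_chunks.begin())};
}
```

For example, with offsets `{10, 20, 30, 40}` and the window `[15, 35)`, the result is indices `[1, 3)`, covering offsets 20 and 30.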