test_refresh_deletes_uploaded_sstables should wait for sstables to get deleted

SSTable unlinking is asynchronous, so the upload dir may not be empty immediately after refresh completes. This patch adjusts test_refresh_deletes_uploaded_sstables so that it waits, with a timeout, until the upload dir becomes empty instead of assuming the API synchronizes on the sstables being gone.

Fixes SCYLLADB-1190

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>

Closes scylladb/scylladb#29215
Merge 'test: audit: clean up test helper class naming' from Dario Mirovic
2026-03-26 08:43:14 +03:00 · 2026-03-25 15:30:16 +01:00 · 2026-03-25 13:21:08 +01:00 · 2026-03-25 13:18:37 +01:00 · 2026-03-25 11:45:53 +02:00 · 2026-03-24 23:49:49 +02:00
2317 changed files with 33050 additions and 24605 deletions
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -55,22 +55,26 @@ ninja build/<mode>/test/boost/<test_name>
 ninja build/<mode>/scylla

 # Run all tests in a file
-./test.py --mode=<mode> <test_path>
+./test.py --mode=<mode> test/<suite>/<test_name>.py

 # Run a single test case from a file
-./test.py --mode=<mode> <test_path>::<test_function_name>
+./test.py --mode=<mode> test/<suite>/<test_name>.py::<test_function_name>
+
+# Run all tests in a directory
+./test.py --mode=<mode> test/<suite>/

 # Examples
-./test.py --mode=dev alternator/
-./test.py --mode=dev cluster/test_raft_voters::test_raft_limited_voters_retain_coordinator
+./test.py --mode=dev test/alternator/
+./test.py --mode=dev test/cluster/test_raft_voters.py::test_raft_limited_voters_retain_coordinator
+./test.py --mode=dev test/cqlpy/test_json.py

 # Optional flags
-./test.py --mode=dev cluster/test_raft_no_quorum -v  # Verbose output
-./test.py --mode=dev cluster/test_raft_no_quorum --repeat 5  # Repeat test 5 times
+./test.py --mode=dev test/cluster/test_raft_no_quorum.py -v  # Verbose output
+./test.py --mode=dev test/cluster/test_raft_no_quorum.py --repeat 5  # Repeat test 5 times
 ```

 **Important:**
-- Use path without `.py` extension (e.g., `cluster/test_raft_no_quorum`, not `cluster/test_raft_no_quorum.py`)
+- Use full path with `.py` extension (e.g., `test/cluster/test_raft_no_quorum.py`, not `cluster/test_raft_no_quorum`)
 - To run a single test case, append `::<test_function_name>` to the file path
 - Add `-v` for verbose output
 - Add `--repeat <num>` to repeat a test multiple times
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -1,6 +1,6 @@
 version: 2
 updates:
-- package-ecosystem: "pip"
+- package-ecosystem: "uv"
  directory: "/docs"
  schedule:
    interval: "daily"
--- a/.github/workflows/backport-pr-fixes-validation.yaml
+++ b/.github/workflows/backport-pr-fixes-validation.yaml
@@ -8,6 +8,9 @@ on:
 jobs:
  check-fixes-prefix:
    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      issues: write
    steps:
      - name: Check PR body for "Fixes" prefix patterns
        uses: actions/github-script@v7
--- a/.github/workflows/call_backport_with_jira.yaml
+++ b/.github/workflows/call_backport_with_jira.yaml
@@ -0,0 +1,53 @@
+name: Backport with Jira Integration
+
+on:
+  push:
+    branches:
+      - master
+      - next-*.*
+      - branch-*.*
+  pull_request_target:
+    types: [labeled, closed]
+    branches: 
+      - master
+      - next
+      - next-*.*
+      - branch-*.*
+
+jobs:
+  backport-on-push:
+    if: github.event_name == 'push'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'push'
+      base_branch: ${{ github.ref }}
+      commits: ${{ github.event.before }}..${{ github.sha }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-on-label:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'labeled'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      head_commit: ${{ github.event.pull_request.base.sha }}
+      label_name: ${{ github.event.label.name }}
+      pr_state: ${{ github.event.pull_request.state }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-chain:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'chain'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      pr_body: ${{ github.event.pull_request.body }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/call_jira_sync.yml
+++ b/.github/workflows/call_jira_sync.yml
@@ -1,8 +1,8 @@
-name: Sync Jira Based on PR Events
+name: Sync Jira Based on PR Events

 on:
  pull_request_target:
-    types: [opened, ready_for_review, review_requested, labeled, unlabeled, closed]
+    types: [opened, edited, ready_for_review, review_requested, labeled, unlabeled, closed]

 permissions:
  contents: read
@@ -10,32 +10,9 @@ permissions:
  issues: write

 jobs:
-  jira-sync-pr-opened:
-    if: github.event.action == 'opened'
-    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_opened.yml@main
-    secrets:
-      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
-  jira-sync-in-review:
-    if: github.event.action == 'ready_for_review' || github.event.action == 'review_requested'
-    uses: scylladb/github-automation/.github/workflows/main_jira_sync_in_review.yml@main
-    secrets:
-      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
-  jira-sync-add-label:
-    if: github.event.action == 'labeled'
-    uses: scylladb/github-automation/.github/workflows/main_jira_sync_add_label.yml@main
-    secrets:
-      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
-  jira-status-remove-label:
-    if: github.event.action == 'unlabeled'
-    uses: scylladb/github-automation/.github/workflows/main_jira_sync_remove_label.yml@main
-    secrets:
-      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
-  jira-status-pr-closed:
-    if: github.event.action == 'closed' 
-    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_closed.yml@main
+  jira-sync:
+    uses: scylladb/github-automation/.github/workflows/main_pr_events_jira_sync.yml@main
+    with:
+      caller_action: ${{ github.event.action }}
    secrets:
      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/call_jira_sync_pr_milestone.yml
+++ b/.github/workflows/call_jira_sync_pr_milestone.yml
@@ -0,0 +1,22 @@
+name: Sync Jira Based on PR Milestone Events
+
+on:
+  pull_request_target:
+    types: [milestoned, demilestoned]
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  jira-sync-milestone-set:
+    if: github.event.action == 'milestoned'
+    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_set.yml@main
+    secrets:
+      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  jira-sync-milestone-removed:
+    if: github.event.action == 'demilestoned'
+    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_milestone_removed.yml@main
+    secrets:
+      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/call_sync_milestone_to_jira.yml
+++ b/.github/workflows/call_sync_milestone_to_jira.yml
@@ -2,13 +2,13 @@ name: Call Jira release creation for new milestone

 on:
  milestone:
-    types: [created]
+    types: [created, closed]

 jobs:
  sync-milestone-to-jira:
    uses: scylladb/github-automation/.github/workflows/main_sync_milestone_to_jira_release.yml@main
    with:
      # Comma-separated list of Jira project keys
-      jira_project_keys: "SCYLLADB,CUSTOMER"
+      jira_project_keys: "SCYLLADB,CUSTOMER,SMI,RELENG,VECTOR"
    secrets:
      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/close_issue_for_scylla_associate.yml
+++ b/.github/workflows/close_issue_for_scylla_associate.yml
@@ -0,0 +1,62 @@
+name: Close issues created by Scylla associates
+
+on:
+  issues:
+    types: [opened, reopened]
+
+permissions:
+  issues: write
+
+jobs:
+  comment-and-close:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Comment and close if author email is scylladb.com
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const issue = context.payload.issue;
+            const actor = context.actor;
+
+            // Get user data (only public email is available)
+            const { data: user } = await github.rest.users.getByUsername({
+              username: actor,
+            });
+
+            const email = user.email || "";
+            console.log(`Actor: ${actor}, public email: ${email || "<none>"}`);
+
+            // Only continue if email exists and ends with @scylladb.com
+            if (!email || !email.toLowerCase().endsWith("@scylladb.com")) {
+              console.log("User is not a scylladb.com email (or email not public); skipping.");
+              return;
+            }
+
+            const owner = context.repo.owner;
+            const repo = context.repo.repo;
+            const issue_number = issue.number;
+
+            const body = "Issues in this repository are closed automatically. Scylla associates should use Jira to manage issues.\nPlease move this issue to Jira https://scylladb.atlassian.net/jira/software/c/projects/SCYLLADB/list";
+
+            // Add the comment
+            await github.rest.issues.createComment({
+              owner,
+              repo,
+              issue_number,
+              body,
+            });
+
+            console.log(`Comment added to #${issue_number}`);
+
+            // Close the issue
+            await github.rest.issues.update({
+              owner,
+              repo,
+              issue_number,
+              state: "closed",
+              state_reason: "not_planned"
+            });
+
+            console.log(`Issue #${issue_number} closed.`);
--- a/.github/workflows/docs-pages.yaml
+++ b/.github/workflows/docs-pages.yaml
@@ -19,6 +19,8 @@ on:
 jobs:
  release:
    permissions:
+      pages: write
+      id-token: write
      contents: write
    runs-on: ubuntu-latest
    steps:
@@ -31,7 +33,9 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.10"
+          python-version: "3.12"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
      - name: Set up env
        run: make -C docs FLAG="${{ env.FLAG }}" setupenv
      - name: Build docs
--- a/.github/workflows/docs-pr.yaml
+++ b/.github/workflows/docs-pr.yaml
@@ -29,7 +29,9 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.10"
+          python-version: "3.12"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
      - name: Set up env
        run: make -C docs FLAG="${{ env.FLAG }}" setupenv
      - name: Build docs
--- a/.github/workflows/iwyu.yaml
+++ b/.github/workflows/iwyu.yaml
@@ -14,7 +14,8 @@ env:
  CLEANER_DIRS: test/unit exceptions alternator api auth cdc compaction db dht gms index lang message mutation mutation_writer node_ops raft redis replica service
  SEASTAR_BAD_INCLUDE_OUTPUT_PATH: build/seastar-bad-include.log

-permissions: {}
+permissions:
+  contents: read

 # cancel the in-progress run upon a repush
 concurrency:
@@ -34,8 +35,6 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: true
-      - run: |
-          sudo dnf -y install clang-tools-extra
      - name: Generate compilation database
        run: |
          cmake                                         \
--- a/.github/workflows/trigger-scylla-ci.yaml
+++ b/.github/workflows/trigger-scylla-ci.yaml
@@ -1,4 +1,6 @@
 name: Trigger Scylla CI Route
+permissions:
+  contents: read

 on:
  issue_comment:
@@ -12,16 +14,38 @@ jobs:
    if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
    runs-on: ubuntu-latest
    steps:
+      - name: Verify Org Membership
+        id: verify_author
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
+          PR_ASSOCIATION: ${{ github.event.pull_request.author_association }}
+          COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
+          COMMENT_ASSOCIATION: ${{ github.event.comment.author_association }}
+        shell: bash
+        run: |
+          if [[ "$EVENT_NAME" == "pull_request_target" ]]; then
+            AUTHOR="$PR_AUTHOR"
+            ASSOCIATION="$PR_ASSOCIATION"
+          else
+            AUTHOR="$COMMENT_AUTHOR"
+            ASSOCIATION="$COMMENT_ASSOCIATION"
+          fi
+          if [[ "$ASSOCIATION" == "MEMBER" || "$ASSOCIATION" == "OWNER" ]]; then
+            echo "member=true" >> $GITHUB_OUTPUT
+          else
+            echo "::warning::${AUTHOR} is not a member of scylladb (association: ${ASSOCIATION}); skipping CI trigger."
+            echo "member=false" >> $GITHUB_OUTPUT
+          fi
+
      - name: Validate Comment Trigger
        if: github.event_name == 'issue_comment'
        id: verify_comment
+        env:
+          COMMENT_BODY: ${{ github.event.comment.body }}
        shell: bash
        run: |
-          BODY=$(cat << 'EOF'
-          ${{ github.event.comment.body }}
-          EOF
-          )
-          CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
+          CLEAN_BODY=$(echo "$COMMENT_BODY" | grep -v '^[[:space:]]*>')

          if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
            echo "trigger=true" >> $GITHUB_OUTPUT
@@ -30,13 +54,13 @@ jobs:
          fi

      - name: Trigger Scylla-CI-Route Jenkins Job
-        if: github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true'
+        if: steps.verify_author.outputs.member == 'true' && (github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true')
        env:
          JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
          JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
          JENKINS_URL: "https://jenkins.scylladb.com"
+          PR_NUMBER: "${{ github.event.issue.number || github.event.pull_request.number }}"
+          PR_REPO_NAME: "${{ github.event.repository.full_name }}"
        run: |
-          PR_NUMBER=${{ github.event.issue.number || github.event.pull_request.number }}
-          PR_REPO_NAME=${{ github.event.repository.full_name }}
          curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
-          --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v
+            --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail
--- a/.github/workflows/trigger_jenkins.yaml
+++ b/.github/workflows/trigger_jenkins.yaml
@@ -1,5 +1,8 @@
 name: Trigger next gating

+permissions:
+  contents: read
+
 on:
  push:
    branches:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -300,7 +300,6 @@ add_subdirectory(locator)
 add_subdirectory(message)
 add_subdirectory(mutation)
 add_subdirectory(mutation_writer)
-add_subdirectory(node_ops)
 add_subdirectory(readers)
 add_subdirectory(replica)
 add_subdirectory(raft)
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ For further information, please see:

 [developer documentation]: HACKING.md
 [build documentation]: docs/dev/building.md
-[docker image build documentation]: dist/docker/debian/README.md
+[docker image build documentation]: dist/docker/redhat/README.md

 ## Running Scylla

--- a/SCYLLA-VERSION-GEN
+++ b/SCYLLA-VERSION-GEN
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2026.1.0-rc2
+VERSION=2026.2.0-dev

 if test -f version
 then
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -13,7 +13,8 @@
 #include <string_view>
 #include "alternator/auth.hh"
 #include <fmt/format.h>
-#include "auth/password_authenticator.hh"
+#include "db/consistency_level_type.hh"
+#include "db/system_keyspace.hh"
 #include "service/storage_proxy.hh"
 #include "alternator/executor.hh"
 #include "cql3/selection/selection.hh"
@@ -25,8 +26,8 @@ namespace alternator {

 static logging::logger alogger("alternator-auth");

-future<std::string> get_key_from_roles(service::storage_proxy& proxy, auth::service& as, std::string username) {
-    schema_ptr schema = proxy.data_dictionary().find_schema(auth::get_auth_ks_name(as.query_processor()), "roles");
+future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::string username) {
+    schema_ptr schema = proxy.data_dictionary().find_schema(db::system_keyspace::NAME, "roles");
    partition_key pk = partition_key::from_single_value(*schema, utf8_type->decompose(username));
    dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*schema, pk))};
    std::vector<query::clustering_range> bounds{query::clustering_range::make_open_ended_both_sides()};
@@ -39,7 +40,7 @@ future<std::string> get_key_from_roles(service::storage_proxy& proxy, auth::serv
    auto partition_slice = query::partition_slice(std::move(bounds), {}, query::column_id_vector{salted_hash_col->id, can_login_col->id}, selection->get_query_options());
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice,
            proxy.get_max_result_size(partition_slice), query::tombstone_limit(proxy.get_tombstone_limit()));
-    auto cl = auth::password_authenticator::consistency_for_user(username);
+    auto cl = db::consistency_level::LOCAL_ONE;

    service::client_state client_state{service::client_state::internal_tag()};
    service::storage_proxy::coordinator_query_result qr = co_await proxy.query(schema, std::move(command), std::move(partition_ranges), cl,
--- a/alternator/auth.hh
+++ b/alternator/auth.hh
@@ -20,6 +20,6 @@ namespace alternator {

 using key_cache = utils::loading_cache<std::string, std::string, 1>;

-future<std::string> get_key_from_roles(service::storage_proxy& proxy, auth::service& as, std::string username);
+future<std::string> get_key_from_roles(service::storage_proxy& proxy, std::string username);

 }
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -618,7 +618,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
 // Check if the existing values of the item (previous_item) match the
 // conditions given by the Expected and ConditionalOperator parameters
 // (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
-// This function can throw an ValidationException API error if there
+// This function can throw a ValidationException API error if there
 // are errors in the format of the condition itself.
 bool verify_expected(const rjson::value& req, const rjson::value* previous_item) {
    const rjson::value* expected = rjson::find(req, "Expected");
--- a/alternator/consumed_capacity.cc
+++ b/alternator/consumed_capacity.cc
@@ -45,7 +45,7 @@ bool consumed_capacity_counter::should_add_capacity(const rjson::value& request)
 }

 void consumed_capacity_counter::add_consumed_capacity_to_response_if_needed(rjson::value& response) const noexcept {
-    if (_should_add_to_reponse) {
+    if (_should_add_to_response) {
        auto consumption = rjson::empty_object();
        rjson::add(consumption, "CapacityUnits", get_consumed_capacity_units());
        rjson::add(response, "ConsumedCapacity", std::move(consumption));
--- a/alternator/consumed_capacity.hh
+++ b/alternator/consumed_capacity.hh
@@ -28,9 +28,9 @@ namespace alternator {
 class consumed_capacity_counter {
 public:
    consumed_capacity_counter() = default;
-    consumed_capacity_counter(bool should_add_to_reponse) : _should_add_to_reponse(should_add_to_reponse){}
+    consumed_capacity_counter(bool should_add_to_response) : _should_add_to_response(should_add_to_response){}
    bool operator()() const noexcept {
-        return _should_add_to_reponse;
+        return _should_add_to_response;
    }

    consumed_capacity_counter& operator +=(uint64_t bytes);
@@ -44,7 +44,7 @@ public:
    uint64_t _total_bytes = 0;
    static bool should_add_capacity(const rjson::value& request);
 protected:
-    bool _should_add_to_reponse = false;
+    bool _should_add_to_response = false;
 };

 class rcu_consumed_capacity_counter : public consumed_capacity_counter {
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -63,6 +63,7 @@
 #include "types/types.hh"
 #include "db/system_keyspace.hh"
 #include "cql3/statements/ks_prop_defs.hh"
+#include "alternator/ttl_tag.hh"

 using namespace std::chrono_literals;

@@ -164,7 +165,7 @@ static map_type attrs_type() {

 static const column_definition& attrs_column(const schema& schema) {
    const column_definition* cdef = schema.get_column_definition(bytes(executor::ATTRS_COLUMN_NAME));
-    SCYLLA_ASSERT(cdef);
+    throwing_assert(cdef);
    return *cdef;
 }

@@ -237,7 +238,7 @@ static void validate_is_object(const rjson::value& value, const char* caller) {
 }

 // This function assumes the given value is an object and returns requested member value.
-// If it is not possible an api_error::validation is thrown.
+// If it is not possible, an api_error::validation is thrown.
 static const rjson::value& get_member(const rjson::value& obj, const char* member_name, const char* caller) {
    validate_is_object(obj, caller);
    const rjson::value* ret = rjson::find(obj, member_name);
@@ -249,7 +250,7 @@ static const rjson::value& get_member(const rjson::value& obj, const char* membe


 // This function assumes the given value is an object with a single member, and returns this member.
-// In case the requirements are not met an api_error::validation is thrown.
+// In case the requirements are not met, an api_error::validation is thrown.
 static const rjson::value::Member& get_single_member(const rjson::value& v, const char* caller) {
    if (!v.IsObject() || v.MemberCount() != 1) {
        throw api_error::validation(format("{}: expected an object with a single member.", caller));
@@ -682,7 +683,7 @@ static std::optional<int> get_int_attribute(const rjson::value& value, std::stri
 }

 // Sets a KeySchema object inside the given JSON parent describing the key
-// attributes of the the given schema as being either HASH or RANGE keys.
+// attributes of the given schema as being either HASH or RANGE keys.
 // Additionally, adds to a given map mappings between the key attribute
 // names and their type (as a DynamoDB type string).
 void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>* attribute_types, const std::map<sstring, sstring> *tags) {
@@ -916,7 +917,7 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
                sstring index_name = cf_name.substr(delim_it + 1);
                rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
                rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
-                // Add indexes's KeySchema and collect types for AttributeDefinitions:
+                // Add index's KeySchema and collect types for AttributeDefinitions:
                executor::describe_key_schema(view_entry, *vptr, key_attribute_types, db::get_tags_of_table(vptr));
                // Add projection type
                rjson::value projection = rjson::empty_object();
@@ -1649,7 +1650,7 @@ static future<> mark_view_schemas_as_built(utils::chunked_vector<mutation>& out,
 }

 future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode) {
-    SCYLLA_ASSERT(this_shard_id() == 0);
+    throwing_assert(this_shard_id() == 0);

    // We begin by parsing and validating the content of the CreateTable
    // command. We can't inspect the current database schema at this point
@@ -2435,7 +2436,7 @@ std::unordered_map<bytes, std::string> si_key_attributes(data_dictionary::table
 //   case, this function simply won't be called for this attribute.)
 //
 // This function checks if the given attribute update is an update to some
-// GSI's key, and if the value is unsuitable, a api_error::validation is
+// GSI's key, and if the value is unsuitable, an api_error::validation is
 // thrown. The checking here is similar to the checking done in
 // get_key_from_typed_value() for the base table's key columns.
 //
@@ -2837,14 +2838,12 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
        }
    } else if (_write_isolation != write_isolation::LWT_ALWAYS) {
        std::optional<mutation> m = apply(nullptr, api::new_timestamp(), cdc_opts);
-        SCYLLA_ASSERT(m); // !needs_read_before_write, so apply() did not check a condition
+        throwing_assert(m); // !needs_read_before_write, so apply() did not check a condition
        return proxy.mutate(utils::chunked_vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes, false, std::move(cdc_opts)).then([this, &wcu_total] () mutable {
            return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
        });
    }
-    if (!cas_shard) {
-        on_internal_error(elogger, "cas_shard is not set");
-    }
+    throwing_assert(cas_shard);
    // If we're still here, we need to do this write using LWT:
    global_stats.write_using_lwt++;
    per_table_stats.write_using_lwt++;
@@ -3464,7 +3463,11 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
    if (should_add_wcu) {
        rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
    }
-    _stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    auto duration = std::chrono::steady_clock::now() - start_time;
+    _stats.api_operations.batch_write_item_latency.mark(duration);
+    for (const auto& w : per_table_wcu) {
+        w.first->api_operations.batch_write_item_latency.mark(duration);
+    }
    co_return rjson::print(std::move(ret));
 }

@@ -3548,7 +3551,7 @@ static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>
    return true;
 }

-// Add a path to a attribute_path_map. Throws a validation error if the path
+// Add a path to an attribute_path_map. Throws a validation error if the path
 // "overlaps" with one already in the filter (one is a sub-path of the other)
 // or "conflicts" with it (both a member and index is requested).
 template<typename T>
@@ -4975,7 +4978,12 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
    if (!some_succeeded && eptr) {
        co_await coroutine::return_exception_ptr(std::move(eptr));
    }
-    _stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    auto duration = std::chrono::steady_clock::now() - start_time;
+    _stats.api_operations.batch_get_item_latency.mark(duration);
+    for (const table_requests& rs : requests) {
+        lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
+        per_table_stats->api_operations.batch_get_item_latency.mark(duration);
+    }
    if (is_big(response)) {
        co_return make_streamed(std::move(response));
    } else {
@@ -5413,7 +5421,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
 }

 static dht::token token_for_segment(int segment, int total_segments) {
-    SCYLLA_ASSERT(total_segments > 1 && segment >= 0 && segment < total_segments);
+    throwing_assert(total_segments > 1 && segment >= 0 && segment < total_segments);
    uint64_t delta = std::numeric_limits<uint64_t>::max() / total_segments;
    return dht::token::from_int64(std::numeric_limits<int64_t>::min() + delta * segment);
 }
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -50,7 +50,7 @@ public:
        _operators.emplace_back(i);
        check_depth_limit();
    }
-    void add_dot(std::string(name)) {
+    void add_dot(std::string name) {
        _operators.emplace_back(std::move(name));
        check_depth_limit();
    }
@@ -85,7 +85,7 @@ struct constant {
    }
 };

-// "value" is is a value used in the right hand side of an assignment
+// "value" is a value used in the right hand side of an assignment
 // expression, "SET a = ...". It can be a constant (a reference to a value
 // included in the request, e.g., ":val"), a path to an attribute from the
 // existing item (e.g., "a.b[3].c"), or a function of other such values.
@@ -205,7 +205,7 @@ public:
 // The supported primitive conditions are:
 // 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
 //    v1 and v2 are values - from the item (an attribute path), the query
-//    (a ":val" reference), or a function of the the above (only the size()
+//    (a ":val" reference), or a function of the above (only the size()
 //    function is supported).
 // 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
 // 3. N-ary operator - v1 IN ( v2, v3, ... )
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -55,7 +55,7 @@ partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
 clustering_key ck_from_json(const rjson::value& item, schema_ptr schema);
 position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema);

-// If v encodes a number (i.e., it is a {"N": [...]}, returns an object representing it.  Otherwise,
+// If v encodes a number (i.e., it is a {"N": [...]}), returns an object representing it.  Otherwise,
 // raises ValidationException with diagnostic.
 big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic);

--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -411,8 +411,8 @@ future<std::string> server::verify_signature(const request& req, const chunked_c
        }
    }

-    auto cache_getter = [&proxy = _proxy, &as = _auth_service] (std::string username) {
-        return get_key_from_roles(proxy, as, std::move(username));
+    auto cache_getter = [&proxy = _proxy] (std::string username) {
+        return get_key_from_roles(proxy, std::move(username));
    };
    return _key_cache.get_ptr(user, cache_getter).then_wrapped([this, &req, &content,
                                                    user = std::move(user),
@@ -710,7 +710,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
        ++_executor._stats.requests_blocked_memory;
    }
    auto units = co_await std::move(units_fut);
-    SCYLLA_ASSERT(req->content_stream);
+    throwing_assert(req->content_stream);
    chunked_content content = co_await read_entire_stream(*req->content_stream, request_content_length_limit);
    // If the request had no Content-Length, we reserved too many units
    // so need to return some
@@ -771,7 +771,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
    if (!username.empty()) {
        client_state.set_login(auth::authenticated_user(username));
    }
-    co_await client_state.maybe_update_per_service_level_params();
+    client_state.maybe_update_per_service_level_params();

    tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content, _max_users_query_size_in_trace_output.get());
    tracing::trace(trace_state, "{}", op);
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -14,20 +14,6 @@
 namespace alternator {

 const char* ALTERNATOR_METRICS = "alternator";
-static seastar::metrics::histogram estimated_histogram_to_metrics(const utils::estimated_histogram& histogram) {
-    seastar::metrics::histogram res;
-    res.buckets.resize(histogram.bucket_offsets.size());
-    uint64_t cumulative_count = 0;
-    res.sample_count = histogram._count;
-    res.sample_sum = histogram._sample_sum;
-    for (size_t i = 0; i < res.buckets.size(); i++) {
-        auto& v = res.buckets[i];
-        v.upper_bound = histogram.bucket_offsets[i];
-        cumulative_count += histogram.buckets[i];
-        v.count = cumulative_count;
-    }
-    return res;
-}

 static seastar::metrics::label column_family_label("cf");
 static seastar::metrics::label keyspace_label("ks");
@@ -151,21 +137,21 @@ static void register_metrics_with_optional_table(seastar::metrics::metric_groups
            seastar::metrics::make_counter("batch_item_count", seastar::metrics::description("The total number of items processed across all batches"), labels,
                    stats.api_operations.batch_get_item_batch_total)(op("BatchGetItem")).aggregate(aggregate_labels).set_skip_when_empty(),
            seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"), labels,
-                    [&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_get_item_histogram);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+                    [&stats]{ return to_metrics_histogram(stats.api_operations.batch_get_item_histogram);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
            seastar::metrics::make_histogram("batch_item_count_histogram", seastar::metrics::description("Histogram of the number of items in a batch request"), labels,
-                    [&stats]{ return estimated_histogram_to_metrics(stats.api_operations.batch_write_item_histogram);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+                    [&stats]{ return to_metrics_histogram(stats.api_operations.batch_write_item_histogram);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
-                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.get_item_op_size_kb);})(op("GetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+                    [&stats]{ return to_metrics_histogram(stats.operation_sizes.get_item_op_size_kb);})(op("GetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
-                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.put_item_op_size_kb);})(op("PutItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+                    [&stats]{ return to_metrics_histogram(stats.operation_sizes.put_item_op_size_kb);})(op("PutItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
-                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.delete_item_op_size_kb);})(op("DeleteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+                    [&stats]{ return to_metrics_histogram(stats.operation_sizes.delete_item_op_size_kb);})(op("DeleteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
-                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.update_item_op_size_kb);})(op("UpdateItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+                    [&stats]{ return to_metrics_histogram(stats.operation_sizes.update_item_op_size_kb);})(op("UpdateItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
-                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.batch_get_item_op_size_kb);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+                    [&stats]{ return to_metrics_histogram(stats.operation_sizes.batch_get_item_op_size_kb);})(op("BatchGetItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
            seastar::metrics::make_histogram("operation_size_kb", seastar::metrics::description("Histogram of item sizes involved in a request"), labels,
-                    [&stats]{ return estimated_histogram_to_metrics(stats.operation_sizes.batch_write_item_op_size_kb);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
+                    [&stats]{ return to_metrics_histogram(stats.operation_sizes.batch_write_item_op_size_kb);})(op("BatchWriteItem")).aggregate({seastar::metrics::shard_label}).set_skip_when_empty(),
    });

    seastar::metrics::label expression_label("expression");
--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -16,6 +16,8 @@
 #include "cql3/stats.hh"

 namespace alternator {
+using batch_histogram = utils::estimated_histogram_with_max<128>;
+using op_size_histogram = utils::estimated_histogram_with_max<512>;

 // Object holding per-shard statistics related to Alternator.
 // While this object is alive, these metrics are also registered to be
@@ -76,34 +78,34 @@ public:
        utils::timed_rate_moving_average_summary_and_histogram batch_get_item_latency;
        utils::timed_rate_moving_average_summary_and_histogram get_records_latency;

-        utils::estimated_histogram batch_get_item_histogram{22}; // a histogram that covers the range 1 - 100
-        utils::estimated_histogram batch_write_item_histogram{22}; // a histogram that covers the range 1 - 100
+        batch_histogram batch_get_item_histogram;
+        batch_histogram batch_write_item_histogram;
    } api_operations;
    // Operation size metrics
    struct {
        // Item size statistics collected per table and aggregated per node.
-        // Each histogram covers the range 0 - 446. Resolves #25143.
+        // Each histogram covers the range 0 - 512. Resolves #25143.
        // A size is the retrieved item's size.
-        utils::estimated_histogram get_item_op_size_kb{30};
+        op_size_histogram get_item_op_size_kb;
        // A size is the maximum of the new item's size and the old item's size.
-        utils::estimated_histogram put_item_op_size_kb{30};
+        op_size_histogram put_item_op_size_kb;
        // A size is the deleted item's size. If the deleted item's size is
        // unknown (i.e. read-before-write wasn't necessary and it wasn't
        // forced by a configuration option), it won't be recorded on the
        // histogram.
-        utils::estimated_histogram delete_item_op_size_kb{30};
+        op_size_histogram delete_item_op_size_kb;
        // A size is the maximum of existing item's size and the estimated size
        // of the update. This will be changed to the maximum of the existing item's
        // size and the new item's size in a subsequent PR.
-        utils::estimated_histogram update_item_op_size_kb{30};
+        op_size_histogram update_item_op_size_kb;

        // A size is the sum of the sizes of all items per table. This means
        // that a single BatchGetItem / BatchWriteItem updates the histogram
        // for each table that it has items in.
        // The sizes are the retrieved items' sizes grouped per table.
-        utils::estimated_histogram batch_get_item_op_size_kb{30};
+        op_size_histogram batch_get_item_op_size_kb;
        // The sizes are the the written items' sizes grouped per table.
-        utils::estimated_histogram batch_write_item_op_size_kb{30};
+        op_size_histogram batch_write_item_op_size_kb;
    } operation_sizes;
    // Count of authentication and authorization failures, counted if either
    // alternator_enforce_authorization or alternator_warn_authorization are
@@ -140,7 +142,7 @@ public:
    cql3::cql_stats cql_stats;

    // Enumeration of expression types only for stats
-    // if needed it can be extended e.g. per operation 
+    // if needed it can be extended e.g. per operation
    enum expression_types {
        UPDATE_EXPRESSION,
        CONDITION_EXPRESSION,
@@ -164,7 +166,8 @@ struct table_stats {
 void register_metrics(seastar::metrics::metric_groups& metrics, const stats& stats);
 
+// Converts a byte count to kilobytes, rounding up: any partial kilobyte counts as a whole one.
 inline uint64_t bytes_to_kb_ceil(uint64_t bytes) {
     return (bytes + 1023) / 1024;
 }
 
 }
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -33,6 +33,8 @@
 #include "data_dictionary/data_dictionary.hh"
 #include "utils/rjson.hh"

+static logging::logger elogger("alternator-streams");
+
 /**
 * Base template type to implement  rapidjson::internal::TypeHelper<...>:s
 * for types that are ostreamable/string constructible/castable.
@@ -428,6 +430,25 @@ using namespace std::chrono_literals;
 // Dynamo docs says no data shall live longer than 24h.
 static constexpr auto dynamodb_streams_max_window = 24h;

+// Returns the shard of the previous CDC generation that is the parent of `child`.
+// prev_streams must be sorted by token. When no previous stream has a token at or
+// after the child's token, the search wraps around to the first stream (vnodes).
+const cdc::stream_id& find_parent_shard_in_previous_generation(db_clock::time_point prev_timestamp, const utils::chunked_vector<cdc::stream_id> &prev_streams, const cdc::stream_id &child) {
+    if (prev_streams.empty()) {
+        // Nothing to pick a parent from - raise an internal error so it gets noticed and fixed.
+        on_internal_error(elogger, fmt::format("streams are empty for cdc generation at {} ({})", prev_timestamp, prev_timestamp.time_since_epoch().count()));
+    }
+    const auto token_precedes = [](const cdc::stream_id& candidate, const dht::token& wanted) {
+        return candidate.token() < wanted;
+    };
+    const auto parent = std::lower_bound(prev_streams.begin(), prev_streams.end(), child.token(), token_precedes);
+    if (parent != prev_streams.end()) {
+        return *parent;
+    }
+    // wrap around case - the first stream is the parent
+    return *prev_streams.begin();
+}
+
 future<executor::request_return_type> executor::describe_stream(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.describe_stream++;

@@ -491,7 +512,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

    if (!opts.enabled()) {
        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        co_return rjson::print(std::move(ret));
    }

    // TODO: label
@@ -502,123 +523,113 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
    // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
    auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);

-    return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {
+    std::map<db_clock::time_point, cdc::streams_version> topologies = co_await _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners });
+    auto e = topologies.end();
+    auto prev = e;
+    auto shards = rjson::empty_array();

-        auto e = topologies.end();
-        auto prev = e;
-        auto shards = rjson::empty_array();
+    std::optional<shard_id> last;

-        std::optional<shard_id> last;
+    auto i = topologies.begin();
+    // if we're a paged query, skip to the generation where we left off.
+    if (shard_start) {
+        i = topologies.find(shard_start->time);
+    }

-        auto i = topologies.begin();
-        // if we're a paged query, skip to the generation where we left of.
-        if (shard_start) {
-            i = topologies.find(shard_start->time);
-        }
+    // for parent-child stuff we need id:s to be sorted by token
+    // (see explanation above) since we want to find closest
+    // token boundary when determining parent.
+    // #7346 - we processed and searched children/parents in
+    // stored order, which is not necessarily token order,
+    // so the finding of "closest" token boundary (using upper bound)
+    // could give somewhat weird results.
+    static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
+        return id1.token() < id2.token();
+    };

-        // for parent-child stuff we need id:s to be sorted by token
-        // (see explanation above) since we want to find closest
-        // token boundary when determining parent.
-        // #7346 - we processed and searched children/parents in
-        // stored order, which is not necessarily token order,
-        // so the finding of "closest" token boundary (using upper bound)
-        // could give somewhat weird results.
-        static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
-            return id1.token() < id2.token();
-        };
+    // #7409 - shards must be returned in lexicographical order,
+    // normal bytes compare is string_traits<int8_t>::compare.
+    // thus bytes 0x8000 is less than 0x0000. By doing unsigned
+    // compare instead we inadvertently will sort in string lexical.
+    static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
+        return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
+    };
+
+    // need a prev even if we are skipping stuff
+    if (i != topologies.begin()) {
+        prev = std::prev(i);
+    }
+
+    for (; limit > 0 && i != e; prev = i, ++i) {
+        auto& [ts, sv] = *i;
+
+        last = std::nullopt;
+
+        auto lo = sv.streams.begin();
+        auto end = sv.streams.end();

        // #7409 - shards must be returned in lexicographical order,
-        // normal bytes compare is string_traits<int8_t>::compare.
-        // thus bytes 0x8000 is less than 0x0000. By doing unsigned
-        // compare instead we inadvertently will sort in string lexical.
-        static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
-            return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
-        };
+        std::sort(lo, end, id_cmp);

-        // need a prev even if we are skipping stuff
-        if (i != topologies.begin()) {
-            prev = std::prev(i);
+        if (shard_start) {
+            // find next shard position
+            lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
+            shard_start = std::nullopt;
        }

-        for (; limit > 0 && i != e; prev = i, ++i) {
-            auto& [ts, sv] = *i;
+        if (lo != end && prev != e) {
+            // We want older stuff sorted in token order so we can find matching
+            // token range when determining parent shard.
+            std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
+        }
+
+        auto expired = [&]() -> std::optional<db_clock::time_point> {
+            auto j = std::next(i);
+            if (j == e) {
+                return std::nullopt;
+            }
+            // add this so we sort of match potential 
+            // sequence numbers in get_records result.
+            return j->first + confidence_interval(db);
+        }();
+
+        while (lo != end) {
+            auto& id = *lo++;
+
+            auto shard = rjson::empty_object();
+
+            if (prev != e) {
+                auto &pid = find_parent_shard_in_previous_generation(prev->first, prev->second.streams, id);
+                rjson::add(shard, "ParentShardId", shard_id(prev->first, pid));
+            }
+
+            last.emplace(ts, id);
+            rjson::add(shard, "ShardId", *last);
+            auto range = rjson::empty_object();
+            rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
+            if (expired) {
+                rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
+            }
+
+            rjson::add(shard, "SequenceNumberRange", std::move(range));
+            rjson::push_back(shards, std::move(shard));
+            
+            if (--limit == 0) {
+                break;
+            }

            last = std::nullopt;
-
-            auto lo = sv.streams.begin();
-            auto end = sv.streams.end();
-
-            // #7409 - shards must be returned in lexicographical order,
-            std::sort(lo, end, id_cmp);
-
-            if (shard_start) {
-                // find next shard position
-                lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
-                shard_start = std::nullopt;
-            }
-
-            if (lo != end && prev != e) {
-                // We want older stuff sorted in token order so we can find matching
-                // token range when determining parent shard.
-                std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
-            }
-
-            auto expired = [&]() -> std::optional<db_clock::time_point> {
-                auto j = std::next(i);
-                if (j == e) {
-                    return std::nullopt;
-                }
-                // add this so we sort of match potential 
-                // sequence numbers in get_records result.
-                return j->first + confidence_interval(db);
-            }();
-
-            while (lo != end) {
-                auto& id = *lo++;
-
-                auto shard = rjson::empty_object();
-
-                if (prev != e) {
-                    auto& pids = prev->second.streams;
-                    auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
-                        return t < id.token();
-                    });
-                    if (pid != pids.begin()) {
-                        pid = std::prev(pid);
-                    }
-                    if (pid != pids.end()) {
-                        rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
-                    }
-                }
-
-                last.emplace(ts, id);
-                rjson::add(shard, "ShardId", *last);
-                auto range = rjson::empty_object();
-                rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
-                if (expired) {
-                    rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
-                }
-
-                rjson::add(shard, "SequenceNumberRange", std::move(range));
-                rjson::push_back(shards, std::move(shard));
-                
-                if (--limit == 0) {
-                    break;
-                }
-
-                last = std::nullopt;
-            }
        }
+    }

-        if (last) {
-            rjson::add(stream_desc, "LastEvaluatedShardId", *last);
-        }
+    if (last) {
+        rjson::add(stream_desc, "LastEvaluatedShardId", *last);
+    }

-        rjson::add(stream_desc, "Shards", std::move(shards));
-        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-            
-        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
-    });
+    rjson::add(stream_desc, "Shards", std::move(shards));
+    rjson::add(ret, "StreamDescription", std::move(stream_desc));
+        
+    co_return rjson::print(std::move(ret));
 }

 enum class shard_iterator_type {
@@ -898,172 +909,169 @@ future<executor::request_return_type> executor::get_records(client_state& client
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
            query::tombstone_limit(_proxy.get_tombstone_limit()), query::row_limit(limit * mul));

-    co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
-            [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {       
-        cql3::selection::result_set_builder builder(*selection, gc_clock::now());
-        query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
+    service::storage_proxy::coordinator_query_result qr = co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state));
+    cql3::selection::result_set_builder builder(*selection, gc_clock::now());
+    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));

-        auto result_set = builder.build();
-        auto records = rjson::empty_array();
+    auto result_set = builder.build();
+    auto records = rjson::empty_array();

-        auto& metadata = result_set->get_metadata();
+    auto& metadata = result_set->get_metadata();

-        auto op_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == op_column_name;
-            })
-        );
-        auto ts_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == timestamp_column_name;
-            })
-        );
-        auto eor_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == eor_column_name;
-            })
-        );
+    auto op_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == op_column_name;
+        })
+    );
+    auto ts_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == timestamp_column_name;
+        })
+    );
+    auto eor_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == eor_column_name;
+        })
+    );

-        std::optional<utils::UUID> timestamp;
-        auto dynamodb = rjson::empty_object();
-        auto record = rjson::empty_object();
-        const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();
+    std::optional<utils::UUID> timestamp;
+    auto dynamodb = rjson::empty_object();
+    auto record = rjson::empty_object();
+    const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();

-        using op_utype = std::underlying_type_t<cdc::operation>;
+    using op_utype = std::underlying_type_t<cdc::operation>;

-        auto maybe_add_record = [&] {
-            if (!dynamodb.ObjectEmpty()) {
-                rjson::add(record, "dynamodb", std::move(dynamodb));
-                dynamodb = rjson::empty_object();
-            }
-            if (!record.ObjectEmpty()) {
-                rjson::add(record, "awsRegion", rjson::from_string(dc_name));
-                rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
-                rjson::add(record, "eventSource", "scylladb:alternator");
-                rjson::add(record, "eventVersion", "1.1");
-                rjson::push_back(records, std::move(record));
-                record = rjson::empty_object();
-                --limit;
-            }
-        };
+    auto maybe_add_record = [&] {
+        if (!dynamodb.ObjectEmpty()) {
+            rjson::add(record, "dynamodb", std::move(dynamodb));
+            dynamodb = rjson::empty_object();
+        }
+        if (!record.ObjectEmpty()) {
+            rjson::add(record, "awsRegion", rjson::from_string(dc_name));
+            rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
+            rjson::add(record, "eventSource", "scylladb:alternator");
+            rjson::add(record, "eventVersion", "1.1");
+            rjson::push_back(records, std::move(record));
+            record = rjson::empty_object();
+            --limit;
+        }
+    };

-        for (auto& row : result_set->rows()) {
-            auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
-            auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
-            auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;
+    for (auto& row : result_set->rows()) {
+        auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
+        auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
+        auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;

-            if (!dynamodb.HasMember("Keys")) {
-                auto keys = rjson::empty_object();
-                describe_single_item(*selection, row, key_names, keys);
-                rjson::add(dynamodb, "Keys", std::move(keys));
-                rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
-                rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
-                rjson::add(dynamodb, "StreamViewType", type);
-                // TODO: SizeBytes
-            }
-
-            /**
-             * We merge rows with same timestamp into a single event.
-             * This is pretty much needed, because a CDC row typically
-             * encodes ~half the info of an alternator write. 
-             * 
-             * A big, big downside to how alternator records are written
-             * (i.e. CQL), is that the distinction between INSERT and UPDATE
-             * is somewhat lost/unmappable to actual eventName. 
-             * A write (currently) always looks like an insert+modify
-             * regardless whether we wrote existing record or not. 
-             * 
-             * Maybe RMW ops could be done slightly differently so 
-             * we can distinguish them here...
-             * 
-             * For now, all writes will become MODIFY.
-             * 
-             * Note: we do not check the current pre/post
-             * flags on CDC log, instead we use data to 
-             * drive what is returned. This is (afaict)
-             * consistent with dynamo streams
-             */
-            switch (op) {
-            case cdc::operation::pre_image:
-            case cdc::operation::post_image:
-            {
-                auto item = rjson::empty_object();
-                describe_single_item(*selection, row, attr_names, item, nullptr, true);
-                describe_single_item(*selection, row, key_names, item);
-                rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
-                break;
-            }
-            case cdc::operation::update:
-                rjson::add(record, "eventName", "MODIFY");
-                break;
-            case cdc::operation::insert:
-                rjson::add(record, "eventName", "INSERT");
-                break;
-            case cdc::operation::service_row_delete:
-            case cdc::operation::service_partition_delete:
-            {
-                auto user_identity = rjson::empty_object();
-                rjson::add(user_identity, "Type", "Service");
-                rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
-                rjson::add(record, "userIdentity", std::move(user_identity));
-                rjson::add(record, "eventName", "REMOVE");
-                break;
-            }
-            default:
-                rjson::add(record, "eventName", "REMOVE");
-                break;
-            }
-            if (eor) {
-                maybe_add_record();
-                timestamp = ts;
-                if (limit == 0) {
-                    break;
-                }
-            }
+        if (!dynamodb.HasMember("Keys")) {
+            auto keys = rjson::empty_object();
+            describe_single_item(*selection, row, key_names, keys);
+            rjson::add(dynamodb, "Keys", std::move(keys));
+            rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
+            rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
+            rjson::add(dynamodb, "StreamViewType", type);
+            // TODO: SizeBytes
        }

-        auto ret = rjson::empty_object();
-        auto nrecords = records.Size();
-        rjson::add(ret, "Records", std::move(records));
-
-        if (nrecords != 0) {
-            // #9642. Set next iterators threshold to > last
-            shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
-            // Note that here we unconditionally return NextShardIterator,
-            // without checking if maybe we reached the end-of-shard. If the
-            // shard did end, then the next read will have nrecords == 0 and
-            // will notice end end of shard and not return NextShardIterator.
-            rjson::add(ret, "NextShardIterator", next_iter);
-            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        /**
+         * We merge rows with same timestamp into a single event.
+         * This is pretty much needed, because a CDC row typically
+         * encodes ~half the info of an alternator write. 
+         * 
+         * A big, big downside to how alternator records are written
+         * (i.e. CQL), is that the distinction between INSERT and UPDATE
+         * is somewhat lost/unmappable to actual eventName. 
+         * A write (currently) always looks like an insert+modify
+         * regardless whether we wrote existing record or not. 
+         * 
+         * Maybe RMW ops could be done slightly differently so 
+         * we can distinguish them here...
+         * 
+         * For now, all writes will become MODIFY.
+         * 
+         * Note: we do not check the current pre/post
+         * flags on CDC log, instead we use data to 
+         * drive what is returned. This is (afaict)
+         * consistent with dynamo streams
+         */
+        switch (op) {
+        case cdc::operation::pre_image:
+        case cdc::operation::post_image:
+        {
+            auto item = rjson::empty_object();
+            describe_single_item(*selection, row, attr_names, item, nullptr, true);
+            describe_single_item(*selection, row, key_names, item);
+            rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
+            break;
        }
-
-        // ugh. figure out if we are and end-of-shard
-        auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
-
-        return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
-            auto& shard = iter.shard;            
-
-            if (shard.time < ts && ts < high_ts) {
-                // The DynamoDB documentation states that when a shard is
-                // closed, reading it until the end has NextShardIterator
-                // "set to null". Our test test_streams_closed_read
-                // confirms that by "null" they meant not set at all.
-            } else {
-                // We could have return the same iterator again, but we did
-                // a search from it until high_ts and found nothing, so we
-                // can also start the next search from high_ts.
-                // TODO: but why? It's simpler just to leave the iterator be.
-                shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
-                rjson::add(ret, "NextShardIterator", iter);
+        case cdc::operation::update:
+            rjson::add(record, "eventName", "MODIFY");
+            break;
+        case cdc::operation::insert:
+            rjson::add(record, "eventName", "INSERT");
+            break;
+        case cdc::operation::service_row_delete:
+        case cdc::operation::service_partition_delete:
+        {
+            auto user_identity = rjson::empty_object();
+            rjson::add(user_identity, "Type", "Service");
+            rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
+            rjson::add(record, "userIdentity", std::move(user_identity));
+            rjson::add(record, "eventName", "REMOVE");
+            break;
+        }
+        default:
+            rjson::add(record, "eventName", "REMOVE");
+            break;
+        }
+        if (eor) {
+            maybe_add_record();
+            timestamp = ts;
+            if (limit == 0) {
+                break;
            }
-            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-            if (is_big(ret)) {
-                return make_ready_future<executor::request_return_type>(make_streamed(std::move(ret)));
-            }
-            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
-        });
-    });
+        }
+    }
+
+    auto ret = rjson::empty_object();
+    auto nrecords = records.Size();
+    rjson::add(ret, "Records", std::move(records));
+
+    if (nrecords != 0) {
+        // #9642. Set next iterators threshold to > last
+        shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
+        // Note that here we unconditionally return NextShardIterator,
+        // without checking if maybe we reached the end-of-shard. If the
+        // shard did end, then the next read will have nrecords == 0 and
+        // will notice the end of shard and not return NextShardIterator.
+        rjson::add(ret, "NextShardIterator", next_iter);
+        _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
+        co_return rjson::print(std::move(ret));
+    }
+
+    // ugh. figure out if we are at end-of-shard
+    auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
+
+    db_clock::time_point ts = co_await _sdks.cdc_current_generation_timestamp({ normal_token_owners });
+    auto& shard = iter.shard;
+
+    if (shard.time < ts && ts < high_ts) {
+        // The DynamoDB documentation states that when a shard is
+        // closed, reading it until the end has NextShardIterator
+        // "set to null". Our test test_streams_closed_read
+        // confirms that by "null" they meant not set at all.
+    } else {
+        // We could have returned the same iterator again, but we did
+        // a search from it until high_ts and found nothing, so we
+        // can also start the next search from high_ts.
+        // TODO: but why? It's simpler just to leave the iterator be.
+        shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
+        rjson::add(ret, "NextShardIterator", iter);
+    }
+    _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
+    if (is_big(ret)) {
+        co_return make_streamed(std::move(ret));
+    }
+    co_return rjson::print(std::move(ret));
 }

 bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -46,6 +46,7 @@
 #include "alternator/executor.hh"
 #include "alternator/controller.hh"
 #include "alternator/serialization.hh"
+#include "alternator/ttl_tag.hh"
 #include "dht/sharder.hh"
 #include "db/config.hh"
 #include "db/tags/utils.hh"
@@ -57,19 +58,10 @@ static logging::logger tlogger("alternator_ttl");

 namespace alternator {

-// We write the expiration-time attribute enabled on a table in a
-// tag TTL_TAG_KEY.
-// Currently, the *value* of this tag is simply the name of the attribute,
-// and the expiration scanner interprets it as an Alternator attribute name -
-// It can refer to a real column or if that doesn't exist, to a member of
-// the ":attrs" map column. Although this is designed for Alternator, it may
-// be good enough for CQL as well (there, the ":attrs" column won't exist).
-extern const sstring TTL_TAG_KEY;
-
 future<executor::request_return_type> executor::update_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.update_time_to_live++;
    if (!_proxy.features().alternator_ttl) {
-        co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Experimental support is available if the 'alternator-ttl' experimental feature is enabled on all nodes.");
+        co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Upgrade all nodes to a version that supports it.");
    }

    schema_ptr schema = get_table(_proxy, request);
@@ -141,7 +133,7 @@ future<executor::request_return_type> executor::describe_time_to_live(client_sta

 // expiration_service is a sharded service responsible for cleaning up expired
 // items in all tables with per-item expiration enabled. Currently, this means
-// Alternator tables with TTL configured via a UpdateTimeToLive request.
+// Alternator tables with TTL configured via an UpdateTimeToLive request.
 //
 // Here is a brief overview of how the expiration service works:
 //
@@ -324,9 +316,7 @@ static future<std::vector<std::pair<dht::token_range, locator::host_id>>> get_se
    const auto& tm = *erm->get_token_metadata_ptr();
    const auto& sorted_tokens = tm.sorted_tokens();
    std::vector<std::pair<dht::token_range, locator::host_id>> ret;
-    if (sorted_tokens.empty()) {
-        on_internal_error(tlogger, "Token metadata is empty");
-    }
+    throwing_assert(!sorted_tokens.empty());
    auto prev_tok = sorted_tokens.back();
    for (const auto& tok : sorted_tokens) {
        co_await coroutine::maybe_yield();
@@ -563,7 +553,7 @@ static future<> scan_table_ranges(
        expiration_service::stats& expiration_stats)
 {
    const schema_ptr& s = scan_ctx.s;
-    SCYLLA_ASSERT (partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
+    throwing_assert(partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
    auto p = service::pager::query_pagers::pager(proxy, s, scan_ctx.selection, *scan_ctx.query_state_ptr,
            *scan_ctx.query_options, scan_ctx.command, std::move(partition_ranges), nullptr);
    while (!p->is_exhausted()) {
@@ -593,7 +583,7 @@ static future<> scan_table_ranges(
            if (retries >= 10) {
                // Don't get stuck forever asking the same page, maybe there's
                // a bug or a real problem in several replicas. Give up on
-                // this scan an retry the scan from a random position later,
+                // this scan and retry the scan from a random position later,
                // in the next scan period.
                throw runtime_exception("scanner thread failed after too many timeouts for the same page");
            }
@@ -640,13 +630,38 @@ static future<> scan_table_ranges(
                }
            } else {
                // For a real column to contain an expiration time, it
-                // must be a numeric type.
-                // FIXME: Currently we only support decimal_type (which is
-                // what Alternator uses), but other numeric types can be
-                // supported as well to make this feature more useful in CQL.
-                // Note that kind::decimal is also checked above.
-                big_decimal n = value_cast<big_decimal>(v);
-                expired = is_expired(n, now);
+                // must be a numeric type. We currently support decimal
+                // (used by Alternator TTL) as well as bigint, int and
+                // timestamp (used by CQL per-row TTL).
+                switch (meta[*expiration_column]->type->get_kind()) {
+                    case abstract_type::kind::decimal:
+                        // Used by Alternator TTL for key columns not stored
+                        // in the map. The value is in seconds, fractional
+                        // part is ignored.
+                        expired = is_expired(value_cast<big_decimal>(v), now);
+                        break;
+                    case abstract_type::kind::long_kind:
+                        // Used by CQL per-row TTL. The value is in seconds.
+                        expired = is_expired(gc_clock::time_point(std::chrono::seconds(value_cast<int64_t>(v))), now);
+                        break;
+                    case abstract_type::kind::int32:
+                        // Used by CQL per-row TTL. The value is in seconds.
+                        // Using int type is not recommended because it will
+                        // overflow in 2038, but we support it to allow users
+                        // to use existing int columns for expiration.
+                        expired = is_expired(gc_clock::time_point(std::chrono::seconds(value_cast<int32_t>(v))), now);
+                        break;
+                    case abstract_type::kind::timestamp:
+                        // Used by CQL per-row TTL. The value is in milliseconds
+                        // but we truncate it to gc_clock's precision (whole seconds).
+                        expired = is_expired(gc_clock::time_point(std::chrono::duration_cast<gc_clock::duration>(value_cast<db_clock::time_point>(v).time_since_epoch())), now);
+                        break;
+                    default:
+                        // Should never happen - we verified the column's type
+                        // before starting the scan.
+                        [[unlikely]]
+                        on_internal_error(tlogger, format("expiration scanner value of unsupported type {} in column {}", meta[*expiration_column]->type->cql3_type_name(), scan_ctx.column_name) );
+                }
            }
            if (expired) {
                expiration_stats.items_deleted++;
@@ -708,16 +723,12 @@ static future<bool> scan_table(
        co_return false;
    }
    // attribute_name may be one of the schema's columns (in Alternator, this
-    // means it's a key column), or an element in Alternator's attrs map
-    // encoded in Alternator's JSON encoding.
-    // FIXME: To make this less Alternators-specific, we should encode in the
-    // single key's value three things:
-    // 1. The name of a column
-    // 2. Optionally if column is a map, a member in the map
-    // 3. The deserializer for the value: CQL or Alternator (JSON).
-    // The deserializer can be guessed: If the given column or map item is
-    // numeric, it can be used directly. If it is a "bytes" type, it needs to
-    // be deserialized using Alternator's deserializer.
+    // means a key column, in CQL it's a regular column), or an element in
+    // Alternator's attrs map encoded in Alternator's JSON encoding (which we
+    // decode). If attribute_name is a real column, in Alternator it will have
+    // the type decimal, counting seconds since the UNIX epoch, while in CQL
+    // it will be one of the types bigint or int (counting seconds) or timestamp
+    // (counting milliseconds).
    bytes column_name = to_bytes(*attribute_name);
    const column_definition *cd = s->get_column_definition(column_name);
    std::optional<std::string> member;
@@ -736,11 +747,14 @@ static future<bool> scan_table(
    data_type column_type = cd->type;
    // Verify that the column has the right type: If "member" exists
    // the column must be a map, and if it doesn't, the column must
-    // (currently) be a decimal_type. If the column has the wrong type
-    // nothing can get expired in this table, and it's pointless to
-    // scan it.
+    // be decimal_type (Alternator), bigint, int or timestamp (CQL).
+    // If the column has the wrong type nothing can get expired in
+    // this table, and it's pointless to scan it.
    if ((member && column_type->get_kind() != abstract_type::kind::map) ||
-        (!member && column_type->get_kind() != abstract_type::kind::decimal)) {
+        (!member && column_type->get_kind() != abstract_type::kind::decimal &&
+         column_type->get_kind() != abstract_type::kind::long_kind &&
+         column_type->get_kind() != abstract_type::kind::int32 &&
+         column_type->get_kind() != abstract_type::kind::timestamp)) {
        tlogger.info("table {} TTL column has unsupported type, not scanning", s->cf_name());
        co_return false;
    }
@@ -767,7 +781,7 @@ static future<bool> scan_table(
                // by tasking another node to take over scanning of the dead node's primary
                // ranges. What we do here is that this node will also check expiration
                // on its *secondary* ranges - but only those whose primary owner is down.
-                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
+                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
                if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
                    if (!gossiper.is_alive(tablet_primary_replica.host)) {
                        co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
@@ -878,12 +892,10 @@ future<> expiration_service::run() {
 future<> expiration_service::start() {
    // Called by main() on each shard to start the expiration-service
    // thread. Just runs run() in the background and allows stop().
-    if (_db.features().alternator_ttl) {
-        if (!shutting_down()) {
-            _end = run().handle_exception([] (std::exception_ptr ep) {
-                tlogger.error("expiration_service failed: {}", ep);
-            });
-        }
+    if (!shutting_down()) {
+        _end = run().handle_exception([] (std::exception_ptr ep) {
+            tlogger.error("expiration_service failed: {}", ep);
+        });
    }
    return make_ready_future<>();
 }
--- a/alternator/ttl.hh
+++ b/alternator/ttl.hh
@@ -30,7 +30,7 @@ namespace alternator {

 // expiration_service is a sharded service responsible for cleaning up expired
 // items in all tables with per-item expiration enabled. Currently, this means
-// Alternator tables with TTL configured via a UpdateTimeToLeave request.
+// Alternator tables with TTL configured via an UpdateTimeToLive request.
 class expiration_service final : public seastar::peering_sharded_service<expiration_service> {
 public:
    // Object holding per-shard statistics related to the expiration service.
@@ -52,7 +52,7 @@ private:
    data_dictionary::database _db;
    service::storage_proxy& _proxy;
    gms::gossiper& _gossiper;
-    // _end is set by start(), and resolves when the the background service
+    // _end is set by start(), and resolves when the background service
    // started by it ends. To ask the background service to end, _abort_source
    // should be triggered. stop() below uses both _abort_source and _end.
    std::optional<future<>> _end;
--- a/alternator/ttl_tag.hh
+++ b/alternator/ttl_tag.hh
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2026-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include "seastarx.hh"
+#include <seastar/core/sstring.hh>
+
+namespace alternator {
+// We use the table tag TTL_TAG_KEY ("system:ttl_attribute") to remember
+// which attribute was chosen as the expiration-time attribute for
+// Alternator's TTL and CQL's per-row TTL features.
+// Currently, the *value* of this tag is simply the name of the attribute:
+// It can refer to a real column or if that doesn't exist, to a member of
+// the ":attrs" map column (which Alternator uses).
+extern const sstring TTL_TAG_KEY;
+} // namespace alternator
+
+// let users use TTL_TAG_KEY without the "alternator::" prefix,
+// to make it easier to move it to a different namespace later.
+using alternator::TTL_TAG_KEY;
--- a/api/api-doc/authorization_cache.json
+++ b/api/api-doc/authorization_cache.json
@@ -12,7 +12,7 @@
      "operations":[
        {
          "method":"POST",
-          "summary":"Reset cache",
+          "summary":"Resets authorized prepared statements cache",
          "type":"void",
          "nickname":"authorization_cache_reset",
          "produces":[
--- a/api/api-doc/messaging_service.json
+++ b/api/api-doc/messaging_service.json
@@ -243,7 +243,7 @@
                 "GOSSIP_DIGEST_SYN",
                 "GOSSIP_DIGEST_ACK2",
                 "GOSSIP_SHUTDOWN",
-                 "DEFINITIONS_UPDATE",
+                 "UNUSED__DEFINITIONS_UPDATE",
                 "TRUNCATE",
                 "UNUSED__REPLICATION_FINISHED",
                 "MIGRATION_REQUEST",
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -1295,6 +1295,45 @@
            }
         ]
      },
+      {
+         "path":"/storage_service/logstor_compaction",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Trigger compaction of the key-value storage",
+               "type":"void",
+               "nickname":"logstor_compaction",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"major",
+                     "description":"When true, perform a major compaction",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/storage_service/logstor_flush",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Trigger flush of logstor storage",
+               "type":"void",
+               "nickname":"logstor_flush",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
+      },
      {
         "path":"/storage_service/active_repair/",
         "operations":[
@@ -3085,6 +3124,48 @@
            }
         ]
      },
+
+      {
+         "path":"/storage_service/tablets/snapshots",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Takes the snapshot for the given keyspaces/tables. A snapshot name must be specified.",
+               "type":"void",
+               "nickname":"take_cluster_snapshot",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"tag",
+                     "description":"the tag given to the snapshot",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"keyspace",
+                     "description":"Keyspace(s) to snapshot. Multiple keyspaces can be provided using a comma-separated list. If omitted, snapshot all keyspaces.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"table",
+                     "description":"Table(s) to snapshot. Multiple tables (in a single keyspace) can be provided using a comma-separated list. If omitted, snapshot all tables in the given keyspace(s).",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
+
      {
         "path":"/storage_service/quiesce_topology",
         "operations":[
@@ -3187,6 +3268,38 @@
            }
         ]
      },
+      {
+         "path":"/storage_service/logstor_info",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Logstor segment information for one table",
+               "type":"table_logstor_info",
+               "nickname":"logstor_info",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"keyspace",
+                     "description":"The keyspace",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"table",
+                     "description":"table name",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
      {
         "path":"/storage_service/retrain_dict",
         "operations":[
@@ -3595,6 +3708,47 @@
            }
        }
      },
+        "logstor_hist_bucket":{
+         "id":"logstor_hist_bucket",
+         "properties":{
+            "bucket":{
+               "type":"long"
+            },
+            "count":{
+               "type":"long"
+            },
+            "min_data_size":{
+               "type":"long"
+            },
+            "max_data_size":{
+               "type":"long"
+            }
+         }
+        },
+        "table_logstor_info":{
+         "id":"table_logstor_info",
+         "description":"Per-table logstor segment distribution",
+         "properties":{
+            "keyspace":{
+               "type":"string"
+            },
+            "table":{
+               "type":"string"
+            },
+            "compaction_groups":{
+               "type":"long"
+            },
+            "segments":{
+               "type":"long"
+            },
+            "data_size_histogram":{
+               "type":"array",
+               "items":{
+                  "$ref":"logstor_hist_bucket"
+               }
+            }
+         }
+        },
      "tablet_repair_result":{
        "id":"tablet_repair_result",
        "description":"Tablet repair result",
--- a/api/api-doc/system.json
+++ b/api/api-doc/system.json
@@ -209,6 +209,21 @@
               "parameters":[]
            }
         ]
+      },
+      {
+         "path":"/system/chosen_sstable_version",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get sstable version currently chosen for use in new sstables",
+               "type":"string",
+               "nickname":"get_chosen_sstable_version",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
      }
   ]
 }
--- a/api/api.cc
+++ b/api/api.cc
@@ -122,9 +122,9 @@ future<> unset_thrift_controller(http_context& ctx) {
    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_thrift_controller(ctx, r); });
 }

-future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
-    return ctx.http_server.set_routes([&ctx, &ss, &group0_client] (routes& r) {
-            set_storage_service(ctx, r, ss, group0_client);
+future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
+    return ctx.http_server.set_routes([&ctx, &ss, &ssc, &group0_client] (routes& r) {
+            set_storage_service(ctx, r, ss, ssc, group0_client);
        });
 }

--- a/api/api.hh
+++ b/api/api.hh
@@ -23,31 +23,6 @@

 namespace api {

-template<class T>
-std::vector<T> map_to_key_value(const std::map<sstring, sstring>& map) {
-    std::vector<T> res;
-    res.reserve(map.size());
-
-    for (const auto& [key, value] : map) {
-        res.push_back(T());
-        res.back().key = key;
-        res.back().value = value;
-    }
-    return res;
-}
-
-template<class T, class MAP>
-std::vector<T>& map_to_key_value(const MAP& map, std::vector<T>& res) {
-    res.reserve(res.size() + std::size(map));
-
-    for (const auto& [key, value] : map) {
-        T val;
-        val.key = fmt::to_string(key);
-        val.value = fmt::to_string(value);
-        res.push_back(val);
-    }
-    return res;
-}
 template <typename T, typename S = T>
 T map_sum(T&& dest, const S& src) {
    for (const auto& i : src) {
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -98,7 +98,7 @@ future<> set_server_config(http_context& ctx, db::config& cfg);
 future<> unset_server_config(http_context& ctx);
 future<> set_server_snitch(http_context& ctx, sharded<locator::snitch_ptr>& snitch);
 future<> unset_server_snitch(http_context& ctx);
-future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client&);
+future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>&, service::raft_group0_client&);
 future<> unset_server_storage_service(http_context& ctx);
 future<> set_server_client_routes(http_context& ctx, sharded<service::client_routes_service>& cr);
 future<> unset_server_client_routes(http_context& ctx);
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -18,7 +18,9 @@
 #include "utils/assert.hh"
 #include "utils/estimated_histogram.hh"
 #include <algorithm>
+#include <sstream>
 #include "db/data_listeners.hh"
+#include "utils/hash.hh"
 #include "storage_service.hh"
 #include "compaction/compaction_manager.hh"
 #include "unimplemented.hh"
@@ -342,6 +344,56 @@ uint64_t accumulate_on_active_memtables(replica::table& t, noncopyable_function<
    return ret;
 }

+static
+future<json::json_return_type>
+rest_toppartitions_generic(sharded<replica::database>& db, std::unique_ptr<http::request> req) {
+        bool filters_provided = false;
+
+        std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
+        if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
+            filters_provided = true;
+            std::stringstream ss { filters };
+            std::string filter;
+            while (!filters.empty() && ss.good()) {
+                std::getline(ss, filter, ',');
+                table_filters.emplace(parse_fully_qualified_cf_name(filter));
+            }
+        }
+
+        std::unordered_set<sstring> keyspace_filters {};
+        if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
+            filters_provided = true;
+            std::stringstream ss { filters };
+            std::string filter;
+            while (!filters.empty() && ss.good()) {
+                std::getline(ss, filter, ',');
+                keyspace_filters.emplace(std::move(filter));
+            }
+        }
+
+        // when the query is empty return immediately
+        if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
+            apilog.debug("toppartitions query: processing results");
+            cf::toppartitions_query_results results;
+
+            results.read_cardinality = 0;
+            results.write_cardinality = 0;
+
+            return make_ready_future<json::json_return_type>(results);
+        }
+
+        api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
+        api::req_param<unsigned> capacity(*req, "capacity", 256);
+        api::req_param<unsigned> list_size(*req, "list_size", 10);
+
+        apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
+            !table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
+
+        return seastar::do_with(db::toppartitions_query(db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
+            return run_toppartitions_query(q);
+        });
+}
+
 void set_column_family(http_context& ctx, routes& r, sharded<replica::database>& db) {
    cf::get_column_family_name.set(r, [&db] (const_req req){
        std::vector<sstring> res;
@@ -1047,6 +1099,10 @@ void set_column_family(http_context& ctx, routes& r, sharded<replica::database>&
        });
    });

+    ss::toppartitions_generic.set(r, [&db] (std::unique_ptr<http::request> req) {
+        return rest_toppartitions_generic(db, std::move(req));
+    });
+
    cf::force_major_compaction.set(r, [&ctx, &db](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        if (!req->get_query_param("split_output").empty()) {
            fail(unimplemented::cause::API);
@@ -1213,6 +1269,7 @@ void unset_column_family(http_context& ctx, routes& r) {
    cf::get_sstable_count_per_level.unset(r);
    cf::get_sstables_for_key.unset(r);
    cf::toppartitions.unset(r);
+    ss::toppartitions_generic.unset(r);
    cf::force_major_compaction.unset(r);
    ss::get_load.unset(r);
    ss::get_metrics_load.unset(r);
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -17,9 +17,7 @@
 #include "gms/feature_service.hh"
 #include "schema/schema_builder.hh"
 #include "sstables/sstables_manager.hh"
-#include "utils/hash.hh"
 #include <optional>
-#include <sstream>
 #include <stdexcept>
 #include <time.h>
 #include <algorithm>
@@ -536,13 +534,15 @@ void unset_sstables_loader(http_context& ctx, routes& r) {
 }

 void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g) {
-    ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) {
+    ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto keyspace = validate_keyspace(ctx, req);
        auto view = req->get_path_param("view");
-        return vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()).then([] (std::unordered_map<sstring, sstring> status) {
-            std::vector<storage_service_json::mapper> res;
-            return make_ready_future<json::json_return_type>(map_to_key_value(std::move(status), res));
-        });
+        co_return json::json_return_type(stream_range_as_array(co_await vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()), [] (const auto& i) {
+            storage_service_json::mapper res;
+            res.key = i.first;
+            res.value = i.second;
+            return res;
+        }));
    });

    cf::get_built_indexes.set(r, [&vb](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
@@ -580,6 +580,16 @@ static future<json::json_return_type> describe_ring_as_json_for_table(const shar
    co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring_for_table(keyspace, table), token_range_endpoints_to_json));
 }

+namespace {
+template <typename Key, typename Value>
+storage_service_json::mapper map_to_json(const std::pair<Key, Value>& i) {
+    storage_service_json::mapper val;
+    val.key = fmt::to_string(i.first);
+    val.value = fmt::to_string(i.second);
+    return val;
+}
+}
+
 static
 future<json::json_return_type>
 rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -597,62 +607,7 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
            throw bad_param_exception("Either provide both keyspace and table (for tablet table) or neither (for vnodes)");
        }

-        co_return json::json_return_type(stream_range_as_array(token_endpoints, [](const auto& i) {
-            storage_service_json::mapper val;
-            val.key = fmt::to_string(i.first);
-            val.value = fmt::to_string(i.second);
-            return val;
-        }));
-}
-
-static
-future<json::json_return_type>
-rest_toppartitions_generic(http_context& ctx, std::unique_ptr<http::request> req) {
-        bool filters_provided = false;
-
-        std::unordered_set<std::tuple<sstring, sstring>, utils::tuple_hash> table_filters {};
-        if (auto filters = req->get_query_param("table_filters"); !filters.empty()) {
-            filters_provided = true;
-            std::stringstream ss { filters };
-            std::string filter;
-            while (!filters.empty() && ss.good()) {
-                std::getline(ss, filter, ',');
-                table_filters.emplace(parse_fully_qualified_cf_name(filter));
-            }
-        }
-
-        std::unordered_set<sstring> keyspace_filters {};
-        if (auto filters = req->get_query_param("keyspace_filters"); !filters.empty()) {
-            filters_provided = true;
-            std::stringstream ss { filters };
-            std::string filter;
-            while (!filters.empty() && ss.good()) {
-                std::getline(ss, filter, ',');
-                keyspace_filters.emplace(std::move(filter));
-            }
-        }
-
-        // when the query is empty return immediately
-        if (filters_provided && table_filters.empty() && keyspace_filters.empty()) {
-            apilog.debug("toppartitions query: processing results");
-            httpd::column_family_json::toppartitions_query_results results;
-
-            results.read_cardinality = 0;
-            results.write_cardinality = 0;
-
-            return make_ready_future<json::json_return_type>(results);
-        }
-
-        api::req_param<std::chrono::milliseconds, unsigned> duration{*req, "duration", 1000ms};
-        api::req_param<unsigned> capacity(*req, "capacity", 256);
-        api::req_param<unsigned> list_size(*req, "list_size", 10);
-
-        apilog.info("toppartitions query: #table_filters={} #keyspace_filters={} duration={} list_size={} capacity={}",
-            !table_filters.empty() ? std::to_string(table_filters.size()) : "all", !keyspace_filters.empty() ? std::to_string(keyspace_filters.size()) : "all", duration.value, list_size.value, capacity.value);
-
-        return seastar::do_with(db::toppartitions_query(ctx.db, std::move(table_filters), std::move(keyspace_filters), duration.value, list_size, capacity), [] (db::toppartitions_query& q) {
-            return run_toppartitions_query(q);
-        });
+        co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
 }

 static
@@ -686,7 +641,6 @@ rest_get_range_to_endpoint_map(http_context& ctx, sharded<service::storage_servi
            table_id = validate_table(ctx.db.local(), keyspace, table);
        }

-        std::vector<ss::maplist_mapper> res;
        co_return stream_range_as_array(co_await ss.local().get_range_to_address_map(keyspace, table_id),
                [](const std::pair<dht::token_range, inet_address_vector_replica_set>& entry){
            ss::maplist_mapper m;
@@ -777,17 +731,13 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::

        apilog.info("cleanup_all global={}", global);

-        auto done = !global ? false : co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
-            if (!ss.is_topology_coordinator_enabled()) {
-                co_return false;
-            }
-            co_await ss.do_clusterwide_vnodes_cleanup();
-            co_return true;
-        });
-        if (done) {
+        if (global) {
+            co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
+                co_return co_await ss.do_clusterwide_vnodes_cleanup();
+            });
            co_return json::json_return_type(0);
        }
-        // fall back to the local cleanup if topology coordinator is not enabled or local cleanup is requested
+        // not a global request: perform the local cleanup
        auto& db = ctx.db;
        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<compaction::global_cleanup_compaction_task_impl>({}, db);
@@ -795,9 +745,7 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::

        // Mark this node as clean
        co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
-            if (ss.is_topology_coordinator_enabled()) {
-                co_await ss.reset_cleanup_needed();
-            }
+            co_await ss.reset_cleanup_needed();
        });

        co_return json::json_return_type(0);
@@ -808,9 +756,6 @@ future<json::json_return_type>
 rest_reset_cleanup_needed(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
        apilog.info("reset_cleanup_needed");
        co_await ss.invoke_on(0, [] (service::storage_service& ss) {
-            if (!ss.is_topology_coordinator_enabled()) {
-                throw std::runtime_error("mark_node_as_clean is only supported when topology over raft is enabled");
-            }
            return ss.reset_cleanup_needed();
        });
        co_return json_void();
@@ -838,9 +783,31 @@ rest_force_keyspace_flush(http_context& ctx, std::unique_ptr<http::request> req)

 static
 future<json::json_return_type>
-rest_decommission(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
+rest_logstor_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
+        bool major = false;
+        if (auto major_param = req->get_query_param("major"); !major_param.empty()) {
+            major = validate_bool(major_param);
+        }
+        apilog.info("logstor_compaction: major={}", major);
+        auto& db = ctx.db;
+        co_await replica::database::trigger_logstor_compaction_on_all_shards(db, major);
+        co_return json_void();
+}
+
+static
+future<json::json_return_type>
+rest_logstor_flush(http_context& ctx, std::unique_ptr<http::request> req) {
+        apilog.info("logstor_flush");
+        auto& db = ctx.db;
+        co_await replica::database::flush_logstor_separator_on_all_shards(db);
+        co_return json_void();
+}
+
+static
+future<json::json_return_type>
+rest_decommission(sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, std::unique_ptr<http::request> req) {
        apilog.info("decommission");
-        return ss.local().decommission().then([] {
+        return ss.local().decommission(ssc).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
 }
@@ -1317,10 +1284,7 @@ rest_get_ownership(http_context& ctx, sharded<service::storage_service>& ss, std
            throw httpd::bad_param_exception("storage_service/ownership cannot be used when a keyspace uses tablets");
        }

-        return ss.local().get_ownership().then([] (auto&& ownership) {
-            std::vector<storage_service_json::mapper> res;
-            return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
-        });
+        co_return json::json_return_type(stream_range_as_array(co_await ss.local().get_ownership(), &map_to_json<gms::inet_address, float>));
 }

 static
@@ -1337,10 +1301,7 @@ rest_get_effective_ownership(http_context& ctx, sharded<service::storage_service
            }
        }

-        return ss.local().effective_ownership(keyspace_name, table_name).then([] (auto&& ownership) {
-            std::vector<storage_service_json::mapper> res;
-            return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
-        });
+        co_return json::json_return_type(stream_range_as_array(co_await ss.local().effective_ownership(keyspace_name, table_name), &map_to_json<gms::inet_address, float>));
 }

 static
@@ -1350,7 +1311,7 @@ rest_estimate_compression_ratios(http_context& ctx, sharded<service::storage_ser
        apilog.warn("estimate_compression_ratios: called before the cluster feature was enabled");
        throw std::runtime_error("estimate_compression_ratios requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
    }
-    auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
+    auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
    auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
    auto cf = api::req_param<sstring>(*req, "cf", {}).value;
    apilog.debug("estimate_compression_ratios: called with ks={} cf={}", ks, cf);
@@ -1416,7 +1377,7 @@ rest_retrain_dict(http_context& ctx, sharded<service::storage_service>& ss, serv
        apilog.warn("retrain_dict: called before the cluster feature was enabled");
        throw std::runtime_error("retrain_dict requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
    }
-    auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
+    auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
    auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
    auto cf = api::req_param<sstring>(*req, "cf", {}).value;
    apilog.debug("retrain_dict: called with ks={} cf={}", ks, cf);
@@ -1562,6 +1523,54 @@ rest_sstable_info(http_context& ctx, std::unique_ptr<http::request> req) {
        });
 }

+static
+future<json::json_return_type>
+rest_logstor_info(http_context& ctx, std::unique_ptr<http::request> req) {
+        auto keyspace = api::req_param<sstring>(*req, "keyspace", {}).value;
+        auto table = api::req_param<sstring>(*req, "table", {}).value;
+        if (table.empty()) {
+            table = api::req_param<sstring>(*req, "cf", {}).value;
+        }
+
+        if (keyspace.empty()) {
+            throw bad_param_exception("The query parameter 'keyspace' is required");
+        }
+        if (table.empty()) {
+            throw bad_param_exception("The query parameter 'table' is required");
+        }
+
+        keyspace = validate_keyspace(ctx, keyspace);
+        auto tid = validate_table(ctx.db.local(), keyspace, table);
+
+        auto& cf = ctx.db.local().find_column_family(tid);
+        if (!cf.uses_logstor()) {
+            throw bad_param_exception(fmt::format("Table {}.{} does not use logstor", keyspace, table));
+        }
+
+        return do_with(replica::logstor::table_segment_stats{}, [keyspace = std::move(keyspace), table = std::move(table), tid, &ctx] (replica::logstor::table_segment_stats& merged_stats) {
+            return ctx.db.map_reduce([&merged_stats](replica::logstor::table_segment_stats&& shard_stats) {
+                merged_stats += shard_stats;
+            }, [tid](const replica::database& db) {
+                return db.get_logstor_table_segment_stats(tid);
+            }).then([&merged_stats, keyspace = std::move(keyspace), table = std::move(table)] {
+                ss::table_logstor_info result;
+                result.keyspace = keyspace;
+                result.table = table;
+                result.compaction_groups = merged_stats.compaction_group_count;
+                result.segments = merged_stats.segment_count;
+
+                for (const auto& bucket : merged_stats.histogram) {
+                    ss::logstor_hist_bucket hist;
+                    hist.count = bucket.count;
+                    hist.max_data_size = bucket.max_data_size;
+                    result.data_size_histogram.push(std::move(hist));
+                }
+
+                return make_ready_future<json::json_return_type>(stream_object(result));
+            });
+        });
+}
+
 static
 future<json::json_return_type>
 rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::raft_group0_client& group0_client, std::unique_ptr<http::request> req) {
@@ -1574,26 +1583,14 @@ rest_reload_raft_topology_state(sharded<service::storage_service>& ss, service::
 static
 future<json::json_return_type>
 rest_upgrade_to_raft_topology(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
-        apilog.info("Requested to schedule upgrade to raft topology");
-        try {
-            co_await ss.invoke_on(0, [] (auto& ss) {
-                return ss.start_upgrade_to_raft_topology();
-            });
-        } catch (...) {
-            auto ex = std::current_exception();
-            apilog.error("Failed to schedule upgrade to raft topology: {}", ex);
-            std::rethrow_exception(std::move(ex));
-        }
+        apilog.info("Requested to schedule upgrade to raft topology, but this version does not need it since it uses raft topology by default.");
        co_return json_void();
 }

 static
 future<json::json_return_type>
 rest_raft_topology_upgrade_status(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
-        const auto ustate = co_await ss.invoke_on(0, [] (auto& ss) {
-            return ss.get_topology_upgrade_state();
-        });
-        co_return sstring(format("{}", ustate));
+        co_return sstring("done");
 }

 static
@@ -1803,9 +1800,8 @@ rest_bind(FuncType func, BindArgs&... args) {
    return std::bind_front(func, std::ref(args)...);
 }

-void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
+void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
    ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
-    ss::toppartitions_generic.set(r, rest_bind(rest_toppartitions_generic, ctx));
    ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
    ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
    ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
@@ -1820,7 +1816,9 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
    ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
    ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
-    ss::decommission.set(r, rest_bind(rest_decommission, ss));
+    ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
+    ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
+    ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
    ss::move.set(r, rest_bind(rest_move, ss));
    ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
    ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
@@ -1869,6 +1867,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
    ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
    ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
+    ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
    ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
    ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
    ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
@@ -1885,7 +1884,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

 void unset_storage_service(http_context& ctx, routes& r) {
    ss::get_token_endpoint.unset(r);
-    ss::toppartitions_generic.unset(r);
    ss::get_release_version.unset(r);
    ss::get_scylla_release_version.unset(r);
    ss::get_schema_version.unset(r);
@@ -1899,6 +1897,8 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::reset_cleanup_needed.unset(r);
    ss::force_flush.unset(r);
    ss::force_keyspace_flush.unset(r);
+    ss::logstor_compaction.unset(r);
+    ss::logstor_flush.unset(r);
    ss::decommission.unset(r);
    ss::move.unset(r);
    ss::remove_node.unset(r);
@@ -1946,6 +1946,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::get_ownership.unset(r);
    ss::get_effective_ownership.unset(r);
    ss::sstable_info.unset(r);
+    ss::logstor_info.unset(r);
    ss::reload_raft_topology_state.unset(r);
    ss::upgrade_to_raft_topology.unset(r);
    ss::raft_topology_upgrade_status.unset(r);
@@ -2025,6 +2026,8 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        auto tag = req->get_query_param("tag");
        auto column_families = split(req->get_query_param("cf"), ",");
        auto sfopt = req->get_query_param("sf");
+        auto tcopt = req->get_query_param("tc");
+
        db::snapshot_options opts = {
            .skip_flush = strcasecmp(sfopt.c_str(), "true") == 0,
        };
@@ -2049,6 +2052,27 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        }
    });

+    ss::take_cluster_snapshot.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        apilog.info("take_cluster_snapshot: {}", req->get_query_params());
+        auto tag = req->get_query_param("tag");
+        auto column_families = split(req->get_query_param("table"), ",");
+        // Note: skip_flush is not a published/active option; it is retained as an internal option only.
+        auto sfopt = req->get_query_param("skip_flush");
+
+        db::snapshot_options opts = {
+            .skip_flush = strcasecmp(sfopt.c_str(), "true") == 0,
+        };
+
+        std::vector<sstring> keynames = split(req->get_query_param("keyspace"), ",");
+        try {
+            co_await snap_ctl.local().take_cluster_column_family_snapshot(keynames, column_families, tag, opts);
+            co_return json_void();
+        } catch (...) {
+            apilog.error("take_cluster_snapshot failed: {}", std::current_exception());
+            throw;
+        }
+    });
+
    ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        apilog.info("del_snapshot: {}", req->get_query_params());
        auto tag = req->get_query_param("tag");
@@ -2139,6 +2163,7 @@ void unset_snapshot(http_context& ctx, routes& r) {
    ss::start_backup.unset(r);
    cf::get_true_snapshots_size.unset(r);
    cf::get_all_true_snapshots_size.unset(r);
+    ss::decommission.unset(r);
 }

 }
--- a/api/storage_service.hh
+++ b/api/storage_service.hh
@@ -66,7 +66,7 @@ struct scrub_info {

 scrub_info parse_scrub_options(const http_context& ctx, std::unique_ptr<http::request> req);

-void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, service::raft_group0_client&);
+void set_storage_service(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>&, service::raft_group0_client&);
 void unset_storage_service(http_context& ctx, httpd::routes& r);
 void set_sstables_loader(http_context& ctx, httpd::routes& r, sharded<sstables_loader>& sst_loader);
 void unset_sstables_loader(http_context& ctx, httpd::routes& r);
--- a/api/system.cc
+++ b/api/system.cc
@@ -190,6 +190,13 @@ void set_system(http_context& ctx, routes& r) {
            return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
        });
    });
+
+    hs::get_chosen_sstable_version.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return smp::submit_to(0, [&ctx] {
+            auto format = ctx.db.local().get_user_sstables_manager().get_preferred_sstable_version();
+            return make_ready_future<json::json_return_type>(seastar::to_sstring(format));
+        });
+    });
 }

 }
--- a/audit/audit.cc
+++ b/audit/audit.cc
@@ -209,15 +209,11 @@ future<> audit::stop_audit() {
    });
 }

-audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table) {
+audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch) {
    if (!audit_instance().local_is_initialized()) {
        return nullptr;
    }
-    return std::make_unique<audit_info>(cat, keyspace, table);
-}
-
-audit_info_ptr audit::create_no_audit_info() {
-    return audit_info_ptr();
+    return std::make_unique<audit_info>(cat, keyspace, table, batch);
 }

 future<> audit::start(const db::config& cfg) {
@@ -267,18 +263,21 @@ future<> audit::log_login(const sstring& username, socket_address client_ip, boo
 }

 future<> inspect(shared_ptr<cql3::cql_statement> statement, service::query_state& query_state, const cql3::query_options& options, bool error) {
-    cql3::statements::batch_statement* batch = dynamic_cast<cql3::statements::batch_statement*>(statement.get());
-    if (batch != nullptr) {
+    auto audit_info = statement->get_audit_info();
+    if (!audit_info) {
+        return make_ready_future<>();
+    }
+    if (audit_info->batch()) {
+        cql3::statements::batch_statement* batch = static_cast<cql3::statements::batch_statement*>(statement.get());
        return do_for_each(batch->statements().begin(), batch->statements().end(), [&query_state, &options, error] (auto&& m) {
            return inspect(m.statement, query_state, options, error);
        });
    } else {
-        auto audit_info = statement->get_audit_info();
-        if (bool(audit_info) && audit::local_audit_instance().should_log(audit_info)) {
+        if (audit::local_audit_instance().should_log(audit_info)) {
            return audit::local_audit_instance().log(audit_info, query_state, options, error);
        }
+        return make_ready_future<>();
    }
-    return make_ready_future<>();
 }

 future<> inspect_login(const sstring& username, socket_address client_ip, bool error) {
--- a/audit/audit.hh
+++ b/audit/audit.hh
@@ -75,11 +75,13 @@ class audit_info final {
    sstring _keyspace;
    sstring _table;
    sstring _query;
+    bool _batch;
 public:
-    audit_info(statement_category cat, sstring keyspace, sstring table)
+    audit_info(statement_category cat, sstring keyspace, sstring table, bool batch)
        : _category(cat)
        , _keyspace(std::move(keyspace))
        , _table(std::move(table))
+        , _batch(batch)
    { }
    void set_query_string(const std::string_view& query_string) {
        _query = sstring(query_string);
@@ -89,6 +91,7 @@ public:
    const sstring& query() const { return _query; }
    sstring category_string() const;
    statement_category category() const { return _category; }
+    bool batch() const { return _batch; }
 };

 using audit_info_ptr = std::unique_ptr<audit_info>;
@@ -126,8 +129,7 @@ public:
    }
    static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
    static future<> stop_audit();
-    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table);
-    static audit_info_ptr create_no_audit_info();
+    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
    audit(locator::shared_token_metadata& stm,
          cql3::query_processor& qp,
          service::migration_manager& mm,
--- a/auth/CMakeLists.txt
+++ b/auth/CMakeLists.txt
@@ -17,15 +17,14 @@ target_sources(scylla_auth
    password_authenticator.cc
    passwords.cc
    permission.cc
-    permissions_cache.cc
    resource.cc
    role_or_anonymous.cc
-    roles-metadata.cc
    sasl_challenge.cc
    saslauthd_authenticator.cc
    service.cc
    standard_role_manager.cc
    transitional.cc
+    maintenance_socket_authenticator.cc
    maintenance_socket_role_manager.cc)
 target_include_directories(scylla_auth
  PUBLIC
@@ -49,4 +48,4 @@ if (Scylla_USE_PRECOMPILED_HEADER_USE)
  target_precompile_headers(scylla_auth REUSE_FROM scylla-precompiled-header)
 endif()
 check_headers(check-headers scylla_auth
-  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
+  GLOB_RECURSE ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)
--- a/auth/allow_all_authenticator.cc
+++ b/auth/allow_all_authenticator.cc
@@ -9,19 +9,9 @@
 #include "auth/allow_all_authenticator.hh"

 #include "service/migration_manager.hh"
-#include "utils/class_registrator.hh"

 namespace auth {

 constexpr std::string_view allow_all_authenticator_name("org.apache.cassandra.auth.AllowAllAuthenticator");

-// To ensure correct initialization order, we unfortunately need to use a string literal.
-static const class_registrator<
-        authenticator,
-        allow_all_authenticator,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        cache&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");
-
 }
--- a/auth/allow_all_authorizer.cc
+++ b/auth/allow_all_authorizer.cc
@@ -9,18 +9,9 @@
 #include "auth/allow_all_authorizer.hh"

 #include "auth/common.hh"
-#include "utils/class_registrator.hh"

 namespace auth {

 constexpr std::string_view allow_all_authorizer_name("org.apache.cassandra.auth.AllowAllAuthorizer");

-// To ensure correct initialization order, we unfortunately need to use a string literal.
-static const class_registrator<
-    authorizer,
-    allow_all_authorizer,
-    cql3::query_processor&,
-    ::service::raft_group0_client&,
-    ::service::migration_manager&> registration("org.apache.cassandra.auth.AllowAllAuthorizer");
-
 }
--- a/auth/allow_all_authorizer.hh
+++ b/auth/allow_all_authorizer.hh
@@ -26,7 +26,7 @@ extern const std::string_view allow_all_authorizer_name;

 class allow_all_authorizer final  : public authorizer {
 public:
-    allow_all_authorizer(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&) {
+    allow_all_authorizer(cql3::query_processor&) {
    }

    virtual future<> start() override {
--- a/auth/cache.cc
+++ b/auth/cache.cc
@@ -8,6 +8,7 @@

 #include "auth/cache.hh"
 #include "auth/common.hh"
+#include "auth/role_or_anonymous.hh"
 #include "auth/roles-metadata.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/untyped_result_set.hh"
@@ -18,6 +19,8 @@
 #include <seastar/core/abort_source.hh>
 #include <seastar/coroutine/maybe_yield.hh>
 #include <seastar/core/format.hh>
+#include <seastar/core/metrics.hh>
+#include <seastar/core/do_with.hh>

 namespace auth {

@@ -27,10 +30,24 @@ cache::cache(cql3::query_processor& qp, abort_source& as) noexcept
    : _current_version(0)
    , _qp(qp)
    , _loading_sem(1)
-    , _as(as) {
+    , _as(as)
+    , _permission_loader(nullptr)
+    , _permission_loader_sem(8) {
+    namespace sm = seastar::metrics;
+    _metrics.add_group("auth_cache", {
+        sm::make_gauge("roles", [this] { return _roles.size(); },
+                sm::description("Number of roles currently cached")),
+        sm::make_gauge("permissions", [this] {
+            return _cached_permissions_count;
+        }, sm::description("Total number of permission sets currently cached across all roles"))
+    });
 }

-lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
+void cache::set_permission_loader(permission_loader_func loader) {
+    _permission_loader = std::move(loader);
+}
+
+lw_shared_ptr<const cache::role_record> cache::get(std::string_view role) const noexcept {
    auto it = _roles.find(role);
    if (it == _roles.end()) {
        return {};
@@ -38,6 +55,93 @@ lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) cons
    return it->second;
 }

+void cache::for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const {
+    for (auto pos = _roles.begin(); pos != _roles.end(); ++pos) {
+        func(pos->first, *pos->second); // callback receives the role name and the dereferenced record
+    }
+}
+
+size_t cache::roles_count() const noexcept {
+    return _roles.size(); // number of entries currently held in this instance's _roles map
+}
+
+future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
+    std::unordered_map<resource, permission_set>* perms_cache; // map to consult: the anonymous one or the role's own
+    lw_shared_ptr<role_record> role_ptr; // stays null on the anonymous path
+
+    if (is_anonymous(role)) {
+        perms_cache = &_anonymous_permissions;
+    } else {
+        const auto& role_name = *role.name;
+        auto role_it = _roles.find(role_name);
+        if (role_it == _roles.end()) {
+            // Role might have been deleted but there are some connections
+            // left which reference it. They should no longer have access to anything.
+            return make_ready_future<permission_set>(permissions::NONE);
+        }
+        role_ptr = role_it->second;
+        perms_cache = &role_ptr->cached_permissions;
+    }
+
+    if (auto it = perms_cache->find(r); it != perms_cache->end()) {
+        return make_ready_future<permission_set>(it->second); // cache hit: answered without calling the loader
+    }
+    // keep alive role_ptr as it holds perms_cache (except anonymous)
+    return do_with(std::move(role_ptr), [this, &role, &r, perms_cache] (auto& role_ptr) {
+        return load_permissions(role, r, perms_cache); // NOTE(review): role and r are captured by reference - presumably callers keep them alive until the future resolves; confirm
+    });
+}
+
+future<permission_set> cache::load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache) {
+    SCYLLA_ASSERT(_permission_loader); // the loader starts as nullptr and must be set via set_permission_loader first
+    auto units = co_await get_units(_permission_loader_sem, 1, _as); // one unit per in-flight load; the wait is tied to _as
+
+    // Check again, perhaps we were blocked and other call loaded
+    // the permissions already. This is a protection against misses storm.
+    if (auto it = perms_cache->find(r); it != perms_cache->end()) {
+        co_return it->second;
+    }
+    auto perms = co_await _permission_loader(role, r);
+    add_permissions(*perms_cache, r, perms); // stores the result and counts it if the key was not present
+    co_return perms;
+}
+
+future<> cache::prune(const resource& r) {
+    auto units = co_await get_units(_loading_sem, 1, _as);
+    remove_permissions(_anonymous_permissions, r); // anonymous entries are counted on insert, so uncount them on erase too
+    for (auto& it : _roles) {
+        // Pruning can run concurrently with other functions but it
+        // can only cause cached_permissions extra reload via get_permissions.
+        remove_permissions(it.second->cached_permissions, r);
+        co_await coroutine::maybe_yield();
+    }
+}
+
+future<> cache::reload_all_permissions() noexcept {
+    SCYLLA_ASSERT(_permission_loader); // reloading needs a loader to have been installed
+    auto units = co_await get_units(_loading_sem, 1, _as);
+    auto copy_keys = [] (const std::unordered_map<resource, permission_set>& m) { // returns a copy of the map's keys
+        std::vector<resource> keys;
+        keys.reserve(m.size());
+        for (const auto& [res, _] : m) {
+            keys.push_back(res);
+        }
+        return keys;
+    };
+    const role_or_anonymous anon; // NOTE(review): presumably a default-constructed value means "anonymous" - confirm
+    for (const auto& res : copy_keys(_anonymous_permissions)) { // loop over the key copy, not the live map
+        _anonymous_permissions[res] = co_await _permission_loader(anon, res);
+    }
+    for (auto& [role, entry] : _roles) {
+        auto& perms_cache = entry->cached_permissions;
+        auto r = role_or_anonymous(role);
+        for (const auto& res : copy_keys(perms_cache)) {
+            perms_cache[res] = co_await _permission_loader(r, res); // only resources already cached are re-queried
+        }
+    }
+    logger.debug("Reloaded auth cache with {} entries", _roles.size());
+}
+
 future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& role) const {
    auto rec = make_lw_shared<role_record>();
    rec->version = _current_version;
@@ -105,7 +209,7 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
 future<> cache::prune_all() noexcept {
    for (auto it = _roles.begin(); it != _roles.end(); ) {
        if (it->second->version != _current_version) {
-            _roles.erase(it++);
+            remove_role(it++);
            co_await coroutine::maybe_yield();
        } else {
            ++it;
@@ -115,9 +219,6 @@ future<> cache::prune_all() noexcept {
 }

 future<> cache::load_all() {
-    if (legacy_mode(_qp)) {
-        co_return;
-    }
    SCYLLA_ASSERT(this_shard_id() == 0);
    auto units = co_await get_units(_loading_sem, 1, _as);

@@ -129,7 +230,7 @@ future<> cache::load_all() {
        const auto name = r.get_as<sstring>("role");
        auto role = co_await fetch_role(name);
        if (role) {
-            _roles[name] = role;
+            add_role(name, role);
        }
        co_return stop_iteration::no;
    };
@@ -142,39 +243,71 @@ future<> cache::load_all() {
        co_await distribute_role(name, role);
    }
    co_await container().invoke_on_others([this](cache& c) -> future<> {
+        auto units = co_await get_units(c._loading_sem, 1, c._as);
        c._current_version = _current_version;
        co_await c.prune_all();
    });
 }

-future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
-    if (legacy_mode(_qp)) {
+future<> cache::gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name) {
+    if (!role) {
+        // Role might have been removed or not yet added, either way
+        // their members will be handled by another top call to this function.
        co_return;
    }
+    for (const auto& member_name : role->members) {
+        bool is_new = roles.insert(member_name).second; // the output set doubles as the visited set
+        if (!is_new) {
+            continue; // already collected, so its members are not walked again
+        }
+        lw_shared_ptr<cache::role_record> member_role;
+        auto r = _roles.find(member_name);
+        if (r != _roles.end()) {
+            member_role = r->second;
+        }
+        co_await gather_inheriting_roles(roles, member_role, member_name); // recurse; a null member_role returns immediately
+    }
+}
+
+future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
    SCYLLA_ASSERT(this_shard_id() == 0);
    auto units = co_await get_units(_loading_sem, 1, _as);

+    std::unordered_set<role_name_t> roles_to_clear_perms;
    for (const auto& name : roles) {
        logger.info("Loading role {}", name);
        auto role = co_await fetch_role(name);
         if (role) {
-            _roles[name] = role;
+            add_role(name, role);
+            co_await gather_inheriting_roles(roles_to_clear_perms, role, name);
        } else {
-            _roles.erase(name);
+            if (auto it = _roles.find(name); it != _roles.end()) {
+                auto old_role = it->second;
+                remove_role(it);
+                co_await gather_inheriting_roles(roles_to_clear_perms, old_role, name);
+            }
        }
        co_await distribute_role(name, role);
    }
+
+    co_await container().invoke_on_all([&roles_to_clear_perms] (cache& c) -> future<> {
+        for (const auto& name : roles_to_clear_perms) {
+            c.clear_role_permissions(name);
+            co_await coroutine::maybe_yield();
+        }
+    });
 }

 future<> cache::distribute_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
    auto role_ptr = role.get();
-    co_await container().invoke_on_others([&name, role_ptr](cache& c) {
+    co_await container().invoke_on_others([&name, role_ptr](cache& c) -> future<> {
+        auto units = co_await get_units(c._loading_sem, 1, c._as);
        if (!role_ptr) {
-            c._roles.erase(name);
-            return;
+            c.remove_role(name);
+            co_return;
        }
        auto role_copy = make_lw_shared<role_record>(*role_ptr);
-        c._roles[name] = std::move(role_copy);
+        c.add_role(name, std::move(role_copy));
    });
 }

@@ -185,4 +318,40 @@ bool cache::includes_table(const table_id& id) noexcept {
            || id == db::system_keyspace::role_permissions()->id();
 }

+void cache::add_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
+    if (auto it = _roles.find(name); it != _roles.end()) {
+        _cached_permissions_count -= it->second->cached_permissions.size(); // uncount the entries of the record being replaced
+    }
+    _cached_permissions_count += role->cached_permissions.size(); // count whatever the incoming record already carries
+    _roles[name] = std::move(role); // insert or overwrite
+}
+
+void cache::remove_role(const role_name_t& name) {
+    if (auto it = _roles.find(name); it != _roles.end()) { // a name that is not cached is a no-op
+        remove_role(it);
+    }
+}
+
+void cache::remove_role(roles_map::iterator it) {
+    _cached_permissions_count -= it->second->cached_permissions.size(); // adjust the counter before the entry is erased
+    _roles.erase(it);
+}
+
+void cache::clear_role_permissions(const role_name_t& name) {
+    if (auto it = _roles.find(name); it != _roles.end()) { // nothing to do when the role is not cached
+        _cached_permissions_count -= it->second->cached_permissions.size();
+        it->second->cached_permissions.clear(); // the role record itself stays in _roles
+    }
+}
+
+void cache::add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms) {
+    // emplace leaves an existing entry untouched; only a fresh insertion is counted
+    const bool inserted = cache.emplace(r, perms).second;
+    _cached_permissions_count += inserted ? 1 : 0;
+}
+
+void cache::remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r) {
+    _cached_permissions_count -= cache.erase(r); // erase-by-key returns how many entries were removed (0 or 1)
+}
+
 } // namespace auth
--- a/auth/cache.hh
+++ b/auth/cache.hh
@@ -9,6 +9,7 @@
 #pragma once

 #include <seastar/core/abort_source.hh>
+#include <string_view>
 #include <unordered_set>
 #include <unordered_map>

@@ -17,11 +18,14 @@
 #include <seastar/core/sharded.hh>
 #include <seastar/core/shared_ptr.hh>
 #include <seastar/core/semaphore.hh>
+#include <seastar/core/metrics_registration.hh>

-#include <absl/container/flat_hash_map.h>
+#include "absl-flat_hash_map.hh"

 #include "auth/permission.hh"
 #include "auth/common.hh"
+#include "auth/resource.hh"
+#include "auth/role_or_anonymous.hh"

 namespace cql3 { class query_processor; }

@@ -31,6 +35,7 @@ class cache : public peering_sharded_service<cache> {
 public:
    using role_name_t = sstring;
    using version_tag_t = char;
+    using permission_loader_func = std::function<future<permission_set>(const role_or_anonymous&, const resource&)>;

 	struct role_record {
        bool can_login = false;
@@ -38,28 +43,60 @@ public:
        std::unordered_set<role_name_t> member_of;
        std::unordered_set<role_name_t> members;
        sstring salted_hash;
-        std::unordered_map<sstring, sstring> attributes;
-        std::unordered_map<sstring, permission_set> permissions;
+        std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
+        std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
+    private:
+        friend cache;
+        // cached permissions include effects of role's inheritance
+        std::unordered_map<resource, permission_set> cached_permissions;
        version_tag_t version; // used for seamless cache reloads
    };

    explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
-    lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
+    lw_shared_ptr<const role_record> get(std::string_view role) const noexcept;
+    void set_permission_loader(permission_loader_func loader);
+    future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
+    future<> prune(const resource& r);
+    future<> reload_all_permissions() noexcept;
    future<> load_all();
    future<> load_roles(std::unordered_set<role_name_t> roles);
    static bool includes_table(const table_id&) noexcept;

+    // Returns the number of roles in the cache.
+    size_t roles_count() const noexcept;
+
+    // The callback doesn't suspend (no co_await) so it observes the state
+    // of the cache atomically.
+    void for_each_role(const std::function<void(const role_name_t&, const role_record&)>& func) const;
+
 private:
-    using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
+    using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>, sstring_hash, sstring_eq>;
    roles_map _roles;
+    // anonymous permissions map exists mainly due to compatibility with
+    // higher layers which use role_or_anonymous to get permissions.
+    std::unordered_map<resource, permission_set> _anonymous_permissions;
    version_tag_t _current_version;
    cql3::query_processor& _qp;
-    semaphore _loading_sem;
+    semaphore _loading_sem; // protects iteration of _roles map
    abort_source& _as;
+    permission_loader_func _permission_loader;
+    semaphore _permission_loader_sem; // protects against reload storms on a single role change
+    metrics::metric_groups _metrics;
+    size_t _cached_permissions_count = 0;

    future<lw_shared_ptr<role_record>> fetch_role(const role_name_t& role) const;
    future<> prune_all() noexcept;
    future<> distribute_role(const role_name_t& name, const lw_shared_ptr<role_record> role);
+    future<> gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name);
+
+    void add_role(const role_name_t& name, lw_shared_ptr<role_record> role);
+    void remove_role(const role_name_t& name);
+    void remove_role(roles_map::iterator it);
+    void clear_role_permissions(const role_name_t& name);
+    void add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms);
+    void remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r);
+
+    future<permission_set> load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache);
 };

 } // namespace auth
--- a/auth/certificate_authenticator.cc
+++ b/auth/certificate_authenticator.cc
@@ -13,14 +13,11 @@
 #include <boost/regex.hpp>
 #include <fmt/ranges.h>

-#include "utils/class_registrator.hh"
 #include "utils/to_string.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "cql3/query_processor.hh"
 #include "db/config.hh"

-static const auto CERT_AUTH_NAME = "com.scylladb.auth.CertificateAuthenticator";
-const std::string_view auth::certificate_authenticator_name(CERT_AUTH_NAME);

 static logging::logger clogger("certificate_authenticator");

@@ -30,13 +27,6 @@ static const std::string cfg_query_attr = "query";
 static const std::string cfg_source_subject = "SUBJECT";
 static const std::string cfg_source_altname = "ALTNAME";

-static const class_registrator<auth::authenticator
-    , auth::certificate_authenticator
-    , cql3::query_processor&
-    , ::service::raft_group0_client&
-    , ::service::migration_manager&
-    , auth::cache&> cert_auth_reg(CERT_AUTH_NAME);
-
 enum class auth::certificate_authenticator::query_source {
    subject, altname
 };
@@ -99,7 +89,7 @@ future<> auth::certificate_authenticator::stop() {
 }

 std::string_view auth::certificate_authenticator::qualified_java_name() const {
-    return certificate_authenticator_name;
+    return "com.scylladb.auth.CertificateAuthenticator";
 }

 bool auth::certificate_authenticator::require_authentication() const {
--- a/auth/certificate_authenticator.hh
+++ b/auth/certificate_authenticator.hh
@@ -27,8 +27,6 @@ namespace auth {

 class cache;

-extern const std::string_view certificate_authenticator_name;
-
 class certificate_authenticator : public authenticator {
    enum class query_source;
    std::vector<std::pair<query_source, boost::regex>> _queries;
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -14,18 +14,11 @@
 #include <seastar/core/sharded.hh>

 #include "mutation/canonical_mutation.hh"
-#include "schema/schema_fwd.hh"
 #include "mutation/timestamp.hh"
-#include "utils/assert.hh"
 #include "utils/exponential_backoff_retry.hh"
 #include "cql3/query_processor.hh"
-#include "cql3/statements/create_table_statement.hh"
-#include "schema/schema_builder.hh"
-#include "service/migration_manager.hh"
 #include "service/raft/group0_state_machine.hh"
 #include "timeout_config.hh"
-#include "utils/error_injection.hh"
-#include "db/system_keyspace.hh"

 namespace auth {

@@ -33,22 +26,14 @@ namespace meta {

 namespace legacy {
    constinit const std::string_view AUTH_KS("system_auth");
-    constinit const std::string_view USERS_CF("users");
 } // namespace legacy
 constinit const std::string_view AUTH_PACKAGE_NAME("org.apache.cassandra.auth.");
 } // namespace meta

 static logging::logger auth_log("auth");

-bool legacy_mode(cql3::query_processor& qp) {
-    return qp.auth_version < db::auth_version_t::v2;
-}
-
-std::string_view get_auth_ks_name(cql3::query_processor& qp) {
-    if (legacy_mode(qp)) {
-        return meta::legacy::AUTH_KS;
-    }
-    return db::system_keyspace::NAME;
+std::string default_superuser(cql3::query_processor& qp) {
+    return qp.db().get_config().auth_superuser_name(); // name comes from the config reachable through the query processor
 }

 // Func must support being invoked more than once.
@@ -65,47 +50,6 @@ future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_f
    }).discard_result();
 }

-static future<> create_legacy_metadata_table_if_missing_impl(
-        std::string_view table_name,
-        cql3::query_processor& qp,
-        std::string_view cql,
-        ::service::migration_manager& mm) {
-    SCYLLA_ASSERT(this_shard_id() == 0); // once_among_shards makes sure a function is executed on shard 0 only
-
-    auto db = qp.db();
-    auto parsed_statement = cql3::query_processor::parse_statement(cql, cql3::dialect{});
-    auto& parsed_cf_statement = static_cast<cql3::statements::raw::cf_statement&>(*parsed_statement);
-
-    parsed_cf_statement.prepare_keyspace(meta::legacy::AUTH_KS);
-
-    auto statement = static_pointer_cast<cql3::statements::create_table_statement>(
-            parsed_cf_statement.prepare(db, qp.get_cql_stats())->statement);
-
-    const auto schema = statement->get_cf_meta_data(qp.db());
-    const auto uuid = generate_legacy_id(schema->ks_name(), schema->cf_name());
-
-    schema_builder b(schema);
-    b.set_uuid(uuid);
-    schema_ptr table = b.build();
-
-    if (!db.has_schema(table->ks_name(), table->cf_name())) {
-        auto group0_guard = co_await mm.start_group0_operation();
-        auto ts = group0_guard.write_timestamp();
-        try {
-            co_return co_await mm.announce(co_await ::service::prepare_new_column_family_announcement(qp.proxy(), table, ts),
-                    std::move(group0_guard), format("auth: create {} metadata table", table->cf_name()));
-        } catch (const exceptions::already_exists_exception&) {}
-    }
-}
-
-future<> create_legacy_metadata_table_if_missing(
-        std::string_view table_name,
-        cql3::query_processor& qp,
-        std::string_view cql,
-        ::service::migration_manager& mm) noexcept {
-    return futurize_invoke(create_legacy_metadata_table_if_missing_impl, table_name, qp, cql, mm);
-}
-
 ::service::query_state& internal_distributed_query_state() noexcept {
 #ifdef DEBUG
    // Give the much slower debug tests more headroom for completing auth queries.
@@ -140,56 +84,6 @@ static future<> announce_mutations_with_guard(
    return group0_client.add_entry(std::move(group0_cmd), std::move(group0_guard), as, timeout);
 }

-future<> announce_mutations_with_batching(
-        ::service::raft_group0_client& group0_client,
-        start_operation_func_t start_operation_func,
-        std::function<::service::mutations_generator(api::timestamp_type t)> gen,
-        seastar::abort_source& as,
-        std::optional<::service::raft_timeout> timeout) {
-    // account for command's overhead, it's better to use smaller threshold than constantly bounce off the limit
-    size_t memory_threshold = group0_client.max_command_size() * 0.75;
-    utils::get_local_injector().inject("auth_announce_mutations_command_max_size",
-        [&memory_threshold] {
-        memory_threshold = 1000;
-    });
-
-    size_t memory_usage = 0;
-    utils::chunked_vector<canonical_mutation> muts;
-
-    // guard has to be taken before we execute code in gen as
-    // it can do read-before-write and we want announce_mutations
-    // operation to be linearizable with other such calls,
-    // for instance if we do select and then delete in gen
-    // we want both to operate on the same data or fail
-    // if someone else modified it in the middle
-    std::optional<::service::group0_guard> group0_guard;
-    group0_guard = co_await start_operation_func(as);
-    auto timestamp = group0_guard->write_timestamp();
-
-    auto g = gen(timestamp);
-    while (auto mut = co_await g()) {
-        muts.push_back(canonical_mutation{*mut});
-        memory_usage += muts.back().representation().size();
-        if (memory_usage >= memory_threshold) {
-            if (!group0_guard) {
-                group0_guard = co_await start_operation_func(as);
-                timestamp = group0_guard->write_timestamp();
-            }
-            co_await announce_mutations_with_guard(group0_client, std::move(muts), std::move(*group0_guard), as, timeout);
-            group0_guard = std::nullopt;
-            memory_usage = 0;
-            muts = {};
-        }
-    }
-    if (!muts.empty()) {
-        if (!group0_guard) {
-            group0_guard = co_await start_operation_func(as);
-            timestamp = group0_guard->write_timestamp();
-        }
-        co_await announce_mutations_with_guard(group0_client, std::move(muts), std::move(*group0_guard), as, timeout);
-    }
-}
-
 future<> announce_mutations(
        cql3::query_processor& qp,
        ::service::raft_group0_client& group0_client,
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -21,12 +21,7 @@

 using namespace std::chrono_literals;

-namespace replica {
-class database;
-}
-
 namespace service {
-class migration_manager;
 class query_state;
 }

@@ -40,10 +35,8 @@ namespace meta {

 namespace legacy {
 extern constinit const std::string_view AUTH_KS;
-extern constinit const std::string_view USERS_CF;
 } // namespace legacy

-constexpr std::string_view DEFAULT_SUPERUSER_NAME("cassandra");
 extern constinit const std::string_view AUTH_PACKAGE_NAME;

 } // namespace meta
@@ -52,12 +45,7 @@ constexpr std::string_view PERMISSIONS_CF = "role_permissions";
 constexpr std::string_view ROLE_MEMBERS_CF = "role_members";
 constexpr std::string_view ROLE_ATTRIBUTES_CF = "role_attributes";

-// This is a helper to check whether auth-v2 is on.
-bool legacy_mode(cql3::query_processor& qp);
-
-// We have legacy implementation using different keyspace
-// and need to parametrize depending on runtime feature.
-std::string_view get_auth_ks_name(cql3::query_processor& qp);
+std::string default_superuser(cql3::query_processor& qp);

 template <class Task>
 future<> once_among_shards(Task&& f) {
@@ -71,12 +59,6 @@ future<> once_among_shards(Task&& f) {
 // Func must support being invoked more than once.
 future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_function<future<>()> func);

-future<> create_legacy_metadata_table_if_missing(
-        std::string_view table_name,
-        cql3::query_processor&,
-        std::string_view cql,
-        ::service::migration_manager&) noexcept;
-
 ///
 /// Time-outs for internal, non-local CQL queries.
 ///
@@ -84,20 +66,6 @@ future<> create_legacy_metadata_table_if_missing(

 ::service::raft_timeout get_raft_timeout() noexcept;

-// Execute update query via group0 mechanism, mutations will be applied on all nodes.
-// Use this function when need to perform read before write on a single guard or if
-// you have more than one mutation and potentially exceed single command size limit.
-using start_operation_func_t = std::function<future<::service::group0_guard>(abort_source&)>;
-future<> announce_mutations_with_batching(
-        ::service::raft_group0_client& group0_client,
-        // since we can operate also in topology coordinator context where we need stronger
-        // guarantees than start_operation from group0_client gives we allow to inject custom
-        // function here
-        start_operation_func_t start_operation_func,
-        std::function<::service::mutations_generator(api::timestamp_type t)> gen,
-        seastar::abort_source& as,
-        std::optional<::service::raft_timeout> timeout);
-
 // Execute update query via group0 mechanism, mutations will be applied on all nodes.
 future<> announce_mutations(
        cql3::query_processor& qp,
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -26,7 +26,6 @@ extern "C" {
 #include "cql3/untyped_result_set.hh"
 #include "exceptions/exceptions.hh"
 #include "utils/log.hh"
-#include "utils/class_registrator.hh"

 namespace auth {

@@ -40,111 +39,14 @@ static constexpr std::string_view PERMISSIONS_NAME = "permissions";

 static logging::logger alogger("default_authorizer");

-// To ensure correct initialization order, we unfortunately need to use a string literal.
-static const class_registrator<
-        authorizer,
-        default_authorizer,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&> password_auth_reg("org.apache.cassandra.auth.CassandraAuthorizer");
-
-default_authorizer::default_authorizer(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
-        : _qp(qp)
-        , _migration_manager(mm) {
+default_authorizer::default_authorizer(cql3::query_processor& qp)
+        : _qp(qp) {
 }

 default_authorizer::~default_authorizer() {
 }

-static const sstring legacy_table_name{"permissions"};
-
-bool default_authorizer::legacy_metadata_exists() const {
-    return _qp.db().has_schema(meta::legacy::AUTH_KS, legacy_table_name);
-}
-
-future<bool> default_authorizer::legacy_any_granted() const {
-    static const sstring query = seastar::format("SELECT * FROM {}.{} LIMIT 1", meta::legacy::AUTH_KS, PERMISSIONS_CF);
-
-    return _qp.execute_internal(
-            query,
-            db::consistency_level::LOCAL_ONE,
-            {},
-            cql3::query_processor::cache_internal::yes).then([](::shared_ptr<cql3::untyped_result_set> results) {
-        return !results->empty();
-    });
-}
-
-future<> default_authorizer::migrate_legacy_metadata() {
-    alogger.info("Starting migration of legacy permissions metadata.");
-    static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);
-
-    return _qp.execute_internal(
-            query,
-            db::consistency_level::LOCAL_ONE,
-            cql3::query_processor::cache_internal::no).then([this](::shared_ptr<cql3::untyped_result_set> results) {
-        return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
-            return do_with(
-                    row.get_as<sstring>("username"),
-                    parse_resource(row.get_as<sstring>(RESOURCE_NAME)),
-                    ::service::group0_batch::unused(),
-                    [this, &row](const auto& username, const auto& r, auto& mc) {
-                const permission_set perms = permissions::from_strings(row.get_set<sstring>(PERMISSIONS_NAME));
-                return grant(username, perms, r, mc);
-            });
-        }).finally([results] {});
-    }).then([] {
-        alogger.info("Finished migrating legacy permissions metadata.");
-    }).handle_exception([](std::exception_ptr ep) {
-        alogger.error("Encountered an error during migration!");
-        std::rethrow_exception(ep);
-    });
-}
-
-future<> default_authorizer::start_legacy() {
-    static const sstring create_table = fmt::format(
-            "CREATE TABLE {}.{} ("
-            "{} text,"
-            "{} text,"
-            "{} set<text>,"
-            "PRIMARY KEY({}, {})"
-            ") WITH gc_grace_seconds={}",
-            meta::legacy::AUTH_KS,
-            PERMISSIONS_CF,
-            ROLE_NAME,
-            RESOURCE_NAME,
-            PERMISSIONS_NAME,
-            ROLE_NAME,
-            RESOURCE_NAME,
-            90 * 24 * 60 * 60); // 3 months.
-
-    return once_among_shards([this] {
-        return create_legacy_metadata_table_if_missing(
-                PERMISSIONS_CF,
-                _qp,
-                create_table,
-                _migration_manager).then([this] {
-            _finished = do_after_system_ready(_as, [this] {
-                return async([this] {
-                    _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get();
-
-                    if (legacy_metadata_exists()) {
-                        if (!legacy_any_granted().get()) {
-                            migrate_legacy_metadata().get();
-                            return;
-                        }
-
-                        alogger.warn("Ignoring legacy permissions metadata since role permissions exist.");
-                    }
-                });
-            });
-        });
-    });
-}
-
 future<> default_authorizer::start() {
-    if (legacy_mode(_qp)) {
-        return start_legacy();
-    }
    return make_ready_future<>();
 }

@@ -161,7 +63,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc

    const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? AND {} = ?",
            PERMISSIONS_NAME,
-            get_auth_ks_name(_qp),
+            db::system_keyspace::NAME,
            PERMISSIONS_CF,
            ROLE_NAME,
            RESOURCE_NAME);
@@ -185,21 +87,13 @@ default_authorizer::modify(
        std::string_view op,
        ::service::group0_batch& mc) {
    const sstring query = seastar::format("UPDATE {}.{} SET {} = {} {} ? WHERE {} = ? AND {} = ?",
-            get_auth_ks_name(_qp),
+            db::system_keyspace::NAME,
            PERMISSIONS_CF,
            PERMISSIONS_NAME,
            PERMISSIONS_NAME,
            op,
            ROLE_NAME,
            RESOURCE_NAME);
-    if (legacy_mode(_qp)) {
-        co_return co_await _qp.execute_internal(
-                query,
-                db::consistency_level::ONE,
-                internal_distributed_query_state(),
-                {permissions::to_strings(set), sstring(role_name), resource.name()},
-                cql3::query_processor::cache_internal::no).discard_result();
-    }
    co_await collect_mutations(_qp, mc, query,
            {permissions::to_strings(set), sstring(role_name), resource.name()});
 }
@@ -218,7 +112,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
            ROLE_NAME,
            RESOURCE_NAME,
            PERMISSIONS_NAME,
-            get_auth_ks_name(_qp),
+            db::system_keyspace::NAME,
            PERMISSIONS_CF);

    const auto results = co_await _qp.execute_internal(
@@ -243,74 +137,16 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
 future<> default_authorizer::revoke_all(std::string_view role_name, ::service::group0_batch& mc) {
    try {
        const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ?",
-                get_auth_ks_name(_qp),
+                db::system_keyspace::NAME,
                PERMISSIONS_CF,
                ROLE_NAME);
-        if (legacy_mode(_qp)) {
-            co_await _qp.execute_internal(
-                    query,
-                    db::consistency_level::ONE,
-                    internal_distributed_query_state(),
-                    {sstring(role_name)},
-                    cql3::query_processor::cache_internal::no).discard_result();
-        } else {
-            co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
-        }
+        co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
    } catch (const exceptions::request_execution_exception& e) {
        alogger.warn("CassandraAuthorizer failed to revoke all permissions of {}: {}", role_name, e);
    }
 }

-future<> default_authorizer::revoke_all_legacy(const resource& resource) {
-    static const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? ALLOW FILTERING",
-            ROLE_NAME,
-            get_auth_ks_name(_qp),
-            PERMISSIONS_CF,
-            RESOURCE_NAME);
-
-    return _qp.execute_internal(
-            query,
-            db::consistency_level::LOCAL_ONE,
-            {resource.name()},
-            cql3::query_processor::cache_internal::no).then_wrapped([this, resource](future<::shared_ptr<cql3::untyped_result_set>> f) {
-        try {
-            auto res = f.get();
-            return parallel_for_each(
-                    res->begin(),
-                    res->end(),
-                    [this, res, resource](const cql3::untyped_result_set::row& r) {
-                static const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
-                        get_auth_ks_name(_qp),
-                        PERMISSIONS_CF,
-                        ROLE_NAME,
-                        RESOURCE_NAME);
-
-                return _qp.execute_internal(
-                        query,
-                        db::consistency_level::LOCAL_ONE,
-                        {r.get_as<sstring>(ROLE_NAME), resource.name()},
-                        cql3::query_processor::cache_internal::no).discard_result().handle_exception(
-                                [resource](auto ep) {
-                    try {
-                        std::rethrow_exception(ep);
-                    } catch (const exceptions::request_execution_exception& e) {
-                        alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
-                    }
-
-                });
-            });
-        } catch (const exceptions::request_execution_exception& e) {
-            alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
-            return make_ready_future();
-        }
-    });
-}
-
 future<> default_authorizer::revoke_all(const resource& resource, ::service::group0_batch& mc) {
-    if (legacy_mode(_qp)) {
-        co_return co_await revoke_all_legacy(resource);
-    }
-
    if (resource.kind() == resource_kind::data &&
            data_resource_view(resource).is_keyspace()) {
        revoke_all_keyspace_resources(resource, mc);
@@ -321,7 +157,7 @@ future<> default_authorizer::revoke_all(const resource& resource, ::service::gro
    auto gen = [this, name] (api::timestamp_type t) -> ::service::mutations_generator {
        const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? ALLOW FILTERING",
                ROLE_NAME,
-                get_auth_ks_name(_qp),
+                db::system_keyspace::NAME,
                PERMISSIONS_CF,
                RESOURCE_NAME);
        auto res = co_await _qp.execute_internal(
@@ -331,7 +167,7 @@ future<> default_authorizer::revoke_all(const resource& resource, ::service::gro
                cql3::query_processor::cache_internal::no);
        for (const auto& r : *res) {
            const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
-                    get_auth_ks_name(_qp),
+                    db::system_keyspace::NAME,
                    PERMISSIONS_CF,
                    ROLE_NAME,
                    RESOURCE_NAME);
@@ -356,7 +192,7 @@ void default_authorizer::revoke_all_keyspace_resources(const resource& ks_resour
        const sstring query = seastar::format("SELECT {}, {} FROM {}.{}",
                ROLE_NAME,
                RESOURCE_NAME,
-                get_auth_ks_name(_qp),
+                db::system_keyspace::NAME,
                PERMISSIONS_CF);
        auto res = co_await _qp.execute_internal(
                query,
@@ -371,7 +207,7 @@ void default_authorizer::revoke_all_keyspace_resources(const resource& ks_resour
                continue;
            }
            const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
-                    get_auth_ks_name(_qp),
+                    db::system_keyspace::NAME,
                    PERMISSIONS_CF,
                    ROLE_NAME,
                    RESOURCE_NAME);
--- a/auth/default_authorizer.hh
+++ b/auth/default_authorizer.hh
@@ -27,14 +27,12 @@ namespace auth {
 class default_authorizer : public authorizer {
    cql3::query_processor& _qp;

-    ::service::migration_manager& _migration_manager;
-
    abort_source _as{};

    future<> _finished{make_ready_future<>()};

 public:
-    default_authorizer(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
+    default_authorizer(cql3::query_processor&);

    ~default_authorizer();

@@ -59,16 +57,6 @@ public:
    virtual const resource_set& protected_resources() const override;

 private:
-    future<> start_legacy();
-
-    bool legacy_metadata_exists() const;
-
-    future<> revoke_all_legacy(const resource&);
-
-    future<bool> legacy_any_granted() const;
-
-    future<> migrate_legacy_metadata();
-
    future<> modify(std::string_view, permission_set, const resource&, std::string_view, ::service::group0_batch&);

    void revoke_all_keyspace_resources(const resource& ks_resource, ::service::group0_batch& mc);
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -24,7 +24,6 @@
 #include "exceptions/exceptions.hh"
 #include "seastarx.hh"
 #include "service/raft/raft_group0_client.hh"
-#include "utils/class_registrator.hh"
 #include "db/config.hh"
 #include "utils/exponential_backoff_retry.hh"

@@ -72,26 +71,22 @@ std::vector<sstring> get_attr_values(LDAP* ld, LDAPMessage* res, const char* att
    return values;
 }

-const char* ldap_role_manager_full_name = "com.scylladb.auth.LDAPRoleManager";
-
 } // anonymous namespace

 namespace auth {

-static const class_registrator<
-    role_manager,
-    ldap_role_manager,
-    cql3::query_processor&,
-    ::service::raft_group0_client&,
-    ::service::migration_manager&,
-    cache&> registration(ldap_role_manager_full_name);
-
 ldap_role_manager::ldap_role_manager(
        std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
+        uint32_t permissions_update_interval_in_ms,
+        utils::observer<uint32_t>  permissions_update_interval_in_ms_observer,
        cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
        : _std_mgr(qp, rg0c, mm, cache), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
        , _bind_password(bind_password)
-        , _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this))) {
+        , _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
+        , _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
+        , _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
+        , _cache(cache)
+        , _cache_pruner(make_ready_future<>()) {
 }

 ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
@@ -100,6 +95,8 @@ ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_
            qp.db().get_config().ldap_attr_role(),
            qp.db().get_config().ldap_bind_dn(),
            qp.db().get_config().ldap_bind_passwd(),
+            qp.db().get_config().permissions_update_interval_in_ms(),
+            qp.db().get_config().permissions_update_interval_in_ms.observe([this] (const uint32_t& v) { _permissions_update_interval_in_ms = v; }),
            qp,
            rg0c,
            mm,
@@ -107,7 +104,7 @@ ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_
 }

 std::string_view ldap_role_manager::qualified_java_name() const noexcept {
-    return ldap_role_manager_full_name;
+    return "com.scylladb.auth.LDAPRoleManager";
 }

 const resource_set& ldap_role_manager::protected_resources() const {
@@ -119,6 +116,22 @@ future<> ldap_role_manager::start() {
        return make_exception_future(
                std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
    }
+    _cache_pruner = futurize_invoke([this] () -> future<> { // background loop; its future is awaited in stop()
+        while (true) { // each pass: sleep one interval (value re-read every pass), then reload permissions
+            try {
+                co_await seastar::sleep_abortable(std::chrono::milliseconds(_permissions_update_interval_in_ms), _as);
+            } catch (const seastar::sleep_aborted&) {
+                co_return; // _as was aborted (stop() requests it): leave the loop so the future resolves
+            }
+            co_await _cache.container().invoke_on_all([] (cache& c) -> future<> { // run on every cache instance in the container
+                try {
+                    co_await c.reload_all_permissions();
+                } catch (...) {
+                    mylog.warn("Cache reload all permissions failed: {}", std::current_exception()); // logged only, so the loop keeps running
+                }
+            });
+        }
+    });
    return _std_mgr.start();
 }

@@ -175,7 +188,11 @@ future<conn_ptr> ldap_role_manager::reconnect() {

 future<> ldap_role_manager::stop() {
    _as.request_abort();
-    return _std_mgr.stop().then([this] { return _connection_factory.stop(); });
+    return std::move(_cache_pruner).then([this] {
+        return _std_mgr.stop();
+    }).then([this] {
+        return _connection_factory.stop();
+    });
 }

 future<> ldap_role_manager::create(std::string_view name, const role_config& config, ::service::group0_batch& mc) {
--- a/auth/ldap_role_manager.hh
+++ b/auth/ldap_role_manager.hh
@@ -10,6 +10,7 @@
 #pragma once

 #include <seastar/core/abort_source.hh>
+#include <seastar/core/future.hh>
 #include <stdexcept>

 #include "ent/ldap/ldap_connection.hh"
@@ -34,22 +35,29 @@ class ldap_role_manager : public role_manager {
    seastar::sstring _target_attr; ///< LDAP entry attribute containing the Scylla role name.
    seastar::sstring _bind_name; ///< Username for LDAP simple bind.
    seastar::sstring _bind_password; ///< Password for LDAP simple bind.
+
+    uint32_t _permissions_update_interval_in_ms;
+    utils::observer<uint32_t> _permissions_update_interval_in_ms_observer;
+
    mutable ldap_reuser _connection_factory; // Potentially modified by query_granted().
    seastar::abort_source _as;
+    cache& _cache;
+    seastar::future<> _cache_pruner;
  public:
    ldap_role_manager(
            std::string_view query_template, ///< LDAP query template as described in Scylla documentation.
            std::string_view target_attr, ///< LDAP entry attribute containing the Scylla role name.
            std::string_view bind_name, ///< LDAP bind credentials.
            std::string_view bind_password, ///< LDAP bind credentials.
+            uint32_t permissions_update_interval_in_ms,
+            utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
            cql3::query_processor& qp, ///< Passed to standard_role_manager.
            ::service::raft_group0_client& rg0c, ///< Passed to standard_role_manager.
            ::service::migration_manager& mm, ///< Passed to standard_role_manager.
            cache& cache ///< Passed to standard_role_manager.
    );

-    /// Retrieves LDAP configuration entries from qp and invokes the other constructor.  Required by
-    /// class_registrator<role_manager>.
+    /// Retrieves LDAP configuration entries from qp and invokes the other constructor.
    ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache);

    /// Thrown when query-template parsing fails.
--- a/auth/maintenance_socket_authenticator.cc
+++ b/auth/maintenance_socket_authenticator.cc
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2026-present ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+#include "auth/maintenance_socket_authenticator.hh"
+
+
+namespace auth {
+
+maintenance_socket_authenticator::~maintenance_socket_authenticator() { // out-of-line definition; the body is intentionally empty
+}
+
+future<> maintenance_socket_authenticator::start() { // performs no start-up work of its own
+    return make_ready_future<>(); // already-resolved future
+}
+
+future<> maintenance_socket_authenticator::ensure_superuser_is_created() const { // no-op override: nothing is created or checked here
+    return make_ready_future<>(); // already-resolved future
+}
+
+bool maintenance_socket_authenticator::require_authentication() const { // always reports that authentication is not required
+    return false;
+}
+
+} // namespace auth
--- a/auth/maintenance_socket_authenticator.hh
+++ b/auth/maintenance_socket_authenticator.hh
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2026-present ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+#pragma once
+
+#include <seastar/core/shared_future.hh>
+
+#include "password_authenticator.hh"
+
+namespace auth {
+
+// maintenance_socket_authenticator is used for clients connecting to the
+// maintenance socket. It does not require authentication,
+// while still allowing the managing of roles and their credentials.
+class maintenance_socket_authenticator : public password_authenticator {
+public:
+    using password_authenticator::password_authenticator; // reuse the base-class constructors unchanged
+
+    virtual ~maintenance_socket_authenticator(); // defined out of line in the .cc file
+
+    virtual future<> start() override; // replaces the base-class start-up
+
+    virtual future<> ensure_superuser_is_created() const override; // replaces the base-class superuser setup
+
+    bool require_authentication() const override; // returns false for this authenticator
+};
+
+} // namespace auth
+
--- a/auth/maintenance_socket_authorizer.hh
+++ b/auth/maintenance_socket_authorizer.hh
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2026-present ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+#pragma once
+
+#include "auth/default_authorizer.hh"
+#include "auth/permission.hh"
+
+namespace auth {
+
+// maintenance_socket_authorizer is used for clients connecting to the
+// maintenance socket. It grants all permissions unconditionally (like
+// AllowAllAuthorizer) while still supporting grant/revoke operations
+// (delegated to the underlying CassandraAuthorizer / default_authorizer).
+class maintenance_socket_authorizer : public default_authorizer {
+public:
+    using default_authorizer::default_authorizer; // reuse the base-class constructors unchanged
+
+    ~maintenance_socket_authorizer() override = default;
+
+    future<> start() override { // no start-up work: resolves immediately
+        return make_ready_future<>();
+    }
+
+    future<permission_set> authorize(const role_or_anonymous&, const resource&) const override { // both arguments are ignored
+        return make_ready_future<permission_set>(permissions::ALL); // every role gets the full permission set on every resource
+    }
+};
+
+} // namespace auth
--- a/auth/maintenance_socket_role_manager.cc
+++ b/auth/maintenance_socket_role_manager.cc
@@ -13,23 +13,48 @@
 #include <string_view>
 #include "auth/cache.hh"
 #include "cql3/description.hh"
-#include "utils/class_registrator.hh"
+#include "utils/log.hh"
+#include "utils/on_internal_error.hh"

 namespace auth {

-constexpr std::string_view maintenance_socket_role_manager_name = "com.scylladb.auth.MaintenanceSocketRoleManager";
+static logging::logger log("maintenance_socket_role_manager");

-static const class_registrator<
-        role_manager,
-        maintenance_socket_role_manager,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        cache&> registration(sstring{maintenance_socket_role_manager_name});
+future<> maintenance_socket_role_manager::ensure_role_operations_are_enabled() { // creates and starts the delegate standard_role_manager
+    if (_is_maintenance_mode) {
+        on_internal_error(log, "enabling role operations not allowed in maintenance mode");
+    }

+    if (_std_mgr.has_value()) {
+        on_internal_error(log, "role operations are already enabled");
+    }
+    // Neither check reported an error: construct the delegate in place and hand back the future of its start().
+    _std_mgr.emplace(_qp, _group0_client, _migration_manager, _cache);
+    return _std_mgr->start();
+}
+
+void maintenance_socket_role_manager::set_maintenance_mode() { // switches this manager into maintenance mode
+    if (_std_mgr.has_value()) {
+        on_internal_error(log, "cannot enter maintenance mode after role operations have been enabled");
+    }
+    _is_maintenance_mode = true; // validate_operation() returns a failed future for every operation once this is set
+}
+
+maintenance_socket_role_manager::maintenance_socket_role_manager(
+        cql3::query_processor& qp,
+        ::service::raft_group0_client& rg0c,
+        ::service::migration_manager& mm,
+        cache& c)
+    : _qp(qp) // the four references are only stored here; ensure_role_operations_are_enabled() passes them to the delegate
+    , _group0_client(rg0c)
+    , _migration_manager(mm)
+    , _cache(c)
+    , _std_mgr(std::nullopt) // no delegate manager exists after construction
+    , _is_maintenance_mode(false) {
+}

 std::string_view maintenance_socket_role_manager::qualified_java_name() const noexcept {
-    return maintenance_socket_role_manager_name;
+    return "com.scylladb.auth.MaintenanceSocketRoleManager";
 }

 const resource_set& maintenance_socket_role_manager::protected_resources() const {
@@ -43,81 +68,161 @@ future<> maintenance_socket_role_manager::start() {
 }

 future<> maintenance_socket_role_manager::stop() {
-    return make_ready_future<>();
+    return _std_mgr ? _std_mgr->stop() : make_ready_future<>();
 }

 future<> maintenance_socket_role_manager::ensure_superuser_is_created() {
-    return make_ready_future<>();
+    return _std_mgr ? _std_mgr->ensure_superuser_is_created() : make_ready_future<>();
 }

 template<typename T = void>
-future<T> operation_not_supported_exception(std::string_view operation) {
+future<T> operation_not_available_in_maintenance_mode_exception(std::string_view operation) {
    return make_exception_future<T>(
-        std::runtime_error(fmt::format("role manager: {} operation not supported through maintenance socket", operation)));
+        std::runtime_error(fmt::format("role manager: {} operation not available through maintenance socket in maintenance mode", operation)));
 }

-future<> maintenance_socket_role_manager::create(std::string_view role_name, const role_config&, ::service::group0_batch&) {
-    return operation_not_supported_exception("CREATE");
+template<typename T = void>
+future<T> manager_not_ready_exception(std::string_view operation) { // builds a failed future<T>; `operation` is only used in the message
+    return make_exception_future<T>(
+        std::runtime_error(fmt::format("role manager: {} operation not available because manager not ready yet (role operations not enabled)", operation)));
+}
+
+// Returns a ready future when operation `name` may be forwarded to the delegate, a failed future otherwise.
+future<> maintenance_socket_role_manager::validate_operation(std::string_view name) const {
+    // Maintenance mode is checked first, so it is the error reported even though no delegate exists then.
+    if (_is_maintenance_mode) {
+        return operation_not_available_in_maintenance_mode_exception(name);
+    }
+    // Outside maintenance mode the operation is allowed only once the delegate manager has been created.
+    return _std_mgr.has_value() ? make_ready_future<>() : manager_not_ready_exception(name);
+}
+
+future<> maintenance_socket_role_manager::create(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
+    auto f = validate_operation("CREATE");
+    if (f.failed()) {
+        return f;
+    }
+    return _std_mgr->create(role_name, c, mc);
 }

 future<> maintenance_socket_role_manager::drop(std::string_view role_name, ::service::group0_batch& mc) {
-    return operation_not_supported_exception("DROP");
+    auto f = validate_operation("DROP");
+    if (f.failed()) {
+        return f;
+    }
+    return _std_mgr->drop(role_name, mc);
 }

-future<> maintenance_socket_role_manager::alter(std::string_view role_name, const role_config_update&, ::service::group0_batch&) {
-    return operation_not_supported_exception("ALTER");
+future<> maintenance_socket_role_manager::alter(std::string_view role_name, const role_config_update& u, ::service::group0_batch& mc) {
+    auto f = validate_operation("ALTER");
+    if (f.failed()) {
+        return f;
+    }
+    return _std_mgr->alter(role_name, u, mc);
 }

 future<> maintenance_socket_role_manager::grant(std::string_view grantee_name, std::string_view role_name, ::service::group0_batch& mc) {
-    return operation_not_supported_exception("GRANT");
+    auto f = validate_operation("GRANT");
+    if (f.failed()) {
+        return f;
+    }
+    return _std_mgr->grant(grantee_name, role_name, mc);
 }

 future<> maintenance_socket_role_manager::revoke(std::string_view revokee_name, std::string_view role_name, ::service::group0_batch& mc) {
-    return operation_not_supported_exception("REVOKE");
+    auto f = validate_operation("REVOKE");
+    if (f.failed()) {
+        return f;
+    }
+    return _std_mgr->revoke(revokee_name, role_name, mc);
 }

-future<role_set> maintenance_socket_role_manager::query_granted(std::string_view grantee_name, recursive_role_query) {
-    return operation_not_supported_exception<role_set>("QUERY GRANTED");
+future<role_set> maintenance_socket_role_manager::query_granted(std::string_view grantee_name, recursive_role_query m) {
+    auto f = validate_operation("QUERY GRANTED");
+    if (f.failed()) {
+        return make_exception_future<role_set>(f.get_exception());
+    }
+    return _std_mgr->query_granted(grantee_name, m);
 }

-future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state&) {
-    return operation_not_supported_exception<role_to_directly_granted_map>("QUERY ALL DIRECTLY GRANTED");
+future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state& qs) {
+    auto f = validate_operation("QUERY ALL DIRECTLY GRANTED");
+    if (f.failed()) {
+        return make_exception_future<role_to_directly_granted_map>(f.get_exception());
+    }
+    return _std_mgr->query_all_directly_granted(qs);
 }

-future<role_set> maintenance_socket_role_manager::query_all(::service::query_state&) {
-    return operation_not_supported_exception<role_set>("QUERY ALL");
+future<role_set> maintenance_socket_role_manager::query_all(::service::query_state& qs) {
+    auto f = validate_operation("QUERY ALL");
+    if (f.failed()) {
+        return make_exception_future<role_set>(f.get_exception());
+    }
+    return _std_mgr->query_all(qs);
 }

 future<bool> maintenance_socket_role_manager::exists(std::string_view role_name) {
-    return operation_not_supported_exception<bool>("EXISTS");
+    auto f = validate_operation("EXISTS");
+    if (f.failed()) {
+        return make_exception_future<bool>(f.get_exception());
+    }
+    return _std_mgr->exists(role_name);
 }

 future<bool> maintenance_socket_role_manager::is_superuser(std::string_view role_name) {
-    return make_ready_future<bool>(true);
+    auto f = validate_operation("IS SUPERUSER");
+    if (f.failed()) {
+        return make_exception_future<bool>(f.get_exception());
+    }
+    return _std_mgr->is_superuser(role_name);
 }

 future<bool> maintenance_socket_role_manager::can_login(std::string_view role_name) {
-    return make_ready_future<bool>(true);
+    auto f = validate_operation("CAN LOGIN");
+    if (f.failed()) {
+        return make_exception_future<bool>(f.get_exception());
+    }
+    return _std_mgr->can_login(role_name);
 }

-future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) {
-    return operation_not_supported_exception<std::optional<sstring>>("GET ATTRIBUTE");
+future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
+    auto f = validate_operation("GET ATTRIBUTE");
+    if (f.failed()) {
+        return make_exception_future<std::optional<sstring>>(f.get_exception());
+    }
+    return _std_mgr->get_attribute(role_name, attribute_name, qs);
 }

-future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) {
-    return operation_not_supported_exception<role_manager::attribute_vals>("QUERY ATTRIBUTE");
+future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
+    auto f = validate_operation("QUERY ATTRIBUTE FOR ALL");
+    if (f.failed()) {
+        return make_exception_future<role_manager::attribute_vals>(f.get_exception());
+    }
+    return _std_mgr->query_attribute_for_all(attribute_name, qs);
 }

 future<> maintenance_socket_role_manager::set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) {
-    return operation_not_supported_exception("SET ATTRIBUTE");
+    auto f = validate_operation("SET ATTRIBUTE");
+    if (f.failed()) {
+        return f;
+    }
+    return _std_mgr->set_attribute(role_name, attribute_name, attribute_value, mc);
 }

 future<> maintenance_socket_role_manager::remove_attribute(std::string_view role_name, std::string_view attribute_name, ::service::group0_batch& mc) {
-    return operation_not_supported_exception("REMOVE ATTRIBUTE");
+    auto f = validate_operation("REMOVE ATTRIBUTE");
+    if (f.failed()) {
+        return f;
+    }
+    return _std_mgr->remove_attribute(role_name, attribute_name, mc);
 }

 future<std::vector<cql3::description>> maintenance_socket_role_manager::describe_role_grants() {
-    return operation_not_supported_exception<std::vector<cql3::description>>("DESCRIBE SCHEMA WITH INTERNALS");
+    auto f = validate_operation("DESCRIBE ROLE GRANTS");
+    if (f.failed()) {
+        return make_exception_future<std::vector<cql3::description>>(f.get_exception());
+    }
+    return _std_mgr->describe_role_grants();
 }

 } // namespace auth
--- a/auth/maintenance_socket_role_manager.hh
+++ b/auth/maintenance_socket_role_manager.hh
@@ -11,6 +11,7 @@
 #include "auth/cache.hh"
 #include "auth/resource.hh"
 #include "auth/role_manager.hh"
+#include "auth/standard_role_manager.hh"
 #include <seastar/core/future.hh>

 namespace cql3 {
@@ -24,13 +25,26 @@ class raft_group0_client;

 namespace auth {

-extern const std::string_view maintenance_socket_role_manager_name;
-
-// This role manager is used by the maintenance socket. It has disabled all role management operations to not depend on
-// system_auth keyspace, which may be not yet created when the maintenance socket starts listening.
+// This role manager is used by the maintenance socket. In maintenance mode it rejects all role
+// management operations. In normal mode it delegates all operations to a standard_role_manager,
+// which is created on demand when the node joins the cluster.
 class maintenance_socket_role_manager final : public role_manager {
+    cql3::query_processor& _qp;
+    ::service::raft_group0_client& _group0_client;
+    ::service::migration_manager& _migration_manager;
+    cache& _cache;
+    std::optional<standard_role_manager> _std_mgr;
+    bool _is_maintenance_mode;
+
 public:
-    maintenance_socket_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&) {}
+    void set_maintenance_mode() override;
+
+    // Ensures role management operations are enabled.
+    // It must be called once the node has joined the cluster.
+    // In the meantime all role management operations will fail.
+    future<> ensure_role_operations_are_enabled() override;
+
+    maintenance_socket_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);

    virtual std::string_view qualified_java_name() const noexcept override;

@@ -42,21 +56,21 @@ public:

    virtual future<> ensure_superuser_is_created() override;

-    virtual future<> create(std::string_view role_name, const role_config&, ::service::group0_batch&) override;
+    virtual future<> create(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) override;

    virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override;

-    virtual future<> alter(std::string_view role_name, const role_config_update&, ::service::group0_batch&) override;
+    virtual future<> alter(std::string_view role_name, const role_config_update& u, ::service::group0_batch& mc) override;

    virtual future<> grant(std::string_view grantee_name, std::string_view role_name, ::service::group0_batch& mc) override;

    virtual future<> revoke(std::string_view revokee_name, std::string_view role_name, ::service::group0_batch& mc) override;

-    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;
+    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query m) override;

-    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state& qs) override;

-    virtual future<role_set> query_all(::service::query_state&) override;
+    virtual future<role_set> query_all(::service::query_state& qs) override;

    virtual future<bool> exists(std::string_view role_name) override;

@@ -64,15 +78,19 @@ public:

    virtual future<bool> can_login(std::string_view role_name) override;

-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) override;

-    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;
+    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) override;

    virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

    virtual future<> remove_attribute(std::string_view role_name, std::string_view attribute_name, ::service::group0_batch& mc) override;

    virtual future<std::vector<cql3::description>> describe_role_grants() override;
+
+private:
+    future<> validate_operation(std::string_view name) const;
+
 };

 }
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -26,10 +26,9 @@
 #include "cql3/untyped_result_set.hh"
 #include "utils/log.hh"
 #include "service/migration_manager.hh"
-#include "utils/class_registrator.hh"
-#include "replica/database.hh"
 #include "cql3/query_processor.hh"
 #include "db/config.hh"
+#include "db/system_keyspace.hh"

 namespace auth {

@@ -37,29 +36,10 @@ constexpr std::string_view password_authenticator_name("org.apache.cassandra.aut

 // name of the hash column.
 static constexpr std::string_view SALTED_HASH = "salted_hash";
-static constexpr std::string_view DEFAULT_USER_NAME = meta::DEFAULT_SUPERUSER_NAME;
-static const sstring DEFAULT_USER_PASSWORD = sstring(meta::DEFAULT_SUPERUSER_NAME);
-
 static logging::logger plogger("password_authenticator");

-// To ensure correct initialization order, we unfortunately need to use a string literal.
-static const class_registrator<
-        authenticator,
-        password_authenticator,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        cache&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");
-
 static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());

-static std::string_view get_config_value(std::string_view value, std::string_view def) {
-    return value.empty() ? def : value;
-}
-std::string password_authenticator::default_superuser(const db::config& cfg) {
-    return std::string(get_config_value(cfg.auth_superuser_name(), DEFAULT_USER_NAME));
-}
-
 password_authenticator::~password_authenticator() {
 }

@@ -69,7 +49,6 @@ password_authenticator::password_authenticator(cql3::query_processor& qp, ::serv
    , _migration_manager(mm)
    , _cache(cache)
    , _stopped(make_ready_future<>()) 
-    , _superuser(default_superuser(qp.db().get_config()))
 {}

 static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
@@ -78,76 +57,18 @@ static bool has_salted_hash(const cql3::untyped_result_set_row& row) {

 sstring password_authenticator::update_row_query() const {
    return seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
-            get_auth_ks_name(_qp),
+            db::system_keyspace::NAME,
            meta::roles_table::name,
            SALTED_HASH,
            meta::roles_table::role_col_name);
 }

-static const sstring legacy_table_name{"credentials"};
-
-bool password_authenticator::legacy_metadata_exists() const {
-    return _qp.db().has_schema(meta::legacy::AUTH_KS, legacy_table_name);
-}
-
-future<> password_authenticator::migrate_legacy_metadata() const {
-    plogger.info("Starting migration of legacy authentication metadata.");
-    static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);
-
-    return _qp.execute_internal(
-            query,
-            db::consistency_level::QUORUM,
-            internal_distributed_query_state(),
-            cql3::query_processor::cache_internal::no).then([this](::shared_ptr<cql3::untyped_result_set> results) {
-        return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
-            auto username = row.get_as<sstring>("username");
-            auto salted_hash = row.get_as<sstring>(SALTED_HASH);
-            static const auto query = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
-                    meta::legacy::AUTH_KS,
-                    meta::roles_table::name,
-                    SALTED_HASH,
-                    meta::roles_table::role_col_name);
-            return _qp.execute_internal(
-                    query,
-                    consistency_for_user(username),
-                    internal_distributed_query_state(),
-                    {std::move(salted_hash), username},
-                    cql3::query_processor::cache_internal::no).discard_result();
-        }).finally([results] {});
-    }).then([] {
-       plogger.info("Finished migrating legacy authentication metadata.");
-    }).handle_exception([](std::exception_ptr ep) {
-        plogger.error("Encountered an error during migration!");
-        std::rethrow_exception(ep);
-    });
-}
-
-future<> password_authenticator::legacy_create_default_if_missing() {
-    const auto exists = co_await legacy::default_role_row_satisfies(_qp, &has_salted_hash, _superuser);
-    if (exists) {
-        co_return;
-    }
-    std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
-    if (salted_pwd.empty()) {
-        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
-    }
-    const auto query = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
-            meta::legacy::AUTH_KS,
-            meta::roles_table::name,
-            SALTED_HASH,
-            meta::roles_table::role_col_name);
-    co_await _qp.execute_internal(
-            query,
-            db::consistency_level::QUORUM,
-            internal_distributed_query_state(),
-            {salted_pwd, _superuser},
-            cql3::query_processor::cache_internal::no);
-    plogger.info("Created default superuser authentication record.");
-}
-
 future<> password_authenticator::maybe_create_default_password() {
    auto needs_password = [this] () -> future<bool> {
-        const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
+        if (default_superuser(_qp).empty()) {
+            co_return false;
+        }
+        const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", db::system_keyspace::NAME, meta::roles_table::name);
        auto results = co_await _qp.execute_internal(query,
                db::consistency_level::LOCAL_ONE,
                internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
@@ -157,7 +78,7 @@ future<> password_authenticator::maybe_create_default_password() {
        bool has_default = false;
        bool has_superuser_with_password = false;
        for (auto& result : *results) {
-            if (result.get_as<sstring>(meta::roles_table::role_col_name) == _superuser) {
+            if (result.get_as<sstring>(meta::roles_table::role_col_name) == default_superuser(_qp)) {
                has_default = true;
            }
            if (has_salted_hash(result)) {
@@ -178,12 +99,12 @@ future<> password_authenticator::maybe_create_default_password() {
        co_return;
    }
    // Set default superuser's password.
-    std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
+    std::string salted_pwd(_qp.db().get_config().auth_superuser_salted_password());
    if (salted_pwd.empty()) {
-        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
+        co_return;
    }
    const auto update_query = update_row_query();
-    co_await collect_mutations(_qp, batch, update_query, {salted_pwd, _superuser});
+    co_await collect_mutations(_qp, batch, update_query, {salted_pwd, default_superuser(_qp)});
    co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
    plogger.info("Created default superuser authentication record.");
 }
@@ -216,58 +137,14 @@ future<> password_authenticator::start() {

        _stopped = do_after_system_ready(_as, [this] {
            return async([this] {
-                if (legacy_mode(_qp)) {
-                    if (!_superuser_created_promise.available()) {
-                        // Counterintuitively, we mark promise as ready before any startup work
-                        // because wait_for_schema_agreement() below will block indefinitely
-                        // without cluster majority. In that case, blocking node startup
-                        // would lead to a cluster deadlock.
-                        _superuser_created_promise.set_value();
-                    }
-                    _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get();
-
-                    if (legacy::any_nondefault_role_row_satisfies(_qp, &has_salted_hash, _superuser).get()) {
-                        if (legacy_metadata_exists()) {
-                            plogger.warn("Ignoring legacy authentication metadata since nondefault data already exist.");
-                        }
-
-                        return;
-                    }
-
-                    if (legacy_metadata_exists()) {
-                        migrate_legacy_metadata().get();
-                        return;
-                    }
-                    legacy_create_default_if_missing().get();
-                }
                utils::get_local_injector().inject("password_authenticator_start_pause", utils::wait_for_message(5min)).get();
-                if (!legacy_mode(_qp)) {
-                    maybe_create_default_password_with_retries().get();
-                    if (!_superuser_created_promise.available()) {
-                        _superuser_created_promise.set_value();
-                    }
+                maybe_create_default_password_with_retries().get();
+                if (!_superuser_created_promise.available()) {
+                    _superuser_created_promise.set_value();
                }
            });
        });

-        if (legacy_mode(_qp)) {
-            static const sstring create_roles_query = fmt::format(
-                    "CREATE TABLE {}.{} ("
-                    "  {} text PRIMARY KEY,"
-                    "  can_login boolean,"
-                    "  is_superuser boolean,"
-                    "  member_of set<text>,"
-                    "  salted_hash text"
-                    ")",
-                    meta::legacy::AUTH_KS,
-                    meta::roles_table::name,
-                    meta::roles_table::role_col_name);
-            return create_legacy_metadata_table_if_missing(
-                    meta::roles_table::name,
-                    _qp,
-                    create_roles_query,
-                    _migration_manager);
-        }
        return make_ready_future<>();
    });
 }
@@ -277,15 +154,6 @@ future<> password_authenticator::stop() {
    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
 }

-db::consistency_level password_authenticator::consistency_for_user(std::string_view role_name) {
-    // TODO: this is plain dung. Why treat hardcoded default special, but for example a user-created
-    // super user uses plain LOCAL_ONE?
-    if (role_name == DEFAULT_USER_NAME) {
-        return db::consistency_level::QUORUM;
-    }
-    return db::consistency_level::LOCAL_ONE;
-}
-
 std::string_view password_authenticator::qualified_java_name() const {
    return password_authenticator_name;
 }
@@ -315,20 +183,12 @@ future<authenticated_user> password_authenticator::authenticate(
    const sstring password = credentials.at(PASSWORD_KEY);

    try {
-        std::optional<sstring> salted_hash;
-        if (legacy_mode(_qp)) {
-            salted_hash = co_await get_password_hash(username);
-            if (!salted_hash) {
-                throw exceptions::authentication_exception("Username and/or password are incorrect");
-            }
-        } else {
-            auto role = _cache.get(username);
-            if (!role || role->salted_hash.empty()) {
-                throw exceptions::authentication_exception("Username and/or password are incorrect");
-            }
-            salted_hash = role->salted_hash;
+        auto role = _cache.get(username);
+        if (!role || role->salted_hash.empty()) {
+            throw exceptions::authentication_exception("Username and/or password are incorrect");
        }
-        const bool password_match = co_await passwords::check(password, *salted_hash);
+        const auto& salted_hash = role->salted_hash;
+        const bool password_match = co_await passwords::check(password, salted_hash);
        if (!password_match) {
            throw exceptions::authentication_exception("Username and/or password are incorrect");
        }
@@ -367,16 +227,7 @@ future<> password_authenticator::create(std::string_view role_name, const authen
    }

    const auto query = update_row_query();
-    if (legacy_mode(_qp)) {
-        co_await _qp.execute_internal(
-                query,
-                consistency_for_user(role_name),
-                internal_distributed_query_state(),
-                {std::move(*maybe_hash), sstring(role_name)},
-                cql3::query_processor::cache_internal::no).discard_result();
-    } else {
-        co_await collect_mutations(_qp, mc, query, {std::move(*maybe_hash), sstring(role_name)});
-    }
+    co_await collect_mutations(_qp, mc, query, {std::move(*maybe_hash), sstring(role_name)});
 }

 future<> password_authenticator::alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) {
@@ -387,38 +238,21 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
    const auto password = std::get<password_option>(*options.credentials).password;

    const sstring query = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
-            get_auth_ks_name(_qp),
+            db::system_keyspace::NAME,
            meta::roles_table::name,
            SALTED_HASH,
            meta::roles_table::role_col_name);
-    if (legacy_mode(_qp)) {
-        co_await _qp.execute_internal(
-                query,
-                consistency_for_user(role_name),
-                internal_distributed_query_state(),
-                {passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)},
-                cql3::query_processor::cache_internal::no).discard_result();
-    } else {
-        co_await collect_mutations(_qp, mc, query,
-                {passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)});
-    }
+    co_await collect_mutations(_qp, mc, query,
+            {passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)});
 }

 future<> password_authenticator::drop(std::string_view name, ::service::group0_batch& mc) {
    const sstring query = seastar::format("DELETE {} FROM {}.{} WHERE {} = ?",
            SALTED_HASH,
-            get_auth_ks_name(_qp),
+            db::system_keyspace::NAME,
            meta::roles_table::name,
            meta::roles_table::role_col_name);
-    if (legacy_mode(_qp)) {
-        co_await _qp.execute_internal(
-                query, consistency_for_user(name),
-                internal_distributed_query_state(),
-                {sstring(name)},
-                cql3::query_processor::cache_internal::no).discard_result();
-    } else {
-        co_await collect_mutations(_qp, mc, query, {sstring(name)});
-    }
+    co_await collect_mutations(_qp, mc, query, {sstring(name)});
 }

 future<custom_options> password_authenticator::query_custom_options(std::string_view role_name) const {
@@ -437,13 +271,13 @@ future<std::optional<sstring>> password_authenticator::get_password_hash(std::st
    // that a map lookup string->statement is not gonna kill us much.
    const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ?",
                SALTED_HASH,
-                get_auth_ks_name(_qp),
+                db::system_keyspace::NAME,
                meta::roles_table::name,
                meta::roles_table::role_col_name);

    const auto res = co_await _qp.execute_internal(
            query,
-            consistency_for_user(role_name),
+            db::consistency_level::LOCAL_ONE,
            internal_distributed_query_state(),
            {role_name},
            cql3::query_processor::cache_internal::yes);
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -13,7 +13,6 @@
 #include <seastar/core/abort_source.hh>
 #include <seastar/core/shared_future.hh>

-#include "db/consistency_level_type.hh"
 #include "auth/authenticator.hh"
 #include "auth/passwords.hh"
 #include "auth/cache.hh"
@@ -44,15 +43,11 @@ class password_authenticator : public authenticator {
    cache& _cache;
    future<> _stopped;
    abort_source _as;
-    std::string _superuser; // default superuser name from the config (may or may not be present in roles table)
    shared_promise<> _superuser_created_promise;
    // We used to also support bcrypt, SHA-256, and MD5 (ref. scylladb#24524).
    constexpr static auth::passwords::scheme _scheme = passwords::scheme::sha_512;

 public:
-    static db::consistency_level consistency_for_user(std::string_view role_name);
-    static std::string default_superuser(const db::config&);
-
    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);

    ~password_authenticator();
@@ -90,12 +85,6 @@ public:
    virtual future<> ensure_superuser_is_created() const override;

 private:
-    bool legacy_metadata_exists() const;
-
-    future<> migrate_legacy_metadata() const;
-
-    future<> legacy_create_default_if_missing();
-
    future<> maybe_create_default_password();
    future<> maybe_create_default_password_with_retries();

--- a/auth/permissions_cache.cc
+++ b/auth/permissions_cache.cc
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2017-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#include "auth/permissions_cache.hh"
-
-#include <fmt/ranges.h>
-#include "auth/authorizer.hh"
-#include "auth/service.hh"
-
-namespace auth {
-
-permissions_cache::permissions_cache(const utils::loading_cache_config& c, service& ser, logging::logger& log)
-        : _cache(c, log, [&ser, &log](const key_type& k) {
-              log.debug("Refreshing permissions for {}", k.first);
-              return ser.get_uncached_permissions(k.first, k.second);
-          }) {
-}
-
-bool permissions_cache::update_config(utils::loading_cache_config c) {
-    return _cache.update_config(std::move(c));
-}
-
-void permissions_cache::reset() {
-    _cache.reset();
-}
-
-future<permission_set> permissions_cache::get(const role_or_anonymous& maybe_role, const resource& r) {
-    return do_with(key_type(maybe_role, r), [this](const auto& k) {
-        return _cache.get(k);
-    });
-}
-
-}
--- a/auth/permissions_cache.hh
+++ b/auth/permissions_cache.hh
@@ -1,66 +0,0 @@
-/*
- * Copyright (C) 2017-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#pragma once
-
-#include <iostream>
-#include <utility>
-
-#include <fmt/core.h>
-#include <seastar/core/future.hh>
-
-#include "auth/permission.hh"
-#include "auth/resource.hh"
-#include "auth/role_or_anonymous.hh"
-#include "utils/log.hh"
-#include "utils/hash.hh"
-#include "utils/loading_cache.hh"
-
-namespace std {
-
-inline std::ostream& operator<<(std::ostream& os, const pair<auth::role_or_anonymous, auth::resource>& p) {
-    fmt::print(os, "{{role: {}, resource: {}}}", p.first, p.second);
-    return os;
-}
-
-}
-
-namespace db {
-class config;
-}
-
-namespace auth {
-
-class service;
-
-class permissions_cache final {
-    using cache_type = utils::loading_cache<
-            std::pair<role_or_anonymous, resource>,
-            permission_set,
-            1,
-            utils::loading_cache_reload_enabled::yes,
-            utils::simple_entry_size<permission_set>,
-            utils::tuple_hash>;
-
-    using key_type = typename cache_type::key_type;
-
-    cache_type _cache;
-
-public:
-    explicit permissions_cache(const utils::loading_cache_config&, service&, logging::logger&);
-
-    future <> stop() {
-        return _cache.stop();
-    }
-
-    bool update_config(utils::loading_cache_config);
-    void reset();
-    future<permission_set> get(const role_or_anonymous&, const resource&);
-};
-
-}
--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -112,6 +112,11 @@ public:

    virtual future<> stop() = 0;

+    ///
+    /// Notify that the maintenance mode is starting.
+    ///
+    virtual void set_maintenance_mode() {}
+
    ///
    /// Ensure that superuser role exists.
    ///
@@ -119,6 +124,11 @@ public:
    ///
    virtual future<> ensure_superuser_is_created() = 0;

+    ///
+    /// Ensure role management operations are enabled. Some role managers may defer initialization.
+    ///
+    virtual future<> ensure_role_operations_are_enabled() { return make_ready_future<>(); }
+
    ///
    /// \returns an exceptional future with \ref role_already_exists for a role that has previously been created.
    ///
--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2018-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#include "auth/roles-metadata.hh"
-
-#include <seastar/core/format.hh>
-#include <seastar/core/shared_ptr.hh>
-#include <seastar/core/sstring.hh>
-
-#include "auth/common.hh"
-#include "cql3/query_processor.hh"
-#include "cql3/untyped_result_set.hh"
-
-namespace auth {
-
-namespace legacy {
-
-future<bool> default_role_row_satisfies(
-        cql3::query_processor& qp,
-        std::function<bool(const cql3::untyped_result_set_row&)> p,
-        std::optional<std::string> rolename) {
-    const sstring query = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
-            auth::meta::legacy::AUTH_KS,
-            meta::roles_table::name,
-            meta::roles_table::role_col_name);
-
-    for (auto cl : { db::consistency_level::ONE, db::consistency_level::QUORUM }) {
-        auto results = co_await qp.execute_internal(query, cl
-            , internal_distributed_query_state()
-            , {rolename.value_or(std::string(auth::meta::DEFAULT_SUPERUSER_NAME))}
-            , cql3::query_processor::cache_internal::yes
-            );
-        if (!results->empty()) {
-            co_return p(results->one());
-        }
-    }
-    co_return false;
-}
-
-future<bool> any_nondefault_role_row_satisfies(
-        cql3::query_processor& qp,
-        std::function<bool(const cql3::untyped_result_set_row&)> p,
-        std::optional<std::string> rolename) {
-    const sstring query = seastar::format("SELECT * FROM {}.{}", auth::meta::legacy::AUTH_KS, meta::roles_table::name);
-
-    auto results = co_await qp.execute_internal(query, db::consistency_level::QUORUM
-        , internal_distributed_query_state(), cql3::query_processor::cache_internal::no
-        );
-    if (results->empty()) {
-        co_return false;
-    }
-    static const sstring col_name = sstring(meta::roles_table::role_col_name);
-
-    co_return std::ranges::any_of(*results, [&](const cql3::untyped_result_set_row& row) {
-        auto superuser = rolename ? std::string_view(*rolename) : meta::DEFAULT_SUPERUSER_NAME;
-        const bool is_nondefault = row.get_as<sstring>(col_name) != superuser;
-        return is_nondefault && p(row);
-    });
-}
-
-} // namespace legacy
-
-} // namespace auth
--- a/auth/roles-metadata.hh
+++ b/auth/roles-metadata.hh
@@ -8,18 +8,7 @@

 #pragma once

-#include <optional>
 #include <string_view>
-#include <functional>
-
-#include <seastar/core/future.hh>
-
-#include "seastarx.hh"
-
-namespace cql3 {
-class query_processor;
-class untyped_result_set_row;
-}

 namespace auth {

@@ -35,26 +24,4 @@ constexpr std::string_view role_col_name{"role", 4};

 } // namespace meta

-namespace legacy {
-
-///
-/// Check that the default role satisfies a predicate, or `false` if the default role does not exist.
-///
-future<bool> default_role_row_satisfies(
-        cql3::query_processor&,
-        std::function<bool(const cql3::untyped_result_set_row&)>,
-        std::optional<std::string> rolename = {}
-        );
-
-///
-/// Check that any nondefault role satisfies a predicate. `false` if no nondefault roles exist.
-///
-future<bool> any_nondefault_role_row_satisfies(
-        cql3::query_processor&,
-        std::function<bool(const cql3::untyped_result_set_row&)>,
-        std::optional<std::string> rolename = {}
-        );
-
-} // namespace legacy
-
 } // namespace auth
--- a/auth/saslauthd_authenticator.cc
+++ b/auth/saslauthd_authenticator.cc
@@ -22,21 +22,11 @@
 #include "db/config.hh"
 #include "utils/log.hh"
 #include "seastarx.hh"
-#include "utils/class_registrator.hh"

 namespace auth {

 static logging::logger mylog("saslauthd_authenticator");

-// To ensure correct initialization order, we unfortunately need to use a string literal.
-static const class_registrator<
-        authenticator,
-        saslauthd_authenticator,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        cache&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");
-
 saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, cache&)
    : _socket_path(qp.db().get_config().saslauthd_socket_path())
 {}
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -16,6 +16,8 @@
 #include <algorithm>
 #include <chrono>

+#include <boost/algorithm/string.hpp>
+
 #include <seastar/core/future-util.hh>
 #include <seastar/core/shard_id.hh>
 #include <seastar/core/sharded.hh>
@@ -23,8 +25,18 @@

 #include "auth/allow_all_authenticator.hh"
 #include "auth/allow_all_authorizer.hh"
+#include "auth/certificate_authenticator.hh"
 #include "auth/common.hh"
+#include "auth/default_authorizer.hh"
+#include "auth/ldap_role_manager.hh"
+#include "auth/maintenance_socket_authenticator.hh"
+#include "auth/maintenance_socket_authorizer.hh"
+#include "auth/maintenance_socket_role_manager.hh"
+#include "auth/password_authenticator.hh"
 #include "auth/role_or_anonymous.hh"
+#include "auth/saslauthd_authenticator.hh"
+#include "auth/standard_role_manager.hh"
+#include "auth/transitional.hh"
 #include "cql3/functions/functions.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/description.hh"
@@ -43,7 +55,6 @@
 #include "service/raft/raft_group0_client.hh"
 #include "mutation/timestamp.hh"
 #include "utils/assert.hh"
-#include "utils/class_registrator.hh"
 #include "locator/abstract_replication_strategy.hh"
 #include "data_dictionary/keyspace_metadata.hh"
 #include "service/storage_service.hh"
@@ -63,91 +74,6 @@ static const sstring superuser_col_name("super");

 static logging::logger log("auth_service");

-class auth_migration_listener final : public ::service::migration_listener {
-    authorizer& _authorizer;
-    cql3::query_processor& _qp;
-
-public:
-    explicit auth_migration_listener(authorizer& a, cql3::query_processor& qp) : _authorizer(a),  _qp(qp) {
-    }
-
-private:
-    void on_create_keyspace(const sstring& ks_name) override {}
-    void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {}
-    void on_create_user_type(const sstring& ks_name, const sstring& type_name) override {}
-    void on_create_function(const sstring& ks_name, const sstring& function_name) override {}
-    void on_create_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
-    void on_create_view(const sstring& ks_name, const sstring& view_name) override {}
-
-    void on_update_keyspace(const sstring& ks_name) override {}
-    void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool) override {}
-    void on_update_user_type(const sstring& ks_name, const sstring& type_name) override {}
-    void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
-    void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
-    void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}
-
-    void on_drop_keyspace(const sstring& ks_name) override {
-        if (!legacy_mode(_qp)) {
-            // in non legacy path revoke is part of schema change statement execution
-            return;
-        }
-        // Do it in the background.
-        (void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
-            return _authorizer.revoke_all(auth::make_data_resource(ks_name), mc);
-        }).handle_exception([] (std::exception_ptr e) {
-            log.error("Unexpected exception while revoking all permissions on dropped keyspace: {}", e);
-        });
-
-        (void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
-            return _authorizer.revoke_all(auth::make_functions_resource(ks_name), mc);
-        }).handle_exception([] (std::exception_ptr e) {
-            log.error("Unexpected exception while revoking all permissions on functions in dropped keyspace: {}", e);
-        });
-    }
-
-    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
-        if (!legacy_mode(_qp)) {
-            // in non legacy path revoke is part of schema change statement execution
-            return;
-        }
-        // Do it in the background.
-        (void)do_with(::service::group0_batch::unused(), [this, &ks_name, &cf_name] (auto& mc) mutable {
-            return _authorizer.revoke_all(
-                    auth::make_data_resource(ks_name, cf_name), mc);
-        }).handle_exception([] (std::exception_ptr e) {
-            log.error("Unexpected exception while revoking all permissions on dropped table: {}", e);
-        });
-    }
-
-    void on_drop_user_type(const sstring& ks_name, const sstring& type_name) override {}
-    void on_drop_function(const sstring& ks_name, const sstring& function_name) override {
-        if (!legacy_mode(_qp)) {
-            // in non legacy path revoke is part of schema change statement execution
-            return;
-        }
-        // Do it in the background.
-        (void)do_with(::service::group0_batch::unused(), [this, &ks_name, &function_name] (auto& mc) mutable {
-            return _authorizer.revoke_all(
-                    auth::make_functions_resource(ks_name, function_name), mc);
-        }).handle_exception([] (std::exception_ptr e) {
-            log.error("Unexpected exception while revoking all permissions on dropped function: {}", e);
-        });
-    }
-    void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {
-        if (!legacy_mode(_qp)) {
-            // in non legacy path revoke is part of schema change statement execution
-            return;
-        }
-        (void)do_with(::service::group0_batch::unused(), [this, &ks_name, &aggregate_name] (auto& mc) mutable {
-            return _authorizer.revoke_all(
-                    auth::make_functions_resource(ks_name, aggregate_name), mc);
-        }).handle_exception([] (std::exception_ptr e) {
-            log.error("Unexpected exception while revoking all permissions on dropped aggregate: {}", e);
-        });
-    }
-    void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}
-};
-
 static future<> validate_role_exists(const service& ser, std::string_view role_name) {
    return ser.underlying_role_manager().exists(role_name).then([role_name](bool exists) {
        if (!exists) {
@@ -157,50 +83,36 @@ static future<> validate_role_exists(const service& ser, std::string_view role_n
 }

 service::service(
-        utils::loading_cache_config c,
        cache& cache,
        cql3::query_processor& qp,
        ::service::raft_group0_client& g0,
-        ::service::migration_notifier& mn,
        std::unique_ptr<authorizer> z,
        std::unique_ptr<authenticator> a,
        std::unique_ptr<role_manager> r,
        maintenance_socket_enabled used_by_maintenance_socket)
-            : _loading_cache_config(std::move(c))
-            , _permissions_cache(nullptr)
-            , _cache(cache)
+            : _cache(cache)
            , _qp(qp)
            , _group0_client(g0)
-            , _mnotifier(mn)
            , _authorizer(std::move(z))
            , _authenticator(std::move(a))
            , _role_manager(std::move(r))
-            , _migration_listener(std::make_unique<auth_migration_listener>(*_authorizer, qp))
-            , _permissions_cache_cfg_cb([this] (uint32_t) { (void) _permissions_cache_config_action.trigger_later(); })
-            , _permissions_cache_config_action([this] { update_cache_config(); return make_ready_future<>(); })
-            , _permissions_cache_max_entries_observer(_qp.db().get_config().permissions_cache_max_entries.observe(_permissions_cache_cfg_cb))
-            , _permissions_cache_update_interval_in_ms_observer(_qp.db().get_config().permissions_update_interval_in_ms.observe(_permissions_cache_cfg_cb))
-            , _permissions_cache_validity_in_ms_observer(_qp.db().get_config().permissions_validity_in_ms.observe(_permissions_cache_cfg_cb))
            , _used_by_maintenance_socket(used_by_maintenance_socket) {}

 service::service(
-        utils::loading_cache_config c,
        cql3::query_processor& qp,
        ::service::raft_group0_client& g0,
-        ::service::migration_notifier& mn,
-        ::service::migration_manager& mm,
-        const service_config& sc,
+        authorizer_factory authorizer_factory,
+        authenticator_factory authenticator_factory,
+        role_manager_factory role_manager_factory,
        maintenance_socket_enabled used_by_maintenance_socket,
        cache& cache)
            : service(
-                      std::move(c),
                      cache,
                      qp,
                      g0,
-                      mn,
-                      create_object<authorizer>(sc.authorizer_java_name, qp, g0, mm),
-                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, cache),
-                      create_object<role_manager>(sc.role_manager_java_name, qp, g0, mm, cache),
+                      authorizer_factory(),
+                      authenticator_factory(),
+                      role_manager_factory(),
                      used_by_maintenance_socket) {
 }

@@ -233,9 +145,6 @@ future<> service::create_legacy_keyspace_if_missing(::service::migration_manager
 }

 future<> service::start(::service::migration_manager& mm, db::system_keyspace& sys_ks) {
-    auto auth_version = co_await sys_ks.get_auth_version();
-    // version is set in query processor to be easily available in various places we call auth::legacy_mode check.
-    _qp.auth_version = auth_version;
    if (this_shard_id() == 0) {
        co_await _cache.load_all();
    }
@@ -257,25 +166,20 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
        co_await _role_manager->ensure_superuser_is_created();
    }
    co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
-    _permissions_cache = std::make_unique<permissions_cache>(_loading_cache_config, *this, log);
-    co_await once_among_shards([this] {
-        _mnotifier.register_listener(_migration_listener.get());
-        return make_ready_future<>();
-    });
+    if (!_used_by_maintenance_socket) {
+        // Maintenance socket mode can't cache permissions because it uses a
+        // different authorizer. Cached permissions must not be mixed, since
+        // they could differ from those in normal mode.
+        _cache.set_permission_loader(std::bind(
+                &service::get_uncached_permissions,
+                this, std::placeholders::_1, std::placeholders::_2));
+    }
 }

 future<> service::stop() {
    _as.request_abort();
-    // Only one of the shards has the listener registered, but let's try to
-    // unregister on each one just to make sure.
-    return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
-        if (_permissions_cache) {
-            return _permissions_cache->stop();
-        }
-        return make_ready_future<>();
-    }).then([this] {
-        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
-    });
+    _cache.set_permission_loader(nullptr);
+    return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
 }

 future<> service::ensure_superuser_is_created() {
@@ -283,21 +187,8 @@ future<> service::ensure_superuser_is_created() {
    co_await _authenticator->ensure_superuser_is_created();
 }

-void service::update_cache_config() {
-    auto db = _qp.db();
-
-    utils::loading_cache_config perm_cache_config;
-    perm_cache_config.max_size = db.get_config().permissions_cache_max_entries();
-    perm_cache_config.expiry = std::chrono::milliseconds(db.get_config().permissions_validity_in_ms());
-    perm_cache_config.refresh = std::chrono::milliseconds(db.get_config().permissions_update_interval_in_ms());
-
-    if (!_permissions_cache->update_config(std::move(perm_cache_config))) {
-        log.error("Failed to apply permissions cache changes. Please read the documentation of these parameters");
-    }
-}

 void service::reset_authorization_cache() {
-    _permissions_cache->reset();
    _qp.reset_cache();
 }

@@ -322,7 +213,14 @@ service::get_uncached_permissions(const role_or_anonymous& maybe_role, const res
 }

 future<permission_set> service::get_permissions(const role_or_anonymous& maybe_role, const resource& r) const {
-    return _permissions_cache->get(maybe_role, r);
+    if (_used_by_maintenance_socket) {
+        return get_uncached_permissions(maybe_role, r);
+    }
+    return _cache.get_permissions(maybe_role, r);
+}
+
+void service::set_maintenance_mode() {
+    _role_manager->set_maintenance_mode();
 }

 future<bool> service::has_superuser(std::string_view role_name, const role_set& roles) const {
@@ -360,6 +258,10 @@ static void validate_authentication_options_are_supported(
    }
 }

+future<> service::ensure_role_operations_are_enabled() {
+    return _role_manager->ensure_role_operations_are_enabled();
+}
+
 future<> service::create_role(std::string_view name,
        const role_config& config,
        const authentication_options& options,
@@ -377,11 +279,6 @@ future<> service::create_role(std::string_view name,
        ep = std::current_exception();
    }
    if (ep) {
-        // Rollback only in legacy mode as normally mutations won't be
-        // applied in case exception is raised
-        if (legacy_mode(_qp)) {
-            co_await underlying_role_manager().drop(name, mc);
-        }
        std::rethrow_exception(std::move(ep));
    }
 }
@@ -447,6 +344,11 @@ future<bool> service::exists(const resource& r) const {
    return make_ready_future<bool>(false);
 }

+future<> service::revoke_all(const resource& r, ::service::group0_batch& mc) const {
+    co_await _authorizer->revoke_all(r, mc);
+    co_await _cache.prune(r);
+}
+
 future<std::vector<cql3::description>> service::describe_roles(bool with_hashed_passwords) {
    std::vector<cql3::description> result{};

@@ -455,11 +357,11 @@ future<std::vector<cql3::description>> service::describe_roles(bool with_hashed_

    const bool authenticator_uses_password_hashes = _authenticator->uses_password_hashes();

-    auto produce_create_statement = [with_hashed_passwords] (const sstring& formatted_role_name,
+    const auto default_su = cql3::util::maybe_quote(default_superuser(_qp));
+
+    auto produce_create_statement = [&default_su, with_hashed_passwords] (const sstring& formatted_role_name,
            const std::optional<sstring>& maybe_hashed_password, bool can_login, bool is_superuser) {
-        // Even after applying formatting to a role, `formatted_role_name` can only equal `meta::DEFAULT_SUPER_NAME`
-        // if the original identifier was equal to it.
-        const sstring role_part = formatted_role_name == meta::DEFAULT_SUPERUSER_NAME
+        const sstring role_part = formatted_role_name == default_su
                ? seastar::format("IF NOT EXISTS {}", formatted_role_name)
                : formatted_role_name;

@@ -672,6 +574,10 @@ future<std::vector<cql3::description>> service::describe_auth(bool with_hashed_p
 // Free functions.
 //

+void set_maintenance_mode(service& ser) {
+    ser.set_maintenance_mode();
+}
+
 future<bool> has_superuser(const service& ser, const authenticated_user& u) {
    if (is_anonymous(u)) {
        return make_ready_future<bool>(false);
@@ -680,6 +586,10 @@ future<bool> has_superuser(const service& ser, const authenticated_user& u) {
    return ser.has_superuser(*u.name);
 }

+future<> ensure_role_operations_are_enabled(service& ser) {
+    return ser.underlying_role_manager().ensure_role_operations_are_enabled();
+}
+
 future<role_set> get_roles(const service& ser, const authenticated_user& u) {
    if (is_anonymous(u)) {
        return make_ready_future<role_set>();
@@ -801,7 +711,7 @@ future<> revoke_permissions(
 }

 future<> revoke_all(const service& ser, const resource& r, ::service::group0_batch& mc) {
-    return ser.underlying_authorizer().revoke_all(r, mc);
+    return ser.revoke_all(r, mc);
 }

 future<std::vector<permission_details>> list_filtered_permissions(
@@ -862,83 +772,115 @@ future<> commit_mutations(service& ser, ::service::group0_batch&& mc) {
    return ser.commit_mutations(std::move(mc));
 }

-future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_client& g0, start_operation_func_t start_operation_func, abort_source& as) {
-    // FIXME: if this function fails it may leave partial data in the new tables
-    // that should be cleared
-    auto gen = [&sys_ks] (api::timestamp_type ts) -> ::service::mutations_generator {
-        auto& qp = sys_ks.query_processor();
-        for (const auto& cf_name : std::vector<sstring>{
-                "roles", "role_members", "role_attributes", "role_permissions"}) {
-            schema_ptr schema;
-            try {
-                schema = qp.db().find_schema(meta::legacy::AUTH_KS, cf_name);
-            } catch (const data_dictionary::no_such_column_family&) {
-                continue; // some tables might not have been created if they were not used
-            }
+namespace {

-            std::vector<sstring> col_names;
-            for (const auto& col : schema->all_columns()) {
-                col_names.push_back(col.name_as_cql_string());
-            }
-            sstring val_binders_str = "?";
-            for (size_t i = 1; i < col_names.size(); ++i) {
-                val_binders_str += ", ?";
-            }
+std::string_view get_short_name(std::string_view name) {
+    auto pos = name.find_last_of('.');
+    if (pos == std::string_view::npos) {
+        return name;
+    }
+    return name.substr(pos + 1);
+}

-            std::vector<mutation> collected;
-            // use longer than usual timeout as we scan the whole table
-            // but not infinite or very long as we want to fail reasonably fast
-            const auto t = 5min;
-            const timeout_config tc{t, t, t, t, t, t, t};
-            ::service::client_state cs(::service::client_state::internal_tag{}, tc);
-            ::service::query_state qs(cs, empty_service_permit());
+} // anonymous namespace

-            co_await qp.query_internal(
-                seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, cf_name),
-                db::consistency_level::ALL,
-                {},
-                1000,
-                [&qp, &cf_name, &col_names, &val_binders_str, &schema, ts, &collected] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
-                    std::vector<data_value_or_unset> values;
-                    for (const auto& col : schema->all_columns()) {
-                        if (row.has(col.name_as_text())) {
-                            values.push_back(
-                                    col.type->deserialize(row.get_blob_unfragmented(col.name_as_text())));
-                        } else {
-                            values.push_back(unset_value{});
-                        }
-                    }
-                    auto muts = co_await qp.get_mutations_internal(
-                            seastar::format("INSERT INTO {}.{} ({}) VALUES ({})",
-                                    db::system_keyspace::NAME,
-                                    cf_name,
-                                    fmt::join(col_names, ", "),
-                                    val_binders_str),
-                            internal_distributed_query_state(),
-                            ts,
-                            std::move(values));
-                    if (muts.size() != 1) {
-                        on_internal_error(log,
-                                format("expecting single insert mutation, got {}", muts.size()));
-                    }
+authorizer_factory make_authorizer_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp) {
+    std::string_view short_name = get_short_name(name);

-                    collected.push_back(std::move(muts[0]));
-                    co_return stop_iteration::no;
-                },
-                std::move(qs));
+    if (boost::iequals(short_name, "AllowAllAuthorizer")) {
+        return [&qp] {
+            return std::make_unique<allow_all_authorizer>(qp.local());
+        };
+    } else if (boost::iequals(short_name, "CassandraAuthorizer")) {
+        return [&qp] {
+            return std::make_unique<default_authorizer>(qp.local());
+        };
+    } else if (boost::iequals(short_name, "TransitionalAuthorizer")) {
+        return [&qp] {
+            return std::make_unique<transitional_authorizer>(qp.local());
+        };
+    }
+    throw std::invalid_argument(fmt::format("Unknown authorizer: {}", name));
+}

-            for (auto& m : collected) {
-                co_yield std::move(m);
-            }
-        }
-        co_yield co_await sys_ks.make_auth_version_mutation(ts,
-                db::system_keyspace::auth_version_t::v2);
+authenticator_factory make_authenticator_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& auth_cache) {
+    std::string_view short_name = get_short_name(name);
+
+    if (boost::iequals(short_name, "AllowAllAuthenticator")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<allow_all_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    } else if (boost::iequals(short_name, "PasswordAuthenticator")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<password_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    } else if (boost::iequals(short_name, "CertificateAuthenticator")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<certificate_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    } else if (boost::iequals(short_name, "SaslauthdAuthenticator")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<saslauthd_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    } else if (boost::iequals(short_name, "TransitionalAuthenticator")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<transitional_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    }
+    throw std::invalid_argument(fmt::format("Unknown authenticator: {}", name));
+}
+
+role_manager_factory make_role_manager_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& auth_cache) {
+    std::string_view short_name = get_short_name(name);
+
+    if (boost::iequals(short_name, "CassandraRoleManager")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<standard_role_manager>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    } else if (boost::iequals(short_name, "LDAPRoleManager")) {
+        return [&qp, &g0, &mm, &auth_cache] {
+            return std::make_unique<ldap_role_manager>(qp.local(), g0, mm.local(), auth_cache.local());
+        };
+    }
+    throw std::invalid_argument(fmt::format("Unknown role manager: {}", name));
+}
+
+authenticator_factory make_maintenance_socket_authenticator_factory(
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& auth_cache) {
+    return [&qp, &g0, &mm, &auth_cache] {
+        return std::make_unique<maintenance_socket_authenticator>(qp.local(), g0, mm.local(), auth_cache.local());
+    };
+}
+
+authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp) {
+    return [&qp] {
+        return std::make_unique<maintenance_socket_authorizer>(qp.local());
+    };
+}
+
+role_manager_factory make_maintenance_socket_role_manager_factory(
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& auth_cache) {
+    return [&qp, &g0, &mm, &auth_cache] {
+        return std::make_unique<maintenance_socket_role_manager>(qp.local(), g0, mm.local(), auth_cache.local());
    };
-    co_await announce_mutations_with_batching(g0,
-            start_operation_func,
-            std::move(gen),
-            as,
-            std::nullopt);
 }

 }
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -12,6 +12,7 @@
 #include <memory>
 #include <optional>

+#include <seastar/core/coroutine.hh>
 #include <seastar/core/future.hh>
 #include <seastar/core/sstring.hh>
 #include <seastar/util/bool_class.hh>
@@ -20,7 +21,6 @@
 #include "auth/authenticator.hh"
 #include "auth/authorizer.hh"
 #include "auth/permission.hh"
-#include "auth/permissions_cache.hh"
 #include "auth/cache.hh"
 #include "auth/role_manager.hh"
 #include "auth/common.hh"
@@ -37,19 +37,16 @@ class query_processor;

 namespace service {
 class migration_manager;
-class migration_notifier;
-class migration_listener;
 }

 namespace auth {

 class role_or_anonymous;

-struct service_config final {
-    sstring authorizer_java_name;
-    sstring authenticator_java_name;
-    sstring role_manager_java_name;
-};
+/// Factory function types for creating auth module instances on each shard.
+using authorizer_factory = std::function<std::unique_ptr<authorizer>()>;
+using authenticator_factory = std::function<std::unique_ptr<authenticator>()>;
+using role_manager_factory = std::function<std::unique_ptr<role_manager>()>;

 ///
 /// Due to poor (in this author's opinion) decisions of Apache Cassandra, certain choices of one role-manager,
@@ -75,43 +72,27 @@ public:
 /// peering_sharded_service inheritance is needed to be able to access shard local authentication service
 /// given an object from another shard. Used for bouncing lwt requests to correct shard.
 class service final : public seastar::peering_sharded_service<service> {
-    utils::loading_cache_config _loading_cache_config;
-    std::unique_ptr<permissions_cache> _permissions_cache;
    cache& _cache;

    cql3::query_processor& _qp;

    ::service::raft_group0_client& _group0_client;

-    ::service::migration_notifier& _mnotifier;
-
    authorizer::ptr_type _authorizer;

    authenticator::ptr_type _authenticator;

    role_manager::ptr_type _role_manager;

-    // Only one of these should be registered, so we end up with some unused instances. Not the end of the world.
-    std::unique_ptr<::service::migration_listener> _migration_listener;
-
-    std::function<void(uint32_t)> _permissions_cache_cfg_cb;
-    serialized_action _permissions_cache_config_action;
-
-    utils::observer<uint32_t> _permissions_cache_max_entries_observer;
-    utils::observer<uint32_t> _permissions_cache_update_interval_in_ms_observer;
-    utils::observer<uint32_t> _permissions_cache_validity_in_ms_observer;
-
    maintenance_socket_enabled _used_by_maintenance_socket;

    abort_source _as;

 public:
    service(
-            utils::loading_cache_config,
            cache& cache,
            cql3::query_processor&,
            ::service::raft_group0_client&,
-            ::service::migration_notifier&,
            std::unique_ptr<authorizer>,
            std::unique_ptr<authenticator>,
            std::unique_ptr<role_manager>,
@@ -119,16 +100,15 @@ public:

    ///
    /// This constructor is intended to be used when the class is sharded via \ref seastar::sharded. In that case, the
-    /// arguments must be copyable, which is why we delay construction with instance-construction instructions instead
+    /// arguments must be copyable, which is why we delay construction with instance-construction factories instead
    /// of the instances themselves.
    ///
    service(
-            utils::loading_cache_config,
            cql3::query_processor&,
            ::service::raft_group0_client&,
-            ::service::migration_notifier&,
-            ::service::migration_manager&,
-            const service_config&,
+            authorizer_factory,
+            authenticator_factory,
+            role_manager_factory,
            maintenance_socket_enabled,
            cache&);

@@ -138,8 +118,6 @@ public:

    future<> ensure_superuser_is_created();

-    void update_cache_config();
-
    void reset_authorization_cache();

    ///
@@ -152,6 +130,11 @@ public:
    ///
    future<permission_set> get_uncached_permissions(const role_or_anonymous&, const resource&) const;

+    ///
+    /// Notify the service that the node is entering maintenance mode.
+    ///
+    void set_maintenance_mode();
+
    ///
    /// Query whether the named role has been granted a role that is a superuser.
    ///
@@ -161,6 +144,11 @@ public:
    ///
    future<bool> has_superuser(std::string_view role_name) const;

+    ///
+    /// Ensure that the role operations are enabled. Some role managers defer initialization.
+    ///
+    future<> ensure_role_operations_are_enabled();
+    
    ///
    /// Create a role with optional authentication information.
    ///
@@ -181,6 +169,13 @@ public:

    future<bool> exists(const resource&) const;

+    ///
+    /// Revoke all permissions granted to any role for a particular resource.
+    ///
+    /// \throws \ref unsupported_authorization_operation if revoking permissions is not supported.
+    ///
+    future<> revoke_all(const resource&, ::service::group0_batch&) const;
+
    ///
    /// Produces descriptions that can be used to restore the state of auth. That encompasses
    /// roles, role grants, and permission grants.
@@ -199,12 +194,9 @@ public:
        return *_role_manager;
    }

-    cql3::query_processor& query_processor() const noexcept {
-        return _qp;
-    }
-
    future<> commit_mutations(::service::group0_batch&& mc) {
-        return std::move(mc).commit(_group0_client, _as, ::service::raft_timeout{});
+        co_await std::move(mc).commit(_group0_client, _as, ::service::raft_timeout{});
+        co_await _group0_client.send_group0_read_barrier_to_live_members();
    }

 private:
@@ -215,8 +207,12 @@ private:
    future<std::vector<cql3::description>> describe_permissions() const;
 };

+void set_maintenance_mode(service&);
+
 future<bool> has_superuser(const service&, const authenticated_user&);

+future<> ensure_role_operations_are_enabled(service&);
+
 future<role_set> get_roles(const service&, const authenticated_user&);

 future<permission_set> get_permissions(const service&, const authenticated_user&, const resource&);
@@ -400,7 +396,55 @@ future<std::vector<permission_details>> list_filtered_permissions(
 // Finalizes write operations performed in auth by committing mutations via raft group0.
 future<> commit_mutations(service& ser, ::service::group0_batch&& mc);

-// Migrates data from old keyspace to new one which supports linearizable writes via raft.
-future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_client& g0, start_operation_func_t start_operation_func, abort_source& as);
+///
+/// Factory helper functions for creating auth module instances.
+/// These are intended for use with sharded<service>::start() where copyable arguments are required.
+/// The returned factories capture the sharded references and call .local() when invoked on each shard.
+///
+
+/// Creates an authorizer factory for config-selectable authorizer types.
+/// @param name The authorizer class name (e.g., "CassandraAuthorizer", "AllowAllAuthorizer")
+authorizer_factory make_authorizer_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp);
+
+/// Creates an authenticator factory for config-selectable authenticator types.
+/// @param name The authenticator class name (e.g., "PasswordAuthenticator", "AllowAllAuthenticator")
+authenticator_factory make_authenticator_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& cache);
+
+/// Creates a role_manager factory for config-selectable role manager types.
+/// @param name The role manager class name (e.g., "CassandraRoleManager")
+role_manager_factory make_role_manager_factory(
+        std::string_view name,
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& cache);
+
+/// Creates a factory for the maintenance socket authenticator.
+/// This authenticator is not config-selectable and is only used for the maintenance socket.
+authenticator_factory make_maintenance_socket_authenticator_factory(
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& cache);
+
+/// Creates a factory for the maintenance socket authorizer.
+/// This authorizer is not config-selectable and is only used for the maintenance socket.
+/// It grants all permissions unconditionally while delegating grant/revoke to the default authorizer.
+authorizer_factory make_maintenance_socket_authorizer_factory(sharded<cql3::query_processor>& qp);
+
+/// Creates a factory for the maintenance socket role manager.
+/// This role manager is not config-selectable and is only used for the maintenance socket.
+role_manager_factory make_maintenance_socket_role_manager_factory(
+        sharded<cql3::query_processor>& qp,
+        ::service::raft_group0_client& g0,
+        sharded<::service::migration_manager>& mm,
+        sharded<cache>& cache);

 }
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -28,15 +28,14 @@
 #include "cql3/untyped_result_set.hh"
 #include "cql3/util.hh"
 #include "db/consistency_level_type.hh"
+#include "db/system_keyspace.hh"
 #include "exceptions/exceptions.hh"
 #include "utils/error_injection.hh"
 #include "utils/log.hh"
 #include <seastar/core/loop.hh>
 #include <seastar/coroutine/maybe_yield.hh>
 #include "service/raft/raft_group0_client.hh"
-#include "utils/class_registrator.hh"
 #include "service/migration_manager.hh"
-#include "password_authenticator.hh"
 #include "utils/managed_string.hh"

 namespace auth {
@@ -44,57 +43,21 @@ namespace auth {

 static logging::logger log("standard_role_manager");

-static const class_registrator<
-        role_manager,
-        standard_role_manager,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        cache&> registration("org.apache.cassandra.auth.CassandraRoleManager");
-
-struct record final {
-    sstring name;
-    bool is_superuser;
-    bool can_login;
-    role_set member_of;
-};
-
-static db::consistency_level consistency_for_role(std::string_view role_name) noexcept {
-    if (role_name == meta::DEFAULT_SUPERUSER_NAME) {
-        return db::consistency_level::QUORUM;
+future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
+    auto role = _cache.get(role_name);
+    if (!role) {
+        return make_ready_future<std::optional<record>>(std::nullopt);
    }
-
-    return db::consistency_level::LOCAL_ONE;
+    return make_ready_future<std::optional<record>>(std::make_optional(record{
+        .name = sstring(role_name),
+        .is_superuser = role->is_superuser,
+        .can_login = role->can_login,
+        .member_of = role->member_of
+    }));
 }

-static future<std::optional<record>> find_record(cql3::query_processor& qp, std::string_view role_name) {
-    const sstring query = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
-            get_auth_ks_name(qp),
-            meta::roles_table::name,
-            meta::roles_table::role_col_name);
-
-    const auto results = co_await qp.execute_internal(
-            query,
-            consistency_for_role(role_name),
-            internal_distributed_query_state(),
-            {sstring(role_name)},
-            cql3::query_processor::cache_internal::yes);
-    if (results->empty()) {
-        co_return std::optional<record>();
-    }
-
-    const cql3::untyped_result_set_row& row = results->one();
-    co_return std::make_optional(record{
-            row.get_as<sstring>(sstring(meta::roles_table::role_col_name)),
-            row.get_or<bool>("is_superuser", false),
-            row.get_or<bool>("can_login", false),
-            (row.has("member_of")
-                        ? row.get_set<sstring>("member_of")
-                        : role_set())});
-}
-
-static future<record> require_record(cql3::query_processor& qp, std::string_view role_name) {
-    return find_record(qp, role_name).then([role_name](std::optional<record> mr) {
+future<standard_role_manager::record> standard_role_manager::require_record(std::string_view role_name) {
+    return find_record(role_name).then([role_name](std::optional<record> mr) {
        if (!mr) {
            throw nonexistant_role(role_name);
        }
@@ -113,7 +76,6 @@ standard_role_manager::standard_role_manager(cql3::query_processor& qp, ::servic
    , _migration_manager(mm)
    , _cache(cache)
    , _stopped(make_ready_future<>())
-    , _superuser(password_authenticator::default_superuser(qp.db().get_config()))
 {}

 std::string_view standard_role_manager::qualified_java_name() const noexcept {
@@ -128,79 +90,12 @@ const resource_set& standard_role_manager::protected_resources() const {
    return resources;
 }

-future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const {
-    static const sstring create_roles_query = fmt::format(
-            "CREATE TABLE {}.{} ("
-            "  {} text PRIMARY KEY,"
-            "  can_login boolean,"
-            "  is_superuser boolean,"
-            "  member_of set<text>,"
-            "  salted_hash text"
-            ")",
-            meta::legacy::AUTH_KS,
-            meta::roles_table::name,
-            meta::roles_table::role_col_name);
-    static const sstring create_role_members_query = fmt::format(
-            "CREATE TABLE {}.{} ("
-            "  role text,"
-            "  member text,"
-            "  PRIMARY KEY (role, member)"
-            ")",
-            meta::legacy::AUTH_KS,
-            ROLE_MEMBERS_CF);
-    static const sstring create_role_attributes_query = seastar::format(
-            "CREATE TABLE {}.{} ("
-            "  role text,"
-            "  name text,"
-            "  value text,"
-            "  PRIMARY KEY(role, name)"
-            ")",
-            meta::legacy::AUTH_KS,
-            ROLE_ATTRIBUTES_CF);
-    return when_all_succeed(
-            create_legacy_metadata_table_if_missing(
-                    meta::roles_table::name,
-                    _qp,
-                    create_roles_query,
-                    _migration_manager),
-            create_legacy_metadata_table_if_missing(
-                    ROLE_MEMBERS_CF,
-                    _qp,
-                    create_role_members_query,
-                    _migration_manager),
-            create_legacy_metadata_table_if_missing(
-                    ROLE_ATTRIBUTES_CF,
-                    _qp,
-                    create_role_attributes_query,
-                    _migration_manager)).discard_result();
-}
-
-future<> standard_role_manager::legacy_create_default_role_if_missing() {
-    try {
-        const auto exists = co_await legacy::default_role_row_satisfies(_qp, &has_can_login, _superuser);
-        if (exists) {
-            co_return;
-        }
-        const sstring query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
-                meta::legacy::AUTH_KS,
-                meta::roles_table::name,
-                meta::roles_table::role_col_name);
-        co_await _qp.execute_internal(
-                query,
-                db::consistency_level::QUORUM,
-                internal_distributed_query_state(),
-                {_superuser},
-                cql3::query_processor::cache_internal::no).discard_result();
-        log.info("Created default superuser role '{}'.", _superuser);
-    } catch (const exceptions::unavailable_exception& e) {
-        log.warn("Skipped default role setup: some nodes were not ready; will retry");
-        throw e;
-    }
-}
-
 future<> standard_role_manager::maybe_create_default_role() {
+    if (default_superuser(_qp).empty()) {
+        co_return;
+    }
    auto has_superuser = [this] () -> future<bool> {
-        const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
+        const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", db::system_keyspace::NAME, meta::roles_table::name);
        auto results = co_await _qp.execute_internal(query, db::consistency_level::LOCAL_ONE,
                internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
        for (const auto& result : *results) {
@@ -224,12 +119,12 @@ future<> standard_role_manager::maybe_create_default_role() {
    // There is no superuser which has can_login field - create default role.
    // Note that we don't check if can_login is set to true.
    const sstring insert_query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
-            get_auth_ks_name(_qp),
+            db::system_keyspace::NAME,
            meta::roles_table::name,
            meta::roles_table::role_col_name);
-    co_await collect_mutations(_qp, batch, insert_query, {_superuser});
+    co_await collect_mutations(_qp, batch, insert_query, {default_superuser(_qp)});
    co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
-    log.info("Created default superuser role '{}'.", _superuser);
+    log.info("Created default superuser role '{}'.", default_superuser(_qp));
 }

 future<> standard_role_manager::maybe_create_default_role_with_retries() {
@@ -252,78 +147,12 @@ future<> standard_role_manager::maybe_create_default_role_with_retries() {
    }
 }

-static const sstring legacy_table_name{"users"};
-
-bool standard_role_manager::legacy_metadata_exists() {
-    return _qp.db().has_schema(meta::legacy::AUTH_KS, legacy_table_name);
-}
-
-future<> standard_role_manager::migrate_legacy_metadata() {
-    log.info("Starting migration of legacy user metadata.");
-    static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);
-
-    return _qp.execute_internal(
-            query,
-            db::consistency_level::QUORUM,
-            internal_distributed_query_state(),
-            cql3::query_processor::cache_internal::no).then([this](::shared_ptr<cql3::untyped_result_set> results) {
-        return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
-            role_config config;
-            config.is_superuser = row.get_or<bool>("super", false);
-            config.can_login = true;
-
-            return do_with(
-                    row.get_as<sstring>("name"),
-                    std::move(config),
-                    ::service::group0_batch::unused(),
-                    [this](const auto& name, const auto& config, auto& mc) {
-                return create_or_replace(meta::legacy::AUTH_KS, name, config, mc);
-            });
-        }).finally([results] {});
-    }).then([] {
-        log.info("Finished migrating legacy user metadata.");
-    }).handle_exception([](std::exception_ptr ep) {
-        log.error("Encountered an error during migration!");
-        std::rethrow_exception(ep);
-    });
-}
-
 future<> standard_role_manager::start() {
    return once_among_shards([this] () -> future<> {
-        if (legacy_mode(_qp)) {
-            co_await create_legacy_metadata_tables_if_missing();
-        }
-
        auto handler = [this] () -> future<> {
-            const bool legacy = legacy_mode(_qp);
-            if (legacy) {
-                if (!_superuser_created_promise.available()) {
-                    // Counterintuitively, we mark promise as ready before any startup work
-                    // because wait_for_schema_agreement() below will block indefinitely
-                    // without cluster majority. In that case, blocking node startup
-                    // would lead to a cluster deadlock.
-                    _superuser_created_promise.set_value();
-                }
-                co_await _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as);
-
-                if (co_await legacy::any_nondefault_role_row_satisfies(_qp, &has_can_login)) {
-                    if (legacy_metadata_exists()) {
-                        log.warn("Ignoring legacy user metadata since nondefault roles already exist.");
-                    }
-                    co_return;
-                }
-
-                if (legacy_metadata_exists()) {
-                    co_await migrate_legacy_metadata();
-                    co_return;
-                }
-                co_await legacy_create_default_role_if_missing();
-            }
-            if (!legacy) {
-                co_await maybe_create_default_role_with_retries();
-                if (!_superuser_created_promise.available()) {
-                    _superuser_created_promise.set_value();
-                }
+            co_await maybe_create_default_role_with_retries();
+            if (!_superuser_created_promise.available()) {
+                _superuser_created_promise.set_value();
            }
        };

@@ -342,21 +171,12 @@ future<> standard_role_manager::ensure_superuser_is_created() {
    return _superuser_created_promise.get_shared_future();
 }

-future<> standard_role_manager::create_or_replace(std::string_view auth_ks_name, std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
+future<> standard_role_manager::create_or_replace(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
    const sstring query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, ?, ?)",
-            auth_ks_name,
+            db::system_keyspace::NAME,
            meta::roles_table::name,
            meta::roles_table::role_col_name);
-    if (auth_ks_name == meta::legacy::AUTH_KS) {
-        co_await _qp.execute_internal(
-                query,
-                consistency_for_role(role_name),
-                internal_distributed_query_state(),
-                {sstring(role_name), c.is_superuser, c.can_login},
-                cql3::query_processor::cache_internal::yes).discard_result();
-    } else {
-        co_await collect_mutations(_qp, mc,  query, {sstring(role_name), c.is_superuser, c.can_login});
-    }
+    co_await collect_mutations(_qp, mc,  query, {sstring(role_name), c.is_superuser, c.can_login});
 }

 future<>
@@ -366,7 +186,7 @@ standard_role_manager::create(std::string_view role_name, const role_config& c,
            throw role_already_exists(role_name);
        }

-        return create_or_replace(get_auth_ks_name(_qp), role_name, c, mc);
+        return create_or_replace(role_name, c, mc);
    });
 }

@@ -386,25 +206,16 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
        return fmt::to_string(fmt::join(assignments, ", "));
    };

-    return require_record(_qp, role_name).then([this, role_name, &u, &mc](record) {
+    return require_record(role_name).then([this, role_name, &u, &mc](record) {
        if (!u.is_superuser && !u.can_login) {
            return make_ready_future<>();
        }
        const sstring query = seastar::format("UPDATE {}.{} SET {} WHERE {} = ?",
-            get_auth_ks_name(_qp),
+            db::system_keyspace::NAME,
            meta::roles_table::name,
            build_column_assignments(u),
            meta::roles_table::role_col_name);
-        if (legacy_mode(_qp)) {
-            return _qp.execute_internal(
-                    std::move(query),
-                    consistency_for_role(role_name),
-                    internal_distributed_query_state(),
-                    {sstring(role_name)},
-                    cql3::query_processor::cache_internal::no).discard_result();
-        } else {
-            return collect_mutations(_qp, mc, std::move(query), {sstring(role_name)});
-        }
+        return collect_mutations(_qp, mc, std::move(query), {sstring(role_name)});
    });
 }

@@ -415,11 +226,11 @@ future<> standard_role_manager::drop(std::string_view role_name, ::service::grou
    // First, revoke this role from all roles that are members of it.
    const auto revoke_from_members = [this, role_name, &mc] () -> future<> {
        const sstring query = seastar::format("SELECT member FROM {}.{} WHERE role = ?",
-                get_auth_ks_name(_qp),
+                db::system_keyspace::NAME,
                ROLE_MEMBERS_CF);
        const auto members = co_await _qp.execute_internal(
                query,
-                consistency_for_role(role_name),
+                db::consistency_level::LOCAL_ONE,
                internal_distributed_query_state(),
                {sstring(role_name)},
                cql3::query_processor::cache_internal::no);
@@ -447,102 +258,33 @@ future<> standard_role_manager::drop(std::string_view role_name, ::service::grou
    // Delete all attributes for that role
    const auto remove_attributes_of = [this, role_name, &mc] () -> future<> {
        const sstring query = seastar::format("DELETE FROM {}.{} WHERE role = ?",
-                get_auth_ks_name(_qp),
+                db::system_keyspace::NAME,
                ROLE_ATTRIBUTES_CF);
-        if (legacy_mode(_qp)) {
-            co_await _qp.execute_internal(query, {sstring(role_name)},
-                cql3::query_processor::cache_internal::yes).discard_result();
-        } else {
-            co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
-        }
+        co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
    };
    // Finally, delete the role itself.
    const auto delete_role = [this, role_name, &mc] () -> future<> {
        const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ?",
-                get_auth_ks_name(_qp),
+                db::system_keyspace::NAME,
                meta::roles_table::name,
                meta::roles_table::role_col_name);

-        if (legacy_mode(_qp)) {
-            co_await _qp.execute_internal(
-                    query,
-                    consistency_for_role(role_name),
-                    internal_distributed_query_state(),
-                    {sstring(role_name)},
-                    cql3::query_processor::cache_internal::no).discard_result();
-        } else {
-            co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
-        }
+        co_await collect_mutations(_qp, mc, query, {sstring(role_name)});
    };

    co_await when_all_succeed(revoke_from_members, revoke_members_of, remove_attributes_of);
    co_await delete_role();
 }

-future<>
-standard_role_manager::legacy_modify_membership(
-        std::string_view grantee_name,
-        std::string_view role_name,
-        membership_change ch) {
-    const auto modify_roles = [this, role_name, grantee_name, ch] () -> future<> {
-        const auto query = seastar::format(
-                "UPDATE {}.{} SET member_of = member_of {} ? WHERE {} = ?",
-                get_auth_ks_name(_qp),
-                meta::roles_table::name,
-                (ch == membership_change::add ? '+' : '-'),
-                meta::roles_table::role_col_name);
-        co_await _qp.execute_internal(
-                query,
-                consistency_for_role(grantee_name),
-                internal_distributed_query_state(),
-                {role_set{sstring(role_name)}, sstring(grantee_name)},
-                cql3::query_processor::cache_internal::no).discard_result();
-    };
-
-    const auto modify_role_members = [this, role_name, grantee_name, ch] () -> future<> {
-        switch (ch) {
-            case membership_change::add: {
-                const sstring insert_query = seastar::format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
-                        get_auth_ks_name(_qp),
-                        ROLE_MEMBERS_CF);
-                co_return co_await _qp.execute_internal(
-                        insert_query,
-                        consistency_for_role(role_name),
-                        internal_distributed_query_state(),
-                        {sstring(role_name), sstring(grantee_name)},
-                        cql3::query_processor::cache_internal::no).discard_result();
-            }
-
-            case membership_change::remove: {
-                const sstring delete_query = seastar::format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
-                        get_auth_ks_name(_qp),
-                        ROLE_MEMBERS_CF);
-                co_return co_await _qp.execute_internal(
-                        delete_query,
-                        consistency_for_role(role_name),
-                        internal_distributed_query_state(),
-                        {sstring(role_name), sstring(grantee_name)},
-                        cql3::query_processor::cache_internal::no).discard_result();
-            }
-        }
-    };
-
-    co_await when_all_succeed(modify_roles, modify_role_members).discard_result();
-}
-
 future<>
 standard_role_manager::modify_membership(
        std::string_view grantee_name,
        std::string_view role_name,
        membership_change ch,
        ::service::group0_batch& mc) {
-    if (legacy_mode(_qp)) {
-        co_return co_await legacy_modify_membership(grantee_name, role_name, ch);
-    }
-
    const auto modify_roles = seastar::format(
            "UPDATE {}.{} SET member_of = member_of {} ? WHERE {} = ?",
-            get_auth_ks_name(_qp),
+            db::system_keyspace::NAME,
            meta::roles_table::name,
            (ch == membership_change::add ? '+' : '-'),
            meta::roles_table::role_col_name);
@@ -553,12 +295,12 @@ standard_role_manager::modify_membership(
    switch (ch) {
    case membership_change::add:
        modify_role_members = seastar::format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
-                get_auth_ks_name(_qp),
+                db::system_keyspace::NAME,
                ROLE_MEMBERS_CF);
        break;
    case membership_change::remove:
        modify_role_members = seastar::format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
-                get_auth_ks_name(_qp),
+                db::system_keyspace::NAME,
                ROLE_MEMBERS_CF);
        break;
    default:
@@ -620,18 +362,17 @@ standard_role_manager::revoke(std::string_view revokee_name, std::string_view ro
    });
 }

-static future<> collect_roles(
-        cql3::query_processor& qp,
+future<> standard_role_manager::collect_roles(
        std::string_view grantee_name,
        bool recurse,
        role_set& roles) {
-    return require_record(qp, grantee_name).then([&qp, &roles, recurse](record r) {
-        return do_with(std::move(r.member_of), [&qp, &roles, recurse](const role_set& memberships) {
-            return do_for_each(memberships.begin(), memberships.end(), [&qp, &roles, recurse](const sstring& role_name) {
+    return require_record(grantee_name).then([this, &roles, recurse](standard_role_manager::record r) {
+        return do_with(std::move(r.member_of), [this, &roles, recurse](const role_set& memberships) {
+            return do_for_each(memberships.begin(), memberships.end(), [this, &roles, recurse](const sstring& role_name) {
                roles.insert(role_name);

                if (recurse) {
-                    return collect_roles(qp, role_name, true, roles);
+                    return collect_roles(role_name, true, roles);
                }

                return make_ready_future<>();
@@ -646,115 +387,68 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
    return do_with(
            role_set{sstring(grantee_name)},
            [this, grantee_name, recurse](role_set& roles) {
-        return collect_roles(_qp, grantee_name, recurse, roles).then([&roles] { return roles; });
+        return collect_roles(grantee_name, recurse, roles).then([&roles] { return roles; });
    });
 }

 future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
-    const sstring query = seastar::format("SELECT * FROM {}.{}",
-            get_auth_ks_name(_qp),
-            ROLE_MEMBERS_CF);
-
-    const auto results = co_await _qp.execute_internal(
-            query,
-            db::consistency_level::ONE,
-            qs,
-            cql3::query_processor::cache_internal::yes);
-
    role_to_directly_granted_map roles_map;
-    std::transform(
-            results->begin(),
-            results->end(),
-            std::inserter(roles_map, roles_map.begin()),
-            [] (const cql3::untyped_result_set_row& row) {
-                return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
-    );
-
+    _cache.for_each_role([&roles_map] (const cache::role_name_t& name, const cache::role_record& record) {
+        for (const auto& granted_role : record.member_of) {
+            roles_map.emplace(name, granted_role);
+        }
+    });
    co_return roles_map;
 }

 future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
-    const sstring query = seastar::format("SELECT {} FROM {}.{}",
-            meta::roles_table::role_col_name,
-            get_auth_ks_name(_qp),
-            meta::roles_table::name);
-
-    // To avoid many copies of a view.
-    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
-
-    if (utils::get_local_injector().enter("standard_role_manager_fail_legacy_query")) {
-        if (legacy_mode(_qp)) {
-            throw std::runtime_error("standard_role_manager::query_all: failed due to error injection");
-        }
-    }
-
-    const auto results = co_await _qp.execute_internal(
-            query,
-            db::consistency_level::QUORUM,
-            qs,
-            cql3::query_processor::cache_internal::yes);
-
    role_set roles;
-    std::transform(
-            results->begin(),
-            results->end(),
-            std::inserter(roles, roles.begin()),
-            [] (const cql3::untyped_result_set_row& row) {
-                return row.get_as<sstring>(role_col_name_string);}
-    );
+    roles.reserve(_cache.roles_count());
+    _cache.for_each_role([&roles] (const cache::role_name_t& name, const cache::role_record&) {
+        roles.insert(name);
+    });
    co_return roles;
 }

 future<bool> standard_role_manager::exists(std::string_view role_name) {
-    return find_record(_qp, role_name).then([](std::optional<record> mr) {
+    return find_record(role_name).then([](std::optional<record> mr) {
        return static_cast<bool>(mr);
    });
 }

 future<bool> standard_role_manager::is_superuser(std::string_view role_name) {
-    return require_record(_qp, role_name).then([](record r) {
+    return require_record(role_name).then([](record r) {
        return r.is_superuser;
    });
 }

 future<bool> standard_role_manager::can_login(std::string_view role_name) {
-    if (legacy_mode(_qp)) {
-       const auto r = co_await require_record(_qp, role_name);
-       co_return r.can_login;
-    }
-    auto role = _cache.get(sstring(role_name));
-    if (!role) {
-        throw nonexistant_role(role_name);
-    }
-    co_return role->can_login;
+    return require_record(role_name).then([](record r) {
+        return r.can_login;
+    });
 }

 future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
-    const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
-            get_auth_ks_name(_qp),
-            ROLE_ATTRIBUTES_CF);
-    const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
-    if (!result_set->empty()) {
-        const cql3::untyped_result_set_row &row = result_set->one();
-        co_return std::optional<sstring>(row.get_as<sstring>("value"));
+    auto role = _cache.get(role_name);
+    if (!role) {
+        co_return std::nullopt;
    }
-    co_return std::optional<sstring>{};
+    auto it = role->attributes.find(attribute_name);
+    if (it != role->attributes.end()) {
+        co_return it->second;
+    }
+    co_return std::nullopt;
 }

-future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
-    return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
-        return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
-            return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
-                return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
-                    if (att_val) {
-                        role_to_att_val.emplace(std::move(role), std::move(*att_val));
-                    }
-                });
-            }).then([&role_to_att_val] () {
-                return make_ready_future<attribute_vals>(std::move(role_to_att_val));
-            });
-        });
+future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
+    attribute_vals result;
+    _cache.for_each_role([&result, attribute_name] (const cache::role_name_t& name, const cache::role_record& record) {
+        auto it = record.attributes.find(attribute_name);
+        if (it != record.attributes.end()) {
+            result.emplace(name, it->second);
+        }
    });
+    co_return result;
 }

 future<> standard_role_manager::set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) {
@@ -762,14 +456,10 @@ future<> standard_role_manager::set_attribute(std::string_view role_name, std::s
        throw auth::nonexistant_role(role_name);
    }
    const sstring query = seastar::format("INSERT INTO {}.{} (role, name, value)  VALUES (?, ?, ?)",
-            get_auth_ks_name(_qp),
+            db::system_keyspace::NAME,
            ROLE_ATTRIBUTES_CF);
-    if (legacy_mode(_qp)) {
-        co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name), sstring(attribute_value)}, cql3::query_processor::cache_internal::yes).discard_result();
-    } else {
-        co_await collect_mutations(_qp, mc, query,
-                {sstring(role_name), sstring(attribute_name), sstring(attribute_value)});
-    }
+    co_await collect_mutations(_qp, mc, query,
+            {sstring(role_name), sstring(attribute_name), sstring(attribute_value)});
 }

 future<> standard_role_manager::remove_attribute(std::string_view role_name, std::string_view attribute_name, ::service::group0_batch& mc) {
@@ -777,14 +467,10 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
        throw auth::nonexistant_role(role_name);
    }
    const sstring query = seastar::format("DELETE FROM {}.{} WHERE role = ? AND name = ?",
-            get_auth_ks_name(_qp),
+            db::system_keyspace::NAME,
            ROLE_ATTRIBUTES_CF);
-    if (legacy_mode(_qp)) {
-        co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes).discard_result();
-    } else {
-        co_await collect_mutations(_qp, mc, query,
-                {sstring(role_name), sstring(attribute_name)});
-    }
+    co_await collect_mutations(_qp, mc, query,
+            {sstring(role_name), sstring(attribute_name)});
 }

 future<std::vector<cql3::description>> standard_role_manager::describe_role_grants() {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -40,7 +40,6 @@ class standard_role_manager final : public role_manager {
    cache& _cache;
    future<> _stopped;
    abort_source _as;
-    std::string _superuser;
    shared_promise<> _superuser_created_promise;

 public:
@@ -90,23 +89,26 @@ public:

 private:
    enum class membership_change { add, remove };
-
-    future<> create_legacy_metadata_tables_if_missing() const;
-
-    bool legacy_metadata_exists();
-
-    future<> migrate_legacy_metadata();
-
-    future<> legacy_create_default_role_if_missing();
+    struct record final {
+        sstring name;
+        bool is_superuser;
+        bool can_login;
+        role_set member_of;
+    };

    future<> maybe_create_default_role();
    future<> maybe_create_default_role_with_retries();

-    future<> create_or_replace(std::string_view auth_ks_name, std::string_view role_name, const role_config&, ::service::group0_batch&);
-
-    future<> legacy_modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change);
+    future<> create_or_replace(std::string_view role_name, const role_config&, ::service::group0_batch&);

    future<> modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change, ::service::group0_batch& mc);
+
+    future<std::optional<record>> find_record(std::string_view role_name);
+    future<record> require_record(std::string_view role_name);
+    future<> collect_roles(
+            std::string_view grantee_name,
+            bool recurse,
+            role_set& roles);
 };

 } // namespace auth
--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -8,244 +8,200 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

+#include "auth/transitional.hh"
 #include "auth/authenticated_user.hh"
-#include "auth/authenticator.hh"
-#include "auth/authorizer.hh"
 #include "auth/default_authorizer.hh"
 #include "auth/password_authenticator.hh"
-#include "auth/cache.hh"
 #include "auth/permission.hh"
 #include "service/raft/raft_group0_client.hh"
-#include "utils/class_registrator.hh"

 namespace auth {

-static const sstring PACKAGE_NAME("com.scylladb.auth.");
-
-static const sstring& transitional_authenticator_name() {
-    static const sstring name = PACKAGE_NAME + "TransitionalAuthenticator";
-    return name;
+transitional_authenticator::transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
+        : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, cache)) { // default backend: a password_authenticator
 }

-static const sstring& transitional_authorizer_name() {
-    static const sstring name = PACKAGE_NAME + "TransitionalAuthorizer";
-    return name;
+transitional_authenticator::transitional_authenticator(std::unique_ptr<authenticator> a)
+        : _authenticator(std::move(a)) { // takes ownership of the authenticator being wrapped
 }

-class transitional_authenticator : public authenticator {
-    std::unique_ptr<authenticator> _authenticator;
+future<> transitional_authenticator::start() {
+    return _authenticator->start(); // startup is delegated to the wrapped authenticator
+}

-public:
-    static const sstring PASSWORD_AUTHENTICATOR_NAME;
+future<> transitional_authenticator::stop() {
+    return _authenticator->stop(); // shutdown is delegated to the wrapped authenticator
+}

-    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
-            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, cache)) {
+std::string_view transitional_authenticator::qualified_java_name() const {
+    return "com.scylladb.auth.TransitionalAuthenticator"; // string literal, so the returned view stays valid
+}
+
+bool transitional_authenticator::require_authentication() const {
+    return true; // unconditional; the anonymous fallback lives in authenticate() and the SASL wrapper
+}
+
+authentication_option_set transitional_authenticator::supported_options() const {
+    return _authenticator->supported_options(); // same option set as the wrapped authenticator
+}
+
+authentication_option_set transitional_authenticator::alterable_options() const {
+    return _authenticator->alterable_options(); // same option set as the wrapped authenticator
+}
+
+future<authenticated_user> transitional_authenticator::authenticate(const credentials_map& credentials) const {
+    auto i = credentials.find(authenticator::USERNAME_KEY);
+    if ((i == credentials.end() || i->second.empty())
+            && (!credentials.contains(PASSWORD_KEY) || credentials.at(PASSWORD_KEY).empty())) {
+        // neither a username nor a password was supplied (missing or empty): treat the client as the anonymous user
+        return make_ready_future<authenticated_user>(anonymous_user());
    }
-    transitional_authenticator(std::unique_ptr<authenticator> a)
-            : _authenticator(std::move(a)) {
-    }
-
-    virtual future<> start() override {
-        return _authenticator->start();
-    }
-
-    virtual future<> stop() override {
-        return _authenticator->stop();
-    }
-
-    virtual std::string_view qualified_java_name() const override {
-        return transitional_authenticator_name();
-    }
-
-    virtual bool require_authentication() const override {
-        return true;
-    }
-
-    virtual authentication_option_set supported_options() const override {
-        return _authenticator->supported_options();
-    }
-
-    virtual authentication_option_set alterable_options() const override {
-        return _authenticator->alterable_options();
-    }
-
-    virtual future<authenticated_user> authenticate(const credentials_map& credentials) const override {
-        auto i = credentials.find(authenticator::USERNAME_KEY);
-        if ((i == credentials.end() || i->second.empty())
-                && (!credentials.contains(PASSWORD_KEY) || credentials.at(PASSWORD_KEY).empty())) {
+    return make_ready_future().then([this, &credentials] {
+        return _authenticator->authenticate(credentials);
+    }).handle_exception([](auto ep) {
+        try {
+            std::rethrow_exception(ep);
+        } catch (const exceptions::authentication_exception&) {
            // return anon user
            return make_ready_future<authenticated_user>(anonymous_user());
        }
-        return make_ready_future().then([this, &credentials] {
-            return _authenticator->authenticate(credentials);
-        }).handle_exception([](auto ep) {
-            try {
-                std::rethrow_exception(ep);
-            } catch (const exceptions::authentication_exception&) {
-                // return anon user
-                return make_ready_future<authenticated_user>(anonymous_user());
-            }
-        });
-    }
-
-    virtual future<> create(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override {
-        return _authenticator->create(role_name, options, mc);
-    }
-
-    virtual future<> alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override {
-        return _authenticator->alter(role_name, options, mc);
-    }
-
-    virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override {
-        return _authenticator->drop(role_name, mc);
-    }
-
-    virtual future<custom_options> query_custom_options(std::string_view role_name) const override {
-        return _authenticator->query_custom_options(role_name);
-    }
-
-    virtual bool uses_password_hashes() const override {
-        return _authenticator->uses_password_hashes();
-    }
-
-    virtual future<std::optional<sstring>> get_password_hash(std::string_view role_name) const override {
-        return _authenticator->get_password_hash(role_name);
-    }
-
-    virtual const resource_set& protected_resources() const override {
-        return _authenticator->protected_resources();
-    }
-
-    virtual ::shared_ptr<sasl_challenge> new_sasl_challenge() const override {
-        class sasl_wrapper : public sasl_challenge {
-        public:
-            sasl_wrapper(::shared_ptr<sasl_challenge> sasl)
-                    : _sasl(std::move(sasl)) {
-            }
-
-            virtual bytes evaluate_response(bytes_view client_response) override {
-                try {
-                    return _sasl->evaluate_response(client_response);
-                } catch (const exceptions::authentication_exception&) {
-                    _complete = true;
-                    return {};
-                }
-            }
-
-            virtual bool is_complete() const override {
-                return _complete || _sasl->is_complete();
-            }
-
-            virtual future<authenticated_user> get_authenticated_user() const override {
-                return futurize_invoke([this] {
-                    return _sasl->get_authenticated_user().handle_exception([](auto ep) {
-                        try {
-                            std::rethrow_exception(ep);
-                        } catch (const exceptions::authentication_exception&) {
-                            // return anon user
-                            return make_ready_future<authenticated_user>(anonymous_user());
-                        }
-                    });
-                });
-	    }
-
-            const sstring& get_username() const override {
-                return _sasl->get_username();
-            }
-
-        private:
-            ::shared_ptr<sasl_challenge> _sasl;
-
-            bool _complete = false;
-        };
-        return ::make_shared<sasl_wrapper>(_authenticator->new_sasl_challenge());
-    }
-
-    virtual future<> ensure_superuser_is_created() const override {
-        return _authenticator->ensure_superuser_is_created();
-    }
-};
-
-class transitional_authorizer : public authorizer {
-    std::unique_ptr<authorizer> _authorizer;
-
-public:
-    transitional_authorizer(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
-            : transitional_authorizer(std::make_unique<default_authorizer>(qp, g0, mm)) {
-    }
-    transitional_authorizer(std::unique_ptr<authorizer> a)
-            : _authorizer(std::move(a)) {
-    }
-
-    ~transitional_authorizer() {
-    }
-
-    virtual future<> start() override {
-        return _authorizer->start();
-    }
-
-    virtual future<> stop() override {
-        return _authorizer->stop();
-    }
-
-    virtual std::string_view qualified_java_name() const override {
-        return transitional_authorizer_name();
-    }
-
-    virtual future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
-        static const permission_set transitional_permissions =
-                permission_set::of<
-                        permission::CREATE,
-                        permission::ALTER,
-                        permission::DROP,
-                        permission::SELECT,
-                        permission::MODIFY>();
-
-        return make_ready_future<permission_set>(transitional_permissions);
-    }
-
-    virtual future<> grant(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc)  override {
-        return _authorizer->grant(s, std::move(ps), r, mc);
-    }
-
-    virtual future<> revoke(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) override {
-        return _authorizer->revoke(s, std::move(ps), r, mc);
-    }
-
-    virtual future<std::vector<permission_details>> list_all() const override {
-        return _authorizer->list_all();
-    }
-
-    virtual future<> revoke_all(std::string_view s, ::service::group0_batch& mc) override {
-        return _authorizer->revoke_all(s, mc);
-    }
-
-    virtual future<> revoke_all(const resource& r, ::service::group0_batch& mc) override {
-        return _authorizer->revoke_all(r, mc);
-    }
-
-    virtual const resource_set& protected_resources() const override {
-        return _authorizer->protected_resources();
-    }
-};
-
+    });
 }

-//
-// To ensure correct initialization order, we unfortunately need to use string literals.
-//
+future<> transitional_authenticator::create(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) {
+    return _authenticator->create(role_name, options, mc); // forwarded unchanged, including the group0 batch
+}

-static const class_registrator<
-        auth::authenticator,
-        auth::transitional_authenticator,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&,
-        auth::cache&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");
+future<> transitional_authenticator::alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) {
+    return _authenticator->alter(role_name, options, mc); // forwarded unchanged, including the group0 batch
+}

-static const class_registrator<
-        auth::authorizer,
-        auth::transitional_authorizer,
-        cql3::query_processor&,
-        ::service::raft_group0_client&,
-        ::service::migration_manager&> transitional_authorizer_reg(auth::PACKAGE_NAME + "TransitionalAuthorizer");
+future<> transitional_authenticator::drop(std::string_view role_name, ::service::group0_batch& mc) {
+    return _authenticator->drop(role_name, mc); // forwarded unchanged, including the group0 batch
+}
+
+future<custom_options> transitional_authenticator::query_custom_options(std::string_view role_name) const {
+    return _authenticator->query_custom_options(role_name); // answered by the wrapped authenticator
+}
+
+bool transitional_authenticator::uses_password_hashes() const {
+    return _authenticator->uses_password_hashes(); // answered by the wrapped authenticator
+}
+
+future<std::optional<sstring>> transitional_authenticator::get_password_hash(std::string_view role_name) const {
+    return _authenticator->get_password_hash(role_name); // answered by the wrapped authenticator
+}
+
+const resource_set& transitional_authenticator::protected_resources() const {
+    return _authenticator->protected_resources(); // hands back the wrapped authenticator's reference as-is
+}
+
+::shared_ptr<sasl_challenge> transitional_authenticator::new_sasl_challenge() const {
+    class sasl_wrapper : public sasl_challenge { // decorator around the wrapped authenticator's challenge
+    public:
+        sasl_wrapper(::shared_ptr<sasl_challenge> sasl)
+                : _sasl(std::move(sasl)) {
+        }
+
+        virtual bytes evaluate_response(bytes_view client_response) override {
+            try {
+                return _sasl->evaluate_response(client_response);
+            } catch (const exceptions::authentication_exception&) {
+                _complete = true; // swallow the failure and mark the exchange as finished
+                return {};
+            }
+        }
+
+        virtual bool is_complete() const override {
+            return _complete || _sasl->is_complete(); // finished by a swallowed failure or by the inner challenge
+        }
+
+        virtual future<authenticated_user> get_authenticated_user() const override {
+            return futurize_invoke([this] {
+                return _sasl->get_authenticated_user().handle_exception([](auto ep) {
+                    try {
+                        std::rethrow_exception(ep);
+                    } catch (const exceptions::authentication_exception&) {
+                        // authentication failed: fall back to the anonymous user (other exception types are not caught here)
+                        return make_ready_future<authenticated_user>(anonymous_user());
+                    }
+                });
+            });
+        }
+
+        const sstring& get_username() const override {
+            return _sasl->get_username(); // always the inner challenge's username, even after a swallowed failure
+        }
+
+    private:
+        ::shared_ptr<sasl_challenge> _sasl; // the challenge being decorated
+
+        bool _complete = false; // set once evaluate_response() has swallowed an authentication_exception
+    };
+    return ::make_shared<sasl_wrapper>(_authenticator->new_sasl_challenge());
+}
+
+future<> transitional_authenticator::ensure_superuser_is_created() const {
+    return _authenticator->ensure_superuser_is_created(); // delegated to the wrapped authenticator
+}
+
+transitional_authorizer::transitional_authorizer(cql3::query_processor& qp)
+        : transitional_authorizer(std::make_unique<default_authorizer>(qp)) { // default backend: a default_authorizer
+}
+
+transitional_authorizer::transitional_authorizer(std::unique_ptr<authorizer> a)
+        : _authorizer(std::move(a)) { // takes ownership of the authorizer being wrapped
+}
+
+transitional_authorizer::~transitional_authorizer() { // empty body; the unique_ptr member releases the wrapped authorizer
+}
+
+future<> transitional_authorizer::start() {
+    return _authorizer->start(); // startup is delegated to the wrapped authorizer
+}
+
+future<> transitional_authorizer::stop() {
+    return _authorizer->stop(); // shutdown is delegated to the wrapped authorizer
+}
+
+std::string_view transitional_authorizer::qualified_java_name() const {
+    return "com.scylladb.auth.TransitionalAuthorizer"; // string literal, so the returned view stays valid
+}
+
+future<permission_set> transitional_authorizer::authorize(const role_or_anonymous&, const resource&) const { // both parameters are unnamed and unused
+    static const permission_set transitional_permissions = // function-local static: built once, reused by every call
+            permission_set::of<
+                    permission::CREATE,
+                    permission::ALTER,
+                    permission::DROP,
+                    permission::SELECT,
+                    permission::MODIFY>();
+
+    return make_ready_future<permission_set>(transitional_permissions); // the wrapped authorizer is not consulted
+}
+
+future<> transitional_authorizer::grant(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) {
+    return _authorizer->grant(s, std::move(ps), r, mc); // forwarded to the wrapped authorizer
+}
+
+future<> transitional_authorizer::revoke(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) {
+    return _authorizer->revoke(s, std::move(ps), r, mc); // forwarded to the wrapped authorizer
+}
+
+future<std::vector<permission_details>> transitional_authorizer::list_all() const {
+    return _authorizer->list_all(); // lists what the wrapped authorizer reports, not the fixed set from authorize()
+}
+
+future<> transitional_authorizer::revoke_all(std::string_view s, ::service::group0_batch& mc) {
+    return _authorizer->revoke_all(s, mc); // string_view overload, forwarded to the wrapped authorizer
+}
+
+future<> transitional_authorizer::revoke_all(const resource& r, ::service::group0_batch& mc) {
+    return _authorizer->revoke_all(r, mc); // resource overload, forwarded to the wrapped authorizer
+}
+
+const resource_set& transitional_authorizer::protected_resources() const {
+    return _authorizer->protected_resources(); // hands back the wrapped authorizer's reference as-is
+}
+
+}
--- a/auth/transitional.hh
+++ b/auth/transitional.hh
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2026-present ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+#pragma once
+
+#include "auth/authenticator.hh"
+#include "auth/authorizer.hh"
+#include "auth/cache.hh"
+
+namespace cql3 {
+class query_processor;
+}
+
+namespace service {
+class raft_group0_client;
+class migration_manager;
+}
+
+namespace auth {
+
+///
+/// Authenticator decorator: authenticate() and the SASL challenge fall back to the anonymous user when credentials
+/// are missing or rejected; most other calls are forwarded to the wrapped authenticator. Used for migration scenarios.
+///
+class transitional_authenticator : public authenticator {
+    std::unique_ptr<authenticator> _authenticator; // the wrapped authenticator; owned by this object
+
+public:
+    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache);
+    transitional_authenticator(std::unique_ptr<authenticator> a); // NOTE(review): single-argument constructor is not explicit
+
+    virtual future<> start() override;
+    virtual future<> stop() override;
+    virtual std::string_view qualified_java_name() const override;
+    virtual bool require_authentication() const override;
+    virtual authentication_option_set supported_options() const override;
+    virtual authentication_option_set alterable_options() const override;
+    virtual future<authenticated_user> authenticate(const credentials_map& credentials) const override; // may resolve to the anonymous user
+    virtual future<> create(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override;
+    virtual future<> alter(std::string_view role_name, const authentication_options& options, ::service::group0_batch& mc) override;
+    virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override;
+    virtual future<custom_options> query_custom_options(std::string_view role_name) const override;
+    virtual bool uses_password_hashes() const override;
+    virtual future<std::optional<sstring>> get_password_hash(std::string_view role_name) const override;
+    virtual const resource_set& protected_resources() const override;
+    virtual ::shared_ptr<sasl_challenge> new_sasl_challenge() const override; // returns a challenge that decorates the wrapped one
+    virtual future<> ensure_superuser_is_created() const override;
+};
+
+///
+/// Authorizer decorator: authorize() returns one fixed permission set for every role and resource, while
+/// start/stop, grant/revoke, list_all and protected_resources go to the wrapped authorizer. Used for migration scenarios.
+///
+class transitional_authorizer : public authorizer {
+    std::unique_ptr<authorizer> _authorizer; // the wrapped authorizer; owned by this object
+
+public:
+    transitional_authorizer(cql3::query_processor& qp); // NOTE(review): single-argument constructor is not explicit
+    transitional_authorizer(std::unique_ptr<authorizer> a); // NOTE(review): single-argument constructor is not explicit
+    ~transitional_authorizer();
+
+    virtual future<> start() override;
+    virtual future<> stop() override;
+    virtual std::string_view qualified_java_name() const override;
+    virtual future<permission_set> authorize(const role_or_anonymous&, const resource&) const override; // arguments are ignored by the implementation
+    virtual future<> grant(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) override;
+    virtual future<> revoke(std::string_view s, permission_set ps, const resource& r, ::service::group0_batch& mc) override;
+    virtual future<std::vector<permission_details>> list_all() const override;
+    virtual future<> revoke_all(std::string_view s, ::service::group0_batch& mc) override;
+    virtual future<> revoke_all(const resource& r, ::service::group0_batch& mc) override;
+    virtual const resource_set& protected_resources() const override;
+};
+
+} // namespace auth
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -10,24 +10,15 @@
 #include <random>
 #include <unordered_set>
 #include <algorithm>
-#include <seastar/core/sleep.hh>
 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/maybe_yield.hh>
-#include <seastar/util/later.hh>

-#include "gms/endpoint_state.hh"
-#include "gms/versioned_value.hh"
 #include "keys/keys.hh"
 #include "replica/database.hh"
 #include "db/system_keyspace.hh"
-#include "db/system_distributed_keyspace.hh"
 #include "dht/token-sharding.hh"
 #include "locator/token_metadata.hh"
 #include "types/set.hh"
-#include "gms/application_state.hh"
-#include "gms/inet_address.hh"
-#include "gms/gossiper.hh"
-#include "gms/feature_service.hh"
 #include "utils/assert.hh"
 #include "utils/error_injection.hh"
 #include "utils/UUID_gen.hh"
@@ -41,16 +32,6 @@

 extern logging::logger cdc_log;

-static int get_shard_count(const locator::host_id& endpoint, const gms::gossiper& g) {
-    auto ep_state = g.get_application_state_ptr(endpoint, gms::application_state::SHARD_COUNT);
-    return ep_state ? std::stoi(ep_state->value()) : -1;
-}
-
-static unsigned get_sharding_ignore_msb(const locator::host_id& endpoint, const gms::gossiper& g) {
-    auto ep_state = g.get_application_state_ptr(endpoint, gms::application_state::IGNORE_MSB_BITS);
-    return ep_state ? std::stoi(ep_state->value()) : 0;
-}
-
 namespace db {
    extern thread_local data_type cdc_streams_set_type;
 }
@@ -225,12 +206,6 @@ static std::vector<stream_id> create_stream_ids(
    return result;
 }

-bool should_propose_first_generation(const locator::host_id& my_host_id, const gms::gossiper& g) {
-    return g.for_each_endpoint_state_until([&] (const gms::endpoint_state& eps) {
-        return stop_iteration(my_host_id < eps.get_host_id());
-    }) == stop_iteration::no;
-}
-
 bool is_cdc_generation_optimal(const cdc::topology_description& gen, const locator::token_metadata& tm) {
    if (tm.sorted_tokens().size() != gen.entries().size()) {
        // We probably have garbage streams from old generations
@@ -330,38 +305,6 @@ future<utils::chunked_vector<mutation>> get_cdc_generation_mutations_v3(
    co_return co_await get_common_cdc_generation_mutations(s, pkey, std::move(get_ckey), desc, mutation_size_threshold, ts);
 }

-// non-static for testing
-size_t limit_of_streams_in_topology_description() {
-    // Each stream takes 16B and we don't want to exceed 4MB so we can have
-    // at most 262144 streams but not less than 1 per vnode.
-    return 4 * 1024 * 1024 / 16;
-}
-
-// non-static for testing
-topology_description limit_number_of_streams_if_needed(topology_description&& desc) {
-    uint64_t streams_count = 0;
-    for (auto& tr_desc : desc.entries()) {
-        streams_count += tr_desc.streams.size();
-    }
-
-    size_t limit = std::max(limit_of_streams_in_topology_description(), desc.entries().size());
-    if (limit >= streams_count) {
-        return std::move(desc);
-    }
-    size_t streams_per_vnode_limit = limit / desc.entries().size();
-    auto entries = std::move(desc).entries();
-    auto start = entries.back().token_range_end;
-    for (size_t idx = 0; idx < entries.size(); ++idx) {
-        auto end = entries[idx].token_range_end;
-        if (entries[idx].streams.size() > streams_per_vnode_limit) {
-            entries[idx].streams =
-                create_stream_ids(idx, start, end, streams_per_vnode_limit, entries[idx].sharding_ignore_msb);
-        }
-        start = end;
-    }
-    return topology_description(std::move(entries));
-}
-
 // Compute a set of tokens that split the token ring into vnodes.
 static auto get_tokens(const std::unordered_set<dht::token>& bootstrap_tokens, const locator::token_metadata_ptr tmptr) {
    auto tokens = tmptr->sorted_tokens();
@@ -419,364 +362,6 @@ db_clock::time_point new_generation_timestamp(bool add_delay, std::chrono::milli
    return ts;
 }

-future<cdc::generation_id> generation_service::legacy_make_new_generation(const std::unordered_set<dht::token>& bootstrap_tokens, bool add_delay) {
-    const locator::token_metadata_ptr tmptr = _token_metadata.get();
-
-    // Fetch sharding parameters for a node that owns vnode ending with this token
-    // using gossiped application states.
-    auto get_sharding_info = [&] (dht::token end) -> std::pair<size_t, uint8_t> {
-        if (bootstrap_tokens.contains(end)) {
-            return {smp::count, _cfg.ignore_msb_bits};
-        } else {
-            auto endpoint = tmptr->get_endpoint(end);
-            if (!endpoint) {
-                throw std::runtime_error(
-                        format("Can't find endpoint for token {}", end));
-            }
-            auto sc = get_shard_count(*endpoint, _gossiper);
-            return {sc > 0 ? sc : 1, get_sharding_ignore_msb(*endpoint, _gossiper)};
-        }
-    };
-
-    auto uuid = utils::make_random_uuid();
-    auto gen = make_new_generation_description(bootstrap_tokens, get_sharding_info, tmptr);
-
-    // Our caller should ensure that there are normal tokens in the token ring.
-    auto normal_token_owners = tmptr->count_normal_token_owners();
-    SCYLLA_ASSERT(normal_token_owners);
-
-    if (_feature_service.cdc_generations_v2) {
-        cdc_log.info("Inserting new generation data at UUID {}", uuid);
-        // This may take a while.
-        co_await _sys_dist_ks.local().insert_cdc_generation(uuid, gen, { normal_token_owners });
-
-        // Begin the race.
-        cdc::generation_id_v2 gen_id{new_generation_timestamp(add_delay, _cfg.ring_delay), uuid};
-
-        cdc_log.info("New CDC generation: {}", gen_id);
-        co_return gen_id;
-    }
-
-    // The CDC_GENERATIONS_V2 feature is not enabled: some nodes may still not understand the V2 format.
-    // We must create a generation in the old format.
-
-    // If the cluster is large we may end up with a generation that contains
-    // large number of streams. This is problematic because we store the
-    // generation in a single row (V1 format). For a generation with large number of rows
-    // this will lead to a row that can be as big as 32MB. This is much more
-    // than the limit imposed by commitlog_segment_size_in_mb. If the size of
-    // the row that describes a new generation grows above
-    // commitlog_segment_size_in_mb, the write will fail and the new node won't
-    // be able to join. To avoid such problem we make sure that such row is
-    // always smaller than 4MB. We do that by removing some CDC streams from
-    // each vnode if the total number of streams is too large.
-    gen = limit_number_of_streams_if_needed(std::move(gen));
-
-    cdc_log.warn(
-        "Creating a new CDC generation in the old storage format due to a partially upgraded cluster:"
-        " the CDC_GENERATIONS_V2 feature is known by this node, but not enabled in the cluster."
-        " The old storage format forces us to create a suboptimal generation."
-        " It is recommended to finish the upgrade and then create a new generation either by bootstrapping"
-        " a new node or running the checkAndRepairCdcStreams nodetool command.");
-
-    // Begin the race.
-    cdc::generation_id_v1 gen_id{new_generation_timestamp(add_delay, _cfg.ring_delay)};
-
-    co_await _sys_dist_ks.local().insert_cdc_topology_description(gen_id, std::move(gen), { normal_token_owners });
-
-    cdc_log.info("New CDC generation: {}", gen_id);
-    co_return gen_id;
-}
-
-/* Retrieves CDC streams generation timestamp from the given endpoint's application state (broadcasted through gossip).
- * We might be during a rolling upgrade, so the timestamp might not be there (if the other node didn't upgrade yet),
- * but if the cluster already supports CDC, then every newly joining node will propose a new CDC generation,
- * which means it will gossip the generation's timestamp.
- */
-static std::optional<cdc::generation_id> get_generation_id_for(const locator::host_id& endpoint, const gms::endpoint_state& eps) {
-    const auto* gen_id_ptr = eps.get_application_state_ptr(gms::application_state::CDC_GENERATION_ID);
-    if (!gen_id_ptr) {
-        return std::nullopt;
-    }
-    auto gen_id_string = gen_id_ptr->value();
-    cdc_log.trace("endpoint={}, gen_id_string={}", endpoint, gen_id_string);
-    return gms::versioned_value::cdc_generation_id_from_string(gen_id_string);
-}
-
-static future<std::optional<cdc::topology_description>> retrieve_generation_data_v2(
-        cdc::generation_id_v2 id,
-        db::system_keyspace& sys_ks,
-        db::system_distributed_keyspace& sys_dist_ks) {
-    auto cdc_gen = co_await sys_dist_ks.read_cdc_generation(id.id);
-
-    if (!cdc_gen && id.id.is_timestamp()) {
-        // If we entered legacy mode due to recovery, we (or some other node)
-        // might gossip about a generation that was previously propagated
-        // through raft. If that's the case, it will sit in
-        // the system.cdc_generations_v3 table.
-        //
-        // If the provided id is not a timeuuid, we don't want to query
-        // the system.cdc_generations_v3 table. This table stores generation
-        // ids as timeuuids. If the provided id is not a timeuuid, the
-        // generation cannot be in system.cdc_generations_v3. Also, the query
-        // would fail with a marshaling error.
-        cdc_gen = co_await sys_ks.read_cdc_generation_opt(id.id);
-    }
-
-    co_return cdc_gen;
-}
-
-static future<std::optional<cdc::topology_description>> retrieve_generation_data(
-        cdc::generation_id gen_id,
-        db::system_keyspace& sys_ks,
-        db::system_distributed_keyspace& sys_dist_ks,
-        db::system_distributed_keyspace::context ctx) {
-    return std::visit(make_visitor(
-    [&] (const cdc::generation_id_v1& id) {
-        return sys_dist_ks.read_cdc_topology_description(id, ctx);
-    },
-    [&] (const cdc::generation_id_v2& id) {
-        return retrieve_generation_data_v2(id, sys_ks, sys_dist_ks);
-    }
-    ), gen_id);
-}
-
-static future<> do_update_streams_description(
-        cdc::generation_id gen_id,
-        db::system_keyspace& sys_ks,
-        db::system_distributed_keyspace& sys_dist_ks,
-        db::system_distributed_keyspace::context ctx) {
-    if (co_await sys_dist_ks.cdc_desc_exists(get_ts(gen_id), ctx)) {
-        cdc_log.info("Generation {}: streams description table already updated.", gen_id);
-        co_return;
-    }
-
-    // We might race with another node also inserting the description, but that's ok. It's an idempotent operation.
-
-    auto topo = co_await retrieve_generation_data(gen_id, sys_ks, sys_dist_ks, ctx);
-    if (!topo) {
-        throw no_generation_data_exception(gen_id);
-    }
-
-    co_await sys_dist_ks.create_cdc_desc(get_ts(gen_id), *topo, ctx);
-    cdc_log.info("CDC description table successfully updated with generation {}.", gen_id);
-}
-
-/* Inform CDC users about a generation of streams (identified by the given timestamp)
- * by inserting it into the cdc_streams table.
- *
- * Assumes that the cdc_generation_descriptions table contains this generation.
- *
- * Returning from this function does not mean that the table update was successful: the function
- * might run an asynchronous task in the background.
- */
-static future<> update_streams_description(
-        cdc::generation_id gen_id,
-        db::system_keyspace& sys_ks,
-        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
-        noncopyable_function<unsigned()> get_num_token_owners,
-        abort_source& abort_src) {
-    try {
-        co_await do_update_streams_description(gen_id, sys_ks, *sys_dist_ks, { get_num_token_owners() });
-    } catch (...) {
-        cdc_log.warn(
-            "Could not update CDC description table with generation {}: {}. Will retry in the background.",
-            gen_id, std::current_exception());
-
-        // It is safe to discard this future: we keep system distributed keyspace alive.
-        (void)(([] (cdc::generation_id gen_id,
-                    db::system_keyspace& sys_ks,
-                    shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
-                    noncopyable_function<unsigned()> get_num_token_owners,
-                    abort_source& abort_src) -> future<> {
-            while (true) {
-                try {
-                    co_await sleep_abortable(std::chrono::seconds(60), abort_src);
-                } catch (seastar::sleep_aborted&) {
-                    cdc_log.warn( "Aborted update CDC description table with generation {}", gen_id);
-                    co_return;
-                }
-                try {
-                    co_await do_update_streams_description(gen_id, sys_ks, *sys_dist_ks, { get_num_token_owners() });
-                    co_return;
-                } catch (...) {
-                    cdc_log.warn(
-                        "Could not update CDC description table with generation {}: {}. Will try again.",
-                        gen_id, std::current_exception());
-                }
-            }
-        })(gen_id, sys_ks, std::move(sys_dist_ks), std::move(get_num_token_owners), abort_src));
-    }
-}
-
-static db_clock::time_point as_timepoint(const utils::UUID& uuid) {
-    return db_clock::time_point(utils::UUID_gen::unix_timestamp(uuid));
-}
-
-static future<std::vector<db_clock::time_point>> get_cdc_desc_v1_timestamps(
-        db::system_distributed_keyspace& sys_dist_ks,
-        abort_source& abort_src,
-        const noncopyable_function<unsigned()>& get_num_token_owners) {
-    while (true) {
-        try {
-            co_return co_await sys_dist_ks.get_cdc_desc_v1_timestamps({ get_num_token_owners() });
-        } catch (...) {
-            cdc_log.warn(
-                    "Failed to retrieve generation timestamps for rewriting: {}. Retrying in 60s.",
-                    std::current_exception());
-        }
-        co_await sleep_abortable(std::chrono::seconds(60), abort_src);
-    }
-}
-
-// Contains a CDC log table's creation time (extracted from its schema's id)
-// and its CDC TTL setting.
-struct time_and_ttl {
-    db_clock::time_point creation_time;
-    int ttl;
-};
-
-/*
- * See `maybe_rewrite_streams_descriptions`.
- * This is the long-running-in-the-background part of that function.
- * It returns the timestamp of the last rewritten generation (if any).
- */
-static future<std::optional<cdc::generation_id_v1>> rewrite_streams_descriptions(
-        std::vector<time_and_ttl> times_and_ttls,
-        db::system_keyspace& sys_ks,
-        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
-        noncopyable_function<unsigned()> get_num_token_owners,
-        abort_source& abort_src) {
-    cdc_log.info("Retrieving generation timestamps for rewriting...");
-    auto tss = co_await get_cdc_desc_v1_timestamps(*sys_dist_ks, abort_src, get_num_token_owners);
-    cdc_log.info("Generation timestamps retrieved.");
-
-    // Find first generation timestamp such that some CDC log table may contain data before this timestamp.
-    // This predicate is monotonic w.r.t the timestamps.
-    auto now = db_clock::now();
-    std::sort(tss.begin(), tss.end());
-    auto first = std::partition_point(tss.begin(), tss.end(), [&] (db_clock::time_point ts) {
-        // partition_point finds first element that does *not* satisfy the predicate.
-        return std::none_of(times_and_ttls.begin(), times_and_ttls.end(),
-                [&] (const time_and_ttl& tat) {
-            // In this CDC log table there are no entries older than the table's creation time
-            // or (now - the table's ttl). We subtract 10s to account for some possible clock drift.
-            // If ttl is set to 0 then entries in this table never expire. In that case we look
-            // only at the table's creation time.
-            auto no_entries_older_than =
-                (tat.ttl == 0 ? tat.creation_time : std::max(tat.creation_time, now - std::chrono::seconds(tat.ttl)))
-                    - std::chrono::seconds(10);
-            return no_entries_older_than < ts;
-        });
-    });
-
-    // Find first generation timestamp such that some CDC log table may contain data in this generation.
-    // This and all later generations need to be written to the new streams table.
-    if (first != tss.begin()) {
-        --first;
-    }
-
-    if (first == tss.end()) {
-        cdc_log.info("No generations to rewrite.");
-        co_return std::nullopt;
-    }
-
-    cdc_log.info("First generation to rewrite: {}", *first);
-
-    bool each_success = true;
-    co_await max_concurrent_for_each(first, tss.end(), 10, [&] (db_clock::time_point ts) -> future<> {
-        while (true) {
-            try {
-                co_return co_await do_update_streams_description(cdc::generation_id_v1{ts}, sys_ks, *sys_dist_ks, { get_num_token_owners() });
-            } catch (const no_generation_data_exception& e) {
-                cdc_log.error("Failed to rewrite streams for generation {}: {}. Giving up.", ts, e);
-                each_success = false;
-                co_return;
-            } catch (...) {
-                cdc_log.warn("Failed to rewrite streams for generation {}: {}. Retrying in 60s.", ts, std::current_exception());
-            }
-            co_await sleep_abortable(std::chrono::seconds(60), abort_src);
-        }
-    });
-
-    if (each_success) {
-        cdc_log.info("Rewriting stream tables finished successfully.");
-    } else {
-        cdc_log.info("Rewriting stream tables finished, but some generations could not be rewritten (check the logs).");
-    }
-
-    if (first != tss.end()) {
-        co_return cdc::generation_id_v1{*std::prev(tss.end())};
-    }
-
-    co_return std::nullopt;
-}
-
-future<> generation_service::maybe_rewrite_streams_descriptions() {
-    if (!_db.has_schema(_sys_dist_ks.local().NAME, _sys_dist_ks.local().CDC_DESC_V1)) {
-        // This cluster never went through a Scylla version which used this table
-        // or the user deleted the table. Nothing to do.
-        co_return;
-    }
-
-    if (co_await _sys_ks.local().cdc_is_rewritten()) {
-        co_return;
-    }
-
-    if (_cfg.dont_rewrite_streams) {
-        cdc_log.warn("Stream rewriting disabled. Manual administrator intervention may be required...");
-        co_return;
-    }
-
-    // For each CDC log table get the TTL setting (from CDC options) and the table's creation time
-    std::vector<time_and_ttl> times_and_ttls;
-    _db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> t) {
-        auto& s = *t->schema();
-        auto base = cdc::get_base_table(_db, s.ks_name(), s.cf_name());
-        if (!base) {
-            // Not a CDC log table.
-            return;
-        }
-        auto& cdc_opts = base->cdc_options();
-        if (!cdc_opts.enabled()) {
-            // This table is named like a CDC log table but it's not one.
-            return;
-        }
-
-        times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id().uuid()), cdc_opts.ttl()});
-    });
-
-    if (times_and_ttls.empty()) {
-        // There's no point in rewriting old generations' streams (they don't contain any data).
-        cdc_log.info("No CDC log tables present, not rewriting stream tables.");
-        co_return co_await _sys_ks.local().cdc_set_rewritten(std::nullopt);
-    }
-
-    auto get_num_token_owners = [tm = _token_metadata.get()] { return tm->count_normal_token_owners(); };
-
-    // This code is racing with node startup. At this point, we're most likely still waiting for gossip to settle
-    // and some nodes that are UP may still be marked as DOWN by us.
-    // Let's sleep a bit to increase the chance that the first attempt at rewriting succeeds (it's still ok if
-    // it doesn't - we'll retry - but it's nice if we succeed without any warnings).
-    co_await sleep_abortable(std::chrono::seconds(10), _abort_src);
-
-    cdc_log.info("Rewriting stream tables in the background...");
-    auto last_rewritten = co_await rewrite_streams_descriptions(
-            std::move(times_and_ttls),
-            _sys_ks.local(),
-            _sys_dist_ks.local_shared(),
-            std::move(get_num_token_owners),
-            _abort_src);
-
-    co_await _sys_ks.local().cdc_set_rewritten(last_rewritten);
-}
-
-static void assert_shard_zero(const sstring& where) {
-    if (this_shard_id() != 0) {
-        on_internal_error(cdc_log, format("`{}`: must be run on shard 0", where));
-    }
-}
-
 class and_reducer {
 private:
    bool _result = true;
@@ -803,206 +388,26 @@ public:
    }
 };

-class generation_handling_nonfatal_exception : public std::runtime_error {
-    using std::runtime_error::runtime_error;
-};
-
-constexpr char could_not_retrieve_msg_template[]
-        = "Could not retrieve CDC streams with timestamp {} upon gossip event. Reason: \"{}\". Action: {}.";
-
 generation_service::generation_service(
-            config cfg, gms::gossiper& g, sharded<db::system_distributed_keyspace>& sys_dist_ks,
+            config cfg,
            sharded<db::system_keyspace>& sys_ks,
-            abort_source& abort_src, const locator::shared_token_metadata& stm, gms::feature_service& f,
-            replica::database& db,
-            std::function<bool()> raft_topology_change_enabled)
+            replica::database& db)
        : _cfg(std::move(cfg))
-        , _gossiper(g)
-        , _sys_dist_ks(sys_dist_ks)
        , _sys_ks(sys_ks)
-        , _abort_src(abort_src)
-        , _token_metadata(stm)
-        , _feature_service(f)
        , _db(db)
-        , _raft_topology_change_enabled(std::move(raft_topology_change_enabled))
 {
 }

 future<> generation_service::stop() {
-    try {
-        co_await std::move(_cdc_streams_rewrite_complete);
-    } catch (...) {
-        cdc_log.error("CDC stream rewrite failed: ", std::current_exception());
-    }
-
-    if (_joined && (this_shard_id() == 0)) {
-        co_await leave_ring();
-    }
-
    _stopped = true;
+    return make_ready_future<>();
 }

 generation_service::~generation_service() {
    SCYLLA_ASSERT(_stopped);
 }

-future<> generation_service::after_join(std::optional<cdc::generation_id>&& startup_gen_id) {
-    assert_shard_zero(__PRETTY_FUNCTION__);
-
-    _gen_id = std::move(startup_gen_id);
-    _gossiper.register_(shared_from_this());
-
-    _joined = true;
-
-    // Retrieve the latest CDC generation seen in gossip (if any).
-    co_await legacy_scan_cdc_generations();
-
-    // Ensure that the new CDC stream description table has all required streams.
-    // See the function's comment for details.
-    //
-    // Since this depends on the entire cluster (and therefore we cannot guarantee
-    // timely completion), run it in the background and wait for it in stop().
-    _cdc_streams_rewrite_complete = maybe_rewrite_streams_descriptions();
-}
-
-future<> generation_service::leave_ring() {
-    assert_shard_zero(__PRETTY_FUNCTION__);
-    _joined = false;
-    co_await _gossiper.unregister_(shared_from_this());
-}
-
-future<> generation_service::on_join(gms::inet_address ep, locator::host_id id, gms::endpoint_state_ptr ep_state, gms::permit_id pid) {
-    return on_change(ep, id, ep_state->get_application_state_map(), pid);
-}
-
-future<> generation_service::on_change(gms::inet_address ep, locator::host_id id, const gms::application_state_map& states, gms::permit_id pid) {
-    assert_shard_zero(__PRETTY_FUNCTION__);
-
-    if (_raft_topology_change_enabled()) {
-        return make_ready_future<>();
-    }
-
-    return on_application_state_change(ep, id, states, gms::application_state::CDC_GENERATION_ID, pid, [this] (gms::inet_address ep, locator::host_id id, const gms::versioned_value& v, gms::permit_id) {
-        auto gen_id = gms::versioned_value::cdc_generation_id_from_string(v.value());
-        cdc_log.debug("Endpoint: {}, CDC generation ID change: {}", ep, gen_id);
-
-        return legacy_handle_cdc_generation(gen_id);
-    });
-}
-
-future<> generation_service::check_and_repair_cdc_streams() {
-    // FIXME: support Raft group 0-based topology changes
-    if (!_joined) {
-        throw std::runtime_error("check_and_repair_cdc_streams: node not initialized yet");
-    }
-
-    std::optional<cdc::generation_id> latest = _gen_id;
-    _gossiper.for_each_endpoint_state([&] (const gms::endpoint_state& state) {
-        auto addr = state.get_host_id();
-        if (_gossiper.is_left(addr)) {
-            cdc_log.info("check_and_repair_cdc_streams ignored node {} because it is in LEFT state", addr);
-            return;
-        }
-        if (!_gossiper.is_normal(addr)) {
-            throw std::runtime_error(fmt::format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
-                    " ({} is in state {})", addr, _gossiper.get_gossip_status(state)));
-        }
-
-        const auto gen_id = get_generation_id_for(addr, state);
-        if (!latest || (gen_id && get_ts(*gen_id) > get_ts(*latest))) {
-            latest = gen_id;
-        }
-    });
-
-    auto tmptr = _token_metadata.get();
-    auto sys_dist_ks = get_sys_dist_ks();
-
-    bool should_regenerate = false;
-
-    if (!latest) {
-        cdc_log.warn("check_and_repair_cdc_streams: no generation observed in gossip");
-        should_regenerate = true;
-    } else if (std::holds_alternative<cdc::generation_id_v1>(*latest)
-            && _feature_service.cdc_generations_v2) {
-        cdc_log.info(
-            "Cluster still using CDC generation storage format V1 (id: {}), even though it already understands the V2 format."
-            " Creating a new generation using V2.", *latest);
-        should_regenerate = true;
-    } else {
-        cdc_log.info("check_and_repair_cdc_streams: last generation observed in gossip: {}", *latest);
-
-        static const auto timeout_msg = "Timeout while fetching CDC topology description";
-        static const auto topology_read_error_note = "Note: this is likely caused by"
-                " node(s) being down or unreachable. It is recommended to check the network and"
-                " restart/remove the failed node(s), then retry checkAndRepairCdcStreams command";
-        static const auto exception_translating_msg = "Translating the exception to `request_execution_exception`";
-
-        std::optional<topology_description> gen;
-        try {
-            gen = co_await retrieve_generation_data(*latest, _sys_ks.local(), *sys_dist_ks, { tmptr->count_normal_token_owners() });
-        } catch (exceptions::request_timeout_exception& e) {
-            cdc_log.error("{}: \"{}\". {}.", timeout_msg, e.what(), exception_translating_msg);
-            throw exceptions::request_execution_exception(exceptions::exception_code::READ_TIMEOUT,
-                    format("{}. {}.", timeout_msg, topology_read_error_note));
-        } catch (exceptions::unavailable_exception& e) {
-            static const auto unavailable_msg = "Node(s) unavailable while fetching CDC topology description";
-            cdc_log.error("{}: \"{}\". {}.", unavailable_msg, e.what(), exception_translating_msg);
-            throw exceptions::request_execution_exception(exceptions::exception_code::UNAVAILABLE,
-                    format("{}. {}.", unavailable_msg, topology_read_error_note));
-        } catch (...) {
-            const auto ep = std::current_exception();
-            if (is_timeout_exception(ep)) {
-                cdc_log.error("{}: \"{}\". {}.", timeout_msg, ep, exception_translating_msg);
-                throw exceptions::request_execution_exception(exceptions::exception_code::READ_TIMEOUT,
-                        format("{}. {}.", timeout_msg, topology_read_error_note));
-            }
-            // On exotic errors proceed with regeneration
-            cdc_log.error("Exception while reading CDC topology description: \"{}\". Regenerating streams anyway.", ep);
-            should_regenerate = true;
-        }
-
-        if (!gen) {
-            cdc_log.error(
-                "Could not find CDC generation with timestamp {} in distributed system tables (current time: {}),"
-                " even though some node gossiped about it.",
-                latest, db_clock::now());
-            should_regenerate = true;
-        } else if (!is_cdc_generation_optimal(*gen, *tmptr)) {
-            should_regenerate = true;
-            cdc_log.info("CDC generation {} needs repair, regenerating", latest);
-        }
-    }
-
-    if (!should_regenerate) {
-        if (latest != _gen_id) {
-            co_await legacy_do_handle_cdc_generation(*latest);
-        }
-        cdc_log.info("CDC generation {} does not need repair", latest);
-        co_return;
-    }
-
-    const auto new_gen_id = co_await legacy_make_new_generation({}, true);
-
-    // Need to artificially update our STATUS so other nodes handle the generation ID change
-    // FIXME: after 0e0282cd nodes do not require a STATUS update to react to CDC generation changes.
-    // The artificial STATUS update here should eventually be removed (in a few releases).
-    auto status = _gossiper.get_this_endpoint_state_ptr()->get_application_state_ptr(gms::application_state::STATUS);
-    if (!status) {
-        cdc_log.error("Our STATUS is missing");
-        cdc_log.error("Aborting CDC generation repair due to missing STATUS");
-        co_return;
-    }
-    // Update _gen_id first, so that legacy_do_handle_cdc_generation (which will get called due to the status update)
-    // won't try to update the gossiper, which would result in a deadlock inside add_local_application_state
-    _gen_id = new_gen_id;
-    co_await _gossiper.add_local_application_state(
-            std::pair(gms::application_state::CDC_GENERATION_ID, gms::versioned_value::cdc_generation_id(new_gen_id)),
-            std::pair(gms::application_state::STATUS, *status)
-    );
-    co_await _sys_ks.local().update_cdc_generation_id(new_gen_id);
-}
-
-future<> generation_service::handle_cdc_generation(cdc::generation_id_v2 gen_id) {
+future<> generation_service::handle_cdc_generation(cdc::generation_id gen_id) {
    auto ts = get_ts(gen_id);
    if (co_await container().map_reduce(and_reducer(), [ts] (generation_service& svc) {
        return !svc._cdc_metadata.prepare(ts);
@@ -1024,171 +429,8 @@ future<> generation_service::handle_cdc_generation(cdc::generation_id_v2 gen_id)
    }
 }

-future<> generation_service::legacy_handle_cdc_generation(std::optional<cdc::generation_id> gen_id) {
-    assert_shard_zero(__PRETTY_FUNCTION__);
-
-    if (!gen_id) {
-        co_return;
-    }
-
-    if (!_sys_dist_ks.local_is_initialized() || !_sys_dist_ks.local().started()) {
-        on_internal_error(cdc_log, "Legacy handle CDC generation with sys.dist.ks. down");
-    }
-
-    // The service should not be listening for generation changes until after the node
-    // is bootstrapped and since the node leaves the ring on decommission
-
-    if (co_await container().map_reduce(and_reducer(), [ts = get_ts(*gen_id)] (generation_service& svc) {
-        return !svc._cdc_metadata.prepare(ts);
-    })) {
-        co_return;
-    }
-
-    bool using_this_gen = false;
-    try {
-        using_this_gen = co_await legacy_do_handle_cdc_generation_intercept_nonfatal_errors(*gen_id);
-    } catch (generation_handling_nonfatal_exception& e) {
-        cdc_log.warn(could_not_retrieve_msg_template, gen_id, e.what(), "retrying in the background");
-        legacy_async_handle_cdc_generation(*gen_id);
-        co_return;
-    } catch (...) {
-        cdc_log.error(could_not_retrieve_msg_template, gen_id, std::current_exception(), "not retrying");
-        co_return; // Exotic ("fatal") exception => do not retry
-    }
-
-    if (using_this_gen) {
-        cdc_log.info("Starting to use generation {}", *gen_id);
-        co_await update_streams_description(*gen_id, _sys_ks.local(), get_sys_dist_ks(),
-                [&tm = _token_metadata] { return tm.get()->count_normal_token_owners(); },
-                _abort_src);
-    }
-}
-
-void generation_service::legacy_async_handle_cdc_generation(cdc::generation_id gen_id) {
-    assert_shard_zero(__PRETTY_FUNCTION__);
-
-    (void)(([] (cdc::generation_id gen_id, shared_ptr<generation_service> svc) -> future<> {
-        while (true) {
-            co_await sleep_abortable(std::chrono::seconds(5), svc->_abort_src);
-
-            try {
-                bool using_this_gen = co_await svc->legacy_do_handle_cdc_generation_intercept_nonfatal_errors(gen_id);
-                if (using_this_gen) {
-                    cdc_log.info("Starting to use generation {}", gen_id);
-                    co_await update_streams_description(gen_id, svc->_sys_ks.local(), svc->get_sys_dist_ks(),
-                            [&tm = svc->_token_metadata] { return tm.get()->count_normal_token_owners(); },
-                            svc->_abort_src);
-                }
-                co_return;
-            } catch (generation_handling_nonfatal_exception& e) {
-                cdc_log.warn(could_not_retrieve_msg_template, gen_id, e.what(), "continuing to retry in the background");
-            } catch (...) {
-                cdc_log.error(could_not_retrieve_msg_template, gen_id, std::current_exception(), "not retrying anymore");
-                co_return; // Exotic ("fatal") exception => do not retry
-            }
-
-            if (co_await svc->container().map_reduce(and_reducer(), [ts = get_ts(gen_id)] (generation_service& svc) {
-                return svc._cdc_metadata.known_or_obsolete(ts);
-            })) {
-                co_return;
-            }
-        }
-    })(gen_id, shared_from_this()));
-}
-
-future<> generation_service::legacy_scan_cdc_generations() {
-    assert_shard_zero(__PRETTY_FUNCTION__);
-
-    std::optional<cdc::generation_id> latest;
-    _gossiper.for_each_endpoint_state([&] (const gms::endpoint_state& eps) {
-        auto gen_id = get_generation_id_for(eps.get_host_id(), eps);
-        if (!latest || (gen_id && get_ts(*gen_id) > get_ts(*latest))) {
-            latest = gen_id;
-        }
-    });
-
-    if (latest) {
-        cdc_log.info("Latest generation seen during startup: {}", *latest);
-        co_await legacy_handle_cdc_generation(latest);
-    } else {
-        cdc_log.info("No generation seen during startup.");
-    }
-}
-
-future<bool> generation_service::legacy_do_handle_cdc_generation_intercept_nonfatal_errors(cdc::generation_id gen_id) {
-    assert_shard_zero(__PRETTY_FUNCTION__);
-
-    // Use futurize_invoke to catch all exceptions from legacy_do_handle_cdc_generation.
-    return futurize_invoke([this, gen_id] {
-        return legacy_do_handle_cdc_generation(gen_id);
-    }).handle_exception([] (std::exception_ptr ep) -> future<bool> {
-        try {
-            std::rethrow_exception(ep);
-        } catch (exceptions::request_timeout_exception& e) {
-            throw generation_handling_nonfatal_exception(e.what());
-        } catch (exceptions::unavailable_exception& e) {
-            throw generation_handling_nonfatal_exception(e.what());
-        } catch (exceptions::read_failure_exception& e) {
-            throw generation_handling_nonfatal_exception(e.what());
-        } catch (...) {
-            const auto ep = std::current_exception();
-            if (is_timeout_exception(ep)) {
-                throw generation_handling_nonfatal_exception(format("{}", ep));
-            }
-            throw;
-        }
-    });
-}
-
-future<bool> generation_service::legacy_do_handle_cdc_generation(cdc::generation_id gen_id) {
-    assert_shard_zero(__PRETTY_FUNCTION__);
-
-    auto sys_dist_ks = get_sys_dist_ks();
-    auto gen = co_await retrieve_generation_data(gen_id, _sys_ks.local(), *sys_dist_ks, { _token_metadata.get()->count_normal_token_owners() });
-    if (!gen) {
-        // This may happen during raft upgrade when a node gossips about a generation that
-        // was propagated through raft and we didn't apply it yet.
-        throw generation_handling_nonfatal_exception(fmt::format(
-            "Could not find CDC generation {} in distributed system tables (current time: {}),"
-            " even though some node gossiped about it.",
-            gen_id, db_clock::now()));
-    }
-
-    // We always gossip about the generation with the greatest timestamp. Specific nodes may remember older generations,
-    // but eventually they forget when their clocks move past the latest generation's timestamp.
-    // The cluster as a whole is only interested in the last generation so restarting nodes may learn what it is.
-    // We assume that generation changes don't happen ``too often'' so every node can learn about a generation
-    // before it is superseded by a newer one which causes nodes to start gossiping the about the newer one.
-    // The assumption follows from the requirement of bootstrapping nodes sequentially.
-    if (!_gen_id || get_ts(*_gen_id) < get_ts(gen_id)) {
-        _gen_id = gen_id;
-        co_await _sys_ks.local().update_cdc_generation_id(gen_id);
-        co_await _gossiper.add_local_application_state(
-                gms::application_state::CDC_GENERATION_ID, gms::versioned_value::cdc_generation_id(gen_id));
-    }
-
-    // Return `true` iff the generation was inserted on any of our shards.
-    co_return co_await container().map_reduce(or_reducer(),
-            [ts = get_ts(gen_id), &gen] (generation_service& svc) -> future<bool> {
-        // We need to copy it here before awaiting anything to avoid destruction of the captures.
-        const auto timestamp = ts;
-        topology_description gen_copy = co_await gen->clone_async();
-        co_return svc._cdc_metadata.insert(timestamp, std::move(gen_copy));
-    });
-}
-
-shared_ptr<db::system_distributed_keyspace> generation_service::get_sys_dist_ks() {
-    assert_shard_zero(__PRETTY_FUNCTION__);
-
-    if (!_sys_dist_ks.local_is_initialized()) {
-        throw std::runtime_error("system distributed keyspace not initialized");
-    }
-
-    return _sys_dist_ks.local_shared();
-}
-
 db_clock::time_point get_ts(const generation_id& gen_id) {
-    return std::visit([] (auto& id) { return id.ts; }, gen_id);
+    return gen_id.ts;
 }

 future<mutation> create_table_streams_mutation(table_id table, db_clock::time_point stream_ts, const locator::tablet_map& map, api::timestamp_type ts) {
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -34,16 +34,6 @@ namespace seastar {
    class abort_source;
 } // namespace seastar

-namespace db {
-    class config;
-    class system_distributed_keyspace;
-} // namespace db
-
-namespace gms {
-    class inet_address;
-    class gossiper;
-} // namespace gms
-
 namespace locator {
    class tablet_map;
 } // namespace locator
@@ -153,23 +143,6 @@ struct cdc_stream_diff {

 using table_streams = std::map<api::timestamp_type, committed_stream_set>;

-class no_generation_data_exception : public std::runtime_error {
-public:
-    no_generation_data_exception(cdc::generation_id generation_ts)
-        : std::runtime_error(fmt::format("could not find generation data for timestamp {}", generation_ts))
-    {}
-};
-
-/* Should be called when we're restarting and we noticed that we didn't save any streams timestamp in our local tables,
- * which means that we're probably upgrading from a non-CDC/old CDC version (another reason could be
- * that there's a bug, or the user messed with our local tables).
- *
- * It checks whether we should be the node to propose the first generation of CDC streams.
- * The chosen condition is arbitrary, it only tries to make sure that no two nodes propose a generation of streams
- * when upgrading, and nothing bad happens if they for some reason do (it's mostly an optimization).
- */
-bool should_propose_first_generation(const locator::host_id& me, const gms::gossiper&);
-
 /*
 * Checks if the CDC generation is optimal, which is true if its `topology_description` is consistent
 * with `token_metadata`.
--- a/cdc/generation_id.hh
+++ b/cdc/generation_id.hh
@@ -15,48 +15,22 @@

 namespace cdc {

-struct generation_id_v1 {
-    db_clock::time_point ts;
-    bool operator==(const generation_id_v1&) const = default;
-};

-struct generation_id_v2 {
+struct generation_id {
    db_clock::time_point ts;
    utils::UUID id;
-    bool operator==(const generation_id_v2&) const = default;
+    bool operator==(const generation_id&) const = default;
 };

-using generation_id = std::variant<generation_id_v1, generation_id_v2>;
-
 db_clock::time_point get_ts(const generation_id&);

 } // namespace cdc

-template <>
-struct fmt::formatter<cdc::generation_id_v1> {
-    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
-    template <typename FormatContext>
-    auto format(const cdc::generation_id_v1& gen_id, FormatContext& ctx) const {
-        return fmt::format_to(ctx.out(), "{}", gen_id.ts);
-    }
-};
-
-template <>
-struct fmt::formatter<cdc::generation_id_v2> {
-    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
-    template <typename FormatContext>
-    auto format(const cdc::generation_id_v2& gen_id, FormatContext& ctx) const {
-        return fmt::format_to(ctx.out(), "({}, {})", gen_id.ts, gen_id.id);
-    }
-};
-
 template <>
 struct fmt::formatter<cdc::generation_id> {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    template <typename FormatContext>
    auto format(const cdc::generation_id& gen_id, FormatContext& ctx) const {
-        return std::visit([&ctx] (auto& id) {
-            return fmt::format_to(ctx.out(), "{}", id);
-        }, gen_id);
+        return fmt::format_to(ctx.out(), "({}, {})", gen_id.ts, gen_id.id);
    }
 };
--- a/cdc/generation_service.hh
+++ b/cdc/generation_service.hh
@@ -11,140 +11,51 @@
 #include <seastar/core/sharded.hh>
 #include "cdc/metadata.hh"
 #include "cdc/generation_id.hh"
-#include "gms/i_endpoint_state_change_subscriber.hh"

 namespace db {
-class system_distributed_keyspace;
 class system_keyspace;
 }

-namespace gms {
-class gossiper;
-class feature_service;
-}
-
-namespace seastar {
-class abort_source;
-}
-
 namespace locator {
-class shared_token_metadata;
 class tablet_map;
 }

 namespace cdc {

 class generation_service : public peering_sharded_service<generation_service>
-                         , public async_sharded_service<generation_service>
-                         , public gms::i_endpoint_state_change_subscriber {
+                         , public async_sharded_service<generation_service> {
 public:
    struct config {
-        unsigned ignore_msb_bits;
        std::chrono::milliseconds ring_delay;
-        bool dont_rewrite_streams = false;
    };

 private:
    bool _stopped = false;

-    // The node has joined the token ring. Set to `true` on `after_join` call.
-    bool _joined = false;
-
    config _cfg;
-    gms::gossiper& _gossiper;
-    sharded<db::system_distributed_keyspace>& _sys_dist_ks;
    sharded<db::system_keyspace>& _sys_ks;
-    abort_source& _abort_src;
-    const locator::shared_token_metadata& _token_metadata;
-    gms::feature_service& _feature_service;
    replica::database& _db;

-    /* Maintains the set of known CDC generations used to pick streams for log writes (i.e., the partition keys of these log writes).
-     * Updated in response to certain gossip events (see the handle_cdc_generation function).
-     */
+    /* Maintains the set of known CDC generations used to pick streams for log writes (i.e., the partition keys of these log writes). */
    cdc::metadata _cdc_metadata;

-    /* The latest known generation timestamp and the timestamp that we're currently gossiping
-     * (as CDC_GENERATION_ID application state).
-     *
-     * Only shard 0 manages this, hence it will be std::nullopt on all shards other than 0.
-     * This timestamp is also persisted in the system.cdc_local table.
-     *
-     * On shard 0 this may be nullopt only in one special case: rolling upgrade, when we upgrade
-     * from an old version of Scylla that didn't support CDC. In that case one node in the cluster
-     * will create the first generation and start gossiping it; it may be us, or it may be some
-     * different node. In any case, eventually - after one of the nodes gossips the first timestamp
-     * - we'll catch on and this variable will be updated with that generation.
-     */
-    std::optional<cdc::generation_id> _gen_id;
-    future<> _cdc_streams_rewrite_complete = make_ready_future<>();
-
-    /* Returns true if raft topology changes are enabled.
-     * Can only be called from shard 0.
-     */
-    std::function<bool()> _raft_topology_change_enabled;
 public:
-    generation_service(config cfg, gms::gossiper&,
-            sharded<db::system_distributed_keyspace>&,
+    generation_service(config cfg,
            sharded<db::system_keyspace>& sys_ks,
-            abort_source&, const locator::shared_token_metadata&,
-            gms::feature_service&, replica::database& db,
-            std::function<bool()> raft_topology_change_enabled);
+            replica::database& db);

    future<> stop();
    ~generation_service();

-    /* After the node bootstraps and creates a new CDC generation, or restarts and loads the last
-     * known generation timestamp from persistent storage, this function should be called with
-     * that generation timestamp moved in as the `startup_gen_id` parameter.
-     * This passes the responsibility of managing generations from the node startup code to this service;
-     * until then, the service remains dormant.
-     * The startup code is in `storage_service::join_topology`, hence
-     * `after_join` should be called at the end of that function.
-     * Precondition: the node has completed bootstrapping and system_distributed_keyspace is initialized.
-     * Must be called on shard 0 - that's where the generation management happens.
-     */
-    future<> after_join(std::optional<cdc::generation_id>&& startup_gen_id);
-    future<> leave_ring();
-
    cdc::metadata& get_cdc_metadata() {
        return _cdc_metadata;
    }

-    virtual future<> on_join(gms::inet_address, locator::host_id id, gms::endpoint_state_ptr, gms::permit_id) override;
-    virtual future<> on_change(gms::inet_address, locator::host_id id, const gms::application_state_map&, gms::permit_id) override;
-
-    future<> check_and_repair_cdc_streams();
-
-    /* Generate a new set of CDC streams and insert it into the internal distributed CDC generations table.
-     * Returns the ID of this new generation.
-     *
-     * Should be called when starting the node for the first time (i.e., joining the ring).
-     *
-     * Assumes that the system_distributed_keyspace service is initialized.
-     * `cluster_supports_generations_v2` must be `true` if and only if the `CDC_GENERATIONS_V2` feature is enabled.
-     *
-     * If `CDC_GENERATIONS_V2` is enabled, the new generation will be inserted into
-     * `system_distributed_everywhere.cdc_generation_descriptions_v2` and the returned ID will be in the v2 format.
-     * Otherwise the new generation will be limited in size, causing suboptimal stream distribution, it will be inserted
-     * into `system_distributed.cdc_generation_descriptions` and the returned ID will be in the v1 format.
-     * The second case should happen only when we create new generations in a mixed cluster.
-     *
-     * The caller of this function is expected to insert the ID into the gossiper as fast as possible,
-     * so that other nodes learn about the generation before their clocks cross the generation's timestamp
-     * (not guaranteed in the current implementation, but expected to be the common case;
-     *  we assume that `ring_delay` is enough for other nodes to learn about the new generation).
-     *
-     * Legacy: used for gossiper-based topology changes.
-     */
-    future<cdc::generation_id> legacy_make_new_generation(
-        const std::unordered_set<dht::token>& bootstrap_tokens, bool add_delay);
-
    /* Retrieve the CDC generation with the given ID from local tables
     * and start using it for CDC log writes if it's not obsolete.
     * Precondition: the generation was committed using group 0 and locally applied.
     */
-    future<> handle_cdc_generation(cdc::generation_id_v2);
+    future<> handle_cdc_generation(cdc::generation_id);

    future<> load_cdc_tablet_streams(std::optional<std::unordered_set<table_id>> changed_tables);

@@ -156,56 +67,6 @@ public:
    future<utils::chunked_vector<mutation>> garbage_collect_cdc_streams_for_table(table_id table, std::optional<std::chrono::seconds> ttl, api::timestamp_type ts);
    future<> garbage_collect_cdc_streams(utils::chunked_vector<canonical_mutation>& muts, api::timestamp_type ts);

-private:
-    /* Retrieve the CDC generation which starts at the given timestamp (from a distributed table created for this purpose)
-     * and start using it for CDC log writes if it's not obsolete.
-     *
-     * Legacy: used for gossiper-based topology changes.
-     */
-    future<> legacy_handle_cdc_generation(std::optional<cdc::generation_id>);
-
-    /* If `legacy_handle_cdc_generation` fails, it schedules an asynchronous retry in the background
-     * using `legacy_async_handle_cdc_generation`.
-     *
-     * Legacy: used for gossiper-based topology changes.
-     */
-    void legacy_async_handle_cdc_generation(cdc::generation_id);
-
-    /* Wrapper around `legacy_do_handle_cdc_generation` which intercepts timeout/unavailability exceptions.
-     * Returns: legacy_do_handle_cdc_generation(ts).
-     *
-     * Legacy: used for gossiper-based topology changes.
-     */
-    future<bool> legacy_do_handle_cdc_generation_intercept_nonfatal_errors(cdc::generation_id);
-
-    /* Returns `true` iff we started using the generation (it was not obsolete or already known),
-     * which means that this node might write some CDC log entries using streams from this generation.
-     *
-     * Legacy: used for gossiper-based topology changes.
-     */
-    future<bool> legacy_do_handle_cdc_generation(cdc::generation_id);
-
-    /* Scan CDC generation timestamps gossiped by other nodes and retrieve the latest one.
-     * This function should be called once at the end of the node startup procedure
-     * (after the node is started and running normally, it will retrieve generations on gossip events instead).
-     *
-     * Legacy: used for gossiper-based topology changes.
-     */
-    future<> legacy_scan_cdc_generations();
-
-    /* generation_service code might be racing with system_distributed_keyspace deinitialization
-     * (the deinitialization order is broken).
-     * Therefore, whenever we want to access sys_dist_ks in a background task,
-     * we need to check if the instance is still there. Storing the shared pointer will keep it alive.
-     */
-    shared_ptr<db::system_distributed_keyspace> get_sys_dist_ks();
-
-    /* Part of the upgrade procedure. Useful in case where the version of Scylla that we're upgrading from
-     * used the "cdc_streams_descriptions" table. This procedure ensures that the new "cdc_streams_descriptions_v2"
-     * table contains streams of all generations that were present in the old table and may still contain data
-     * (i.e. there exist CDC log tables that may contain rows with partition keys being the stream IDs from
-     * these generations). */
-    future<> maybe_rewrite_streams_descriptions();
 };

 } // namespace cdc
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -618,7 +618,7 @@ static void set_default_properties_log_table(schema_builder& b, const schema& s,
    b.set_caching_options(caching_options::get_disabled_caching_options());

    auto rs = generate_replication_strategy(ksm, db.get_token_metadata().get_topology());
-    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(*rs, db.get_token_metadata(), false));
+    auto tombstone_gc_ext = seastar::make_shared<tombstone_gc_extension>(get_default_tombstone_gc_mode(*rs, false));
    b.add_extension(tombstone_gc_extension::NAME, std::move(tombstone_gc_ext));
 }

--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -48,6 +48,7 @@
 #include "mutation/mutation_fragment_stream_validator.hh"
 #include "utils/assert.hh"
 #include "utils/error_injection.hh"
+#include "utils/chunked_vector.hh"
 #include "utils/pretty_printers.hh"
 #include "readers/multi_range.hh"
 #include "readers/compacting.hh"
@@ -161,6 +162,7 @@ std::string_view to_string(compaction_type type) {
    case compaction_type::Reshape: return "Reshape";
    case compaction_type::Split: return "Split";
    case compaction_type::Major: return "Major";
+    case compaction_type::RewriteComponent: return "RewriteComponent";
    }
    on_internal_error_noexcept(clogger, format("Invalid compaction type {}", int(type)));
    return "(invalid)";
@@ -598,8 +600,7 @@ protected:
    // Garbage collected sstables that were added to SSTable set and should be eventually removed from it.
    std::vector<sstables::shared_sstable> _used_garbage_collected_sstables;
    utils::observable<> _stop_request_observable;
-    // optional tombstone_gc_state that is used when gc has to check only the compacting sstables to collect tombstones.
-    std::optional<tombstone_gc_state> _tombstone_gc_state_with_commitlog_check_disabled;
+    tombstone_gc_state _tombstone_gc_state;
    int64_t _output_repaired_at = 0;
 private:
    // Keeps track of monitors for input sstable.
@@ -611,23 +612,23 @@ private:
    }

    // Called in a seastar thread
-    dht::partition_range_vector
+    utils::chunked_vector<dht::partition_range>
    get_ranges_for_invalidation(const std::vector<sstables::shared_sstable>& sstables) {
        // If owned ranges is disengaged, it means no cleanup work was done and
        // so nothing needs to be invalidated.
        if (!_owned_ranges) {
-            return dht::partition_range_vector{};
+            return {};
        }
-        auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
+        auto owned_ranges = dht::to_partition_ranges_chunked(*_owned_ranges).get();

        auto non_owned_ranges = sstables
                | std::views::transform([] (const sstables::shared_sstable& sst) {
            seastar::thread::maybe_yield();
            return dht::partition_range::make({sst->get_first_decorated_key(), true},
                                              {sst->get_last_decorated_key(), true});
-        })      | std::ranges::to<dht::partition_range_vector>();
+        })      | std::ranges::to<utils::chunked_vector<dht::partition_range>>();

-        return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
+        return dht::subtract_ranges(*_schema, std::move(non_owned_ranges), std::move(owned_ranges)).get();
    }
 protected:
    compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
@@ -649,9 +650,12 @@ protected:
        , _owned_ranges(std::move(descriptor.owned_ranges))
        , _sharder(descriptor.sharder)
        , _owned_ranges_checker(_owned_ranges ? std::optional<dht::incremental_owned_ranges_checker>(*_owned_ranges) : std::nullopt)
-        , _tombstone_gc_state_with_commitlog_check_disabled(descriptor.gc_check_only_compacting_sstables ? std::make_optional(_table_s.get_tombstone_gc_state().with_commitlog_check_disabled()) : std::nullopt)
+        , _tombstone_gc_state(_table_s.get_tombstone_gc_state())
        , _progress_monitor(progress_monitor)
    {
+        if (descriptor.gc_check_only_compacting_sstables) {
+            _tombstone_gc_state = _tombstone_gc_state.with_commitlog_check_disabled();
+        }
        std::unordered_set<sstables::run_id> ssts_run_ids;
        _contains_multi_fragment_runs = std::any_of(_sstables.begin(), _sstables.end(), [&ssts_run_ids] (sstables::shared_sstable& sst) {
            return !ssts_run_ids.insert(sst->run_identifier()).second;
@@ -718,8 +722,8 @@ protected:

    compaction_completion_desc
    get_compaction_completion_desc(std::vector<sstables::shared_sstable> input_sstables, std::vector<sstables::shared_sstable> output_sstables) {
-        auto ranges_for_for_invalidation = get_ranges_for_invalidation(input_sstables);
-        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
+        auto ranges = get_ranges_for_invalidation(input_sstables);
+        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges)};
    }

    // Tombstone expiration is enabled based on the presence of sstable set.
@@ -849,8 +853,8 @@ private:
        return _table_s.get_compaction_strategy().make_sstable_set(_table_s);
    }

-    const tombstone_gc_state& get_tombstone_gc_state() const {
-        return _tombstone_gc_state_with_commitlog_check_disabled ? _tombstone_gc_state_with_commitlog_check_disabled.value() : _table_s.get_tombstone_gc_state();
+    tombstone_gc_state get_tombstone_gc_state() const {
+        return _tombstone_gc_state;
    }

    future<> setup() {
@@ -1050,7 +1054,7 @@ private:
            return can_never_purge;
        }
        return [this] (const dht::decorated_key& dk, is_shadowable is_shadowable) {
-            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks, _compacting_max_timestamp, _tombstone_gc_state_with_commitlog_check_disabled.has_value(), is_shadowable);
+            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks, _compacting_max_timestamp, !_tombstone_gc_state.is_commitlog_check_enabled(), is_shadowable);
        };
    }

@@ -2048,6 +2052,7 @@ compaction_type compaction_type_options::type() const {
        compaction_type::Reshape,
        compaction_type::Split,
        compaction_type::Major,
+        compaction_type::RewriteComponent,
    };
    static_assert(std::variant_size_v<compaction_type_options::options_variant> == std::size(index_to_type));
    return index_to_type[_options.index()];
@@ -2084,6 +2089,9 @@ static std::unique_ptr<compaction> make_compaction(compaction_group_view& table_
        std::unique_ptr<compaction> operator()(compaction_type_options::split split_options) {
            return std::make_unique<split_compaction>(table_s, std::move(descriptor), cdata, std::move(split_options), progress_monitor);
        }
+        std::unique_ptr<compaction> operator()(compaction_type_options::component_rewrite) {
+            throw std::runtime_error("component_rewrite compaction should be handled separately"); // compact_sstables() dispatches RewriteComponent before make_compaction()
+        }
    } visitor_factory{table_s, std::move(descriptor), cdata, progress_monitor};

    return descriptor.options.visit(visitor_factory);
@@ -2101,7 +2109,7 @@ static future<compaction_result> scrub_sstables_validate_mode(compaction_descrip

        validation_errors += co_await sst->validate(permit, cdata.abort, [&schema] (sstring what) {
            scrub_compaction::report_validation_error(compaction_type::Scrub, *schema, what);
-        }, monitor_generator(sst));
+        }, monitor_generator(sst), true);
        // Did validation actually finish because aborted?
        if (cdata.is_stop_requested()) {
            // Compaction manager will catch this exception and re-schedule the compaction.
@@ -2138,6 +2146,34 @@ future<compaction_result> scrub_sstables_validate_mode(compaction_descriptor des
    co_return res;
 }

+future<compaction_result> rewrite_sstables_component(compaction_descriptor descriptor, compaction_group_view& table_s) {
+    // Runs in a seastar thread so the per-sstable futures can be waited on with .get().
+    return seastar::async([descriptor = std::move(descriptor), &table_s] () mutable {
+        compaction_result res {
+            .stats = {
+                .started_at = db_clock::now(),
+            },
+        };
+        const auto& opts = descriptor.options.as<compaction_type_options::component_rewrite>();
+        const bool update_id = static_cast<bool>(opts.update_id);
+
+        // The standard descriptor creator cannot be used here: the new sstable
+        // has to keep the version of the sstable it is derived from.
+        auto make_same_version = [&table_s] (sstables::shared_sstable src) {
+            return table_s.make_sstable(src->state(), src->get_version());
+        };
+        res.new_sstables.reserve(descriptor.sstables.size());
+        for (auto& input : descriptor.sstables) {
+            auto output = input->link_with_rewritten_component(make_same_version, opts.component_to_rewrite, opts.modifier, update_id).get();
+            res.new_sstables.push_back(output);
+        }
+        // Hand the (old sstables, new sstables) pair to the descriptor's replacer.
+        descriptor.replacer({std::move(descriptor.sstables), res.new_sstables});
+        res.stats.ended_at = db_clock::now();
+        return res;
+    });
+}
+
 future<compaction_result>
 compact_sstables(compaction_descriptor descriptor, compaction_data& cdata, compaction_group_view& table_s, compaction_progress_monitor& progress_monitor) {
    if (descriptor.sstables.empty()) {
@@ -2149,6 +2185,9 @@ compact_sstables(compaction_descriptor descriptor, compaction_data& cdata, compa
        // Bypass the usual compaction machinery for dry-mode scrub
        return scrub_sstables_validate_mode(std::move(descriptor), cdata, table_s, progress_monitor);
    }
+    if (descriptor.options.type() == compaction_type::RewriteComponent) {
+        return rewrite_sstables_component(std::move(descriptor), table_s);
+    }
    return compaction::run(make_compaction(table_s, std::move(descriptor), cdata, progress_monitor));
 }

--- a/compaction/compaction_descriptor.hh
+++ b/compaction/compaction_descriptor.hh
@@ -12,10 +12,12 @@
 #include <functional>
 #include <optional>
 #include <variant>
+#include "sstables/component_type.hh"
 #include "sstables/types_fwd.hh"
 #include "sstables/sstable_set.hh"
 #include "compaction_fwd.hh"
 #include "mutation_writer/token_group_based_splitting_writer.hh"
+#include "utils/chunked_vector.hh"

 namespace compaction {

@@ -30,6 +32,7 @@ enum class compaction_type {
    Reshape = 7,
    Split = 8,
    Major = 9,
+    RewriteComponent = 10,
 };

 struct compaction_completion_desc {
@@ -38,7 +41,7 @@ struct compaction_completion_desc {
    // New, fresh SSTables that should be added to SSTable set, replacing the old ones.
    std::vector<sstables::shared_sstable> new_sstables;
    // Set of compacted partition ranges that should be invalidated in the cache.
-    dht::partition_range_vector ranges_for_cache_invalidation;
+    utils::chunked_vector<dht::partition_range> ranges_for_cache_invalidation;
 };

 // creates a new SSTable for a given shard
@@ -90,8 +93,15 @@ public:
    struct split {
        mutation_writer::classify_by_token_group classifier;
    };
+    struct component_rewrite {
+        sstables::component_type component_to_rewrite; // component selected for rewriting
+        std::function<void(sstables::sstable&)> modifier; // callback receiving an sstable by mutable reference
+        // Strongly-typed yes/no flag; new option objects default to `yes`.
+        using update_sstable_id = bool_class<class update_sstable_id_tag>;
+        update_sstable_id update_id = update_sstable_id::yes;
+    };
 private:
-    using options_variant = std::variant<regular, cleanup, upgrade, scrub, reshard, reshape, split, major>;
+    using options_variant = std::variant<regular, cleanup, upgrade, scrub, reshard, reshape, split, major, component_rewrite>;

 private:
    options_variant _options;
@@ -129,6 +139,10 @@ public:
        return compaction_type_options(scrub{.operation_mode = mode, .quarantine_sstables = quarantine_sstables, .drop_unfixable = drop_unfixable_sstables});
    }

+    static compaction_type_options make_component_rewrite(sstables::component_type component, std::function<void(sstables::sstable&)> modifier, component_rewrite::update_sstable_id update_id = component_rewrite::update_sstable_id::yes) {
+        return compaction_type_options(component_rewrite{.component_to_rewrite = component, .modifier = std::move(modifier), .update_id = update_id}); // wraps the component_rewrite alternative
+    }
+
    static compaction_type_options make_split(mutation_writer::classify_by_token_group classifier) {
        return compaction_type_options(split{std::move(classifier)});
    }
--- a/compaction/compaction_group_view.hh
+++ b/compaction/compaction_group_view.hh
@@ -46,6 +46,7 @@ public:
    virtual reader_permit make_compaction_reader_permit() const = 0;
    virtual sstables::sstables_manager& get_sstables_manager() noexcept = 0;
    virtual sstables::shared_sstable make_sstable(sstables::sstable_state) const = 0;
+    virtual sstables::shared_sstable make_sstable(sstables::sstable_state, sstables::sstable_version_types) const = 0;
    virtual sstables::sstable_writer_config configure_writer(sstring origin) const = 0;
    virtual api::timestamp_type min_memtable_timestamp() const = 0;
    virtual api::timestamp_type min_memtable_live_timestamp() const = 0;
@@ -54,7 +55,7 @@ public:
    virtual future<> on_compaction_completion(compaction_completion_desc desc, sstables::offstrategy offstrategy) = 0;
    virtual bool is_auto_compaction_disabled_by_user() const noexcept = 0;
    virtual bool tombstone_gc_enabled() const noexcept = 0;
-    virtual const tombstone_gc_state& get_tombstone_gc_state() const noexcept = 0;
+    virtual tombstone_gc_state get_tombstone_gc_state() const noexcept = 0;
    virtual compaction_backlog_tracker& get_backlog_tracker() = 0;
    virtual const std::string get_group_id() const noexcept = 0;
    virtual seastar::condition_variable& get_staging_done_condition() noexcept = 0;
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -778,6 +778,7 @@ compaction_manager::get_incremental_repair_read_lock(compaction::compaction_grou
        cmlog.debug("Get get_incremental_repair_read_lock for {} started", reason);
    }
    compaction::compaction_state& cs = get_compaction_state(&t);
+    auto gh = cs.gate.hold();
    auto ret = co_await cs.incremental_repair_lock.hold_read_lock();
    if (!reason.empty()) {
        cmlog.debug("Get get_incremental_repair_read_lock for {} done", reason);
@@ -791,6 +792,7 @@ compaction_manager::get_incremental_repair_write_lock(compaction::compaction_gro
        cmlog.debug("Get get_incremental_repair_write_lock for {} started", reason);
    }
    compaction::compaction_state& cs = get_compaction_state(&t);
+    auto gh = cs.gate.hold();
    auto ret = co_await cs.incremental_repair_lock.hold_write_lock();
    if (!reason.empty()) {
        cmlog.debug("Get get_incremental_repair_write_lock for {} done", reason);
@@ -1040,7 +1042,7 @@ compaction_manager::compaction_manager(config cfg, abort_source& as, tasks::task
        _compaction_controller.set_max_shares(max_shares);
    }))
    , _strategy_control(std::make_unique<strategy_control>(*this))
-    , _tombstone_gc_state(_shared_tombstone_gc_state) {
+{
    tm.register_module(_task_manager_module->get_name(), _task_manager_module);
    register_metrics();
    // Bandwidth throttling is node-wide, updater is needed on single shard
@@ -1064,7 +1066,7 @@ compaction_manager::compaction_manager(tasks::task_manager& tm)
    , _compaction_static_shares_observer(_cfg.static_shares.observe(_update_compaction_static_shares_action.make_observer()))
    , _compaction_max_shares_observer(_cfg.max_shares.observe([] (const float& max_shares) {}))
    , _strategy_control(std::make_unique<strategy_control>(*this))
-    , _tombstone_gc_state(_shared_tombstone_gc_state) {
+{
    tm.register_module(_task_manager_module->get_name(), _task_manager_module);
    // No metric registration because this constructor is supposed to be used only by the testing
    // infrastructure.
@@ -1266,9 +1268,15 @@ future<> compaction_manager::start(const db::config& cfg, utils::disk_space_moni
    if (dsm && (this_shard_id() == 0)) {
        _out_of_space_subscription = dsm->subscribe(cfg.critical_disk_utilization_level, [this] (auto threshold_reached) {
            if (threshold_reached) {
-                return container().invoke_on_all([] (compaction_manager& cm) { return cm.drain(); });
+                return container().invoke_on_all([] (compaction_manager& cm) {
+                    cm._in_critical_disk_utilization_mode = true;
+                    return cm.drain();
+                });
            }
-            return container().invoke_on_all([] (compaction_manager& cm) { cm.enable(); });
+            return container().invoke_on_all([] (compaction_manager& cm) {
+                cm._in_critical_disk_utilization_mode = false;
+                cm.enable();
+            });
        });
    }

@@ -1519,7 +1527,9 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
            | std::views::transform(std::mem_fn(&sstables::sstable::run_identifier))
            | std::ranges::to<std::unordered_set>());
    };
-    const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
+    const auto injected_threshold = utils::get_local_injector().inject_parameter<size_t>("set_sstable_count_reduction_threshold");
+    const auto threshold = injected_threshold.value_or(size_t(std::max(schema->max_compaction_threshold(), 32)));
+
    auto count = co_await num_runs_for_compaction();
    if (count <= threshold) {
        cmlog.trace("No need to wait for sstable count reduction in {}: {} <= {}",
@@ -1534,9 +1544,7 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
    auto& cstate = get_compaction_state(&t);
    try {
        while (can_perform_regular_compaction(t) && co_await num_runs_for_compaction() > threshold) {
-            co_await cstate.compaction_done.wait([this, &t] {
-                return !can_perform_regular_compaction(t);
-            });
+            co_await cstate.compaction_done.when();
        }
    } catch (const broken_condition_variable&) {
        co_return;
@@ -1786,6 +1794,41 @@ protected:
    }
 };

+// Rewrites one component of each input sstable, recording input -> replacement in the caller-owned map.
+class rewrite_sstables_component_compaction_task_executor final : public rewrite_sstables_compaction_task_executor {
+    std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>& _rewritten_sstables;
+public:
+    rewrite_sstables_component_compaction_task_executor(compaction_manager& mgr,
+                                       throw_if_stopping do_throw_if_stopping,
+                                       compaction_group_view* t,
+                                       tasks::task_id parent_id,
+                                       compaction_type_options options,
+                                       std::vector<sstables::shared_sstable> sstables,
+                                       compacting_sstable_registration compacting,
+                                       std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>& rewritten_sstables)
+            : rewrite_sstables_compaction_task_executor(mgr, do_throw_if_stopping, t, parent_id, std::move(options), {},
+                std::move(sstables), std::move(compacting), compaction_manager::can_purge_tombstones::no, "component_rewrite"),
+            _rewritten_sstables(rewritten_sstables)
+    {}
+protected:
+    virtual future<compaction_manager::compaction_stats_opt> do_run() override {
+        compaction_stats stats{};
+        switch_state(state::pending);
+        auto maintenance_permit = co_await acquire_semaphore(_cm._maintenance_ops_sem);
+        while (!_sstables.empty()) {
+            auto sst = consume_sstable();
+            // Hold a reference to the mapped value rather than an iterator: it stays valid even if the map rehashes during the suspension below.
+            // The slot is left default-constructed if rewrite_sstable() does not complete normally.
+            auto& replacement = _rewritten_sstables.emplace(sst, sstables::shared_sstable{}).first->second;
+            auto res = co_await rewrite_sstable(std::move(sst));
+            _cm._validation_errors += res.stats.validation_errors;
+            stats += res.stats;
+            replacement = std::move(res.new_sstables.front());
+        }
+        co_return stats;
+    }
+};
+
 class split_compaction_task_executor final : public rewrite_sstables_compaction_task_executor {
    compaction_type_options::split _opt;
 public:
@@ -1899,6 +1942,28 @@ compaction_manager::rewrite_sstables(compaction_group_view& t, compaction_type_o
    return perform_task_on_all_files<rewrite_sstables_compaction_task_executor>("rewrite", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_func), throw_if_stopping::no, can_purge, std::move(options_desc));
 }

+future<compaction_manager::compaction_stats_opt>
+compaction_manager::rewrite_sstables_component(compaction_group_view& t,
+                                     std::vector<sstables::shared_sstable>& sstables,
+                                     compaction_type_options options,
+                                     std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>& rewritten_sstables,
+                                     tasks::task_info info) {
+    // Runs a component-rewrite task over `sstables`, which is moved from once the task is
+    // launched, and passes `rewritten_sstables` through to it. Yields nullopt without doing any
+    // work when start_compaction() returns a value that tests false or when there is no input.
+    auto holder = start_compaction(t);
+    if (!holder || sstables.empty()) {
+        co_return std::nullopt;
+    }
+
+    // Register the inputs as compacting; the registration object is then moved into the task.
+    compacting_sstable_registration registration(*this, get_compaction_state(&t));
+    registration.register_compacting(sstables);
+
+    co_return co_await perform_compaction<rewrite_sstables_component_compaction_task_executor>(throw_if_stopping::no, info, &t, info.id,
+        std::move(options), std::move(sstables), std::move(registration), rewritten_sstables);
+}
+
 class validate_sstables_compaction_task_executor : public sstables_task_executor {
    compaction_manager::quarantine_invalid_sstables _quarantine_sstables;
 public:
@@ -2289,6 +2354,16 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
    return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables), throw_if_stopping::no);
 }

+std::exception_ptr compaction_manager::make_disabled_exception(compaction::compaction_group_view& cg) {
+    // Critical disk utilization mode gets its own error so callers can tell it
+    // apart from the generic "compaction disabled" stop.
+    if (_in_critical_disk_utilization_mode) {
+        return std::make_exception_ptr(std::runtime_error("critical disk utilization"));
+    }
+    const auto& s = cg.schema();
+    return std::make_exception_ptr(compaction_stopped_exception(s->ks_name(), s->cf_name(), "compaction disabled"));
+}
+
 future<std::vector<sstables::shared_sstable>>
 compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
    if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
@@ -2298,8 +2373,7 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
    // We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
    // which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
    if (is_disabled()) {
-        co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
-                                                                                         "reason might be out of space prevention", sst->get_filename()))));
+        co_return coroutine::exception(make_disabled_exception(t));
    }
    std::vector<sstables::shared_sstable> ret;

@@ -2323,6 +2397,18 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
    co_return ret;
 }

+// Rewrites `component` of the given sstables using `modifier`; returns a map keyed by each processed input sstable.
+future<std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>> compaction_manager::perform_component_rewrite(compaction::compaction_group_view& t,
+            tasks::task_info info, std::vector<sstables::shared_sstable> sstables,
+            sstables::component_type component, std::function<void(sstables::sstable&)> modifier,
+            compaction_type_options::component_rewrite::update_sstable_id update_id) {
+    std::unordered_map<sstables::shared_sstable, sstables::shared_sstable> old_to_new;
+    old_to_new.reserve(sstables.size());
+    auto options = compaction_type_options::make_component_rewrite(component, std::move(modifier), update_id);
+    co_await rewrite_sstables_component(t, sstables, std::move(options), old_to_new, info);
+    co_return old_to_new;
+}
+
 // Submit a table to be scrubbed and wait for its termination.
 future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sstable_scrub(compaction_group_view& t, compaction_type_options::scrub opts, tasks::task_info info) {
    auto scrub_mode = opts.operation_mode;
@@ -2387,6 +2473,8 @@ future<> compaction_manager::remove(compaction_group_view& t, sstring reason) no
    if (!c_state.gate.is_closed()) {
        auto close_gate = c_state.gate.close();
        co_await stop_ongoing_compactions(reason, &t);
+        // Wait for users of incremental repair lock (can be either repair itself or maintenance compactions).
+        co_await c_state.incremental_repair_lock.write_lock();
        co_await std::move(close_gate);
    }

--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -55,6 +55,7 @@ class custom_compaction_task_executor;
 class regular_compaction_task_executor;
 class offstrategy_compaction_task_executor;
 class rewrite_sstables_compaction_task_executor;
+class rewrite_sstables_component_compaction_task_executor;
 class split_compaction_task_executor;
 class cleanup_sstables_compaction_task_executor;
 class validate_sstables_compaction_task_executor;
@@ -114,6 +115,8 @@ private:
    uint32_t _disabled_state_count = 0;

    bool is_disabled() const { return _state != state::running || _disabled_state_count > 0; }
+    // precondition: is_disabled() is true.
+    std::exception_ptr make_disabled_exception(compaction::compaction_group_view& cg);

    std::optional<future<>> _stop_future;

@@ -167,12 +170,9 @@ private:
    std::unique_ptr<strategy_control> _strategy_control;

    shared_tombstone_gc_state _shared_tombstone_gc_state;
-    // TODO: tombstone_gc_state should now have value semantics, but the code
-    // still uses it with reference semantics (inconsistently though).
-    // Drop this member, once the code is converted into using value semantics.
-    tombstone_gc_state _tombstone_gc_state;

    utils::disk_space_monitor::subscription _out_of_space_subscription;
+    bool _in_critical_disk_utilization_mode = false;
 private:
    // Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
    future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
@@ -256,6 +256,12 @@ private:
    future<compaction_stats_opt> rewrite_sstables(compaction::compaction_group_view& t, compaction_type_options options, owned_ranges_ptr, get_candidates_func, tasks::task_info info,
                                                  can_purge_tombstones can_purge = can_purge_tombstones::yes, sstring options_desc = "");

+    future<compaction_stats_opt> rewrite_sstables_component(compaction_group_view& t,
+                                                            std::vector<sstables::shared_sstable>& sstables,
+                                                            compaction_type_options options,
+                                                            std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>& rewritten_sstables,
+                                                            tasks::task_info info);
+
    // Stop all fibers, without waiting. Safe to be called multiple times.
    void do_stop() noexcept;
    future<> really_do_stop() noexcept;
@@ -364,6 +370,13 @@ public:
    // Submit a table to be scrubbed and wait for its termination.
    future<compaction_stats_opt> perform_sstable_scrub(compaction::compaction_group_view& t, compaction_type_options::scrub opts, tasks::task_info info);

+    future<std::unordered_map<sstables::shared_sstable, sstables::shared_sstable>> perform_component_rewrite(compaction::compaction_group_view& t,
+            tasks::task_info info,
+            std::vector<sstables::shared_sstable> sstables,
+            sstables::component_type component,
+            std::function<void(sstables::sstable&)> modifier,
+            compaction_type_options::component_rewrite::update_sstable_id update_id = compaction_type_options::component_rewrite::update_sstable_id::yes);
+
    // Submit a table for major compaction.
    future<> perform_major_compaction(compaction::compaction_group_view& t, tasks::task_info info, bool consider_only_existing_data = false);

@@ -456,10 +469,6 @@ public:

    compaction::strategy_control& get_strategy_control() const noexcept;

-    const tombstone_gc_state& get_tombstone_gc_state() const noexcept {
-        return _tombstone_gc_state;
-    };
-
    shared_tombstone_gc_state& get_shared_tombstone_gc_state() noexcept {
        return _shared_tombstone_gc_state;
    };
@@ -489,6 +498,7 @@ public:
    friend class compaction::regular_compaction_task_executor;
    friend class compaction::offstrategy_compaction_task_executor;
    friend class compaction::rewrite_sstables_compaction_task_executor;
+    friend class compaction::rewrite_sstables_component_compaction_task_executor;
    friend class compaction::cleanup_sstables_compaction_task_executor;
    friend class compaction::validate_sstables_compaction_task_executor;
    friend compaction_reenabler;
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -299,13 +299,11 @@ batch_size_fail_threshold_in_kb: 1024
 # max_hint_window_in_ms: 10800000 # 3 hours


-# Validity period for permissions cache (fetching permissions can be an
-# expensive operation depending on the authorizer, CassandraAuthorizer is
-# one example). Defaults to 10000, set to 0 to disable.
+# Validity period for authorized statements cache. Defaults to 10000, set to 0 to disable.
 # Will be disabled automatically for AllowAllAuthorizer.
 # permissions_validity_in_ms: 10000

-# Refresh interval for permissions cache (if enabled).
+# Refresh interval for authorized statements cache.
 # After this interval, cache entries become eligible for refresh. Upon next
 # access, an async reload is scheduled and the old value returned until it
 # completes. If permissions_validity_in_ms is non-zero, then this also must have
@@ -399,6 +397,17 @@ commitlog_total_space_in_mb: -1
 #      you can cache more hot rows
 # column_index_size_in_kb: 64

+# sstable format version for newly written sstables.
+# Currently allowed values are `me` and `ms`.
+# If not specified in the config, this defaults to `me`.
+#
+# The difference between `me` and `ms` lies in the data structures used
+# in the primary index.
+# In short, `ms` needs more CPU during sstable writes,
+# but should behave better during reads,
+# although it might behave worse for very long clustering keys.
+sstable_format: ms
+
 # Auto-scaling of the promoted index prevents running out of memory
 # when the promoted index grows too large (due to partitions with many rows
 # vs. too small column_index_size_in_kb).  When the serialized representation
@@ -566,15 +575,16 @@ commitlog_total_space_in_mb: -1
 # prometheus_address: 1.2.3.4

 # audit settings
-# By default, Scylla does not audit anything.
+# Table audit is enabled by default.
 # 'audit' config option controls if and where to output audited events:
-#   - "none": auditing is disabled (default)
-#   - "table": save audited events in audit.audit_log column family
+#   - "none": auditing is disabled
+#   - "table": save audited events in audit.audit_log column family (default)
 #   - "syslog": send audited events via syslog (depends on OS, but usually to /dev/log)
 audit: "table"
 #
 # List of statement categories that should be audited.
-audit_categories: "DCL,DDL,AUTH,ADMIN"
+# Possible categories are: QUERY, DML, DCL, DDL, AUTH, ADMIN
+audit_categories: "DCL,AUTH,ADMIN"
 #
 # List of tables that should be audited.
 # audit_tables: "<keyspace_name>.<table_name>,<keyspace_name>.<table_name>"
@@ -640,7 +650,7 @@ strict_is_not_null_in_views: true
 # * workdir: the node will open the maintenance socket on the path <scylla's workdir>/cql.m,
 #            where <scylla's workdir> is a path defined by the workdir configuration option,
 # * <socket path>: the node will open the maintenance socket on the path <socket path>.
-maintenance_socket: ignore
+maintenance_socket: workdir

 # If set to true, configuration parameters defined with LiveUpdate option can be updated in runtime with CQL
 # by updating system.config virtual table. If we don't want any configuration parameter to be changed in runtime
@@ -649,10 +659,9 @@ maintenance_socket: ignore
 # e.g. for cloud users, for whom scylla's configuration should be changed only by support engineers.
 # live_updatable_config_params_changeable_via_cql: true

-# ****************
-# *  GUARDRAILS  *
-# ****************
-
+#
+# Guardrails options
+#
 # Guardrails to warn or fail when Replication Factor is smaller/greater than the threshold.
 # Please note that the value of 0 is always allowed,
 # which means that having no replication at all, i.e. RF = 0, is always valid.
@@ -662,6 +671,27 @@ maintenance_socket: ignore
 # minimum_replication_factor_warn_threshold:  3
 # maximum_replication_factor_warn_threshold: -1
 # maximum_replication_factor_fail_threshold: -1
+#
+# Guardrails to warn about or disallow creating a keyspace with specific replication strategy.
+# Each of these 2 settings is a list storing replication strategies considered harmful.
+# The replication strategies to choose from are:
+# 1) SimpleStrategy,
+# 2) NetworkTopologyStrategy,
+# 3) LocalStrategy,
+# 4) EverywhereStrategy
+#
+# replication_strategy_warn_list:
+#  - SimpleStrategy
+# replication_strategy_fail_list:
+#
+# Guardrail to enable the deprecated feature of CREATE TABLE WITH COMPACT STORAGE.
+# enable_create_table_with_compact_storage: false
+#
+# Guardrails to limit usage of selected consistency levels for writes.
+# Adding a warning to a CQL query response can significantly increase network
+# traffic and decrease overall throughput.
+# write_consistency_levels_warned: []
+# write_consistency_levels_disallowed: []

 #
 # System information encryption settings
@@ -839,21 +869,6 @@ maintenance_socket: ignore
 #   key_namespace: <kmip key namespace> (optional)
 #

-# Guardrails to warn about or disallow creating a keyspace with specific replication strategy.
-# Each of these 2 settings is a list storing replication strategies considered harmful.
-# The replication strategies to choose from are:
-# 1) SimpleStrategy,
-# 2) NetworkTopologyStrategy,
-# 3) LocalStrategy,
-# 4) EverywhereStrategy
-#
-# replication_strategy_warn_list:
-#  - SimpleStrategy
-# replication_strategy_fail_list:
-
-# Guardrail to enable the deprecated feature of CREATE TABLE WITH COMPACT STORAGE.
-# enable_create_table_with_compact_storage: false
-
 # Control tablets for new keyspaces.
 # Can be set to: disabled|enabled|enforced
 #
@@ -875,7 +890,16 @@ maintenance_socket: ignore
 # The `tablets` option cannot be changed using `ALTER KEYSPACE`.
 tablets_mode_for_new_keyspaces: enabled

-# Enforce RF-rack-valid keyspaces.
+# Require every tablet-enabled keyspace to be RF-rack-valid.
+#
+# A tablet-enabled keyspace is RF-rack-valid when, for each data center,
+# its replication factor (RF) is 0, 1, or exactly equal to the number of
+# racks in that data center. Setting the RF to the number of racks ensures
+# that a single rack failure never results in data unavailability.
+#
+# When set to true, CREATE KEYSPACE and ALTER KEYSPACE statements that
+# would produce an RF-rack-invalid keyspace are rejected.
+# When set to false, such statements are allowed but emit a warning.
 rf_rack_valid_keyspaces: false

 #
--- a/configure.py
+++ b/configure.py
@@ -544,7 +544,6 @@ scylla_tests = set([
    'test/boost/caching_options_test',
    'test/boost/canonical_mutation_test',
    'test/boost/cartesian_product_test',
-    'test/boost/cdc_generation_test',
    'test/boost/cell_locker_test',
    'test/boost/checksum_utils_test',
    'test/boost/chunked_managed_vector_test',
@@ -619,6 +618,7 @@ scylla_tests = set([
    'test/boost/reservoir_sampling_test',
    'test/boost/result_utils_test',
    'test/boost/rest_client_test',
+    'test/boost/rolling_max_tracker_test',
    'test/boost/reusable_buffer_test',
    'test/boost/rust_test',
    'test/boost/s3_test',
@@ -795,6 +795,9 @@ arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clan
                        help='C compiler path')
 arg_parser.add_argument('--compiler-cache', action='store', dest='compiler_cache', default='auto',
                        help='Compiler cache to use: auto (default, prefers sccache), sccache, ccache, none, or a path to a binary')
+# Workaround for https://github.com/mozilla/sccache/issues/2575
+arg_parser.add_argument('--sccache-rust', action=argparse.BooleanOptionalAction, default=False,
+                        help='Use sccache for rust code (if sccache is selected as compiler cache). Doesn\'t work with distributed builds.')
 add_tristate(arg_parser, name='dpdk', dest='dpdk', default=False,
                        help='Use dpdk (from seastar dpdk sources)')
 arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
@@ -893,6 +896,9 @@ scylla_core = (['message/messaging_service.cc',
                'replica/multishard_query.cc',
                'replica/mutation_dump.cc',
                'replica/querier.cc',
+                'replica/logstor/segment_manager.cc',
+                'replica/logstor/logstor.cc',
+                'replica/logstor/write_buffer.cc',
                'mutation/atomic_cell.cc',
                'mutation/canonical_mutation.cc',
                'mutation/frozen_mutation.cc',
@@ -925,8 +931,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/crypt_sha512.cc',
                'utils/logalloc.cc',
                'utils/large_bitset.cc',
-                'utils/buffer_input_stream.cc',
-                'utils/limiting_data_source.cc',
+                'test/lib/limiting_data_source.cc',
                'utils/updateable_value.cc',
                'message/dictionary_service.cc',
                'utils/directories.cc',
@@ -1172,6 +1177,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/gz/crc_combine.cc',
                'utils/gz/crc_combine_table.cc',
                'utils/http.cc',
+                'utils/http_client_error_processing.cc',
                'utils/rest/client.cc',
                'utils/s3/aws_error.cc',
                'utils/s3/client.cc',
@@ -1189,6 +1195,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/azure/identity/default_credentials.cc',
                'utils/gcp/gcp_credentials.cc',
                'utils/gcp/object_storage.cc',
+                'utils/gcp/object_storage_retry_strategy.cc',
                'gms/version_generator.cc',
                'gms/versioned_value.cc',
                'gms/gossiper.cc',
@@ -1200,6 +1207,7 @@ scylla_core = (['message/messaging_service.cc',
                'gms/application_state.cc',
                'gms/inet_address.cc',
                'dht/i_partitioner.cc',
+                'dht/fixed_shard.cc',
                'dht/token.cc',
                'dht/murmur3_partitioner.cc',
                'dht/boot_strapper.cc',
@@ -1235,7 +1243,6 @@ scylla_core = (['message/messaging_service.cc',
                'service/pager/query_pagers.cc',
                'service/qos/qos_common.cc',
                'service/qos/service_level_controller.cc',
-                'service/qos/standard_service_level_distributed_data_accessor.cc',
                'service/qos/raft_service_level_distributed_data_accessor.cc',
                'streaming/stream_task.cc',
                'streaming/stream_session.cc',
@@ -1269,11 +1276,10 @@ scylla_core = (['message/messaging_service.cc',
                'auth/common.cc',
                'auth/default_authorizer.cc',
                'auth/resource.cc',
-                'auth/roles-metadata.cc',
                'auth/passwords.cc',
+                'auth/maintenance_socket_authenticator.cc',
                'auth/password_authenticator.cc',
                'auth/permission.cc',
-                'auth/permissions_cache.cc',
                'auth/service.cc',
                'auth/standard_role_manager.cc',
                'auth/ldap_role_manager.cc',
@@ -1337,6 +1343,7 @@ scylla_core = (['message/messaging_service.cc',
                'service/strong_consistency/groups_manager.cc',
                'service/strong_consistency/coordinator.cc',
                'service/strong_consistency/state_machine.cc',
+                'service/strong_consistency/raft_groups_storage.cc',
                'service/raft/group0_state_id_handler.cc',
                'service/raft/group0_state_machine.cc',
                'service/raft/group0_state_machine_merger.cc',
@@ -1358,7 +1365,6 @@ scylla_core = (['message/messaging_service.cc',
                'service/topology_state_machine.cc',
                'service/topology_mutation.cc',
                'service/topology_coordinator.cc',
-                'node_ops/node_ops_ctl.cc',
                'node_ops/task_manager_module.cc',
                'reader_concurrency_semaphore_group.cc',
                'utils/disk_space_monitor.cc',
@@ -1464,6 +1470,7 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/query.idl.hh',
        'idl/idl_test.idl.hh',
        'idl/commitlog.idl.hh',
+        'idl/logstor.idl.hh',
        'idl/tracing.idl.hh',
        'idl/consistency_level.idl.hh',
        'idl/cache_temperature.idl.hh',
@@ -1471,6 +1478,7 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/messaging_service.idl.hh',
        'idl/paxos.idl.hh',
        'idl/raft.idl.hh',
+        'idl/raft_util.idl.hh',
        'idl/raft_storage.idl.hh',
        'idl/group0.idl.hh',
        'idl/hinted_handoff.idl.hh',
@@ -1490,7 +1498,9 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/gossip.idl.hh',
        'idl/migration_manager.idl.hh',
        "idl/node_ops.idl.hh",
-        "idl/tasks.idl.hh"
+        "idl/tasks.idl.hh",
+        "idl/client_state.idl.hh",
+        "idl/forward_cql.idl.hh",
        ]

 scylla_tests_generic_dependencies = [
@@ -1535,6 +1545,7 @@ scylla_perfs = ['test/perf/perf_alternator.cc',
                'test/perf/perf_fast_forward.cc',
                'test/perf/perf_row_cache_update.cc',
                'test/perf/perf_simple_query.cc',
+                'test/perf/perf_cql_raw.cc',
                'test/perf/perf_sstable.cc',
                'test/perf/perf_tablets.cc',
                'test/perf/tablet_load_balancing.cc',
@@ -1582,6 +1593,7 @@ pure_boost_tests = set([
    'test/boost/wrapping_interval_test',
    'test/boost/range_tombstone_list_test',
    'test/boost/reservoir_sampling_test',
+    'test/boost/rolling_max_tracker_test',
    'test/boost/serialization_test',
    'test/boost/small_vector_test',
    'test/boost/top_k_test',
@@ -1642,6 +1654,7 @@ for t in sorted(perf_tests):

 deps['test/boost/combined_tests'] += [
    'test/boost/aggregate_fcts_test.cc',
+    'test/boost/auth_cache_test.cc',
    'test/boost/auth_test.cc',
    'test/boost/batchlog_manager_test.cc',
    'test/boost/cache_algorithm_test.cc',
@@ -1729,6 +1742,7 @@ deps['test/boost/url_parse_test'] = ['utils/http.cc', 'test/boost/url_parse_test
 deps['test/boost/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'test/boost/murmur_hash_test.cc']
 deps['test/boost/allocation_strategy_test'] = ['test/boost/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc', 'utils/labels.cc']
 deps['test/boost/log_heap_test'] = ['test/boost/log_heap_test.cc']
+deps['test/boost/rolling_max_tracker_test'] = ['test/boost/rolling_max_tracker_test.cc']
 deps['test/boost/estimated_histogram_test'] = ['test/boost/estimated_histogram_test.cc']
 deps['test/boost/summary_test'] = ['test/boost/summary_test.cc']
 deps['test/boost/anchorless_list_test'] = ['test/boost/anchorless_list_test.cc']
@@ -2383,7 +2397,7 @@ def write_build_file(f,
    # If compiler cache is available, prefix the compiler with it
    cxx_with_cache = f'{compiler_cache} {args.cxx}' if compiler_cache else args.cxx
    # For Rust, sccache is used via RUSTC_WRAPPER environment variable
-    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache else ''
+    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache and args.sccache_rust else ''
    f.write(textwrap.dedent('''\
        configure_args = {configure_args}
        builddir = {outdir}
@@ -3112,7 +3126,7 @@ def configure_using_cmake(args):
        settings['CMAKE_CXX_COMPILER_LAUNCHER'] = compiler_cache
        settings['CMAKE_C_COMPILER_LAUNCHER'] = compiler_cache
        # For Rust, sccache is used via RUSTC_WRAPPER
-        if 'sccache' in compiler_cache:
+        if 'sccache' in compiler_cache and args.sccache_rust:
            settings['Scylla_RUSTC_WRAPPER'] = compiler_cache

    if args.date_stamp:
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -389,8 +389,10 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
        bool is_ann_ordering = false;
    }
    : K_SELECT (
-                ( K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; } )?
-                ( K_DISTINCT { is_distinct = true; } )?
+                ( (K_JSON K_DISTINCT)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
+                | (K_JSON selectClause K_FROM)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
+                )?
+                ( (K_DISTINCT selectClause K_FROM)=> K_DISTINCT { is_distinct = true; } )?
                sclause=selectClause
               )
      K_FROM (
@@ -425,13 +427,13 @@ selector returns [shared_ptr<raw_selector> s]

 unaliasedSelector returns [uexpression tmp]
    :  ( c=cident                                  { tmp = unresolved_identifier{std::move(c)}; }
+       | v=value                                   { tmp = std::move(v); }
       | K_COUNT '(' countArgument ')'             { tmp = make_count_rows_function_expression(); }
       | K_WRITETIME '(' c=cident ')'              { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::writetime,
                                                                                              unresolved_identifier{std::move(c)}}; }
       | K_TTL       '(' c=cident ')'              { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::ttl,
                                                                                              unresolved_identifier{std::move(c)}}; }
       | f=functionName args=selectionFunctionArgs { tmp = function_call{std::move(f), std::move(args)}; }
-       | f=similarityFunctionName args=vectorSimilarityArgs            { tmp = function_call{std::move(f), std::move(args)}; }
       | K_CAST      '(' arg=unaliasedSelector K_AS t=native_type ')'  { tmp = cast{.style = cast::cast_style::sql, .arg = std::move(arg), .type = std::move(t)}; }
       )
       ( '.' fi=cident { tmp = field_selection{std::move(tmp), std::move(fi)}; }
@@ -446,23 +448,9 @@ selectionFunctionArgs returns [std::vector<expression> a]
      ')'
    ;

-vectorSimilarityArgs returns [std::vector<expression> a]
-    : '(' ')'
-    | '(' v1=vectorSimilarityArg { a.push_back(std::move(v1)); }
-          ( ',' vn=vectorSimilarityArg { a.push_back(std::move(vn)); } )*
-      ')'
-    ;
-
-vectorSimilarityArg returns [uexpression a]
-    : s=unaliasedSelector { a = std::move(s); }
-    | v=value             { a = std::move(v); }
-    ;
-
 countArgument
    : '*'
-    | i=INTEGER { if (i->getText() != "1") {
-                    add_recognition_error("Only COUNT(1) is supported, got COUNT(" + i->getText() + ")");
-                } }
+    /* COUNT(1) is also allowed; it is recognized via the general function(args) path */
    ;

 whereClause returns [uexpression clause]
@@ -886,8 +874,8 @@ cfamDefinition[cql3::statements::create_table_statement::raw_statement& expr]
    ;

 cfamColumns[cql3::statements::create_table_statement::raw_statement& expr]
-    @init { bool is_static=false; }
-    : k=ident v=comparatorType (K_STATIC {is_static = true;})? { $expr.add_definition(k, v, is_static); }
+    @init { bool is_static=false, is_ttl=false; }
+    : k=ident v=comparatorType (K_TTL {is_ttl = true;})? (K_STATIC {is_static = true;})? { $expr.add_definition(k, v, is_static, is_ttl); }
        (K_PRIMARY K_KEY { $expr.add_key_aliases(std::vector<shared_ptr<cql3::column_identifier>>{k}); })?
    | K_PRIMARY K_KEY '(' pkDef[expr] (',' c=ident { $expr.add_column_alias(c); } )* ')'
    ;
@@ -1054,6 +1042,7 @@ alterTableStatement returns [std::unique_ptr<alter_table_statement::raw_statemen
        std::vector<alter_table_statement::column_change> column_changes;
        std::vector<std::pair<shared_ptr<cql3::column_identifier::raw>, shared_ptr<cql3::column_identifier::raw>>> renames;
        auto attrs = std::make_unique<cql3::attributes::raw>();
+        shared_ptr<cql3::column_identifier::raw> ttl_change;
    }
    : K_ALTER K_COLUMNFAMILY cf=columnFamilyName
          ( K_ALTER id=cident K_TYPE v=comparatorType { type = alter_table_statement::type::alter; column_changes.emplace_back(alter_table_statement::column_change{id, v}); }
@@ -1072,9 +1061,11 @@ alterTableStatement returns [std::unique_ptr<alter_table_statement::raw_statemen
          | K_RENAME                                  { type = alter_table_statement::type::rename; }
               id1=cident K_TO toId1=cident { renames.emplace_back(id1, toId1); }
               ( K_AND idn=cident K_TO toIdn=cident { renames.emplace_back(idn, toIdn); } )*
+          | K_TTL                                     { type = alter_table_statement::type::ttl; }
+               ( id=cident { ttl_change = id; } | K_NULL )
          )
    {
-        $expr = std::make_unique<alter_table_statement::raw_statement>(std::move(cf), type, std::move(column_changes), std::move(props), std::move(renames), std::move(attrs));
+        $expr = std::make_unique<alter_table_statement::raw_statement>(std::move(cf), type, std::move(column_changes), std::move(props), std::move(renames), std::move(attrs), std::move(ttl_change));
    }
    ;

@@ -1706,10 +1697,6 @@ functionName returns [cql3::functions::function_name s]
    : (ks=keyspaceName '.')? f=allowedFunctionName   { $s.keyspace = std::move(ks); $s.name = std::move(f); }
    ;

-similarityFunctionName returns [cql3::functions::function_name s]
-    : f=allowedSimilarityFunctionName { $s = cql3::functions::function_name::native_function(std::move(f)); }
-    ;
-
 allowedFunctionName returns [sstring s]
    : f=IDENT                       { $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
    | f=QUOTED_NAME                 { $s = $f.text; }
@@ -1718,11 +1705,6 @@ allowedFunctionName returns [sstring s]
    | K_COUNT                       { $s = "count"; }
    ;

-allowedSimilarityFunctionName returns [sstring s]
-    : f=(K_SIMILARITY_COSINE | K_SIMILARITY_EUCLIDEAN | K_SIMILARITY_DOT_PRODUCT)
-      { $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
-    ;
-
 functionArgs returns [std::vector<expression> a]
    : '(' ')'
    | '(' t1=term { a.push_back(std::move(t1)); }
@@ -2092,7 +2074,21 @@ vector_type returns [shared_ptr<cql3::cql3_type::raw> pt]
        {
            if ($d.text[0] == '-')
                throw exceptions::invalid_request_exception("Vectors must have a dimension greater than 0");
-            $pt = cql3::cql3_type::raw::vector(t, std::stoul($d.text));
+            unsigned long parsed_dimension;
+            try {
+                parsed_dimension = std::stoul($d.text);
+            } catch (const std::exception& e) {
+                throw exceptions::invalid_request_exception(format("Invalid vector dimension: {}", $d.text));
+            }
+            static_assert(sizeof(unsigned long) >= sizeof(vector_dimension_t));
+            if (parsed_dimension == 0) {
+                throw exceptions::invalid_request_exception("Vectors must have a dimension greater than 0");
+            }
+            if (parsed_dimension > cql3::cql3_type::MAX_VECTOR_DIMENSION) {
+                throw exceptions::invalid_request_exception(
+                        format("Vectors must have a dimension less than or equal to {}", cql3::cql3_type::MAX_VECTOR_DIMENSION));
+            }
+            $pt = cql3::cql3_type::raw::vector(t, static_cast<vector_dimension_t>(parsed_dimension));
        }
    ;

@@ -2419,10 +2415,6 @@ K_MUTATION_FRAGMENTS:    M U T A T I O N '_' F R A G M E N T S;

 K_VECTOR_SEARCH_INDEXING: V E C T O R '_' S E A R C H '_' I N D E X I N G;

-K_SIMILARITY_EUCLIDEAN:     S I M I L A R I T Y '_' E U C L I D E A N;
-K_SIMILARITY_COSINE:        S I M I L A R I T Y '_' C O S I N E;
-K_SIMILARITY_DOT_PRODUCT:   S I M I L A R I T Y '_' D O T '_' P R O D U C T;
-
 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
 fragment B: ('b'|'B');
--- a/cql3/assignment_testable.hh
+++ b/cql3/assignment_testable.hh
@@ -27,7 +27,7 @@ public:

    struct vector_test_result {
        test_result result;
-        std::optional<size_t> dimension_opt;
+        std::optional<vector_dimension_t> dimension_opt;
    };

    static bool is_assignable(test_result tr) {
--- a/cql3/column_specification.cc
+++ b/cql3/column_specification.cc
@@ -23,7 +23,7 @@ column_specification::column_specification(std::string_view ks_name_, std::strin

 bool column_specification::all_in_same_table(const std::vector<lw_shared_ptr<column_specification>>& names)
 {
-    SCYLLA_ASSERT(!names.empty());
+    throwing_assert(!names.empty());

    auto first = names.front();
    return std::all_of(std::next(names.begin()), names.end(), [first] (auto&& spec) {
--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -49,9 +49,9 @@ static cql3_type::kind get_cql3_kind(const abstract_type& t) {
        cql3_type::kind operator()(const uuid_type_impl&) { return cql3_type::kind::UUID; }
        cql3_type::kind operator()(const varint_type_impl&) { return cql3_type::kind::VARINT; }
        cql3_type::kind operator()(const reversed_type_impl& r) { return get_cql3_kind(*r.underlying_type()); }
-        cql3_type::kind operator()(const tuple_type_impl&) { SCYLLA_ASSERT(0 && "no kind for this type"); }
-        cql3_type::kind operator()(const vector_type_impl&) { SCYLLA_ASSERT(0 && "no kind for this type"); }
-        cql3_type::kind operator()(const collection_type_impl&) { SCYLLA_ASSERT(0 && "no kind for this type"); }
+        cql3_type::kind operator()(const tuple_type_impl&) { throwing_assert(0 && "no kind for this type"); }
+        cql3_type::kind operator()(const vector_type_impl&) { throwing_assert(0 && "no kind for this type"); }
+        cql3_type::kind operator()(const collection_type_impl&) { throwing_assert(0 && "no kind for this type"); }
    };
    return visit(t, visitor{});
 }
@@ -124,7 +124,7 @@ class cql3_type::raw_collection : public raw {
        } else if (_kind == abstract_type::kind::map) {
            return format("{}map<{}, {}>{}", start, _keys, _values, end);
        }
-        abort();
+        throwing_assert(0 && "invalid raw_collection kind");
    }
 public:
    raw_collection(const abstract_type::kind kind, shared_ptr<raw> keys, shared_ptr<raw> values)
@@ -150,7 +150,7 @@ public:
    }

    virtual cql3_type prepare_internal(const sstring& keyspace, const data_dictionary::user_types_metadata& user_types) override {
-        SCYLLA_ASSERT(_values); // "Got null values type for a collection";
+        throwing_assert(_values); // "Got null values type for a collection";

        if (_values->is_counter()) {
            throw exceptions::invalid_request_exception(format("Counters are not allowed inside collections: {}", *this));
@@ -190,7 +190,7 @@ private:
            }
            return cql3_type(set_type_impl::get_instance(_values->prepare_internal(keyspace, user_types).get_type(), !is_frozen()));
        } else if (_kind == abstract_type::kind::map) {
-            SCYLLA_ASSERT(_keys); // "Got null keys type for a collection";
+            throwing_assert(_keys); // "Got null keys type for a collection";
            if (_keys->is_duration()) {
                throw exceptions::invalid_request_exception(format("Durations are not allowed as map keys: {}", *this));
            }
@@ -198,7 +198,7 @@ private:
                                                         _values->prepare_internal(keyspace, user_types).get_type(),
                                                         !is_frozen()));
        }
-        abort();
+        throwing_assert(0 && "do_prepare invalid kind");
    }
 };

@@ -307,17 +307,14 @@ public:

 class cql3_type::raw_vector : public raw {
    shared_ptr<raw> _type;
-    size_t _dimension;
-
-    // This limitation is acquired from the maximum number of dimensions in OpenSearch. 
-    static constexpr size_t MAX_VECTOR_DIMENSION = 16000;
+    vector_dimension_t _dimension;

    virtual sstring to_string() const override {
        return seastar::format("vector<{}, {}>", _type, _dimension);
    }

 public:
-    raw_vector(shared_ptr<raw> type, size_t dimension)
+    raw_vector(shared_ptr<raw> type, vector_dimension_t dimension)
            : _type(std::move(type)), _dimension(dimension) {
    }

@@ -417,7 +414,7 @@ cql3_type::raw::tuple(std::vector<shared_ptr<raw>> ts) {
 }

 shared_ptr<cql3_type::raw>
-cql3_type::raw::vector(shared_ptr<raw> t, size_t dimension) {
+cql3_type::raw::vector(shared_ptr<raw> t, vector_dimension_t dimension) {
    return ::make_shared<raw_vector>(std::move(t), dimension);
 }

--- a/cql3/cql3_type.hh
+++ b/cql3/cql3_type.hh
@@ -39,6 +39,9 @@ public:
    data_type get_type() const { return _type; }
    const sstring& to_string() const { return _type->cql3_type_name(); }

+    // This limit is taken from the maximum number of dimensions supported by OpenSearch.
+    static constexpr vector_dimension_t MAX_VECTOR_DIMENSION = 16000;
+
    // For UserTypes, we need to know the current keyspace to resolve the
    // actual type used, so Raw is a "not yet prepared" CQL3Type.
    class raw {
@@ -64,7 +67,7 @@ public:
        static shared_ptr<raw> list(shared_ptr<raw> t);
        static shared_ptr<raw> set(shared_ptr<raw> t);
        static shared_ptr<raw> tuple(std::vector<shared_ptr<raw>> ts);
-        static shared_ptr<raw> vector(shared_ptr<raw> t, size_t dimension);
+        static shared_ptr<raw> vector(shared_ptr<raw> t, vector_dimension_t dimension);
        static shared_ptr<raw> frozen(shared_ptr<raw> t);
        friend sstring format_as(const raw& r) {
            return r.to_string();
--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -1603,7 +1603,7 @@ static cql3::raw_value do_evaluate(const collection_constructor& collection, con
        case collection_constructor::style_type::vector:
            return evaluate_vector(collection, inputs);
    }
-    std::abort();
+    throwing_assert(0 && "do_evaluate invalid style");
 }

 static cql3::raw_value do_evaluate(const usertype_constructor& user_val, const evaluation_inputs& inputs) {
--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -10,6 +10,7 @@
 #include "expr-utils.hh"
 #include "evaluate.hh"
 #include "cql3/functions/functions.hh"
+#include "cql3/functions/aggregate_fcts.hh"
 #include "cql3/functions/castas_fcts.hh"
 #include "cql3/functions/scalar_function.hh"
 #include "cql3/column_identifier.hh"
@@ -501,8 +502,8 @@ vector_validate_assignable_to(const collection_constructor& c, data_dictionary::
        throw exceptions::invalid_request_exception(format("Invalid vector type literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }

-    size_t expected_size = vt->get_dimension();
-    if (!expected_size) {
+    vector_dimension_t expected_size = vt->get_dimension();
+    if (expected_size == 0) {
        throw exceptions::invalid_request_exception(format("Invalid vector type literal for {}: type {} expects at least one element",
                                                            *receiver.name, receiver.type->as_cql3_type()));
    }
@@ -875,7 +876,7 @@ cast_test_assignment(const cast& c, data_dictionary::database db, const sstring&
            return assignment_testable::test_result::NOT_ASSIGNABLE;
        }
    } catch (exceptions::invalid_request_exception& e) {
-        abort();
+        throwing_assert(0 && "cast_test_assignment exception");
    }
 }

@@ -1047,8 +1048,47 @@ prepare_function_args_for_type_inference(std::span<const expression> args, data_
    return partially_prepared_args;
 }

+// Special case for count(1) - recognize it as the countRows() function. Note it is quite
+// artificial and we might relax it to the more general count(expression) later.
+// Returns the countRows() call for count(1); std::nullopt when fc is not count(<one untyped
+// constant>), so the caller prepares it normally; throws for any other constant argument.
+static
+std::optional<expression>
+try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
+    return std::visit(overloaded_functor{
+        [&] (const functions::function_name& name) -> std::optional<expression> {
+            const auto native_name = name.has_keyspace() ? name : name.as_native_function();
+            const bool is_unary_count = native_name == functions::function_name::native_function("count") && fc.args.size() == 1;
+            if (!is_unary_count) {
+                return std::nullopt;
+            }
+            const auto* uc_arg = expr::as_if<expr::untyped_constant>(&fc.args[0]);
+            if (!uc_arg) {
+                return std::nullopt;
+            }
+            const bool is_literal_one = uc_arg->partial_type == expr::untyped_constant::type_class::integer && uc_arg->raw_text == "1";
+            if (!is_literal_one) {
+                // No placeholders in the message, so it is passed as-is rather than through format().
+                throw exceptions::invalid_request_exception("count() expects a column or the literal 1 as an argument");
+            }
+            // Collapse count(1) into countRows()
+            return expr::function_call{
+                .func = functions::aggregate_fcts::make_count_rows_function(),
+                .args = {},
+            };
+        },
+        [] (const shared_ptr<functions::function>&) -> std::optional<expression> {
+            // Already prepared, nothing to do
+            return std::nullopt;
+        },
+    }, fc.func);
+}
+
 std::optional<expression>
 prepare_function_call(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
+    if (auto prepared = try_prepare_count_rows(fc, db, keyspace, schema_opt, receiver)) {
+        return prepared;
+    }
    // Try to extract a column family name from the available information.
    // Most functions can be prepared without information about the column family, usually just the keyspace is enough.
    // One exception is the token() function - in order to prepare system.token() we have to know the partition key of the table,
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -544,7 +544,7 @@ functions::get_user_aggregates(const sstring& keyspace) const {

 std::ranges::subrange<functions::declared_t::const_iterator>
 functions::find(const function_name& name) const {
-    SCYLLA_ASSERT(name.has_keyspace()); // : "function name not fully qualified";
+    throwing_assert(name.has_keyspace()); // the function name must be fully qualified before lookup
    auto pair = _declared.equal_range(name);
    return std::ranges::subrange(pair.first, pair.second);
 }
--- a/cql3/functions/vector_similarity_fcts.cc
+++ b/cql3/functions/vector_similarity_fcts.cc
@@ -10,9 +10,38 @@
 #include "types/types.hh"
 #include "types/vector.hh"
 #include "exceptions/exceptions.hh"
+#include <bit>
+#include <span>
+#include <seastar/core/byteorder.hh>

 namespace cql3 {
 namespace functions {
+
+namespace detail {
+
+// Decodes `dimension` big-endian floats (4 bytes each) straight from the serialized bytes.
+std::vector<float> extract_float_vector(const bytes_opt& param, vector_dimension_t dimension) {
+    if (!param) {
+        throw exceptions::invalid_request_exception("Cannot extract float vector from null parameter");
+    }
+    const auto& raw = *param;
+    const size_t wanted_bytes = dimension * sizeof(float);
+    if (raw.size() != wanted_bytes) {
+        throw exceptions::invalid_request_exception(
+            fmt::format("Invalid vector size: expected {} bytes for {} floats, got {} bytes",
+                       wanted_bytes, dimension, raw.size()));
+    }
+
+    std::vector<float> floats(dimension);
+    const char* cursor = reinterpret_cast<const char*>(raw.data());
+    for (float& slot : floats) {
+        slot = std::bit_cast<float>(consume_be<uint32_t>(cursor));
+    }
+    return floats;
+}
+
+} // namespace detail
+
 namespace {

 // The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
@@ -22,14 +51,15 @@ namespace {

 // You should only use this function if you need to preserve the original vectors and cannot normalize
 // them in advance.
-float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
-    double dot_product = 0.0;
-    double squared_norm_a = 0.0;
-    double squared_norm_b = 0.0;
+float compute_cosine_similarity(std::span<const float> v1, std::span<const float> v2) {
+    #pragma clang fp contract(fast) reassociate(on) // Allow the compiler to optimize the loop.
+    float dot_product = 0.0;
+    float squared_norm_a = 0.0;
+    float squared_norm_b = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        float a = v1[i];
+        float b = v2[i];

        dot_product += a * b;
        squared_norm_a += a * a;
@@ -37,7 +67,7 @@ float compute_cosine_similarity(const std::vector<data_value>& v1, const std::ve
    }

    if (squared_norm_a == 0 || squared_norm_b == 0) {
-        throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
+        return std::numeric_limits<float>::quiet_NaN();
    }

    // The cosine similarity is in the range [-1, 1].
@@ -46,14 +76,15 @@ float compute_cosine_similarity(const std::vector<data_value>& v1, const std::ve
    return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
 }

-float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
-    double sum = 0.0;
+float compute_euclidean_similarity(std::span<const float> v1, std::span<const float> v2) {
+    #pragma clang fp contract(fast) reassociate(on) // Allow the compiler to optimize the loop.
+    float sum = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        float a = v1[i];
+        float b = v2[i];

-        double diff = a - b;
+        float diff = a - b;
        sum += diff * diff;
    }

@@ -65,12 +96,13 @@ float compute_euclidean_similarity(const std::vector<data_value>& v1, const std:

 // Assumes that both vectors are L2-normalized.
 // This similarity is intended as an optimized way to perform cosine similarity calculation.
-float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
-    double dot_product = 0.0;
+float compute_dot_product_similarity(std::span<const float> v1, std::span<const float> v2) {
+    #pragma clang fp contract(fast) reassociate(on) // Allow the compiler to optimize the loop.
+    float dot_product = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = value_cast<float>(v1[i]);
-        double b = value_cast<float>(v2[i]);
+        float a = v1[i];
+        float b = v2[i];
        dot_product += a * b;
    }

@@ -124,7 +156,7 @@ std::vector<data_type> retrieve_vector_arg_types(const function_name& name, cons
        }
    }

-    size_t dimension = first_dim_opt ? *first_dim_opt : *second_dim_opt;
+    vector_dimension_t dimension = first_dim_opt ? *first_dim_opt : *second_dim_opt;
    auto type = vector_type_impl::get_instance(float_type, dimension);
    return {type, type};
 }
@@ -136,13 +168,15 @@ bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters)
        return std::nullopt;
    }

-    const auto& type = arg_types()[0];
-    data_value v1 = type->deserialize(*parameters[0]);
-    data_value v2 = type->deserialize(*parameters[1]);
-    const auto& v1_elements = value_cast<std::vector<data_value>>(v1);
-    const auto& v2_elements = value_cast<std::vector<data_value>>(v2);
+    // Extract dimension from the vector type
+    const auto& type = static_cast<const vector_type_impl&>(*arg_types()[0]);
+    vector_dimension_t dimension = type.get_dimension();

-    float result = SIMILARITY_FUNCTIONS.at(_name)(v1_elements, v2_elements);
+    // Optimized path: extract floats directly from bytes, bypassing data_value overhead
+    std::vector<float> v1 = detail::extract_float_vector(parameters[0], dimension);
+    std::vector<float> v2 = detail::extract_float_vector(parameters[1], dimension);
+
+    float result = SIMILARITY_FUNCTIONS.at(_name)(v1, v2);
    return float_type->decompose(result);
 }

--- a/cql3/functions/vector_similarity_fcts.hh
+++ b/cql3/functions/vector_similarity_fcts.hh
@@ -11,6 +11,7 @@
 #include "native_scalar_function.hh"
 #include "cql3/assignment_testable.hh"
 #include "cql3/functions/function_name.hh"
+#include <span>

 namespace cql3 {
 namespace functions {
@@ -19,7 +20,7 @@ static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::nati
 static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
 static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");

-using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
+using similarity_function_t = float (*)(std::span<const float>, std::span<const float>);
 extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;

 std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
@@ -33,5 +34,14 @@ public:
    virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
 };

+namespace detail {
+
+// Extract float vector directly from serialized bytes, bypassing data_value overhead.
+// Internal API exposed for testing. Throws invalid_request_exception if param is null or not N * sizeof(float) bytes.
+// Vector<float, N> wire format: N floats as big-endian uint32_t values, 4 bytes each.
+std::vector<float> extract_float_vector(const bytes_opt& param, vector_dimension_t dimension);
+
+} // namespace detail
+
 } // namespace functions
 } // namespace cql3
--- a/cql3/keyspace_element_name.cc
+++ b/cql3/keyspace_element_name.cc
@@ -25,7 +25,7 @@ bool keyspace_element_name::has_keyspace() const

 const sstring& keyspace_element_name::get_keyspace() const
 {
-    SCYLLA_ASSERT(_ks_name);
+    throwing_assert(_ks_name); // guards the dereference on the next line
    return *_ks_name;
 }

--- a/Show More
+++ b/Show More