Fix critical bugs and issues found in alternator code review

Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
Initial plan
2026-05-13 11:22:01 +00:00 · 2026-01-29 22:54:57 +00:00 · 2026-01-29 22:49:31 +00:00 · 2026-01-29 17:25:42 +01:00 · 2026-01-29 18:12:35 +02:00 · 2026-01-29 16:18:26 +02:00
472 changed files with 5526 additions and 10750 deletions
--- a/.github/workflows/add-label-when-promoted.yaml
+++ b/.github/workflows/add-label-when-promoted.yaml
@@ -10,9 +10,6 @@ on:
    types: [labeled, unlabeled]
    branches: [master, next, enterprise]

-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
-
 jobs:
  check-commit:
    runs-on: ubuntu-latest
@@ -33,7 +30,7 @@ jobs:
            echo "DEFAULT_BRANCH=master" >> $GITHUB_ENV
          fi
      - name: Checkout repository
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@v4
        with:
          repository: ${{ github.repository }}
          ref: ${{ env.DEFAULT_BRANCH }}
--- a/.github/workflows/backport-pr-fixes-validation.yaml
+++ b/.github/workflows/backport-pr-fixes-validation.yaml
@@ -5,15 +5,12 @@ on:
    types: [opened, reopened, edited]
    branches: [branch-*]

-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
-
 jobs:
  check-fixes-prefix:
    runs-on: ubuntu-latest
    steps:
      - name: Check PR body for "Fixes" prefix patterns
-        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+        uses: actions/github-script@v7
        with:
          script: |
            const body = context.payload.pull_request.body;
--- a/.github/workflows/build-scylla.yaml
+++ b/.github/workflows/build-scylla.yaml
@@ -12,9 +12,6 @@ on:
        description: 'the md5sum for scylla executable'
        value: ${{ jobs.build.outputs.md5sum }}

-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
-
 jobs:
  read-toolchain:
    uses: ./.github/workflows/read-toolchain.yaml
@@ -27,7 +24,7 @@ jobs:
    outputs:
      md5sum: ${{ steps.checksum.outputs.md5sum }}
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - name: Generate the building system
--- a/.github/workflows/call_backport_with_jira.yaml
+++ b/.github/workflows/call_backport_with_jira.yaml
@@ -1,53 +0,0 @@
-name: Backport with Jira Integration
-
-on:
-  push:
-    branches:
-      - master
-      - next-*.*
-      - branch-*.*
-  pull_request_target:
-    types: [labeled, closed]
-    branches: 
-      - master
-      - next
-      - next-*.*
-      - branch-*.*
-
-jobs:
-  backport-on-push:
-    if: github.event_name == 'push'
-    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
-    with:
-      event_type: 'push'
-      base_branch: ${{ github.ref }}
-      commits: ${{ github.event.before }}..${{ github.sha }}
-    secrets:
-      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
-      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
-  backport-on-label:
-    if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
-    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
-    with:
-      event_type: 'labeled'
-      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
-      pull_request_number: ${{ github.event.pull_request.number }}
-      head_commit: ${{ github.event.pull_request.base.sha }}
-      label_name: ${{ github.event.label.name }}
-      pr_state: ${{ github.event.pull_request.state }}
-    secrets:
-      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
-      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
-  backport-chain:
-    if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
-    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
-    with:
-      event_type: 'chain'
-      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
-      pull_request_number: ${{ github.event.pull_request.number }}
-      pr_body: ${{ github.event.pull_request.body }}
-    secrets:
-      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
-      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/check-license-header.yaml
+++ b/.github/workflows/check-license-header.yaml
@@ -9,7 +9,6 @@ env:
  HEADER_CHECK_LINES: 10
  LICENSE: "LicenseRef-ScyllaDB-Source-Available-1.0"
  CHECKED_EXTENSIONS: ".cc .hh .py"
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

 jobs:
  check-license-headers:
@@ -20,7 +19,7 @@ jobs:

    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -41,7 +40,7 @@ jobs:

      - name: Comment on PR if check fails
        if: failure()
-        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+        uses: actions/github-script@v7
        with:
          script: |
            const license = '${{ env.LICENSE }}';
--- a/.github/workflows/clang-nightly.yaml
+++ b/.github/workflows/clang-nightly.yaml
@@ -9,7 +9,6 @@ env:
  # use the development branch explicitly
  CLANG_VERSION: 21
  BUILD_DIR: build
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

 permissions: {}

@@ -33,7 +32,7 @@ jobs:
    steps:
      - run: |
          sudo dnf -y install git
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Install build dependencies
--- a/.github/workflows/clang-tidy.yaml
+++ b/.github/workflows/clang-tidy.yaml
@@ -18,7 +18,6 @@ env:
  BUILD_TYPE: RelWithDebInfo
  BUILD_DIR: build
  CLANG_TIDY_CHECKS: '-*,bugprone-use-after-move'
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

 permissions: {}

@@ -43,7 +42,7 @@ jobs:
          IMAGE: ${{ needs.read-toolchain.image }}
        run: |
          echo ${{ needs.read-toolchain.image }}
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@v4
        with:
          submodules: true
      - run: |
--- a/.github/workflows/codespell.yaml
+++ b/.github/workflows/codespell.yaml
@@ -4,15 +4,13 @@ on:
    branches:
      - master
 permissions: {}
-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
 jobs:
  codespell:
    name: Check for spelling errors
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-      - uses: codespell-project/actions-codespell@8f01853be192eb0f849a5c7d721450e7a467c579 # v2.2
+      - uses: actions/checkout@v4
+      - uses: codespell-project/actions-codespell@8f01853be192eb0f849a5c7d721450e7a467c579 # v2.2
        with:
          only_warn: 1
          ignore_words_list: "ans,datas,fo,ser,ue,crate,nd,reenable,strat,stap,te,raison,iif,tread"
--- a/.github/workflows/conflict_reminder.yaml
+++ b/.github/workflows/conflict_reminder.yaml
@@ -12,16 +12,13 @@ on:
  schedule:
    - cron: '0 10 * * 1'  # Runs every Monday at 10:00am

-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
-
 jobs:
  notify_conflict_prs:
    runs-on: ubuntu-latest

    steps:
      - name: Notify PR Authors of Conflicts
-        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+        uses: actions/github-script@v7
        with:
          script: |
            console.log("Starting conflict reminder script...");
--- a/.github/workflows/differential-shellcheck.yaml
+++ b/.github/workflows/differential-shellcheck.yaml
@@ -13,9 +13,6 @@ on:
 permissions:
  contents: read

-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
-
 jobs:
  lint:
    runs-on: ubuntu-latest
@@ -24,12 +21,12 @@ jobs:
      security-events: write

    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Differential ShellCheck
-        uses: redhat-plumbers-in-action/differential-shellcheck@d965e66ec0b3b2f821f75c8eff9b12442d9a7d1e # v5.5.6
+        uses: redhat-plumbers-in-action/differential-shellcheck@v5
        with:
          severity: warning
          token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/docs-pages.yaml
+++ b/.github/workflows/docs-pages.yaml
@@ -5,7 +5,6 @@ name: "Docs / Publish"
 env:
  FLAG: ${{ github.repository == 'scylladb/scylla-enterprise' && 'enterprise' || 'opensource' }}
  DEFAULT_BRANCH: ${{ github.repository == 'scylladb/scylla-enterprise' && 'enterprise' || 'master' }}
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

 on:
  push:
@@ -24,13 +23,13 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@v4
        with:
          ref: ${{ env.DEFAULT_BRANCH }}
          persist-credentials: false
          fetch-depth: 0
      - name: Set up Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Set up env
--- a/.github/workflows/docs-pr.yaml
+++ b/.github/workflows/docs-pr.yaml
@@ -7,7 +7,6 @@ permissions:

 env:
  FLAG: ${{ github.repository == 'scylladb/scylla-enterprise' && 'enterprise' || 'opensource' }}
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

 on:
  pull_request:
@@ -23,12 +22,12 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@v4
        with:
          persist-credentials: false
          fetch-depth: 0
      - name: Set up Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
+        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Set up env
--- a/.github/workflows/docs-validate-metrics.yml
+++ b/.github/workflows/docs-validate-metrics.yml
@@ -3,9 +3,6 @@ name: Docs / Validate metrics
 permissions:
  contents: read

-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
-
 on:
  pull_request:
    branches:
@@ -24,12 +21,12 @@ jobs:

    steps:
    - name: Checkout code
-      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      uses: actions/checkout@v4
      with:
        submodules: true

    - name: Set up Python
-      uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+      uses: actions/setup-python@v6
      with:
        python-version: '3.10'

--- a/.github/workflows/iwyu.yaml
+++ b/.github/workflows/iwyu.yaml
@@ -13,9 +13,9 @@ env:
  # supposed to be processed by idl-compiler.py, so we don't check them using the cleaner
  CLEANER_DIRS: test/unit exceptions alternator api auth cdc compaction db dht gms index lang message mutation mutation_writer node_ops raft redis replica service
  SEASTAR_BAD_INCLUDE_OUTPUT_PATH: build/seastar-bad-include.log
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

-permissions: {}
+permissions:
+  contents: read

 # cancel the in-progress run upon a repush
 concurrency:
@@ -32,11 +32,9 @@ jobs:
    runs-on: ubuntu-latest
    container: ${{ needs.read-toolchain.outputs.image }}
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@v4
        with:
          submodules: true
-      - run: |
-          sudo dnf -y install clang-tools-extra
      - name: Generate compilation database
        run: |
          cmake                                         \
@@ -91,7 +89,7 @@ jobs:
            | tee "$SEASTAR_BAD_INCLUDE_OUTPUT_PATH"
      - run: |
          echo "::remove-matcher owner=seastar-bad-include::"
-      - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+      - uses: actions/upload-artifact@v4
        with:
          name: Logs
          path: |
--- a/.github/workflows/make-pr-ready-for-review.yaml
+++ b/.github/workflows/make-pr-ready-for-review.yaml
@@ -7,7 +7,6 @@ on:

 env:
  DEFAULT_BRANCH: 'master'
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

 jobs:
  mark-ready:
@@ -18,7 +17,7 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@v4
        with:
          repository: ${{ github.repository }}
          ref: ${{ env.DEFAULT_BRANCH }}
--- a/.github/workflows/pr-require-backport-label.yaml
+++ b/.github/workflows/pr-require-backport-label.yaml
@@ -5,8 +5,6 @@ on:
    branches:
      - master
      - next
-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
 jobs:
  label:
    if: github.event.pull_request.draft == false
@@ -17,7 +15,7 @@ jobs:
    steps:
      - name: Wait for label to be added
        run: sleep 1m
-      - uses: mheap/github-action-required-labels@0ac283b4e65c1fb28ce6079dea5546ceca98ccbe # v5.5.2
+      - uses: mheap/github-action-required-labels@v5
        with:
          mode: minimum
          count: 1
--- a/.github/workflows/read-toolchain.yaml
+++ b/.github/workflows/read-toolchain.yaml
@@ -7,9 +7,6 @@ on:
        description: "the toolchain docker image"
        value: ${{ jobs.read-toolchain.outputs.image }}

-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
-
 jobs:
  read-toolchain:
    runs-on: ubuntu-latest
@@ -18,7 +15,7 @@ jobs:
    outputs:
      image: ${{ steps.read.outputs.image }}
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@v4
        with:
          sparse-checkout: tools/toolchain/image
          sparse-checkout-cone-mode: false
--- a/.github/workflows/seastar.yaml
+++ b/.github/workflows/seastar.yaml
@@ -13,7 +13,6 @@ concurrency:

 env:
  BUILD_DIR: build
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

 jobs:
  read-toolchain:
@@ -30,12 +29,12 @@ jobs:
          - RelWithDebInfo
          - Dev
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@v4
        with:
          submodules: true
      - run: |
          rm -rf seastar
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@v4
        with:
          repository: scylladb/seastar
          submodules: true
--- a/.github/workflows/sync-labels.yaml
+++ b/.github/workflows/sync-labels.yaml
@@ -7,9 +7,6 @@ on:
  issues:
    types: [labeled, unlabeled]

-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
-
 jobs:
  label-sync:
    if: ${{ github.repository == 'scylladb/scylladb' }}
@@ -24,7 +21,7 @@ jobs:
          GITHUB_CONTEXT: ${{ toJson(github) }}
        run: echo "$GITHUB_CONTEXT"
      - name: Checkout repository
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@v4
        with:
          sparse-checkout: |
            .github/scripts/sync_labels.py
--- a/.github/workflows/trigger-scylla-ci.yaml
+++ b/.github/workflows/trigger-scylla-ci.yaml
@@ -9,57 +9,16 @@ on:

 jobs:
  trigger-jenkins:
-    if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
+    if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
    runs-on: ubuntu-latest
    steps:
-      - name: Verify Org Membership
-        id: verify_author
-        env:
-          EVENT_NAME: ${{ github.event_name }}
-          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-          PR_ASSOCIATION: ${{ github.event.pull_request.author_association }}
-          COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
-          COMMENT_ASSOCIATION: ${{ github.event.comment.author_association }}
-        shell: bash
-        run: |
-          if [[ "$EVENT_NAME" == "pull_request_target" ]]; then
-            AUTHOR="$PR_AUTHOR"
-            ASSOCIATION="$PR_ASSOCIATION"
-          else
-            AUTHOR="$COMMENT_AUTHOR"
-            ASSOCIATION="$COMMENT_ASSOCIATION"
-          fi
-          ORG="scylladb"
-          if gh api "/orgs/${ORG}/members/${AUTHOR}" --silent 2>/dev/null; then
-            echo "member=true" >> $GITHUB_OUTPUT
-          else
-            echo "::warning::${AUTHOR} is not a member of ${ORG}; skipping CI trigger."
-            echo "member=false" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Validate Comment Trigger
-        if: github.event_name == 'issue_comment'
-        id: verify_comment
-        env:
-          COMMENT_BODY: ${{ github.event.comment.body }}
-        shell: bash
-        run: |
-          CLEAN_BODY=$(echo "$COMMENT_BODY" | grep -v '^[[:space:]]*>')
-
-          if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
-            echo "trigger=true" >> $GITHUB_OUTPUT
-          else
-            echo "trigger=false" >> $GITHUB_OUTPUT
-          fi
-
      - name: Trigger Scylla-CI-Route Jenkins Job
-        if: steps.verify_author.outputs.member == 'true' && (github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true')
        env:
          JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
          JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
          JENKINS_URL: "https://jenkins.scylladb.com"
-          PR_NUMBER: "${{ github.event.issue.number || github.event.pull_request.number }}"
-          PR_REPO_NAME: "${{ github.event.repository.full_name }}"
        run: |
+          PR_NUMBER="${{ github.event.issue.number || github.event.pull_request.number }}"  # label events carry the PR under pull_request, not issue
+          PR_REPO_NAME="${{ github.event.repository.full_name }}"
           curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
-            --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail
+          --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i  # no -v: verbose mode would print the Authorization header into the job log
--- a/.github/workflows/trigger_ci.yaml
+++ b/.github/workflows/trigger_ci.yaml
@@ -5,10 +5,7 @@ on:
    types: [opened, reopened, synchronize]
  issue_comment:
    types: [created]
-
-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
-
+    
 jobs:
  trigger-ci:
    runs-on: ubuntu-latest
@@ -18,7 +15,7 @@ jobs:
          GITHUB_CONTEXT: ${{ toJson(github) }}
        run: echo "$GITHUB_CONTEXT"
      - name: Checkout PR code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@v3
        with:
          fetch-depth: 0  # Needed to access full history
          ref: ${{ github.event.pull_request.head.ref }}
--- a/.github/workflows/urgent_issue_reminder.yml
+++ b/.github/workflows/urgent_issue_reminder.yml
@@ -4,16 +4,13 @@ on:
  schedule:
    - cron: '10 8 * * *' # Runs daily at 8 AM

-env:
-  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
-
 jobs:
  reminder:
    runs-on: ubuntu-latest

    steps:
    - name: Send reminders
-      uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+      uses: actions/github-script@v7
      with:
        script: |
          const labelFilters = ['P0', 'P1', 'Field-Tier1','status/release blocker', 'status/regression']; 
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../scylla-seastar
+	url = ../seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2026.1.3
+VERSION=2026.2.0-dev

 if test -f version
 then
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -244,7 +244,10 @@ static bool is_set_of(const rjson::value& type1, const rjson::value& type2) {

 // Check if two JSON-encoded values match with the CONTAINS relation
 bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query) {
-    if (!v1) {
+    if (!v1 || !v1->IsObject() || v1->MemberCount() == 0) {
+        return false;
+    }
+    if (!v2.IsObject() || v2.MemberCount() == 0) {
        return false;
    }
    const auto& kv1 = *v1->MemberBegin();
--- a/alternator/consumed_capacity.cc
+++ b/alternator/consumed_capacity.cc
@@ -45,7 +45,7 @@ bool consumed_capacity_counter::should_add_capacity(const rjson::value& request)
 }

 void consumed_capacity_counter::add_consumed_capacity_to_response_if_needed(rjson::value& response) const noexcept {
-    if (_should_add_to_reponse) {
+    if (_should_add_to_response) {
        auto consumption = rjson::empty_object();
        rjson::add(consumption, "CapacityUnits", get_consumed_capacity_units());
        rjson::add(response, "ConsumedCapacity", std::move(consumption));
@@ -53,7 +53,9 @@ void consumed_capacity_counter::add_consumed_capacity_to_response_if_needed(rjso
 }

 static uint64_t calculate_half_units(uint64_t unit_block_size, uint64_t total_bytes, bool is_quorum) {
-    uint64_t half_units = (total_bytes + unit_block_size -1) / unit_block_size; //divide by unit_block_size and round up
+    // Avoid potential integer overflow when total_bytes is close to UINT64_MAX
+    // by using division with modulo instead of addition before division
+    uint64_t half_units = total_bytes / unit_block_size + (total_bytes % unit_block_size != 0 ? 1 : 0);

    if (is_quorum) {
        half_units *= 2;
--- a/alternator/consumed_capacity.hh
+++ b/alternator/consumed_capacity.hh
@@ -28,9 +28,9 @@ namespace alternator {
 class consumed_capacity_counter {
 public:
    consumed_capacity_counter() = default;
-    consumed_capacity_counter(bool should_add_to_reponse) : _should_add_to_reponse(should_add_to_reponse){}
+    consumed_capacity_counter(bool should_add_to_response) : _should_add_to_response(should_add_to_response){}
    bool operator()() const noexcept {
-        return _should_add_to_reponse;
+        return _should_add_to_response;
    }

    consumed_capacity_counter& operator +=(uint64_t bytes);
@@ -44,7 +44,7 @@ public:
    uint64_t _total_bytes = 0;
    static bool should_add_capacity(const rjson::value& request);
 protected:
-    bool _should_add_to_reponse = false;
+    bool _should_add_to_response = false;
 };

 class rcu_consumed_capacity_counter : public consumed_capacity_counter {
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -834,11 +834,13 @@ future<> executor::fill_table_size(rjson::value &table_description, schema_ptr s
            total_size = co_await _ss.estimate_total_sstable_volume(schema->id(), service::storage_service::ignore_errors::yes);
            const auto expiry = std::chrono::seconds{ _proxy.data_dictionary().get_config().alternator_describe_table_info_cache_validity_in_seconds() };
            // Note: we don't care when the notification of other shards will finish, as long as it will be done
-            // it's possible to get into race condition (next DescribeTable comes to other shard, that new shard doesn't have
-            // the size yet, so it will calculate it again) - this is not a problem, because it will call cache_newly_calculated_size_on_all_shards
-            // with expiry, which is extremely unlikely to be exactly the same as the previous one, all shards will keep the size coming with expiry that is further into the future.
-            // In case of the same expiry, some shards will have different size, which means DescribeTable will return different values depending on the shard
-            // which is also fine, as the specification doesn't give precision guarantees of any kind.
+            // A race condition is possible: if a DescribeTable request arrives on a different shard before
+            // that shard receives the cached size, it will recalculate independently. This is acceptable because:
+            // 1. Both calculations will cache their results with an expiry time
+            // 2. Expiry times are unlikely to be identical, so eventually all shards converge to the most recent value
+            // 3. Even if expiry times match, different shards may briefly return different table sizes
+            // 4. This temporary inconsistency is acceptable per DynamoDB specification, which doesn't guarantee
+            //    exact precision for DescribeTable size information
            co_await cache_newly_calculated_size_on_all_shards(schema, total_size, expiry);
        }
    }
@@ -3464,11 +3466,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
    if (should_add_wcu) {
        rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
    }
-    auto duration = std::chrono::steady_clock::now() - start_time;
-    _stats.api_operations.batch_write_item_latency.mark(duration);
-    for (const auto& w : per_table_wcu) {
-        w.first->api_operations.batch_write_item_latency.mark(duration);
-    }
+    _stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    co_return rjson::print(std::move(ret));
 }

@@ -4979,12 +4977,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
    if (!some_succeeded && eptr) {
        co_await coroutine::return_exception_ptr(std::move(eptr));
    }
-    auto duration = std::chrono::steady_clock::now() - start_time;
-    _stats.api_operations.batch_get_item_latency.mark(duration);
-    for (const table_requests& rs : requests) {
-        lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
-        per_table_stats->api_operations.batch_get_item_latency.mark(duration);
-    }
+    _stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    if (is_big(response)) {
        co_return make_streamed(std::move(response));
    } else {
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -491,7 +491,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

    if (!opts.enabled()) {
        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        co_return rjson::print(std::move(ret));
    }

    // TODO: label
@@ -502,123 +502,121 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
    // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
    auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);

-    return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {
+    std::map<db_clock::time_point, cdc::streams_version> topologies = co_await _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners });
+    auto e = topologies.end();
+    auto prev = e;
+    auto shards = rjson::empty_array();

-        auto e = topologies.end();
-        auto prev = e;
-        auto shards = rjson::empty_array();
+    std::optional<shard_id> last;

-        std::optional<shard_id> last;
+    auto i = topologies.begin();
+    // if we're a paged query, skip to the generation where we left off.
+    if (shard_start) {
+        i = topologies.find(shard_start->time);
+    }

-        auto i = topologies.begin();
-        // if we're a paged query, skip to the generation where we left of.
-        if (shard_start) {
-            i = topologies.find(shard_start->time);
-        }
+    // for parent-child stuff we need id:s to be sorted by token
+    // (see explanation above) since we want to find closest
+    // token boundary when determining parent.
+    // #7346 - we processed and searched children/parents in
+    // stored order, which is not necessarily token order,
+    // so the finding of "closest" token boundary (using upper bound)
+    // could give somewhat weird results.
+    static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
+        return id1.token() < id2.token();
+    };

-        // for parent-child stuff we need id:s to be sorted by token
-        // (see explanation above) since we want to find closest
-        // token boundary when determining parent.
-        // #7346 - we processed and searched children/parents in
-        // stored order, which is not necessarily token order,
-        // so the finding of "closest" token boundary (using upper bound)
-        // could give somewhat weird results.
-        static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
-            return id1.token() < id2.token();
-        };
+    // #7409 - shards must be returned in lexicographical order,
+    // normal bytes compare is string_traits<int8_t>::compare.
+    // thus bytes 0x8000 is less than 0x0000. By doing unsigned
+    // compare instead we inadvertently will sort in string lexical.
+    static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
+        return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
+    };
+
+    // need a prev even if we are skipping stuff
+    if (i != topologies.begin()) {
+        prev = std::prev(i);
+    }
+
+    for (; limit > 0 && i != e; prev = i, ++i) {
+        auto& [ts, sv] = *i;
+
+        last = std::nullopt;
+
+        auto lo = sv.streams.begin();
+        auto end = sv.streams.end();

        // #7409 - shards must be returned in lexicographical order,
-        // normal bytes compare is string_traits<int8_t>::compare.
-        // thus bytes 0x8000 is less than 0x0000. By doing unsigned
-        // compare instead we inadvertently will sort in string lexical.
-        static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
-            return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
-        };
+        std::sort(lo, end, id_cmp);

-        // need a prev even if we are skipping stuff
-        if (i != topologies.begin()) {
-            prev = std::prev(i);
+        if (shard_start) {
+            // find next shard position
+            lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
+            shard_start = std::nullopt;
        }

-        for (; limit > 0 && i != e; prev = i, ++i) {
-            auto& [ts, sv] = *i;
+        if (lo != end && prev != e) {
+            // We want older stuff sorted in token order so we can find matching
+            // token range when determining parent shard.
+            std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
+        }
+
+        auto expired = [&]() -> std::optional<db_clock::time_point> {
+            auto j = std::next(i);
+            if (j == e) {
+                return std::nullopt;
+            }
+            // add this so we sort of match potential 
+            // sequence numbers in get_records result.
+            return j->first + confidence_interval(db);
+        }();
+
+        while (lo != end) {
+            auto& id = *lo++;
+
+            auto shard = rjson::empty_object();
+
+            if (prev != e) {
+                auto& pids = prev->second.streams;
+                auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
+                    return t < id.token();
+                });
+                if (pid != pids.begin()) {
+                    pid = std::prev(pid);
+                }
+                if (pid != pids.end()) {
+                    rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
+                }
+            }
+
+            last.emplace(ts, id);
+            rjson::add(shard, "ShardId", *last);
+            auto range = rjson::empty_object();
+            rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
+            if (expired) {
+                rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
+            }
+
+            rjson::add(shard, "SequenceNumberRange", std::move(range));
+            rjson::push_back(shards, std::move(shard));
+            
+            if (--limit == 0) {
+                break;
+            }

            last = std::nullopt;
-
-            auto lo = sv.streams.begin();
-            auto end = sv.streams.end();
-
-            // #7409 - shards must be returned in lexicographical order,
-            std::sort(lo, end, id_cmp);
-
-            if (shard_start) {
-                // find next shard position
-                lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
-                shard_start = std::nullopt;
-            }
-
-            if (lo != end && prev != e) {
-                // We want older stuff sorted in token order so we can find matching
-                // token range when determining parent shard.
-                std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
-            }
-
-            auto expired = [&]() -> std::optional<db_clock::time_point> {
-                auto j = std::next(i);
-                if (j == e) {
-                    return std::nullopt;
-                }
-                // add this so we sort of match potential 
-                // sequence numbers in get_records result.
-                return j->first + confidence_interval(db);
-            }();
-
-            while (lo != end) {
-                auto& id = *lo++;
-
-                auto shard = rjson::empty_object();
-
-                if (prev != e) {
-                    auto& pids = prev->second.streams;
-                    auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
-                        return t < id.token();
-                    });
-                    if (pid != pids.begin()) {
-                        pid = std::prev(pid);
-                    }
-                    if (pid != pids.end()) {
-                        rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
-                    }
-                }
-
-                last.emplace(ts, id);
-                rjson::add(shard, "ShardId", *last);
-                auto range = rjson::empty_object();
-                rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
-                if (expired) {
-                    rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
-                }
-
-                rjson::add(shard, "SequenceNumberRange", std::move(range));
-                rjson::push_back(shards, std::move(shard));
-                
-                if (--limit == 0) {
-                    break;
-                }
-
-                last = std::nullopt;
-            }
        }
+    }

-        if (last) {
-            rjson::add(stream_desc, "LastEvaluatedShardId", *last);
-        }
+    if (last) {
+        rjson::add(stream_desc, "LastEvaluatedShardId", *last);
+    }

-        rjson::add(stream_desc, "Shards", std::move(shards));
-        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-            
-        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
-    });
+    rjson::add(stream_desc, "Shards", std::move(shards));
+    rjson::add(ret, "StreamDescription", std::move(stream_desc));
+        
+    co_return rjson::print(std::move(ret));
 }

 enum class shard_iterator_type {
@@ -898,172 +896,169 @@ future<executor::request_return_type> executor::get_records(client_state& client
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
            query::tombstone_limit(_proxy.get_tombstone_limit()), query::row_limit(limit * mul));

-    co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
-            [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {       
-        cql3::selection::result_set_builder builder(*selection, gc_clock::now());
-        query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
+    service::storage_proxy::coordinator_query_result qr = co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state));
+    cql3::selection::result_set_builder builder(*selection, gc_clock::now());
+    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));

-        auto result_set = builder.build();
-        auto records = rjson::empty_array();
+    auto result_set = builder.build();
+    auto records = rjson::empty_array();

-        auto& metadata = result_set->get_metadata();
+    auto& metadata = result_set->get_metadata();

-        auto op_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == op_column_name;
-            })
-        );
-        auto ts_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == timestamp_column_name;
-            })
-        );
-        auto eor_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == eor_column_name;
-            })
-        );
+    auto op_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == op_column_name;
+        })
+    );
+    auto ts_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == timestamp_column_name;
+        })
+    );
+    auto eor_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == eor_column_name;
+        })
+    );

-        std::optional<utils::UUID> timestamp;
-        auto dynamodb = rjson::empty_object();
-        auto record = rjson::empty_object();
-        const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();
+    std::optional<utils::UUID> timestamp;
+    auto dynamodb = rjson::empty_object();
+    auto record = rjson::empty_object();
+    const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();

-        using op_utype = std::underlying_type_t<cdc::operation>;
+    using op_utype = std::underlying_type_t<cdc::operation>;

-        auto maybe_add_record = [&] {
-            if (!dynamodb.ObjectEmpty()) {
-                rjson::add(record, "dynamodb", std::move(dynamodb));
-                dynamodb = rjson::empty_object();
-            }
-            if (!record.ObjectEmpty()) {
-                rjson::add(record, "awsRegion", rjson::from_string(dc_name));
-                rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
-                rjson::add(record, "eventSource", "scylladb:alternator");
-                rjson::add(record, "eventVersion", "1.1");
-                rjson::push_back(records, std::move(record));
-                record = rjson::empty_object();
-                --limit;
-            }
-        };
+    auto maybe_add_record = [&] {
+        if (!dynamodb.ObjectEmpty()) {
+            rjson::add(record, "dynamodb", std::move(dynamodb));
+            dynamodb = rjson::empty_object();
+        }
+        if (!record.ObjectEmpty()) {
+            rjson::add(record, "awsRegion", rjson::from_string(dc_name));
+            rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
+            rjson::add(record, "eventSource", "scylladb:alternator");
+            rjson::add(record, "eventVersion", "1.1");
+            rjson::push_back(records, std::move(record));
+            record = rjson::empty_object();
+            --limit;
+        }
+    };

-        for (auto& row : result_set->rows()) {
-            auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
-            auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
-            auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;
+    for (auto& row : result_set->rows()) {
+        auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
+        auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
+        auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;

-            if (!dynamodb.HasMember("Keys")) {
-                auto keys = rjson::empty_object();
-                describe_single_item(*selection, row, key_names, keys);
-                rjson::add(dynamodb, "Keys", std::move(keys));
-                rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
-                rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
-                rjson::add(dynamodb, "StreamViewType", type);
-                // TODO: SizeBytes
-            }
-
-            /**
-             * We merge rows with same timestamp into a single event.
-             * This is pretty much needed, because a CDC row typically
-             * encodes ~half the info of an alternator write. 
-             * 
-             * A big, big downside to how alternator records are written
-             * (i.e. CQL), is that the distinction between INSERT and UPDATE
-             * is somewhat lost/unmappable to actual eventName. 
-             * A write (currently) always looks like an insert+modify
-             * regardless whether we wrote existing record or not. 
-             * 
-             * Maybe RMW ops could be done slightly differently so 
-             * we can distinguish them here...
-             * 
-             * For now, all writes will become MODIFY.
-             * 
-             * Note: we do not check the current pre/post
-             * flags on CDC log, instead we use data to 
-             * drive what is returned. This is (afaict)
-             * consistent with dynamo streams
-             */
-            switch (op) {
-            case cdc::operation::pre_image:
-            case cdc::operation::post_image:
-            {
-                auto item = rjson::empty_object();
-                describe_single_item(*selection, row, attr_names, item, nullptr, true);
-                describe_single_item(*selection, row, key_names, item);
-                rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
-                break;
-            }
-            case cdc::operation::update:
-                rjson::add(record, "eventName", "MODIFY");
-                break;
-            case cdc::operation::insert:
-                rjson::add(record, "eventName", "INSERT");
-                break;
-            case cdc::operation::service_row_delete:
-            case cdc::operation::service_partition_delete:
-            {
-                auto user_identity = rjson::empty_object();
-                rjson::add(user_identity, "Type", "Service");
-                rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
-                rjson::add(record, "userIdentity", std::move(user_identity));
-                rjson::add(record, "eventName", "REMOVE");
-                break;
-            }
-            default:
-                rjson::add(record, "eventName", "REMOVE");
-                break;
-            }
-            if (eor) {
-                maybe_add_record();
-                timestamp = ts;
-                if (limit == 0) {
-                    break;
-                }
-            }
+        if (!dynamodb.HasMember("Keys")) {
+            auto keys = rjson::empty_object();
+            describe_single_item(*selection, row, key_names, keys);
+            rjson::add(dynamodb, "Keys", std::move(keys));
+            rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
+            rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
+            rjson::add(dynamodb, "StreamViewType", type);
+            // TODO: SizeBytes
        }

-        auto ret = rjson::empty_object();
-        auto nrecords = records.Size();
-        rjson::add(ret, "Records", std::move(records));
-
-        if (nrecords != 0) {
-            // #9642. Set next iterators threshold to > last
-            shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
-            // Note that here we unconditionally return NextShardIterator,
-            // without checking if maybe we reached the end-of-shard. If the
-            // shard did end, then the next read will have nrecords == 0 and
-            // will notice end end of shard and not return NextShardIterator.
-            rjson::add(ret, "NextShardIterator", next_iter);
-            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        /**
+         * We merge rows with same timestamp into a single event.
+         * This is pretty much needed, because a CDC row typically
+         * encodes ~half the info of an alternator write. 
+         * 
+         * A big, big downside to how alternator records are written
+         * (i.e. CQL), is that the distinction between INSERT and UPDATE
+         * is somewhat lost/unmappable to actual eventName. 
+         * A write (currently) always looks like an insert+modify
+         * regardless whether we wrote existing record or not. 
+         * 
+         * Maybe RMW ops could be done slightly differently so 
+         * we can distinguish them here...
+         * 
+         * For now, all writes will become MODIFY.
+         * 
+         * Note: we do not check the current pre/post
+         * flags on CDC log, instead we use data to 
+         * drive what is returned. This is (afaict)
+         * consistent with dynamo streams
+         */
+        switch (op) {
+        case cdc::operation::pre_image:
+        case cdc::operation::post_image:
+        {
+            auto item = rjson::empty_object();
+            describe_single_item(*selection, row, attr_names, item, nullptr, true);
+            describe_single_item(*selection, row, key_names, item);
+            rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
+            break;
        }
-
-        // ugh. figure out if we are and end-of-shard
-        auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
-
-        return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
-            auto& shard = iter.shard;            
-
-            if (shard.time < ts && ts < high_ts) {
-                // The DynamoDB documentation states that when a shard is
-                // closed, reading it until the end has NextShardIterator
-                // "set to null". Our test test_streams_closed_read
-                // confirms that by "null" they meant not set at all.
-            } else {
-                // We could have return the same iterator again, but we did
-                // a search from it until high_ts and found nothing, so we
-                // can also start the next search from high_ts.
-                // TODO: but why? It's simpler just to leave the iterator be.
-                shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
-                rjson::add(ret, "NextShardIterator", iter);
+        case cdc::operation::update:
+            rjson::add(record, "eventName", "MODIFY");
+            break;
+        case cdc::operation::insert:
+            rjson::add(record, "eventName", "INSERT");
+            break;
+        case cdc::operation::service_row_delete:
+        case cdc::operation::service_partition_delete:
+        {
+            auto user_identity = rjson::empty_object();
+            rjson::add(user_identity, "Type", "Service");
+            rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
+            rjson::add(record, "userIdentity", std::move(user_identity));
+            rjson::add(record, "eventName", "REMOVE");
+            break;
+        }
+        default:
+            rjson::add(record, "eventName", "REMOVE");
+            break;
+        }
+        if (eor) {
+            maybe_add_record();
+            timestamp = ts;
+            if (limit == 0) {
+                break;
            }
-            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-            if (is_big(ret)) {
-                return make_ready_future<executor::request_return_type>(make_streamed(std::move(ret)));
-            }
-            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
-        });
-    });
+        }
+    }
+
+    auto ret = rjson::empty_object();
+    auto nrecords = records.Size();
+    rjson::add(ret, "Records", std::move(records));
+
+    if (nrecords != 0) {
+        // #9642. Set next iterators threshold to > last
+        shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
+        // Note that here we unconditionally return NextShardIterator,
+        // without checking if maybe we reached the end-of-shard. If the
+        // shard did end, then the next read will have nrecords == 0 and
+        // will notice the end of shard and not return NextShardIterator.
+        rjson::add(ret, "NextShardIterator", next_iter);
+        _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
+        co_return rjson::print(std::move(ret));
+    }
+
+    // ugh. figure out if we are at end-of-shard
+    auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
+
+    db_clock::time_point ts = co_await _sdks.cdc_current_generation_timestamp({ normal_token_owners });
+    auto& shard = iter.shard;
+
+    if (shard.time < ts && ts < high_ts) {
+        // The DynamoDB documentation states that when a shard is
+        // closed, reading it until the end has NextShardIterator
+        // "set to null". Our test test_streams_closed_read
+        // confirms that by "null" they meant not set at all.
+    } else {
+        // We could have returned the same iterator again, but we did
+        // a search from it until high_ts and found nothing, so we
+        // can also start the next search from high_ts.
+        // TODO: but why? It's simpler just to leave the iterator be.
+        shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
+        rjson::add(ret, "NextShardIterator", iter);
+    }
+    _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
+    if (is_big(ret)) {
+        co_return make_streamed(std::move(ret));
+    }
+    co_return rjson::print(std::move(ret));
 }

 bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -767,7 +767,7 @@ static future<bool> scan_table(
                // by tasking another node to take over scanning of the dead node's primary
                // ranges. What we do here is that this node will also check expiration
                // on its *secondary* ranges - but only those whose primary owner is down.
-                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
+                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
                if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
                    if (!gossiper.is_alive(tablet_primary_replica.host)) {
                        co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -515,15 +515,6 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
        auto sstables = parsed.GetArray() |
            std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
            std::ranges::to<std::vector>();
-        apilog.info("Restore invoked with following parameters: keyspace={}, table={}, endpoint={}, bucket={}, prefix={}, sstables_count={}, scope={}, primary_replica_only={}",
-                    keyspace,
-                    table,
-                    endpoint,
-                    bucket,
-                    prefix,
-                    sstables.size(),
-                    scope,
-                    primary_replica_only);
        auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
        co_return json::json_return_type(fmt::to_string(task_id));
    });
@@ -893,9 +884,7 @@ rest_exclude_node(sharded<service::storage_service>& ss, std::unique_ptr<http::r
    }

    apilog.info("exclude_node: hosts={}", hosts);
-    co_await ss.local().run_with_no_api_lock([hosts = std::move(hosts)] (service::storage_service& ss) {
-        return ss.mark_excluded(hosts);
-    });
+    co_await ss.local().mark_excluded(hosts);
    co_return json_void();
 }

@@ -1805,100 +1794,84 @@ rest_bind(FuncType func, BindArgs&... args) {
    return std::bind_front(func, std::ref(args)...);
 }

-// Hold the storage_service async gate for the duration of async REST
-// handlers so stop() drains in-flight requests before teardown.
-// Synchronous handlers don't yield and need no gate.
-static seastar::httpd::future_json_function
-gated(sharded<service::storage_service>& ss, seastar::httpd::future_json_function fn) {
-    return [fn = std::move(fn), &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto holder = ss.local().hold_async_gate();
-        co_return co_await fn(std::move(req));
-    };
-}
-
-static seastar::httpd::json_request_function
-gated(sharded<service::storage_service>&, seastar::httpd::json_request_function fn) {
-    return fn;
-}
-
 void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
-    ss::get_token_endpoint.set(r, gated(ss, rest_bind(rest_get_token_endpoint, ctx, ss)));
-    ss::toppartitions_generic.set(r, gated(ss, rest_bind(rest_toppartitions_generic, ctx)));
-    ss::get_release_version.set(r, gated(ss, rest_bind(rest_get_release_version, ss)));
-    ss::get_scylla_release_version.set(r, gated(ss, rest_bind(rest_get_scylla_release_version, ss)));
-    ss::get_schema_version.set(r, gated(ss, rest_bind(rest_get_schema_version, ss)));
-    ss::get_range_to_endpoint_map.set(r, gated(ss, rest_bind(rest_get_range_to_endpoint_map, ctx, ss)));
-    ss::get_pending_range_to_endpoint_map.set(r, gated(ss, rest_bind(rest_get_pending_range_to_endpoint_map, ctx)));
-    ss::describe_ring.set(r, gated(ss, rest_bind(rest_describe_ring, ctx, ss)));
-    ss::get_current_generation_number.set(r, gated(ss, rest_bind(rest_get_current_generation_number, ss)));
-    ss::get_natural_endpoints.set(r, gated(ss, rest_bind(rest_get_natural_endpoints, ctx, ss)));
-    ss::get_natural_endpoints_v2.set(r, gated(ss, rest_bind(rest_get_natural_endpoints_v2, ctx, ss)));
-    ss::cdc_streams_check_and_repair.set(r, gated(ss, rest_bind(rest_cdc_streams_check_and_repair, ss)));
-    ss::cleanup_all.set(r, gated(ss, rest_bind(rest_cleanup_all, ctx, ss)));
-    ss::reset_cleanup_needed.set(r, gated(ss, rest_bind(rest_reset_cleanup_needed, ctx, ss)));
-    ss::force_flush.set(r, gated(ss, rest_bind(rest_force_flush, ctx)));
-    ss::force_keyspace_flush.set(r, gated(ss, rest_bind(rest_force_keyspace_flush, ctx)));
-    ss::decommission.set(r, gated(ss, rest_bind(rest_decommission, ss)));
-    ss::move.set(r, gated(ss, rest_bind(rest_move, ss)));
-    ss::remove_node.set(r, gated(ss, rest_bind(rest_remove_node, ss)));
-    ss::exclude_node.set(r, gated(ss, rest_bind(rest_exclude_node, ss)));
-    ss::get_removal_status.set(r, gated(ss, rest_bind(rest_get_removal_status, ss)));
-    ss::force_remove_completion.set(r, gated(ss, rest_bind(rest_force_remove_completion, ss)));
-    ss::set_logging_level.set(r, gated(ss, rest_bind(rest_set_logging_level)));
-    ss::get_logging_levels.set(r, gated(ss, rest_bind(rest_get_logging_levels)));
-    ss::get_operation_mode.set(r, gated(ss, rest_bind(rest_get_operation_mode, ss)));
-    ss::is_starting.set(r, gated(ss, rest_bind(rest_is_starting, ss)));
-    ss::get_drain_progress.set(r, gated(ss, rest_bind(rest_get_drain_progress, ss)));
-    ss::drain.set(r, gated(ss, rest_bind(rest_drain, ss)));
-    ss::stop_gossiping.set(r, gated(ss, rest_bind(rest_stop_gossiping, ss)));
-    ss::start_gossiping.set(r, gated(ss, rest_bind(rest_start_gossiping, ss)));
-    ss::is_gossip_running.set(r, gated(ss, rest_bind(rest_is_gossip_running, ss)));
-    ss::stop_daemon.set(r, gated(ss, rest_bind(rest_stop_daemon)));
-    ss::is_initialized.set(r, gated(ss, rest_bind(rest_is_initialized, ss)));
-    ss::join_ring.set(r, gated(ss, rest_bind(rest_join_ring)));
-    ss::is_joined.set(r, gated(ss, rest_bind(rest_is_joined, ss)));
-    ss::is_incremental_backups_enabled.set(r, gated(ss, rest_bind(rest_is_incremental_backups_enabled, ctx)));
-    ss::set_incremental_backups_enabled.set(r, gated(ss, rest_bind(rest_set_incremental_backups_enabled, ctx)));
-    ss::rebuild.set(r, gated(ss, rest_bind(rest_rebuild, ss)));
-    ss::bulk_load.set(r, gated(ss, rest_bind(rest_bulk_load)));
-    ss::bulk_load_async.set(r, gated(ss, rest_bind(rest_bulk_load_async)));
-    ss::reschedule_failed_deletions.set(r, gated(ss, rest_bind(rest_reschedule_failed_deletions)));
-    ss::sample_key_range.set(r, gated(ss, rest_bind(rest_sample_key_range)));
-    ss::reset_local_schema.set(r, gated(ss, rest_bind(rest_reset_local_schema, ss)));
-    ss::set_trace_probability.set(r, gated(ss, rest_bind(rest_set_trace_probability)));
-    ss::get_trace_probability.set(r, gated(ss, rest_bind(rest_get_trace_probability)));
-    ss::get_slow_query_info.set(r, gated(ss, rest_bind(rest_get_slow_query_info)));
-    ss::set_slow_query.set(r, gated(ss, rest_bind(rest_set_slow_query)));
-    ss::deliver_hints.set(r, gated(ss, rest_bind(rest_deliver_hints)));
-    ss::get_cluster_name.set(r, gated(ss, rest_bind(rest_get_cluster_name, ss)));
-    ss::get_partitioner_name.set(r, gated(ss, rest_bind(rest_get_partitioner_name, ss)));
-    ss::get_tombstone_warn_threshold.set(r, gated(ss, rest_bind(rest_get_tombstone_warn_threshold)));
-    ss::set_tombstone_warn_threshold.set(r, gated(ss, rest_bind(rest_set_tombstone_warn_threshold)));
-    ss::get_tombstone_failure_threshold.set(r, gated(ss, rest_bind(rest_get_tombstone_failure_threshold)));
-    ss::set_tombstone_failure_threshold.set(r, gated(ss, rest_bind(rest_set_tombstone_failure_threshold)));
-    ss::get_batch_size_failure_threshold.set(r, gated(ss, rest_bind(rest_get_batch_size_failure_threshold)));
-    ss::set_batch_size_failure_threshold.set(r, gated(ss, rest_bind(rest_set_batch_size_failure_threshold)));
-    ss::set_hinted_handoff_throttle_in_kb.set(r, gated(ss, rest_bind(rest_set_hinted_handoff_throttle_in_kb)));
-    ss::get_exceptions.set(r, gated(ss, rest_bind(rest_get_exceptions, ss)));
-    ss::get_total_hints_in_progress.set(r, gated(ss, rest_bind(rest_get_total_hints_in_progress)));
-    ss::get_total_hints.set(r, gated(ss, rest_bind(rest_get_total_hints)));
-    ss::get_ownership.set(r, gated(ss, rest_bind(rest_get_ownership, ctx, ss)));
-    ss::get_effective_ownership.set(r, gated(ss, rest_bind(rest_get_effective_ownership, ctx, ss)));
-    ss::retrain_dict.set(r, gated(ss, rest_bind(rest_retrain_dict, ctx, ss, group0_client)));
-    ss::estimate_compression_ratios.set(r, gated(ss, rest_bind(rest_estimate_compression_ratios, ctx, ss)));
-    ss::sstable_info.set(r, gated(ss, rest_bind(rest_sstable_info, ctx)));
-    ss::reload_raft_topology_state.set(r, gated(ss, rest_bind(rest_reload_raft_topology_state, ss, group0_client)));
-    ss::upgrade_to_raft_topology.set(r, gated(ss, rest_bind(rest_upgrade_to_raft_topology, ss)));
-    ss::raft_topology_upgrade_status.set(r, gated(ss, rest_bind(rest_raft_topology_upgrade_status, ss)));
-    ss::raft_topology_get_cmd_status.set(r, gated(ss, rest_bind(rest_raft_topology_get_cmd_status, ss)));
-    ss::move_tablet.set(r, gated(ss, rest_bind(rest_move_tablet, ctx, ss)));
-    ss::add_tablet_replica.set(r, gated(ss, rest_bind(rest_add_tablet_replica, ctx, ss)));
-    ss::del_tablet_replica.set(r, gated(ss, rest_bind(rest_del_tablet_replica, ctx, ss)));
-    ss::repair_tablet.set(r, gated(ss, rest_bind(rest_repair_tablet, ctx, ss)));
-    ss::tablet_balancing_enable.set(r, gated(ss, rest_bind(rest_tablet_balancing_enable, ss)));
-    ss::quiesce_topology.set(r, gated(ss, rest_bind(rest_quiesce_topology, ss)));
-    sp::get_schema_versions.set(r, gated(ss, rest_bind(rest_get_schema_versions, ss)));
-    ss::drop_quarantined_sstables.set(r, gated(ss, rest_bind(rest_drop_quarantined_sstables, ctx, ss)));
+    ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
+    ss::toppartitions_generic.set(r, rest_bind(rest_toppartitions_generic, ctx));
+    ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
+    ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
+    ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
+    ss::get_range_to_endpoint_map.set(r, rest_bind(rest_get_range_to_endpoint_map, ctx, ss));
+    ss::get_pending_range_to_endpoint_map.set(r, rest_bind(rest_get_pending_range_to_endpoint_map, ctx));
+    ss::describe_ring.set(r, rest_bind(rest_describe_ring, ctx, ss));
+    ss::get_current_generation_number.set(r, rest_bind(rest_get_current_generation_number, ss));
+    ss::get_natural_endpoints.set(r, rest_bind(rest_get_natural_endpoints, ctx, ss));
+    ss::get_natural_endpoints_v2.set(r, rest_bind(rest_get_natural_endpoints_v2, ctx, ss));
+    ss::cdc_streams_check_and_repair.set(r, rest_bind(rest_cdc_streams_check_and_repair, ss));
+    ss::cleanup_all.set(r, rest_bind(rest_cleanup_all, ctx, ss));
+    ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
+    ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
+    ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
+    ss::decommission.set(r, rest_bind(rest_decommission, ss));
+    ss::move.set(r, rest_bind(rest_move, ss));
+    ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
+    ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
+    ss::get_removal_status.set(r, rest_bind(rest_get_removal_status, ss));
+    ss::force_remove_completion.set(r, rest_bind(rest_force_remove_completion, ss));
+    ss::set_logging_level.set(r, rest_bind(rest_set_logging_level));
+    ss::get_logging_levels.set(r, rest_bind(rest_get_logging_levels));
+    ss::get_operation_mode.set(r, rest_bind(rest_get_operation_mode, ss));
+    ss::is_starting.set(r, rest_bind(rest_is_starting, ss));
+    ss::get_drain_progress.set(r, rest_bind(rest_get_drain_progress, ss));
+    ss::drain.set(r, rest_bind(rest_drain, ss));
+    ss::stop_gossiping.set(r, rest_bind(rest_stop_gossiping, ss));
+    ss::start_gossiping.set(r, rest_bind(rest_start_gossiping, ss));
+    ss::is_gossip_running.set(r, rest_bind(rest_is_gossip_running, ss));
+    ss::stop_daemon.set(r, rest_bind(rest_stop_daemon));
+    ss::is_initialized.set(r, rest_bind(rest_is_initialized, ss));
+    ss::join_ring.set(r, rest_bind(rest_join_ring));
+    ss::is_joined.set(r, rest_bind(rest_is_joined, ss));
+    ss::is_incremental_backups_enabled.set(r, rest_bind(rest_is_incremental_backups_enabled, ctx));
+    ss::set_incremental_backups_enabled.set(r, rest_bind(rest_set_incremental_backups_enabled, ctx));
+    ss::rebuild.set(r, rest_bind(rest_rebuild, ss));
+    ss::bulk_load.set(r, rest_bind(rest_bulk_load));
+    ss::bulk_load_async.set(r, rest_bind(rest_bulk_load_async));
+    ss::reschedule_failed_deletions.set(r, rest_bind(rest_reschedule_failed_deletions));
+    ss::sample_key_range.set(r, rest_bind(rest_sample_key_range));
+    ss::reset_local_schema.set(r, rest_bind(rest_reset_local_schema, ss));
+    ss::set_trace_probability.set(r, rest_bind(rest_set_trace_probability));
+    ss::get_trace_probability.set(r, rest_bind(rest_get_trace_probability));
+    ss::get_slow_query_info.set(r, rest_bind(rest_get_slow_query_info));
+    ss::set_slow_query.set(r, rest_bind(rest_set_slow_query));
+    ss::deliver_hints.set(r, rest_bind(rest_deliver_hints));
+    ss::get_cluster_name.set(r, rest_bind(rest_get_cluster_name, ss));
+    ss::get_partitioner_name.set(r, rest_bind(rest_get_partitioner_name, ss));
+    ss::get_tombstone_warn_threshold.set(r, rest_bind(rest_get_tombstone_warn_threshold));
+    ss::set_tombstone_warn_threshold.set(r, rest_bind(rest_set_tombstone_warn_threshold));
+    ss::get_tombstone_failure_threshold.set(r, rest_bind(rest_get_tombstone_failure_threshold));
+    ss::set_tombstone_failure_threshold.set(r, rest_bind(rest_set_tombstone_failure_threshold));
+    ss::get_batch_size_failure_threshold.set(r, rest_bind(rest_get_batch_size_failure_threshold));
+    ss::set_batch_size_failure_threshold.set(r, rest_bind(rest_set_batch_size_failure_threshold));
+    ss::set_hinted_handoff_throttle_in_kb.set(r, rest_bind(rest_set_hinted_handoff_throttle_in_kb));
+    ss::get_exceptions.set(r, rest_bind(rest_get_exceptions, ss));
+    ss::get_total_hints_in_progress.set(r, rest_bind(rest_get_total_hints_in_progress));
+    ss::get_total_hints.set(r, rest_bind(rest_get_total_hints));
+    ss::get_ownership.set(r, rest_bind(rest_get_ownership, ctx, ss));
+    ss::get_effective_ownership.set(r, rest_bind(rest_get_effective_ownership, ctx, ss));
+    ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
+    ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
+    ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
+    ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
+    ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
+    ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
+    ss::raft_topology_get_cmd_status.set(r, rest_bind(rest_raft_topology_get_cmd_status, ss));
+    ss::move_tablet.set(r, rest_bind(rest_move_tablet, ctx, ss));
+    ss::add_tablet_replica.set(r, rest_bind(rest_add_tablet_replica, ctx, ss));
+    ss::del_tablet_replica.set(r, rest_bind(rest_del_tablet_replica, ctx, ss));
+    ss::repair_tablet.set(r, rest_bind(rest_repair_tablet, ctx, ss));
+    ss::tablet_balancing_enable.set(r, rest_bind(rest_tablet_balancing_enable, ss));
+    ss::quiesce_topology.set(r, rest_bind(rest_quiesce_topology, ss));
+    sp::get_schema_versions.set(r, rest_bind(rest_get_schema_versions, ss));
+    ss::drop_quarantined_sstables.set(r, rest_bind(rest_drop_quarantined_sstables, ctx, ss));
 }

 void unset_storage_service(http_context& ctx, routes& r) {
--- a/audit/audit.cc
+++ b/audit/audit.cc
@@ -209,15 +209,11 @@ future<> audit::stop_audit() {
    });
 }

-audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table) {
+audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch) {
    if (!audit_instance().local_is_initialized()) {
        return nullptr;
    }
-    return std::make_unique<audit_info>(cat, keyspace, table);
-}
-
-audit_info_ptr audit::create_no_audit_info() {
-    return audit_info_ptr();
+    return std::make_unique<audit_info>(cat, keyspace, table, batch);
 }

 future<> audit::start(const db::config& cfg) {
@@ -267,18 +263,21 @@ future<> audit::log_login(const sstring& username, socket_address client_ip, boo
 }

 future<> inspect(shared_ptr<cql3::cql_statement> statement, service::query_state& query_state, const cql3::query_options& options, bool error) {
-    cql3::statements::batch_statement* batch = dynamic_cast<cql3::statements::batch_statement*>(statement.get());
-    if (batch != nullptr) {
+    auto audit_info = statement->get_audit_info();
+    if (!audit_info) {
+        return make_ready_future<>();
+    }
+    if (audit_info->batch()) {
+        cql3::statements::batch_statement* batch = static_cast<cql3::statements::batch_statement*>(statement.get());
        return do_for_each(batch->statements().begin(), batch->statements().end(), [&query_state, &options, error] (auto&& m) {
            return inspect(m.statement, query_state, options, error);
        });
    } else {
-        auto audit_info = statement->get_audit_info();
-        if (bool(audit_info) && audit::local_audit_instance().should_log(audit_info)) {
+        if (audit::local_audit_instance().should_log(audit_info)) {
            return audit::local_audit_instance().log(audit_info, query_state, options, error);
        }
+        return make_ready_future<>();
    }
-    return make_ready_future<>();
 }

 future<> inspect_login(const sstring& username, socket_address client_ip, bool error) {
--- a/audit/audit.hh
+++ b/audit/audit.hh
@@ -75,11 +75,13 @@ class audit_info final {
    sstring _keyspace;
    sstring _table;
    sstring _query;
+    bool _batch;
 public:
-    audit_info(statement_category cat, sstring keyspace, sstring table)
+    audit_info(statement_category cat, sstring keyspace, sstring table, bool batch)
        : _category(cat)
        , _keyspace(std::move(keyspace))
        , _table(std::move(table))
+        , _batch(batch)
    { }
    void set_query_string(const std::string_view& query_string) {
        _query = sstring(query_string);
@@ -89,6 +91,7 @@ public:
    const sstring& query() const { return _query; }
    sstring category_string() const;
    statement_category category() const { return _category; }
+    bool batch() const { return _batch; }
 };

 using audit_info_ptr = std::unique_ptr<audit_info>;
@@ -126,8 +129,7 @@ public:
    }
    static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
    static future<> stop_audit();
-    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table);
-    static audit_info_ptr create_no_audit_info();
+    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
    audit(locator::shared_token_metadata& stm,
          cql3::query_processor& qp,
          service::migration_manager& mm,
--- a/auth/cache.cc
+++ b/auth/cache.cc
@@ -81,14 +81,24 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
        static const sstring q = format("SELECT role, name, value FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, ROLE_ATTRIBUTES_CF);
        auto rs = co_await fetch(q);
        for (const auto& r : *rs) {
-            if (!r.has("value")) {
-                continue;
-            }
            rec->attributes[r.get_as<sstring>("name")] =
                    r.get_as<sstring>("value");
            co_await coroutine::maybe_yield();
        }
    }
+    // permissions
+    {
+        static const sstring q = format("SELECT role, resource, permissions FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, PERMISSIONS_CF);
+        auto rs = co_await fetch(q);
+        for (const auto& r : *rs) {
+            auto resource = r.get_as<sstring>("resource");
+            auto perms_strings = r.get_set<sstring>("permissions");
+            std::unordered_set<sstring> perms_set(perms_strings.begin(), perms_strings.end());
+            auto pset = permissions::from_strings(perms_set);
+            rec->permissions[std::move(resource)] = std::move(pset);
+            co_await coroutine::maybe_yield();
+        }
+    }
    co_return rec;
 }

--- a/auth/cache.hh
+++ b/auth/cache.hh
@@ -39,6 +39,7 @@ public:
        std::unordered_set<role_name_t> members;
        sstring salted_hash;
        std::unordered_map<sstring, sstring> attributes;
+        std::unordered_map<sstring, permission_set> permissions;
        version_tag_t version; // used for seamless cache reloads
    };

--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -174,11 +174,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
    if (results->empty()) {
        co_return permissions::NONE;
    }
-    const auto& row = results->one();
-    if (!row.has(PERMISSIONS_NAME)) {
-        co_return permissions::NONE;
-    }
-    co_return permissions::from_strings(row.get_set<sstring>(PERMISSIONS_NAME));
+    co_return permissions::from_strings(results->one().get_set<sstring>(PERMISSIONS_NAME)); // NOTE(review): the row.has(PERMISSIONS_NAME) guard is dropped here; confirm get_set() tolerates a row whose permissions column is unset
 }

 future<>
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -32,8 +32,6 @@ namespace {

 logger mylog{"ldap_role_manager"}; // `log` is taken by math.

-constexpr std::string_view user_placeholder = "{USER}";
-
 struct url_desc_deleter {
    void operator()(LDAPURLDesc *p) {
        ldap_free_urldesc(p);
@@ -42,141 +40,9 @@ struct url_desc_deleter {

 using url_desc_ptr = std::unique_ptr<LDAPURLDesc, url_desc_deleter>;

-/// Escapes LDAP filter assertion value per RFC 4515 Section 3.
-/// The characters *, (, ), \, and NUL must be backslash-hex-escaped
-/// to prevent filter injection when interpolating untrusted input.
-sstring escape_filter_value(std::string_view value) {
-    size_t escapable_chars = 0;
-    for (unsigned char ch : value) {
-        switch (ch) {
-        case '*':
-        case '(':
-        case ')':
-        case '\\':
-        case '\0':
-            ++escapable_chars;
-            break;
-        default:
-            break;
-        }
-    }
-
-    if (escapable_chars == 0) {
-        return sstring(value);
-    }
-
-    sstring escaped(value.size() + escapable_chars * 2, 0);
-    size_t pos = 0;
-    for (unsigned char ch : value) {
-        switch (ch) {
-        case '*':
-            escaped[pos++] = '\\';
-            escaped[pos++] = '2';
-            escaped[pos++] = 'a';
-            break;
-        case '(':
-            escaped[pos++] = '\\';
-            escaped[pos++] = '2';
-            escaped[pos++] = '8';
-            break;
-        case ')':
-            escaped[pos++] = '\\';
-            escaped[pos++] = '2';
-            escaped[pos++] = '9';
-            break;
-        case '\\':
-            escaped[pos++] = '\\';
-            escaped[pos++] = '5';
-            escaped[pos++] = 'c';
-            break;
-        case '\0':
-            escaped[pos++] = '\\';
-            escaped[pos++] = '0';
-            escaped[pos++] = '0';
-            break;
-        default:
-            escaped[pos++] = static_cast<char>(ch);
-            break;
-        }
-    }
-
-    return escaped;
-}
-
-/// Percent-encodes characters that are not RFC 3986 "unreserved"
-/// (ALPHA / DIGIT / '-' / '.' / '_' / '~').
-///
-/// Uses explicit ASCII range checks instead of std::isalnum() because
-/// the latter is locale-dependent and could pass non-ASCII characters
-/// through unencoded under certain locale settings.
-///
-/// This is applied AFTER RFC 4515 filter escaping when the value is
-/// substituted into an LDAP URL.  It serves two purposes:
-///  1. Prevents URL-level metacharacters ('?', '#') from breaking
-///     the URL structure parsed by ldap_url_parse.
-///  2. Prevents percent-decoding (which ldap_url_parse performs on
-///     each component) from undoing the filter escaping, e.g. a
-///     literal "%2a" in the username would otherwise decode to '*'.
-sstring percent_encode_for_url(std::string_view value) {
-    static constexpr char hex[] = "0123456789ABCDEF";
-
-    size_t chars_to_encode = 0;
-    for (unsigned char ch : value) {
-        if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
-                || ch == '-' || ch == '.' || ch == '_' || ch == '~')) {
-            ++chars_to_encode;
-        }
-    }
-
-    if (chars_to_encode == 0) {
-        return sstring(value);
-    }
-
-    sstring encoded(value.size() + chars_to_encode * 2, 0);
-    size_t pos = 0;
-    for (unsigned char ch : value) {
-        if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
-                || ch == '-' || ch == '.' || ch == '_' || ch == '~') {
-            encoded[pos++] = static_cast<char>(ch);
-        } else {
-            encoded[pos++] = '%';
-            encoded[pos++] = hex[ch >> 4];
-            encoded[pos++] = hex[ch & 0x0F];
-        }
-    }
-
-    return encoded;
-}
-
-/// Checks whether \p sentinel appears in any parsed URL component
-/// other than the filter (host, DN, attributes, extensions).
-bool sentinel_outside_filter(const LDAPURLDesc& desc, std::string_view sentinel) {
-    auto contains = [&](const char* field) {
-        return field && std::string_view(field).find(sentinel) != std::string_view::npos;
-    };
-    if (contains(desc.lud_host) || contains(desc.lud_dn)) {
-        return true;
-    }
-    if (desc.lud_attrs) {
-        for (int i = 0; desc.lud_attrs[i]; ++i) {
-            if (contains(desc.lud_attrs[i])) {
-                return true;
-            }
-        }
-    }
-    if (desc.lud_exts) {
-        for (int i = 0; desc.lud_exts[i]; ++i) {
-            if (contains(desc.lud_exts[i])) {
-                return true;
-            }
-        }
-    }
-    return false;
-}
-
-url_desc_ptr parse_url(const sstring& url) {
+url_desc_ptr parse_url(std::string_view url) {
    LDAPURLDesc *desc = nullptr;
-    if (ldap_url_parse(url.c_str(), &desc)) {
+    if (ldap_url_parse(sstring(url).c_str(), &desc)) { // a string_view need not be NUL-terminated, so parse an owned copy
        mylog.error("error in ldap_url_parse({})", url);
    }
    return url_desc_ptr(desc);
@@ -249,7 +115,6 @@ const resource_set& ldap_role_manager::protected_resources() const {
 }

 future<> ldap_role_manager::start() {
-    validate_query_template();
    if (!parse_url(get_url("dummy-user"))) { // Just need host and port -- any user should do.
        return make_exception_future(
                std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
@@ -334,7 +199,7 @@ future<> ldap_role_manager::revoke(std::string_view, std::string_view, ::service
 }

 future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name, recursive_role_query) {
-    const auto url = get_url(grantee_name);
+    const auto url = get_url(grantee_name); // pass the view itself: .data() drops the length and may not be NUL-terminated
    auto desc = parse_url(url);
    if (!desc) {
        return make_exception_future<role_set>(std::runtime_error(format("Error parsing URL {}", url)));
@@ -466,46 +331,7 @@ future<> ldap_role_manager::remove_attribute(std::string_view role_name, std::st
 }

 sstring ldap_role_manager::get_url(std::string_view user) const {
-    // Two-layer encoding protects against injection:
-    // 1. RFC 4515 filter escaping neutralizes filter metacharacters (*, (, ), \, NUL)
-    // 2. URL percent-encoding prevents URL structure injection (?, #) and blocks
-    //    ldap_url_parse's percent-decoding from undoing the filter escaping (%2a -> *)
-    return boost::replace_all_copy(_query_template, user_placeholder,
-            percent_encode_for_url(escape_filter_value(user)));
-}
-
-void ldap_role_manager::validate_query_template() const {
-    if (_query_template.find(user_placeholder) == sstring::npos) {
-        return;
-    }
-
-    // Substitute {USER} with a sentinel and let ldap_url_parse tell us
-    // which URL component it landed in.  The sentinel is purely
-    // alphanumeric so it cannot affect URL parsing.
-    static constexpr std::string_view sentinel = "XLDAPSENTINELX";
-    sstring test_url = boost::replace_all_copy(_query_template, user_placeholder, sentinel);
-    auto desc = parse_url(test_url);
-    if (!desc) {
-        throw url_error(format("LDAP URL template is not a valid URL when {{USER}} is substituted: {}", _query_template));
-    }
-
-    // The sentinel must appear in the filter ...
-    if (!desc->lud_filter
-            || std::string_view(desc->lud_filter).find(sentinel) == std::string_view::npos) {
-        throw url_error(format(
-                "LDAP URL template places {{USER}} outside the filter component. "
-                "RFC 4515 filter escaping only protects the filter; other components "
-                "(e.g. the base DN) require different escaping and are not supported. "
-                "Template: {}", _query_template));
-    }
-    // ... and nowhere else (host, DN, attributes, extensions).
-    if (sentinel_outside_filter(*desc, sentinel)) {
-        throw url_error(format(
-                "LDAP URL template places {{USER}} outside the filter component. "
-                "RFC 4515 filter escaping only protects the filter; other components "
-                "(e.g. the host) require different escaping and are not supported. "
-                "Template: {}", _query_template));
-    }
+    return boost::replace_all_copy(_query_template, "{USER}", user); // NOTE(review): user is substituted verbatim, with no RFC 4515 filter escaping or URL percent-encoding; confirm only trusted names reach this call
 }

 future<std::vector<cql3::description>> ldap_role_manager::describe_role_grants() {
--- a/auth/ldap_role_manager.hh
+++ b/auth/ldap_role_manager.hh
@@ -107,9 +107,6 @@ class ldap_role_manager : public role_manager {
    /// Macro-expands _query_template, returning the result.
    sstring get_url(std::string_view user) const;

-    /// Validates that {USER}, if present, is used only in the LDAP filter component.
-    void validate_query_template() const;
-
    /// Used to auto-create roles returned by ldap.
    future<> create_role(std::string_view role_name);

--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -52,13 +52,6 @@ static const class_registrator<
        ::service::migration_manager&,
        cache&> registration("org.apache.cassandra.auth.CassandraRoleManager");

-struct record final {
-    sstring name;
-    bool is_superuser;
-    bool can_login;
-    role_set member_of;
-};
-
 static db::consistency_level consistency_for_role(std::string_view role_name) noexcept {
    if (role_name == meta::DEFAULT_SUPERUSER_NAME) {
        return db::consistency_level::QUORUM;
@@ -67,13 +60,13 @@ static db::consistency_level consistency_for_role(std::string_view role_name) no
    return db::consistency_level::LOCAL_ONE;
 }

-static future<std::optional<record>> find_record(cql3::query_processor& qp, std::string_view role_name) {
+future<std::optional<standard_role_manager::record>> standard_role_manager::legacy_find_record(std::string_view role_name) {
    const sstring query = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
-            get_auth_ks_name(qp),
+            get_auth_ks_name(_qp),
            meta::roles_table::name,
            meta::roles_table::role_col_name);

-    const auto results = co_await qp.execute_internal(
+    const auto results = co_await _qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_query_state(),
@@ -93,8 +86,25 @@ static future<std::optional<record>> find_record(cql3::query_processor& qp, std:
                        : role_set())});
 }

-static future<record> require_record(cql3::query_processor& qp, std::string_view role_name) {
-    return find_record(qp, role_name).then([role_name](std::optional<record> mr) {
+future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
+    if (legacy_mode(_qp)) {
+        return legacy_find_record(role_name);
+    }
+    auto name = sstring(role_name);
+    auto role = _cache.get(name);
+    if (!role) {
+        return make_ready_future<std::optional<record>>(std::nullopt);
+    }
+    return make_ready_future<std::optional<record>>(std::make_optional(record{
+        .name = std::move(name),
+        .is_superuser = role->is_superuser,
+        .can_login = role->can_login,
+        .member_of = role->member_of
+    }));
+}
+
+future<standard_role_manager::record> standard_role_manager::require_record(std::string_view role_name) {
+    return find_record(role_name).then([role_name](std::optional<record> mr) {
        if (!mr) {
            throw nonexistant_role(role_name);
        }
@@ -386,7 +396,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
        return fmt::to_string(fmt::join(assignments, ", "));
    };

-    return require_record(_qp, role_name).then([this, role_name, &u, &mc](record) {
+    return require_record(role_name).then([this, role_name, &u, &mc](record) {
        if (!u.is_superuser && !u.can_login) {
            return make_ready_future<>();
        }
@@ -620,18 +630,17 @@ standard_role_manager::revoke(std::string_view revokee_name, std::string_view ro
    });
 }

-static future<> collect_roles(
-        cql3::query_processor& qp,
+future<> standard_role_manager::collect_roles(
        std::string_view grantee_name,
        bool recurse,
        role_set& roles) {
-    return require_record(qp, grantee_name).then([&qp, &roles, recurse](record r) {
-        return do_with(std::move(r.member_of), [&qp, &roles, recurse](const role_set& memberships) {
-            return do_for_each(memberships.begin(), memberships.end(), [&qp, &roles, recurse](const sstring& role_name) {
+    return require_record(grantee_name).then([this, &roles, recurse](standard_role_manager::record r) {
+        return do_with(std::move(r.member_of), [this, &roles, recurse](const role_set& memberships) {
+            return do_for_each(memberships.begin(), memberships.end(), [this, &roles, recurse](const sstring& role_name) {
                roles.insert(role_name);

                if (recurse) {
-                    return collect_roles(qp, role_name, true, roles);
+                    return collect_roles(role_name, true, roles);
                }

                return make_ready_future<>();
@@ -646,7 +655,7 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
    return do_with(
            role_set{sstring(grantee_name)},
            [this, grantee_name, recurse](role_set& roles) {
-        return collect_roles(_qp, grantee_name, recurse, roles).then([&roles] { return roles; });
+        return collect_roles(grantee_name, recurse, roles).then([&roles] { return roles; });
    });
 }

@@ -706,27 +715,21 @@ future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
 }

 future<bool> standard_role_manager::exists(std::string_view role_name) {
-    return find_record(_qp, role_name).then([](std::optional<record> mr) {
+    return find_record(role_name).then([](std::optional<record> mr) {
        return static_cast<bool>(mr);
    });
 }

 future<bool> standard_role_manager::is_superuser(std::string_view role_name) {
-    return require_record(_qp, role_name).then([](record r) {
+    return require_record(role_name).then([](record r) {
        return r.is_superuser;
    });
 }

 future<bool> standard_role_manager::can_login(std::string_view role_name) {
-    if (legacy_mode(_qp)) {
-       const auto r = co_await require_record(_qp, role_name);
-       co_return r.can_login;
-    }
-    auto role = _cache.get(sstring(role_name));
-    if (!role) {
-        throw nonexistant_role(role_name);
-    }
-    co_return role->can_login;
+    return require_record(role_name).then([](record r) {
+        return r.can_login;
+    });
 }

 future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -90,6 +90,12 @@ public:

 private:
    enum class membership_change { add, remove };
+    struct record final {
+        sstring name;
+        bool is_superuser;
+        bool can_login;
+        role_set member_of;
+    };

    future<> create_legacy_metadata_tables_if_missing() const;

@@ -107,6 +113,14 @@ private:
    future<> legacy_modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change);

    future<> modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change, ::service::group0_batch& mc);
+
+    future<std::optional<record>> legacy_find_record(std::string_view role_name);
+    future<std::optional<record>> find_record(std::string_view role_name);
+    future<record> require_record(std::string_view role_name);
+    future<> collect_roles(
+            std::string_view grantee_name,
+            bool recurse,
+            role_set& roles);
 };

 } // namespace auth
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -48,7 +48,6 @@
 #include "mutation/mutation_fragment_stream_validator.hh"
 #include "utils/assert.hh"
 #include "utils/error_injection.hh"
-#include "utils/chunked_vector.hh"
 #include "utils/pretty_printers.hh"
 #include "readers/multi_range.hh"
 #include "readers/compacting.hh"
@@ -612,23 +611,23 @@ private:
    }

    // Called in a seastar thread
-    utils::chunked_vector<dht::partition_range>
+    dht::partition_range_vector
    get_ranges_for_invalidation(const std::vector<sstables::shared_sstable>& sstables) {
        // If owned ranges is disengaged, it means no cleanup work was done and
        // so nothing needs to be invalidated.
        if (!_owned_ranges) {
-            return {};
+            return dht::partition_range_vector{};
        }
-        auto owned_ranges = dht::to_partition_ranges_chunked(*_owned_ranges).get();
+        auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);

        auto non_owned_ranges = sstables
                | std::views::transform([] (const sstables::shared_sstable& sst) {
            seastar::thread::maybe_yield();
            return dht::partition_range::make({sst->get_first_decorated_key(), true},
                                              {sst->get_last_decorated_key(), true});
-        })      | std::ranges::to<utils::chunked_vector<dht::partition_range>>();
+        })      | std::ranges::to<dht::partition_range_vector>();

-        return dht::subtract_ranges(*_schema, std::move(non_owned_ranges), std::move(owned_ranges)).get();
+        return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
    }
 protected:
    compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
@@ -719,8 +718,8 @@ protected:

    compaction_completion_desc
    get_compaction_completion_desc(std::vector<sstables::shared_sstable> input_sstables, std::vector<sstables::shared_sstable> output_sstables) {
-        auto ranges = get_ranges_for_invalidation(input_sstables);
-        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges)};
+        auto ranges_for_invalidation = get_ranges_for_invalidation(input_sstables); // computed before input_sstables is moved from below
+        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_invalidation)};
    }

    // Tombstone expiration is enabled based on the presence of sstable set.
--- a/compaction/compaction_descriptor.hh
+++ b/compaction/compaction_descriptor.hh
@@ -16,7 +16,6 @@
 #include "sstables/sstable_set.hh"
 #include "compaction_fwd.hh"
 #include "mutation_writer/token_group_based_splitting_writer.hh"
-#include "utils/chunked_vector.hh"

 namespace compaction {

@@ -39,7 +38,7 @@ struct compaction_completion_desc {
    // New, fresh SSTables that should be added to SSTable set, replacing the old ones.
    std::vector<sstables::shared_sstable> new_sstables;
    // Set of compacted partition ranges that should be invalidated in the cache.
-    utils::chunked_vector<dht::partition_range> ranges_for_cache_invalidation;
+    dht::partition_range_vector ranges_for_cache_invalidation;
 };

 // creates a new SSTable for a given shard
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -778,7 +778,6 @@ compaction_manager::get_incremental_repair_read_lock(compaction::compaction_grou
        cmlog.debug("Get get_incremental_repair_read_lock for {} started", reason);
    }
    compaction::compaction_state& cs = get_compaction_state(&t);
-    auto gh = cs.gate.hold();
    auto ret = co_await cs.incremental_repair_lock.hold_read_lock();
    if (!reason.empty()) {
        cmlog.debug("Get get_incremental_repair_read_lock for {} done", reason);
@@ -792,7 +791,6 @@ compaction_manager::get_incremental_repair_write_lock(compaction::compaction_gro
        cmlog.debug("Get get_incremental_repair_write_lock for {} started", reason);
    }
    compaction::compaction_state& cs = get_compaction_state(&t);
-    auto gh = cs.gate.hold();
    auto ret = co_await cs.incremental_repair_lock.hold_write_lock();
    if (!reason.empty()) {
        cmlog.debug("Get get_incremental_repair_write_lock for {} done", reason);
@@ -1128,10 +1126,7 @@ void compaction_manager::enable() {

    _compaction_submission_timer.cancel();
    _compaction_submission_timer.arm_periodic(periodic_compaction_submission_interval());
-    if (_waiting_reevaluation) {
-        on_internal_error(cmlog, "postponed compactions reevaluation is already running when enabling compaction manager");
-    }
-    _waiting_reevaluation.emplace(postponed_compactions_reevaluation());
+    _waiting_reevalution = postponed_compactions_reevaluation();
    cmlog.info("Enabled");
 }

@@ -1179,16 +1174,6 @@ void compaction_manager::reevaluate_postponed_compactions() noexcept {
    _postponed_reevaluation.signal();
 }

-future<> compaction_manager::stop_postponed_compactions() noexcept {
-    auto waiting_reevaluation = std::exchange(_waiting_reevaluation, std::nullopt);
-    if (!waiting_reevaluation) {
-        return make_ready_future();
-    }
-    // Trigger a signal to properly exit from postponed_compactions_reevaluation() fiber
-    reevaluate_postponed_compactions();
-    return std::move(*waiting_reevaluation);
-}
-
 void compaction_manager::postpone_compaction_for_table(compaction_group_view* t) {
    _postponed.insert(t);
 }
@@ -1272,7 +1257,8 @@ future<> compaction_manager::drain() {
    _compaction_submission_timer.cancel();
    // Stop ongoing compactions, if the request has not been sent already and wait for them to stop.
    co_await stop_ongoing_compactions("drain");
-    co_await stop_postponed_compactions();
+    // Trigger a signal to properly exit from postponed_compactions_reevaluation() fiber
+    reevaluate_postponed_compactions();
    cmlog.info("Drained");
 }

@@ -1280,15 +1266,9 @@ future<> compaction_manager::start(const db::config& cfg, utils::disk_space_moni
    if (dsm && (this_shard_id() == 0)) {
        _out_of_space_subscription = dsm->subscribe(cfg.critical_disk_utilization_level, [this] (auto threshold_reached) {
            if (threshold_reached) {
-                return container().invoke_on_all([] (compaction_manager& cm) {
-                    cm._in_critical_disk_utilization_mode = true;
-                    return cm.drain();
-                });
+                return container().invoke_on_all([] (compaction_manager& cm) { return cm.drain(); });
            }
-            return container().invoke_on_all([] (compaction_manager& cm) {
-                cm._in_critical_disk_utilization_mode = false;
-                cm.enable();
-            });
+            return container().invoke_on_all([] (compaction_manager& cm) { cm.enable(); });
        });
    }

@@ -1316,7 +1296,8 @@ future<> compaction_manager::really_do_stop() noexcept {
    if (!_tasks.empty()) {
        on_fatal_internal_error(cmlog, format("{} tasks still exist after being stopped", _tasks.size()));
    }
-    co_await stop_postponed_compactions();
+    reevaluate_postponed_compactions();
+    co_await std::move(_waiting_reevalution);
    co_await _sys_ks.close();
    _weight_tracker.clear();
    _compaction_submission_timer.cancel();
@@ -1538,9 +1519,7 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
            | std::views::transform(std::mem_fn(&sstables::sstable::run_identifier))
            | std::ranges::to<std::unordered_set>());
    };
-    const auto threshold = utils::get_local_injector().inject_parameter<size_t>("set_sstable_count_reduction_threshold")
-        .value_or(size_t(std::max(schema->max_compaction_threshold(), 32)));
-
+    const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
    auto count = co_await num_runs_for_compaction();
    if (count <= threshold) {
        cmlog.trace("No need to wait for sstable count reduction in {}: {} <= {}",
@@ -1555,7 +1534,9 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
    auto& cstate = get_compaction_state(&t);
    try {
        while (can_perform_regular_compaction(t) && co_await num_runs_for_compaction() > threshold) {
-            co_await cstate.compaction_done.wait();
+            // Wake on every compaction_done signal so the loop condition re-counts the runs. A predicate
+            // of !can_perform_regular_compaction(t) would keep waiting until compaction is disabled.
+            co_await cstate.compaction_done.wait();
        }
    } catch (const broken_condition_variable&) {
        co_return;
@@ -2308,16 +2289,6 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
    return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables), throw_if_stopping::no);
 }

-std::exception_ptr compaction_manager::make_disabled_exception(compaction::compaction_group_view& cg) {
-    std::exception_ptr ex;
-    if (_in_critical_disk_utilization_mode) {
-        ex = std::make_exception_ptr(std::runtime_error("critical disk utilization"));
-    } else {
-        ex = std::make_exception_ptr(compaction_stopped_exception(cg.schema()->ks_name(), cg.schema()->cf_name(), "compaction disabled"));
-    }
-    return ex;
-}
-
 future<std::vector<sstables::shared_sstable>>
 compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
    if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
@@ -2327,7 +2298,8 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
    // We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
    // which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
    if (is_disabled()) {
-        co_return coroutine::exception(make_disabled_exception(t));
+        co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
+                                                                                         "reason might be out of space prevention", sst->get_filename()))));
    }
    std::vector<sstables::shared_sstable> ret;

@@ -2415,8 +2387,6 @@ future<> compaction_manager::remove(compaction_group_view& t, sstring reason) no
    if (!c_state.gate.is_closed()) {
        auto close_gate = c_state.gate.close();
        co_await stop_ongoing_compactions(reason, &t);
-        // Wait for users of incremental repair lock (can be either repair itself or maintenance compactions).
-        co_await c_state.incremental_repair_lock.write_lock();
        co_await std::move(close_gate);
    }

--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -114,8 +114,6 @@ private:
    uint32_t _disabled_state_count = 0;

    bool is_disabled() const { return _state != state::running || _disabled_state_count > 0; }
-    // precondition: is_disabled() is true.
-    std::exception_ptr make_disabled_exception(compaction::compaction_group_view& cg);

    std::optional<future<>> _stop_future;

@@ -127,7 +125,7 @@ private:
    // a sstable from being compacted twice.
    std::unordered_set<sstables::shared_sstable> _compacting_sstables;

-    std::optional<future<>> _waiting_reevaluation;
+    future<> _waiting_reevalution = make_ready_future<>();
    condition_variable _postponed_reevaluation;
    // tables that wait for compaction but had its submission postponed due to ongoing compaction.
    std::unordered_set<compaction::compaction_group_view*> _postponed;
@@ -175,7 +173,6 @@ private:
    tombstone_gc_state _tombstone_gc_state;

    utils::disk_space_monitor::subscription _out_of_space_subscription;
-    bool _in_critical_disk_utilization_mode = false;
 private:
    // Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
    future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
@@ -237,7 +234,6 @@ private:

    future<> postponed_compactions_reevaluation();
    void reevaluate_postponed_compactions() noexcept;
-    future<> stop_postponed_compactions() noexcept;
    // Postpone compaction for a table that couldn't be executed due to ongoing
    // similar-sized compaction.
    void postpone_compaction_for_table(compaction::compaction_group_view* t);
--- a/configure.py
+++ b/configure.py
@@ -730,6 +730,28 @@ vector_search_tests = set([
    'test/vector_search/rescoring_test'
 ])

+vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
+vector_search_validator_deps = set([
+    'test/vector_search_validator/build-validator',
+    'test/vector_search_validator/Cargo.toml',
+    'test/vector_search_validator/crates/validator/Cargo.toml',
+    'test/vector_search_validator/crates/validator/src/main.rs',
+    'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
+    'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
+    'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
+])
+
+vector_store_bin = 'vector-search-validator/bin/vector-store'
+vector_store_deps = set([
+    'test/vector_search_validator/build-env',
+    'test/vector_search_validator/build-vector-store',
+])
+
+vector_search_validator_bins = set([
+    vector_search_validator_bin,
+    vector_store_bin,
+])
+
 wasms = set([
    'wasm/return_input.wat',
    'wasm/test_complex_null_values.wat',
@@ -763,7 +785,7 @@ other = set([
    'iotune',
 ])

-all_artifacts = apps | cpp_apps | tests | other | wasms
+all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins

 arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -795,6 +817,9 @@ arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clan
                        help='C compiler path')
 arg_parser.add_argument('--compiler-cache', action='store', dest='compiler_cache', default='auto',
                        help='Compiler cache to use: auto (default, prefers sccache), sccache, ccache, none, or a path to a binary')
+# Workaround for https://github.com/mozilla/sccache/issues/2575
+arg_parser.add_argument('--sccache-rust', action=argparse.BooleanOptionalAction, default=False,
+                        help='Use sccache for rust code (if sccache is selected as compiler cache). Doesn\'t work with distributed builds.')
 add_tristate(arg_parser, name='dpdk', dest='dpdk', default=False,
                        help='Use dpdk (from seastar dpdk sources)')
 arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
@@ -925,8 +950,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/crypt_sha512.cc',
                'utils/logalloc.cc',
                'utils/large_bitset.cc',
-                'utils/buffer_input_stream.cc',
-                'utils/limiting_data_source.cc',
+                'test/lib/limiting_data_source.cc',
                'utils/updateable_value.cc',
                'message/dictionary_service.cc',
                'utils/directories.cc',
@@ -1535,6 +1559,7 @@ scylla_perfs = ['test/perf/perf_alternator.cc',
                'test/perf/perf_fast_forward.cc',
                'test/perf/perf_row_cache_update.cc',
                'test/perf/perf_simple_query.cc',
+                'test/perf/perf_cql_raw.cc',
                'test/perf/perf_sstable.cc',
                'test/perf/perf_tablets.cc',
                'test/perf/tablet_load_balancing.cc',
@@ -1701,7 +1726,6 @@ deps['test/boost/combined_tests'] += [
    'test/boost/tracing_test.cc',
    'test/boost/user_function_test.cc',
    'test/boost/user_types_test.cc',
-    'test/boost/vector_index_test.cc',
    'test/boost/view_build_test.cc',
    'test/boost/view_complex_test.cc',
    'test/boost/view_schema_ckey_test.cc',
@@ -2384,7 +2408,7 @@ def write_build_file(f,
    # If compiler cache is available, prefix the compiler with it
    cxx_with_cache = f'{compiler_cache} {args.cxx}' if compiler_cache else args.cxx
    # For Rust, sccache is used via RUSTC_WRAPPER environment variable
-    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache else ''
+    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache and args.sccache_rust else ''
    f.write(textwrap.dedent('''\
        configure_args = {configure_args}
        builddir = {outdir}
@@ -2561,10 +2585,11 @@ def write_build_file(f,
              description = RUST_LIB $out
            ''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
        f.write(
-            'build {mode}-build: phony {artifacts} {wasms}\n'.format(
+            'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
                mode=mode,
-                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
+                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
                wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
+                vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
            )
        )
        if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2594,7 +2619,7 @@ def write_build_file(f,
                continue
            profile_dep = modes[mode].get('profile_target', "")

-            if binary in other or binary in wasms:
+            if binary in other or binary in wasms or binary in vector_search_validator_bins:
                continue
            srcs = deps[binary]
            # 'scylla'
@@ -2705,10 +2730,11 @@ def write_build_file(f,
        )

        f.write(
-            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
+            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
                mode=mode,
                test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
                wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
+                vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
            )
        )
        f.write(
@@ -2876,6 +2902,19 @@ def write_build_file(f,
            'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
    )

+    # Ninja rules and build edges for the vector-search-validator helper binaries.
+    f.write(textwrap.dedent('''\
+        rule build-vector-search-validator
+            command = test/vector_search_validator/build-validator $builddir
+        rule build-vector-store
+            command = test/vector_search_validator/build-vector-store $builddir
+        '''))
+    # One build edge per binary (validator first, then vector-store); inputs are emitted in sorted order.
+    for vsv_out, vsv_rule, vsv_deps in ((vector_search_validator_bin, 'build-vector-search-validator', vector_search_validator_deps),
+                                        (vector_store_bin, 'build-vector-store', vector_store_deps)):
+        vsv_inputs = ' '.join(sorted(vsv_deps))
+        f.write(f'build $builddir/{vsv_out}: {vsv_rule} {vsv_inputs}\n')
+
    f.write(textwrap.dedent(f'''\
        build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
        build dist-unified: phony dist-unified-tar
@@ -3113,7 +3152,7 @@ def configure_using_cmake(args):
        settings['CMAKE_CXX_COMPILER_LAUNCHER'] = compiler_cache
        settings['CMAKE_C_COMPILER_LAUNCHER'] = compiler_cache
        # For Rust, sccache is used via RUSTC_WRAPPER
-        if 'sccache' in compiler_cache:
+        if 'sccache' in compiler_cache and args.sccache_rust:
            settings['Scylla_RUSTC_WRAPPER'] = compiler_cache

    if args.date_stamp:
--- a/cql3/authorized_prepared_statements_cache.hh
+++ b/cql3/authorized_prepared_statements_cache.hh
@@ -136,9 +136,9 @@ public:
    {}

    future<> insert(auth::authenticated_user user, cql3::prepared_cache_key_type prep_cache_key, value_type v) noexcept {
-        return _cache.insert(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
+        return _cache.get_ptr(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
            return make_ready_future<value_type>(std::move(v));
-        });
+        }).discard_result();
    }

    value_ptr find(const auth::authenticated_user& user, const cql3::prepared_cache_key_type& prep_cache_key) {
--- a/cql3/column_specification.cc
+++ b/cql3/column_specification.cc
@@ -10,7 +10,6 @@

 #include "utils/assert.hh"
 #include "cql3/column_specification.hh"
-#include "cql3/column_identifier.hh"

 namespace cql3 {

@@ -32,12 +31,4 @@ bool column_specification::all_in_same_table(const std::vector<lw_shared_ptr<col
    });
 }

-lw_shared_ptr<column_specification> make_column_spec(std::string_view ks_name, std::string_view cf_name, sstring name, data_type type) {
-    return make_lw_shared<column_specification>(
-            ks_name,
-            cf_name,
-            ::make_shared<column_identifier>(std::move(name), true),
-            std::move(type));
-}
-
 }
--- a/cql3/column_specification.hh
+++ b/cql3/column_specification.hh
@@ -42,6 +42,4 @@ public:
    static bool all_in_same_table(const std::vector<lw_shared_ptr<column_specification>>& names);
 };

-lw_shared_ptr<column_specification> make_column_spec(std::string_view ks_name, std::string_view cf_name, sstring name, data_type type);
-
 }
--- a/cql3/functions/vector_similarity_fcts.cc
+++ b/cql3/functions/vector_similarity_fcts.cc
@@ -10,41 +10,9 @@
 #include "types/types.hh"
 #include "types/vector.hh"
 #include "exceptions/exceptions.hh"
-#include <span>
-#include <bit>

 namespace cql3 {
 namespace functions {
-
-namespace detail {
-
-std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension) {
-    if (!param) {
-        throw exceptions::invalid_request_exception("Cannot extract float vector from null parameter");
-    }
-
-    const size_t expected_size = dimension * sizeof(float);
-    if (param->size() != expected_size) {
-        throw exceptions::invalid_request_exception(
-            fmt::format("Invalid vector size: expected {} bytes for {} floats, got {} bytes",
-                       expected_size, dimension, param->size()));
-    }
-
-    std::vector<float> result;
-    result.reserve(dimension);
-
-    bytes_view view(*param);
-    for (size_t i = 0; i < dimension; ++i) {
-        // read_simple handles network byte order (big-endian) conversion
-        uint32_t raw = read_simple<uint32_t>(view);
-        result.push_back(std::bit_cast<float>(raw));
-    }
-
-    return result;
-}
-
-} // namespace detail
-
 namespace {

 // The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
@@ -54,14 +22,14 @@ namespace {

 // You should only use this function if you need to preserve the original vectors and cannot normalize
 // them in advance.
-float compute_cosine_similarity(std::span<const float> v1, std::span<const float> v2) {
+float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
    double dot_product = 0.0;
    double squared_norm_a = 0.0;
    double squared_norm_b = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        double a = value_cast<float>(v1[i]);
+        double b = value_cast<float>(v2[i]);

        dot_product += a * b;
        squared_norm_a += a * a;
@@ -69,7 +37,7 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
    }

    if (squared_norm_a == 0 || squared_norm_b == 0) {
-        return std::numeric_limits<float>::quiet_NaN();
+        throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
    }

    // The cosine similarity is in the range [-1, 1].
@@ -78,12 +46,12 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
    return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
 }

-float compute_euclidean_similarity(std::span<const float> v1, std::span<const float> v2) {
+float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
    double sum = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        double a = value_cast<float>(v1[i]);
+        double b = value_cast<float>(v2[i]);

        double diff = a - b;
        sum += diff * diff;
@@ -97,12 +65,12 @@ float compute_euclidean_similarity(std::span<const float> v1, std::span<const fl

 // Assumes that both vectors are L2-normalized.
 // This similarity is intended as an optimized way to perform cosine similarity calculation.
-float compute_dot_product_similarity(std::span<const float> v1, std::span<const float> v2) {
+float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
    double dot_product = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        double a = value_cast<float>(v1[i]);
+        double b = value_cast<float>(v2[i]);
        dot_product += a * b;
    }

@@ -168,15 +136,13 @@ bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters)
        return std::nullopt;
    }

-    // Extract dimension from the vector type
-    const auto& type = static_cast<const vector_type_impl&>(*arg_types()[0]);
-    size_t dimension = type.get_dimension();
+    const auto& type = arg_types()[0];
+    data_value v1 = type->deserialize(*parameters[0]);
+    data_value v2 = type->deserialize(*parameters[1]);
+    const auto& v1_elements = value_cast<std::vector<data_value>>(v1);
+    const auto& v2_elements = value_cast<std::vector<data_value>>(v2);

-    // Optimized path: extract floats directly from bytes, bypassing data_value overhead
-    std::vector<float> v1 = detail::extract_float_vector(parameters[0], dimension);
-    std::vector<float> v2 = detail::extract_float_vector(parameters[1], dimension);
-
-    float result = SIMILARITY_FUNCTIONS.at(_name)(v1, v2);
+    float result = SIMILARITY_FUNCTIONS.at(_name)(v1_elements, v2_elements);
    return float_type->decompose(result);
 }

--- a/cql3/functions/vector_similarity_fcts.hh
+++ b/cql3/functions/vector_similarity_fcts.hh
@@ -11,7 +11,6 @@
 #include "native_scalar_function.hh"
 #include "cql3/assignment_testable.hh"
 #include "cql3/functions/function_name.hh"
-#include <span>

 namespace cql3 {
 namespace functions {
@@ -20,7 +19,7 @@ static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::nati
 static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
 static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");

-using similarity_function_t = float (*)(std::span<const float>, std::span<const float>);
+using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
 extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;

 std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
@@ -34,14 +33,5 @@ public:
    virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
 };

-namespace detail {
-
-// Extract float vector directly from serialized bytes, bypassing data_value overhead.
-// This is an internal API exposed for testing purposes.
-// Vector<float, N> wire format: N floats as big-endian uint32_t values, 4 bytes each.
-std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension);
-
-} // namespace detail
-
 } // namespace functions
 } // namespace cql3
--- a/cql3/prepared_statements_cache.hh
+++ b/cql3/prepared_statements_cache.hh
@@ -105,7 +105,6 @@ public:
    static const std::chrono::minutes entry_expiry;

    using key_type = prepared_cache_key_type;
-    using pinned_value_type = cache_value_ptr;
    using value_type = checked_weak_ptr;
    using statement_is_too_big = typename cache_type::entry_is_too_big;

@@ -117,14 +116,9 @@ public:
        : _cache(size, entry_expiry, logger)
    {}

-    template <typename LoadFunc>
-    future<pinned_value_type> get_pinned(const key_type& key, LoadFunc&& load) {
-        return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); });
-    }
-
    template <typename LoadFunc>
    future<value_type> get(const key_type& key, LoadFunc&& load) {
-        return get_pinned(key, std::forward<LoadFunc>(load)).then([] (cache_value_ptr v_ptr) {
+        return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
            return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
        });
    }
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -697,7 +697,7 @@ future<::shared_ptr<cql_transport::messages::result_message::prepared>>
 query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
    try {
        auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
-        auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
+        auto prep_ptr = co_await _prepared_cache.get(key, [this, &query_string, &client_state, d] {
                auto prepared = get_statement(query_string, client_state, d);
                prepared->calculate_metadata_id();
                auto bound_terms = prepared->statement->get_bound_terms();
@@ -711,13 +711,13 @@ query_processor::prepare(sstring query_string, const service::client_state& clie
                return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
            });

-        co_await utils::get_local_injector().inject(
-                "query_processor_prepare_wait_after_cache_get",
-                utils::wait_for_message(std::chrono::seconds(60)));
-  
-        auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
+        const auto& warnings = prep_ptr->warnings; // read through the pointer before prep_ptr is moved from below
+        auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_ptr),
                    client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
-        co_return std::move(msg);
+        for (const auto& w : warnings) {
+            msg->add_warning(w);
+        }
+        co_return ::shared_ptr<cql_transport::messages::result_message::prepared>(std::move(msg));
    } catch(typename prepared_statements_cache::statement_is_too_big&) {
        throw prepared_statement_is_too_big(query_string);
    }
@@ -1029,11 +1029,6 @@ query_processor::execute_batch_without_checking_exception_message(
        query_options& options,
        std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
    auto access_future = co_await coroutine::as_future(batch->check_access(*this, query_state.get_client_state()));
-    bool failed = access_future.failed();
-    co_await audit::inspect(batch, query_state, options, failed);
-    if (failed) {
-        std::rethrow_exception(access_future.get_exception());
-    }
    co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state] (auto& e) -> future<> {
            try {
                co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
@@ -1041,6 +1036,11 @@ query_processor::execute_batch_without_checking_exception_message(
                log.error("failed to cache the entry: {}", std::current_exception());
            }
        });
+    const bool failed = access_future.failed(); // NOTE(review): the authorized-cache inserts above have already run, even when this is true - confirm intended
+    co_await audit::inspect(batch, query_state, options, failed); // pass the access-check outcome to audit
+    if (failed) { // same snapshot that was handed to audit
+        std::rethrow_exception(access_future.get_exception());
+    }
    batch->validate();
    batch->validate(*this, query_state.get_client_state());
    _stats.queries_by_cl[size_t(options.get_consistency())] += batch->get_statements().size();
--- a/cql3/query_result_printer.hh
+++ b/cql3/query_result_printer.hh
@@ -1,20 +0,0 @@
-/*
- * Copyright 2025-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#pragma once
-
-#include <ostream>
-
-namespace cql3 {
-
-class result;
-
-void print_query_results_text(std::ostream& os, const result& result);
-void print_query_results_json(std::ostream& os, const result& result);
-
-} // namespace cql3
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -201,10 +201,6 @@ public:
        return _clustering_columns_restrictions;
    }

-    const expr::expression& get_nonprimary_key_restrictions() const {
-        return _nonprimary_key_restrictions;
-    }
-
    // Get a set of columns restricted by the IS NOT NULL restriction.
    // IS NOT NULL is a special case that is handled separately from other restrictions.
    const std::unordered_set<const column_definition*> get_not_null_columns() const;
--- a/cql3/result_set.cc
+++ b/cql3/result_set.cc
@@ -9,10 +9,8 @@
 */

 #include <cstdint>
-#include "types/json_utils.hh"
 #include "utils/assert.hh"
 #include "utils/hashers.hh"
-#include "utils/rjson.hh"
 #include "cql3/result_set.hh"

 namespace cql3 {
@@ -197,85 +195,4 @@ make_empty_metadata() {
    return empty_metadata_cache;
 }

-void print_query_results_text(std::ostream& os, const cql3::result& result) {
-    const auto& metadata = result.get_metadata();
-    const auto& column_metadata = metadata.get_names();
-
-    struct column_values {
-        size_t max_size{0};
-        sstring header_format;
-        sstring row_format;
-        std::vector<sstring> values;
-
-        void add(sstring value) {
-            max_size = std::max(max_size, value.size());
-            values.push_back(std::move(value));
-        }
-    };
-
-    std::vector<column_values> columns;
-    columns.resize(column_metadata.size());
-
-    for (size_t i = 0; i < column_metadata.size(); ++i) {
-        columns[i].add(column_metadata[i]->name->text());
-    }
-
-    for (const auto& row : result.result_set().rows()) {
-        for (size_t i = 0; i < row.size(); ++i) {
-            if (row[i]) {
-                columns[i].add(column_metadata[i]->type->to_string(linearized(managed_bytes_view(*row[i]))));
-            } else {
-                columns[i].add("");
-            }
-        }
-    }
-
-    std::vector<sstring> separators(columns.size(), sstring());
-    for (size_t i = 0; i < columns.size(); ++i) {
-        auto& col_values = columns[i];
-        col_values.header_format = seastar::format(" {{:<{}}} ", col_values.max_size);
-        col_values.row_format = seastar::format(" {{:>{}}} ", col_values.max_size);
-        for (size_t c = 0; c < col_values.max_size; ++c) {
-            separators[i] += "-";
-        }
-    }
-
-    for (size_t r = 0; r < result.result_set().rows().size() + 1; ++r) {
-        std::vector<sstring> row;
-        row.reserve(columns.size());
-        for (size_t i = 0; i < columns.size(); ++i) {
-            const auto& format = r == 0 ? columns[i].header_format : columns[i].row_format;
-            row.push_back(fmt::format(fmt::runtime(std::string_view(format)), columns[i].values[r]));
-        }
-        fmt::print(os, "{}\n", fmt::join(row, "|"));
-        if (!r) {
-            fmt::print(os, "-{}-\n", fmt::join(separators, "-+-"));
-        }
-    }
-}
-
-void print_query_results_json(std::ostream& os, const cql3::result& result) {
-    const auto& metadata = result.get_metadata();
-    const auto& column_metadata = metadata.get_names();
-
-    rjson::streaming_writer writer(os);
-
-    writer.StartArray();
-    for (const auto& row : result.result_set().rows()) {
-        writer.StartObject();
-        for (size_t i = 0; i < row.size(); ++i) {
-            writer.Key(column_metadata[i]->name->text());
-            if (!row[i] || row[i]->empty()) {
-                writer.Null();
-                continue;
-            }
-            const auto value = to_json_string(*column_metadata[i]->type, *row[i]);
-            const auto type = to_json_type(*column_metadata[i]->type, *row[i]);
-            writer.RawValue(value, type);
-        }
-        writer.EndObject();
-    }
-    writer.EndArray();
-}
-
 }
--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -8,7 +8,6 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

-#include <boost/algorithm/string.hpp>
 #include <seastar/core/coroutine.hh>
 #include "create_index_statement.hh"
 #include "db/config.hh"
@@ -38,7 +37,6 @@
 #include "types/concrete_types.hh"
 #include "db/tags/extension.hh"
 #include "tombstone_gc_extension.hh"
-#include "index/secondary_index.hh"

 #include <stdexcept>

@@ -118,15 +116,6 @@ static data_type type_for_computed_column(cql3::statements::index_target::target
    }
 }

-static bool is_vector_capable_class(const sstring& class_name) {
-    return boost::iequals(class_name, "vector_index");
-}
-
-static bool is_vector_index(const index_options_map& options) {
-    auto class_it = options.find(db::index::secondary_index::custom_class_option_name);
-    return class_it != options.end() && is_vector_capable_class(class_it->second);
-}
-
 view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im,
        const data_dictionary::database& db) const
 {
@@ -277,7 +266,7 @@ create_index_statement::validate(query_processor& qp, const service::client_stat
    _idx_properties->validate();

    // FIXME: This is ugly and can be improved.
-    const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class);
+    const bool is_vector_index = _idx_properties->custom_class && *_idx_properties->custom_class == "vector_index";
    const bool uses_view_properties = _view_properties.properties()->count() > 0
            || _view_properties.use_compact_storage()
            || _view_properties.defined_ordering().size() > 0;
@@ -459,15 +448,11 @@ void create_index_statement::validate_for_local_index(const schema& schema) cons
                auto base_pk_identifiers = *index_pk | std::views::transform([&schema] (const ::shared_ptr<column_identifier::raw>& raw_ident) {
                    return raw_ident->prepare_column_identifier(schema);
                });
-                auto const is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class);
                auto remaining_base_pk_columns = schema.partition_key_columns();
                auto next_expected_base_column = remaining_base_pk_columns.begin();
                for (const auto& ident : base_pk_identifiers) {
                    auto it = schema.columns_by_name().find(ident->name());
                    if (it == schema.columns_by_name().end() || !it->second->is_partition_key()) {
-                        if (is_vector_index) {
-                            throw exceptions::invalid_request_exception(format("Local vector index definition must contain partition key's columns only. Redundant column: {}", ident->to_string()));
-                        }
                        throw exceptions::invalid_request_exception(format("Local index definition must contain full partition key only. Redundant column: {}", ident->to_string()));
                    }
                    if (next_expected_base_column == remaining_base_pk_columns.end()) {
@@ -478,7 +463,7 @@ void create_index_statement::validate_for_local_index(const schema& schema) cons
                    }
                    ++next_expected_base_column;
                }
-                if (!is_vector_index && next_expected_base_column != remaining_base_pk_columns.end()) {
+                if (next_expected_base_column != remaining_base_pk_columns.end()) {
                    throw exceptions::invalid_request_exception(format("Local index definition must contain full partition key only. Missing column: {}", next_expected_base_column->name_as_text()));
                }
                if (_raw_targets.size() == 1) {
@@ -712,9 +697,7 @@ index_metadata create_index_statement::make_index_metadata(const std::vector<::s
                                                           const index_options_map& options)
 {
    index_options_map new_options = options;
-    auto target_option = is_vector_index(options)
-        ? secondary_index::vector_index::serialize_targets(targets)
-        : secondary_index::target_parser::serialize_targets(targets);
+    auto target_option = secondary_index::target_parser::serialize_targets(targets);
    new_options.emplace(index_target::target_option_name, target_option);

    const auto& first_target = targets.front()->value;
--- a/cql3/statements/describe_statement.cc
+++ b/cql3/statements/describe_statement.cc
@@ -23,7 +23,6 @@
 #include "index/vector_index.hh"
 #include "schema/schema.hh"
 #include "service/client_state.hh"
-#include "service/paxos/paxos_state.hh"
 #include "types/types.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/cql_statement.hh"
@@ -330,19 +329,6 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
                "*/",
                *table_desc.create_statement);

-        table_desc.create_statement = std::move(os).to_managed_string();
-    } else if (service::paxos::paxos_store::try_get_base_table(name)) {
-        // Paxos state table is internally managed by Scylla and it shouldn't be exposed to the user.
-        // The table is allowed to be described as a comment to ease administrative work but it's hidden from all listings.
-        fragmented_ostringstream os{};
-
-        fmt::format_to(os.to_iter(),
-                "/* Do NOT execute this statement! It's only for informational purposes.\n"
-                "   A paxos state table is created automatically when enabling LWT on a base table.\n"
-                "\n{}\n"
-                "*/",
-                *table_desc.create_statement);
-
        table_desc.create_statement = std::move(os).to_managed_string();
    }
    result.push_back(std::move(table_desc));
@@ -378,7 +364,7 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
 future<std::vector<description>> tables(const data_dictionary::database& db, const lw_shared_ptr<keyspace_metadata>& ks, std::optional<bool> with_internals = std::nullopt) {
    auto& replica_db = db.real_database();
    auto tables = ks->tables() | std::views::filter([&replica_db] (const schema_ptr& s) {
-        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name()) && !service::paxos::paxos_store::try_get_base_table(s->cf_name());
+        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name());
    }) | std::ranges::to<std::vector<schema_ptr>>();
    std::ranges::sort(tables, std::ranges::less(), std::mem_fn(&schema::cf_name));

--- a/cql3/statements/list_effective_service_level_statement.cc
+++ b/cql3/statements/list_effective_service_level_statement.cc
@@ -30,14 +30,13 @@ list_effective_service_level_statement::prepare(data_dictionary::database db, cq
    return std::make_unique<prepared_statement>(audit_info(), ::make_shared<list_effective_service_level_statement>(*this));
 }

-shared_ptr<const cql3::metadata> list_effective_service_level_statement::get_result_metadata() const {
-    return ::make_shared<cql3::metadata>(
-            std::vector<lw_shared_ptr<column_specification>>{
-                    make_column_spec("QOS", "effective_service_level", "service_level_option", utf8_type),
-                    make_column_spec("QOS", "effective_service_level", "effective_service_level", utf8_type),
-                    make_column_spec("QOS", "effective_service_level", "value", utf8_type)
-            });
-}
+// Creates the column_specification for column `name` of type `type` in "QOS"."effective_service_level".
+static auto make_column(sstring name, const shared_ptr<const abstract_type> type) {
+    return make_lw_shared<column_specification>(
+        "QOS", "effective_service_level",
+        ::make_shared<column_identifier>(std::move(name), true),
+        type);
+}

 static bytes_opt decompose_timeout (const qos::service_level_options::timeout_type& duration) {
    return std::visit(overloaded_functor{
@@ -70,6 +69,11 @@ static bytes_opt decompose_shares(const qos::service_level_options::shares_type&

 future<::shared_ptr<cql_transport::messages::result_message>>
 list_effective_service_level_statement::execute(query_processor& qp, service::query_state& state, const query_options&, std::optional<service::group0_guard>) const {
+    static thread_local const std::vector<lw_shared_ptr<column_specification>> metadata({
+        make_column("service_level_option", utf8_type),
+        make_column("effective_service_level", utf8_type),
+        make_column("value", utf8_type)
+    });
    auto& role_manager = state.get_client_state().get_auth_service()->underlying_role_manager();

    if (!co_await role_manager.exists(_role_name)) {
@@ -83,7 +87,7 @@ list_effective_service_level_statement::execute(query_processor& qp, service::qu
        throw exceptions::invalid_request_exception(format("Role {} doesn't have assigned any service level", _role_name));
    }

-    auto rs = std::make_unique<result_set>(::make_shared<cql3::metadata>(*get_result_metadata()));
+    auto rs = std::make_unique<result_set>(metadata);
    rs->add_row({
        utf8_type->decompose("workload_type"),
        utf8_type->decompose(slo->effective_names->workload),
@@ -106,4 +110,4 @@ list_effective_service_level_statement::execute(query_processor& qp, service::qu

 }

-}
+}
--- a/cql3/statements/list_effective_service_level_statement.hh
+++ b/cql3/statements/list_effective_service_level_statement.hh
@@ -21,11 +21,9 @@ public:

    virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;

-    virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
-
    virtual future<::shared_ptr<cql_transport::messages::result_message>>
    execute(query_processor&, service::query_state&, const query_options&, std::optional<service::group0_guard>) const override;
 };

 }
-}
+}
--- a/cql3/statements/list_permissions_statement.cc
+++ b/cql3/statements/list_permissions_statement.cc
@@ -15,18 +15,9 @@
 #include "auth/authorizer.hh"
 #include "auth/common.hh"
 #include "cql3/result_set.hh"
-#include "db/system_keyspace.hh"
+#include "cql3/column_identifier.hh"
 #include "transport/messages/result_message.hh"

-shared_ptr<const cql3::metadata> cql3::statements::list_permissions_statement::get_result_metadata() const {
-    return ::make_shared<cql3::metadata>(
-            std::vector<lw_shared_ptr<cql3::column_specification>>{
-                    make_column_spec(db::system_keyspace::NAME, "permissions", "role", utf8_type),
-                    make_column_spec(db::system_keyspace::NAME, "permissions", "username", utf8_type),
-                    make_column_spec(db::system_keyspace::NAME, "permissions", "resource", utf8_type),
-                    make_column_spec(db::system_keyspace::NAME, "permissions", "permission", utf8_type)});
-}
-
 cql3::statements::list_permissions_statement::list_permissions_statement(
        auth::permission_set permissions,
        std::optional<auth::resource> resource,
@@ -89,6 +80,18 @@ cql3::statements::list_permissions_statement::execute(
        service::query_state& state,
        const query_options& options,
        std::optional<service::group0_guard> guard) const {
+    auto make_column = [auth_ks = auth::get_auth_ks_name(qp)](sstring name) {
+        return make_lw_shared<column_specification>(
+                auth_ks,
+                "permissions",
+                ::make_shared<column_identifier>(std::move(name), true),
+                utf8_type);
+    };
+
+    std::vector<lw_shared_ptr<column_specification>> metadata({
+        make_column("role"), make_column("username"), make_column("resource"), make_column("permission")
+    });
+
    const auto make_resource_filter = [this]()
            -> std::optional<std::pair<auth::resource, auth::recursive_permissions>> {
        if (!_resource) {
@@ -101,7 +104,6 @@ cql3::statements::list_permissions_statement::execute(
    };

    const auto& as = *state.get_client_state().get_auth_service();
-    auto metadata = ::make_shared<cql3::metadata>(*get_result_metadata());

    return do_with(make_resource_filter(), [this, &as, metadata = std::move(metadata)](const auto& resource_filter) mutable {
        return auth::list_filtered_permissions(
--- a/cql3/statements/list_permissions_statement.hh
+++ b/cql3/statements/list_permissions_statement.hh
@@ -34,8 +34,6 @@ public:

    std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;

-    virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
-
    void validate(query_processor&, const service::client_state&) const override;

    future<> check_access(query_processor& qp, const service::client_state&) const override;
--- a/cql3/statements/list_roles_statement.hh
+++ b/cql3/statements/list_roles_statement.hh
@@ -35,8 +35,6 @@ public:

    std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;

-    virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
-
    virtual future<> check_access(query_processor& qp, const service::client_state&) const override;

    virtual future<::shared_ptr<cql_transport::messages::result_message>>
--- a/cql3/statements/list_service_level_attachments_statement.cc
+++ b/cql3/statements/list_service_level_attachments_statement.cc
@@ -8,6 +8,7 @@

 #include "seastarx.hh"
 #include "cql3/statements/list_service_level_attachments_statement.hh"
+#include "cql3/column_identifier.hh"
 #include "transport/messages/result_message.hh"
 #include "service/client_state.hh"
 #include "service/query_state.hh"
@@ -16,15 +17,6 @@ namespace cql3 {

 namespace statements {

-shared_ptr<const cql3::metadata> list_service_level_attachments_statement::get_result_metadata() const {
-    static thread_local const std::vector<lw_shared_ptr<column_specification>> metadata({
-        make_column_spec("QOS", "service_levels_attachments", "role", utf8_type),
-        make_column_spec("QOS", "service_levels_attachments", "service_level", utf8_type)
-    });
-
-    return ::make_shared<cql3::metadata>(metadata);
-}
-
 list_service_level_attachments_statement::list_service_level_attachments_statement(sstring role_name) :
    _role_name(role_name), _describe_all(false) {
 }
@@ -48,7 +40,19 @@ list_service_level_attachments_statement::execute(query_processor& qp,
        service::query_state &state,
        const query_options &,
        std::optional<service::group0_guard> guard) const {
-    auto metadata = ::make_shared<cql3::metadata>(*get_result_metadata());
+
+    static auto make_column = [] (sstring name, const shared_ptr<const abstract_type> type) {
+        return make_lw_shared<column_specification>(
+                "QOS",
+                "service_levels_attachments",
+                ::make_shared<column_identifier>(std::move(name), true),
+                type);
+    };
+
+    static thread_local const std::vector<lw_shared_ptr<column_specification>> metadata({
+        make_column("role", utf8_type), make_column("service_level", utf8_type)
+    });
+

    return make_ready_future().then([this, &state] () {
        if (_describe_all) {
@@ -63,7 +67,7 @@ list_service_level_attachments_statement::execute(query_processor& qp,
            });

        }
-    }).then([metadata = std::move(metadata)] (std::unordered_map<sstring, sstring> roles_to_att_val) {
+    }).then([] (std::unordered_map<sstring, sstring> roles_to_att_val) {

        auto rs = std::make_unique<result_set>(metadata);
        for (auto&& role_to_sl : roles_to_att_val) {
--- a/cql3/statements/list_service_level_attachments_statement.hh
+++ b/cql3/statements/list_service_level_attachments_statement.hh
@@ -22,7 +22,6 @@ public:
    list_service_level_attachments_statement(sstring role_name);
    list_service_level_attachments_statement();
    std::unique_ptr<cql3::statements::prepared_statement> prepare(data_dictionary::database db, cql_stats &stats) override;
-    virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
    virtual future<> check_access(query_processor& qp, const service::client_state&) const override;
    virtual future<::shared_ptr<cql_transport::messages::result_message>>
    execute(query_processor&, service::query_state&, const query_options&, std::optional<service::group0_guard> guard) const override;
--- a/cql3/statements/list_service_level_statement.cc
+++ b/cql3/statements/list_service_level_statement.cc
@@ -8,6 +8,7 @@

 #include "seastarx.hh"
 #include "cql3/statements/list_service_level_statement.hh"
+#include "cql3/column_identifier.hh"
 #include "service/qos/service_level_controller.hh"
 #include "transport/messages/result_message.hh"
 #include "utils/overloaded_functor.hh"
@@ -18,20 +19,6 @@ namespace cql3 {

 namespace statements {

-shared_ptr<const cql3::metadata> list_service_level_statement::get_result_metadata() const {
-    std::vector<lw_shared_ptr<column_specification>> metadata{
-            make_column_spec("QOS", "service_levels", "service_level", utf8_type),
-            make_column_spec("QOS", "service_levels", "timeout", duration_type),
-            make_column_spec("QOS", "service_levels", "workload_type", utf8_type),
-            make_column_spec("QOS", "service_levels", "shares", int32_type),
-    };
-    if (_describe_all) {
-        metadata.push_back(make_column_spec("QOS", "service_levels", "percentage of all service level shares", utf8_type));
-    }
-
-    return ::make_shared<cql3::metadata>(std::move(metadata));
-}
-
 list_service_level_statement::list_service_level_statement(sstring service_level, bool describe_all) :
    _service_level(service_level), _describe_all(describe_all) {
 }
@@ -51,7 +38,23 @@ list_service_level_statement::execute(query_processor& qp,
        service::query_state &state,
        const query_options &,
        std::optional<service::group0_guard> guard) const {
-    auto metadata = ::make_shared<cql3::metadata>(*get_result_metadata());
+
+    static auto make_column = [] (sstring name, const shared_ptr<const abstract_type> type) {
+        return make_lw_shared<column_specification>(
+                "QOS",
+                "service_levels",
+                ::make_shared<column_identifier>(std::move(name), true),
+                type);
+    };
+
+    std::vector<lw_shared_ptr<column_specification>> metadata({make_column("service_level", utf8_type),
+        make_column("timeout", duration_type),
+        make_column("workload_type", utf8_type),
+        make_column("shares", int32_type),
+    });
+    if (_describe_all) {
+        metadata.push_back(make_column("percentage of all service level shares", utf8_type));
+    }

    return make_ready_future().then([this, &state] () {
                                  if (_describe_all) {
--- a/cql3/statements/list_service_level_statement.hh
+++ b/cql3/statements/list_service_level_statement.hh
@@ -21,7 +21,6 @@ class list_service_level_statement final : public service_level_statement {
 public:
    list_service_level_statement(sstring service_level, bool describe_all);
    std::unique_ptr<cql3::statements::prepared_statement> prepare(data_dictionary::database db, cql_stats &stats) override;
-    virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
    virtual future<> check_access(query_processor& qp, const service::client_state&) const override;
    virtual future<::shared_ptr<cql_transport::messages::result_message>>
    execute(query_processor&, service::query_state&, const query_options&, std::optional<service::group0_guard> guard) const override;
--- a/cql3/statements/list_users_statement.cc
+++ b/cql3/statements/list_users_statement.cc
@@ -12,17 +12,10 @@
 #include "list_users_statement.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/query_options.hh"
+#include "cql3/column_identifier.hh"
 #include "auth/common.hh"
-#include "db/system_keyspace.hh"
 #include "transport/messages/result_message.hh"

-shared_ptr<const cql3::metadata> cql3::statements::list_users_statement::get_result_metadata() const {
-    return ::make_shared<cql3::metadata>(
-        std::vector<lw_shared_ptr<cql3::column_specification>>{
-                cql3::make_column_spec(db::system_keyspace::NAME, "users", "name", utf8_type),
-                cql3::make_column_spec(db::system_keyspace::NAME, "users", "super", boolean_type)});
-}
-
 std::unique_ptr<cql3::statements::prepared_statement> cql3::statements::list_users_statement::prepare(
                data_dictionary::database db, cql_stats& stats) {
    return std::make_unique<prepared_statement>(audit_info(), ::make_shared<list_users_statement>(*this));
@@ -35,7 +28,20 @@ future<> cql3::statements::list_users_statement::check_access(query_processor& q

 future<::shared_ptr<cql_transport::messages::result_message>>
 cql3::statements::list_users_statement::execute(query_processor& qp, service::query_state& state, const query_options& options, std::optional<service::group0_guard> guard) const {
-    auto metadata = ::make_shared<cql3::metadata>(*get_result_metadata());
+    static const sstring virtual_table_name("users");
+
+    const auto make_column_spec = [auth_ks = auth::get_auth_ks_name(qp)](const sstring& name, const ::shared_ptr<const abstract_type>& ty) {
+        return make_lw_shared<column_specification>(
+            auth_ks,
+            virtual_table_name,
+            ::make_shared<column_identifier>(name, true),
+            ty);
+    };
+
+    auto metadata = ::make_shared<cql3::metadata>(
+        std::vector<lw_shared_ptr<column_specification>>{
+                make_column_spec("name", utf8_type),
+                make_column_spec("super", boolean_type)});

    auto make_results = [metadata = std::move(metadata)](const auth::service& as, std::unordered_set<sstring>&& roles) mutable {
        using cql_transport::messages::result_message;
--- a/cql3/statements/list_users_statement.hh
+++ b/cql3/statements/list_users_statement.hh
@@ -23,8 +23,6 @@ public:

    std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;

-    virtual seastar::shared_ptr<const metadata> get_result_metadata() const override;
-
    future<> check_access(query_processor& qp, const service::client_state&) const override;
    future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor&
                    , service::query_state&
--- a/cql3/statements/raw/batch_statement.hh
+++ b/cql3/statements/raw/batch_statement.hh
@@ -50,8 +50,8 @@ public:
 protected:
    virtual audit::statement_category category() const override;
     virtual audit::audit_info_ptr audit_info() const override {
-        // We don't audit batch statements. Instead we audit statements that are inside the batch.
-        return audit::audit::create_no_audit_info();
+        constexpr bool batch = true; // named flag passed as the last create_audit_info() argument
+        return audit::audit::create_audit_info(category(), sstring(), sstring(), batch);
     }
 };

--- a/cql3/statements/role-management-statements.cc
+++ b/cql3/statements/role-management-statements.cc
@@ -27,7 +27,6 @@
 #include "cql3/statements/list_roles_statement.hh"
 #include "cql3/statements/revoke_role_statement.hh"
 #include "cql3/statements/request_validations.hh"
-#include "db/system_keyspace.hh"
 #include "exceptions/exceptions.hh"
 #include "service/storage_proxy.hh"
 #include "transport/messages/result_message.hh"
@@ -348,17 +347,6 @@ std::unique_ptr<prepared_statement> list_roles_statement::prepare(
    return std::make_unique<prepared_statement>(audit_info(), ::make_shared<list_roles_statement>(*this));
 }

-shared_ptr<const cql3::metadata> list_roles_statement::get_result_metadata() const {
-    static const thread_local auto custom_options_type = map_type_impl::get_instance(utf8_type, utf8_type, true);
-
-    return ::make_shared<cql3::metadata>(
-            std::vector<lw_shared_ptr<column_specification>>{
-                    make_column_spec(db::system_keyspace::NAME, "roles", "role", utf8_type),
-                    make_column_spec(db::system_keyspace::NAME, "roles", "super", boolean_type),
-                    make_column_spec(db::system_keyspace::NAME, "roles", "login", boolean_type),
-                    make_column_spec(db::system_keyspace::NAME, "roles", "options", custom_options_type)});
-}
-
 future<> list_roles_statement::check_access(query_processor& qp, const service::client_state& state) const {
    state.ensure_not_anonymous();

@@ -388,8 +376,24 @@ future<> list_roles_statement::check_access(query_processor& qp, const service::

 future<result_message_ptr>
 list_roles_statement::execute(query_processor& qp, service::query_state& state, const query_options&, std::optional<service::group0_guard> guard) const {
+    static const sstring virtual_table_name("roles");
+
+    const auto make_column_spec = [auth_ks = auth::get_auth_ks_name(qp)](const sstring& name, const ::shared_ptr<const abstract_type>& ty) {
+        return make_lw_shared<column_specification>(
+                auth_ks,
+                virtual_table_name,
+                ::make_shared<column_identifier>(name, true),
+                ty);
+    };
+
    static const thread_local auto custom_options_type = map_type_impl::get_instance(utf8_type, utf8_type, true);
-    auto metadata = ::make_shared<cql3::metadata>(*get_result_metadata());
+
+    auto metadata = ::make_shared<cql3::metadata>(
+            std::vector<lw_shared_ptr<column_specification>>{
+                    make_column_spec("role", utf8_type),
+                    make_column_spec("super", boolean_type),
+                    make_column_spec("login", boolean_type),
+                    make_column_spec("options", custom_options_type)});

    auto make_results = [metadata = std::move(metadata)](
            auth::role_manager& rm,
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -2006,7 +2006,9 @@ static std::optional<ann_ordering_info> get_ann_ordering_info(

    auto indexes = sim.list_indexes();
    auto it = std::find_if(indexes.begin(), indexes.end(), [&prepared_ann_ordering](const auto& ind) {
-        return secondary_index::vector_index::is_vector_index_on_column(ind.metadata(), prepared_ann_ordering.first->name_as_text());
+        return (ind.metadata().options().contains(db::index::secondary_index::custom_class_option_name) &&
+                       ind.metadata().options().at(db::index::secondary_index::custom_class_option_name) == ANN_CUSTOM_INDEX_OPTION) &&
+               (ind.target_column() == prepared_ann_ordering.first->name_as_text());
    });

    if (it == indexes.end()) {
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -55,21 +55,8 @@ int32_t batchlog_shard_of(db_clock::time_point written_at) {
    return hash & ((1ULL << batchlog_shard_bits) - 1);
 }

-bool is_batchlog_v1(const schema& schema) {
-    return schema.cf_name() == system_keyspace::BATCHLOG;
-}
-
 std::pair<partition_key, clustering_key>
 get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, int32_t batchlog_shard, db_clock::time_point written_at, std::optional<utils::UUID> id) {
-    if (is_batchlog_v1(schema)) {
-        if (!id) {
-            on_internal_error(blogger, "get_batchlog_key(): key for batchlog v1 requires batchlog id");
-        }
-        auto pkey = partition_key::from_single_value(schema, {serialized(*id)});
-        auto ckey = clustering_key::make_empty();
-        return std::pair(std::move(pkey), std::move(ckey));
-    }
-
    auto pkey = partition_key::from_exploded(schema, {serialized(version), serialized(int8_t(stage)), serialized(batchlog_shard)});

    std::vector<bytes> ckey_components;
@@ -98,14 +85,6 @@ mutation get_batchlog_mutation_for(schema_ptr schema, managed_bytes data, int32_
    auto cdef_data = schema->get_column_definition(to_bytes("data"));
    m.set_cell(ckey, *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));

-    if (is_batchlog_v1(*schema)) {
-        auto cdef_version = schema->get_column_definition(to_bytes("version"));
-        m.set_cell(ckey, *cdef_version, atomic_cell::make_live(*cdef_version->type, timestamp, serialized(version)));
-
-        auto cdef_written_at = schema->get_column_definition(to_bytes("written_at"));
-        m.set_cell(ckey, *cdef_written_at, atomic_cell::make_live(*cdef_written_at->type, timestamp, serialized(now)));
-    }
-
    return m;
 }

@@ -143,10 +122,9 @@ mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clo
 const std::chrono::seconds db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;

-db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config)
+db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
        : _qp(qp)
        , _sys_ks(sys_ks)
-        , _fs(fs)
        , _replay_timeout(config.replay_timeout)
        , _replay_rate(config.replay_rate)
        , _delay(config.delay)
@@ -322,206 +300,149 @@ future<> db::batchlog_manager::maybe_migrate_v1_to_v2() {
    });
 }

-namespace {
-
-using clock_type = db_clock::rep;
-
-struct replay_stats {
-    std::optional<db_clock::time_point> min_too_fresh;
-    bool need_cleanup = false;
-};
-
-} // anonymous namespace
-
-static future<db::all_batches_replayed> process_batch(
-        cql3::query_processor& qp,
-        db::batchlog_manager::stats& stats,
-        db::batchlog_manager::post_replay_cleanup cleanup,
-        utils::rate_limiter& limiter,
-        schema_ptr schema,
-        std::unordered_map<int32_t, replay_stats>& replay_stats_per_shard,
-        const db_clock::time_point now,
-        db_clock::duration replay_timeout,
-        std::chrono::seconds write_timeout,
-        const cql3::untyped_result_set::row& row) {
-    const bool is_v1 = db::is_batchlog_v1(*schema);
-    const auto stage = is_v1 ? db::batchlog_stage::initial : static_cast<db::batchlog_stage>(row.get_as<int8_t>("stage"));
-    const auto batch_shard = is_v1 ? 0 : row.get_as<int32_t>("shard");
-    auto written_at = row.get_as<db_clock::time_point>("written_at");
-    auto id = row.get_as<utils::UUID>("id");
-    // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
-    auto timeout = replay_timeout;
-
-    if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
-        blogger.debug("Skipping batch replay due to skip_batch_replay injection");
-        co_return db::all_batches_replayed::no;
-    }
-
-    auto data = row.get_blob_unfragmented("data");
-
-    blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
-
-    utils::chunked_vector<mutation> mutations;
-    bool send_failed = false;
-
-    auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
-
-    try {
-        utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
-        auto in = ser::as_input_stream(data);
-        while (in.size()) {
-            auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
-            const auto tbl = qp.db().try_find_table(fm.column_family_id());
-            if (!tbl) {
-                continue;
-            }
-            if (written_at <= tbl->get_truncation_time()) {
-                continue;
-            }
-            schema_ptr s = tbl->schema();
-            if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
-                timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
-            }
-            fms.emplace_back(std::move(fm), std::move(s));
-        }
-
-        if (now < written_at + timeout) {
-            blogger.debug("Skipping replay of {}, too fresh", id);
-
-            shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
-
-            co_return db::all_batches_replayed::no;
-        }
-
-        auto size = data.size();
-
-        for (const auto& [fm, s] : fms) {
-            mutations.emplace_back(fm.to_mutation(s));
-            co_await coroutine::maybe_yield();
-        }
-
-        if (!mutations.empty()) {
-            const auto ttl = [written_at]() -> clock_type {
-                /*
-                * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
-                * This ensures that deletes aren't "undone" by an old batch replay.
-                */
-                auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
-                warn(unimplemented::cause::HINT);
-#if 0
-                for (auto& m : *mutations) {
-                    unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
-                }
-#endif
-                return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
-            }();
-
-            if (ttl > 0) {
-                // Origin does the send manually, however I can't see a super great reason to do so.
-                // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
-                // in both cases.
-                // FIXME: verify that the above is reasonably true.
-                co_await limiter.reserve(size);
-                stats.write_attempts += mutations.size();
-                auto timeout = db::timeout_clock::now() + write_timeout;
-                if (cleanup) {
-                    co_await qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
-                } else {
-                    co_await qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
-                }
-            }
-        }
-    } catch (data_dictionary::no_such_keyspace& ex) {
-        // should probably ignore and drop the batch
-    } catch (const data_dictionary::no_such_column_family&) {
-        // As above -- we should drop the batch if the table doesn't exist anymore.
-    } catch (...) {
-        blogger.warn("Replay failed (will retry): {}", std::current_exception());
-        // timeout, overload etc.
-        // Do _not_ remove the batch, assuning we got a node write error.
-        // Since we don't have hints (which origin is satisfied with),
-        // we have to resort to keeping this batch to next lap.
-        if (is_v1 || !cleanup || stage == db::batchlog_stage::failed_replay) {
-            co_return db::all_batches_replayed::no;
-        }
-        send_failed = true;
-    }
-
-    auto& sp = qp.proxy();
-
-    if (send_failed) {
-        blogger.debug("Moving batch {} to stage failed_replay", id);
-        auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, db::batchlog_stage::failed_replay, written_at, id);
-        co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-    }
-
-    // delete batch
-    auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
-    co_await qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-
-    shard_written_at.need_cleanup = true;
-
-    co_return db::all_batches_replayed(!send_failed);
-}
-
-future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v1(post_replay_cleanup) {
-    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
-    // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
-    // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
-    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
-    utils::rate_limiter limiter(throttle);
-
-    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
-
-    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
-
-    // Use a stable `now` across all batches, so skip/replay decisions are the
-    // same across a while prefix of written_at (across all ids).
-    const auto now = db_clock::now();
-
-    auto batch = [this, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
-        all_replayed = all_replayed && co_await process_batch(_qp, _stats, post_replay_cleanup::no, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
-        co_return stop_iteration::no;
-    };
-
-    co_await with_gate(_gate, [this, &all_replayed, batch = std::move(batch)] () mutable -> future<> {
-        blogger.debug("Started replayAllFailedBatches");
-        co_await utils::get_local_injector().inject("add_delay_to_batch_replay", std::chrono::milliseconds(1000));
-
-        auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
-
-        co_await _qp.query_internal(
-                format("SELECT * FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
-                db::consistency_level::ONE,
-                {},
-                page_size,
-                batch);
-
-        blogger.debug("Finished replayAllFailedBatches with all_replayed: {}", all_replayed);
-    });
-
-    co_return all_replayed;
-}
-
-future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v2(post_replay_cleanup cleanup) {
+future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
    co_await maybe_migrate_v1_to_v2();

+    typedef db_clock::rep clock_type;
+
    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
    // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
    // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
-    utils::rate_limiter limiter(throttle);
+    auto limiter = make_lw_shared<utils::rate_limiter>(throttle);

    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);

+    struct replay_stats {
+        std::optional<db_clock::time_point> min_too_fresh;
+        bool need_cleanup = false;
+    };
+
    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;

    // Use a stable `now` across all batches, so skip/replay decisions are the
    // same across a while prefix of written_at (across all ids).
    const auto now = db_clock::now();

-    auto batch = [this, cleanup, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
-        all_replayed = all_replayed && co_await process_batch(_qp, _stats, cleanup, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
+    auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+        const auto stage = static_cast<batchlog_stage>(row.get_as<int8_t>("stage"));
+        const auto batch_shard = row.get_as<int32_t>("shard");
+        auto written_at = row.get_as<db_clock::time_point>("written_at");
+        auto id = row.get_as<utils::UUID>("id");
+        // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
+        auto timeout = _replay_timeout;
+
+        if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
+            blogger.debug("Skipping batch replay due to skip_batch_replay injection");
+            all_replayed = all_batches_replayed::no;
+            co_return stop_iteration::no;
+        }
+
+        auto data = row.get_blob_unfragmented("data");
+
+        blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
+
+        utils::chunked_vector<mutation> mutations;
+        bool send_failed = false;
+
+        auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
+
+        try {
+            utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
+            auto in = ser::as_input_stream(data);
+            while (in.size()) {
+                auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
+                const auto tbl = _qp.db().try_find_table(fm.column_family_id());
+                if (!tbl) {
+                    continue;
+                }
+                if (written_at <= tbl->get_truncation_time()) {
+                    continue;
+                }
+                schema_ptr s = tbl->schema();
+                if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
+                    timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
+                }
+                fms.emplace_back(std::move(fm), std::move(s));
+            }
+
+            if (now < written_at + timeout) {
+                blogger.debug("Skipping replay of {}, too fresh", id);
+
+                shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
+
+                co_return stop_iteration::no;
+            }
+
+            auto size = data.size();
+
+            for (const auto& [fm, s] : fms) {
+                mutations.emplace_back(fm.to_mutation(s));
+                co_await coroutine::maybe_yield();
+            }
+
+            if (!mutations.empty()) {
+                const auto ttl = [written_at]() -> clock_type {
+                    /*
+                    * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
+                    * This ensures that deletes aren't "undone" by an old batch replay.
+                    */
+                    auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
+                    warn(unimplemented::cause::HINT);
+#if 0
+                    for (auto& m : *mutations) {
+                        unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
+                    }
+#endif
+                    return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
+                }();
+
+                if (ttl > 0) {
+                    // Origin does the send manually, however I can't see a super great reason to do so.
+                    // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
+                    // in both cases.
+                    // FIXME: verify that the above is reasonably true.
+                    co_await limiter->reserve(size);
+                    _stats.write_attempts += mutations.size();
+                    auto timeout = db::timeout_clock::now() + write_timeout;
+                    if (cleanup) {
+                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
+                    } else {
+                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
+                    }
+                }
+            }
+        } catch (data_dictionary::no_such_keyspace& ex) {
+            // should probably ignore and drop the batch
+        } catch (const data_dictionary::no_such_column_family&) {
+            // As above -- we should drop the batch if the table doesn't exist anymore.
+        } catch (...) {
+            blogger.warn("Replay failed (will retry): {}", std::current_exception());
+            all_replayed = all_batches_replayed::no;
+            // timeout, overload etc.
+            // Do _not_ remove the batch, assuming we got a node write error.
+            // Since we don't have hints (which origin is satisfied with),
+            // we have to resort to keeping this batch to next lap.
+            if (!cleanup || stage == batchlog_stage::failed_replay) {
+                co_return stop_iteration::no;
+            }
+            send_failed = true;
+        }
+
+        auto& sp = _qp.proxy();
+
+        if (send_failed) {
+            blogger.debug("Moving batch {} to stage failed_replay", id);
+            auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, batchlog_stage::failed_replay, written_at, id);
+            co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+        }
+
+        // delete batch
+        auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
+        co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+
+        shard_written_at.need_cleanup = true;
+
        co_return stop_iteration::no;
    };

@@ -580,10 +501,3 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches

    co_return all_replayed;
 }
-
-future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
-    if (_fs.batchlog_v2) {
-        return replay_all_failed_batches_v2(cleanup);
-    }
-    return replay_all_failed_batches_v1(cleanup);
-}
--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -27,12 +27,6 @@ class query_processor;

 } // namespace cql3

-namespace gms {
-
-class feature_service;
-
-} // namespace gms
-
 namespace db {

 class system_keyspace;
@@ -55,11 +49,6 @@ class batchlog_manager : public peering_sharded_service<batchlog_manager> {
 public:
    using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;

-    struct stats {
-        uint64_t write_attempts = 0;
-    };
-
-
 private:
    static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
    static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
@@ -67,13 +56,14 @@ private:

    using clock_type = lowres_clock;

-    stats _stats;
+    struct stats {
+        uint64_t write_attempts = 0;
+    } _stats;

    seastar::metrics::metric_groups _metrics;

    cql3::query_processor& _qp;
    db::system_keyspace& _sys_ks;
-    gms::feature_service& _fs;
    db_clock::duration _replay_timeout;
    uint64_t _replay_rate;
    std::chrono::milliseconds _delay;
@@ -94,14 +84,12 @@ private:

    future<> maybe_migrate_v1_to_v2();

-    future<all_batches_replayed> replay_all_failed_batches_v1(post_replay_cleanup cleanup);
-    future<all_batches_replayed> replay_all_failed_batches_v2(post_replay_cleanup cleanup);
    future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
 public:
    // Takes a QP, not a distributes. Because this object is supposed
    // to be per shard and does no dispatching beyond delegating the the
    // shard qp (which is what you feed here).
-    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config);
+    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, batchlog_manager_config config);

    // abort the replay loop and return its future.
    future<> drain();
@@ -114,7 +102,7 @@ public:
        return _last_replay;
    }

-    const stats& get_stats() const {
+    const stats& stats() const {
        return _stats;
    }
 private:
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -1986,13 +1986,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
            }
            continue;
        } catch (shutdown_marker&) {
+            _reserve_segments.abort(std::current_exception());
            break;
        } catch (...) {
            clogger.warn("Exception in segment reservation: {}", std::current_exception());
        }
        co_await sleep(100ms);
    }
-    _reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
 }

 future<std::vector<db::commitlog::descriptor>>
--- a/db/config.cc
+++ b/db/config.cc
@@ -621,25 +621,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    * @GroupDescription: Provides an overview of the group.
    */
    /**
-    * @Group Ungrouped properties
-    */
-    , background_writer_scheduling_quota(this, "background_writer_scheduling_quota", value_status::Deprecated, 1.0,
-        "max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5.")
-    , auto_adjust_flush_quota(this, "auto_adjust_flush_quota", value_status::Deprecated, false,
-        "true: auto-adjust memtable shares for flush processes")
-    , memtable_flush_static_shares(this, "memtable_flush_static_shares", liveness::LiveUpdate, value_status::Used, 0,
-        "If set to higher than 0, ignore the controller's output and set the memtable shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
-    , compaction_static_shares(this, "compaction_static_shares", liveness::LiveUpdate, value_status::Used, 0,
-        "If set to higher than 0, ignore the controller's output and set the compaction shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
-    , compaction_max_shares(this, "compaction_max_shares", liveness::LiveUpdate, value_status::Used, default_compaction_maximum_shares,
-        "Set the maximum shares of regular compaction to the specific value. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
-    , compaction_enforce_min_threshold(this, "compaction_enforce_min_threshold", liveness::LiveUpdate, value_status::Used, false,
-        "If set to true, enforce the min_threshold option for compactions strictly. If false (default), Scylla may decide to compact even if below min_threshold.")
-    , compaction_flush_all_tables_before_major_seconds(this, "compaction_flush_all_tables_before_major_seconds", value_status::Used, 86400,
-        "Set the minimum interval in seconds between flushing all tables before each major compaction (default is 86400)."
-        "This option is useful for maximizing tombstone garbage collection by releasing all active commitlog segments."
-        "Set to 0 to disable automatic flushing all tables before major compaction.")
-    /**
    * @Group Initialization properties
    * @GroupDescription The minimal properties needed for configuring a cluster.
    */
@@ -1291,7 +1272,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , ignore_dead_nodes_for_replace(this, "ignore_dead_nodes_for_replace", value_status::Used, "", "List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
    , override_decommission(this, "override_decommission", value_status::Deprecated, false, "Set true to force a decommissioned node to join the cluster (cannot be set if consistent-cluster-management is enabled).")
    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based.")
-    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
+    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild,bootstrap,decommission", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
    , enable_compacting_data_for_streaming_and_repair(this, "enable_compacting_data_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, true, "Enable the compacting reader, which compacts the data for streaming and repair (load'n'stream included) before sending it to, or synchronizing it with peers. Can reduce the amount of data to be processed by removing dead data, but adds CPU overhead.")
    , enable_tombstone_gc_for_streaming_and_repair(this, "enable_tombstone_gc_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, false,
            "If the compacting reader is enabled for streaming and repair (see enable_compacting_data_for_streaming_and_repair), allow it to garbage-collect tombstones."
@@ -1394,6 +1375,10 @@ db::config::config(std::shared_ptr<db::extensions> exts)
            "Start killing reads after their collective memory consumption goes above $normal_limit * $multiplier.")
    , reader_concurrency_semaphore_cpu_concurrency(this, "reader_concurrency_semaphore_cpu_concurrency", liveness::LiveUpdate, value_status::Used, 2,
            "Admit new reads while there are less than this number of requests that need CPU.")
+    , reader_concurrency_semaphore_preemptive_abort_factor(this, "reader_concurrency_semaphore_preemptive_abort_factor", liveness::LiveUpdate, value_status::Used, 0.3,
+            "Admit new reads while their remaining time is more than this factor times their timeout times when arrived to a semaphore. Its vale means\n"
+            "* <= 0.0 means new reads will never get rejected during admission\n"
+            "* >= 1.0 means new reads will always get rejected during admission\n")
    , view_update_reader_concurrency_semaphore_serialize_limit_multiplier(this, "view_update_reader_concurrency_semaphore_serialize_limit_multiplier", liveness::LiveUpdate, value_status::Used, 2,
            "Start serializing view update reads after their collective memory consumption goes above $normal_limit * $multiplier.")
    , view_update_reader_concurrency_semaphore_kill_limit_multiplier(this, "view_update_reader_concurrency_semaphore_kill_limit_multiplier", liveness::LiveUpdate, value_status::Used, 4,
@@ -1602,6 +1587,25 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Sets the maximum difference in percentages between the most loaded and least loaded nodes, below which the load balancer considers nodes balanced.")
    , minimal_tablet_size_for_balancing(this, "minimal_tablet_size_for_balancing", liveness::LiveUpdate, value_status::Used, service::default_target_tablet_size / 100,
        "Sets the minimal tablet size for the load balancer. For any tablet smaller than this, the balancer will use this size instead of the actual tablet size.")
+    /**
+    * @Group Ungrouped properties
+    */
+    , background_writer_scheduling_quota(this, "background_writer_scheduling_quota", value_status::Deprecated, 1.0,
+        "max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5.")
+    , auto_adjust_flush_quota(this, "auto_adjust_flush_quota", value_status::Deprecated, false,
+        "true: auto-adjust memtable shares for flush processes")
+    , memtable_flush_static_shares(this, "memtable_flush_static_shares", liveness::LiveUpdate, value_status::Used, 0,
+        "If set to higher than 0, ignore the controller's output and set the memtable shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
+    , compaction_static_shares(this, "compaction_static_shares", liveness::LiveUpdate, value_status::Used, 0,
+        "If set to higher than 0, ignore the controller's output and set the compaction shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
+    , compaction_max_shares(this, "compaction_max_shares", liveness::LiveUpdate, value_status::Used, default_compaction_maximum_shares,
+        "Set the maximum shares of regular compaction to the specific value. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
+    , compaction_enforce_min_threshold(this, "compaction_enforce_min_threshold", liveness::LiveUpdate, value_status::Used, false,
+        "If set to true, enforce the min_threshold option for compactions strictly. If false (default), Scylla may decide to compact even if below min_threshold.")
+    , compaction_flush_all_tables_before_major_seconds(this, "compaction_flush_all_tables_before_major_seconds", value_status::Used, 86400,
+        "Set the minimum interval in seconds between flushing all tables before each major compaction (default is 86400)."
+        "This option is useful for maximizing tombstone garbage collection by releasing all active commitlog segments."
+        "Set to 0 to disable automatic flushing all tables before major compaction.")
    , default_log_level(this, "default_log_level", value_status::Used, seastar::log_level::info, "Default log level for log messages")
    , logger_log_level(this, "logger_log_level", value_status::Used, {}, "Map of logger name to log level. Valid log levels are 'error', 'warn', 'info', 'debug' and 'trace'")
    , log_to_stdout(this, "log_to_stdout", value_status::Used, true, "Send log output to stdout")
--- a/db/config.hh
+++ b/db/config.hh
@@ -185,13 +185,6 @@ public:
     * All values and documentation taken from
     * http://docs.datastax.com/en/cassandra/2.1/cassandra/configuration/configCassandra_yaml_r.html
     */
-    named_value<double> background_writer_scheduling_quota;
-    named_value<bool> auto_adjust_flush_quota;
-    named_value<float> memtable_flush_static_shares;
-    named_value<float> compaction_static_shares;
-    named_value<float> compaction_max_shares;
-    named_value<bool> compaction_enforce_min_threshold;
-    named_value<uint32_t> compaction_flush_all_tables_before_major_seconds;
    named_value<sstring> cluster_name;
    named_value<sstring> listen_address;
    named_value<sstring> listen_interface;
@@ -446,6 +439,7 @@ public:
    named_value<uint32_t> reader_concurrency_semaphore_serialize_limit_multiplier;
    named_value<uint32_t> reader_concurrency_semaphore_kill_limit_multiplier;
    named_value<uint32_t> reader_concurrency_semaphore_cpu_concurrency;
+    named_value<float> reader_concurrency_semaphore_preemptive_abort_factor;
    named_value<uint32_t> view_update_reader_concurrency_semaphore_serialize_limit_multiplier;
    named_value<uint32_t> view_update_reader_concurrency_semaphore_kill_limit_multiplier;
    named_value<uint32_t> view_update_reader_concurrency_semaphore_cpu_concurrency;
@@ -612,6 +606,14 @@ public:
    named_value<float> size_based_balance_threshold_percentage;
    named_value<uint64_t> minimal_tablet_size_for_balancing;

+    named_value<double> background_writer_scheduling_quota;
+    named_value<bool> auto_adjust_flush_quota;
+    named_value<float> memtable_flush_static_shares;
+    named_value<float> compaction_static_shares;
+    named_value<float> compaction_max_shares;
+    named_value<bool> compaction_enforce_min_threshold;
+    named_value<uint32_t> compaction_flush_all_tables_before_major_seconds;
+
    static const sstring default_tls_priority;
 private:
    template<typename T>
--- a/db/hints/internal/hint_endpoint_manager.cc
+++ b/db/hints/internal/hint_endpoint_manager.cc
@@ -158,7 +158,7 @@ void hint_endpoint_manager::cancel_draining() noexcept {
    _sender.cancel_draining();
 }

-hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager)
+hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager, scheduling_group send_sg)
    : _key(key)
    , _shard_manager(shard_manager)
    , _store_gate("hint_endpoint_manager")
@@ -169,7 +169,7 @@ hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hi
    // Approximate the position of the last written hint by using the same formula as for segment id calculation in commitlog
    // TODO: Should this logic be deduplicated with what is in the commitlog?
    , _last_written_rp(this_shard_id(), std::chrono::duration_cast<std::chrono::milliseconds>(runtime::get_boot_time().time_since_epoch()).count())
-    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
+    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper(), send_sg)
 {}

 hint_endpoint_manager::hint_endpoint_manager(hint_endpoint_manager&& other)
--- a/db/hints/internal/hint_endpoint_manager.hh
+++ b/db/hints/internal/hint_endpoint_manager.hh
@@ -63,7 +63,7 @@ private:
    hint_sender _sender;

 public:
-    hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager);
+    hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager, scheduling_group send_sg);
    hint_endpoint_manager(hint_endpoint_manager&&);
    ~hint_endpoint_manager();

--- a/db/hints/internal/hint_sender.cc
+++ b/db/hints/internal/hint_sender.cc
@@ -122,7 +122,7 @@ const column_mapping& hint_sender::get_column_mapping(lw_shared_ptr<send_one_fil
    return cm_it->second;
 }

-hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper) noexcept
+hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper, scheduling_group sg) noexcept
    : _stopped(make_ready_future<>())
    , _ep_key(parent.end_point_key())
    , _ep_manager(parent)
@@ -130,7 +130,7 @@ hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy&
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(local_storage_proxy)
    , _db(local_db)
-    , _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
+    , _hints_cpu_sched_group(sg)
    , _gossiper(local_gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
--- a/db/hints/internal/hint_sender.hh
+++ b/db/hints/internal/hint_sender.hh
@@ -120,7 +120,7 @@ private:
    std::multimap<db::replay_position, lw_shared_ptr<std::optional<promise<>>>> _replay_waiters;

 public:
-    hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy, replica::database& local_db, const gms::gossiper& local_gossiper) noexcept;
+    hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy, replica::database& local_db, const gms::gossiper& local_gossiper, scheduling_group sg) noexcept;
    ~hint_sender();

    /// \brief A constructor that should be called from the copy/move-constructor of hint_endpoint_manager.
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -142,7 +142,7 @@ future<> directory_initializer::ensure_rebalanced() {
 }

 manager::manager(service::storage_proxy& proxy, sstring hints_directory, host_filter filter, int64_t max_hint_window_ms,
-        resource_manager& res_manager, sharded<replica::database>& db)
+        resource_manager& res_manager, sharded<replica::database>& db, scheduling_group sg)
    : _hints_dir(fs::path(hints_directory) / fmt::to_string(this_shard_id()))
    , _host_filter(std::move(filter))
    , _proxy(proxy)
@@ -150,6 +150,7 @@ manager::manager(service::storage_proxy& proxy, sstring hints_directory, host_fi
    , _local_db(db.local())
    , _draining_eps_gate(seastar::format("hints::manager::{}", _hints_dir.native()))
    , _resource_manager(res_manager)
+    , _hints_sending_sched_group(sg)
 {
    if (utils::get_local_injector().enter("decrease_hints_flush_period")) {
        hints_flush_period = std::chrono::seconds{1};
@@ -415,7 +416,7 @@ hint_endpoint_manager& manager::get_ep_manager(const endpoint_id& host_id, const

    try {
        std::filesystem::path hint_directory = hints_dir() / (_uses_host_id ? fmt::to_string(host_id) : fmt::to_string(ip));
-        auto [it, _] = _ep_managers.emplace(host_id, hint_endpoint_manager{host_id, std::move(hint_directory), *this});
+        auto [it, _] = _ep_managers.emplace(host_id, hint_endpoint_manager{host_id, std::move(hint_directory), *this, _hints_sending_sched_group});
        hint_endpoint_manager& ep_man = it->second;

        manager_logger.trace("Created an endpoint manager for {}", host_id);
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -133,6 +133,7 @@ private:

    hint_stats _stats;
    seastar::metrics::metric_groups _metrics;
+    scheduling_group _hints_sending_sched_group;

    // We need to keep a variant here. Before migrating hinted handoff to using host ID, hint directories will
    // still represent IP addresses. But after the migration, they will start representing host IDs.
@@ -155,7 +156,7 @@ private:

 public:
    manager(service::storage_proxy& proxy, sstring hints_directory, host_filter filter,
-            int64_t max_hint_window_ms, resource_manager& res_manager, sharded<replica::database>& db);
+            int64_t max_hint_window_ms, resource_manager& res_manager, sharded<replica::database>& db, scheduling_group sg);

    manager(const manager&) = delete;
    manager& operator=(const manager&) = delete;
--- a/db/partition_snapshot_row_cursor.hh
+++ b/db/partition_snapshot_row_cursor.hh
@@ -461,17 +461,7 @@ public:
                    }
                }
            } else {
-                if (_reversed) [[unlikely]] {
-                    if (!rows.empty()) {
-                        it = std::prev(rows.end());
-                        cont = is_continuous::yes;
-                        rt = {};
-                    } else {
-                        _background_continuity = true;
-                    }
-                } else {
-                    _background_continuity = true;
-                }
+                _background_continuity = true; // Default continuity
            }

            if (!it) {
--- a/db/row_cache.cc
+++ b/db/row_cache.cc
@@ -24,12 +24,11 @@
 #include "readers/forwardable.hh"
 #include "readers/nonforwardable.hh"
 #include "cache_mutation_reader.hh"
-#include "partition_snapshot_reader.hh"
+#include "replica/partition_snapshot_reader.hh"
 #include "keys/clustering_key_filter.hh"
 #include "utils/assert.hh"
 #include "utils/updateable_value.hh"
 #include "utils/labels.hh"
-#include "utils/chunked_vector.hh"

 namespace cache {

@@ -846,7 +845,7 @@ mutation_reader row_cache::make_nonpopulating_reader(schema_ptr schema, reader_p
            cache_entry& e = *i;
            upgrade_entry(e);
            tracing::trace(ts, "Reading partition {} from cache", pos);
-            return make_partition_snapshot_flat_reader<false, dummy_accounter>(
+            return replica::make_partition_snapshot_reader<false, dummy_accounter>(
                    schema,
                    std::move(permit),
                    e.key(),
@@ -1216,10 +1215,10 @@ future<> row_cache::invalidate(external_updater eu, const dht::decorated_key& dk
 }

 future<> row_cache::invalidate(external_updater eu, const dht::partition_range& range, cache_invalidation_filter filter) {
-    return invalidate(std::move(eu), utils::chunked_vector<dht::partition_range>({range}), std::move(filter));
+    return invalidate(std::move(eu), dht::partition_range_vector({range}), std::move(filter));
 }

-future<> row_cache::invalidate(external_updater eu, utils::chunked_vector<dht::partition_range>&& ranges, cache_invalidation_filter filter) {
+future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&& ranges, cache_invalidation_filter filter) {
    return do_update(std::move(eu), [this, ranges = std::move(ranges), filter = std::move(filter)] mutable {
        return seastar::async([this, ranges = std::move(ranges), filter = std::move(filter)] {
            auto on_failure = defer([this] () noexcept {
--- a/db/row_cache.hh
+++ b/db/row_cache.hh
@@ -17,7 +17,6 @@
 #include "utils/histogram.hh"
 #include "mutation/partition_version.hh"
 #include "utils/double-decker.hh"
-#include "utils/chunked_vector.hh"
 #include "db/cache_tracker.hh"
 #include "readers/empty.hh"
 #include "readers/mutation_source.hh"
@@ -458,7 +457,7 @@ public:
    // mutation source made prior to the call to invalidate().
    future<> invalidate(external_updater, const dht::decorated_key&);
    future<> invalidate(external_updater, const dht::partition_range& = query::full_partition_range, cache_invalidation_filter filter = [] (const auto&) { return true; });
-    future<> invalidate(external_updater, utils::chunked_vector<dht::partition_range>&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
+    future<> invalidate(external_updater, dht::partition_range_vector&&, cache_invalidation_filter filter = [] (const auto&) { return true; });

    // Evicts entries from cache.
    //
--- a/db/schema_applier.cc
+++ b/db/schema_applier.cc
@@ -1139,17 +1139,14 @@ future<> schema_applier::finalize_tables_and_views() {
    // was already dropped (see https://github.com/scylladb/scylla/issues/5614)
    for (auto& dropped_view : diff.tables_and_views.local().views.dropped) {
        auto s = dropped_view.get();
-        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }
    for (auto& dropped_table : diff.tables_and_views.local().tables.dropped) {
        auto s = dropped_table.get();
-        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }
    for (auto& dropped_cdc : diff.tables_and_views.local().cdc.dropped) {
        auto s = dropped_cdc.get();
-        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }

--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -105,7 +105,7 @@ namespace {
        schema_builder::register_schema_initializer([](schema_builder& builder) {
            if (builder.ks_name() == schema_tables::NAME) {
                // all schema tables are group0 tables
-                builder.set_is_group0_table();
+                builder.set_is_group0_table(true);
            }
        });
 }
--- a/db/size_estimates_virtual_reader.cc
+++ b/db/size_estimates_virtual_reader.cc
@@ -144,7 +144,7 @@ static std::vector<sstring> get_keyspaces(const schema& s, const replica::databa
 /**
 * Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
 */
-static dht::partition_range as_ring_position_range(const dht::token_range& r) {
+static dht::partition_range as_ring_position_range(dht::token_range& r) {
    std::optional<wrapping_interval<dht::ring_position>::bound> start_bound, end_bound;
    if (r.start()) {
        start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
@@ -156,14 +156,11 @@ static dht::partition_range as_ring_position_range(const dht::token_range& r) {
 }

 /**
- * Add a new range_estimates for the specified range, considering the sstables associated
- * with the table identified by `cf_id` across all shards.
+ * Add a new range_estimates for the specified range, considering the sstables associated with `cf`.
 */
-static future<system_keyspace::range_estimates> estimate(replica::database& db, table_id cf_id, schema_ptr schema, const token_range& r) {
-    struct shard_estimate {
-        int64_t count = 0;
-        utils::estimated_histogram hist{0};
-    };
+static future<system_keyspace::range_estimates> estimate(const replica::column_family& cf, const token_range& r) {
+    int64_t count{0};
+    utils::estimated_histogram hist{0};
    auto from_bytes = [] (auto& b) {
        return dht::token::from_sstring(utf8_type->to_string(b));
    };
@@ -172,35 +169,14 @@ static future<system_keyspace::range_estimates> estimate(replica::database& db,
        wrapping_interval<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
        dht::token_comparator(),
        [&] (auto&& rng) { ranges.push_back(std::move(rng)); });
-
-    // Estimate partition count and size distribution from sstables on a single shard.
-    auto estimate_on_shard = [cf_id, ranges] (replica::database& local_db) -> future<shard_estimate> {
-        auto table_ptr = local_db.get_tables_metadata().get_table_if_exists(cf_id);
-        if (!table_ptr) {
-            co_return shard_estimate{};
+    for (auto&& r : ranges) {
+        auto rp_range = as_ring_position_range(r);
+        for (auto&& sstable : cf.select_sstables(rp_range)) {
+            count += co_await sstable->estimated_keys_for_range(r);
+            hist.merge(sstable->get_stats_metadata().estimated_partition_size);
        }
-        auto& cf = *table_ptr;
-        shard_estimate result;
-        for (auto&& r : ranges) {
-            auto rp_range = as_ring_position_range(r);
-            for (auto&& sstable : cf.select_sstables(rp_range)) {
-                result.count += co_await sstable->estimated_keys_for_range(r);
-                result.hist.merge(sstable->get_stats_metadata().estimated_partition_size);
-            }
-        }
-        co_return result;
-    };
-
-    // Combine partial results from two shards.
-    auto reduce = [] (shard_estimate a, const shard_estimate& b) {
-        a.count += b.count;
-        a.hist.merge(b.hist);
-        return a;
-    };
-
-    auto aggregate = co_await db.container().map_reduce0(std::move(estimate_on_shard), shard_estimate{}, std::move(reduce));
-    int64_t mean_size = aggregate.count > 0 ? aggregate.hist.mean() : 0;
-    co_return system_keyspace::range_estimates{std::move(schema), r.start, r.end, aggregate.count, mean_size};
+    }
+    co_return system_keyspace::range_estimates{cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
 }

 /**
@@ -345,7 +321,7 @@ size_estimates_mutation_reader::estimates_for_current_keyspace(std::vector<token
        auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
        for (auto&& r : rows_to_estimate) {
            auto& cf = _db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
-            estimates.push_back(co_await estimate(_db, cf.schema()->id(), cf.schema(), r.tokens));
+            estimates.push_back(co_await estimate(cf, r.tokens));
            if (estimates.size() >= _slice.partition_row_limit()) {
                co_return estimates;
            }
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -87,15 +87,31 @@ namespace {
        static const std::unordered_set<sstring> tables = {
            schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
            system_keyspace::BROADCAST_KV_STORE,
+            system_keyspace::CDC_GENERATIONS_V3,
            system_keyspace::RAFT,
            system_keyspace::RAFT_SNAPSHOTS,
            system_keyspace::RAFT_SNAPSHOT_CONFIG,
            system_keyspace::GROUP0_HISTORY,
            system_keyspace::DISCOVERY,
+            system_keyspace::TABLETS,
+            system_keyspace::TOPOLOGY,
+            system_keyspace::TOPOLOGY_REQUESTS,
            system_keyspace::LOCAL,
            system_keyspace::PEERS,
+            system_keyspace::SCYLLA_LOCAL,
            system_keyspace::COMMITLOG_CLEANUPS,
+            system_keyspace::SERVICE_LEVELS_V2,
+            system_keyspace::VIEW_BUILD_STATUS_V2,
+            system_keyspace::CDC_STREAMS_STATE,
+            system_keyspace::CDC_STREAMS_HISTORY,
+            system_keyspace::ROLES,
+            system_keyspace::ROLE_MEMBERS,
+            system_keyspace::ROLE_ATTRIBUTES,
+            system_keyspace::ROLE_PERMISSIONS,
            system_keyspace::CDC_LOCAL,
+            system_keyspace::DICTS,
+            system_keyspace::VIEW_BUILDING_TASKS,
+            system_keyspace::CLIENT_ROUTES,
        };
        if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
            builder.enable_schema_commitlog();
@@ -127,7 +143,7 @@ namespace {
                system_keyspace::REPAIR_TASKS,
            };
            if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
-                builder.set_is_group0_table();
+                builder.set_is_group0_table(true);
            }
        });
 }
@@ -1698,9 +1714,7 @@ std::unordered_set<dht::token> decode_tokens(const set_type_impl::native_type& t
    std::unordered_set<dht::token> tset;
    for (auto& t: tokens) {
        auto str = value_cast<sstring>(t);
-        if (str != dht::token::from_sstring(str).to_sstring()) {
-            on_internal_error(slogger, format("decode_tokens: invalid token string '{}'", str));
-        }
+        SCYLLA_ASSERT(str == dht::token::from_sstring(str).to_sstring());
        tset.insert(dht::token::from_sstring(str));
    }
    return tset;
@@ -3177,7 +3191,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
                    };
                }
            } else if (must_have_tokens(nstate)) {
-                on_internal_error(slogger, format(
+                on_fatal_internal_error(slogger, format(
                        "load_topology_state: node {} in {} state but missing ring slice", host_id, nstate));
            }
        }
@@ -3259,7 +3273,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
            // Currently, at most one node at a time can be in transitioning state.
            if (!map->empty()) {
                const auto& [other_id, other_rs] = *map->begin();
-                on_internal_error(slogger, format(
+                on_fatal_internal_error(slogger, format(
                    "load_topology_state: found two nodes in transitioning state: {} in {} state and {} in {} state",
                    other_id, other_rs.state, host_id, nstate));
            }
@@ -3317,7 +3331,8 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
                format("SELECT count(range_end) as cnt FROM {}.{} WHERE key = '{}' AND id = ?",
                        NAME, CDC_GENERATIONS_V3, cdc::CDC_GENERATIONS_V3_KEY),
                gen_id.id);
-            if (!gen_rows || gen_rows->empty()) {
+            SCYLLA_ASSERT(gen_rows);
+            if (gen_rows->empty()) {
                on_internal_error(slogger, format(
                    "load_topology_state: last committed CDC generation time UUID ({}) present, but data missing", gen_id.id));
            }
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -215,8 +215,6 @@ public:
    static constexpr auto BUILT_VIEWS = "built_views";
    static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
    static constexpr auto CDC_LOCAL = "cdc_local";
-    static constexpr auto CDC_TIMESTAMPS = "cdc_timestamps";
-    static constexpr auto CDC_STREAMS = "cdc_streams";

    // auth
    static constexpr auto ROLES = "roles";
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -23,6 +23,7 @@

 #include <seastar/core/future-util.hh>
 #include <seastar/core/coroutine.hh>
+#include <seastar/coroutine/all.hh>
 #include <seastar/coroutine/maybe_yield.hh>
 #include <flat_map>

@@ -65,6 +66,7 @@
 #include "mutation/timestamp.hh"
 #include "utils/assert.hh"
 #include "utils/small_vector.hh"
+#include "view_builder.hh"
 #include "view_info.hh"
 #include "view_update_checks.hh"
 #include "types/list.hh"
@@ -930,7 +932,8 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
    const row& existing_row = existing.cells();
    const row& updated_row = update.cells();

-    return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row] (const column_definition& cdef) {
+    const bool base_has_nonexpiring_marker = update.marker().is_live() && !update.marker().is_expiring();
+    return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row, base_has_nonexpiring_marker] (const column_definition& cdef) {
        const auto view_it = _view->columns_by_name().find(cdef.name());
        const bool column_is_selected = view_it != _view->columns_by_name().end();

@@ -938,29 +941,49 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
        // as part of its PK, there are NO virtual columns corresponding to the unselected columns in the view.
        // Because of that, we don't generate view updates when the value in an unselected column is created
        // or changes.
-        if (!column_is_selected) {
+        if (!column_is_selected && _base_info.has_base_non_pk_columns_in_view_pk) {
            return true;
        }

-        // We cannot skip if the value was created or deleted
+        //TODO(sarna): Optimize the collections case - collections are currently excluded from this optimization
+        if (!cdef.is_atomic()) {
+            return false;
+        }
+
+        // We cannot skip if the value was created or deleted, unless we have a non-expiring marker
        const auto* existing_cell = existing_row.find_cell(cdef.id);
        const auto* updated_cell = updated_row.find_cell(cdef.id);
        if (existing_cell == nullptr || updated_cell == nullptr) {
-            return existing_cell == updated_cell;
+            return existing_cell == updated_cell || (!column_is_selected && base_has_nonexpiring_marker);
        }
-
-        if (!cdef.is_atomic()) {
-            return existing_cell->as_collection_mutation().data == updated_cell->as_collection_mutation().data;
-        }
-
        atomic_cell_view existing_cell_view = existing_cell->as_atomic_cell(cdef);
        atomic_cell_view updated_cell_view = updated_cell->as_atomic_cell(cdef);

        // We cannot skip when a selected column is changed
-        if (view_it->second->is_view_virtual()) {
-            return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
+        if (column_is_selected) {
+            if (view_it->second->is_view_virtual()) {
+                return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
+            }
+            return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
        }
-        return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
+
+        // With non-expiring row marker, liveness checks below are not relevant
+        if (base_has_nonexpiring_marker) {
+            return true;
+        }
+
+        if (existing_cell_view.is_live() != updated_cell_view.is_live()) {
+            return false;
+        }
+
+        // We cannot skip if the change updates TTL
+        const bool existing_has_ttl = existing_cell_view.is_live_and_has_ttl();
+        const bool updated_has_ttl = updated_cell_view.is_live_and_has_ttl();
+        if (existing_has_ttl || updated_has_ttl) {
+            return existing_has_ttl == updated_has_ttl && existing_cell_view.expiry() == updated_cell_view.expiry();
+        }
+
+        return true;
    });
 }

@@ -1581,11 +1604,9 @@ future<stop_iteration> view_update_builder::on_results() {

    auto tombstone = std::max(_update_partition_tombstone, _update_current_tombstone);
    if (tombstone && _existing && !_existing->is_end_of_partition()) {
-        if (_existing->is_range_tombstone_change()) {
-            _existing_current_tombstone = _existing->as_range_tombstone_change().tombstone();
-        } else if (_existing->is_clustering_row()) {
+        // We don't care if it's a range tombstone, as we're only looking for existing entries that get deleted
+        if (_existing->is_clustering_row()) {
            auto existing = clustering_row(*_schema, _existing->as_clustering_row());
-            existing.apply(std::max(_existing_partition_tombstone, _existing_current_tombstone));
            auto update = clustering_row(existing.key(), row_tombstone(std::move(tombstone)), row_marker(), ::row());
            generate_update(std::move(update), { std::move(existing) });
        } else if (_existing->is_static_row()) {
@@ -1596,10 +1617,9 @@ future<stop_iteration> view_update_builder::on_results() {
        return should_stop_updates() ? stop() : advance_existings();
    }

+    // If we have updates and it's a range tombstone, it removes nothing pre-existing, so we can ignore it
    if (_update && !_update->is_end_of_partition()) {
-        if (_update->is_range_tombstone_change()) {
-            _update_current_tombstone = _update->as_range_tombstone_change().tombstone();
-        } else if (_update->is_clustering_row()) {
+        if (_update->is_clustering_row()) {
            _update->mutate_as_clustering_row(*_schema, [&] (clustering_row& cr) mutable {
                cr.apply(std::max(_update_partition_tombstone, _update_current_tombstone));
            });
@@ -1731,7 +1751,7 @@ static endpoints_to_update get_view_natural_endpoint_vnodes(
        std::vector<std::reference_wrapper<const locator::node>> base_nodes,
        std::vector<std::reference_wrapper<const locator::node>> view_nodes,
        locator::endpoint_dc_rack my_location,
-        const bool network_topology,
+        const locator::network_topology_strategy* network_topology,
        replica::cf_stats& cf_stats) {
    using node_vector = std::vector<std::reference_wrapper<const locator::node>>;
    node_vector base_endpoints, view_endpoints;
@@ -1884,7 +1904,7 @@ endpoints_to_update get_view_natural_endpoint(
        locator::host_id me,
        const locator::effective_replication_map_ptr& base_erm,
        const locator::effective_replication_map_ptr& view_erm,
-        const bool network_topology,
+        const locator::abstract_replication_strategy& replication_strategy,
        const dht::token& base_token,
        const dht::token& view_token,
        bool use_tablets,
@@ -1892,6 +1912,7 @@ endpoints_to_update get_view_natural_endpoint(
    auto& topology = base_erm->get_token_metadata_ptr()->get_topology();
    auto& view_topology = view_erm->get_token_metadata_ptr()->get_topology();
    auto& my_location = topology.get_location(me);
+    auto* network_topology = dynamic_cast<const locator::network_topology_strategy*>(&replication_strategy);

    auto resolve = [&] (const locator::topology& topology, const locator::host_id& ep, bool is_view) -> const locator::node& {
        if (auto* np = topology.find_node(ep)) {
@@ -1925,7 +1946,7 @@ endpoints_to_update get_view_natural_endpoint(
                // view pairing as the leaving base replica.
                // note that the recursive call will not recurse again because leaving_base is in base_nodes.
                auto leaving_base = it->get().host_id();
-                return get_view_natural_endpoint(leaving_base, base_erm, view_erm, network_topology, base_token,
+                return get_view_natural_endpoint(leaving_base, base_erm, view_erm, replication_strategy, base_token,
                        view_token, use_tablets, cf_stats);
            }
        }
@@ -2021,9 +2042,7 @@ future<> view_update_generator::mutate_MV(
        wait_for_all_updates wait_for_all)
 {
    auto& ks = _db.find_keyspace(base->ks_name());
-    const bool uses_tablets = ks.uses_tablets();
-    const bool uses_nts = dynamic_cast<const locator::network_topology_strategy*>(&ks.get_replication_strategy()) != nullptr;
-    // The object pointed by `ks` may disappear after preeemption. It should not be touched again after this comment.
+    auto& replication = ks.get_replication_strategy();
    std::unordered_map<table_id, locator::effective_replication_map_ptr> erms;
    auto get_erm = [&] (table_id id) {
        auto it = erms.find(id);
@@ -2042,8 +2061,8 @@ future<> view_update_generator::mutate_MV(
    co_await max_concurrent_for_each(view_updates, max_concurrent_updates, [&] (frozen_mutation_and_schema mut) mutable -> future<> {
        auto view_token = dht::get_token(*mut.s, mut.fm.key());
        auto view_ermp = erms.at(mut.s->id());
-        auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, uses_nts, base_token, view_token,
-                uses_tablets, cf_stats);
+        auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, replication, base_token, view_token,
+                ks.uses_tablets(), cf_stats);
        auto remote_endpoints = view_ermp->get_pending_replicas(view_token);
        auto memory_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_update_memory_units.split(memory_usage_of(mut)));
        if (no_pairing_endpoint) {
@@ -2221,12 +2240,20 @@ void view_builder::setup_metrics() {
 }

 future<> view_builder::start_in_background(service::migration_manager& mm, utils::cross_shard_barrier barrier) {
+    auto step_fiber = make_ready_future<>();
    try {
        view_builder_init_state vbi;
        auto fail = defer([&barrier] mutable { barrier.abort(); });
-        // Guard the whole startup routine with a semaphore,
-        // so that it's not intercepted by `on_drop_view`, `on_create_view`
-        // or `on_update_view` events.
+        // Semaphore usage invariants:
+        // - One unit of _sem serializes all per-shard bookkeeping that mutates view-builder state
+        //   (_base_to_build_step, _built_views, build_status, reader resets).
+        // - The unit is held for the whole operation, including the async chain, until the state
+        //   is stable for the next operation on that shard.
+        // - Cross-shard operations acquire _sem on shard 0 for the duration of the broadcast.
+        //   Other shards acquire their own _sem only around their local handling; shard 0 skips
+        //   the local acquire because it already holds the unit from the dispatcher.
+        // Guard the whole startup routine with a semaphore so that it's not intercepted by
+        // `on_drop_view`, `on_create_view`, or `on_update_view` events.
        auto units = co_await get_units(_sem, view_builder_semaphore_units);
        // Wait for schema agreement even if we're a seed node.
        co_await mm.wait_for_schema_agreement(_db, db::timeout_clock::time_point::max(), &_as);
@@ -2247,8 +2274,10 @@ future<> view_builder::start_in_background(service::migration_manager& mm, utils
        _mnotifier.register_listener(this);
        co_await calculate_shard_build_step(vbi);
        _current_step = _base_to_build_step.begin();
-        // Waited on indirectly in stop().
-        (void)_build_step.trigger();
+
+        // If the preparation above fails, run_in_background() is not invoked;
+        // start_in_background() just logs the failure and resolves.
+        step_fiber = run_in_background();
    } catch (...) {
        auto ex = std::current_exception();
        auto ll = log_level::error;
@@ -2263,10 +2292,12 @@ future<> view_builder::start_in_background(service::migration_manager& mm, utils
        }
        vlogger.log(ll, "start aborted: {}", ex);
    }
+
+    co_await std::move(step_fiber);
 }

 future<> view_builder::start(service::migration_manager& mm, utils::cross_shard_barrier barrier) {
-    _started = start_in_background(mm, std::move(barrier));
+    _step_fiber = start_in_background(mm, std::move(barrier));
    return make_ready_future<>();
 }

@@ -2276,12 +2307,12 @@ future<> view_builder::drain() {
    }
    vlogger.info("Draining view builder");
    _as.request_abort();
-    co_await std::move(_started);
    co_await _mnotifier.unregister_listener(this);
    co_await _vug.drain();
    co_await _sem.wait();
    _sem.broken();
-    co_await _build_step.join();
+    _build_step.broken();
+    co_await std::move(_step_fiber);
    co_await coroutine::parallel_for_each(_base_to_build_step, [] (std::pair<const table_id, build_step>& p) {
        return p.second.reader.close();
    });
@@ -2650,63 +2681,59 @@ static bool should_ignore_tablet_keyspace(const replica::database& db, const sst
    return db.features().view_building_coordinator && db.has_keyspace(ks_name) && db.find_keyspace(ks_name).uses_tablets();
 }

-future<> view_builder::dispatch_create_view(sstring ks_name, sstring view_name) {
-    if (should_ignore_tablet_keyspace(_db, ks_name)) {
-        return make_ready_future<>();
-    }
-    return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-        // This runs on shard 0 only; seed the global rows before broadcasting.
-        return handle_seed_view_build_progress(ks_name, view_name).then([this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return container().invoke_on_all([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable {
-                return vb.handle_create_view_local(std::move(ks_name), std::move(view_name));
-            });
-        });
-    });
+future<view_builder::view_builder_units> view_builder::get_or_adopt_view_builder_lock(view_builder_units_opt units) {
+    co_return units ? std::move(*units) : co_await get_units(_sem, view_builder_semaphore_units);
 }

-future<> view_builder::handle_seed_view_build_progress(sstring ks_name, sstring view_name) {
+future<> view_builder::dispatch_create_view(sstring ks_name, sstring view_name) {
+    if (should_ignore_tablet_keyspace(_db, ks_name)) {
+        co_return;
+    }
+
+    auto units = co_await get_or_adopt_view_builder_lock(std::nullopt);
+    co_await handle_seed_view_build_progress(ks_name, view_name);
+
+    co_await coroutine::all(
+        [this, ks_name, view_name, units = std::move(units)] mutable -> future<> {
+            co_await handle_create_view_local(ks_name, view_name, std::move(units)); },
+        [this, ks_name, view_name] mutable -> future<> {
+            co_await container().invoke_on_others([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable -> future<> {
+                return vb.handle_create_view_local(ks_name, view_name, std::nullopt); }); });
+}
+
+future<> view_builder::handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name) {
    auto view = view_ptr(_db.find_schema(ks_name, view_name));
    auto& step = get_or_create_build_step(view->view_info()->base_id());
    return _sys_ks.register_view_for_building_for_all_shards(view->ks_name(), view->cf_name(), step.current_token());
 }

-future<> view_builder::handle_create_view_local(sstring ks_name, sstring view_name){
-    if (this_shard_id() == 0) { 
-        return handle_create_view_local_impl(std::move(ks_name), std::move(view_name));
-    } else {
-        return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return handle_create_view_local_impl(std::move(ks_name), std::move(view_name));
-        });
-    }
-}
-
-future<> view_builder::handle_create_view_local_impl(sstring ks_name, sstring view_name) {
+future<> view_builder::handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units) {
+    [[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::move(units));
    auto view = view_ptr(_db.find_schema(ks_name, view_name));
    auto& step = get_or_create_build_step(view->view_info()->base_id());
-    return when_all(step.base->await_pending_writes(), step.base->await_pending_streams()).discard_result().then([this, &step] {
-        return flush_base(step.base, _as);
-    }).then([this, view, &step] () {
+    try {
+        co_await coroutine::all(
+            [&step] -> future<> {
+                co_await step.base->await_pending_writes(); },
+            [&step] -> future<> {
+                co_await step.base->await_pending_streams(); });
+        co_await flush_base(step.base, _as);
+    
        // This resets the build step to the current token. It may result in views currently
        // being built to receive duplicate updates, but it simplifies things as we don't have
        // to keep around a list of new views to build the next time the reader crosses a token
        // threshold.
-        return initialize_reader_at_current_token(step).then([this, view, &step] () mutable {
-            return add_new_view(view, step);
-        }).then_wrapped([this, view] (future<>&& f) {
-            try {
-                f.get();
-            } catch (abort_requested_exception&) {
-                vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
-            } catch (raft::request_aborted&) {
-                vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
-            } catch (...) {
-                vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
-            }
+        co_await initialize_reader_at_current_token(step);
+        co_await add_new_view(view, step);
+    } catch (abort_requested_exception&) {
+        vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+    } catch (raft::request_aborted&) {
+        vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+    } catch (...) {
+        vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
+    }

-            // Waited on indirectly in stop().
-            static_cast<void>(_build_step.trigger());
-        });
-    });
+    _build_step.signal();
 }

 void view_builder::on_create_view(const sstring& ks_name, const sstring& view_name) {
@@ -2743,62 +2770,55 @@ void view_builder::on_update_view(const sstring& ks_name, const sstring& view_na

 future<> view_builder::dispatch_drop_view(sstring ks_name, sstring view_name) {
    if (should_ignore_tablet_keyspace(_db, ks_name)) {
-        return make_ready_future<>();
+        co_return;
    }

-    return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-        // This runs on shard 0 only; broadcast local cleanup before global cleanup.
-        return container().invoke_on_all([ks_name, view_name] (view_builder& vb) mutable {
-            return vb.handle_drop_view_local(std::move(ks_name), std::move(view_name));
-        }).then([this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return handle_drop_view_global_cleanup(std::move(ks_name), std::move(view_name));
-        });
-    });
+    auto units = co_await get_or_adopt_view_builder_lock(std::nullopt);
+
+    co_await coroutine::all(
+        [this, ks_name, view_name, units = std::move(units)] mutable -> future<> {
+            co_await handle_drop_view_local(ks_name, view_name, std::move(units)); },
+        [this, ks_name, view_name] mutable -> future<> {
+            co_await container().invoke_on_others([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable -> future<> {
+                return vb.handle_drop_view_local(ks_name, view_name, std::nullopt); });});
+    co_await handle_drop_view_global_cleanup(ks_name, view_name);
 }

-future<> view_builder::handle_drop_view_local(sstring ks_name, sstring view_name) {
-    if (this_shard_id() == 0) { 
-        return handle_drop_view_local_impl(std::move(ks_name), std::move(view_name));
-    } else {
-        return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return handle_drop_view_local_impl(std::move(ks_name), std::move(view_name));
-        });
-    }
-}
-
-future<> view_builder::handle_drop_view_local_impl(sstring ks_name, sstring view_name) {
+future<> view_builder::handle_drop_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units) {
+    [[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::move(units));
    vlogger.info0("Stopping to build view {}.{}", ks_name, view_name);
-    // The view is absent from the database at this point, so find it by brute force.
-    ([&, this] {
-        for (auto& [_, step] : _base_to_build_step) {
-            if (step.build_status.empty() || step.build_status.front().view->ks_name() != ks_name) {
-                continue;
-            }
-            for (auto it = step.build_status.begin(); it != step.build_status.end(); ++it) {
-                if (it->view->cf_name() == view_name) {
-                    _built_views.erase(it->view->id());
-                    step.build_status.erase(it);
-                    return;
-                }
+
+    for (auto& [_, step] : _base_to_build_step) {
+        if (step.build_status.empty() || step.build_status.front().view->ks_name() != ks_name) {
+            continue;
+        }
+        for (auto it = step.build_status.begin(); it != step.build_status.end(); ++it) {
+            if (it->view->cf_name() == view_name) {
+                _built_views.erase(it->view->id());
+                step.build_status.erase(it);
+                co_return;
            }
        }
-    })();
-    return make_ready_future<>();  
+    }
 }

-future<> view_builder::handle_drop_view_global_cleanup(sstring ks_name, sstring view_name) {
+future<> view_builder::handle_drop_view_global_cleanup(const sstring& ks_name, const sstring& view_name) {
    if (this_shard_id() != 0) {
-        return make_ready_future<>();
+        co_return;
    }
    vlogger.info0("Starting view global cleanup {}.{}", ks_name, view_name);
-    return when_all_succeed(
-                _sys_ks.remove_view_build_progress_across_all_shards(ks_name, view_name),
-                _sys_ks.remove_built_view(ks_name, view_name),
-                remove_view_build_status(ks_name, view_name))
-                    .discard_result()
-                    .handle_exception([ks_name, view_name] (std::exception_ptr ep) {
-        vlogger.warn("Failed to cleanup view {}.{}: {}", ks_name, view_name, ep);
-    });
+    
+    try {
+        co_await coroutine::all(
+            [this, &ks_name, &view_name] -> future<>  {
+                co_await _sys_ks.remove_view_build_progress_across_all_shards(ks_name, view_name); },
+            [this, &ks_name, &view_name] -> future<>  {
+                co_await _sys_ks.remove_built_view(ks_name, view_name); },
+            [this, &ks_name, &view_name] -> future<>  {
+                co_await remove_view_build_status(ks_name, view_name); });
+    } catch (...) {
+        vlogger.warn("Failed to cleanup view {}.{}: {}", ks_name, view_name, std::current_exception());
+    }
 }

 void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name) {
@@ -2812,14 +2832,15 @@ void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name
    }));
 }

-future<> view_builder::do_build_step() {
-    // Run the view building in the streaming scheduling group
-    // so that it doesn't impact other tasks with higher priority.
-    seastar::thread_attributes attr;
-    attr.sched_group = _db.get_streaming_scheduling_group();
-    return seastar::async(std::move(attr), [this] {
+future<> view_builder::run_in_background() {
+    return seastar::async([this] {
        exponential_backoff_retry r(1s, 1min);
-        while (!_base_to_build_step.empty() && !_as.abort_requested()) {
+        while (!_as.abort_requested()) {
+            try {
+                _build_step.wait([this] { return !_base_to_build_step.empty(); }).get();
+            } catch (const seastar::broken_condition_variable&) {
+                return;
+            }
            auto units = get_units(_sem, view_builder_semaphore_units).get();
            ++_stats.steps_performed;
            try {
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -303,7 +303,7 @@ endpoints_to_update get_view_natural_endpoint(
    locator::host_id node,
    const locator::effective_replication_map_ptr& base_erm,
    const locator::effective_replication_map_ptr& view_erm,
-    const bool network_topology,
+    const locator::abstract_replication_strategy& replication_strategy,
    const dht::token& base_token,
    const dht::token& view_token,
    bool use_tablets,
--- a/db/view/view_builder.hh
+++ b/db/view/view_builder.hh
@@ -11,13 +11,13 @@
 #include "query/query-request.hh"
 #include "service/migration_listener.hh"
 #include "service/raft/raft_group0_client.hh"
-#include "utils/serialized_action.hh"
 #include "utils/cross-shard-barrier.hh"
 #include "replica/database.hh"

 #include <seastar/core/abort_source.hh>
 #include <seastar/core/future.hh>
 #include <seastar/core/semaphore.hh>
+#include <seastar/core/condition-variable.hh>
 #include <seastar/core/sharded.hh>
 #include <seastar/core/shared_future.hh>
 #include <seastar/core/shared_ptr.hh>
@@ -104,6 +104,12 @@ class view_update_generator;
 *            redo the missing step, for simplicity.
 */
 class view_builder final : public service::migration_listener::only_view_notifications, public seastar::peering_sharded_service<view_builder> {
+    // Alias for the semaphore units type used throughout the class.
+    using view_builder_units = semaphore_units<named_semaphore_exception_factory>;
+
+    // Alias for an optional view_builder_units value, used throughout the class.
+    using view_builder_units_opt = std::optional<view_builder_units>;
+
    /**
     * Keeps track of the build progress for a particular view.
     * When the view is built, next_token == first_token.
@@ -168,14 +174,24 @@ class view_builder final : public service::migration_listener::only_view_notific
    reader_permit _permit;
    base_to_build_step_type _base_to_build_step;
    base_to_build_step_type::iterator _current_step = _base_to_build_step.end();
-    serialized_action _build_step{std::bind(&view_builder::do_build_step, this)};
+    condition_variable _build_step;
    static constexpr size_t view_builder_semaphore_units = 1;
    // Ensures bookkeeping operations are serialized, meaning that while we execute
    // a build step we don't consider newly added or removed views. This simplifies
    // the algorithms. Also synchronizes an operation wrt. a call to stop().
+    // Semaphore usage invariants:
+    // - One unit of _sem serializes all per-shard bookkeeping that mutates view-builder state
+    //   (_base_to_build_step, _built_views, build_status, reader resets).
+    // - The unit is held for the whole operation, including the async chain, until the state
+    //   is stable for the next operation on that shard.
+    // - Cross-shard operations acquire _sem on shard 0 for the duration of the broadcast.
+    //   Other shards acquire their own _sem only around their local handling; shard 0 skips
+    //   the local acquire because it already holds the unit from the dispatcher.
+    // Guard the whole startup routine with a semaphore so that it's not intercepted by
+    // `on_drop_view`, `on_create_view`, or `on_update_view` events.
    seastar::named_semaphore _sem{view_builder_semaphore_units, named_semaphore_exception_factory{"view builder"}};
    seastar::abort_source _as;
-    future<> _started = make_ready_future<>();
+    future<> _step_fiber = make_ready_future<>();
    // Used to coordinate between shards the conclusion of the build process for a particular view.
    std::unordered_set<table_id> _built_views;
    // Used for testing.
@@ -262,19 +278,18 @@ private:
    void setup_shard_build_step(view_builder_init_state& vbi, std::vector<system_keyspace_view_name>, std::vector<system_keyspace_view_build_progress>);
    future<> calculate_shard_build_step(view_builder_init_state& vbi);
    future<> add_new_view(view_ptr, build_step&);
-    future<> do_build_step();
+    future<> run_in_background();
    void execute(build_step&, exponential_backoff_retry);
    future<> maybe_mark_view_as_built(view_ptr, dht::token);
    future<> mark_as_built(view_ptr);
    void setup_metrics();
    future<> dispatch_create_view(sstring ks_name, sstring view_name);
    future<> dispatch_drop_view(sstring ks_name, sstring view_name);
-    future<> handle_seed_view_build_progress(sstring ks_name, sstring view_name);
-    future<> handle_create_view_local(sstring ks_name, sstring view_name);
-    future<> handle_drop_view_local(sstring ks_name, sstring view_name);
-    future<> handle_create_view_local_impl(sstring ks_name, sstring view_name);
-    future<> handle_drop_view_local_impl(sstring ks_name, sstring view_name);
-    future<> handle_drop_view_global_cleanup(sstring ks_name, sstring view_name);
+    future<> handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name);
+    future<> handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
+    future<> handle_drop_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
+    future<> handle_drop_view_global_cleanup(const sstring& ks_name, const sstring& view_name);
+    future<view_builder_units> get_or_adopt_view_builder_lock(view_builder_units_opt units);

    template <typename Func1, typename Func2>
    future<> write_view_build_status(Func1&& fn_group0, Func2&& fn_sys_dist) {
--- a/db/view/view_building_worker.cc
+++ b/db/view/view_building_worker.cc
@@ -200,7 +200,9 @@ future<> view_building_worker::run_staging_sstables_registrator() {
    while (!_as.abort_requested()) {
        bool sleep = false;
        try {
+            auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
            co_await create_staging_sstable_tasks();
+            lock.return_all();
            _as.check();
            co_await _sstables_to_register_event.when();
        } catch (semaphore_aborted&) {
@@ -225,45 +227,13 @@ future<> view_building_worker::run_staging_sstables_registrator() {
    }
 }

-future<std::vector<foreign_ptr<semaphore_units<>>>> view_building_worker::lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards) {
-    SCYLLA_ASSERT(this_shard_id() == 0);
-    // Collect `_staging_sstables_mutex` locks from multiple shards,
-    // so other shards won't interact with their `_staging_sstables` map
-    // until the caller releases them.
-    std::vector<foreign_ptr<semaphore_units<>>> locks;
-    locks.resize(smp::count);
-    // Locks are acquired from multiple shards in parallel.
-    // This is the only place where multiple-shard locks are acquired at once
-    // and the method is called only once at a time (from `create_staging_sstable_tasks()`
-    // on shard 0), so no deadlock may occur.
-    co_await coroutine::parallel_for_each(shards, [&locks, &sharded_vbw = container()] (auto shard_id) -> future<> {
-        auto lock_ptr = co_await smp::submit_to(shard_id, [&sharded_vbw] () -> future<foreign_ptr<semaphore_units<>>> {
-            auto& vbw = sharded_vbw.local();
-            auto lock = co_await get_units(vbw._staging_sstables_mutex, 1, vbw._as);
-            co_return make_foreign(std::move(lock));
-        });
-        locks[shard_id] = std::move(lock_ptr);
-    });
-    co_return std::move(locks);
-}
-
 future<> view_building_worker::create_staging_sstable_tasks() {
-    // Explicitly lock shard0 beforehand to prevent other shards from modifying `_sstables_to_register` from `register_staging_sstable_tasks()`
-    auto lock0 = co_await get_units(_staging_sstables_mutex, 1, _as);
-
    if (_sstables_to_register.empty()) {
        co_return;
    }

-    auto shards = _sstables_to_register 
-        | std::views::values 
-        | std::views::join 
-        | std::views::transform([] (const auto& sst_info) { return sst_info.shard; }) 
-        | std::ranges::to<std::flat_set<shard_id>>();
-    shards.erase(0); // We're already holding shard0 lock
-    auto locks = co_await lock_staging_mutex_on_multiple_shards(std::move(shards));
-
    utils::chunked_vector<canonical_mutation> cmuts;
+
    auto guard = co_await _group0.client().start_operation(_as);
    auto my_host_id = _db.get_token_metadata().get_topology().my_host_id();
    for (auto& [table_id, sst_infos]: _sstables_to_register) {
@@ -272,7 +242,7 @@ future<> view_building_worker::create_staging_sstable_tasks() {
                utils::UUID_gen::get_time_UUID(), view_building_task::task_type::process_staging, false,
                table_id, ::table_id{}, {my_host_id, sst_info.shard}, sst_info.last_token
            };
-            auto mut = co_await _group0.client().sys_ks().make_view_building_task_mutation(guard.write_timestamp(), task);
+            auto mut = co_await _sys_ks.make_view_building_task_mutation(guard.write_timestamp(), task);
            cmuts.emplace_back(std::move(mut));
        }
    }
@@ -416,7 +386,6 @@ future<> view_building_worker::update_built_views() {
        auto schema = _db.find_schema(table_id);
        return std::make_pair(schema->ks_name(), schema->cf_name());
    };
-    auto& sys_ks = _group0.client().sys_ks();

    std::set<std::pair<sstring, sstring>> built_views;
    for (auto& [id, statuses]: _vb_state_machine.views_state.status_map) {
@@ -425,22 +394,22 @@ future<> view_building_worker::update_built_views() {
        }
    }

-    auto local_built = co_await sys_ks.load_built_views() | std::views::filter([&] (auto& v) {
+    auto local_built = co_await _sys_ks.load_built_views() | std::views::filter([&] (auto& v) {
        return !_db.has_keyspace(v.first) || _db.find_keyspace(v.first).uses_tablets();
    }) | std::ranges::to<std::set>();

    // Remove dead entries
    for (auto& view: local_built) {
        if (!built_views.contains(view)) {
-            co_await sys_ks.remove_built_view(view.first, view.second);
+            co_await _sys_ks.remove_built_view(view.first, view.second);
        }
    }

    // Add new entries
    for (auto& view: built_views) {
        if (!local_built.contains(view)) {
-            co_await sys_ks.mark_view_as_built(view.first, view.second);
-            co_await sys_ks.remove_view_build_progress_across_all_shards(view.first, view.second);
+            co_await _sys_ks.mark_view_as_built(view.first, view.second);
+            co_await _sys_ks.remove_view_build_progress_across_all_shards(view.first, view.second);
        }
    }
 }
@@ -702,34 +671,24 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
 }

 future<> view_building_worker::do_process_staging(table_id table_id, dht::token last_token) {
+    if (_staging_sstables[table_id].empty()) {
+        co_return;
+    }
+
    auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
+    auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
+    auto tid = tablet_map.get_tablet_id(last_token);
+    auto tablet_range = tablet_map.get_token_range(tid);
+
+    // Select sstables belonging to the tablet (identified by `last_token`)
    std::vector<sstables::shared_sstable> sstables_to_process;
-
-    try {
-        // Acquire `_staging_sstables_mutex` to prevent `create_staging_sstable_tasks()` from
-        // concurrently modifying `_staging_sstables` (moving entries from `_sstables_to_register`)
-        // while we read them.
-        auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
-        auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
-        auto tid = tablet_map.get_tablet_id(last_token);
-        auto tablet_range = tablet_map.get_token_range(tid);
-
-        // Select sstables belonging to the tablet (identified by `last_token`)
-        for (auto& sst: _staging_sstables[table_id]) {
-            auto sst_last_token = sst->get_last_decorated_key().token();
-            if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
-                sstables_to_process.push_back(sst);
-            }
+    for (auto& sst: _staging_sstables[table_id]) {
+        auto sst_last_token = sst->get_last_decorated_key().token();
+        if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
+            sstables_to_process.push_back(sst);
        }
-        lock.return_all();
-    } catch (semaphore_aborted&) {
-        vbw_logger.warn("Semaphore was aborted while waiting to removed processed sstables for table {}", table_id);
-        co_return;
    }

-    if (sstables_to_process.empty()) {
-        co_return;
-    }
    co_await _vug.process_staging_sstables(std::move(table), sstables_to_process);

    try {
--- a/db/view/view_building_worker.hh
+++ b/db/view/view_building_worker.hh
@@ -14,7 +14,6 @@
 #include <seastar/core/shared_future.hh>
 #include <unordered_map>
 #include <unordered_set>
-#include <flat_set>
 #include "locator/abstract_replication_strategy.hh"
 #include "locator/tablets.hh"
 #include "raft/raft.hh"
@@ -170,15 +169,10 @@ private:
    future<> do_process_staging(table_id base_id, dht::token last_token);

    future<> run_staging_sstables_registrator();
-    // Acquires `_staging_sstables_mutex` on all shards internally,
-    // so callers must not hold `_staging_sstables_mutex` when invoking it.
+    // The caller must hold units from `_staging_sstables_mutex` while calling this.
    future<> create_staging_sstable_tasks();
    future<> discover_existing_staging_sstables();
    std::unordered_map<table_id, std::vector<staging_sstable_task_info>> discover_local_staging_sstables(building_tasks building_tasks);
-    // Acquire `_staging_sstables_mutex` on multiple shards in parallel.
-    // Must be called only from shard 0.
-    // Must be called ONLY by `create_staging_sstable_tasks()` and only once at a time to avoid deadlock.
-    future<std::vector<foreign_ptr<semaphore_units<>>>> lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards);

    void init_messaging_service();
    future<> uninit_messaging_service();
--- a/db/virtual_tables.cc
+++ b/db/virtual_tables.cc
@@ -1345,8 +1345,8 @@ public:

 private:
    static schema_ptr build_schema() {
-        auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS);
-        return schema_builder(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS, std::make_optional(id))
+        auto id = generate_legacy_id(system_keyspace::NAME, "cdc_timestamps");
+        return schema_builder(system_keyspace::NAME, "cdc_timestamps", std::make_optional(id))
            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
            .with_column("table_name", utf8_type, column_kind::partition_key)
            .with_column("timestamp", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
@@ -1428,8 +1428,8 @@ public:
    }
 private:
    static schema_ptr build_schema() {
-        auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_STREAMS);
-        return schema_builder(system_keyspace::NAME, system_keyspace::CDC_STREAMS, std::make_optional(id))
+        auto id = generate_legacy_id(system_keyspace::NAME, "cdc_streams");
+        return schema_builder(system_keyspace::NAME, "cdc_streams", std::make_optional(id))
            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
            .with_column("table_name", utf8_type, column_kind::partition_key)
            .with_column("timestamp", timestamp_type, column_kind::clustering_key)
--- a/debug.cc
+++ b/debug.cc
@@ -11,5 +11,6 @@
 namespace debug {

 seastar::sharded<replica::database>* volatile the_database = nullptr;
+seastar::scheduling_group streaming_scheduling_group;

 }
--- a/debug.hh
+++ b/debug.hh
@@ -17,7 +17,7 @@ class database;
 namespace debug {

 extern seastar::sharded<replica::database>* volatile the_database;
-
+extern seastar::scheduling_group streaming_scheduling_group;

 }

--- a/dht/i_partitioner.cc
+++ b/dht/i_partitioner.cc
@@ -352,16 +352,6 @@ dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& r
    return prs;
 }

-future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges) {
-    utils::chunked_vector<dht::partition_range> prs;
-    prs.reserve(ranges.size());
-    for (auto& range : ranges) {
-        prs.push_back(dht::to_partition_range(range));
-        co_await coroutine::maybe_yield();
-    }
-    co_return prs;
-}
-
 std::map<unsigned, dht::partition_range_vector>
 split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& raw_sharder) {
    std::map<unsigned, dht::partition_range_vector> ret;
@@ -374,11 +364,11 @@ split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& r
    return ret;
 }

-future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> source_ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract) {
+future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& source_ranges, dht::partition_range_vector ranges_to_subtract) {
    auto cmp = dht::ring_position_comparator(schema);
    // optimize set of potentially overlapping ranges by deoverlapping them.
-    auto ranges = dht::partition_range::deoverlap(std::move(source_ranges), cmp);
-    utils::chunked_vector<dht::partition_range> res;
+    auto ranges = dht::partition_range::deoverlap(source_ranges, cmp);
+    dht::partition_range_vector res;
    res.reserve(ranges.size() * 2);

    auto range = ranges.begin();
--- a/dht/i_partitioner.hh
+++ b/dht/i_partitioner.hh
@@ -91,7 +91,6 @@ inline token get_token(const schema& s, partition_key_view key) {

 dht::partition_range to_partition_range(dht::token_range);
 dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& ranges, utils::can_yield can_yield = utils::can_yield::no);
-future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges);

 // Each shard gets a sorted, disjoint vector of ranges
 std::map<unsigned, dht::partition_range_vector>
@@ -106,7 +105,7 @@ std::unique_ptr<dht::i_partitioner> make_partitioner(sstring name);
 // Returns a sorted and deoverlapped list of ranges that are
 // the result of subtracting all ranges from ranges_to_subtract.
 // ranges_to_subtract must be sorted and deoverlapped.
-future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract);
+future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& ranges, dht::partition_range_vector ranges_to_subtract);

 // Returns a token_range vector split based on the given number of most-significant bits
 dht::token_range_vector split_token_range_msb(unsigned most_significant_bits);
--- a/Show More
+++ b/Show More