compaction_manager: cancel submission timer on drain

The `drain` method cancels all running compactions and moves the compaction manager into the disabled state. To move it back to the enabled state, the `enable` method must be called. This, however, trips an assertion, because the submission timer is not cancelled and re-enabling the manager tries to arm the already-armed timer. Thus, cancel the timer when calling the `drain` method to disable the compaction manager. Fixes https://github.com/scylladb/scylladb/issues/24504 All versions are affected, so it's a good candidate for a backport. Closes scylladb/scylladb#24505 (cherry picked from commit a9a53d9178) Closes scylladb/scylladb#24585
Merge '[Backport 6.2] cql: create default superuser if it doesn't exist' from Marcin Maliszkiewicz
2025-06-29 14:40:43 +03:00 · 2025-06-29 14:34:55 +03:00 · 2025-06-28 09:40:37 +03:00 · 2025-06-27 17:50:15 +02:00 · 2025-06-27 17:50:08 +02:00 · 2025-06-27 17:50:01 +02:00
1024 changed files with 36780 additions and 11465 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,225 @@
+---
+Language: Cpp
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: None
+AlignConsecutiveAssignments:
+  Enabled: false
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCompound: false
+  PadOperators: true
+AlignConsecutiveBitFields:
+  Enabled: false
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCompound: false
+  PadOperators: false
+AlignConsecutiveDeclarations:
+  Enabled: false
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCompound: false
+  PadOperators: false
+AlignConsecutiveMacros:
+  Enabled: false
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCompound: false
+  PadOperators: false
+AlignConsecutiveShortCaseStatements:
+  Enabled: false
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCaseColons: false
+AlignEscapedNewlines: Right
+AlignOperands: Align
+AlignTrailingComments:
+  Kind: Always
+  OverEmptyLines: 0
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortEnumsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: InlineOnly
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+AttributeMacros:
+  - __capability
+BinPackArguments: false
+BinPackParameters: false
+BitFieldColonSpacing: Both
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: false
+  AfterControlStatement: Never
+  AfterEnum: false
+  AfterExternBlock: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  BeforeCatch: false
+  BeforeElse: false
+  BeforeLambdaBody: false
+  BeforeWhile: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakAfterAttributes: Never
+BreakAfterJavaFieldAnnotations: false
+BreakArrays: true
+BreakBeforeBinaryOperators: None
+BreakBeforeConceptDeclarations: Always
+BreakBeforeBraces: Attach
+BreakBeforeInlineASMColon: OnlyMultiline
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakInheritanceList: BeforeColon
+BreakStringLiterals: true
+ColumnLimit: 160
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+EmptyLineAfterAccessModifier: Never
+EmptyLineBeforeAccessModifier: LogicalBlock
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IfMacros:
+  - KJ_IF_MAYBE
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority: 2
+    SortPriority: 0
+    CaseSensitive: false
+  - Regex: '^(<|"(gtest|gmock|isl|json)/)'
+    Priority: 3
+    SortPriority: 0
+    CaseSensitive: false
+  - Regex: '.*'
+    Priority: 1
+    SortPriority: 0
+    CaseSensitive: false
+IncludeIsMainRegex: '(Test)?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentCaseBlocks: false
+IndentCaseLabels: false
+IndentExternBlock: AfterExternBlock
+IndentGotoLabels: true
+IndentPPDirectives: None
+IndentRequiresClause: true
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+InsertBraces: false
+InsertNewlineAtEOF: true
+InsertTrailingCommas: None
+IntegerLiteralSeparator:
+  Binary: 0
+  BinaryMinDigits: 0
+  Decimal: 0
+  DecimalMinDigits: 0
+  Hex: 0
+  HexMinDigits: 0
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+KeepEmptyLinesAtEOF: false
+LambdaBodyIndentation: Signature
+LineEnding: DeriveLF
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 2
+NamespaceIndentation: None
+PackConstructorInitializers: NextLine
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakOpenParenthesis: 0
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyIndentedWhitespace: 0
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Left
+PPIndentWidth: -1
+QualifierAlignment: Leave
+ReferenceAlignment: Pointer
+ReflowComments: true
+RemoveBracesLLVM: false
+RemoveParentheses: Leave
+RemoveSemicolon: false
+RequiresClausePosition: OwnLine
+RequiresExpressionIndentation: OuterScope
+SeparateDefinitionBlocks: Leave
+ShortNamespaceLines: 1
+SortIncludes: CaseSensitive
+SortJavaStaticImport: Before
+SortUsingDeclarations: LexicographicNumeric
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceAroundPointerQualifiers: Default
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCaseColon: false
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeJsonColon: false
+SpaceBeforeParens: ControlStatements
+SpaceBeforeParensOptions:
+  AfterControlStatements: true
+  AfterForeachMacros: true
+  AfterFunctionDefinitionName: false
+  AfterFunctionDeclarationName: false
+  AfterIfMacros: true
+  AfterOverloadedOperator: false
+  AfterRequiresInClause: false
+  AfterRequiresInExpression: false
+  BeforeNonEmptyParentheses: false
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyBlock: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: Never
+SpacesInContainerLiterals: true
+SpacesInLineCommentPrefix:
+  Minimum: 1
+  Maximum: -1
+SpacesInParens: Never
+SpacesInParensOptions:
+  InCStyleCasts: false
+  InConditionalStatements: false
+  InEmptyParentheses: false
+  Other: false
+SpacesInSquareBrackets: false
+Standard: Latest
+TabWidth: 8
+UseTab: Never
+VerilogBreakBetweenInstancePorts: true
+WhitespaceSensitiveMacros:
+  - BOOST_PP_STRINGIZE
+  - CF_SWIFT_NAME
+  - NS_SWIFT_NAME
+  - PP_STRINGIZE
+  - STRINGIZE
+...
+
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,5 +1,5 @@
 # AUTH
-auth/* @elcallio @vladzcloudius
+auth/* @nuivall @ptrsmrn @KrzaQ

 # CACHE
 row_cache* @tgrabiec
@@ -7,9 +7,9 @@ row_cache* @tgrabiec
 test/boost/mvcc* @tgrabiec

 # CDC
-cdc/* @kbr- @elcallio @piodul @jul-stas
-test/cql/cdc_* @kbr- @elcallio @piodul @jul-stas
-test/boost/cdc_* @kbr- @elcallio @piodul @jul-stas
+cdc/* @kbr-scylla @elcallio @piodul
+test/cql/cdc_* @kbr-scylla @elcallio @piodul
+test/boost/cdc_* @kbr-scylla @elcallio @piodul

 # COMMITLOG / BATCHLOG
 db/commitlog/* @elcallio @eliransin
@@ -25,18 +25,18 @@ compaction/* @raphaelsc
 transport/*

 # CQL QUERY LANGUAGE
-cql3/* @tgrabiec
+cql3/* @tgrabiec @nuivall @ptrsmrn @KrzaQ

 # COUNTERS
-counters* @jul-stas
-tests/counter_test* @jul-stas
+counters* @nuivall @ptrsmrn @KrzaQ
+tests/counter_test* @nuivall @ptrsmrn @KrzaQ

 # DOCS
 docs/* @annastuchlik @tzach
-docs/alternator @annastuchlik @tzach @nyh @havaker @nuivall
+docs/alternator @annastuchlik @tzach @nyh @nuivall @ptrsmrn @KrzaQ

 # GOSSIP
-gms/* @tgrabiec @asias
+gms/* @tgrabiec @asias @kbr-scylla

 # DOCKER
 dist/docker/*
@@ -74,8 +74,8 @@ streaming/* @tgrabiec @asias
 service/storage_service.* @tgrabiec @asias

 # ALTERNATOR
-alternator/* @havaker @nuivall
-test/alternator/* @havaker @nuivall
+alternator/* @nyh @nuivall @ptrsmrn @KrzaQ
+test/alternator/* @nyh @nuivall @ptrsmrn @KrzaQ

 # HINTED HANDOFF
 db/hints/* @piodul @vladzcloudius @eliransin
@@ -94,8 +94,8 @@ test/boost/querier_cache_test.cc @denesb
 test/cql-pytest/* @nyh

 # RAFT
-raft/* @kbr- @gleb-cloudius @kostja
-test/raft/* @kbr- @gleb-cloudius @kostja
+raft/* @kbr-scylla @gleb-cloudius @kostja
+test/raft/* @kbr-scylla @gleb-cloudius @kostja

 # HEAT-WEIGHTED LOAD BALANCING
 db/heat_load_balance.* @nyh @gleb-cloudius
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -15,7 +15,7 @@ pull_request_rules:
        - closed
    actions:
      delete_head_branch:
-  - name: Automate backport pull request 5.2
+  - name: Automate backport pull request 6.1
    conditions:
      - or:
        - closed
@@ -23,11 +23,11 @@ pull_request_rules:
      - or:
          - base=master
          - base=next
-      - label=backport/5.2 # The PR must have this label to trigger the backport
+      - label=backport/6.1 # The PR must have this label to trigger the backport
      - label=promoted-to-master
    actions:
      copy:
-        title: "[Backport 5.2] {{ title }}"
+        title: "[Backport 6.1] {{ title }}"
        body: |
          {{ body }}

@@ -37,7 +37,7 @@ pull_request_rules:

           Refs #{{number}}
        branches:
-          - branch-5.2
+          - branch-6.1
        assignees:
          - "{{ author }}"
  - name: Automate backport pull request 5.4
--- a/.github/scripts/auto-backport.py
+++ b/.github/scripts/auto-backport.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import re
+import sys
+import tempfile
+import logging
+
+from github import Github, GithubException
+from git import Repo, GitCommandError
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+# The GitHub API token is mandatory: stop right at start-up when it is not exported.
+github_token = os.environ.get("GITHUB_TOKEN")
+if github_token is None:
+    print("Please set the 'GITHUB_TOKEN' environment variable")
+    sys.exit(1)
+
+
+def is_pull_request():  # True when the literal '--pull-request' token is among the command-line arguments
+    return any(arg == '--pull-request' for arg in sys.argv[1:])
+
+
+def parse_args():  # builds the CLI; --head-commit is required exactly when is_pull_request() reports --pull-request
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--repo', required=True, type=str, help='Github repository name')
+    cli.add_argument('--base-branch', default='refs/heads/master', type=str, help='Base branch')
+    cli.add_argument('--commits', type=str, default=None, help='Range of promoted commits.')
+    cli.add_argument('--pull-request', help='Pull request number to be backported', type=int)
+    cli.add_argument('--head-commit', required=is_pull_request(), type=str, help='The HEAD of target branch after the pull request specified by --pull-request is merged')
+    return cli.parse_args()
+
+
+def create_pull_request(repo, new_branch_name, base_branch_name, pr, backport_pr_title, commits, is_draft=False):
+    """Open the backport PR from scylladbbot:<new_branch_name>; return it, or None (implicitly) when GitHub rejects it."""
+    pr_body = f'{pr.body}\n\n'
+    for commit in commits:
+        pr_body += f'- (cherry picked from commit {commit})\n\n'
+    pr_body += f'Parent PR: #{pr.number}'
+    try:
+        backport_pr = repo.create_pull(
+            title=backport_pr_title,
+            body=pr_body,
+            head=f'scylladbbot:{new_branch_name}',
+            base=base_branch_name,
+            draft=is_draft)
+        logging.info(f"Pull request created: {backport_pr.html_url}")
+        backport_pr.add_to_assignees(pr.user)
+        if is_draft:
+            backport_pr.add_to_labels("conflicts")
+            # pr.user is a user object; only its .login is the handle that yields a working @mention.
+            pr_comment = f"@{pr.user.login} - This PR was marked as draft because it has conflicts\nPlease resolve them and mark this PR as ready for review"
+            backport_pr.create_issue_comment(pr_comment)
+        logging.info(f"Assigned PR to original author: {pr.user.login}")
+        return backport_pr
+    except GithubException as e:
+        if 'A pull request already exists' in str(e):
+            logging.warning(f'A pull request already exists for {pr.user.login}:{new_branch_name}')
+        else:
+            logging.error(f'Failed to create PR: {e}')
+
+
+def get_pr_commits(repo, pr, stable_branch, start_commit=None):
+    """Collect the SHAs to cherry-pick for `pr`; an empty list when the PR is neither merged nor closed."""
+    if pr.merged:
+        merge_commit = repo.get_commit(pr.merge_commit_sha)
+        if len(merge_commit.parents) > 1:
+            # A real merge commit (more than one parent) carries the whole PR on its own.
+            return [pr.merge_commit_sha]
+        # Otherwise the PR commits landed on the branch individually. GitHub then reports only
+        # the last of them as the "merge commit" (Scylla-pkg and scylla-dtest work this way),
+        # while a backport needs all of them, so each PR commit is looked up among the
+        # promoted commits by its title line to find the SHA it actually got.
+        if start_commit:
+            promoted_commits = repo.compare(start_commit, stable_branch).commits
+        else:
+            promoted_commits = repo.get_commits(sha=stable_branch)
+        commits = []
+        for commit in pr.get_commits():
+            commits.extend(
+                promoted.sha
+                for promoted in promoted_commits
+                if promoted.commit.message.startswith(commit.commit.message.splitlines()[0])
+            )
+        return commits
+
+    if pr.state == 'closed':
+        # Closed without merging: use the commit ids recorded on the PR's 'closed' events.
+        return [event.commit_id for event in pr.get_issue_events() if event.event == 'closed']
+    return []
+
+
+def create_pr_comment_and_remove_label(pr, comment_body):  # drops backport/X.Y labels, then posts comment_body listing them
+    backport_label_re = re.compile(r"backport/\d+\.\d+$")
+    for pr_label in pr.get_labels():
+        if not backport_label_re.match(pr_label.name):
+            continue
+        print(f"Removing label: {pr_label.name}")
+        comment_body = f'{comment_body}- {pr_label.name}\n'
+        pr.remove_from_labels(pr_label)
+    pr.create_issue_comment(comment_body)
+
+
+def backport(repo, pr, version, commits, backport_base_branch):
+    """Cherry-pick `commits` onto a new branch off `backport_base_branch`, push it to the scylladbbot fork and open the backport PR."""
+    new_branch_name = f'backport/{pr.number}/to-{version}'
+    backport_pr_title = f'[Backport {version}] {pr.title}'
+    repo_url = f'https://scylladbbot:{github_token}@github.com/{repo.full_name}.git'
+    fork_repo = f'https://scylladbbot:{github_token}@github.com/scylladbbot/{repo.name}.git'
+    with tempfile.TemporaryDirectory() as local_repo_path:  # plain form; a single manager needs no parenthesized-with syntax
+        try:
+            repo_local = Repo.clone_from(repo_url, local_repo_path, branch=backport_base_branch)
+            repo_local.git.checkout(b=new_branch_name)
+            is_draft = False
+            for commit in commits:
+                try:
+                    repo_local.git.cherry_pick(commit, '-m1', '-x')
+                except GitCommandError as e:
+                    logging.warning(f'Cherry-pick conflict on commit {commit}: {e}')
+                    is_draft = True  # keep going: stage the tree as-is, finish the pick, open the PR as a draft
+                    repo_local.git.add(A=True)
+                    repo_local.git.cherry_pick('--continue')
+            if not repo.private and not repo.has_in_collaborators(pr.user.login):
+                repo.add_to_collaborators(pr.user.login, permission="push")
+                # The trailing newline lets the label list appended by create_pr_comment_and_remove_label() start on its own line.
+                comment = (f':warning:  @{pr.user.login} you have been added as collaborator to scylladbbot fork. '
+                           'Please check your inbox and approve the invitation, once it is done, please add the backport labels again:\n')
+                create_pr_comment_and_remove_label(pr, comment)
+                return
+            repo_local.git.push(fork_repo, new_branch_name, force=True)
+            create_pull_request(repo, new_branch_name, backport_base_branch, pr, backport_pr_title, commits, is_draft=is_draft)
+        except GitCommandError as e:
+            logging.warning(f"GitCommandError: {e}")
+
+
+def main():
+    """Backport every closed PR that has the promoted label and at least one backport/X.Y label.
+
+    PRs are found through 'Closes ...#N' lines of the commits in --commits, or taken from --pull-request.
+    """
+    args = parse_args()
+    # 'refs/heads/<branch>' -> '<branch>'; maxsplit=2 keeps a branch name that itself contains '/' intact.
+    stable_branch = args.base_branch.split('/', 2)[2]
+    promoted_label = 'promoted-to-enterprise' if 'scylla-enterprise' in args.repo else 'promoted-to-master'
+    backport_branch = 'branch-'
+
+    backport_label_pattern = re.compile(r'backport/\d+\.\d+$')
+
+    g = Github(github_token)
+    repo = g.get_repo(args.repo)
+    closed_prs = []
+    seen_pr_numbers = set()  # several commits in the range may close the same PR; backport it once
+    start_commit = None
+
+    if args.commits:
+        start_commit, end_commit = args.commits.split('..')
+        for commit in repo.compare(start_commit, end_commit).commits:
+            match = re.search(r"Closes .*#([0-9]+)", commit.commit.message, re.IGNORECASE)
+            pr_number = int(match.group(1)) if match else None
+            if pr_number is not None and pr_number not in seen_pr_numbers:
+                seen_pr_numbers.add(pr_number)
+                closed_prs.append(repo.get_pull(pr_number))
+    if args.pull_request:
+        start_commit = args.head_commit
+        closed_prs = [repo.get_pull(args.pull_request)]
+
+    for pr in closed_prs:
+        labels = [label.name for label in pr.labels]
+        backport_labels = [label for label in labels if backport_label_pattern.match(label)]
+        if promoted_label not in labels:
+            print(f'no {promoted_label} label: {pr.number}')
+            continue
+        if not backport_labels:
+            print(f'no backport label: {pr.number}')
+            continue
+        commits = get_pr_commits(repo, pr, stable_branch, start_commit)
+        logging.info(f"Found PR #{pr.number} with commit {commits} and the following labels: {backport_labels}")
+        for backport_label in backport_labels:
+            version = backport_label.replace('backport/', '')
+            backport_base_branch = backport_label.replace('backport/', backport_branch)
+            backport(repo, pr, version, commits, backport_base_branch)
+
+
+if __name__ == "__main__":
+    main()
--- a/.github/scripts/label_promoted_commits.py
+++ b/.github/scripts/label_promoted_commits.py
@@ -16,13 +16,8 @@ def parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--repository', type=str, required=True,
                        help='Github repository name (e.g., scylladb/scylladb)')
-    parser.add_argument('--commit_before_merge', type=str, required=True, help='Git commit ID to start labeling from ('
-                                                                               'newest commit).')
-    parser.add_argument('--commit_after_merge', type=str, required=True,
-                        help='Git commit ID to end labeling at (oldest '
-                             'commit, exclusive).')
-    parser.add_argument('--update_issue', type=bool, default=False, help='Set True to update issues when backport was '
-                                                                         'done')
+    parser.add_argument('--commits', type=str, required=True, help='Range of promoted commits.')
+    parser.add_argument('--label', type=str, default='promoted-to-master', help='Label to use')
    parser.add_argument('--ref', type=str, required=True, help='PR target branch')
    return parser.parse_args()

@@ -53,10 +48,11 @@ def main():
    target_branch = re.search(r'branch-(\d+\.\d+)', args.ref)
    g = Github(github_token)
    repo = g.get_repo(args.repository, lazy=False)
-    commits = repo.compare(head=args.commit_after_merge, base=args.commit_before_merge)
+    start_commit, end_commit = args.commits.split('..')
+    commits = repo.compare(start_commit, end_commit).commits
    processed_prs = set()
    # Print commit information
-    for commit in commits.commits:
+    for commit in commits:
        print(f'Commit sha is: {commit.sha}')
        match = pr_pattern.search(commit.commit.message)
        if match:
@@ -66,13 +62,13 @@ def main():
            if target_branch:
                pr = repo.get_pull(pr_number)
                branch_name = target_branch[1]
-                refs_pr = re.findall(r'Refs (?:#|https.*?)(\d+)', pr.body)
+                refs_pr = re.findall(r'Parent PR: (?:#|https.*?)(\d+)', pr.body)
                if refs_pr:
                    print(f'branch-{target_branch.group(1)}, pr number is: {pr_number}')
                    # 1. change the backport label of the parent PR to note that
-                    #    we've merge the corresponding backport PR
+                    #    we've merged the corresponding backport PR
                    # 2. close the backport PR and leave a comment on it to note
-                    #    that it has been merged with a certain git commit,
+                    #    that it has been merged with a certain git commit.
                    ref_pr_number = refs_pr[0]
                    mark_backport_done(repo, ref_pr_number, branch_name)
                    comment = f'Closed via {commit.sha}'
--- a/.github/workflows/add-label-when-promoted.yaml
+++ b/.github/workflows/add-label-when-promoted.yaml
@@ -5,9 +5,10 @@ on:
    branches:
      - master
      - branch-*.*
-
-env:
-  DEFAULT_BRANCH: 'master'
+      - enterprise
+    pull_request_target:
+      types: [labeled]
+      branches: [master, next, enterprise]

 jobs:
  check-commit:
@@ -20,17 +21,51 @@ jobs:
        env:
          GITHUB_CONTEXT: ${{ toJson(github) }}
        run: echo "$GITHUB_CONTEXT"
+      - name: Set Default Branch
+        id: set_branch
+        run: |
+          if [[ "${{ github.repository }}" == *enterprise* ]]; then
+            echo "DEFAULT_BRANCH=enterprise" >> $GITHUB_ENV
+          else
+            echo "DEFAULT_BRANCH=master" >> $GITHUB_ENV
+          fi
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          repository: ${{ github.repository }}
          ref: ${{ env.DEFAULT_BRANCH }}
+          token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
          fetch-depth: 0  # Fetch all history for all tags and branches
-
+      - name: Set up Git identity
+        run: |
+          git config --global user.name "GitHub Action"
+          git config --global user.email "action@github.com"
+          git config --global merge.conflictstyle diff3
      - name: Install dependencies
-        run: sudo apt-get install -y python3-github
-
+        run: sudo apt-get install -y python3-github python3-git
      - name: Run python script
+        if: github.event_name == 'push'
        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: python .github/scripts/label_promoted_commits.py --commit_before_merge ${{ github.event.before }} --commit_after_merge ${{ github.event.after }} --repository ${{ github.repository }} --ref ${{ github.ref }}
+          GITHUB_TOKEN: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+        run: python .github/scripts/label_promoted_commits.py  --commits ${{ github.event.before }}..${{ github.sha }} --repository ${{ github.repository }} --ref ${{ github.ref }}
+      - name: Run auto-backport.py when promotion completed
+        if: ${{ github.event_name == 'push' && github.ref == format('refs/heads/{0}', env.DEFAULT_BRANCH) }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+        run: python .github/scripts/auto-backport.py --repo ${{ github.repository }} --base-branch ${{ github.ref }} --commits ${{ github.event.before }}..${{ github.sha }}
+      - name: Check if label starts with 'backport/' and contains digits
+        id: check_label
+        run: |
+          label_name="${{ github.event.label.name }}"
+          if [[ "$label_name" =~ ^backport/[0-9]+\.[0-9]+$ ]]; then
+            echo "Label matches backport/X.X pattern."
+            echo "backport_label=true" >> $GITHUB_OUTPUT
+          else
+            echo "Label does not match the required pattern."
+            echo "backport_label=false" >> $GITHUB_OUTPUT
+          fi
+      - name: Run auto-backport.py when label was added
+        if: ${{ github.event_name == 'pull_request_target' && steps.check_label.outputs.backport_label == 'true' && github.event.pull_request.state == 'closed' }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+        run: python .github/scripts/auto-backport.py --repo ${{ github.repository }} --base-branch ${{ github.ref }} --pull-request ${{ github.event.pull_request.number }} --head-commit ${{ github.event.pull_request.base.sha }}
--- a/.github/workflows/backport-pr-fixes-validation.yaml
+++ b/.github/workflows/backport-pr-fixes-validation.yaml
@@ -22,5 +22,12 @@ jobs:
            const regex = new RegExp(pattern);
            
            if (!regex.test(body)) {
-              core.setFailed("PR body does not contain a valid 'Fixes' reference.");
+              const error = "PR body does not contain a valid 'Fixes' reference.";
+              core.setFailed(error);
+              await github.rest.issues.createComment({
+                issue_number: context.issue.number,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: `:warning: ${error}`
+              });
            }
--- a/.github/workflows/build-scylla.yaml
+++ b/.github/workflows/build-scylla.yaml
@@ -13,10 +13,14 @@ on:
        value: ${{ jobs.build.outputs.md5sum }}

 jobs:
+  read-toolchain:
+    uses: ./.github/workflows/read-toolchain.yaml
  build:
+    if: github.repository == 'scylladb/scylladb'
+    needs:
+      - read-toolchain
    runs-on: ubuntu-latest
-    # be consistent with tools/toolchain/image
-    container: scylladb/scylla-toolchain:fedora-40-20240621
+    container: ${{ needs.read-toolchain.outputs.image }}
    outputs:
      md5sum: ${{ steps.checksum.outputs.md5sum }}
    steps:
--- a/.github/workflows/clang-nightly.yaml
+++ b/.github/workflows/clang-nightly.yaml
@@ -7,7 +7,7 @@ on:

 env:
  # use the development branch explicitly
-  CLANG_VERSION: 19
+  CLANG_VERSION: 20
  BUILD_DIR: build

 permissions: {}
@@ -20,6 +20,7 @@ concurrency:
 jobs:
  clang-dev:
    name: Build with clang nightly
+    if: github.repository == 'scylladb/scylladb'
    runs-on: ubuntu-latest
    container: fedora:40
    strategy:
--- a/.github/workflows/clang-tidy.yaml
+++ b/.github/workflows/clang-tidy.yaml
@@ -10,9 +10,6 @@ on:
      - 'docs/**'
      - '.github/**'
  workflow_dispatch:
-  schedule:
-    # only at 5AM Saturday
-    - cron: '0 5 * * SAT'

 env:
  BUILD_TYPE: RelWithDebInfo
--- a/.github/workflows/docs-pr.yaml
+++ b/.github/workflows/docs-pr.yaml
@@ -12,7 +12,8 @@ on:
      - enterprise
    paths:
      - "docs/**"
-
+      - "db/config.hh"
+      - "db/config.cc"
 jobs:
  build:
    runs-on: ubuntu-latest
--- a/.github/workflows/make-pr-ready-for-review.yaml
+++ b/.github/workflows/make-pr-ready-for-review.yaml
@@ -0,0 +1,27 @@
+name: Mark PR as Ready When Conflicts Label is Removed
+
+on:
+  pull_request_target:
+    types:
+      - unlabeled  # runs on label removal; the job-level `if` below narrows it to the 'conflicts' label
+
+env:
+  DEFAULT_BRANCH: 'master'  # ref that the checkout step checks out
+
+jobs:
+  mark-ready:
+    if: github.event.label.name == 'conflicts'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ github.repository }}
+          ref: ${{ env.DEFAULT_BRANCH }}
+          token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+          fetch-depth: 1  # shallow checkout; history is not needed to flip the PR state
+      - name: Mark pull request as ready for review
+        run:  gh pr ready "${{ github.event.pull_request.number }}"
+        env:
+          GITHUB_TOKEN: ${{ secrets.AUTO_BACKPORT_TOKEN }}
--- a/.github/workflows/reproducible-build.yaml
+++ b/.github/workflows/reproducible-build.yaml
@@ -19,6 +19,7 @@ jobs:
    with:
      build_mode: release
  compare-checksum:
+    if: github.repository == 'scylladb/scylladb'
    runs-on: ubuntu-latest
    needs:
      - build-a
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 .settings
 build
 build.ninja
+cmake-build-*
 build.ninja.new
 cscope.*
 /debian/
@@ -13,13 +14,14 @@ dist/ami/scylla_deploy.sh
 Cql.tokens
 .kdev4
 *.kdev4
+.idea
 CMakeLists.txt.user
 .cache
 .tox
 *.egg-info
 __pycache__CMakeLists.txt.user
 .gdbinit
-resources
+/resources
 .pytest_cache
 /expressions.tokens
 tags
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
@@ -9,9 +9,6 @@
 [submodule "abseil"]
 	path = abseil
 	url = ../abseil-cpp
-[submodule "scylla-jmx"]
-	path = tools/jmx
-	url = ../scylla-jmx
 [submodule "scylla-tools"]
 	path = tools/java
 	url = ../scylla-tools-java
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,8 +2,6 @@ cmake_minimum_required(VERSION 3.27)

 project(scylla)

-include(CTest)
-
 list(APPEND CMAKE_MODULE_PATH
  ${CMAKE_CURRENT_SOURCE_DIR}/cmake
  ${CMAKE_CURRENT_SOURCE_DIR}/seastar/cmake)
@@ -55,20 +53,22 @@ set(Seastar_DEPRECATED_OSTREAM_FORMATTERS OFF CACHE BOOL "" FORCE)
 set(Seastar_APPS ON CACHE BOOL "" FORCE)
 set(Seastar_EXCLUDE_APPS_FROM_ALL ON CACHE BOOL "" FORCE)
 set(Seastar_EXCLUDE_TESTS_FROM_ALL ON CACHE BOOL "" FORCE)
+set(Seastar_IO_URING OFF CACHE BOOL "" FORCE)
+set(Seastar_SCHEDULING_GROUPS_COUNT 16 CACHE STRING "" FORCE)
 set(Seastar_UNUSED_RESULT_ERROR ON CACHE BOOL "" FORCE)
 add_subdirectory(seastar)
 set(ABSL_PROPAGATE_CXX_STD ON CACHE BOOL "" FORCE)

 find_package(Sanitizers QUIET)
 set(sanitizer_cxx_flags
-    $<$<IN_LIST:$<CONFIG>,Debug;Sanitize>:$<TARGET_PROPERTY:Sanitizers::address,INTERFACE_COMPILE_OPTIONS>;$<TARGET_PROPERTY:Sanitizers::undefined_behavior,INTERFACE_COMPILE_OPTIONS>>)
+    $<$<CONFIG:Debug,Sanitize>:$<TARGET_PROPERTY:Sanitizers::address,INTERFACE_COMPILE_OPTIONS>;$<TARGET_PROPERTY:Sanitizers::undefined_behavior,INTERFACE_COMPILE_OPTIONS>>)
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    set(ABSL_GCC_FLAGS ${sanitizer_cxx_flags})
 elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    set(ABSL_LLVM_FLAGS ${sanitizer_cxx_flags})
 endif()
 set(ABSL_DEFAULT_LINKOPTS
-    $<$<IN_LIST:$<CONFIG>,Debug;Sanitize>:$<TARGET_PROPERTY:Sanitizers::address,INTERFACE_LINK_LIBRARIES>;$<TARGET_PROPERTY:Sanitizers::undefined_behavior,INTERFACE_LINK_LIBRARIES>>)
+    $<$<CONFIG:Debug,Sanitize>:$<TARGET_PROPERTY:Sanitizers::address,INTERFACE_LINK_LIBRARIES>;$<TARGET_PROPERTY:Sanitizers::undefined_behavior,INTERFACE_LINK_LIBRARIES>>)
 add_subdirectory(abseil)
 add_library(absl-headers INTERFACE)
 target_include_directories(absl-headers SYSTEM INTERFACE
@@ -95,7 +95,7 @@ target_link_libraries(Boost::regex
 find_package(Lua REQUIRED)
 find_package(ZLIB REQUIRED)
 find_package(ICU COMPONENTS uc i18n REQUIRED)
-find_package(fmt 9.0.0 REQUIRED)
+find_package(fmt 10.0.0 REQUIRED)
 find_package(libdeflate REQUIRED)
 find_package(libxcrypt REQUIRED)
 find_package(Snappy REQUIRED)
@@ -138,6 +138,7 @@ target_sources(scylla-main
    keys.cc
    multishard_mutation_query.cc
    mutation_query.cc
+    node_ops/task_manager_module.cc
    partition_slice_builder.cc
    querier.cc
    query.cc
@@ -151,6 +152,7 @@ target_sources(scylla-main
    serializer.cc
    sstables_loader.cc
    table_helper.cc
+    tasks/task_handler.cc
    tasks/task_manager.cc
    timeout_config.cc
    unimplemented.cc
@@ -194,6 +196,8 @@ include(check_headers)
 check_headers(check-headers scylla-main
  GLOB ${CMAKE_CURRENT_SOURCE_DIR}/*.hh)

+add_custom_target(compiler-training)
+
 add_subdirectory(api)
 add_subdirectory(alternator)
 add_subdirectory(db)
@@ -278,4 +282,9 @@ target_include_directories(scylla PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}"
    "${scylla_gen_build_dir}")

+add_custom_target(maybe-scylla
+  DEPENDS $<$<CONFIG:Dev>:$<TARGET_FILE:scylla>>)
+add_dependencies(compiler-training
+  maybe-scylla)
+
 add_subdirectory(dist)
--- a/HACKING.md
+++ b/HACKING.md
@@ -19,18 +19,18 @@ $ git submodule update --init --recursive
 ### Dependencies

 Scylla is fairly fussy about its build environment, requiring a very recent
-version of the C++20 compiler and numerous tools and libraries to build.
+version of the C++23 compiler and numerous tools and libraries to build.

 Run `./install-dependencies.sh` (as root) to use your Linux distributions's
 package manager to install the appropriate packages on your build machine.
 However, this will only work on very recent distributions. For example,
 currently Fedora users must upgrade to Fedora 32 otherwise the C++ compiler
-will be too old, and not support the new C++20 standard that Scylla uses.
+will be too old, and not support the new C++23 standard that Scylla uses.

 Alternatively, to avoid having to upgrade your build machine or install
 various packages on it, we provide another option - the **frozen toolchain**.
 This is a script, `./tools/toolchain/dbuild`, that can execute build or run
-commands inside a Docker image that contains exactly the right build tools and
+commands inside a container that contains exactly the right build tools and
 libraries. The `dbuild` technique is useful for beginners, but is also the way
 in which ScyllaDB produces official releases, so it is highly recommended.

@@ -43,6 +43,12 @@ $ ./tools/toolchain/dbuild ninja build/release/scylla
 $ ./tools/toolchain/dbuild ./build/release/scylla --developer-mode 1
 ```

+Note: do not mix environments - either perform all your work with dbuild, or natively on the host.
+Note2: you can get to an interactive shell within dbuild by running it without any parameters:
+```bash
+$ ./tools/toolchain/dbuild
+```
+
 ### Build system

 **Note**: Compiling Scylla requires, conservatively, 2 GB of memory per native
@@ -116,6 +122,13 @@ Run all tests through the test execution wrapper with
 $ ./test.py --mode={debug,release}
 ```

+or, if you are using `dbuild`, you need to build the code and the tests and then you can run them at will:
+
+```bash
+$ ./tools/toolchain/dbuild ninja {debug,release,dev}-build
+$ ./tools/toolchain/dbuild ./test.py --mode {debug,release,dev}
+```
+
 The `--name` argument can be specified to run a particular test.

 Alternatively, you can execute the test executable directly. For example,
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ For more information, please see the [ScyllaDB web site].
 ## Build Prerequisites

 Scylla is fairly fussy about its build environment, requiring very recent
-versions of the C++20 compiler and of many libraries to build. The document
+versions of the C++23 compiler and of many libraries to build. The document
 [HACKING.md](HACKING.md) includes detailed information on building and
 developing Scylla, but to get Scylla building quickly on (almost) any build
 machine, Scylla offers a [frozen toolchain](tools/toolchain/README.md),
@@ -84,11 +84,11 @@ Documentation can be found [here](docs/dev/README.md).
 Seastar documentation can be found [here](http://docs.seastar.io/master/index.html).
 User documentation can be found [here](https://docs.scylladb.com/).

-## Training 
+## Training

-Training material and online courses can be found at [Scylla University](https://university.scylladb.com/). 
-The courses are free, self-paced and include hands-on examples. They cover a variety of topics including Scylla data modeling, 
-administration, architecture, basic NoSQL concepts, using drivers for application development, Scylla setup, failover, compactions, 
+Training material and online courses can be found at [Scylla University](https://university.scylladb.com/).
+The courses are free, self-paced and include hands-on examples. They cover a variety of topics including Scylla data modeling,
+administration, architecture, basic NoSQL concepts, using drivers for application development, Scylla setup, failover, compactions,
 multi-datacenters and how Scylla integrates with third-party applications.

 ## Contributing to Scylla
--- a/4
+++ b/4
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=6.1.0-dev
+VERSION=6.2.4

 if test -f version
 then
@@ -104,7 +104,7 @@ else
 fi

 if [ -f "$OUTPUT_DIR/SCYLLA-RELEASE-FILE" ]; then
-	GIT_COMMIT_FILE=$(cat "$OUTPUT_DIR/SCYLLA-RELEASE-FILE" |cut -d . -f 3)
+	GIT_COMMIT_FILE=$(cat "$OUTPUT_DIR/SCYLLA-RELEASE-FILE" | rev | cut -d . -f 1 | rev)
 	if [ "$GIT_COMMIT" = "$GIT_COMMIT_FILE" ]; then
 		exit 0
 	fi
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -19,6 +19,7 @@
 #include "alternator/executor.hh"
 #include "cql3/selection/selection.hh"
 #include "cql3/result_set.hh"
+#include "types/types.hh"
 #include <seastar/core/coroutine.hh>

 namespace alternator {
@@ -31,11 +32,12 @@ future<std::string> get_key_from_roles(service::storage_proxy& proxy, auth::serv
    dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*schema, pk))};
    std::vector<query::clustering_range> bounds{query::clustering_range::make_open_ended_both_sides()};
    const column_definition* salted_hash_col = schema->get_column_definition(bytes("salted_hash"));
-    if (!salted_hash_col) {
-        co_await coroutine::return_exception(api_error::unrecognized_client(format("Credentials cannot be fetched for: {}", username)));
+    const column_definition* can_login_col = schema->get_column_definition(bytes("can_login"));
+    if (!salted_hash_col || !can_login_col) {
+        co_await coroutine::return_exception(api_error::unrecognized_client(fmt::format("Credentials cannot be fetched for: {}", username)));
    }
-    auto selection = cql3::selection::selection::for_columns(schema, {salted_hash_col});
-    auto partition_slice = query::partition_slice(std::move(bounds), {}, query::column_id_vector{salted_hash_col->id}, selection->get_query_options());
+    auto selection = cql3::selection::selection::for_columns(schema, {salted_hash_col, can_login_col});
+    auto partition_slice = query::partition_slice(std::move(bounds), {}, query::column_id_vector{salted_hash_col->id, can_login_col->id}, selection->get_query_options());
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice,
            proxy.get_max_result_size(partition_slice), query::tombstone_limit(proxy.get_tombstone_limit()));
    auto cl = auth::password_authenticator::consistency_for_user(username);
@@ -49,11 +51,18 @@ future<std::string> get_key_from_roles(service::storage_proxy& proxy, auth::serv

    auto result_set = builder.build();
    if (result_set->empty()) {
-        co_await coroutine::return_exception(api_error::unrecognized_client(format("User not found: {}", username)));
+        co_await coroutine::return_exception(api_error::unrecognized_client(fmt::format("User not found: {}", username)));
    }
-    const managed_bytes_opt& salted_hash = result_set->rows().front().front(); // We only asked for 1 row and 1 column
+    const auto& result = result_set->rows().front();
+    bool can_login = result[1] && value_cast<bool>(boolean_type->deserialize(*result[1]));
+    if (!can_login) {
+        // This is a valid role name, but has "login=False" so should not be
+        // usable for authentication (see #19735).
+        co_await coroutine::return_exception(api_error::unrecognized_client(fmt::format("Role {} has login=false so cannot be used for login", username)));
+    }
+    const managed_bytes_opt& salted_hash = result.front();
    if (!salted_hash) {
-        co_await coroutine::return_exception(api_error::unrecognized_client(format("No password found for user: {}", username)));
+        co_await coroutine::return_exception(api_error::unrecognized_client(fmt::format("No password found for user: {}", username)));
    }
    co_return value_cast<sstring>(utf8_type->deserialize(*salted_hash));
 }
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -42,12 +42,12 @@ comparison_operator_type get_comparison_operator(const rjson::value& comparison_
            {"NOT_CONTAINS", comparison_operator_type::NOT_CONTAINS},
    };
    if (!comparison_operator.IsString()) {
-        throw api_error::validation(format("Invalid comparison operator definition {}", rjson::print(comparison_operator)));
+        throw api_error::validation(fmt::format("Invalid comparison operator definition {}", rjson::print(comparison_operator)));
    }
    std::string op = comparison_operator.GetString();
    auto it = ops.find(op);
    if (it == ops.end()) {
-        throw api_error::validation(format("Unsupported comparison operator {}", op));
+        throw api_error::validation(fmt::format("Unsupported comparison operator {}", op));
    }
    return it->second;
 }
@@ -429,7 +429,7 @@ static bool check_BETWEEN(const T& v, const T& lb, const T& ub, bool bounds_from
    if (cmp_lt()(ub, lb)) {
        if (bounds_from_query) {
            throw api_error::validation(
-                format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+                fmt::format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
        } else {
            return false;
        }
@@ -613,7 +613,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
        return conditional_operator_type::OR;
    } else {
        throw api_error::validation(
-                format("'ConditionalOperator' parameter must be AND, OR or missing. Found {}.", s));
+                fmt::format("'ConditionalOperator' parameter must be AND, OR or missing. Found {}.", s));
    }
 }

--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -9,10 +9,14 @@
 #include <fmt/ranges.h>
 #include <seastar/core/sleep.hh>
 #include "alternator/executor.hh"
+#include "auth/permission.hh"
+#include "cdc/log.hh"
+#include "auth/service.hh"
 #include "db/config.hh"
 #include "log.hh"
 #include "schema/schema_builder.hh"
 #include "exceptions/exceptions.hh"
+#include "service/client_state.hh"
 #include "timestamp.hh"
 #include "types/map.hh"
 #include "schema/schema.hh"
@@ -29,6 +33,7 @@
 #include "conditions.hh"
 #include "cql3/util.hh"
 #include <optional>
+#include "utils/assert.hh"
 #include "utils/overloaded_functor.hh"
 #include <seastar/json/json_elements.hh>
 #include <boost/algorithm/cxx11/any_of.hpp>
@@ -84,7 +89,7 @@ static map_type attrs_type() {

 static const column_definition& attrs_column(const schema& schema) {
    const column_definition* cdef = schema.get_column_definition(bytes(executor::ATTRS_COLUMN_NAME));
-    assert(cdef);
+    SCYLLA_ASSERT(cdef);
    return *cdef;
 }

@@ -115,6 +120,7 @@ json::json_return_type make_streamed(rjson::value&& value) {
            elogger.error("Exception during streaming HTTP response: {}", ex);
        }
        co_await los.close();
+        co_await rjson::destroy_gently(std::move(*lrs));
        if (ex) {
            co_await coroutine::return_exception_ptr(std::move(ex));
        }
@@ -189,12 +195,12 @@ static std::string view_name(const std::string& table_name, std::string_view ind
    }
    if (!valid_table_name_chars(index_name)) {
        throw api_error::validation(
-                format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
+                fmt::format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
    }
    std::string ret = table_name + delim + std::string(index_name);
    if (ret.length() > max_table_name_length) {
        throw api_error::validation(
-                format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
+                fmt::format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
                        table_name, index_name, max_table_name_length - delim.size()));
    }
    return ret;
@@ -249,7 +255,7 @@ schema_ptr executor::find_table(service::storage_proxy& proxy, const rjson::valu
        validate_table_name(table_name.value());

        throw api_error::resource_not_found(
-                format("Requested resource not found: Table: {} not found", *table_name));
+                fmt::format("Requested resource not found: Table: {} not found", *table_name));
    }
 }

@@ -303,7 +309,7 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
            validate_table_name(table_name);

            throw api_error::resource_not_found(
-                format("Requested resource not found: Internal table: {}.{} not found", internal_ks_name, internal_table_name));
+                fmt::format("Requested resource not found: Internal table: {}.{} not found", internal_ks_name, internal_table_name));
        }
    }

@@ -317,7 +323,7 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
            type = table_or_view_type::gsi;
        } else {
            throw api_error::validation(
-                    format("Non-string IndexName '{}'", rjson::to_string_view(*index_name)));
+                    fmt::format("Non-string IndexName '{}'", rjson::to_string_view(*index_name)));
        }
        // If no tables for global indexes were found, the index may be local
        if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
@@ -335,14 +341,14 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
            // does exist but the index does not (ValidationException).
            if (proxy.data_dictionary().has_schema(keyspace_name, orig_table_name)) {
                throw api_error::validation(
-                    format("Requested resource not found: Index '{}' for table '{}'", index_name->GetString(), orig_table_name));
+                    fmt::format("Requested resource not found: Index '{}' for table '{}'", index_name->GetString(), orig_table_name));
            } else {
                throw api_error::resource_not_found(
-                    format("Requested resource not found: Table: {} not found", orig_table_name));
+                    fmt::format("Requested resource not found: Table: {} not found", orig_table_name));
            }
        } else {
            throw api_error::resource_not_found(
-                format("Requested resource not found: Table: {} not found", table_name));
+                fmt::format("Requested resource not found: Table: {} not found", table_name));
        }
    }
 }
@@ -355,7 +361,7 @@ static std::string get_string_attribute(const rjson::value& value, std::string_v
    if (!attribute_value)
        return default_return;
    if (!attribute_value->IsString()) {
-        throw api_error::validation(format("Expected string value for attribute {}, got: {}",
+        throw api_error::validation(fmt::format("Expected string value for attribute {}, got: {}",
                attribute_name, value));
    }
    return std::string(attribute_value->GetString(), attribute_value->GetStringLength());
@@ -370,7 +376,7 @@ static bool get_bool_attribute(const rjson::value& value, std::string_view attri
        return default_return;
    }
    if (!attribute_value->IsBool()) {
-        throw api_error::validation(format("Expected boolean value for attribute {}, got: {}",
+        throw api_error::validation(fmt::format("Expected boolean value for attribute {}, got: {}",
                attribute_name, value));
    }
    return attribute_value->GetBool();
@@ -384,7 +390,7 @@ static std::optional<int> get_int_attribute(const rjson::value& value, std::stri
    if (!attribute_value)
        return {};
    if (!attribute_value->IsInt()) {
-        throw api_error::validation(format("Expected integer value for attribute {}, got: {}",
+        throw api_error::validation(fmt::format("Expected integer value for attribute {}, got: {}",
                attribute_name, value));
    }
    return attribute_value->GetInt();
@@ -432,7 +438,7 @@ static rjson::value generate_arn_for_table(const schema& schema) {
 }

 static rjson::value generate_arn_for_index(const schema& schema, std::string_view index_name) {
-    return rjson::from_string(format(
+    return rjson::from_string(fmt::format(
        "arn:scylla:alternator:{}:scylla:table/{}/index/{}",
        schema.ks_name(), schema.cf_name(), index_name));
 }
@@ -545,6 +551,53 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(response)));
 }

+// Check CQL's Role-Based Access Control (RBAC) permission_to_check (MODIFY, SELECT, DROP, etc.)
+// on the given table. When permission is denied, api_error::access_denied is thrown. The
+// continuation holds client_state and schema by reference: keep both alive until it resolves.
+future<> verify_permission(
+    const service::client_state& client_state,
+    const schema_ptr& schema,
+    auth::permission permission_to_check) {
+    // Using exceptions for errors makes this function faster in the success
+    // path (when the operation is allowed). Using a continuation instead of
+    // co_await makes it faster (no allocation) in the happy path where
+    // permissions are cached and check_has_permission() doesn't yield.
+    return client_state.check_has_permission(auth::command_desc(
+            permission_to_check,
+            auth::make_data_resource(schema->ks_name(), schema->cf_name()))).then(
+        [permission_to_check, &schema, &client_state] (bool allowed) {
+            if (!allowed) {
+                sstring username = "anonymous";
+                if (client_state.user() && client_state.user()->name) {
+                    username = client_state.user()->name.value();
+                }
+                throw api_error::access_denied(fmt::format(
+                    "{} access on table {}.{} is denied to role {}",
+                    auth::permissions::to_string(permission_to_check),
+                    schema->ks_name(), schema->cf_name(), username));
+            }
+        });
+}
+
+// Similar to verify_permission() above, but just for CREATE operations.
+// Those do not operate on any specific table, so the CREATE permission is
+// checked on auth::resource(auth::resource_kind::data), i.e. ALL KEYSPACES.
+future<> verify_create_permission(const service::client_state& client_state) {
+    return client_state.check_has_permission(auth::command_desc(
+            auth::permission::CREATE,
+            auth::resource(auth::resource_kind::data))).then(
+        [&client_state] (bool allowed) {
+            if (!allowed) {
+                sstring username = "anonymous";
+                if (client_state.user() && client_state.user()->name) {
+                    username = client_state.user()->name.value();
+                }
+                throw api_error::access_denied(fmt::format(
+                    "CREATE access on ALL KEYSPACES is denied to role {}", username));
+            }
+        });
+}
+
 future<executor::request_return_type> executor::delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.delete_table++;
    elogger.trace("Deleting table {}", request);
@@ -560,14 +613,15 @@ future<executor::request_return_type> executor::delete_table(client_state& clien

    schema_ptr schema = get_table(_proxy, request);
    rjson::value table_description = fill_table_description(schema, table_status::deleting, _proxy);
-
-    co_await _mm.container().invoke_on(0, [&] (service::migration_manager& mm) -> future<> {
+    co_await verify_permission(client_state, schema, auth::permission::DROP);
+    co_await _mm.container().invoke_on(0, [&, cs = client_state.move_to_other_shard()] (service::migration_manager& mm) -> future<> {
        // FIXME: the following needs to be in a loop. If mm.announce() below
        // fails, we need to retry the whole thing.
        auto group0_guard = co_await mm.start_group0_operation();

-        if (!p.local().data_dictionary().has_schema(keyspace_name, table_name)) {
-            throw api_error::resource_not_found(format("Requested resource not found: Table: {} not found", table_name));
+        std::optional<data_dictionary::table> tbl = p.local().data_dictionary().try_find_table(keyspace_name, table_name);
+        if (!tbl) {
+            throw api_error::resource_not_found(fmt::format("Requested resource not found: Table: {} not found", table_name));
        }

        auto m = co_await service::prepare_column_family_drop_announcement(_proxy, keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
@@ -575,7 +629,29 @@ future<executor::request_return_type> executor::delete_table(client_state& clien

        std::move(m2.begin(), m2.end(), std::back_inserter(m));

-        co_await mm.announce(std::move(m), std::move(group0_guard), format("alternator-executor: delete {} table", table_name));
+        // When deleting a table and its views, we need to remove this role's
+        // special permissions in those tables (undoing the "auto-grant" done
+        // by CreateTable). If we didn't do this, if a second role later
+        // recreates a table with the same name, the first role would still
+        // have permissions over the new table.
+        // To make things more robust we just remove *all* permissions for
+        // the deleted table (CQL's drop_table_statement also does this).
+        // Unfortunately, there is an API mismatch between this code (which
+        // uses separate group0_guard and vector<mutation>) and the function
+        // revoke_all() which uses a combined "group0_batch" structure - so
+        // we need to do some ugly back-and-forth conversions between the pair
+        // to the group0_batch and back to the pair :-(
+        service::group0_batch mc(std::move(group0_guard));
+        mc.add_mutations(std::move(m));
+        co_await auth::revoke_all(*cs.get().get_auth_service(),
+            auth::make_data_resource(schema->ks_name(), schema->cf_name()), mc);
+        for (const view_ptr& v : tbl->views()) {
+            co_await auth::revoke_all(*cs.get().get_auth_service(),
+                auth::make_data_resource(v->ks_name(), v->cf_name()), mc);
+        }
+        std::tie(m, group0_guard) = co_await std::move(mc).extract();
+
+        co_await mm.announce(std::move(m), std::move(group0_guard), fmt::format("alternator-executor: delete {} table", table_name));
    });

    rjson::value response = rjson::empty_object();
@@ -595,7 +671,7 @@ static data_type parse_key_type(const std::string& type) {
        }
    }
    throw api_error::validation(
-            format("Invalid key type '{}', can only be S, B or N.", type));
+            fmt::format("Invalid key type '{}', can only be S, B or N.", type));
 }


@@ -605,7 +681,7 @@ static void add_column(schema_builder& builder, const std::string& name, const r
    // second column with the same name. We should fix this, by renaming
    // some column names which we want to reserve.
    if (name == executor::ATTRS_COLUMN_NAME) {
-        throw api_error::validation(format("Column name '{}' is currently reserved. FIXME.", name));
+        throw api_error::validation(fmt::format("Column name '{}' is currently reserved. FIXME.", name));
    }
    for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) {
        const rjson::value& attribute_info = *it;
@@ -616,7 +692,7 @@ static void add_column(schema_builder& builder, const std::string& name, const r
        }
    }
    throw api_error::validation(
-            format("KeySchema key '{}' missing in AttributeDefinitions", name));
+            fmt::format("KeySchema key '{}' missing in AttributeDefinitions", name));
 }

 // Parse the KeySchema request attribute, which specifies the column names
@@ -684,7 +760,7 @@ static schema_ptr get_table_from_arn(service::storage_proxy& proxy, std::string_
            // A table name cannot contain a '/' - if it does, it's not a
            // table ARN, it may be an index. DynamoDB returns a
            // ValidationException in that case - see #10786.
-            throw api_error::validation(format("ResourceArn '{}' is not a valid table ARN", table_name));
+            throw api_error::validation(fmt::format("ResourceArn '{}' is not a valid table ARN", table_name));
        }
        // FIXME: remove sstring creation once find_schema gains a view-based interface
        return proxy.data_dictionary().find_schema(sstring(keyspace_name), sstring(table_name));
@@ -719,7 +795,7 @@ static void validate_tags(const std::map<sstring, sstring>& tags) {
        std::string_view value = it->second;
        if (!allowed_write_isolation_values.contains(value)) {
            throw api_error::validation(
-                    format("Incorrect write isolation tag {}. Allowed values: {}", value, allowed_write_isolation_values));
+                    fmt::format("Incorrect write isolation tag {}. Allowed values: {}", value, allowed_write_isolation_values));
        }
    }
 }
@@ -754,7 +830,7 @@ void rmw_operation::set_default_write_isolation(std::string_view value) {
                "See docs/alternator/alternator.md for instructions.");
    }
    if (!allowed_write_isolation_values.contains(value)) {
-        throw std::runtime_error(format("Invalid --alternator-write-isolation "
+        throw std::runtime_error(fmt::format("Invalid --alternator-write-isolation "
                "setting '{}'. Allowed values: {}.",
                value, allowed_write_isolation_values));
    }
@@ -831,6 +907,7 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
    if (tags->Size() < 1) {
        co_return api_error::validation("The number of tags must be at least 1") ;
    }
+    co_await verify_permission(client_state, schema, auth::permission::ALTER);
    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
        update_tags_map(*tags, tags_map, update_tags_action::add_tags);
    });
@@ -850,7 +927,7 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
    }

    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
-
+    co_await verify_permission(client_state, schema, auth::permission::ALTER);
    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
        update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
    });
@@ -901,7 +978,10 @@ static void verify_billing_mode(const rjson::value& request) {
 // throws user-facing api_error::validation if it's not.
 // In particular, verify that the same AttributeName doesn't appear more than
 // once (Issue #13870).
-static void validate_attribute_definitions(const rjson::value& attribute_definitions){
+// Return the set of attribute names defined in AttributeDefinitions - this
+// set is useful for later verifying that all of them are used by some
+// KeySchema (issue #19784)
+static std::unordered_set<std::string> validate_attribute_definitions(const rjson::value& attribute_definitions){
    if (!attribute_definitions.IsArray()) {
        throw api_error::validation("AttributeDefinitions must be an array");
    }
@@ -916,7 +996,7 @@ static void validate_attribute_definitions(const rjson::value& attribute_definit
        }
        auto [it2, added] = seen_attribute_names.emplace(rjson::to_string_view(*attribute_name));
        if (!added) {
-            throw api_error::validation(format("Duplicate AttributeName={} in AttributeDefinitions",
+            throw api_error::validation(fmt::format("Duplicate AttributeName={} in AttributeDefinitions",
                rjson::to_string_view(*attribute_name)));
        }
        const rjson::value* attribute_type = rjson::find(*it, "AttributeType");
@@ -927,10 +1007,11 @@ static void validate_attribute_definitions(const rjson::value& attribute_definit
            throw api_error::validation("AttributeType in AttributeDefinitions must be a string");
        }
    }
+    return seen_attribute_names;
 }

-static future<executor::request_return_type> create_table_on_shard0(tracing::trace_state_ptr trace_state, rjson::value request, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper) {
-    assert(this_shard_id() == 0);
+static future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper) {
+    SCYLLA_ASSERT(this_shard_id() == 0);

    // We begin by parsing and validating the content of the CreateTable
    // command. We can't inspect the current database schema at this point
@@ -940,19 +1021,26 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
    validate_table_name(table_name);

    if (table_name.find(executor::INTERNAL_TABLE_PREFIX) == 0) {
-        co_return api_error::validation(format("Prefix {} is reserved for accessing internal tables", executor::INTERNAL_TABLE_PREFIX));
+        co_return api_error::validation(fmt::format("Prefix {} is reserved for accessing internal tables", executor::INTERNAL_TABLE_PREFIX));
    }
    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
    const rjson::value& attribute_definitions = request["AttributeDefinitions"];
-    validate_attribute_definitions(attribute_definitions);
+    // Save the list of AttributeDefinitions in unused_attribute_definitions,
+    // and below remove each one as we see it in a KeySchema of the table or
+    // any of its GSIs or LSIs. If anything remains in this set at the end of
+    // this function, it's an error.
+    std::unordered_set<std::string> unused_attribute_definitions =
+        validate_attribute_definitions(attribute_definitions);

    tracing::add_table_name(trace_state, keyspace_name, table_name);

    schema_builder builder(keyspace_name, table_name);
    auto [hash_key, range_key] = parse_key_schema(request);
    add_column(builder, hash_key, attribute_definitions, column_kind::partition_key);
+    unused_attribute_definitions.erase(hash_key);
    if (!range_key.empty()) {
        add_column(builder, range_key, attribute_definitions, column_kind::clustering_key);
+        unused_attribute_definitions.erase(range_key);
    }
    builder.with_column(bytes(executor::ATTRS_COLUMN_NAME), attrs_type(), column_kind::regular_column);

@@ -965,7 +1053,6 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
    // any table.
    const rjson::value* gsi = rjson::find(request, "GlobalSecondaryIndexes");
    std::vector<schema_builder> view_builders;
-    std::vector<sstring> where_clauses;
    std::unordered_set<std::string> index_names;
    if (gsi) {
        if (!gsi->IsArray()) {
@@ -979,7 +1066,7 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            std::string_view index_name = rjson::to_string_view(*index_name_v);
            auto [it, added] = index_names.emplace(index_name);
            if (!added) {
-                co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
+                co_return api_error::validation(fmt::format("Duplicate IndexName '{}', ", index_name));
            }
            std::string vname(view_name(table_name, index_name));
            elogger.trace("Adding GSI {}", index_name);
@@ -993,6 +1080,7 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
                add_column(builder, view_hash_key, attribute_definitions, column_kind::regular_column);
            }
            add_column(view_builder, view_hash_key, attribute_definitions, column_kind::partition_key);
+            unused_attribute_definitions.erase(view_hash_key);
            if (!view_range_key.empty()) {
                if (partial_schema->get_column_definition(to_bytes(view_range_key)) == nullptr) {
                    // A column that exists in a global secondary index is upgraded from being a map entry
@@ -1005,6 +1093,7 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
                    add_column(builder, view_range_key, attribute_definitions, column_kind::regular_column);
                }
                add_column(view_builder, view_range_key, attribute_definitions, column_kind::clustering_key);
+                unused_attribute_definitions.erase(view_range_key);
            }
            // Base key columns which aren't part of the index's key need to
            // be added to the view nonetheless, as (additional) clustering
@@ -1017,12 +1106,6 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            }
            // GSIs have no tags:
            view_builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>());
-            sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
-            if (!view_range_key.empty()) {
-                where_clause = format("{} AND {} IS NOT NULL", where_clause,
-                    cql3::util::maybe_quote(view_range_key));
-            }
-            where_clauses.push_back(std::move(where_clause));
            view_builders.emplace_back(std::move(view_builder));
        }
    }
@@ -1040,7 +1123,7 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            std::string_view index_name = rjson::to_string_view(*index_name_v);
            auto [it, added] = index_names.emplace(index_name);
            if (!added) {
-                co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
+                co_return api_error::validation(fmt::format("Duplicate IndexName '{}', ", index_name));
            }
            std::string vname(lsi_name(table_name, index_name));
            elogger.trace("Adding LSI {}", index_name);
@@ -1055,9 +1138,11 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
                co_return api_error::validation("LocalSecondaryIndex hash key must match the base table hash key");
            }
            add_column(view_builder, view_hash_key, attribute_definitions, column_kind::partition_key);
+            unused_attribute_definitions.erase(view_hash_key);
            if (view_range_key.empty()) {
                co_return api_error::validation("LocalSecondaryIndex must specify a sort key");
            }
+            unused_attribute_definitions.erase(view_range_key);
            if (view_range_key == hash_key) {
                co_return api_error::validation("LocalSecondaryIndex sort key cannot be the same as hash key");
              }
@@ -1075,12 +1160,6 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            // Note above we don't need to add virtual columns, as all
            // base columns were copied to view. TODO: reconsider the need
            // for virtual columns when we support Projection.
-            sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
-            if (!view_range_key.empty()) {
-                where_clause = format("{} AND {} IS NOT NULL", where_clause,
-                    cql3::util::maybe_quote(view_range_key));
-            }
-            where_clauses.push_back(std::move(where_clause));
            // LSIs have no tags, but Scylla's "synchronous_updates" feature
            // (which an LSIs need), is actually implemented as a tag so we
            // need to add it here:
@@ -1090,6 +1169,12 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
        }
    }

+    if (!unused_attribute_definitions.empty()) {
+        co_return api_error::validation(format(
+            "AttributeDefinitions defines spurious attributes not used by any KeySchema: {}",
+            unused_attribute_definitions));
+    }
+
    // We don't yet support configuring server-side encryption (SSE) via the
    // SSESpecifiction attribute, but an SSESpecification with Enabled=false
    // is simply the default, and should be accepted:
@@ -1119,8 +1204,9 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
    }
    builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));

+    co_await verify_create_permission(client_state);
+
    schema_ptr schema = builder.build();
-    auto where_clause_it = where_clauses.begin();
    for (auto& view_builder : view_builders) {
        // Note below we don't need to add virtual columns, as all
        // base columns were copied to view. TODO: reconsider the need
@@ -1131,8 +1217,7 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            }
        }
        const bool include_all_columns = true;
-        view_builder.with_view_info(*schema, include_all_columns, *where_clause_it);
-        ++where_clause_it;
+        view_builder.with_view_info(*schema, include_all_columns, ""/*where clause*/);
    }

    // FIXME: the following needs to be in a loop. If mm.announce() below
@@ -1157,7 +1242,7 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
        schema_mutations = service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
    } catch (exceptions::already_exists_exception&) {
        if (sp.data_dictionary().has_schema(keyspace_name, table_name)) {
-            co_return api_error::resource_in_use(format("Table {} already exists", table_name));
+            co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
        }
    }
    if (sp.data_dictionary().try_find_table(schema->id())) {
@@ -1179,7 +1264,27 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
        });

    }
-    co_await mm.announce(std::move(schema_mutations), std::move(group0_guard), format("alternator-executor: create {} table", table_name));
+    // If a role is allowed to create a table, we must give it permissions to
+    // use (and eventually delete) the specific table it just created (and
+    // also the view tables). This is known as "auto-grant".
+    // Unfortunately, there is an API mismatch between this code (which uses
+    // separate group0_guard and vector<mutation>) and the function
+    // grant_applicable_permissions() which uses a combined "group0_batch"
+    // structure - so we need to do some ugly back-and-forth conversions
+    // from the pair to the group0_batch and back to the pair :-(
+    service::group0_batch mc(std::move(group0_guard));
+    mc.add_mutations(std::move(schema_mutations));
+    co_await auth::grant_applicable_permissions(
+        *client_state.get_auth_service(), *client_state.user(),
+        auth::make_data_resource(schema->ks_name(), schema->cf_name()), mc);
+    for (const schema_builder& view_builder : view_builders) {
+        co_await auth::grant_applicable_permissions(
+            *client_state.get_auth_service(), *client_state.user(),
+            auth::make_data_resource(view_builder.ks_name(), view_builder.cf_name()), mc);
+    }
+    std::tie(schema_mutations, group0_guard) = co_await std::move(mc).extract();
+
+    co_await mm.announce(std::move(schema_mutations), std::move(group0_guard), fmt::format("alternator-executor: create {} table", table_name));

    co_await mm.wait_for_schema_agreement(sp.local_db(), db::timeout_clock::now() + 10s, nullptr);
    rjson::value status = rjson::empty_object();
@@ -1192,9 +1297,9 @@ future<executor::request_return_type> executor::create_table(client_state& clien
    _stats.api_operations.create_table++;
    elogger.trace("Creating table {}", request);

-    co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container()]
+    co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), client_state_other_shard = client_state.move_to_other_shard()]
                                        (service::migration_manager& mm) mutable -> future<executor::request_return_type> {
-        co_return co_await create_table_on_shard0(tr, std::move(request), sp.local(), mm, g.local());
+        co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local());
    });
 }

@@ -1219,7 +1324,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
        verify_billing_mode(request);
    }

-    co_return co_await _mm.container().invoke_on(0, [&p = _proxy.container(), request = std::move(request), gt = tracing::global_trace_state_ptr(std::move(trace_state))]
+    co_return co_await _mm.container().invoke_on(0, [&p = _proxy.container(), request = std::move(request), gt = tracing::global_trace_state_ptr(std::move(trace_state)), client_state_other_shard = client_state.move_to_other_shard()]
                                                (service::migration_manager& mm) mutable -> future<executor::request_return_type> {
        // FIXME: the following needs to be in a loop. If mm.announce() below
        // fails, we need to retry the whole thing.
@@ -1232,7 +1337,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
        // the ugly but harmless conversion to string_view here is because
        // Seastar's sstring is missing a find(std::string_view) :-()
        if (std::string_view(tab->cf_name()).find(INTERNAL_TABLE_PREFIX) == 0) {
-            co_await coroutine::return_exception(api_error::validation(format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
+            co_await coroutine::return_exception(api_error::validation(fmt::format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
        }

        schema_builder builder(tab);
@@ -1250,7 +1355,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
        }

        auto schema = builder.build();
-
+        co_await verify_permission(client_state_other_shard.get(), schema, auth::permission::ALTER);
        auto m = co_await service::prepare_column_family_update_announcement(p.local(), schema,  std::vector<view_ptr>(), group0_guard.write_timestamp());

        co_await mm.announce(std::move(m), std::move(group0_guard), format("alternator-executor: update {} table", tab->cf_name()));
@@ -1339,7 +1444,7 @@ void validate_value(const rjson::value& v, const char* caller) {
        }
    } else if (type != "L" && type != "M" && type != "BOOL" && type != "NULL") {
        // TODO: can do more sanity checks on the content of the above types.
-        throw api_error::validation(format("{}: unknown type {} for value {}", caller, type, v));
+        throw api_error::validation(fmt::format("{}: unknown type {} for value {}", caller, type, v));
    }
 }

@@ -1533,7 +1638,7 @@ rmw_operation::returnvalues rmw_operation::parse_returnvalues(const rjson::value
    } else if (s == "UPDATED_NEW") {
        return rmw_operation::returnvalues::UPDATED_NEW;
    } else {
-        throw api_error::validation(format("Unrecognized value for ReturnValues: {}", s));
+        throw api_error::validation(fmt::format("Unrecognized value for ReturnValues: {}", s));
    }
 }

@@ -1552,7 +1657,7 @@ rmw_operation::parse_returnvalues_on_condition_check_failure(const rjson::value&
    } else if (s == "ALL_OLD") {
        return rmw_operation::returnvalues_on_condition_check_failure::ALL_OLD;
    } else {
-        throw api_error::validation(format("Unrecognized value for ReturnValuesOnConditionCheckFailure: {}", s));
+        throw api_error::validation(fmt::format("Unrecognized value for ReturnValuesOnConditionCheckFailure: {}", s));
    }
 }

@@ -1676,7 +1781,7 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
        }
    } else if (_write_isolation != write_isolation::LWT_ALWAYS) {
        std::optional<mutation> m = apply(nullptr, api::new_timestamp());
-        assert(m); // !needs_read_before_write, so apply() did not check a condition
+        SCYLLA_ASSERT(m); // !needs_read_before_write, so apply() did not check a condition
        return proxy.mutate(std::vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes).then([this] () mutable {
            return rmw_operation_return(std::move(_return_attributes));
        });
@@ -1808,10 +1913,13 @@ future<executor::request_return_type> executor::put_item(client_state& client_st
    auto op = make_shared<put_item_operation>(_proxy, std::move(request));
    tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
    const bool needs_read_before_write = op->needs_read_before_write();
+
+    co_await verify_permission(client_state, op->schema(), auth::permission::MODIFY);
+
    if (auto shard = op->shard_for_execute(needs_read_before_write); shard) {
        _stats.api_operations.put_item--; // uncount on this shard, will be counted in other shard
        _stats.shard_bounce_for_lwt++;
-        return container().invoke_on(*shard, _ssg,
+        co_return co_await container().invoke_on(*shard, _ssg,
                [request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
                (executor& e) mutable {
            return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
@@ -1824,7 +1932,7 @@ future<executor::request_return_type> executor::put_item(client_state& client_st
            });
        });
    }
-    return op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats).finally([op, start_time, this] {
+    co_return co_await op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats).finally([op, start_time, this] {
        _stats.api_operations.put_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    });
 }
@@ -1897,10 +2005,13 @@ future<executor::request_return_type> executor::delete_item(client_state& client
    auto op = make_shared<delete_item_operation>(_proxy, std::move(request));
    tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
    const bool needs_read_before_write = op->needs_read_before_write();
+
+    co_await verify_permission(client_state, op->schema(), auth::permission::MODIFY);
+
    if (auto shard = op->shard_for_execute(needs_read_before_write); shard) {
        _stats.api_operations.delete_item--; // uncount on this shard, will be counted in other shard
        _stats.shard_bounce_for_lwt++;
-        return container().invoke_on(*shard, _ssg,
+        co_return co_await container().invoke_on(*shard, _ssg,
                [request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
                (executor& e) mutable {
            return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
@@ -1913,7 +2024,7 @@ future<executor::request_return_type> executor::delete_item(client_state& client
            });
        });
    }
-    return op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats).finally([op, start_time, this] {
+    co_return co_await op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats).finally([op, start_time, this] {
        _stats.api_operations.delete_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    });
 }
@@ -2078,10 +2189,11 @@ static future<> do_batch_write(service::storage_proxy& proxy,
 future<executor::request_return_type> executor::batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.batch_write_item++;
    rjson::value& request_items = request["RequestItems"];
+    auto start_time = std::chrono::steady_clock::now();

    std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders;
    mutation_builders.reserve(request_items.MemberCount());
-
+    uint batch_size = 0;
    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
        schema_ptr schema = get_table_from_batch_request(_proxy, it);
        tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
@@ -2089,7 +2201,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
                1, primary_key_hash{schema}, primary_key_equal{schema});
        for (auto& request : it->value.GetArray()) {
            if (!request.IsObject() || request.MemberCount() != 1) {
-                return make_ready_future<request_return_type>(api_error::validation(format("Invalid BatchWriteItem request: {}", request)));
+                co_return api_error::validation(format("Invalid BatchWriteItem request: {}", request));
            }
            auto r = request.MemberBegin();
            const std::string r_name = r->name.GetString();
@@ -2100,9 +2212,10 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
                        item, schema, put_or_delete_item::put_item{}));
                auto mut_key = std::make_pair(mutation_builders.back().second.pk(), mutation_builders.back().second.ck());
                if (used_keys.contains(mut_key)) {
-                    return make_ready_future<request_return_type>(api_error::validation("Provided list of item keys contains duplicates"));
+                    co_return api_error::validation("Provided list of item keys contains duplicates");
                }
                used_keys.insert(std::move(mut_key));
+                batch_size++;
            } else if (r_name == "DeleteRequest") {
                const rjson::value& key = (r->value)["Key"];
                mutation_builders.emplace_back(schema, put_or_delete_item(
@@ -2110,21 +2223,28 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
                auto mut_key = std::make_pair(mutation_builders.back().second.pk(),
                        mutation_builders.back().second.ck());
                if (used_keys.contains(mut_key)) {
-                    return make_ready_future<request_return_type>(api_error::validation("Provided list of item keys contains duplicates"));
+                    co_return api_error::validation("Provided list of item keys contains duplicates");
                }
                used_keys.insert(std::move(mut_key));
+                batch_size++;
            } else {
-                return make_ready_future<request_return_type>(api_error::validation(format("Unknown BatchWriteItem request type: {}", r_name)));
+                co_return api_error::validation(fmt::format("Unknown BatchWriteItem request type: {}", r_name));
            }
        }
    }

-    return do_batch_write(_proxy, _ssg, std::move(mutation_builders), client_state, trace_state, std::move(permit), _stats).then([] () {
+    for (const auto& b : mutation_builders) {
+        co_await verify_permission(client_state, b.first, auth::permission::MODIFY);
+    }
+
+    _stats.api_operations.batch_write_item_batch_total += batch_size;
+    co_return co_await do_batch_write(_proxy, _ssg, std::move(mutation_builders), client_state, trace_state, std::move(permit), _stats).then([start_time, this] () {
        // FIXME: Issue #5650: If we failed writing some of the updates,
        // need to return a list of these failed updates in UnprocessedItems
        // rather than fail the whole write (issue #5650).
        rjson::value ret = rjson::empty_object();
        rjson::add(ret, "UnprocessedItems", rjson::empty_object());
+        _stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
        return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
    });
 }
@@ -2231,13 +2351,13 @@ void attribute_path_map_add(const char* source, attribute_path_map<T>& map, cons
    } else if(!p.has_operators()) {
        // If p is top-level and we already have it or a part of it
        // in map, it's a forbidden overlapping path.
-        throw api_error::validation(format(
+        throw api_error::validation(fmt::format(
            "Invalid {}: two document paths overlap at {}", source, p.root()));
    } else if (it->second.has_value()) {
        // If we're here, it != map.end() && p.has_operators && it->second.has_value().
        // This means the top-level attribute already has a value, and we're
        // trying to add a non-top-level value. It's an overlap.
-        throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p.root()));
+        throw api_error::validation(fmt::format("Invalid {}: two document paths overlap at {}", source, p.root()));
    }
    node* h = &it->second;
    // The second step is to walk h from the top-level node to the inner node
@@ -2297,7 +2417,7 @@ void attribute_path_map_add(const char* source, attribute_path_map<T>& map, cons
    if (it == map.end()) {
        map.emplace(attr, node {std::move(value)});
    } else {
-        throw api_error::validation(format(
+        throw api_error::validation(fmt::format(
            "Invalid {}: Duplicate attribute: {}", source, attr));
    }
 }
@@ -2350,7 +2470,7 @@ static select_type parse_select(const rjson::value& request, table_or_view_type
        }
        return select_type::projection;
    }
-    throw api_error::validation(format("Unknown Select value '{}'. Allowed choices: ALL_ATTRIBUTES, SPECIFIC_ATTRIBUTES, ALL_PROJECTED_ATTRIBUTES, COUNT",
+    throw api_error::validation(fmt::format("Unknown Select value '{}'. Allowed choices: ALL_ATTRIBUTES, SPECIFIC_ATTRIBUTES, ALL_PROJECTED_ATTRIBUTES, COUNT",
        select));
 }

@@ -2719,12 +2839,12 @@ static std::optional<rjson::value> action_result(
                std::string v1_type = get_item_type_string(v1);
                if (v1_type == "N") {
                    if (get_item_type_string(v2) != "N") {
-                        throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                        throw api_error::validation(fmt::format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
                    }
                    result = number_add(v1, v2);
                } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
                    if (get_item_type_string(v2) != v1_type) {
-                        throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                        throw api_error::validation(fmt::format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
                    }
                    result = set_sum(v1, v2);
                } else {
@@ -2810,7 +2930,7 @@ static bool hierarchy_actions(
    } else if (h.has_members()) {
        if (type[0] != 'M' || !v.IsObject()) {
            // A .something on a non-map doesn't work.
-            throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
+            throw api_error::validation(fmt::format("UpdateExpression: document paths not valid for this item:{}", h));
        }
        for (const auto& member : h.get_members()) {
            std::string attr = member.first;
@@ -2997,7 +3117,7 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
            std::string column_name = actions.first;
            const column_definition* cdef = _schema->get_column_definition(to_bytes(column_name));
            if (cdef && cdef->is_primary_key()) {
-                throw api_error::validation(format("UpdateItem cannot update key column {}", column_name));
+                throw api_error::validation(fmt::format("UpdateItem cannot update key column {}", column_name));
            }
            if (actions.second.has_value()) {
                // An action on a top-level attribute column_name. The single
@@ -3021,7 +3141,7 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
                }
                const rjson::value *toplevel = rjson::find(*previous_item, column_name);
                if (!toplevel) {
-                    throw api_error::validation(format("UpdateItem cannot update document path: missing attribute {}",
+                    throw api_error::validation(fmt::format("UpdateItem cannot update document path: missing attribute {}",
                        column_name));
                }
                rjson::value result = rjson::copy(*toplevel);
@@ -3058,7 +3178,7 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
                    validate_value(v2, "AttributeUpdates");
                    std::string v2_type = get_item_type_string(v2);
                    if (v2_type != "SS" && v2_type != "NS" && v2_type != "BS") {
-                        throw api_error::validation(format("AttributeUpdates DELETE operation with Value only valid for sets, got type {}", v2_type));
+                        throw api_error::validation(fmt::format("AttributeUpdates DELETE operation with Value only valid for sets, got type {}", v2_type));
                    }
                    if (v1) {
                        std::optional<rjson::value> result = set_diff(*v1, v2);
@@ -3104,7 +3224,7 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
                    std::string v1_type = get_item_type_string(*v1);
                    std::string v2_type = get_item_type_string(v2);
                    if (v2_type != v1_type) {
-                        throw api_error::validation(format("Operand type mismatch in AttributeUpdates ADD. Expected {}, got {}", v1_type, v2_type));
+                        throw api_error::validation(fmt::format("Operand type mismatch in AttributeUpdates ADD. Expected {}, got {}", v1_type, v2_type));
                    }
                    if (v1_type == "N") {
                        do_update(std::move(column_name), number_add(*v1, v2));
@@ -3123,7 +3243,7 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
                }
            } else {
                throw api_error::validation(
-                        format("Unknown Action value '{}' in AttributeUpdates", action));
+                        fmt::format("Unknown Action value '{}' in AttributeUpdates", action));
            }
        }
    }
@@ -3161,10 +3281,13 @@ future<executor::request_return_type> executor::update_item(client_state& client
    auto op = make_shared<update_item_operation>(_proxy, std::move(request));
    tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
    const bool needs_read_before_write = op->needs_read_before_write();
+
+    co_await verify_permission(client_state, op->schema(), auth::permission::MODIFY);
+
    if (auto shard = op->shard_for_execute(needs_read_before_write); shard) {
        _stats.api_operations.update_item--; // uncount on this shard, will be counted in other shard
        _stats.shard_bounce_for_lwt++;
-        return container().invoke_on(*shard, _ssg,
+        co_return co_await container().invoke_on(*shard, _ssg,
                [request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
                (executor& e) mutable {
            return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
@@ -3177,7 +3300,7 @@ future<executor::request_return_type> executor::update_item(client_state& client
            });
        });
    }
-    return op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats).finally([op, start_time, this] {
+    co_return co_await op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats).finally([op, start_time, this] {
        _stats.api_operations.update_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    });
 }
@@ -3258,8 +3381,8 @@ future<executor::request_return_type> executor::get_item(client_state& client_st
    auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names);
    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem");
-
-    return _proxy.query(schema, std::move(command), std::move(partition_ranges), cl,
+    co_await verify_permission(client_state, schema, auth::permission::SELECT);
+    co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl,
            service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state)).then(
            [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = std::move(attrs_to_get), start_time = std::move(start_time)] (service::storage_proxy::coordinator_query_result qr) mutable {
        _stats.api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
@@ -3328,7 +3451,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
    // the response size, as DynamoDB does.
    _stats.api_operations.batch_get_item++;
    rjson::value& request_items = request["RequestItems"];
-
+    auto start_time = std::chrono::steady_clock::now();
    // We need to validate all the parameters before starting any asynchronous
    // query, and fail the entire request on any parse error. So we parse all
    // the input into our own vector "requests", each element a table_requests
@@ -3361,7 +3484,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
        }
    };
    std::vector<table_requests> requests;
-
+    uint batch_size = 0;
    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
        table_requests rs(get_table_from_batch_request(_proxy, it));
        tracing::add_table_name(trace_state, sstring(executor::KEYSPACE_NAME_PREFIX) + rs.schema->cf_name(), rs.schema->cf_name());
@@ -3375,9 +3498,15 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
            rs.add(key);
            check_key(key, rs.schema);
        }
+        batch_size += rs.requests.size();
        requests.emplace_back(std::move(rs));
    }

+    for (const table_requests& tr : requests) {
+        co_await verify_permission(client_state, tr.schema, auth::permission::SELECT);
+    }
+
+    _stats.api_operations.batch_get_item_batch_total += batch_size;
    // If we got here, all "requests" are valid, so let's start the
    // requests for the different partitions all in parallel.
    std::vector<future<std::vector<rjson::value>>> response_futures;
@@ -3467,6 +3596,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
    if (!some_succeeded && eptr) {
        co_await coroutine::return_exception_ptr(std::move(eptr));
    }
+    _stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    if (is_big(response)) {
        co_return make_streamed(std::move(response));
    } else {
@@ -3776,7 +3906,7 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
 }

 static future<executor::request_return_type> do_query(service::storage_proxy& proxy,
-        schema_ptr schema,
+        schema_ptr table_schema,
        const rjson::value* exclusive_start_key,
        dht::partition_range_vector partition_ranges,
        std::vector<query::clustering_range> ck_bounds,
@@ -3793,33 +3923,50 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr

    tracing::trace(trace_state, "Performing a database query");

+    // Reverse the schema and the clustering bounds as the underlying code expects
+    // reversed queries in the native reversed format.
+    auto query_schema = table_schema;
+    const bool reversed = custom_opts.contains<query::partition_slice::option::reversed>();
+    if (reversed) {
+        query_schema = table_schema->get_reversed();
+
+        std::reverse(ck_bounds.begin(), ck_bounds.end());
+        for (auto& bound : ck_bounds) {
+            bound = query::reverse(bound);
+        }
+    }
+
    if (exclusive_start_key) {
-        partition_key pk = pk_from_json(*exclusive_start_key, schema);
+        partition_key pk = pk_from_json(*exclusive_start_key, table_schema);
        auto pos = position_in_partition::for_partition_start();
-        if (schema->clustering_key_size() > 0) {
-            pos = pos_from_json(*exclusive_start_key, schema);
+        if (table_schema->clustering_key_size() > 0) {
+            pos = pos_from_json(*exclusive_start_key, table_schema);
        }
        old_paging_state = make_lw_shared<service::pager::paging_state>(pk, pos, query::max_partitions, query_id::create_null_id(), service::pager::paging_state::replicas_per_token_range{}, std::nullopt, 0);
    }

+    co_await verify_permission(client_state, table_schema, auth::permission::SELECT);
+
    auto regular_columns = boost::copy_range<query::column_id_vector>(
-            schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
+            table_schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
    auto static_columns = boost::copy_range<query::column_id_vector>(
-            schema->static_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
-    auto selection = cql3::selection::selection::wildcard(schema);
+            table_schema->static_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
+    auto selection = cql3::selection::selection::wildcard(table_schema);
    query::partition_slice::option_set opts = selection->get_query_options();
    opts.add(custom_opts);
    auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts);
-    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice),
+    auto command = ::make_lw_shared<query::read_command>(query_schema->id(), query_schema->version(), partition_slice, proxy.get_max_result_size(partition_slice),
        query::tombstone_limit(proxy.get_tombstone_limit()));

+    elogger.trace("Executing read query (reversed {}): table schema {}, query schema {}", partition_slice.is_reversed(), table_schema->version(), query_schema->version());
+
    auto query_state_ptr = std::make_unique<service::query_state>(client_state, trace_state, std::move(permit));

    // FIXME: should be moved above, set on opts, so get_max_result_size knows it?
    command->slice.options.set<query::partition_slice::option::allow_short_read>();
    auto query_options = std::make_unique<cql3::query_options>(cl, std::vector<cql3::raw_value>{});
    query_options = std::make_unique<cql3::query_options>(std::move(query_options), std::move(old_paging_state));
-    auto p = service::pager::query_pagers::pager(proxy, schema, selection, *query_state_ptr, *query_options, command, std::move(partition_ranges), nullptr);
+    auto p = service::pager::query_pagers::pager(proxy, query_schema, selection, *query_state_ptr, *query_options, command, std::move(partition_ranges), nullptr);

    std::unique_ptr<cql3::result_set> rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout());
    if (!p->is_exhausted()) {
@@ -3829,7 +3976,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
    bool has_filter = filter;
    auto [items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter));
    if (paging_state) {
-        rjson::add(items, "LastEvaluatedKey", encode_paging_state(*schema, *paging_state));
+        rjson::add(items, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state));
    }
    if (has_filter){
        cql_stats.filtered_rows_read_total += p->stats().rows_read_total;
@@ -3843,7 +3990,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
 }

 static dht::token token_for_segment(int segment, int total_segments) {
-    assert(total_segments > 1 && segment >= 0 && segment < total_segments);
+    SCYLLA_ASSERT(total_segments > 1 && segment >= 0 && segment < total_segments);
    uint64_t delta = std::numeric_limits<uint64_t>::max() / total_segments;
    return dht::token::from_int64(std::numeric_limits<int64_t>::min() + delta * segment);
 }
@@ -4001,7 +4148,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
        // NOTICE(sarna): A range starting with given prefix and ending (non-inclusively) with a string "incremented" by a single
        // character at the end. Throws for NUMBER instances.
        if (!ck_cdef.type->is_compatible_with(*utf8_type)) {
-            throw api_error::validation(format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type)));
+            throw api_error::validation(fmt::format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type)));
        }
        return get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef.type);
    }
@@ -4082,13 +4229,13 @@ static std::string_view get_toplevel(const parsed::value& v,
        used_attribute_names.emplace(column_name);
        if (!expression_attribute_names) {
            throw api_error::validation(
-                    format("ExpressionAttributeNames missing, entry '{}' required by KeyConditionExpression",
+                    fmt::format("ExpressionAttributeNames missing, entry '{}' required by KeyConditionExpression",
                            column_name));
        }
        const rjson::value* value = rjson::find(*expression_attribute_names, column_name);
        if (!value || !value->IsString()) {
            throw api_error::validation(
-                    format("ExpressionAttributeNames missing entry '{}' required by KeyConditionExpression",
+                    fmt::format("ExpressionAttributeNames missing entry '{}' required by KeyConditionExpression",
                            column_name));
        }
        column_name = rjson::to_string_view(*value);
@@ -4206,7 +4353,7 @@ calculate_bounds_condition_expression(schema_ptr schema,
            }
            if (f->_function_name != "begins_with") {
                throw api_error::validation(
-                        format("KeyConditionExpression function '{}' not supported",f->_function_name));
+                        fmt::format("KeyConditionExpression function '{}' not supported",f->_function_name));
            }
            if (f->_parameters.size() != 2 || !f->_parameters[0].is_path() ||
                    !f->_parameters[1].is_constant()) {
@@ -4271,7 +4418,7 @@ calculate_bounds_condition_expression(schema_ptr schema,
                ck_bounds.push_back(query::clustering_range(ck));
            } else {
                throw api_error::validation(
-                        format("KeyConditionExpression condition on non-key attribute {}", key));
+                        fmt::format("KeyConditionExpression condition on non-key attribute {}", key));
            }
            continue;
        }
@@ -4279,10 +4426,10 @@ calculate_bounds_condition_expression(schema_ptr schema,
        // are allowed *only* on the clustering key:
        if (sstring(key) == pk_cdef.name_as_text()) {
            throw api_error::validation(
-                    format("KeyConditionExpression only '=' condition is supported on partition key {}", key));
+                    fmt::format("KeyConditionExpression only '=' condition is supported on partition key {}", key));
        } else if (!ck_cdef || sstring(key) != ck_cdef->name_as_text()) {
            throw api_error::validation(
-                    format("KeyConditionExpression condition on non-key attribute {}", key));
+                    fmt::format("KeyConditionExpression condition on non-key attribute {}", key));
        }
        if (!ck_bounds.empty()) {
            throw api_error::validation(
@@ -4306,7 +4453,7 @@ calculate_bounds_condition_expression(schema_ptr schema,
                // begins_with() supported on bytes and strings (both stored
                // in the database as strings) but not on numbers.
                throw api_error::validation(
-                        format("KeyConditionExpression begins_with() not supported on type {}",
+                        fmt::format("KeyConditionExpression begins_with() not supported on type {}",
                                type_to_string(ck_cdef->type)));
            } else if (raw_value.empty()) {
                ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides());
@@ -4439,8 +4586,10 @@ future<executor::request_return_type> executor::list_tables(client_state& client

    auto tables = _proxy.data_dictionary().get_tables(); // hold on to temporary, table_names isn't a container, it's a view
    auto table_names = tables
-            | boost::adaptors::filtered([] (data_dictionary::table t) {
-                        return t.schema()->ks_name().find(KEYSPACE_NAME_PREFIX) == 0 && !t.schema()->is_view();
+            | boost::adaptors::filtered([this] (data_dictionary::table t) {
+                        return t.schema()->ks_name().find(KEYSPACE_NAME_PREFIX) == 0 &&
+                            !t.schema()->is_view() &&
+                            !cdc::is_log_for_some_table(_proxy.local_db(), t.schema()->ks_name(), t.schema()->cf_name());
                    })
            | boost::adaptors::transformed([] (data_dictionary::table t) {
                        return t.schema()->cf_name();
@@ -4532,7 +4681,7 @@ future<executor::request_return_type> executor::describe_continuous_backups(clie
        validate_table_name(table_name);

        throw api_error::table_not_found(
-                format("Table {} not found", table_name));
+                fmt::format("Table {} not found", table_name));
    }
    rjson::value desc = rjson::empty_object();
    rjson::add(desc, "ContinuousBackupsStatus", "DISABLED");
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -262,4 +262,9 @@ public:
 // add more than a couple of levels in its own output construction.
 bool is_big(const rjson::value& val, int big_size = 100'000);

+// Check CQL's Role-Based Access Control (RBAC) permission (MODIFY,
+// SELECT, DROP, etc.) on the given table. When permission is denied an
+// appropriate user-readable api_error::access_denied is thrown.
+future<> verify_permission(const service::client_state&, const schema_ptr&, auth::permission);
+
 }
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -57,10 +57,10 @@ static Result parse(const char* input_name, std::string_view input, Func&& f) {
        // TODO: displayRecognitionError could set a position inside the
        // expressions_syntax_error in throws, and we could use it here to
        // mark the broken position in 'input'.
-        throw expressions_syntax_error(format("Failed parsing {} '{}': {}",
+        throw expressions_syntax_error(fmt::format("Failed parsing {} '{}': {}",
            input_name, input, e.what()));
    } catch (...) {
-        throw expressions_syntax_error(format("Failed parsing {} '{}': {}",
+        throw expressions_syntax_error(fmt::format("Failed parsing {} '{}': {}",
            input_name, input, std::current_exception()));
    }
 }
@@ -160,12 +160,12 @@ static std::optional<std::string> resolve_path_component(const std::string& colu
    if (column_name.size() > 0 && column_name.front() == '#') {
        if (!expression_attribute_names) {
            throw api_error::validation(
-                    format("ExpressionAttributeNames missing, entry '{}' required by expression", column_name));
+                    fmt::format("ExpressionAttributeNames missing, entry '{}' required by expression", column_name));
        }
        const rjson::value* value = rjson::find(*expression_attribute_names, column_name);
        if (!value || !value->IsString()) {
            throw api_error::validation(
-                    format("ExpressionAttributeNames missing entry '{}' required by expression", column_name));
+                    fmt::format("ExpressionAttributeNames missing entry '{}' required by expression", column_name));
        }
        used_attribute_names.emplace(column_name);
        return std::string(rjson::to_string_view(*value));
@@ -202,16 +202,16 @@ static void resolve_constant(parsed::constant& c,
        [&] (const std::string& valref) {
            if (!expression_attribute_values) {
                throw api_error::validation(
-                        format("ExpressionAttributeValues missing, entry '{}' required by expression", valref));
+                        fmt::format("ExpressionAttributeValues missing, entry '{}' required by expression", valref));
            }
            const rjson::value* value = rjson::find(*expression_attribute_values, valref);
            if (!value) {
                throw api_error::validation(
-                        format("ExpressionAttributeValues missing entry '{}' required by expression", valref));
+                        fmt::format("ExpressionAttributeValues missing entry '{}' required by expression", valref));
            }
            if (value->IsNull()) {
                throw api_error::validation(
-                        format("ExpressionAttributeValues null value for entry '{}' required by expression", valref));
+                        fmt::format("ExpressionAttributeValues null value for entry '{}' required by expression", valref));
            }
            validate_value(*value, "ExpressionAttributeValues");
            used_attribute_values.emplace(valref);
@@ -708,7 +708,7 @@ rjson::value calculate_value(const parsed::value& v,
            auto function_it = function_handlers.find(std::string_view(f._function_name));
            if (function_it == function_handlers.end()) {
                throw api_error::validation(
-                        format("{}: unknown function '{}' called.", caller, f._function_name));
+                        fmt::format("{}: unknown function '{}' called.", caller, f._function_name));
            }
            return function_it->second(caller, previous_item, f);
        },
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -143,17 +143,17 @@ static big_decimal parse_and_validate_number(std::string_view s) {
        big_decimal ret(s);
        auto [magnitude, precision] = internal::get_magnitude_and_precision(s);
        if (magnitude > 125) {
-            throw api_error::validation(format("Number overflow: {}. Attempting to store a number with magnitude larger than supported range.", s));
+            throw api_error::validation(fmt::format("Number overflow: {}. Attempting to store a number with magnitude larger than supported range.", s));
        }
        if (magnitude < -130) {
-            throw api_error::validation(format("Number underflow: {}. Attempting to store a number with magnitude lower than supported range.", s));
+            throw api_error::validation(fmt::format("Number underflow: {}. Attempting to store a number with magnitude lower than supported range.", s));
        }
        if (precision > 38) {
-            throw api_error::validation(format("Number too precise: {}. Attempting to store a number with more significant digits than supported.", s));
+            throw api_error::validation(fmt::format("Number too precise: {}. Attempting to store a number with more significant digits than supported.", s));
        }
        return ret;
    } catch (const marshal_exception& e) {
-        throw api_error::validation(format("The parameter cannot be converted to a numeric value: {}", s));
+        throw api_error::validation(fmt::format("The parameter cannot be converted to a numeric value: {}", s));
    }

 }
@@ -265,7 +265,7 @@ bytes get_key_column_value(const rjson::value& item, const column_definition& co
    std::string column_name = column.name_as_text();
    const rjson::value* key_typed_value = rjson::find(item, column_name);
    if (!key_typed_value) {
-        throw api_error::validation(format("Key column {} not found", column_name));
+        throw api_error::validation(fmt::format("Key column {} not found", column_name));
    }
    return get_key_from_typed_value(*key_typed_value, column);
 }
@@ -277,19 +277,26 @@ bytes get_key_column_value(const rjson::value& item, const column_definition& co
 // mentioned in the exception message).
 // If the type does match, a reference to the encoded value is returned.
 static const rjson::value& get_typed_value(const rjson::value& key_typed_value, std::string_view type_str, std::string_view name, std::string_view value_name) {
-    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1 ||
-            !key_typed_value.MemberBegin()->value.IsString()) {
+    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1) {
        throw api_error::validation(
-                format("Malformed value object for {} {}: {}",
+                fmt::format("Malformed value object for {} {}: {}",
                        value_name, name, key_typed_value));
    }

    auto it = key_typed_value.MemberBegin();
    if (rjson::to_string_view(it->name) != type_str) {
        throw api_error::validation(
-                format("Type mismatch: expected type {} for {} {}, got type {}",
+                fmt::format("Type mismatch: expected type {} for {} {}, got type {}",
                        type_str, value_name, name, it->name));
    }
+    // We assume this function is called just for key types (S, B, N), and
+    // all of those always have a string value in the JSON.
+    if (!it->value.IsString()) {
+        throw api_error::validation(
+            fmt::format("Malformed value object for {} {}: {}",
+                    value_name, name, key_typed_value));
+
+    }
    return it->value;
 }

@@ -395,16 +402,16 @@ position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema)

 big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic) {
    if (!v.IsObject() || v.MemberCount() != 1) {
-        throw api_error::validation(format("{}: invalid number object", diagnostic));
+        throw api_error::validation(fmt::format("{}: invalid number object", diagnostic));
    }
    auto it = v.MemberBegin();
    if (it->name != "N") {
-        throw api_error::validation(format("{}: expected number, found type '{}'", diagnostic, it->name));
+        throw api_error::validation(fmt::format("{}: expected number, found type '{}'", diagnostic, it->name));
    }
    if (!it->value.IsString()) {
        // We shouldn't reach here. Callers normally validate their input
        // earlier with validate_value().
-        throw api_error::validation(format("{}: improperly formatted number constant", diagnostic));
+        throw api_error::validation(fmt::format("{}: improperly formatted number constant", diagnostic));
    }
    big_decimal ret = parse_and_validate_number(rjson::to_string_view(it->value));
    return ret;
@@ -485,7 +492,7 @@ rjson::value set_sum(const rjson::value& v1, const rjson::value& v2) {
    auto [set1_type, set1] = unwrap_set(v1);
    auto [set2_type, set2] = unwrap_set(v2);
    if (set1_type != set2_type) {
-        throw api_error::validation(format("Mismatched set types: {} and {}", set1_type, set2_type));
+        throw api_error::validation(fmt::format("Mismatched set types: {} and {}", set1_type, set2_type));
    }
    if (!set1 || !set2) {
        throw api_error::validation("UpdateExpression: ADD operation for sets must be given sets as arguments");
@@ -513,7 +520,7 @@ std::optional<rjson::value> set_diff(const rjson::value& v1, const rjson::value&
    auto [set1_type, set1] = unwrap_set(v1);
    auto [set2_type, set2] = unwrap_set(v2);
    if (set1_type != set2_type) {
-        throw api_error::validation(format("Set DELETE type mismatch: {} and {}", set1_type, set2_type));
+        throw api_error::validation(fmt::format("Set DELETE type mismatch: {} and {}", set1_type, set2_type));
    }
    if (!set1 || !set2) {
        throw api_error::validation("UpdateExpression: DELETE operation can only be performed on a set");
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -17,7 +17,10 @@
 #include <seastar/util/short_streams.hh>
 #include "seastarx.hh"
 #include "error.hh"
+#include "service/client_state.hh"
 #include "service/qos/service_level_controller.hh"
+#include "utils/assert.hh"
+#include "timeout_config.hh"
 #include "utils/rjson.hh"
 #include "auth.hh"
 #include <cctype>
@@ -211,7 +214,10 @@ protected:
        sstring local_dc = topology.get_datacenter();
        std::unordered_set<gms::inet_address> local_dc_nodes = topology.get_datacenter_endpoints().at(local_dc);
        for (auto& ip : local_dc_nodes) {
-            if (_gossiper.is_alive(ip)) {
+            // Note that it's not enough for the node to be is_alive() - a
+            // node joining the cluster is also "alive" but not responsive to
+            // requests. We need alive *and* normal. See #19694, #21538.
+            if (_gossiper.is_alive(ip) && _gossiper.is_normal(ip)) {
                // Use the gossiped broadcast_rpc_address if available instead
                // of the internal IP address "ip". See discussion in #18711.
                rjson::push_back(results, rjson::from_string(_gossiper.get_rpc_address(ip)));
@@ -257,7 +263,7 @@ future<std::string> server::verify_signature(const request& req, const chunked_c
    std::string_view authorization_header = authorization_it->second;
    auto pos = authorization_header.find_first_of(' ');
    if (pos == std::string_view::npos || authorization_header.substr(0, pos) != "AWS4-HMAC-SHA256") {
-        throw api_error::invalid_signature(format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header));
+        throw api_error::invalid_signature(fmt::format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header));
    }
    authorization_header.remove_prefix(pos+1);
    std::string credential;
@@ -292,7 +298,7 @@ future<std::string> server::verify_signature(const request& req, const chunked_c

    std::vector<std::string_view> credential_split = split(credential, '/');
    if (credential_split.size() != 5) {
-        throw api_error::validation(format("Incorrect credential information format: {}", credential));
+        throw api_error::validation(fmt::format("Incorrect credential information format: {}", credential));
    }
    std::string user(credential_split[0]);
    std::string datestamp(credential_split[1]);
@@ -377,7 +383,7 @@ static tracing::trace_state_ptr maybe_trace_query(service::client_state& client_
        std::string buf;
        tracing::add_session_param(trace_state, "alternator_op", op);
        tracing::add_query(trace_state, truncated_content_view(query, buf));
-        tracing::begin(trace_state, format("Alternator {}", op), client_state.get_client_address());
+        tracing::begin(trace_state, seastar::format("Alternator {}", op), client_state.get_client_address());
        if (!username.empty()) {
            tracing::set_username(trace_state, auth::authenticated_user(username));
        }
@@ -402,7 +408,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
        ++_executor._stats.requests_blocked_memory;
    }
    auto units = co_await std::move(units_fut);
-    assert(req->content_stream);
+    SCYLLA_ASSERT(req->content_stream);
    chunked_content content = co_await util::read_entire_stream(*req->content_stream);
    auto username = co_await verify_signature(*req, content);

@@ -413,7 +419,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
    auto callback_it = _callbacks.find(op);
    if (callback_it == _callbacks.end()) {
        _executor._stats.unsupported_operations++;
-        co_return api_error::unknown_operation(format("Unsupported operation {}", op));
+        co_return api_error::unknown_operation(fmt::format("Unsupported operation {}", op));
    }
    if (_pending_requests.get_count() >= _max_concurrent_requests) {
        _executor._stats.requests_shed++;
@@ -421,11 +427,11 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
    }
    _pending_requests.enter();
    auto leave = defer([this] () noexcept { _pending_requests.leave(); });
-    //FIXME: Client state can provide more context, e.g. client's endpoint address
-    // We use unique_ptr because client_state cannot be moved or copied
-    executor::client_state client_state = username.empty()
-        ? service::client_state{service::client_state::internal_tag()}
-        : service::client_state{service::client_state::internal_tag(), _auth_service, _sl_controller, username};
+    executor::client_state client_state(service::client_state::external_tag(),
+        _auth_service, &_sl_controller, _timeout_config.current_values(), req->get_client_address());
+    if (!username.empty()) {
+        client_state.set_login(auth::authenticated_user(username));
+    }
    co_await client_state.maybe_update_per_service_level_params();

    tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content);
@@ -472,6 +478,7 @@ server::server(executor& exec, service::storage_proxy& proxy, gms::gossiper& gos
        , _enforce_authorization(false)
        , _enabled_servers{}
        , _pending_requests{}
+        , _timeout_config(_proxy.data_dictionary().get_config())
      , _callbacks{
        {"CreateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
            return e.create_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
@@ -636,7 +643,7 @@ future<> server::json_parser::stop() {

 const char* api_error::what() const noexcept {
    if (_what_string.empty()) {
-        _what_string = format("{} {}: {}", std::to_underlying(_http_code), _type, _msg);
+        _what_string = fmt::format("{} {}: {}", std::to_underlying(_http_code), _type, _msg);
    }
    return _what_string.c_str();
 }
--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -42,6 +42,11 @@ class server {
    bool _enforce_authorization;
    utils::small_vector<std::reference_wrapper<seastar::httpd::http_server>, 2> _enabled_servers;
    gate _pending_requests;
+    // In some places we will need a CQL updateable_timeout_config object even
+    // though it isn't really relevant for Alternator which defines its own
+    // timeouts separately. We can create this object only once.
+    updateable_timeout_config _timeout_config;
+
    alternator_callbacks_map _callbacks;

    semaphore* _memory_limiter;
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -67,6 +67,8 @@ stats::stats() : api_operations{} {
            OPERATION_LATENCY(get_item_latency, "GetItem")
            OPERATION_LATENCY(delete_item_latency, "DeleteItem")
            OPERATION_LATENCY(update_item_latency, "UpdateItem")
+            OPERATION_LATENCY(batch_write_item_latency, "BatchWriteItem")
+            OPERATION_LATENCY(batch_get_item_latency, "BatchGetItem")
            OPERATION(list_streams, "ListStreams")
            OPERATION(describe_stream, "DescribeStream")
            OPERATION(get_shard_iterator, "GetShardIterator")
@@ -94,6 +96,10 @@ stats::stats() : api_operations{} {
                    seastar::metrics::description("number of rows read and matched during filtering operations")),
            seastar::metrics::make_total_operations("filtered_rows_dropped_total", [this] { return cql_stats.filtered_rows_read_total - cql_stats.filtered_rows_matched_total; },
                    seastar::metrics::description("number of rows read and dropped during filtering operations")),
+                    seastar::metrics::make_counter("batch_item_count", seastar::metrics::description("The total number of items processed across all batches"),{op("BatchWriteItem")},
+                            api_operations.batch_write_item_batch_total).set_skip_when_empty(),
+                    seastar::metrics::make_counter("batch_item_count", seastar::metrics::description("The total number of items processed across all batches"),{op("BatchGetItem")},
+                            api_operations.batch_get_item_batch_total).set_skip_when_empty(),
    });
 }

--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -26,6 +26,8 @@ public:
    struct {
        uint64_t batch_get_item = 0;
        uint64_t batch_write_item = 0;
+        uint64_t batch_get_item_batch_total = 0;
+        uint64_t batch_write_item_batch_total = 0;
        uint64_t create_backup = 0;
        uint64_t create_global_table = 0;
        uint64_t create_table = 0;
@@ -69,6 +71,8 @@ public:
        utils::timed_rate_moving_average_summary_and_histogram get_item_latency;
        utils::timed_rate_moving_average_summary_and_histogram delete_item_latency;
        utils::timed_rate_moving_average_summary_and_histogram update_item_latency;
+        utils::timed_rate_moving_average_summary_and_histogram batch_write_item_latency;
+        utils::timed_rate_moving_average_summary_and_histogram batch_get_item_latency;
        utils::timed_rate_moving_average_summary_and_histogram get_records_latency;
    } api_operations;
    // Miscellaneous event counters
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -13,6 +13,7 @@

 #include <seastar/json/formatter.hh>

+#include "auth/permission.hh"
 #include "db/config.hh"

 #include "cdc/log.hh"
@@ -818,11 +819,13 @@ future<executor::request_return_type> executor::get_records(client_state& client
    }

    if (!schema || !base || !is_alternator_keyspace(schema->ks_name())) {
-        throw api_error::resource_not_found(fmt::to_string(iter.table));
+        co_return api_error::resource_not_found(fmt::to_string(iter.table));
    }

    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());

+    co_await verify_permission(client_state, schema, auth::permission::SELECT);
+
    db::consistency_level cl = db::consistency_level::LOCAL_QUORUM;
    partition_key pk = iter.shard.id.to_partition_key(*schema);

@@ -887,7 +890,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
            query::tombstone_limit(_proxy.get_tombstone_limit()), query::row_limit(limit * mul));

-    return _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
+    co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
            [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {       
        cql3::selection::result_set_builder builder(*selection, gc_clock::now());
        query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -26,6 +26,7 @@
 #include "log.hh"
 #include "gc_clock.hh"
 #include "replica/database.hh"
+#include "service/client_state.hh"
 #include "service_permit.hh"
 #include "timestamp.hh"
 #include "service/storage_proxy.hh"
@@ -35,6 +36,7 @@
 #include "mutation/mutation.hh"
 #include "types/types.hh"
 #include "types/map.hh"
+#include "utils/assert.hh"
 #include "utils/rjson.hh"
 #include "utils/big_decimal.hh"
 #include "cql3/selection/selection.hh"
@@ -97,6 +99,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
    }
    sstring attribute_name(v->GetString(), v->GetStringLength());

+    co_await verify_permission(client_state, schema, auth::permission::ALTER);
    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [&](std::map<sstring, sstring>& tags_map) {
        if (enabled) {
            if (tags_map.contains(TTL_TAG_KEY)) {
@@ -312,7 +315,7 @@ static size_t random_offset(size_t min, size_t max) {
 // this range's primary node is down. For this we need to return not just
 // a list of this node's secondary ranges - but also the primary owner of
 // each of those ranges.
-static std::vector<std::pair<dht::token_range, gms::inet_address>> get_secondary_ranges(
+static future<std::vector<std::pair<dht::token_range, gms::inet_address>>> get_secondary_ranges(
        const locator::effective_replication_map_ptr& erm,
        gms::inet_address ep) {
    const auto& tm = *erm->get_token_metadata_ptr();
@@ -323,6 +326,7 @@ static std::vector<std::pair<dht::token_range, gms::inet_address>> get_secondary
    }
    auto prev_tok = sorted_tokens.back();
    for (const auto& tok : sorted_tokens) {
+        co_await coroutine::maybe_yield();
        inet_address_vector_replica_set eps = erm->get_natural_endpoints(tok);
        if (eps.size() <= 1 || eps[1] != ep) {
            prev_tok = tok;
@@ -350,7 +354,7 @@ static std::vector<std::pair<dht::token_range, gms::inet_address>> get_secondary
        }
        prev_tok = tok;
    }
-    return ret;
+    co_return ret;
 }


@@ -386,63 +390,63 @@ static std::vector<std::pair<dht::token_range, gms::inet_address>> get_secondary
 //
 // FIXME: Check if this algorithm is safe with tablet migration.
 // https://github.com/scylladb/scylladb/issues/16567
-enum primary_or_secondary_t {primary, secondary};
-template<primary_or_secondary_t primary_or_secondary>
-class token_ranges_owned_by_this_shard {
-    // ranges_holder_primary holds just the primary ranges themselves
-    class ranges_holder_primary {
-        const dht::token_range_vector _token_ranges;
-     public:
-        ranges_holder_primary(const locator::vnode_effective_replication_map_ptr& erm, gms::gossiper& g, gms::inet_address ep)
-            : _token_ranges(erm->get_primary_ranges(ep)) {}
-        std::size_t size() const { return _token_ranges.size(); }
-        const dht::token_range& operator[](std::size_t i) const {
-            return _token_ranges[i];
-        }
-        bool should_skip(std::size_t i) const {
-            return false;
-        }
-    };
-    // ranges_holder<secondary> holds the secondary token ranges plus each
-    // range's primary owner, needed to implement should_skip().
-    class ranges_holder_secondary {
-        std::vector<std::pair<dht::token_range, gms::inet_address>> _token_ranges;
-        gms::gossiper& _gossiper;
-     public:
-        ranges_holder_secondary(const locator::effective_replication_map_ptr& erm, gms::gossiper& g, gms::inet_address ep)
-            : _token_ranges(get_secondary_ranges(erm, ep))
-            , _gossiper(g) {}
-        std::size_t size() const { return _token_ranges.size(); }
-        const dht::token_range& operator[](std::size_t i) const {
-            return _token_ranges[i].first;
-        }
-        // range i should be skipped if its primary owner is alive.
-        bool should_skip(std::size_t i) const {
-            return _gossiper.is_alive(_token_ranges[i].second);
-        }
-    };

+// ranges_holder_primary holds just the primary ranges themselves; build it with make(), which awaits erm->get_primary_ranges(ep).
+class ranges_holder_primary {
+    dht::token_range_vector _token_ranges;
+public:
+    explicit ranges_holder_primary(dht::token_range_vector token_ranges) : _token_ranges(std::move(token_ranges)) {}
+    static future<ranges_holder_primary> make(const locator::vnode_effective_replication_map_ptr& erm, gms::inet_address ep) {
+        co_return ranges_holder_primary(co_await erm->get_primary_ranges(ep));
+    }
+    std::size_t size() const { return _token_ranges.size(); }
+    const dht::token_range& operator[](std::size_t i) const {
+        return _token_ranges[i];
+    }
+    bool should_skip(std::size_t i) const {
+        return false; // no primary range is ever skipped; the index is ignored
+    }
+};
+// ranges_holder_secondary holds the secondary token ranges plus each
+// range's primary owner, needed to implement should_skip().
+class ranges_holder_secondary {
+    std::vector<std::pair<dht::token_range, gms::inet_address>> _token_ranges; // (range, primary owner of that range)
+    const gms::gossiper& _gossiper;
+public:
+    explicit ranges_holder_secondary(std::vector<std::pair<dht::token_range, gms::inet_address>> token_ranges, const gms::gossiper& g)
+        : _token_ranges(std::move(token_ranges))
+        , _gossiper(g) {}
+    static future<ranges_holder_secondary> make(const locator::effective_replication_map_ptr& erm, gms::inet_address ep, const gms::gossiper& g) {
+        co_return ranges_holder_secondary(co_await get_secondary_ranges(erm, ep), g);
+    }
+    std::size_t size() const { return _token_ranges.size(); }
+    const dht::token_range& operator[](std::size_t i) const {
+        return _token_ranges[i].first;
+    }
+    // range i should be skipped if its primary owner is alive.
+    bool should_skip(std::size_t i) const {
+        return _gossiper.is_alive(_token_ranges[i].second);
+    }
+};
+
+template<class primary_or_secondary_t>
+class token_ranges_owned_by_this_shard {
    schema_ptr _s;
    locator::effective_replication_map_ptr _erm;
    // _token_ranges will contain a list of token ranges owned by this node.
    // We'll further need to split each such range to the pieces owned by
    // the current shard, using _intersecter.
-    using ranges_holder = std::conditional_t<
-            primary_or_secondary == primary_or_secondary_t::primary,
-            ranges_holder_primary,
-            ranges_holder_secondary>;
-    const ranges_holder _token_ranges;
+    const primary_or_secondary_t _token_ranges;
    // NOTICE: _range_idx is used modulo _token_ranges size when accessing
    // the data to ensure that it doesn't go out of bounds
    size_t _range_idx;
    size_t _end_idx;
    std::optional<dht::selective_token_range_sharder> _intersecter;
 public:
-    token_ranges_owned_by_this_shard(replica::database& db, gms::gossiper& g, schema_ptr s)
+    token_ranges_owned_by_this_shard(schema_ptr s, primary_or_secondary_t token_ranges)
        :  _s(s)
        , _erm(s->table().get_effective_replication_map())
-        , _token_ranges(db.find_keyspace(s->ks_name()).get_vnode_effective_replication_map(),
-                g, _erm->get_topology().my_address())
+        , _token_ranges(std::move(token_ranges))
        , _range_idx(random_offset(0, _token_ranges.size() - 1))
        , _end_idx(_range_idx + _token_ranges.size())
    {
@@ -498,6 +502,7 @@ struct scan_ranges_context {
    bytes column_name;
    std::optional<std::string> member;

+    service::client_state internal_client_state;
    ::shared_ptr<cql3::selection::selection> selection;
    std::unique_ptr<service::query_state> query_state_ptr;
    std::unique_ptr<cql3::query_options> query_options;
@@ -507,6 +512,7 @@ struct scan_ranges_context {
        : s(s)
        , column_name(column_name)
        , member(member)
+        , internal_client_state(service::client_state::internal_tag())
    {
        // FIXME: don't read the entire items - read only parts of it.
        // We must read the key columns (to be able to delete) and also
@@ -525,10 +531,9 @@ struct scan_ranges_context {
        std::vector<query::clustering_range> ck_bounds{query::clustering_range::make_open_ended_both_sides()};
        auto partition_slice = query::partition_slice(std::move(ck_bounds), {}, std::move(regular_columns), opts);
        command = ::make_lw_shared<query::read_command>(s->id(), s->version(), partition_slice, proxy.get_max_result_size(partition_slice), query::tombstone_limit(proxy.get_tombstone_limit()));
-        executor::client_state client_state{executor::client_state::internal_tag()};
        tracing::trace_state_ptr trace_state;
        // NOTICE: empty_service_permit is used because the TTL service has fixed parallelism
-        query_state_ptr = std::make_unique<service::query_state>(client_state, trace_state, empty_service_permit());
+        query_state_ptr = std::make_unique<service::query_state>(internal_client_state, trace_state, empty_service_permit());
        // FIXME: What should we do on multi-DC? Will we run the expiration on the same ranges on all
        // DCs or only once for each range? If the latter, we need to change the CLs in the
        // scanner and deleter.
@@ -551,7 +556,7 @@ static future<> scan_table_ranges(
        expiration_service::stats& expiration_stats)
 {
    const schema_ptr& s = scan_ctx.s;
-    assert (partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
+    SCYLLA_ASSERT (partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
    auto p = service::pager::query_pagers::pager(proxy, s, scan_ctx.selection, *scan_ctx.query_state_ptr,
            *scan_ctx.query_options, scan_ctx.command, std::move(partition_ranges), nullptr);
    while (!p->is_exhausted()) {
@@ -724,7 +729,9 @@ static future<bool> scan_table(
    expiration_stats.scan_table++;
    // FIXME: need to pace the scan, not do it all at once.
    scan_ranges_context scan_ctx{s, proxy, std::move(column_name), std::move(member)};
-    token_ranges_owned_by_this_shard<primary> my_ranges(db.real_database(), gossiper, s);
+    auto erm = db.real_database().find_keyspace(s->ks_name()).get_vnode_effective_replication_map();
+    auto my_address = erm->get_topology().my_address();
+    token_ranges_owned_by_this_shard my_ranges(s, co_await ranges_holder_primary::make(erm, my_address));
    while (std::optional<dht::partition_range> range = my_ranges.next_partition_range()) {
        // Note that because of issue #9167 we need to run a separate
        // query on each partition range, and can't pass several of
@@ -744,7 +751,7 @@ static future<bool> scan_table(
    // by tasking another node to take over scanning of the dead node's primary
    // ranges. What we do here is that this node will also check expiration
    // on its *secondary* ranges - but only those whose primary owner is down.
-    token_ranges_owned_by_this_shard<secondary> my_secondary_ranges(db.real_database(), gossiper, s);
+    token_ranges_owned_by_this_shard my_secondary_ranges(s, co_await ranges_holder_secondary::make(erm, my_address, gossiper));
    while (std::optional<dht::partition_range> range = my_secondary_ranges.next_partition_range()) {
        expiration_stats.secondary_ranges_scanned++;
        dht::partition_range_vector partition_ranges;
--- a/api/CMakeLists.txt
+++ b/api/CMakeLists.txt
@@ -7,6 +7,7 @@ set(swagger_files
  api-doc/commitlog.json
  api-doc/compaction_manager.json
  api-doc/config.json
+  api-doc/cql_server_test.json
  api-doc/endpoint_snitch_info.json
  api-doc/error_injection.json
  api-doc/failure_detector.json
@@ -46,6 +47,7 @@ target_sources(api
    commitlog.cc
    compaction_manager.cc
    config.cc
+    cql_server_test.cc
    endpoint_snitch.cc
    error_injection.cc
    authorization_cache.cc
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -92,6 +92,14 @@
                     "type":"boolean",
                     "paramType":"query"
                  },
+                  {
+                     "name":"consider_only_existing_data",
+                     "description":"Set to \"true\" to flush all memtables and force tombstone garbage collection to check only the sstables being compacted (false by default). The memtable, commitlog and other uncompacted sstables will not be checked during tombstone garbage collection.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  },
                  {
                     "name":"split_output",
                     "description":"true if the output of the major compaction should be split in several sstables",
--- a/api/api-doc/cql_server_test.json
+++ b/api/api-doc/cql_server_test.json
@@ -0,0 +1,26 @@
+{
+    "apiVersion":"0.0.1",
+    "swaggerVersion":"1.2",
+    "basePath":"{{Protocol}}://{{Host}}",
+    "resourcePath":"/cql_server_test",
+    "produces":[
+        "application/json"
+    ],
+    "apis":[
+        {
+            "path":"/cql_server_test/connections_params",
+            "operations":[
+                {
+                    "method":"GET",
+                    "summary":"Get service level params of each CQL connection",
+                    "type":"connections_service_level_params",
+                    "nickname":"connections_params",
+                    "produces":[
+                        "application/json"
+                    ],
+                    "parameters":[]
+                }
+            ]
+        }
+    ]
+}
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -741,11 +741,123 @@
                     "allowMultiple":false,
                     "type":"boolean",
                     "paramType":"query"
+                  },
+                  {
+                     "name":"consider_only_existing_data",
+                     "description":"Set to \"true\" to flush all memtables and force tombstone garbage collection to check only the sstables being compacted (false by default). The memtable, commitlog and other uncompacted sstables will not be checked during tombstone garbage collection.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
                  }
               ]
            }
         ]
      },
+      {
+          "path":"/storage_service/backup",
+          "operations":[
+              {
+                  "method":"POST",
+                  "summary":"Starts copying SSTables from a specified keyspace to a designated bucket in object storage",
+                  "type":"string",
+                  "nickname":"start_backup",
+                  "produces":[
+                      "application/json"
+                  ],
+                  "parameters":[
+                      {
+                          "name":"endpoint",
+                          "description":"ID of the configured object storage endpoint to copy sstables to",
+                          "required":true,
+                          "allowMultiple":false,
+                          "type":"string",
+                          "paramType":"query"
+                      },
+                      {
+                          "name":"bucket",
+                          "description":"Name of the bucket to back up SSTables to",
+                          "required":true,
+                          "allowMultiple":false,
+                          "type":"string",
+                          "paramType":"query"
+                      },
+                      {
+                          "name":"keyspace",
+                          "description":"Name of a keyspace to copy sstables from",
+                          "required":true,
+                          "allowMultiple":false,
+                          "type":"string",
+                          "paramType":"query"
+                      },
+                      {
+                          "name":"snapshot",
+                          "description":"Name of a snapshot to copy sstables from",
+                          "required":false,
+                          "allowMultiple":false,
+                          "type":"string",
+                          "paramType":"query"
+                      }
+                  ]
+              }
+          ]
+      },
+      {
+          "path":"/storage_service/restore",
+          "operations":[
+              {
+                  "method":"POST",
+                  "summary":"Starts copying SSTables from a designated bucket in object storage to a specified keyspace",
+                  "type":"string",
+                  "nickname":"start_restore",
+                  "produces":[
+                      "application/json"
+                  ],
+                  "parameters":[
+                      {
+                          "name":"endpoint",
+                          "description":"ID of the configured object storage endpoint to copy SSTables from",
+                          "required":true,
+                          "allowMultiple":false,
+                          "type":"string",
+                          "paramType":"query"
+                      },
+                      {
+                          "name":"bucket",
+                          "description":"Name of the bucket to read SSTables from",
+                          "required":true,
+                          "allowMultiple":false,
+                          "type":"string",
+                          "paramType":"query"
+                      },
+                      {
+                          "name":"snapshot",
+                          "description":"Name of a snapshot to copy SSTables from",
+                          "required":true,
+                          "allowMultiple":false,
+                          "type":"string",
+                          "paramType":"query"
+                      },
+                      {
+                          "name":"keyspace",
+                          "description":"Name of a keyspace to copy SSTables to",
+                          "required":true,
+                          "allowMultiple":false,
+                          "type":"string",
+                          "paramType":"query"
+                      },
+                      {
+                          "name":"table",
+                          "description":"Name of a table to copy SSTables to",
+                          "required":false,
+                          "allowMultiple":false,
+                          "type":"string",
+                          "paramType":"query"
+                      }
+                  ]
+              }
+          ]
+      },
      {
         "path":"/storage_service/keyspace_compaction/{keyspace}",
         "operations":[
@@ -781,6 +893,14 @@
                     "allowMultiple":false,
                     "type":"boolean",
                     "paramType":"query"
+                  },
+                  {
+                     "name":"consider_only_existing_data",
+                     "description":"Set to \"true\" to flush all memtables and force tombstone garbage collection to check only the sstables being compacted (false by default). The memtable, commitlog and other uncompacted sstables will not be checked during tombstone garbage collection.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
                  }
               ]
            }
@@ -1891,6 +2011,14 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
+                  },
+                  {
+                     "name":"force",
+                     "description":"Enforce the source_dc option, even if it is unsafe to use for rebuild",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
                  }
               ]
            }
--- a/api/api-doc/system.json
+++ b/api/api-doc/system.json
@@ -194,6 +194,21 @@
               "parameters":[]
            }
         ]
+      },
+      {
+         "path":"/system/highest_supported_sstable_version",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get highest supported sstable version",
+               "type":"string",
+               "nickname":"get_highest_supported_sstable_version",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
      }
   ]
 }
--- a/api/api-doc/task_manager.json
+++ b/api/api-doc/task_manager.json
@@ -115,7 +115,7 @@
               "parameters":[
                  {
                     "name":"task_id",
-                     "description":"The uuid of a task to abort",
+                     "description":"The uuid of a task to abort; if the task is not abortable, 403 status code is returned",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -144,6 +144,14 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"path"
+                  },
+                  {
+                     "name":"timeout",
+                     "description":"Timeout for waiting; if it times out, a 408 status code is returned",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"long",
+                     "paramType":"query"
                  }
               ]
            }
@@ -197,11 +205,60 @@
                     "paramType":"query"
                  }
               ]
+            },
+            {
+               "method":"GET",
+               "summary":"Get current ttl value",
+               "type":"long",
+               "nickname":"get_ttl",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/task_manager/drain/{module}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Drain finished local tasks",
+               "type":"void",
+               "nickname":"drain_tasks",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"module",
+                     "description":"The module to drain",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
            }
         ]
      }
   ],
   "models":{
+      "task_identity":{
+         "id": "task_identity",
+         "description":"Id and node of a task",
+         "properties":{
+            "task_id":{
+               "type":"string",
+               "description":"The uuid of a task"
+            },
+            "node":{
+               "type":"string",
+               "description":"Address of a server on which a task is created"
+            }
+         }
+      },
      "task_stats" :{
         "id": "task_stats",
         "description":"A task statistics object",
@@ -224,6 +281,14 @@
               "type":"string",
               "description":"The description of the task"
            },
+            "kind":{
+               "type":"string",
+               "enum":[
+                  "node",
+                  "cluster"
+               ],
+               "description":"The kind of a task"
+            },
            "scope":{
               "type":"string",
               "description":"The scope of the task"
@@ -258,6 +323,14 @@
               "type":"string",
               "description":"The description of the task"
            },
+            "kind":{
+               "type":"string",
+               "enum":[
+                  "node",
+                  "cluster"
+               ],
+               "description":"The kind of a task"
+            },
            "scope":{
               "type":"string",
               "description":"The scope of the task"
@@ -327,9 +400,9 @@
            "children_ids":{
               "type":"array",
               "items":{
-                  "type":"string"
+                  "type":"task_identity"
               },
-               "description":"Task IDs of children of this task"
+               "description":"Task identities of children of this task"
            }
         }
      }
--- a/api/api.cc
+++ b/api/api.cc
@@ -10,6 +10,7 @@
 #include <seastar/http/file_handler.hh>
 #include <seastar/http/transformers.hh>
 #include <seastar/http/api_docs.hh>
+#include "cql_server_test.hh"
 #include "storage_service.hh"
 #include "token_metadata.hh"
 #include "commitlog.hh"
@@ -73,6 +74,8 @@ future<> set_server_init(http_context& ctx) {
        set_error_injection(ctx, r);
        rb->register_function(r, "storage_proxy",
                "The storage proxy API");
+        rb->register_function(r, "storage_service",
+                "The storage service API");
    });
 }

@@ -115,7 +118,7 @@ future<> unset_thrift_controller(http_context& ctx) {
 }

 future<> set_server_storage_service(http_context& ctx, sharded<service::storage_service>& ss, service::raft_group0_client& group0_client) {
-    return register_api(ctx, "storage_service", "The storage service API", [&ss, &group0_client] (http_context& ctx, routes& r) {
+    return ctx.http_server.set_routes([&ctx, &ss, &group0_client] (routes& r) {
            set_storage_service(ctx, r, ss, group0_client);
        });
 }
@@ -256,6 +259,10 @@ future<> set_server_cache(http_context& ctx) {
            "The cache service API", set_cache_service);
 }

+future<> unset_server_cache(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_cache_service(ctx, r); });
+}
+
 future<> set_hinted_handoff(http_context& ctx, sharded<service::storage_proxy>& proxy) {
    return register_api(ctx, "hinted_handoff",
                "The hinted handoff API", [&proxy] (http_context& ctx, routes& r) {
@@ -323,6 +330,16 @@ future<> unset_server_task_manager_test(http_context& ctx) {
    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_task_manager_test(ctx, r); });
 }

+future<> set_server_cql_server_test(http_context& ctx, cql_transport::controller& ctl) {
+    return register_api(ctx, "cql_server_test", "The CQL server test API", [&ctl] (http_context& ctx, routes& r) {
+        set_cql_server_test(ctx, r, ctl);
+    });
+}
+
+future<> unset_server_cql_server_test(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_cql_server_test(ctx, r); });
+}
+
 #endif

 future<> set_server_tasks_compaction_module(http_context& ctx, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& snap_ctl) {
--- a/api/api.hh
+++ b/api/api.hh
@@ -246,7 +246,7 @@ public:
                value = T{boost::lexical_cast<Base>(param)};
            }
        } catch (boost::bad_lexical_cast&) {
-            throw httpd::bad_param_exception(format("{} ({}): type error - should be {}", name, param, boost::units::detail::demangle(typeid(Base).name())));
+            throw httpd::bad_param_exception(fmt::format("{} ({}): type error - should be {}", name, param, boost::units::detail::demangle(typeid(Base).name())));
        }
    }

--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -119,6 +119,7 @@ future<> unset_server_stream_manager(http_context& ctx);
 future<> set_hinted_handoff(http_context& ctx, sharded<service::storage_proxy>& p);
 future<> unset_hinted_handoff(http_context& ctx);
 future<> set_server_cache(http_context& ctx);
+future<> unset_server_cache(http_context& ctx);
 future<> set_server_compaction_manager(http_context& ctx);
 future<> set_server_done(http_context& ctx);
 future<> set_server_task_manager(http_context& ctx, sharded<tasks::task_manager>& tm, lw_shared_ptr<db::config> cfg);
@@ -131,5 +132,7 @@ future<> set_server_raft(http_context&, sharded<service::raft_group_registry>&);
 future<> unset_server_raft(http_context&);
 future<> set_load_meter(http_context& ctx, service::load_meter& lm);
 future<> unset_load_meter(http_context& ctx);
+future<> set_server_cql_server_test(http_context& ctx, cql_transport::controller& ctl);
+future<> unset_server_cql_server_test(http_context& ctx);

 }
--- a/api/cache_service.cc
+++ b/api/cache_service.cc
@@ -320,5 +320,50 @@ void set_cache_service(http_context& ctx, routes& r) {
    });
 }

+void unset_cache_service(http_context& ctx, routes& r) {
+    cs::get_row_cache_save_period_in_seconds.unset(r);
+    cs::set_row_cache_save_period_in_seconds.unset(r);
+    cs::get_key_cache_save_period_in_seconds.unset(r);
+    cs::set_key_cache_save_period_in_seconds.unset(r);
+    cs::get_counter_cache_save_period_in_seconds.unset(r);
+    cs::set_counter_cache_save_period_in_seconds.unset(r);
+    cs::get_row_cache_keys_to_save.unset(r);
+    cs::set_row_cache_keys_to_save.unset(r);
+    cs::get_key_cache_keys_to_save.unset(r);
+    cs::set_key_cache_keys_to_save.unset(r);
+    cs::get_counter_cache_keys_to_save.unset(r);
+    cs::set_counter_cache_keys_to_save.unset(r);
+    cs::invalidate_key_cache.unset(r);
+    cs::invalidate_counter_cache.unset(r);
+    cs::set_row_cache_capacity_in_mb.unset(r);
+    cs::set_key_cache_capacity_in_mb.unset(r);
+    cs::set_counter_cache_capacity_in_mb.unset(r);
+    cs::save_caches.unset(r);
+    cs::get_key_capacity.unset(r);
+    cs::get_key_hits.unset(r);
+    cs::get_key_requests.unset(r);
+    cs::get_key_hit_rate.unset(r);
+    cs::get_key_hits_moving_avrage.unset(r);
+    cs::get_key_requests_moving_avrage.unset(r);
+    cs::get_key_size.unset(r);
+    cs::get_key_entries.unset(r);
+    cs::get_row_capacity.unset(r);
+    cs::get_row_hits.unset(r);
+    cs::get_row_requests.unset(r);
+    cs::get_row_hit_rate.unset(r);
+    cs::get_row_hits_moving_avrage.unset(r);
+    cs::get_row_requests_moving_avrage.unset(r);
+    cs::get_row_size.unset(r);
+    cs::get_row_entries.unset(r);
+    cs::get_counter_capacity.unset(r);
+    cs::get_counter_hits.unset(r);
+    cs::get_counter_requests.unset(r);
+    cs::get_counter_hit_rate.unset(r);
+    cs::get_counter_hits_moving_avrage.unset(r);
+    cs::get_counter_requests_moving_avrage.unset(r);
+    cs::get_counter_size.unset(r);
+    cs::get_counter_entries.unset(r);
+}
+
 }

--- a/api/cache_service.hh
+++ b/api/cache_service.hh
@@ -16,5 +16,6 @@ namespace api {

 struct http_context;
 void set_cache_service(http_context& ctx, seastar::httpd::routes& r);
+void unset_cache_service(http_context& ctx, seastar::httpd::routes& r);

 }
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -15,6 +15,7 @@
 #include <seastar/http/exception.hh>
 #include "sstables/sstables.hh"
 #include "sstables/metadata_collector.hh"
+#include "utils/assert.hh"
 #include "utils/estimated_histogram.hh"
 #include <algorithm>
 #include "db/system_keyspace.hh"
@@ -103,7 +104,7 @@ class autocompaction_toggle_guard {
    replica::database& _db;
 public:
    autocompaction_toggle_guard(replica::database& db) : _db(db) {
-        assert(this_shard_id() == 0);
+        SCYLLA_ASSERT(this_shard_id() == 0);
        if (!_db._enable_autocompaction_toggle) {
            throw std::runtime_error("Autocompaction toggle is busy");
        }
@@ -112,7 +113,7 @@ public:
    autocompaction_toggle_guard(const autocompaction_toggle_guard&) = delete;
    autocompaction_toggle_guard(autocompaction_toggle_guard&&) = default;
    ~autocompaction_toggle_guard() {
-        assert(this_shard_id() == 0);
+        SCYLLA_ASSERT(this_shard_id() == 0);
        _db._enable_autocompaction_toggle = true;
    }
 };
@@ -1125,6 +1126,7 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
        auto params = req_params({
            std::pair("name", mandatory::yes),
            std::pair("flush_memtables", mandatory::no),
+            std::pair("consider_only_existing_data", mandatory::no),
            std::pair("split_output", mandatory::no),
        });
        params.process(*req);
@@ -1133,7 +1135,8 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace
        }
        auto [ks, cf] = parse_fully_qualified_cf_name(*params.get("name"));
        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
-        apilog.info("column_family/force_major_compaction: name={} flush={}", req->get_path_param("name"), flush);
+        auto consider_only_existing_data = params.get_as<bool>("consider_only_existing_data").value_or(false);
+        apilog.info("column_family/force_major_compaction: name={} flush={} consider_only_existing_data={}", req->get_path_param("name"), flush, consider_only_existing_data);

        auto keyspace = validate_keyspace(ctx, ks);
        std::vector<table_info> table_infos = {table_info{
@@ -1143,10 +1146,10 @@ void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace

        auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
        std::optional<flush_mode> fmopt;
-        if (!flush) {
+        if (!flush && !consider_only_existing_data) {
            fmopt = flush_mode::skip;
        }
-        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), ctx.db, std::move(table_infos), fmopt);
+        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), ctx.db, std::move(table_infos), fmopt, consider_only_existing_data);
        co_await task->done();
        co_return json_void();
    });
--- a/api/config.cc
+++ b/api/config.cc
@@ -10,6 +10,7 @@
 #include "api/config.hh"
 #include "api/api-doc/config.json.hh"
 #include "api/api-doc/storage_proxy.json.hh"
+#include "api/api-doc/storage_service.json.hh"
 #include "replica/database.hh"
 #include "db/config.hh"
 #include <sstream>
@@ -19,6 +20,7 @@
 namespace api {
 using namespace seastar::httpd;
 namespace sp = httpd::storage_proxy_json;
+namespace ss = httpd::storage_service_json;

 template<class T>
 json::json_return_type get_json_return_type(const T& val) {
@@ -183,6 +185,14 @@ void set_config(std::shared_ptr < api_registry_builder20 > rb, http_context& ctx
        return make_ready_future<json::json_return_type>(seastar::json::json_void());
    });

+    ss::get_all_data_file_locations.set(r, [&cfg](const_req req) {
+        return container_to_vec(cfg.data_file_directories());
+    });
+
+    ss::get_saved_caches_location.set(r, [&cfg](const_req req) {
+        return cfg.saved_caches_directory();
+    });
+
 }

 void unset_config(http_context& ctx, routes& r) {
@@ -201,6 +211,8 @@ void unset_config(http_context& ctx, routes& r) {
    sp::set_range_rpc_timeout.unset(r);
    sp::get_truncate_rpc_timeout.unset(r);
    sp::set_truncate_rpc_timeout.unset(r);
+    ss::get_all_data_file_locations.unset(r);
+    ss::get_saved_caches_location.unset(r);
 }

 }
--- a/api/cql_server_test.cc
+++ b/api/cql_server_test.cc
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#ifndef SCYLLA_BUILD_MODE_RELEASE
+
+#include <seastar/core/coroutine.hh>
+#include <boost/range/algorithm/transform.hpp>
+
+#include "api/api-doc/cql_server_test.json.hh"
+#include "cql_server_test.hh"
+#include "transport/controller.hh"
+#include "transport/server.hh"
+#include "service/qos/qos_common.hh"
+
+namespace api {
+
+namespace cst = httpd::cql_server_test_json;
+using namespace json;
+using namespace seastar::httpd;
+
+struct connection_sl_params : public json::json_base { // JSON object: one connection's role name, workload type and timeout, all as strings
+    json::json_element<sstring> _role_name;
+    json::json_element<sstring> _workload_type;
+    json::json_element<sstring> _timeout;
+
+    connection_sl_params(const sstring& role_name, const sstring& workload_type, const sstring& timeout) {
+        _role_name = role_name;
+        _workload_type = workload_type;
+        _timeout = timeout;
+        register_params(); // bind the three members to their JSON keys
+    }
+
+    connection_sl_params(const connection_sl_params& params) // copy by re-running the value ctor, so the copy registers its own members
+        : connection_sl_params(params._role_name(), params._workload_type(), params._timeout()) {}
+
+    void register_params() { // add() receives each member's address together with its JSON key
+        add(&_role_name, "role_name");
+        add(&_workload_type, "workload_type");
+        add(&_timeout, "timeout");
+    }
+};
+
+void set_cql_server_test(http_context& ctx, seastar::httpd::routes& r, cql_transport::controller& ctl) { // Registers the connections_params handler on r; ctx is unused here.
+    cst::connections_params.set(r, [&ctl] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        auto sl_params = co_await ctl.get_connections_service_level_params();
+
+        std::vector<connection_sl_params> result;
+        boost::transform(std::move(sl_params), std::back_inserter(result), [] (const cql_transport::connection_service_level_params& params) {
+            auto nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(params.timeout_config.read_timeout).count(); // read timeout in nanoseconds
+            return connection_sl_params(
+                    params.role_name, // params is const&, so this is a copy; a std::move here would be a no-op
+                    sstring(qos::service_level_options::to_string(params.workload_type)),
+                    to_string(cql_duration(months_counter{0}, days_counter{0}, nanoseconds_counter{nanos})));
+        });
+        co_return result;
+    });
+}
+
+void unset_cql_server_test(http_context& ctx, seastar::httpd::routes& r) { // Removes the connections_params handler from r; ctx is not used here.
+    cst::connections_params.unset(r);
+}
+
+}
+
+#endif
--- a/api/cql_server_test.hh
+++ b/api/cql_server_test.hh
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#ifndef SCYLLA_BUILD_MODE_RELEASE
+
+#pragma once
+
+namespace cql_transport {
+class controller;
+}
+
+namespace seastar::httpd {
+class routes;
+}
+
+namespace api {
+struct http_context;
+
+void set_cql_server_test(http_context& ctx, seastar::httpd::routes& r, cql_transport::controller& ctl);
+void unset_cql_server_test(http_context& ctx, seastar::httpd::routes& r);
+
+}
+
+#endif
--- a/api/raft.cc
+++ b/api/raft.cc
@@ -102,8 +102,8 @@ void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_regis

        if (!req->query_parameters.contains("group_id")) {
            // Read barrier on group 0 by default
-            co_await raft_gr.invoke_on(0, [timeout] (service::raft_group_registry& raft_gr) {
-                return raft_gr.group0_with_timeouts().read_barrier(nullptr, timeout);
+            co_await raft_gr.invoke_on(0, [timeout] (service::raft_group_registry& raft_gr) -> future<> {
+                co_await raft_gr.group0_with_timeouts().read_barrier(nullptr, timeout);
            });
            co_return json_void{};
        }
@@ -111,12 +111,12 @@ void set_raft(http_context&, httpd::routes& r, sharded<service::raft_group_regis
        raft::group_id gid{utils::UUID{req->get_query_param("group_id")}};

        std::atomic<bool> found_srv{false};
-        co_await raft_gr.invoke_on_all([gid, timeout, &found_srv] (service::raft_group_registry& raft_gr) {
+        co_await raft_gr.invoke_on_all([gid, timeout, &found_srv] (service::raft_group_registry& raft_gr) -> future<> {
            if (!raft_gr.find_server(gid)) {
-                return make_ready_future<>();
+                co_return;
            }
            found_srv = true;
-            return raft_gr.get_server_with_timeouts(gid).read_barrier(nullptr, timeout);
+            co_await raft_gr.get_server_with_timeouts(gid).read_barrier(nullptr, timeout);
        });

        if (!found_srv) {
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -54,6 +54,7 @@
 #include "locator/abstract_replication_strategy.hh"
 #include "sstables_loader.hh"
 #include "db/view/view_builder.hh"
+#include "utils/user_provided_param.hh"

 using namespace seastar::httpd;
 using namespace std::chrono_literals;
@@ -489,10 +490,27 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
            return make_ready_future<json::json_return_type>(json_void());
        });
    });
+
+    ss::start_restore.set(r, [&sst_loader] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        auto endpoint = req->get_query_param("endpoint");
+        auto keyspace = req->get_query_param("keyspace");
+        auto table = req->get_query_param("table");
+        auto bucket = req->get_query_param("bucket");
+        auto snapshot_name = req->get_query_param("snapshot");
+        if (table.empty()) {
+            // TODO: If missing, should restore all tables
+            throw httpd::bad_param_exception("The table name must be specified");
+        }
+
+        auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, endpoint, bucket, snapshot_name);
+        co_return json::json_return_type(fmt::to_string(task_id));
+    });
+
 }

 void unset_sstables_loader(http_context& ctx, routes& r) {
    ss::load_new_ss_tables.unset(r);
+    ss::start_restore.unset(r);
 }

 void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_builder>& vb) {
@@ -610,14 +628,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        return ss.local().get_schema_version();
    });

-    ss::get_all_data_file_locations.set(r, [&ctx](const_req req) {
-        return container_to_vec(ctx.db.local().get_config().data_file_directories());
-    });
-
-    ss::get_saved_caches_location.set(r, [&ctx](const_req req) {
-        return ctx.db.local().get_config().saved_caches_directory();
-    });
-
    ss::get_range_to_endpoint_map.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto keyspace = validate_keyspace(ctx, req);
        auto table = req->get_query_param("cf");
@@ -706,17 +716,19 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        auto& db = ctx.db;
        auto params = req_params({
            std::pair("flush_memtables", mandatory::no),
+            std::pair("consider_only_existing_data", mandatory::no),
        });
        params.process(*req);
        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
-        apilog.info("force_compaction: flush={}", flush);
+        auto consider_only_existing_data = params.get_as<bool>("consider_only_existing_data").value_or(false);
+        apilog.info("force_compaction: flush={} consider_only_existing_data={}", flush, consider_only_existing_data);

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        std::optional<flush_mode> fmopt;
-        if (!flush) {
+        if (!flush && !consider_only_existing_data) {
            fmopt = flush_mode::skip;
        }
-        auto task = co_await compaction_module.make_and_start_task<global_major_compaction_task_impl>({}, db, fmopt);
+        auto task = co_await compaction_module.make_and_start_task<global_major_compaction_task_impl>({}, db, fmopt, consider_only_existing_data);
        try {
            co_await task->done();
        } catch (...) {
@@ -733,19 +745,21 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
            std::pair("keyspace", mandatory::yes),
            std::pair("cf", mandatory::no),
            std::pair("flush_memtables", mandatory::no),
+            std::pair("consider_only_existing_data", mandatory::no),
        });
        params.process(*req);
        auto keyspace = validate_keyspace(ctx, *params.get("keyspace"));
        auto table_infos = parse_table_infos(keyspace, ctx, params.get("cf").value_or(""));
        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
-        apilog.debug("force_keyspace_compaction: keyspace={} tables={}, flush={}", keyspace, table_infos, flush);
+        auto consider_only_existing_data = params.get_as<bool>("consider_only_existing_data").value_or(false);
+        apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        std::optional<flush_mode> fmopt;
-        if (!flush) {
+        if (!flush && !consider_only_existing_data) {
            fmopt = flush_mode::skip;
        }
-        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt);
+        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
        try {
            co_await task->done();
        } catch (...) {
@@ -884,7 +898,8 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        auto host_id = validate_host_id(req->get_query_param("host_id"));
        std::vector<sstring> ignore_nodes_strs = utils::split_comma_separated_list(req->get_query_param("ignore_nodes"));
        apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
-        auto ignore_nodes = std::list<locator::host_id_or_endpoint>();
+        locator::host_id_or_endpoint_list ignore_nodes;
+        ignore_nodes.reserve(ignore_nodes_strs.size());
        for (const sstring& n : ignore_nodes_strs) {
            try {
                auto hoep = locator::host_id_or_endpoint(n);
@@ -893,7 +908,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
                }
                ignore_nodes.push_back(std::move(hoep));
            } catch (...) {
-                throw std::runtime_error(format("Failed to parse ignore_nodes parameter: ignore_nodes={}, node={}: {}", ignore_nodes_strs, n, std::current_exception()));
+                throw std::runtime_error(fmt::format("Failed to parse ignore_nodes parameter: ignore_nodes={}, node={}: {}", ignore_nodes_strs, n, std::current_exception()));
            }
        }
        return ss.local().removenode(host_id, std::move(ignore_nodes)).then([] {
@@ -1048,7 +1063,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::get_compaction_throughput_mb_per_sec.set(r, [&ctx](std::unique_ptr<http::request> req) {
-        int value = ctx.db.local().get_config().compaction_throughput_mb_per_sec();
+        int value = ctx.db.local().get_compaction_manager().throughput_mbs();
        return make_ready_future<json::json_return_type>(value);
    });

@@ -1096,7 +1111,16 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::rebuild.set(r, [&ss](std::unique_ptr<http::request> req) {
-        auto source_dc = req->get_query_param("source_dc");
+        utils::optional_param source_dc;
+        if (auto source_dc_str = req->get_query_param("source_dc"); !source_dc_str.empty()) {
+            source_dc.emplace(std::move(source_dc_str)).set_user_provided();
+        }
+        if (auto force_str = req->get_query_param("force"); !force_str.empty() && service::loosen_constraints(validate_bool(force_str))) {
+            if (!source_dc) {
+                throw bad_param_exception("The `source_dc` option must be provided for using the `force` option");
+            }
+            source_dc.set_force();
+        }
        apilog.info("rebuild: source_dc={}", source_dc);
        return ss.local().rebuild(std::move(source_dc)).then([] {
            return make_ready_future<json::json_return_type>(json_void());
@@ -1439,12 +1463,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::reload_raft_topology_state.set(r,
            [&ss, &group0_client] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        co_await ss.invoke_on(0, [&group0_client] (service::storage_service& ss) -> future<> {
-            apilog.info("Waiting for group 0 read/apply mutex before reloading Raft topology state...");
-            auto holder = co_await group0_client.hold_read_apply_mutex();
-            apilog.info("Reloading Raft topology state");
-            // Using topology_transition() instead of topology_state_load(), because the former notifies listeners
-            co_await ss.topology_transition();
-            apilog.info("Reloaded Raft topology state");
+            return ss.reload_raft_topology_state(group0_client);
        });
        co_return json_void();
    });
@@ -1559,8 +1578,6 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::get_release_version.unset(r);
    ss::get_scylla_release_version.unset(r);
    ss::get_schema_version.unset(r);
-    ss::get_all_data_file_locations.unset(r);
-    ss::get_saved_caches_location.unset(r);
    ss::get_range_to_endpoint_map.unset(r);
    ss::get_pending_range_to_endpoint_map.unset(r);
    ss::describe_ring.unset(r);
@@ -1776,6 +1793,21 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        co_return json::json_return_type(static_cast<int>(scrub_status::successful));
    });

+    ss::start_backup.set(r, [&snap_ctl] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        auto endpoint = req->get_query_param("endpoint");
+        auto keyspace = req->get_query_param("keyspace");
+        auto bucket = req->get_query_param("bucket");
+        auto snapshot_name = req->get_query_param("snapshot");
+        if (snapshot_name.empty()) {
+            // TODO: If missing, snapshot should be taken by scylla, then removed
+            throw httpd::bad_param_exception("The snapshot name must be specified");
+        }
+
+        auto& ctl = snap_ctl.local();
+        auto task_id = co_await ctl.start_backup(std::move(endpoint), std::move(bucket), std::move(keyspace), std::move(snapshot_name));
+        co_return json::json_return_type(fmt::to_string(task_id));
+    });
+
    cf::get_true_snapshots_size.set(r, [&snap_ctl] (std::unique_ptr<http::request> req) {
        auto [ks, cf] = parse_fully_qualified_cf_name(req->get_path_param("name"));
        return snap_ctl.local().true_snapshots_size(std::move(ks), std::move(cf)).then([] (int64_t res) {
@@ -1797,6 +1829,7 @@ void unset_snapshot(http_context& ctx, routes& r) {
    ss::del_snapshot.unset(r);
    ss::true_snapshots_size.unset(r);
    ss::scrub.unset(r);
+    ss::start_backup.unset(r);
    cf::get_true_snapshots_size.unset(r);
    cf::get_all_true_snapshots_size.unset(r);
 }
--- a/api/system.cc
+++ b/api/system.cc
@@ -10,6 +10,7 @@
 #include "api/api-doc/system.json.hh"
 #include "api/api-doc/metrics.json.hh"
 #include "replica/database.hh"
+#include "sstables/sstables_manager.hh"

 #include <rapidjson/document.h>
 #include <seastar/core/reactor.hh>
@@ -182,6 +183,11 @@ void set_system(http_context& ctx, routes& r) {
        apilog.info("Profile dumped to {}", profile_dest);
        return make_ready_future<json::json_return_type>(json::json_return_type(json::json_void()));
    }) ;
+
+    hs::get_highest_supported_sstable_version.set(r, [&ctx] (const_req req) {
+        auto& table = ctx.db.local().find_column_family("system", "local");
+        return seastar::to_sstring(table.get_sstables_manager().get_highest_supported_format());
+    });
 }

 }
--- a/api/task_manager.cc
+++ b/api/task_manager.cc
@@ -14,6 +14,8 @@
 #include "api/api.hh"
 #include "api/api-doc/task_manager.json.hh"
 #include "db/system_keyspace.hh"
+#include "tasks/task_handler.hh"
+#include "utils/overloaded_functor.hh"

 #include <utility>
 #include <boost/range/adaptors.hpp>
@@ -24,93 +26,57 @@ namespace tm = httpd::task_manager_json;
 using namespace json;
 using namespace seastar::httpd;

-using task_variant = std::variant<tasks::task_manager::foreign_task_ptr, tasks::task_manager::task::task_essentials>;
-
-inline bool filter_tasks(tasks::task_manager::task_ptr task, std::unordered_map<sstring, sstring>& query_params) {
-    return (!query_params.contains("keyspace") || query_params["keyspace"] == task->get_status().keyspace) &&
-        (!query_params.contains("table") || query_params["table"] == task->get_status().table);
-}
-
-struct full_task_status {
-    tasks::task_manager::task::status task_status;
-    std::string type;
-    tasks::task_manager::task::progress progress;
-    tasks::task_id parent_id;
-    tasks::is_abortable abortable;
-    std::vector<std::string> children_ids;
-};
-
-struct task_stats {
-    task_stats(tasks::task_manager::task_ptr task)
-        : task_id(task->id().to_sstring())
-        , state(task->get_status().state)
-        , type(task->type())
-        , scope(task->get_status().scope)
-        , keyspace(task->get_status().keyspace)
-        , table(task->get_status().table)
-        , entity(task->get_status().entity)
-        , sequence_number(task->get_status().sequence_number)
-    { }
-
-    sstring task_id;
-    tasks::task_manager::task_state state;
-    std::string type;
-    std::string scope;
-    std::string keyspace;
-    std::string table;
-    std::string entity;
-    uint64_t sequence_number;
-};
-
-tm::task_status make_status(full_task_status status) {
-    auto start_time = db_clock::to_time_t(status.task_status.start_time);
-    auto end_time = db_clock::to_time_t(status.task_status.end_time);
+tm::task_status make_status(tasks::task_status status) { // Builds the REST tm::task_status from a tasks::task_status.
+    auto start_time = db_clock::to_time_t(status.start_time);
+    auto end_time = db_clock::to_time_t(status.end_time);
    ::tm st, et;
    ::gmtime_r(&end_time, &et);
    ::gmtime_r(&start_time, &st);

+    std::vector<tm::task_identity> tis(status.children.size()); // one slot per child; parentheses select the size constructor directly
+    boost::transform(status.children, tis.begin(), [] (const auto& child) {
+        tm::task_identity ident;
+        ident.task_id = child.task_id.to_sstring();
+        ident.node = fmt::format("{}", child.node);
+        return ident;
+    });
+
    tm::task_status res{};
-    res.id = status.task_status.id.to_sstring();
+    res.id = status.task_id.to_sstring();
    res.type = status.type;
-    res.scope = status.task_status.scope;
-    res.state = status.task_status.state;
-    res.is_abortable = bool(status.abortable);
+    res.kind = status.kind;
+    res.scope = status.scope;
+    res.state = status.state;
+    res.is_abortable = bool(status.is_abortable);
    res.start_time = st;
    res.end_time = et;
-    res.error = status.task_status.error;
-    res.parent_id = status.parent_id.to_sstring();
-    res.sequence_number = status.task_status.sequence_number;
-    res.shard = status.task_status.shard;
-    res.keyspace = status.task_status.keyspace;
-    res.table = status.task_status.table;
-    res.entity = status.task_status.entity;
-    res.progress_units = status.task_status.progress_units;
+    res.error = status.error;
+    res.parent_id = status.parent_id ? status.parent_id.to_sstring() : "none"; // a parent_id that converts to false is reported as "none"
+    res.sequence_number = status.sequence_number;
+    res.shard = status.shard;
+    res.keyspace = status.keyspace;
+    res.table = status.table;
+    res.entity = status.entity;
+    res.progress_units = status.progress_units;
    res.progress_total = status.progress.total;
    res.progress_completed = status.progress.completed;
-    res.children_ids = std::move(status.children_ids);
+    res.children_ids = std::move(tis);
    return res;
 }

-future<full_task_status> retrieve_status(const tasks::task_manager::foreign_task_ptr& task) {
-    if (task.get() == nullptr) {
-        co_return coroutine::return_exception(httpd::bad_param_exception("Task not found"));
-    }
-    auto progress = co_await task->get_progress();
-    full_task_status s;
-    s.task_status = task->get_status();
-    s.type = task->type();
-    s.parent_id = task->get_parent_id();
-    s.abortable = task->is_abortable();
-    s.progress.completed = progress.completed;
-    s.progress.total = progress.total;
-    std::vector<std::string> ct = co_await task->get_children().map_each_task<std::string>([] (const tasks::task_manager::foreign_task_ptr& child) {
-        return child->id().to_sstring();
-    }, [] (const tasks::task_manager::task::task_essentials& child) {
-        return child.task_status.id.to_sstring();
-    });
-    s.children_ids = std::move(ct);
-    co_return s;
-};
+tm::task_stats make_stats(tasks::task_stats stats) { // Builds the REST tm::task_stats from a tasks::task_stats, field by field.
+    tm::task_stats res{};
+    res.task_id = stats.task_id.to_sstring(); // the id is emitted in its string form
+    res.type = stats.type;
+    res.kind = stats.kind;
+    res.scope = stats.scope;
+    res.state = stats.state;
+    res.sequence_number = stats.sequence_number;
+    res.keyspace = stats.keyspace;
+    res.table = stats.table;
+    res.entity = stats.entity;
+    return res;
+}

 void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>& tm, db::config& cfg) {
    tm::get_modules.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
@@ -119,23 +85,28 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
    });

    tm::get_tasks.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        using chunked_stats = utils::chunked_vector<task_stats>;
+        using chunked_stats = utils::chunked_vector<tasks::task_stats>;
        auto internal = tasks::is_internal{req_param<bool>(*req, "internal", false)};
        std::vector<chunked_stats> res = co_await tm.map([&req, internal] (tasks::task_manager& tm) {
-            chunked_stats local_res;
            tasks::task_manager::module_ptr module;
+            std::optional<std::string> keyspace = std::nullopt;
+            std::optional<std::string> table = std::nullopt;
            try {
                module = tm.find_module(req->get_path_param("module"));
            } catch (...) {
                throw bad_param_exception(fmt::format("{}", std::current_exception()));
            }
-            const auto& filtered_tasks = module->get_tasks() | boost::adaptors::filtered([&params = req->query_parameters, internal] (const auto& task) {
-                return (internal || !task.second->is_internal()) && filter_tasks(task.second, params);
-            });
-            for (auto& [task_id, task] : filtered_tasks) {
-                local_res.push_back(task_stats{task});
+
+            if (auto it = req->query_parameters.find("keyspace"); it != req->query_parameters.end()) {
+                keyspace = it->second;
            }
-            return local_res;
+            if (auto it = req->query_parameters.find("table"); it != req->query_parameters.end()) {
+                table = it->second;
+            }
+
+            return module->get_stats(internal, [keyspace = std::move(keyspace), table = std::move(table)] (std::string& ks, std::string& t) {
+                return (!keyspace || keyspace == ks) && (!table || table == t);
+            });
        });

        std::function<future<>(output_stream<char>&&)> f = [r = std::move(res)] (output_stream<char>&& os) -> future<> {
@@ -148,8 +119,7 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
                for (auto& v: res) {
                    for (auto& stats: v) {
                        co_await s.write(std::exchange(delim, ", "));
-                        tm::task_stats ts;
-                        ts = stats;
+                        tm::task_stats ts = make_stats(stats);
                        co_await formatter::write(s, ts);
                    }
                }
@@ -168,121 +138,70 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>

    tm::get_task_status.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
-        tasks::task_manager::foreign_task_ptr task;
+        tasks::task_status status;
        try {
-            task = co_await tasks::task_manager::invoke_on_task(tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
-                if (task->is_complete()) {
-                    task->unregister_task();
-                }
-                co_return std::move(task);
-            }));
+            auto task = tasks::task_handler{tm.local(), id};
+            status = co_await task.get_status();
        } catch (tasks::task_manager::task_not_found& e) {
            throw bad_param_exception(e.what());
        }
-        auto s = co_await retrieve_status(task);
-        co_return make_status(s);
+        co_return make_status(status);
    });

    tm::abort_task.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
        try {
-            co_await tasks::task_manager::invoke_on_task(tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
-                if (!task->is_abortable()) {
-                    co_await coroutine::return_exception(std::runtime_error("Requested task cannot be aborted"));
-                }
-                task->abort();
-            });
+            auto task = tasks::task_handler{tm.local(), id};
+            co_await task.abort();
        } catch (tasks::task_manager::task_not_found& e) {
            throw bad_param_exception(e.what());
+        } catch (tasks::task_not_abortable& e) {
+            throw httpd::base_exception{e.what(), http::reply::status_type::forbidden};
        }
        co_return json_void();
    });

    tm::wait_task.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
-        tasks::task_manager::foreign_task_ptr task;
+        tasks::task_status status;
+        std::optional<std::chrono::seconds> timeout = std::nullopt;
+        if (auto it = req->query_parameters.find("timeout"); it != req->query_parameters.end()) {
+            timeout = std::chrono::seconds(boost::lexical_cast<uint32_t>(it->second));
+        }
        try {
-            task = co_await tasks::task_manager::invoke_on_task(tm, id, std::function([] (tasks::task_manager::task_ptr task) {
-                return task->done().then_wrapped([task] (auto f) {
-                    // done() is called only because we want the task to be complete before getting its status.
-                    // The future should be ignored here as the result does not matter.
-                    f.ignore_ready_future();
-                    return make_foreign(task);
-                });
-            }));
+            auto task = tasks::task_handler{tm.local(), id};
+            status = co_await task.wait_for_task(timeout);
        } catch (tasks::task_manager::task_not_found& e) {
            throw bad_param_exception(e.what());
+        } catch (timed_out_error& e) {
+            throw httpd::base_exception{e.what(), http::reply::status_type::request_timeout};
        }
-        auto s = co_await retrieve_status(task);
-        co_return make_status(s);
+        co_return make_status(status);
    });

    tm::get_task_status_recursively.set(r, [&_tm = tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto& tm = _tm;
        auto id = tasks::task_id{utils::UUID{req->get_path_param("task_id")}};
-        std::queue<task_variant> q;
-        utils::chunked_vector<full_task_status> res;
-
-        tasks::task_manager::foreign_task_ptr task;
        try {
-            // Get requested task.
-            task = co_await tasks::task_manager::invoke_on_task(tm, id, std::function([] (tasks::task_manager::task_ptr task) -> future<tasks::task_manager::foreign_task_ptr> {
-                if (task->is_complete()) {
-                    task->unregister_task();
+            auto task = tasks::task_handler{tm.local(), id};
+            auto res = co_await task.get_status_recursively(true);
+
+            std::function<future<>(output_stream<char>&&)> f = [r = std::move(res)] (output_stream<char>&& os) -> future<> {
+                auto s = std::move(os);
+                auto res = std::move(r);
+                co_await s.write("[");
+                std::string delim = "";
+                for (auto& status: res) {
+                    co_await s.write(std::exchange(delim, ", "));
+                    co_await formatter::write(s, make_status(status));
                }
-                co_return task;
-            }));
+                co_await s.write("]");
+                co_await s.close();
+            };
+            co_return f;
        } catch (tasks::task_manager::task_not_found& e) {
            throw bad_param_exception(e.what());
        }
-
-        // Push children's statuses in BFS order.
-        q.push(co_await task.copy());   // Task cannot be moved since we need it to be alive during whole loop execution.
-        while (!q.empty()) {
-            auto& current = q.front();
-            co_await std::visit(overloaded_functor {
-                [&] (const tasks::task_manager::foreign_task_ptr& task) -> future<> {
-                    res.push_back(co_await retrieve_status(task));
-                    co_await task->get_children().for_each_task([&q] (const tasks::task_manager::foreign_task_ptr& child) -> future<> {
-                        q.push(co_await child.copy());
-                    }, [&] (const tasks::task_manager::task::task_essentials& child) {
-                        q.push(child);
-                        return make_ready_future();
-                    });
-                },
-                [&] (const tasks::task_manager::task::task_essentials& task) -> future<> {
-                    res.push_back(full_task_status{
-                        .task_status = task.task_status,
-                        .type = task.type,
-                        .progress = task.task_progress,
-                        .parent_id = task.parent_id,
-                        .abortable = task.abortable,
-                        .children_ids = boost::copy_range<std::vector<std::string>>(task.failed_children | boost::adaptors::transformed([] (auto& child) {
-                            return child.task_status.id.to_sstring();
-                        }))
-                    });
-                    for (auto& child: task.failed_children) {
-                        q.push(child);
-                    }
-                    return make_ready_future();
-                }
-            }, current);
-            q.pop();
-        }
-
-        std::function<future<>(output_stream<char>&&)> f = [r = std::move(res)] (output_stream<char>&& os) -> future<> {
-            auto s = std::move(os);
-            auto res = std::move(r);
-            co_await s.write("[");
-            std::string delim = "";
-            for (auto& status: res) {
-                co_await s.write(std::exchange(delim, ", "));
-                co_await formatter::write(s, make_status(status));
-            }
-            co_await s.write("]");
-            co_await s.close();
-        };
-        co_return f;
    });

    tm::get_and_update_ttl.set(r, [&cfg] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
@@ -294,6 +213,37 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
        }
        co_return json::json_return_type(ttl);
    });
+
+    tm::get_ttl.set(r, [&cfg] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        uint32_t ttl = cfg.task_ttl_seconds();
+        co_return json::json_return_type(ttl);
+    });
+
+    tm::drain_tasks.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        co_await tm.invoke_on_all([&req] (tasks::task_manager& tm) -> future<> {
+            tasks::task_manager::module_ptr module;
+            try {
+                module = tm.find_module(req->get_path_param("module"));
+            } catch (...) {
+                throw bad_param_exception(fmt::format("{}", std::current_exception()));
+            }
+
+            const auto& local_tasks = module->get_local_tasks();
+            std::vector<tasks::task_id> ids;
+            ids.reserve(local_tasks.size());
+            std::transform(begin(local_tasks), end(local_tasks), std::back_inserter(ids), [] (const auto& task) {
+                return task.second->is_complete() ? task.first : tasks::task_id::create_null_id();
+            });
+
+            for (auto&& id : ids) {
+                if (id) {
+                    module->unregister_task(id);
+                }
+                co_await maybe_yield();
+            }
+        });
+        co_return json_void();
+    });
 }

 void unset_task_manager(http_context& ctx, routes& r) {
@@ -304,6 +254,8 @@ void unset_task_manager(http_context& ctx, routes& r) {
    tm::wait_task.unset(r);
    tm::get_task_status_recursively.unset(r);
    tm::get_and_update_ttl.unset(r);
+    tm::get_ttl.unset(r);
+    tm::drain_tasks.unset(r);
 }

 }
--- a/api/task_manager_test.cc
+++ b/api/task_manager_test.cc
@@ -13,6 +13,7 @@
 #include "task_manager_test.hh"
 #include "api/api-doc/task_manager_test.json.hh"
 #include "tasks/test_module.hh"
+#include "utils/overloaded_functor.hh"

 namespace api {

@@ -61,8 +62,8 @@ void set_task_manager_test(http_context& ctx, routes& r, sharded<tasks::task_man
        auto module = tms.local().find_module("test");
        id = co_await module->make_task<tasks::test_task_impl>(shard, id, keyspace, table, entity, data);
        co_await tms.invoke_on(shard, [id] (tasks::task_manager& tm) {
-            auto it = tm.get_all_tasks().find(id);
-            if (it != tm.get_all_tasks().end()) {
+            auto it = tm.get_local_tasks().find(id);
+            if (it != tm.get_local_tasks().end()) {
                it->second->start();
            }
        });
@@ -72,9 +73,16 @@ void set_task_manager_test(http_context& ctx, routes& r, sharded<tasks::task_man
    tmt::unregister_test_task.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
        auto id = tasks::task_id{utils::UUID{req->query_parameters["task_id"]}};
        try {
-            co_await tasks::task_manager::invoke_on_task(tm, id, [] (tasks::task_manager::task_ptr task) -> future<> {
-                tasks::test_task test_task{task};
-                co_await test_task.unregister_task();
+            co_await tasks::task_manager::invoke_on_task(tm, id, [] (tasks::task_manager::task_variant task_v) -> future<> {
+                return std::visit(overloaded_functor{
+                    [] (tasks::task_manager::task_ptr task) -> future<> {
+                        tasks::test_task test_task{task};
+                        co_await test_task.unregister_task();
+                    },
+                    [] (tasks::task_manager::virtual_task_ptr task) {
+                        return make_ready_future();
+                    }
+                }, task_v);
            });
        } catch (tasks::task_manager::task_not_found& e) {
            throw bad_param_exception(e.what());
@@ -89,13 +97,20 @@ void set_task_manager_test(http_context& ctx, routes& r, sharded<tasks::task_man
        std::string error = fail ? it->second : "";

        try {
-            co_await tasks::task_manager::invoke_on_task(tm, id, [fail, error = std::move(error)] (tasks::task_manager::task_ptr task) -> future<> {
-                tasks::test_task test_task{task};
-                if (fail) {
-                    co_await test_task.finish_failed(std::make_exception_ptr(std::runtime_error(error)));
-                } else {
-                    co_await test_task.finish();
-                }
+            co_await tasks::task_manager::invoke_on_task(tm, id, [fail, error = std::move(error)] (tasks::task_manager::task_variant task_v) -> future<> {
+                return std::visit(overloaded_functor{
+                    [fail, error = std::move(error)] (tasks::task_manager::task_ptr task) -> future<> {
+                        tasks::test_task test_task{task};
+                        if (fail) {
+                            co_await test_task.finish_failed(std::make_exception_ptr(std::runtime_error(error)));
+                        } else {
+                            co_await test_task.finish();
+                        }
+                    },
+                    [] (tasks::task_manager::virtual_task_ptr task) {
+                        return make_ready_future();
+                    }
+                }, task_v);
            });
        } catch (tasks::task_manager::task_not_found& e) {
            throw bad_param_exception(e.what());
--- a/api/token_metadata.cc
+++ b/api/token_metadata.cc
@@ -21,6 +21,9 @@ using namespace json;
 void set_token_metadata(http_context& ctx, routes& r, sharded<locator::shared_token_metadata>& tm) {
    ss::local_hostid.set(r, [&tm](std::unique_ptr<http::request> req) {
        auto id = tm.local().get()->get_my_id();
+        if (!bool(id)) {
+            throw not_found_exception("local host ID is not yet set");
+        }
        return make_ready_future<json::json_return_type>(id.to_sstring());
    });

@@ -68,7 +71,7 @@ void set_token_metadata(http_context& ctx, routes& r, sharded<locator::shared_to

    ss::get_host_id_map.set(r, [&tm](const_req req) {
        std::vector<ss::mapper> res;
-        return map_to_key_value(tm.local().get()->get_endpoint_to_host_id_map_for_reading(), res);
+        return map_to_key_value(tm.local().get()->get_endpoint_to_host_id_map(), res);
    });

    static auto host_or_broadcast = [&tm](const_req req) {
--- a/auth/authentication_options.hh
+++ b/auth/authentication_options.hh
@@ -25,6 +25,25 @@ enum class authentication_option {
    options
 };

+}
+
+template <>
+struct fmt::formatter<auth::authentication_option> : fmt::formatter<string_view> {
+    template <typename FormatContext>
+    auto format(const auth::authentication_option a, FormatContext& ctx) const {
+        using enum auth::authentication_option;
+        switch (a) {
+        case password:
+            return formatter<string_view>::format("PASSWORD", ctx);
+        case options:
+            return formatter<string_view>::format("OPTIONS", ctx);
+        }
+        std::abort();
+    }
+};
+
+namespace auth {
+
 using authentication_option_set = std::unordered_set<authentication_option>;

 using custom_options = std::unordered_map<sstring, sstring>;
@@ -46,18 +65,3 @@ public:
 };

 }
-
-template <>
-struct fmt::formatter<auth::authentication_option> : fmt::formatter<string_view> {
-    template <typename FormatContext>
-    auto format(const auth::authentication_option a, FormatContext& ctx) const {
-        using enum auth::authentication_option;
-        switch (a) {
-        case password:
-            return formatter<string_view>::format("PASSWORD", ctx);
-        case options:
-            return formatter<string_view>::format("OPTIONS", ctx);
-        }
-        std::abort();
-    }
-};
--- a/auth/certificate_authenticator.cc
+++ b/auth/certificate_authenticator.cc
@@ -76,7 +76,7 @@ auth::certificate_authenticator::certificate_authenticator(cql3::query_processor
                    continue;
                } catch (std::out_of_range&) {
                    // just fallthrough
-                } catch (std::regex_error&) {
+                } catch (boost::regex_error&) {
                    std::throw_with_nested(std::invalid_argument(fmt::format("Invalid query expression: {}", map.at(cfg_query_attr))));
                }
            }
@@ -149,7 +149,7 @@ future<std::optional<auth::authenticated_user>> auth::certificate_authenticator:
            co_return username;
        }
    }
-    throw exceptions::authentication_exception(format("Subject '{}'/'{}' does not match any query expression", subject, altname));
+    throw exceptions::authentication_exception(seastar::format("Subject '{}'/'{}' does not match any query expression", subject, altname));
 }


--- a/auth/common.cc
+++ b/auth/common.cc
@@ -16,6 +16,7 @@
 #include "mutation/canonical_mutation.hh"
 #include "schema/schema_fwd.hh"
 #include "timestamp.hh"
+#include "utils/assert.hh"
 #include "utils/exponential_backoff_retry.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/statements/create_table_statement.hh"
@@ -68,10 +69,10 @@ static future<> create_legacy_metadata_table_if_missing_impl(
        cql3::query_processor& qp,
        std::string_view cql,
        ::service::migration_manager& mm) {
-    assert(this_shard_id() == 0); // once_among_shards makes sure a function is executed on shard 0 only
+    SCYLLA_ASSERT(this_shard_id() == 0); // once_among_shards makes sure a function is executed on shard 0 only

    auto db = qp.db();
-    auto parsed_statement = cql3::query_processor::parse_statement(cql);
+    auto parsed_statement = cql3::query_processor::parse_statement(cql, cql3::dialect{});
    auto& parsed_cf_statement = static_cast<cql3::statements::raw::cf_statement&>(*parsed_statement);

    parsed_cf_statement.prepare_keyspace(meta::legacy::AUTH_KS);
@@ -121,7 +122,7 @@ static future<> announce_mutations_with_guard(
        ::service::raft_group0_client& group0_client,
        std::vector<canonical_mutation> muts,
        ::service::group0_guard group0_guard,
-        seastar::abort_source* as,
+        seastar::abort_source& as,
        std::optional<::service::raft_timeout> timeout) {
    auto group0_cmd = group0_client.prepare_command(
        ::service::write_mutations{
@@ -137,7 +138,7 @@ future<> announce_mutations_with_batching(
        ::service::raft_group0_client& group0_client,
        start_operation_func_t start_operation_func,
        std::function<::service::mutations_generator(api::timestamp_type t)> gen,
-        seastar::abort_source* as,
+        seastar::abort_source& as,
        std::optional<::service::raft_timeout> timeout) {
    // account for command's overhead, it's better to use smaller threshold than constantly bounce off the limit
    size_t memory_threshold = group0_client.max_command_size() * 0.75;
@@ -188,7 +189,7 @@ future<> announce_mutations(
        ::service::raft_group0_client& group0_client,
        const sstring query_string,
        std::vector<data_value_or_unset> values,
-        seastar::abort_source* as,
+        seastar::abort_source& as,
        std::optional<::service::raft_timeout> timeout) {
    auto group0_guard = co_await group0_client.start_operation(as, timeout);
    auto timestamp = group0_guard.write_timestamp();
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -80,7 +80,7 @@ future<> create_legacy_metadata_table_if_missing(
 // Execute update query via group0 mechanism, mutations will be applied on all nodes.
 // Use this function when need to perform read before write on a single guard or if
 // you have more than one mutation and potentially exceed single command size limit.
-using start_operation_func_t = std::function<future<::service::group0_guard>(abort_source*)>;
+using start_operation_func_t = std::function<future<::service::group0_guard>(abort_source&)>;
 future<> announce_mutations_with_batching(
        ::service::raft_group0_client& group0_client,
        // since we can operate also in topology coordinator context where we need stronger
@@ -88,7 +88,7 @@ future<> announce_mutations_with_batching(
        // function here
        start_operation_func_t start_operation_func,
        std::function<::service::mutations_generator(api::timestamp_type t)> gen,
-        seastar::abort_source* as,
+        seastar::abort_source& as,
        std::optional<::service::raft_timeout> timeout);

 // Execute update query via group0 mechanism, mutations will be applied on all nodes.
@@ -97,7 +97,7 @@ future<> announce_mutations(
        ::service::raft_group0_client& group0_client,
        const sstring query_string,
        std::vector<data_value_or_unset> values,
-        seastar::abort_source* as,
+        seastar::abort_source& as,
        std::optional<::service::raft_timeout> timeout);

 // Appends mutations to a collector, they will be applied later on all nodes via group0 mechanism.
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -67,7 +67,7 @@ bool default_authorizer::legacy_metadata_exists() const {
 }

 future<bool> default_authorizer::legacy_any_granted() const {
-    static const sstring query = format("SELECT * FROM {}.{} LIMIT 1", meta::legacy::AUTH_KS, PERMISSIONS_CF);
+    static const sstring query = seastar::format("SELECT * FROM {}.{} LIMIT 1", meta::legacy::AUTH_KS, PERMISSIONS_CF);

    return _qp.execute_internal(
            query,
@@ -80,7 +80,7 @@ future<bool> default_authorizer::legacy_any_granted() const {

 future<> default_authorizer::migrate_legacy_metadata() {
    alogger.info("Starting migration of legacy permissions metadata.");
-    static const sstring query = format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);
+    static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);

    return _qp.execute_internal(
            query,
@@ -163,7 +163,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
        co_return permissions::NONE;
    }

-    const sstring query = format("SELECT {} FROM {}.{} WHERE {} = ? AND {} = ?",
+    const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? AND {} = ?",
            PERMISSIONS_NAME,
            get_auth_ks_name(_qp),
            PERMISSIONS_CF,
@@ -188,7 +188,7 @@ default_authorizer::modify(
        const resource& resource,
        std::string_view op,
        ::service::group0_batch& mc) {
-    const sstring query = format("UPDATE {}.{} SET {} = {} {} ? WHERE {} = ? AND {} = ?",
+    const sstring query = seastar::format("UPDATE {}.{} SET {} = {} {} ? WHERE {} = ? AND {} = ?",
            get_auth_ks_name(_qp),
            PERMISSIONS_CF,
            PERMISSIONS_NAME,
@@ -218,7 +218,7 @@ future<> default_authorizer::revoke(std::string_view role_name, permission_set s
 }

 future<std::vector<permission_details>> default_authorizer::list_all() const {
-    const sstring query = format("SELECT {}, {}, {} FROM {}.{}",
+    const sstring query = seastar::format("SELECT {}, {}, {} FROM {}.{}",
            ROLE_NAME,
            RESOURCE_NAME,
            PERMISSIONS_NAME,
@@ -246,7 +246,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {

 future<> default_authorizer::revoke_all(std::string_view role_name, ::service::group0_batch& mc) {
    try {
-        const sstring query = format("DELETE FROM {}.{} WHERE {} = ?",
+        const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ?",
                get_auth_ks_name(_qp),
                PERMISSIONS_CF,
                ROLE_NAME);
@@ -266,7 +266,7 @@ future<> default_authorizer::revoke_all(std::string_view role_name, ::service::g
 }

 future<> default_authorizer::revoke_all_legacy(const resource& resource) {
-    static const sstring query = format("SELECT {} FROM {}.{} WHERE {} = ? ALLOW FILTERING",
+    static const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? ALLOW FILTERING",
            ROLE_NAME,
            get_auth_ks_name(_qp),
            PERMISSIONS_CF,
@@ -283,7 +283,7 @@ future<> default_authorizer::revoke_all_legacy(const resource& resource) {
                    res->begin(),
                    res->end(),
                    [this, res, resource](const cql3::untyped_result_set::row& r) {
-                static const sstring query = format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
+                static const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
                        get_auth_ks_name(_qp),
                        PERMISSIONS_CF,
                        ROLE_NAME,
@@ -323,7 +323,7 @@ future<> default_authorizer::revoke_all(const resource& resource, ::service::gro

    auto name = resource.name();
    auto gen = [this, name] (api::timestamp_type t) -> ::service::mutations_generator {
-        const sstring query = format("SELECT {} FROM {}.{} WHERE {} = ? ALLOW FILTERING",
+        const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ? ALLOW FILTERING",
                ROLE_NAME,
                get_auth_ks_name(_qp),
                PERMISSIONS_CF,
@@ -334,7 +334,7 @@ future<> default_authorizer::revoke_all(const resource& resource, ::service::gro
                {name},
                cql3::query_processor::cache_internal::no);
        for (const auto& r : *res) {
-            const sstring query = format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
+            const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
                    get_auth_ks_name(_qp),
                    PERMISSIONS_CF,
                    ROLE_NAME,
@@ -346,7 +346,7 @@ future<> default_authorizer::revoke_all(const resource& resource, ::service::gro
                    {r.get_as<sstring>(ROLE_NAME), name});
            if (muts.size() != 1) {
                on_internal_error(alogger,
-                    format("expecting single delete mutation, got {}", muts.size()));
+                    seastar::format("expecting single delete mutation, got {}", muts.size()));
            }
            co_yield std::move(muts[0]);
        }
@@ -357,7 +357,7 @@ future<> default_authorizer::revoke_all(const resource& resource, ::service::gro
 void default_authorizer::revoke_all_keyspace_resources(const resource& ks_resource, ::service::group0_batch& mc) {
    auto ks_name = ks_resource.name();
    auto gen = [this, ks_name] (api::timestamp_type t) -> ::service::mutations_generator {
-        const sstring query = format("SELECT {}, {} FROM {}.{}",
+        const sstring query = seastar::format("SELECT {}, {} FROM {}.{}",
                ROLE_NAME,
                RESOURCE_NAME,
                get_auth_ks_name(_qp),
@@ -374,7 +374,7 @@ void default_authorizer::revoke_all_keyspace_resources(const resource& ks_resour
                // r doesn't represent resource related to ks_resource
                continue;
            }
-            const sstring query = format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
+            const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ? AND {} = ?",
                    get_auth_ks_name(_qp),
                    PERMISSIONS_CF,
                    ROLE_NAME,
--- a/auth/maintenance_socket_role_manager.cc
+++ b/auth/maintenance_socket_role_manager.cc
@@ -43,10 +43,14 @@ future<> maintenance_socket_role_manager::stop() {
    return make_ready_future<>();
 }

+future<> maintenance_socket_role_manager::ensure_superuser_is_created() {
+    return make_ready_future<>();
+}
+
 template<typename T = void>
 future<T> operation_not_supported_exception(std::string_view operation) {
    return make_exception_future<T>(
-        std::runtime_error(format("role manager: {} operation not supported through maintenance socket", operation)));
+        std::runtime_error(fmt::format("role manager: {} operation not supported through maintenance socket", operation)));
 }

 future<> maintenance_socket_role_manager::create(std::string_view role_name, const role_config&, ::service::group0_batch&) {
@@ -73,6 +77,10 @@ future<role_set> maintenance_socket_role_manager::query_granted(std::string_view
    return operation_not_supported_exception<role_set>("QUERY GRANTED");
 }

+future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted() {
+    return operation_not_supported_exception<role_to_directly_granted_map>("QUERY ALL DIRECTLY GRANTED");
+}
+
 future<role_set> maintenance_socket_role_manager::query_all() {
    return operation_not_supported_exception<role_set>("QUERY ALL");
 }
--- a/auth/maintenance_socket_role_manager.hh
+++ b/auth/maintenance_socket_role_manager.hh
@@ -39,6 +39,8 @@ public:

    virtual future<> stop() override;

+    virtual future<> ensure_superuser_is_created() override;
+
    virtual future<> create(std::string_view role_name, const role_config&, ::service::group0_batch&) override;

    virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override;
@@ -51,6 +53,8 @@ public:

    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;

+    virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
+
    virtual future<role_set> query_all() override;

    virtual future<bool> exists(std::string_view role_name) override;
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -75,7 +75,7 @@ static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
 }

 sstring password_authenticator::update_row_query() const {
-    return format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
+    return seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
            get_auth_ks_name(_qp),
            meta::roles_table::name,
            SALTED_HASH,
@@ -90,7 +90,7 @@ bool password_authenticator::legacy_metadata_exists() const {

 future<> password_authenticator::migrate_legacy_metadata() const {
    plogger.info("Starting migration of legacy authentication metadata.");
-    static const sstring query = format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);
+    static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);

    return _qp.execute_internal(
            query,
@@ -136,7 +136,7 @@ future<> password_authenticator::create_default_if_missing() {
        plogger.info("Created default superuser authentication record.");
    } else {
        co_await announce_mutations(_qp, _group0_client, query,
-            {salted_pwd, _superuser}, &_as, ::service::raft_timeout{});
+            {salted_pwd, _superuser}, _as, ::service::raft_timeout{});
        plogger.info("Created default superuser authentication record.");
    }
 }
@@ -223,7 +223,7 @@ future<authenticated_user> password_authenticator::authenticate(
    // obsolete prepared statements pretty quickly.
    // Rely on query processing caching statements instead, and lets assume
    // that a map lookup string->statement is not gonna kill us much.
-    const sstring query = format("SELECT {} FROM {}.{} WHERE {} = ?",
+    const sstring query = seastar::format("SELECT {} FROM {}.{} WHERE {} = ?",
                SALTED_HASH,
                get_auth_ks_name(_qp),
                meta::roles_table::name,
@@ -280,7 +280,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
        co_return;
    }

-    const sstring query = format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
+    const sstring query = seastar::format("UPDATE {}.{} SET {} = ? WHERE {} = ?",
            get_auth_ks_name(_qp),
            meta::roles_table::name,
            SALTED_HASH,
@@ -299,7 +299,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
 }

 future<> password_authenticator::drop(std::string_view name, ::service::group0_batch& mc) {
-    const sstring query = format("DELETE {} FROM {}.{} WHERE {} = ?",
+    const sstring query = seastar::format("DELETE {} FROM {}.{} WHERE {} = ?",
            SALTED_HASH,
            get_auth_ks_name(_qp),
            meta::roles_table::name,
--- a/auth/resource.cc
+++ b/auth/resource.cc
@@ -193,7 +193,7 @@ service_level_resource_view::service_level_resource_view(const resource &r) {
 }

 sstring encode_signature(std::string_view name, std::vector<data_type> args) {
-    return format("{}[{}]", name,
+    return seastar::format("{}[{}]", name,
            fmt::join(args | boost::adaptors::transformed([] (const data_type t) {
                return t->name();
            }), "^"));
@@ -222,7 +222,7 @@ std::pair<sstring, std::vector<data_type>> decode_signature(std::string_view enc
 // to the short form (int)
 static sstring decoded_signature_string(std::string_view encoded_signature) {
    auto [function_name, arg_types] = decode_signature(encoded_signature);
-    return format("{}({})", cql3::util::maybe_quote(sstring(function_name)),
+    return seastar::format("{}({})", cql3::util::maybe_quote(sstring(function_name)),
            boost::algorithm::join(arg_types | boost::adaptors::transformed([] (data_type t) {
                return t->cql3_type_name();
            }), ", "));
--- a/auth/resource.hh
+++ b/auth/resource.hh
@@ -18,7 +18,6 @@
 #include <unordered_set>

 #include <fmt/core.h>
-#include <seastar/core/print.hh>
 #include <seastar/core/sstring.hh>

 #include "auth/permission.hh"
@@ -33,7 +32,7 @@ namespace auth {
 class invalid_resource_name : public std::invalid_argument {
 public:
    explicit invalid_resource_name(std::string_view name)
-            : std::invalid_argument(format("The resource name '{}' is invalid.", name)) {
+            : std::invalid_argument(fmt::format("The resource name '{}' is invalid.", name)) {
    }
 };

@@ -149,7 +148,7 @@ class resource_kind_mismatch : public std::invalid_argument {
 public:
    explicit resource_kind_mismatch(resource_kind expected, resource_kind actual)
        : std::invalid_argument(
-            format("This resource has kind '{}', but was expected to have kind '{}'.", actual, expected)) {
+            fmt::format("This resource has kind '{}', but was expected to have kind '{}'.", actual, expected)) {
    }
 };

--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -48,14 +48,14 @@ public:
 class role_already_exists : public roles_argument_exception {
 public:
    explicit role_already_exists(std::string_view role_name)
-            : roles_argument_exception(format("Role {} already exists.", role_name)) {
+            : roles_argument_exception(seastar::format("Role {} already exists.", role_name)) {
    }
 };

 class nonexistant_role : public roles_argument_exception {
 public:
    explicit nonexistant_role(std::string_view role_name)
-            : roles_argument_exception(format("Role {} doesn't exist.", role_name)) {
+            : roles_argument_exception(seastar::format("Role {} doesn't exist.", role_name)) {
    }
 };

@@ -63,7 +63,7 @@ class role_already_included : public roles_argument_exception {
 public:
    role_already_included(std::string_view grantee_name, std::string_view role_name)
            : roles_argument_exception(
-                      format("{} already includes role {}.", grantee_name, role_name)) {
+                      seastar::format("{} already includes role {}.", grantee_name, role_name)) {
    }
 };

@@ -71,11 +71,12 @@ class revoke_ungranted_role : public roles_argument_exception {
 public:
    revoke_ungranted_role(std::string_view revokee_name, std::string_view role_name)
            : roles_argument_exception(
-                      format("{} was not granted role {}, so it cannot be revoked.", revokee_name, role_name)) {
+                      seastar::format("{} was not granted role {}, so it cannot be revoked.", revokee_name, role_name)) {
    }
 };

 using role_set = std::unordered_set<sstring>;
+using role_to_directly_granted_map = std::multimap<sstring, sstring>;

 enum class recursive_role_query { yes, no };

@@ -105,6 +106,13 @@ public:

    virtual future<> stop() = 0;

+    ///
+    /// Ensure that the superuser role exists.
+    ///
+    /// \returns a future that resolves once the superuser role has been ensured to exist.
+    ///
+    virtual future<> ensure_superuser_is_created() = 0;
+
    ///
    /// \returns an exceptional future with \ref role_already_exists for a role that has previously been created.
    ///
@@ -144,6 +152,22 @@ public:
    ///
    virtual future<role_set> query_granted(std::string_view grantee, recursive_role_query) = 0;

+    /// \returns a multimap holding one (grantee, directly granted role) entry for every direct role grant
+    ///
+    /// Example:
+    /// GRANT role2 TO role1
+    /// GRANT role3 TO role1
+    /// GRANT role3 TO role2
+    ///
+    /// Will return map:
+    /// {
+    ///   (role1, role2),
+    ///   (role1, role3),
+    ///   (role2, role3)
+    /// }
+    ///
+    virtual future<role_to_directly_granted_map> query_all_directly_granted() = 0;
+
    virtual future<role_set> query_all() = 0;

    virtual future<bool> exists(std::string_view role_name) = 0;
--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -47,7 +47,7 @@ future<bool> default_role_row_satisfies(
        cql3::query_processor& qp,
        std::function<bool(const cql3::untyped_result_set_row&)> p,
        std::optional<std::string> rolename) {
-    const sstring query = format("SELECT * FROM {}.{} WHERE {} = ?",
+    const sstring query = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
            get_auth_ks_name(qp),
            meta::roles_table::name,
            meta::roles_table::role_col_name);
@@ -69,7 +69,7 @@ future<bool> any_nondefault_role_row_satisfies(
        cql3::query_processor& qp,
        std::function<bool(const cql3::untyped_result_set_row&)> p,
        std::optional<std::string> rolename) {
-    const sstring query = format("SELECT * FROM {}.{}", get_auth_ks_name(qp), meta::roles_table::name);
+    const sstring query = seastar::format("SELECT * FROM {}.{}", get_auth_ks_name(qp), meta::roles_table::name);

    auto results = co_await qp.execute_internal(query, db::consistency_level::QUORUM
        , internal_distributed_query_state(), cql3::query_processor::cache_internal::no
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -36,6 +36,7 @@
 #include "service/migration_manager.hh"
 #include "service/raft/raft_group0_client.hh"
 #include "timestamp.hh"
+#include "utils/assert.hh"
 #include "utils/class_registrator.hh"
 #include "locator/abstract_replication_strategy.hh"
 #include "data_dictionary/keyspace_metadata.hh"
@@ -77,7 +78,7 @@ private:
    void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
    void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
    void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}
-    void on_update_tablet_metadata() override {}
+    void on_update_tablet_metadata(const locator::tablet_metadata_change_hint&) override {}

    void on_drop_keyspace(const sstring& ks_name) override {
        if (!legacy_mode(_qp)) {
@@ -194,7 +195,7 @@ service::service(
 }

 future<> service::create_legacy_keyspace_if_missing(::service::migration_manager& mm) const {
-    assert(this_shard_id() == 0); // once_among_shards makes sure a function is executed on shard 0 only
+    SCYLLA_ASSERT(this_shard_id() == 0); // once_among_shards makes sure a function is executed on shard 0 only
    auto db = _qp.db();

    while (!db.has_keyspace(meta::legacy::AUTH_KS)) {
@@ -212,7 +213,7 @@ future<> service::create_legacy_keyspace_if_missing(::service::migration_manager

            try {
                co_return co_await mm.announce(::service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts),
-                        std::move(group0_guard), format("auth_service: create {} keyspace", meta::legacy::AUTH_KS));
+                        std::move(group0_guard), seastar::format("auth_service: create {} keyspace", meta::legacy::AUTH_KS));
            } catch (::service::group0_concurrent_modification&) {
                log.info("Concurrent operation is detected while creating {} keyspace, retrying.", meta::legacy::AUTH_KS);
            }
@@ -256,6 +257,10 @@ future<> service::stop() {
    });
 }

+future<> service::ensure_superuser_is_created() {
+    return _role_manager->ensure_superuser_is_created();
+}
+
 void service::update_cache_config() {
    auto db = _qp.db();

@@ -632,7 +637,7 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
            ::service::query_state qs(cs, empty_service_permit());

            auto rows = co_await qp.execute_internal(
-                    format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, cf_name),
+                    seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, cf_name),
                    db::consistency_level::ALL,
                    qs,
                    {},
@@ -681,7 +686,7 @@ future<> migrate_to_auth_v2(db::system_keyspace& sys_ks, ::service::raft_group0_
    co_await announce_mutations_with_batching(g0,
            start_operation_func,
            std::move(gen),
-            &as,
+            as,
            std::nullopt);
 }

--- a/auth/service.hh
+++ b/auth/service.hh
@@ -131,6 +131,8 @@ public:

    future<> stop();

+    future<> ensure_superuser_is_created();
+
    void update_cache_config();

    void reset_authorization_cache();
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -21,12 +21,15 @@
 #include <seastar/core/thread.hh>

 #include "auth/common.hh"
+#include "auth/role_manager.hh"
 #include "auth/roles-metadata.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/untyped_result_set.hh"
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
 #include "log.hh"
+#include "seastar/core/loop.hh"
+#include "seastar/coroutine/maybe_yield.hh"
 #include "service/raft/raft_group0_client.hh"
 #include "utils/class_registrator.hh"
 #include "service/migration_manager.hh"
@@ -46,7 +49,7 @@ namespace role_attributes_table {
 constexpr std::string_view name{"role_attributes", 15};

 static std::string_view creation_query() noexcept {
-    static const sstring instance = format(
+    static const sstring instance = seastar::format(
            "CREATE TABLE {}.{} ("
            "  role text,"
            "  name text,"
@@ -86,7 +89,7 @@ static db::consistency_level consistency_for_role(std::string_view role_name) no
 }

 static future<std::optional<record>> find_record(cql3::query_processor& qp, std::string_view role_name) {
-    const sstring query = format("SELECT * FROM {}.{} WHERE {} = ?",
+    const sstring query = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
            get_auth_ks_name(qp),
            meta::roles_table::name,
            meta::roles_table::role_col_name);
@@ -180,7 +183,7 @@ future<> standard_role_manager::create_default_role_if_missing() {
        if (exists) {
            co_return;
        }
-        const sstring query = format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
+        const sstring query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
                get_auth_ks_name(_qp),
                meta::roles_table::name,
                meta::roles_table::role_col_name);
@@ -192,7 +195,7 @@ future<> standard_role_manager::create_default_role_if_missing() {
                    {_superuser},
                    cql3::query_processor::cache_internal::no).discard_result();
        } else {
-            co_await announce_mutations(_qp, _group0_client, query, {_superuser}, &_as, ::service::raft_timeout{});
+            co_await announce_mutations(_qp, _group0_client, query, {_superuser}, _as, ::service::raft_timeout{});
        }
        log.info("Created default superuser role '{}'.", _superuser);
    } catch(const exceptions::unavailable_exception& e) {
@@ -209,7 +212,7 @@ bool standard_role_manager::legacy_metadata_exists() {

 future<> standard_role_manager::migrate_legacy_metadata() {
    log.info("Starting migration of legacy user metadata.");
-    static const sstring query = format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);
+    static const sstring query = seastar::format("SELECT * FROM {}.{}", meta::legacy::AUTH_KS, legacy_table_name);

    return _qp.execute_internal(
            query,
@@ -238,35 +241,39 @@ future<> standard_role_manager::migrate_legacy_metadata() {
 }

 future<> standard_role_manager::start() {
-    return once_among_shards([this] {
-        return futurize_invoke([this] () {
-            if (legacy_mode(_qp)) {
-                return create_legacy_metadata_tables_if_missing();
-            }
-            return make_ready_future<>();
-        }).then([this] {
-            _stopped = auth::do_after_system_ready(_as, [this] {
-                return seastar::async([this] {
-                    if (legacy_mode(_qp)) {
-                        _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as).get();
+    return once_among_shards([this] () -> future<> {
+        if (legacy_mode(_qp)) {
+            co_await create_legacy_metadata_tables_if_missing();
+        }

-                        if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get()) {
-                            if (legacy_metadata_exists()) {
-                                log.warn("Ignoring legacy user metadata since nondefault roles already exist.");
-                            }
+        auto handler = [this] () -> future<> {
+            const bool legacy = legacy_mode(_qp);
+            if (legacy) {
+                if (!_superuser_created_promise.available()) {
+                    _superuser_created_promise.set_value();
+                }
+                co_await _migration_manager.wait_for_schema_agreement(_qp.db().real_database(), db::timeout_clock::time_point::max(), &_as);

-                            return;
-                        }
-
-                        if (legacy_metadata_exists()) {
-                            migrate_legacy_metadata().get();
-                            return;
-                        }
+                if (co_await any_nondefault_role_row_satisfies(_qp, &has_can_login)) {
+                    if (legacy_metadata_exists()) {
+                        log.warn("Ignoring legacy user metadata since nondefault roles already exist.");
                    }
-                    create_default_role_if_missing().get();
-                });
-            });
-        });
+                    co_return;
+                }
+
+                if (legacy_metadata_exists()) {
+                    co_await migrate_legacy_metadata();
+                    co_return;
+                }
+            }
+            co_await create_default_role_if_missing();
+            if (!legacy) {
+                _superuser_created_promise.set_value();
+            }
+        };
+
+        _stopped = auth::do_after_system_ready(_as, handler);
+        co_return;
    });
 }

@@ -275,8 +282,13 @@ future<> standard_role_manager::stop() {
    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});;
 }

+future<> standard_role_manager::ensure_superuser_is_created() {
+    SCYLLA_ASSERT(this_shard_id() == 0);
+    return _superuser_created_promise.get_shared_future();
+}
+
 future<> standard_role_manager::create_or_replace(std::string_view role_name, const role_config& c, ::service::group0_batch& mc) {
-    const sstring query = format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, ?, ?)",
+    const sstring query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, ?, ?)",
            get_auth_ks_name(_qp),
            meta::roles_table::name,
            meta::roles_table::role_col_name);
@@ -323,7 +335,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
        if (!u.is_superuser && !u.can_login) {
            return make_ready_future<>();
        }
-        const sstring query = format("UPDATE {}.{} SET {} WHERE {} = ?",
+        const sstring query = seastar::format("UPDATE {}.{} SET {} WHERE {} = ?",
            get_auth_ks_name(_qp),
            meta::roles_table::name,
            build_column_assignments(u),
@@ -347,7 +359,7 @@ future<> standard_role_manager::drop(std::string_view role_name, ::service::grou
    }
    // First, revoke this role from all roles that are members of it.
    const auto revoke_from_members = [this, role_name, &mc] () -> future<> {
-        const sstring query = format("SELECT member FROM {}.{} WHERE role = ?",
+        const sstring query = seastar::format("SELECT member FROM {}.{} WHERE role = ?",
                get_auth_ks_name(_qp),
                meta::role_members_table::name);
        const auto members = co_await _qp.execute_internal(
@@ -379,7 +391,7 @@ future<> standard_role_manager::drop(std::string_view role_name, ::service::grou
    };
    // Delete all attributes for that role
    const auto remove_attributes_of = [this, role_name, &mc] () -> future<> {
-        const sstring query = format("DELETE FROM {}.{} WHERE role = ?",
+        const sstring query = seastar::format("DELETE FROM {}.{} WHERE role = ?",
                get_auth_ks_name(_qp),
                meta::role_attributes_table::name);
        if (legacy_mode(_qp)) {
@@ -391,7 +403,7 @@ future<> standard_role_manager::drop(std::string_view role_name, ::service::grou
    };
    // Finally, delete the role itself.
    const auto delete_role = [this, role_name, &mc] () -> future<> {
-        const sstring query = format("DELETE FROM {}.{} WHERE {} = ?",
+        const sstring query = seastar::format("DELETE FROM {}.{} WHERE {} = ?",
                get_auth_ks_name(_qp),
                meta::roles_table::name,
                meta::roles_table::role_col_name);
@@ -418,7 +430,7 @@ standard_role_manager::legacy_modify_membership(
        std::string_view role_name,
        membership_change ch) {
    const auto modify_roles = [this, role_name, grantee_name, ch] () -> future<> {
-        const auto query = format(
+        const auto query = seastar::format(
                "UPDATE {}.{} SET member_of = member_of {} ? WHERE {} = ?",
                get_auth_ks_name(_qp),
                meta::roles_table::name,
@@ -435,7 +447,7 @@ standard_role_manager::legacy_modify_membership(
    const auto modify_role_members = [this, role_name, grantee_name, ch] () -> future<> {
        switch (ch) {
            case membership_change::add: {
-                const sstring insert_query = format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
+                const sstring insert_query = seastar::format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
                        get_auth_ks_name(_qp),
                        meta::role_members_table::name);
                co_return co_await _qp.execute_internal(
@@ -447,7 +459,7 @@ standard_role_manager::legacy_modify_membership(
            }

            case membership_change::remove: {
-                const sstring delete_query = format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
+                const sstring delete_query = seastar::format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
                        get_auth_ks_name(_qp),
                        meta::role_members_table::name);
                co_return co_await _qp.execute_internal(
@@ -473,7 +485,7 @@ standard_role_manager::modify_membership(
        co_return co_await legacy_modify_membership(grantee_name, role_name, ch);
    }

-    const auto modify_roles = format(
+    const auto modify_roles = seastar::format(
            "UPDATE {}.{} SET member_of = member_of {} ? WHERE {} = ?",
            get_auth_ks_name(_qp),
            meta::roles_table::name,
@@ -485,12 +497,12 @@ standard_role_manager::modify_membership(
    sstring modify_role_members;
    switch (ch) {
    case membership_change::add:
-        modify_role_members = format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
+        modify_role_members = seastar::format("INSERT INTO {}.{} (role, member) VALUES (?, ?)",
                get_auth_ks_name(_qp),
                meta::role_members_table::name);
        break;
    case membership_change::remove:
-        modify_role_members = format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
+        modify_role_members = seastar::format("DELETE FROM {}.{} WHERE role = ? AND member = ?",
                get_auth_ks_name(_qp),
                meta::role_members_table::name);
        break;
@@ -583,8 +595,22 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
    });
 }

+future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted() {
+    const sstring query = seastar::format("SELECT * FROM {}.{}",
+            get_auth_ks_name(_qp),
+            meta::role_members_table::name);
+
+    role_to_directly_granted_map roles_map;
+    co_await _qp.query_internal(query, [&roles_map] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
+        roles_map.insert({row.get_as<sstring>("member"), row.get_as<sstring>("role")});
+        co_return stop_iteration::no;
+    });
+
+    co_return roles_map;
+}
+
 future<role_set> standard_role_manager::query_all() {
-    const sstring query = format("SELECT {} FROM {}.{}",
+    const sstring query = seastar::format("SELECT {} FROM {}.{}",
            meta::roles_table::role_col_name,
            get_auth_ks_name(_qp),
            meta::roles_table::name);
@@ -628,7 +654,7 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
 }

 future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
-    const sstring query = format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
+    const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
            get_auth_ks_name(_qp),
            meta::role_attributes_table::name);
    const auto result_set = co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
@@ -659,7 +685,7 @@ future<> standard_role_manager::set_attribute(std::string_view role_name, std::s
    if (!co_await exists(role_name)) {
        throw auth::nonexistant_role(role_name);
    }
-    const sstring query = format("INSERT INTO {}.{} (role, name, value)  VALUES (?, ?, ?)",
+    const sstring query = seastar::format("INSERT INTO {}.{} (role, name, value)  VALUES (?, ?, ?)",
            get_auth_ks_name(_qp),
            meta::role_attributes_table::name);
    if (legacy_mode(_qp)) {
@@ -674,7 +700,7 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
    if (!co_await exists(role_name)) {
        throw auth::nonexistant_role(role_name);
    }
-    const sstring query = format("DELETE FROM {}.{} WHERE role = ? AND name = ?",
+    const sstring query = seastar::format("DELETE FROM {}.{} WHERE role = ? AND name = ?",
            get_auth_ks_name(_qp),
            meta::role_attributes_table::name);
    if (legacy_mode(_qp)) {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -37,6 +37,7 @@ class standard_role_manager final : public role_manager {
    future<> _stopped;
    abort_source _as;
    std::string _superuser;
+    shared_promise<> _superuser_created_promise;

 public:
    standard_role_manager(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
@@ -49,6 +50,8 @@ public:

    virtual future<> stop() override;

+    virtual future<> ensure_superuser_is_created() override;
+
    virtual future<> create(std::string_view role_name, const role_config&, ::service::group0_batch&) override;

    virtual future<> drop(std::string_view role_name, ::service::group0_batch& mc) override;
@@ -61,6 +64,8 @@ public:

    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;

+    virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
+
    virtual future<role_set> query_all() override;

    virtual future<bool> exists(std::string_view role_name) override;
--- a/bin/cqlsh
+++ b/bin/cqlsh
@@ -4,5 +4,5 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later

 here=$(dirname "$0")
-exec "$here/../tools/cqlsh/bin/cqlsh" "$@"
+exec "$here/../tools/cqlsh/bin/cqlsh.py" "$@"

--- a/bytes.cc
+++ b/bytes.cc
@@ -42,7 +42,7 @@ bytes from_hex(sstring_view s) {
        auto half_byte1 = hex_to_int(s[i * 2]);
        auto half_byte2 = hex_to_int(s[i * 2 + 1]);
        if (half_byte1 == -1 || half_byte2 == -1) {
-            throw std::invalid_argument(format("Non-hex characters in {}", s));
+            throw std::invalid_argument(fmt::format("Non-hex characters in {}", s));
        }
        out[i] = (half_byte1 << 4) | half_byte2;
    }
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -11,6 +11,7 @@
 #include <boost/range/iterator_range.hpp>

 #include "bytes.hh"
+#include "utils/assert.hh"
 #include "utils/managed_bytes.hh"
 #include <seastar/core/simple-stream.hh>
 #include <seastar/core/loop.hh>
@@ -269,7 +270,7 @@ public:

    // Call only when is_linearized()
    bytes_view view() const {
-        assert(is_linearized());
+        SCYLLA_ASSERT(is_linearized());
        if (!_current) {
            return bytes_view();
        }
--- a/cache_mutation_reader.hh
+++ b/cache_mutation_reader.hh
@@ -8,6 +8,7 @@

 #pragma once

+#include "utils/assert.hh"
 #include <vector>
 #include "row_cache.hh"
 #include "mutation/mutation_fragment.hh"
@@ -121,6 +122,9 @@ class cache_mutation_reader final : public mutation_reader::impl {
    gc_clock::time_point _read_time;
    gc_clock::time_point _gc_before;

+    api::timestamp_type _max_purgeable_timestamp = api::missing_timestamp;
+    api::timestamp_type _max_purgeable_timestamp_shadowable = api::missing_timestamp;
+
    future<> do_fill_buffer();
    future<> ensure_underlying();
    void copy_from_cache_to_buffer();
@@ -199,12 +203,17 @@ class cache_mutation_reader final : public mutation_reader::impl {
    gc_clock::time_point get_gc_before(const schema& schema, dht::decorated_key dk, const gc_clock::time_point query_time) {
        auto gc_state = _read_context.tombstone_gc_state();
        if (gc_state) {
-            return gc_state->get_gc_before_for_key(schema.shared_from_this(), dk, query_time);
+            return gc_state->with_commitlog_check_disabled().get_gc_before_for_key(schema.shared_from_this(), dk, query_time);
        }

        return gc_clock::time_point::min();
    }

+    bool can_gc(tombstone t, is_shadowable is) const {
+        const auto max_purgeable = is ? _max_purgeable_timestamp_shadowable : _max_purgeable_timestamp;
+        return t.timestamp < max_purgeable;
+    }
+
 public:
    cache_mutation_reader(schema_ptr s,
                               dht::decorated_key dk,
@@ -226,8 +235,19 @@ public:
        , _read_time(get_read_time())
        , _gc_before(get_gc_before(*_schema, dk, _read_time))
    {
-        clogger.trace("csm {}: table={}.{}, reversed={}, snap={}", fmt::ptr(this), _schema->ks_name(), _schema->cf_name(), _read_context.is_reversed(),
-                      fmt::ptr(&*_snp));
+        _max_purgeable_timestamp = ctx.get_max_purgeable(dk, is_shadowable::no);
+        _max_purgeable_timestamp_shadowable = ctx.get_max_purgeable(dk, is_shadowable::yes);
+
+        clogger.trace("csm {}: table={}.{}, dk={}, gc-before={}, max-purgeable-regular={}, max-purgeable-shadowable={}, reversed={}, snap={}",
+                fmt::ptr(this),
+                _schema->ks_name(),
+                _schema->cf_name(),
+                dk,
+                _gc_before,
+                _max_purgeable_timestamp,
+                _max_purgeable_timestamp_shadowable,
+                _read_context.is_reversed(),
+                fmt::ptr(&*_snp));
        push_mutation_fragment(*_schema, _permit, partition_start(std::move(dk), _snp->partition_tombstone()));
    }
    cache_mutation_reader(schema_ptr s,
@@ -283,7 +303,7 @@ future<> cache_mutation_reader::process_static_row() {
        return ensure_underlying().then([this] {
            return (*_underlying)().then([this] (mutation_fragment_v2_opt&& sr) {
                if (sr) {
-                    assert(sr->is_static_row());
+                    SCYLLA_ASSERT(sr->is_static_row());
                    maybe_add_to_cache(sr->as_static_row());
                    push_mutation_fragment(std::move(*sr));
                }
@@ -382,7 +402,7 @@ future<> cache_mutation_reader::do_fill_buffer() {
    if (_state == state::reading_from_underlying) {
        return read_from_underlying();
    }
-    // assert(_state == state::reading_from_cache)
+    // SCYLLA_ASSERT(_state == state::reading_from_cache)
    return _lsa_manager.run_in_read_section([this] {
        auto next_valid = _next_row.iterators_valid();
        clogger.trace("csm {}: reading_from_cache, range=[{}, {}), next={}, valid={}, rt={}", fmt::ptr(this), _lower_bound,
@@ -785,23 +805,24 @@ void cache_mutation_reader::copy_from_cache_to_buffer() {
            t.apply(range_tomb);

            auto row_tomb_expired = [&](row_tombstone tomb) {
-                return (tomb && tomb.max_deletion_time() < _gc_before);
+                return (tomb && tomb.max_deletion_time() < _gc_before && can_gc(tomb.tomb(), tomb.is_shadowable()));
            };

            auto is_row_dead = [&](const deletable_row& row) {
                auto& m = row.marker();
-                return (!m.is_missing() && m.is_dead(_read_time) && m.deletion_time() < _gc_before);
+                return (!m.is_missing() && m.is_dead(_read_time) && m.deletion_time() < _gc_before && can_gc(tombstone(m.timestamp(), m.deletion_time()), is_shadowable::no));
            };

            if (row_tomb_expired(t) || is_row_dead(row)) {
-                can_gc_fn always_gc = [&](tombstone) { return true; };
                const schema& row_schema = _next_row.latest_row_schema();

                _read_context.cache()._tracker.on_row_compacted();

+                auto mutation_can_gc = can_gc_fn([this] (tombstone t, is_shadowable is) { return can_gc(t, is); });
+
                with_allocator(_snp->region().allocator(), [&] {
                    deletable_row row_copy(row_schema, row);
-                    row_copy.compact_and_expire(row_schema, t.tomb(), _read_time, always_gc, _gc_before, nullptr);
+                    row_copy.compact_and_expire(row_schema, t.tomb(), _read_time, mutation_can_gc, _gc_before, nullptr);
                    std::swap(row, row_copy);
                });
                remove_row = row.empty();
@@ -990,7 +1011,7 @@ void cache_mutation_reader::offer_from_underlying(mutation_fragment_v2&& mf) {
        maybe_add_to_cache(mf.as_clustering_row());
        add_clustering_row_to_buffer(std::move(mf));
    } else {
-        assert(mf.is_range_tombstone_change());
+        SCYLLA_ASSERT(mf.is_range_tombstone_change());
        auto& chg = mf.as_range_tombstone_change();
        if (maybe_add_to_cache(chg)) {
            add_to_buffer(std::move(mf).as_range_tombstone_change());
--- a/cdc/cdc_partitioner.cc
+++ b/cdc/cdc_partitioner.cc
@@ -23,7 +23,7 @@ const sstring cdc_partitioner::name() const {
 }

 static dht::token to_token(int64_t value) {
-    return dht::token(dht::token::kind::key, value);
+    return dht::token(value);
 }

 static dht::token to_token(bytes_view key) {
--- a/cdc/change_visitor.hh
+++ b/cdc/change_visitor.hh
@@ -8,6 +8,7 @@

 #pragma once

+#include "utils/assert.hh"
 #include "mutation/mutation.hh"

 /*
@@ -246,7 +247,7 @@ void inspect_mutation(const mutation& m, V& v) {

        if (r.deleted_at()) {
            auto t = r.deleted_at().tomb();
-            assert(t.timestamp != api::missing_timestamp);
+            SCYLLA_ASSERT(t.timestamp != api::missing_timestamp);
            v.clustered_row_delete(cr.key(), t);
            if (v.finished()) {
                return;
@@ -255,7 +256,7 @@ void inspect_mutation(const mutation& m, V& v) {
    }

    for (auto& rt: p.row_tombstones()) {
-        assert(rt.tombstone().tomb.timestamp != api::missing_timestamp);
+        SCYLLA_ASSERT(rt.tombstone().tomb.timestamp != api::missing_timestamp);
        v.range_delete(rt.tombstone());
        if (v.finished()) {
            return;
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -26,6 +26,7 @@
 #include "gms/inet_address.hh"
 #include "gms/gossiper.hh"
 #include "gms/feature_service.hh"
+#include "utils/assert.hh"
 #include "utils/error_injection.hh"
 #include "utils/UUID_gen.hh"
 #include "utils/to_string.hh"
@@ -107,8 +108,8 @@ stream_id::stream_id(dht::token token, size_t vnode_index)
    copy_int_to_bytes(dht::token::to_int64(token), 0, _value);
    copy_int_to_bytes(low_qword, sizeof(int64_t), _value);
    // not a hot code path. make sure we did not mess up the shifts and masks.
-    assert(version() == version_1);
-    assert(index() == vnode_index);
+    SCYLLA_ASSERT(version() == version_1);
+    SCYLLA_ASSERT(index() == vnode_index);
 }

 stream_id::stream_id(bytes b)
@@ -126,7 +127,7 @@ bool stream_id::is_set() const {
 }

 static int64_t bytes_to_int64(bytes_view b, size_t offset) {
-    assert(b.size() >= offset + sizeof(int64_t));
+    SCYLLA_ASSERT(b.size() >= offset + sizeof(int64_t));
    int64_t res;
    std::copy_n(b.begin() + offset, sizeof(int64_t), reinterpret_cast<int8_t *>(&res));
    return net::ntoh(res);
@@ -363,6 +364,9 @@ cdc::topology_description make_new_generation_description(
        const noncopyable_function<std::pair<size_t, uint8_t>(dht::token)>& get_sharding_info,
        const locator::token_metadata_ptr tmptr) {
    const auto tokens = get_tokens(bootstrap_tokens, tmptr);
+    if (tokens.empty()) {
+        on_internal_error(cdc_log, "Attempted to create a CDC generation from an empty list of tokens");
+    }

    utils::chunked_vector<token_range_description> vnode_descriptions;
    vnode_descriptions.reserve(tokens.size());
@@ -411,7 +415,7 @@ future<cdc::generation_id> generation_service::legacy_make_new_generation(const

    // Our caller should ensure that there are normal tokens in the token ring.
    auto normal_token_owners = tmptr->count_normal_token_owners();
-    assert(normal_token_owners);
+    SCYLLA_ASSERT(normal_token_owners);

    if (_feature_service.cdc_generations_v2) {
        cdc_log.info("Inserting new generation data at UUID {}", uuid);
@@ -811,7 +815,7 @@ future<> generation_service::stop() {
 }

 generation_service::~generation_service() {
-    assert(_stopped);
+    SCYLLA_ASSERT(_stopped);
 }

 future<> generation_service::after_join(std::optional<cdc::generation_id>&& startup_gen_id) {
@@ -871,7 +875,7 @@ future<> generation_service::check_and_repair_cdc_streams() {
            return;
        }
        if (!_gossiper.is_normal(addr)) {
-            throw std::runtime_error(format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
+            throw std::runtime_error(fmt::format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
                    " ({} is in state {})", addr, _gossiper.get_gossip_status(state)));
        }

@@ -1110,7 +1114,9 @@ future<bool> generation_service::legacy_do_handle_cdc_generation(cdc::generation
    auto sys_dist_ks = get_sys_dist_ks();
    auto gen = co_await retrieve_generation_data(gen_id, _sys_ks.local(), *sys_dist_ks, { _token_metadata.get()->count_normal_token_owners() });
    if (!gen) {
-        throw std::runtime_error(format(
+        // This may happen during raft upgrade when a node gossips about a generation that
+        // was propagated through raft and we didn't apply it yet.
+        throw generation_handling_nonfatal_exception(fmt::format(
            "Could not find CDC generation {} in distributed system tables (current time: {}),"
            " even though some node gossiped about it.",
            gen_id, db_clock::now()));
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -121,7 +121,7 @@ public:
 class no_generation_data_exception : public std::runtime_error {
 public:
    no_generation_data_exception(cdc::generation_id generation_ts)
-        : std::runtime_error(format("could not find generation data for timestamp {}", generation_ts))
+        : std::runtime_error(fmt::format("could not find generation data for timestamp {}", generation_ts))
    {}
 };

--- a/cdc/generation_service.hh
+++ b/cdc/generation_service.hh
@@ -98,7 +98,7 @@ public:
     * that generation timestamp moved in as the `startup_gen_id` parameter.
     * This passes the responsibility of managing generations from the node startup code to this service;
     * until then, the service remains dormant.
-     * At the time of writing this comment, the startup code is in `storage_service::join_token_ring`, hence
+     * The startup code is in `storage_service::join_topology`, hence
     * `after_join` should be called at the end of that function.
     * Precondition: the node has completed bootstrapping and system_distributed_keyspace is initialized.
     * Must be called on shard 0 - that's where the generation management happens.
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -32,6 +32,7 @@
 #include "cql3/statements/select_statement.hh"
 #include "cql3/untyped_result_set.hh"
 #include "log.hh"
+#include "utils/assert.hh"
 #include "utils/rjson.hh"
 #include "utils/UUID_gen.hh"
 #include "utils/managed_bytes.hh"
@@ -65,8 +66,8 @@ void cdc::stats::parts_touched_stats::register_metrics(seastar::metrics::metric_
    namespace sm = seastar::metrics;
    auto register_part = [&] (part_type part, sstring part_name) {
        metrics.add_group(cdc_group_name, {
-                sm::make_total_operations(format("operations_on_{}_performed_{}", part_name, suffix), count[(size_t)part],
-                        sm::description(format("number of {} CDC operations that processed a {}", suffix, part_name)),
+                sm::make_total_operations(seastar::format("operations_on_{}_performed_{}", part_name, suffix), count[(size_t)part],
+                        sm::description(seastar::format("number of {} CDC operations that processed a {}", suffix, part_name)),
                        {})
            });
    };
@@ -148,7 +149,7 @@ public:
        _ctxt._migration_notifier.register_listener(this);
    }
    ~impl() {
-        assert(_stopped);
+        SCYLLA_ASSERT(_stopped);
    }

    future<> stop() {
@@ -455,7 +456,7 @@ schema_ptr get_base_table(const replica::database& db, sstring_view ks_name,std:
 }

 seastar::sstring base_name(std::string_view log_name) {
-    assert(is_log_name(log_name));
+    SCYLLA_ASSERT(is_log_name(log_name));
    return sstring(log_name.data(), log_name.size() - cdc_log_suffix.size());
 }

@@ -655,7 +656,7 @@ private:

 template<>
 void collection_iterator<std::pair<managed_bytes_view, managed_bytes_view>>::parse() {
-    assert(_rem > 0);
+    SCYLLA_ASSERT(_rem > 0);
    _next = _v;
    auto k = read_collection_key(_next);
    auto v = read_collection_value_nonnull(_next);
@@ -664,7 +665,7 @@ void collection_iterator<std::pair<managed_bytes_view, managed_bytes_view>>::par

 template<>
 void collection_iterator<managed_bytes_view>::parse() {
-    assert(_rem > 0);
+    SCYLLA_ASSERT(_rem > 0);
    _next = _v;
    auto k = read_collection_key(_next);
    _current = k;
@@ -672,7 +673,7 @@ void collection_iterator<managed_bytes_view>::parse() {

 template<>
 void collection_iterator<managed_bytes_view_opt>::parse() {
-    assert(_rem > 0);
+    SCYLLA_ASSERT(_rem > 0);
    _next = _v;
    auto k = read_collection_value_nonnull(_next);
    _current = k;
@@ -1065,7 +1066,7 @@ struct process_row_visitor {
    void update_row_state(const column_definition& cdef, managed_bytes_opt value) {
        if (!_row_state) {
            // static row always has a valid state, so this must be a clustering row missing
-            assert(_base_ck);
+            SCYLLA_ASSERT(_base_ck);
            auto [it, _] = _clustering_row_states.try_emplace(*_base_ck);
            _row_state = &it->second;
        }
@@ -1496,12 +1497,12 @@ public:
    }

    void generate_image(operation op, const clustering_key* ck, const one_kind_column_set* affected_columns) {
-        assert(op == operation::pre_image || op == operation::post_image);
+        SCYLLA_ASSERT(op == operation::pre_image || op == operation::post_image);

-        // assert that post_image is always full
-        assert(!(op == operation::post_image && affected_columns));
+        // SCYLLA_ASSERT that post_image is always full
+        SCYLLA_ASSERT(!(op == operation::post_image && affected_columns));

-        assert(_builder);
+        SCYLLA_ASSERT(_builder);

        const auto kind = ck ? column_kind::regular_column : column_kind::static_column;

@@ -1571,7 +1572,7 @@ public:
    // TODO: is pre-image data based on query enough. We only have actual column data. Do we need
    // more details like tombstones/ttl? Probably not but keep in mind.
    void process_change(const mutation& m) override {
-        assert(_builder);
+        SCYLLA_ASSERT(_builder);
        process_change_visitor v {
            ._touched_parts = _touched_parts,
            ._builder = *_builder,
@@ -1584,7 +1585,7 @@ public:
    }

    void end_record() override {
-        assert(_builder);
+        SCYLLA_ASSERT(_builder);
        _builder->end_record();
    }

--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -69,7 +69,7 @@ bool cdc::metadata::streams_available() const {
 cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
    auto now = api::new_timestamp();
    if (ts > now + get_generation_leeway().count()) {
-        throw exceptions::invalid_request_exception(format(
+        throw exceptions::invalid_request_exception(seastar::format(
                "cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
                " With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
                " know what streams will be used at that time.\n"
@@ -100,7 +100,7 @@ cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok)
        // the generation under `it` because that generation was operating at `now - generation_leeway`.
        bool is_previous_gen = it != _gens.end() && std::next(it) != _gens.end() && std::next(it)->first <= now;
        if (it == _gens.end() || ts < it->first || is_previous_gen) {
-            throw exceptions::invalid_request_exception(format(
+            throw exceptions::invalid_request_exception(seastar::format(
                    "cdc: attempted to get a stream \"from the past\" ({}; current server time: {})."
                    " With CDC you cannot send writes with timestamps too far into the past, because that would break"
                    " consistency properties.\n"
@@ -112,7 +112,7 @@ cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok)

    it = _gens.begin();
    if (it == _gens.end() || ts < it->first) {
-        throw std::runtime_error(format(
+        throw std::runtime_error(fmt::format(
                "cdc::metadata::get_stream: could not find any CDC stream for timestamp {}."
                " Are we in the middle of a cluster upgrade?", format_timestamp(ts)));
    }
@@ -129,7 +129,7 @@ cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok)
    // about the current generation in time. We won't be able to prevent it until we introduce transactions.

    if (!it->second) {
-        throw std::runtime_error(format(
+        throw std::runtime_error(fmt::format(
                "cdc: attempted to get a stream from a generation that we know about, but weren't able to retrieve"
                " (generation timestamp: {}, write timestamp: {}). Make sure that the replicas which contain"
                " this generation's data are alive and reachable from this node.", format_timestamp(it->first), format_timestamp(ts)));
@@ -186,7 +186,7 @@ bool cdc::metadata::prepare(db_clock::time_point tp) {
    }

    auto ts = to_ts(tp);
-    auto emplaced = _gens.emplace(to_ts(tp), std::nullopt).second;
+    auto [it, emplaced] = _gens.emplace(to_ts(tp), std::nullopt);

    if (_last_stream_timestamp != api::missing_timestamp) {
        auto last_correct_gen = gen_used_at(_last_stream_timestamp);
@@ -201,5 +201,5 @@ bool cdc::metadata::prepare(db_clock::time_point tp) {
        }
    }

-    return emplaced;
+    return !it->second;
 }
--- a/cell_locking.hh
+++ b/cell_locking.hh
@@ -8,13 +8,19 @@

 #pragma once

+#include <exception>
+
 #include <boost/intrusive/unordered_set.hpp>

+#include "utils/assert.hh"
 #include "utils/small_vector.hh"
 #include "mutation/mutation_partition.hh"
 #include "utils/xx_hasher.hh"

 #include "db/timeout_clock.hh"
+#include "log.hh"
+
+extern logging::logger cell_locker_log;

 class cells_range {
    using ids_vector_type = utils::small_vector<column_id, 5>;
@@ -229,14 +235,24 @@ private:
        static constexpr size_t compute_rehash_at_size(size_t bucket_count) {
            return bucket_count * max_load_factor::num / max_load_factor::den;
        }
+
+        // Try to rehash the set, if needed.
+        // On bad_alloc the function does not throw; it only logs a warning.
+        // Rehashing will be retried later, once the next rehash threshold is reached.
        void maybe_rehash() {
            if (_cell_count >= _rehash_at_size) {
                auto new_bucket_count = std::min(_cells.bucket_count() * 2, _cells.bucket_count() + 1024);
-                auto buckets = std::make_unique<cells_type::bucket_type[]>(new_bucket_count);
-
-                _cells.rehash(cells_type::bucket_traits(buckets.get(), new_bucket_count));
-                _buckets = std::move(buckets);
+                try {
+                    auto buckets = std::make_unique<cells_type::bucket_type[]>(new_bucket_count);

+                    _cells.rehash(cells_type::bucket_traits(buckets.get(), new_bucket_count));
+                    _buckets = std::move(buckets);
+                } catch (const std::bad_alloc&) {
+                    cell_locker_log.warn("Could not rehash cell_locker partition cells set: bucket_count={} new_bucket_count={}: {}", _cells.bucket_count(), new_bucket_count, std::current_exception());
+                }
+                // Attempt rehash at the new size in both success and failure paths.
+                // On failure, we don't want to retry too soon since it may take some time
+                // for memory to free up.
                _rehash_at_size = compute_rehash_at_size(new_bucket_count);
            }
        }
@@ -320,14 +336,24 @@ private:
    static constexpr size_t compute_rehash_at_size(size_t bucket_count) {
        return bucket_count * max_load_factor::num / max_load_factor::den;
    }
+
+    // Try to rehash the set, if needed.
+    // On bad_alloc the function does not throw; it only logs a warning.
+    // Rehashing will be retried later, once the next rehash threshold is reached.
    void maybe_rehash() {
        if (_partition_count >= _rehash_at_size) {
            auto new_bucket_count = std::min(_partitions.bucket_count() * 2, _partitions.bucket_count() + 64 * 1024);
-            auto buckets = std::make_unique<partitions_type::bucket_type[]>(new_bucket_count);
-
-            _partitions.rehash(partitions_type::bucket_traits(buckets.get(), new_bucket_count));
-            _buckets = std::move(buckets);
+            try {
+                auto buckets = std::make_unique<partitions_type::bucket_type[]>(new_bucket_count);

+                _partitions.rehash(partitions_type::bucket_traits(buckets.get(), new_bucket_count));
+                _buckets = std::move(buckets);
+            } catch (const std::bad_alloc&) {
+                cell_locker_log.warn("Could not rehash cell_locker partitions set: bucket_count={} new_bucket_count={}: {}", _partitions.bucket_count(), new_bucket_count, std::current_exception());
+            }
+            // Attempt rehash at the new size in both success and failure paths.
+            // On failure, we don't want to retry too soon since it may take some time
+            // for memory to free up.
            _rehash_at_size = compute_rehash_at_size(new_bucket_count);
        }
    }
@@ -342,7 +368,7 @@ public:
    { }

    ~cell_locker() {
-        assert(_partitions.empty());
+        SCYLLA_ASSERT(_partitions.empty());
    }

    void set_schema(schema_ptr s) {
--- a/clustering_interval_set.hh
+++ b/clustering_interval_set.hh
@@ -8,6 +8,7 @@

 #pragma once

+#include "utils/assert.hh"
 #include "schema/schema_fwd.hh"
 #include "mutation/position_in_partition.hh"
 #include <boost/icl/interval_set.hpp>
@@ -87,8 +88,8 @@ public:
        }
    };
    static interval::type make_interval(const schema& s, const position_range& r) {
-        assert(r.start().has_clustering_key());
-        assert(r.end().has_clustering_key());
+        SCYLLA_ASSERT(r.start().has_clustering_key());
+        SCYLLA_ASSERT(r.end().has_clustering_key());
        return interval::right_open(
            position_in_partition_with_schema(s.shared_from_this(), r.start()),
            position_in_partition_with_schema(s.shared_from_this(), r.end()));
--- a/clustering_key_filter.hh
+++ b/clustering_key_filter.hh
@@ -23,10 +23,6 @@ public:
    clustering_key_filter_ranges(clustering_row_ranges&& ranges)
        : _storage(std::make_move_iterator(ranges.begin()), std::make_move_iterator(ranges.end())), _ref(_storage) {}

-    struct reversed { };
-    clustering_key_filter_ranges(reversed, const clustering_row_ranges& ranges)
-        : _storage(ranges.rbegin(), ranges.rend()), _ref(_storage) { }
-
    clustering_key_filter_ranges(clustering_key_filter_ranges&& other) noexcept
        : _storage(std::move(other._storage))
        , _ref(&other._ref.get() == &other._storage ? _storage : other._ref.get())
@@ -47,21 +43,9 @@ public:
    const clustering_row_ranges& ranges() const { return _ref; }

    // Returns all clustering ranges determined by `slice` inside partition determined by `key`.
-    // If the slice contains the `reversed` option, we assume that it is given in 'half-reversed' format
-    // (i.e. the ranges within are given in reverse order, but the ranges themselves are not reversed)
-    // with respect to the table order.
-    // The ranges will be returned in forward (increasing) order even if the slice is reversed.
+    // The ranges will be returned in the same order as stored in the slice. For a reversed slice
+    // a reverse schema shall be provided.
    static clustering_key_filter_ranges get_ranges(const schema& schema, const query::partition_slice& slice, const partition_key& key) {
-        const query::clustering_row_ranges& ranges = slice.row_ranges(schema, key);
-        if (slice.is_reversed()) {
-            return clustering_key_filter_ranges(clustering_key_filter_ranges::reversed{}, ranges);
-        }
-        return clustering_key_filter_ranges(ranges);
-    }
-
-    // Returns all clustering ranges determined by `slice` inside partition determined by `key`.
-    // The ranges will be returned in the same order as stored in the slice.
-    static clustering_key_filter_ranges get_native_ranges(const schema& schema, const query::partition_slice& slice, const partition_key& key) {
        const query::clustering_row_ranges& ranges = slice.row_ranges(schema, key);
        return clustering_key_filter_ranges(ranges);
    }
--- a/clustering_ranges_walker.hh
+++ b/clustering_ranges_walker.hh
@@ -10,6 +10,7 @@

 #pragma once

+#include "utils/assert.hh"
 #include "schema/schema.hh"
 #include "query-request.hh"
 #include "mutation/mutation_fragment.hh"
@@ -249,7 +250,7 @@ public:
            auto range_end = position_in_partition_view::for_range_end(rng);
            if (!less(rt.position(), range_start) && !less(range_end, rt.end_position())) {
                // Fully enclosed by this range.
-                assert(!first);
+                SCYLLA_ASSERT(!first);
                return std::move(rt);
            }
            auto this_range_rt = rt;
--- a/cmake/mode.Dev.cmake
+++ b/cmake/mode.Dev.cmake
@@ -11,7 +11,8 @@ set(Seastar_DEFINITIONS_DEV
  SCYLLA_BUILD_MODE=${scylla_build_mode_Dev}
  DEVEL
  SEASTAR_ENABLE_ALLOC_FAILURE_INJECTION
-  SCYLLA_ENABLE_ERROR_INJECTION)
+  SCYLLA_ENABLE_ERROR_INJECTION
+  SCYLLA_ENABLE_PREEMPTION_SOURCE)
 foreach(definition ${Seastar_DEFINITIONS_DEV})
  add_compile_definitions(
    $<$<CONFIG:Dev>:${definition}>)
--- a/cmake/mode.RelWithDebInfo.cmake
+++ b/cmake/mode.RelWithDebInfo.cmake
@@ -16,10 +16,11 @@ set(scylla_build_mode_RelWithDebInfo "release")
 add_compile_definitions(
    $<$<CONFIG:RelWithDebInfo>:SCYLLA_BUILD_MODE=${scylla_build_mode_RelWithDebInfo}>)

-set(clang_inline_threshold 2500)
+set(Scylla_CLANG_INLINE_THRESHOLD "2500" CACHE STRING
+  "LLVM-specific inline threshold compilation parameter")
 add_compile_options(
  "$<$<AND:$<CONFIG:RelWithDebInfo>,$<CXX_COMPILER_ID:GNU>>:--param;inline-unit-growth=300>"
-  "$<$<AND:$<CONFIG:RelWithDebInfo>,$<CXX_COMPILER_ID:Clang>>:-mllvm;-inline-threshold=${clang_inline_threshold}>")
+  "$<$<AND:$<CONFIG:RelWithDebInfo>,$<CXX_COMPILER_ID:Clang>>:-mllvm;-inline-threshold=${Scylla_CLANG_INLINE_THRESHOLD}>")
 # clang generates 16-byte loads that break store-to-load forwarding
 # gcc also has some trouble: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103554
 check_cxx_compiler_flag("-fno-slp-vectorize" _slp_vectorize_supported)
--- a/cmake/mode.common.cmake
+++ b/cmake/mode.common.cmake
@@ -83,7 +83,7 @@ function(get_padded_dynamic_linker_option output length)
  set(${output} "${dynamic_linker_option}=${padded_dynamic_linker}" PARENT_SCOPE)
 endfunction()

-add_compile_options("-ffile-prefix-map=${CMAKE_SOURCE_DIR}=.")
+add_compile_options("-ffile-prefix-map=${CMAKE_BINARY_DIR}=.")

 default_target_arch(target_arch)
 if(target_arch)
--- a/collection_mutation.cc
+++ b/collection_mutation.cc
@@ -6,6 +6,7 @@
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

+#include "utils/assert.hh"
 #include "types/collection.hh"
 #include "types/user.hh"
 #include "concrete_types.hh"
@@ -166,15 +167,18 @@ collection_mutation_view_description::materialize(const abstract_type& type) con
    return m;
 }

-bool collection_mutation_description::compact_and_expire(column_id id, row_tombstone base_tomb, gc_clock::time_point query_time,
+compact_and_expire_result collection_mutation_description::compact_and_expire(column_id id, row_tombstone base_tomb, gc_clock::time_point query_time,
    can_gc_fn& can_gc, gc_clock::time_point gc_before, compaction_garbage_collector* collector)
 {
-    bool any_live = false;
+    compact_and_expire_result res{};
+    if (tomb) {
+        res.collection_tombstones++;
+    }
    auto t = tomb;
    tombstone purged_tomb;
    if (tomb <= base_tomb.regular()) {
        tomb = tombstone();
-    } else if (tomb.deletion_time < gc_before && can_gc(tomb)) {
+    } else if (tomb.deletion_time < gc_before && can_gc(tomb, is_shadowable::no)) { // The collection tombstone is never shadowable
        purged_tomb = tomb;
        tomb = tombstone();
    }
@@ -184,10 +188,12 @@ bool collection_mutation_description::compact_and_expire(column_id id, row_tombs
    for (auto&& name_and_cell : cells) {
        atomic_cell& cell = name_and_cell.second;
        auto cannot_erase_cell = [&] {
-            return cell.deletion_time() >= gc_before || !can_gc(tombstone(cell.timestamp(), cell.deletion_time()));
+            // Only row tombstones can be shadowable, (collection) cell tombstones aren't
+            return cell.deletion_time() >= gc_before || !can_gc(tombstone(cell.timestamp(), cell.deletion_time()), is_shadowable::no);
        };

        if (cell.is_covered_by(t, false) || cell.is_covered_by(base_tomb.shadowable().tomb(), false)) {
+            res.dead_cells++;
            continue;
        }
        if (cell.has_expired(query_time)) {
@@ -198,22 +204,24 @@ bool collection_mutation_description::compact_and_expire(column_id id, row_tombs
                losers.emplace_back(std::pair(
                        std::move(name_and_cell.first), atomic_cell::make_dead(cell.timestamp(), cell.deletion_time())));
            }
+            res.dead_cells++;
        } else if (!cell.is_live()) {
            if (cannot_erase_cell()) {
                survivors.emplace_back(std::move(name_and_cell));
            } else if (collector) {
                losers.emplace_back(std::move(name_and_cell));
            }
+            res.dead_cells++;
        } else {
-            any_live |= true;
            survivors.emplace_back(std::move(name_and_cell));
+            res.live_cells++;
        }
    }
    if (collector) {
        collector->collect(id, collection_mutation_description{purged_tomb, std::move(losers)});
    }
    cells = std::move(survivors);
-    return any_live;
+    return res;
 }

 template <typename Iterator>
@@ -391,7 +399,7 @@ deserialize_collection_mutation(collection_mutation_input_stream& in, F&& read_k
        ret.cells.push_back(read_kv(in));
    }

-    assert(in.empty());
+    SCYLLA_ASSERT(in.empty());
    return ret;
 }

--- a/collection_mutation.hh
+++ b/collection_mutation.hh
@@ -12,6 +12,8 @@
 #include "schema/schema_fwd.hh"
 #include "gc_clock.hh"
 #include "mutation/atomic_cell.hh"
+#include "mutation/compact_and_expire_result.hh"
+#include "compaction/compaction_garbage_collector.hh"
 #include <iosfwd>
 #include <forward_list>

@@ -34,7 +36,7 @@ struct collection_mutation_description {

    // Expires cells based on query_time. Expires tombstones based on max_purgeable and gc_before.
    // Removes cells covered by tomb or this->tomb.
-    bool compact_and_expire(column_id id, row_tombstone tomb, gc_clock::time_point query_time,
+    compact_and_expire_result compact_and_expire(column_id id, row_tombstone tomb, gc_clock::time_point query_time,
        can_gc_fn&, gc_clock::time_point gc_before, compaction_garbage_collector* collector = nullptr);

    // Packs the data to a serialized blob.
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -28,7 +28,9 @@
 #include <seastar/util/closeable.hh>
 #include <seastar/core/shared_ptr.hh>
 #include <seastar/core/shard_id.hh>
+#include <seastar/core/on_internal_error.hh>

+#include "compaction/compaction_garbage_collector.hh"
 #include "dht/i_partitioner.hh"
 #include "sstables/exceptions.hh"
 #include "sstables/sstables.hh"
@@ -46,12 +48,14 @@
 #include "mutation_writer/partition_based_splitting_writer.hh"
 #include "mutation/mutation_source_metadata.hh"
 #include "mutation/mutation_fragment_stream_validator.hh"
+#include "utils/assert.hh"
 #include "utils/error_injection.hh"
 #include "utils/pretty_printers.hh"
 #include "readers/multi_range.hh"
 #include "readers/compacting.hh"
 #include "tombstone_gc.hh"
 #include "replica/database.hh"
+#include "timestamp.hh"

 namespace sstables {

@@ -136,14 +140,38 @@ std::string_view to_string(compaction_type_options::scrub::quarantine_mode quara

 static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
        const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks,
-        const api::timestamp_type compacting_max_timestamp) {
+        const api::timestamp_type compacting_max_timestamp, const bool gc_check_only_compacting_sstables, const is_shadowable is_shadowable) {
    if (!table_s.tombstone_gc_enabled()) [[unlikely]] {
        return api::min_timestamp;
    }

    auto timestamp = api::max_timestamp;
-    auto memtable_min_timestamp = table_s.min_memtable_timestamp();
-    // Use memtable timestamp if it contains data older than the sstables being compacted,
+    if (gc_check_only_compacting_sstables) {
+        // If gc_check_only_compacting_sstables is enabled, do not
+        // check memtables and other sstables not being compacted.
+        return timestamp;
+    }
+
+    api::timestamp_type memtable_min_timestamp;
+    if (is_shadowable) {
+        // For shadowable tombstones, check the minimum live row_marker timestamp
+        // as rows with timestamp larger than the tombstone's would shadow the tombstone,
+        // exposing all live cells in the row with timestamps potentially lower than
+        // the shadowable tombstone (and those are tracked in the min_memtable_live_timestamp).
+        // In contrast, a shadowable tombstone applies to rows with row_marker whose timestamp
+        // is less than or equal to the tombstone's timestamp, the same way as a regular tombstone would.
+        // See https://github.com/scylladb/scylladb/issues/20424
+        memtable_min_timestamp = table_s.min_memtable_live_row_marker_timestamp();
+    } else {
+        // For regular tombstones, check the minimum live data timestamp.
+        // Even if purgeable tombstones shadow dead data in the memtable, it's ok to purge them;
+        // since "resurrecting" the already-dead data will have no effect, as it is already dead.
+        // See https://github.com/scylladb/scylladb/issues/20423
+        memtable_min_timestamp = table_s.min_memtable_live_timestamp();
+    }
+    clogger.trace("memtable_min_timestamp={} compacting_max_timestamp={} memtable_has_key={} is_shadowable={} min_memtable_live_timestamp={} min_memtable_live_row_marker_timestamp={}",
+            memtable_min_timestamp, compacting_max_timestamp, table_s.memtable_has_key(dk), is_shadowable, table_s.min_memtable_live_timestamp(), table_s.min_memtable_live_row_marker_timestamp());
+    // Use memtable timestamp if it contains live data older than the sstables being compacted,
    // and if the memtable also contains the key we're calculating max purgeable timestamp for.
    // First condition helps to not penalize the common scenario where memtable only contains
    // newer data.
@@ -155,9 +183,35 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
        if (compacting_set.contains(sst)) {
            continue;
        }
+        api::timestamp_type min_timestamp = sst->get_stats_metadata().min_timestamp;
+        auto ts_stats = sst->get_ext_timestamp_stats();
+        if (!ts_stats.empty()) {
+            auto stat = is_shadowable ?
+                    sstables::ext_timestamp_stats_type::min_live_row_marker_timestamp :
+                    sstables::ext_timestamp_stats_type::min_live_timestamp;
+            auto it = ts_stats.find(stat);
+            if (it != ts_stats.end()) {
+                min_timestamp = it->second;
+            } else {
+                // Do not throw an exception in production, just use the legacy min_timestamp set above
+                on_internal_error_noexcept(clogger, format("Missing extended timestamp statstics: stat={} is_shadowable={}", int(stat), bool(is_shadowable)));
+            }
+        }
+        if (clogger.is_enabled(log_level::trace)) {
+            if (!hk) {
+                hk = sstables::sstable::make_hashed_key(*table_s.schema(), dk.key());
+            }
+            clogger.trace("get_max_purgeable_timestamp={}: min_timestamp={} timestamp={} filter_has_key={} is_shadowable={} stats.min_timestamp={} min_live_timestamp={} min_live_row_marker_timestamp={}: sst={}",
+                    min_timestamp >= timestamp || !sst->filter_has_key(*hk) ? timestamp : min_timestamp,
+                    min_timestamp, timestamp, sst->filter_has_key(*hk), is_shadowable,
+                    sst->get_stats_metadata().min_timestamp,
+                    ts_stats[sstables::ext_timestamp_stats_type::min_live_timestamp],
+                    ts_stats[sstables::ext_timestamp_stats_type::min_live_row_marker_timestamp],
+                    sst->get_filename());
+        }
        // There's no point in looking up the key in the sstable filter if
        // it does not contain data older than the minimum timestamp.
-        if (sst->get_stats_metadata().min_timestamp >= timestamp) {
+        if (min_timestamp >= timestamp) {
            continue;
        }
        if (!hk) {
@@ -165,14 +219,15 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
        }
        if (sst->filter_has_key(*hk)) {
            bloom_filter_checks++;
-            timestamp = sst->get_stats_metadata().min_timestamp;
+            timestamp = min_timestamp;
        }
    }
    return timestamp;
 }

 static std::vector<shared_sstable> get_uncompacting_sstables(const table_state& table_s, std::vector<shared_sstable> sstables) {
-    auto all_sstables = boost::copy_range<std::vector<shared_sstable>>(*table_s.main_sstable_set().all());
+    auto sstable_set = table_s.sstable_set_for_tombstone_gc();
+    auto all_sstables = boost::copy_range<std::vector<shared_sstable>>(*sstable_set->all());
    auto& compacted_undeleted = table_s.compacted_undeleted_sstables();
    all_sstables.insert(all_sstables.end(), compacted_undeleted.begin(), compacted_undeleted.end());
    boost::sort(all_sstables, [] (const shared_sstable& x, const shared_sstable& y) {
@@ -283,7 +338,7 @@ private:

    utils::observer<> make_stop_request_observer(utils::observable<>& sro) {
        return sro.observe([this] () mutable {
-            assert(!_unclosed_partition);
+            SCYLLA_ASSERT(!_unclosed_partition);
            consume_end_of_stream();
        });
    }
@@ -476,6 +531,8 @@ protected:
    // Garbage collected sstables that were added to SSTable set and should be eventually removed from it.
    std::vector<shared_sstable> _used_garbage_collected_sstables;
    utils::observable<> _stop_request_observable;
+    // Tombstone GC state with the commitlog check disabled. Engaged only when GC has to check just the compacting sstables to collect tombstones.
+    std::optional<tombstone_gc_state> _tombstone_gc_state_with_commitlog_check_disabled;
 private:
    // Keeps track of monitors for input sstable.
    // If _update_backlog_tracker is set to true, monitors are responsible for adjusting backlog as compaction progresses.
@@ -524,6 +581,7 @@ protected:
        , _owned_ranges(std::move(descriptor.owned_ranges))
        , _sharder(descriptor.sharder)
        , _owned_ranges_checker(_owned_ranges ? std::optional<dht::incremental_owned_ranges_checker>(*_owned_ranges) : std::nullopt)
+        , _tombstone_gc_state_with_commitlog_check_disabled(descriptor.gc_check_only_compacting_sstables ? std::make_optional(_table_s.get_tombstone_gc_state().with_commitlog_check_disabled()) : std::nullopt)
        , _progress_monitor(progress_monitor)
    {
        std::unordered_set<run_id> ssts_run_ids;
@@ -722,6 +780,10 @@ private:
        return _table_s.get_compaction_strategy().make_sstable_set(_schema);
    }

+    const tombstone_gc_state& get_tombstone_gc_state() const {
+        return _tombstone_gc_state_with_commitlog_check_disabled ? _tombstone_gc_state_with_commitlog_check_disabled.value() : _table_s.get_tombstone_gc_state();
+    }
+
    future<> setup() {
        auto ssts = make_lw_shared<sstables::sstable_set>(make_sstable_set_for_input());
        auto fully_expired = _table_s.fully_expired_sstables(_sstables, gc_clock::now());
@@ -755,7 +817,7 @@ private:
            // for a better estimate for the number of partitions in the merged
            // sstable than just adding up the lengths of individual sstables.
            _estimated_partitions += sst->get_estimated_key_count();
-            sum_of_estimated_droppable_tombstone_ratio += sst->estimate_droppable_tombstone_ratio(gc_clock::now(), _table_s.get_tombstone_gc_state(), _schema);
+            sum_of_estimated_droppable_tombstone_ratio += sst->estimate_droppable_tombstone_ratio(gc_clock::now(), get_tombstone_gc_state(), _schema);
            _compacting_data_file_size += sst->ondisk_data_size();
            _compacting_max_timestamp = std::max(_compacting_max_timestamp, sst->get_stats_metadata().max_timestamp);
            if (sst->originated_on_this_node().value_or(false) && sst_stats.position.shard_id() == this_shard_id()) {
@@ -787,7 +849,7 @@ private:
                reader.consume_in_thread(std::move(cfc));
            });
        });
-        const auto& gc_state = _table_s.get_tombstone_gc_state();
+        const auto& gc_state = get_tombstone_gc_state();
        return consumer(make_compacting_reader(setup_sstable_reader(), compaction_time, max_purgeable_func(), gc_state));
    }

@@ -808,7 +870,7 @@ private:
                    using compact_mutations = compact_for_compaction_v2<compacted_fragments_writer, compacted_fragments_writer>;
                    auto cfc = compact_mutations(*schema(), now,
                        max_purgeable_func(),
-                        _table_s.get_tombstone_gc_state(),
+                        get_tombstone_gc_state(),
                        get_compacted_fragments_writer(),
                        get_gc_compacted_fragments_writer());

@@ -818,7 +880,7 @@ private:
                using compact_mutations = compact_for_compaction_v2<compacted_fragments_writer, noop_compacted_fragments_consumer>;
                auto cfc = compact_mutations(*schema(), now,
                    max_purgeable_func(),
-                    _table_s.get_tombstone_gc_state(),
+                    get_tombstone_gc_state(),
                    get_compacted_fragments_writer(),
                    noop_compacted_fragments_consumer());
                reader.consume_in_thread(std::move(cfc));
@@ -875,14 +937,12 @@ private:
    virtual std::string_view report_start_desc() const = 0;
    virtual std::string_view report_finish_desc() const = 0;

-    std::function<api::timestamp_type(const dht::decorated_key&)> max_purgeable_func() {
+    max_purgeable_fn max_purgeable_func() {
        if (!tombstone_expiration_enabled()) {
-            return [] (const dht::decorated_key& dk) {
-                return api::min_timestamp;
-            };
+            return can_never_purge;
        }
-        return [this] (const dht::decorated_key& dk) {
-            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks, _compacting_max_timestamp);
+        return [this] (const dht::decorated_key& dk, is_shadowable is_shadowable) {
+            return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks, _compacting_max_timestamp, _tombstone_gc_state_with_commitlog_check_disabled.has_value(), is_shadowable);
        };
    }

--- a/compaction/compaction_descriptor.hh
+++ b/compaction/compaction_descriptor.hh
@@ -171,6 +171,16 @@ struct compaction_descriptor {
    // Denotes if this compaction task is comprised solely of completely expired SSTables
    sstables::has_only_fully_expired has_only_fully_expired = has_only_fully_expired::no;

+    // If set to true, GC checks only the compacting sstables when collecting tombstones.
+    // If set to false, GC checks the memtables, the commit log and the other uncompacting
+    // sstables to decide whether a tombstone can be collected. Note that these checks are
+    // not perfect. For memtables and uncompacting sstables, if their minimum timestamp
+    // is lower than that of the tombstone and they contain the key, the tombstone is
+    // not collected; no row-level or cell-level check takes place. For the commit log,
+    // there is currently no way to check whether the key exists, so only the minimum
+    // timestamp comparison, as done for memtables, is performed.
+    bool gc_check_only_compacting_sstables = false;
+
    compaction_descriptor() = default;

    static constexpr int default_level = 0;
--- a/compaction/compaction_garbage_collector.hh
+++ b/compaction/compaction_garbage_collector.hh
@@ -8,7 +8,24 @@

 #pragma once

+#include <seastar/util/bool_class.hh>
+
+#include "mutation/tombstone.hh"
 #include "schema/schema_fwd.hh"
+#include "dht/i_partitioner_fwd.hh"
+
+using is_shadowable = bool_class<struct is_shadowable_tag>;
+
+// Determines whether a tombstone may be garbage-collected.
+using can_gc_fn = std::function<bool(tombstone, is_shadowable)>;
+
+extern can_gc_fn always_gc;
+extern can_gc_fn never_gc;
+
+using max_purgeable_fn = std::function<api::timestamp_type(const dht::decorated_key&, is_shadowable)>;
+
+extern max_purgeable_fn can_always_purge;
+extern max_purgeable_fn can_never_purge;

 class atomic_cell;
 class row_marker;
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -22,6 +22,7 @@
 #include <seastar/coroutine/maybe_yield.hh>
 #include "sstables/exceptions.hh"
 #include "sstables/sstable_directory.hh"
+#include "utils/assert.hh"
 #include "utils/error_injection.hh"
 #include "utils/UUID_gen.hh"
 #include "db/system_keyspace.hh"
@@ -187,7 +188,7 @@ unsigned compaction_manager::current_compaction_fan_in_threshold() const {
        return 0;
    }
    auto largest_fan_in = std::ranges::max(_tasks | boost::adaptors::transformed([] (auto& task) {
-        return task->compaction_running() ? task->compaction_data().compaction_fan_in : 0;
+        return task.compaction_running() ? task.compaction_data().compaction_fan_in : 0;
    }));
    // conservatively limit fan-in threshold to 32, such that tons of small sstables won't accumulate if
    // running major on a leveled table, which can even have more than one thousand files.
@@ -387,11 +388,26 @@ future<sstables::compaction_result> compaction_task_executor::compact_sstables_a

    co_return res;
 }
+
+future<sstables::sstable_set> compaction_task_executor::sstable_set_for_tombstone_gc(table_state& t) {
+    auto compound_set = t.sstable_set_for_tombstone_gc();
+    // The compound set is linearized into a single set, since compaction might add sstables to it
+    // or remove them from it for incremental compaction to work.
+    auto new_set = sstables::make_partitioned_sstable_set(t.schema(), false);
+    co_await compound_set->for_each_sstable_gently([&] (const sstables::shared_sstable& sst) {
+        auto inserted = new_set.insert(sst);
+        if (!inserted) {
+            on_internal_error(cmlog, format("Unable to insert SSTable {} into set used for tombstone GC", sst->get_filename()));
+        }
+    });
+    co_return std::move(new_set);
+}
+
 future<sstables::compaction_result> compaction_task_executor::compact_sstables(sstables::compaction_descriptor descriptor, sstables::compaction_data& cdata, on_replacement& on_replace, compaction_manager::can_purge_tombstones can_purge,
                                                                               sstables::offstrategy offstrategy) {
    table_state& t = *_compacting_table;
    if (can_purge) {
-        descriptor.enable_garbage_collection(t.main_sstable_set());
+        descriptor.enable_garbage_collection(co_await sstable_set_for_tombstone_gc(t));
    }
    descriptor.creator = [&t] (shard_id dummy) {
        auto sst = t.make_sstable();
@@ -503,9 +519,10 @@ public:
    major_compaction_task_executor(compaction_manager& mgr,
            throw_if_stopping do_throw_if_stopping,
            table_state* t,
-            tasks::task_id parent_id)
+            tasks::task_id parent_id,
+            bool consider_only_existing_data)
        : compaction_task_executor(mgr, do_throw_if_stopping, t, sstables::compaction_type::Compaction, "Major compaction")
-        , major_compaction_task_impl(mgr._task_manager_module, tasks::task_id::create_random_id(), 0, "compaction group", t->schema()->ks_name(), t->schema()->cf_name(), "", parent_id)
+        , major_compaction_task_impl(mgr._task_manager_module, tasks::task_id::create_random_id(), 0, "compaction group", t->schema()->ks_name(), t->schema()->cf_name(), "", parent_id, flush_mode::compacted_tables, consider_only_existing_data)
    {
        _status.progress_units = "bytes";
    }
@@ -540,6 +557,7 @@ protected:
        table_state* t = _compacting_table;
        sstables::compaction_strategy cs = t->get_compaction_strategy();
        sstables::compaction_descriptor descriptor = cs.get_major_compaction_job(*t, _cm.get_candidates(*t));
+        descriptor.gc_check_only_compacting_sstables = _consider_only_existing_data;
        auto compacting = compacting_sstable_registration(_cm, _cm.get_compaction_state(t), descriptor.sstables);
        auto on_replace = compacting.update_on_sstable_replacement();
        setup_new_compaction(descriptor.run_identifier);
@@ -575,22 +593,18 @@ requires std::is_base_of_v<compaction_task_executor, TaskExecutor> &&
 requires (compaction_manager& cm, throw_if_stopping do_throw_if_stopping, Args&&... args) {
    {TaskExecutor(cm, do_throw_if_stopping, std::forward<Args>(args)...)} -> std::same_as<TaskExecutor>;
 }
-future<compaction_manager::compaction_stats_opt> compaction_manager::perform_compaction(throw_if_stopping do_throw_if_stopping, std::optional<tasks::task_info> parent_info, Args&&... args) {
+future<compaction_manager::compaction_stats_opt> compaction_manager::perform_compaction(throw_if_stopping do_throw_if_stopping, tasks::task_info parent_info, Args&&... args) {
    auto task_executor = seastar::make_shared<TaskExecutor>(*this, do_throw_if_stopping, std::forward<Args>(args)...);
-    _tasks.push_back(task_executor);
-    auto unregister_task = defer([this, task_executor] {
-        _tasks.remove(task_executor);
+    _tasks.push_back(*task_executor);
+    auto unregister_task = defer([task_executor] {
+        task_executor->unlink();
        task_executor->switch_state(compaction_task_executor::state::none);
    });

-    if (parent_info) {
-        auto task = co_await get_task_manager_module().make_task(task_executor, parent_info.value());
-        task->start();
-        co_await task->done();
-        co_return task_executor->get_stats();
-    } else {
-        co_return co_await perform_task(std::move(task_executor), do_throw_if_stopping);
-    }
+    auto task = co_await get_task_manager_module().make_task(task_executor, parent_info);
+    task->start();
+    co_await task->done();
+    co_return task_executor->get_stats();
 }

 std::optional<gate::holder> compaction_manager::start_compaction(table_state& t) {
@@ -606,13 +620,13 @@ std::optional<gate::holder> compaction_manager::start_compaction(table_state& t)
    return it->second.gate.hold();
 }

-future<> compaction_manager::perform_major_compaction(table_state& t, std::optional<tasks::task_info> info) {
+future<> compaction_manager::perform_major_compaction(table_state& t, tasks::task_info info, bool consider_only_existing_data) {
    auto gh = start_compaction(t);
    if (!gh) {
        co_return;
    }

-    co_await perform_compaction<major_compaction_task_executor>(throw_if_stopping::no, info, &t, info.value_or(tasks::task_info{}).id).discard_result();
+    co_await perform_compaction<major_compaction_task_executor>(throw_if_stopping::no, info, &t, info.id, consider_only_existing_data).discard_result();
 }

 namespace compaction {
@@ -669,13 +683,13 @@ protected:

 }

-future<> compaction_manager::run_custom_job(table_state& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, std::optional<tasks::task_info> info, throw_if_stopping do_throw_if_stopping) {
+future<> compaction_manager::run_custom_job(table_state& t, sstables::compaction_type type, const char* desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, tasks::task_info info, throw_if_stopping do_throw_if_stopping) {
    auto gh = start_compaction(t);
    if (!gh) {
        co_return;
    }

-    co_return co_await perform_compaction<custom_compaction_task_executor>(do_throw_if_stopping, info, &t, info.value_or(tasks::task_info{}).id, type, desc, std::move(job)).discard_result();
+    co_return co_await perform_compaction<custom_compaction_task_executor>(do_throw_if_stopping, info, &t, info.id, type, desc, std::move(job)).discard_result();
 }

 future<> compaction_manager::update_static_shares(float static_shares) {
@@ -885,10 +899,10 @@ public:
    explicit strategy_control(compaction_manager& cm) noexcept : _cm(cm) {}

    bool has_ongoing_compaction(table_state& table_s) const noexcept override {
-        return std::any_of(_cm._tasks.begin(), _cm._tasks.end(), [&s = table_s.schema()] (const shared_ptr<compaction_task_executor>& task) {
-            return task->compaction_running()
-                && task->compacting_table()->schema()->ks_name() == s->ks_name()
-                && task->compacting_table()->schema()->cf_name() == s->cf_name();
+        return std::any_of(_cm._tasks.begin(), _cm._tasks.end(), [&s = table_s.schema()] (const compaction_task_executor& task) {
+            return task.compaction_running()
+                && task.compacting_table()->schema()->ks_name() == s->ks_name()
+                && task.compacting_table()->schema()->cf_name() == s->cf_name();
        });
    }

@@ -958,7 +972,7 @@ compaction_manager::compaction_manager(tasks::task_manager& tm)
 compaction_manager::~compaction_manager() {
    // Assert that compaction manager was explicitly stopped, if started.
    // Otherwise, fiber(s) will be alive after the object is stopped.
-    assert(_state == state::none || _state == state::stopped);
+    SCYLLA_ASSERT(_state == state::none || _state == state::stopped);
 }

 future<> compaction_manager::update_throughput(uint32_t value_mbs) {
@@ -998,7 +1012,7 @@ void compaction_manager::register_metrics() {
 }

 void compaction_manager::enable() {
-    assert(_state == state::none || _state == state::disabled);
+    SCYLLA_ASSERT(_state == state::none || _state == state::disabled);
    _state = state::enabled;
    _compaction_submission_timer.arm_periodic(periodic_compaction_submission_interval());
    _waiting_reevalution = postponed_compactions_reevaluation();
@@ -1052,7 +1066,7 @@ void compaction_manager::postpone_compaction_for_table(table_state* t) {
    _postponed.insert(t);
 }

-future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_executor>> tasks, sstring reason) {
+future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_executor>> tasks, sstring reason) noexcept {
    // To prevent compaction from being postponed while tasks are being stopped,
    // let's stop all tasks before the deferring point below.
    for (auto& t : tasks) {
@@ -1060,14 +1074,16 @@ future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_e
        t->stop_compaction(reason);
    }
    co_await coroutine::parallel_for_each(tasks, [] (auto& task) -> future<> {
+        auto unlink_task = deferred_action([task] { task->unlink(); });
        try {
            co_await task->compaction_done();
        } catch (sstables::compaction_stopped_exception&) {
            // swallow stop exception if a given procedure decides to propagate it to the caller,
            // as it happens with reshard and reshape.
        } catch (...) {
+            // Only log any other error instead of propagating it, as the callers have no use for it.
            cmlog.debug("Stopping {}: task returned error: {}", *task, std::current_exception());
-            throw;
+            co_return;
        }
        cmlog.debug("Stopping {}: done", *task);
    });
@@ -1076,9 +1092,12 @@ future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_e
 future<> compaction_manager::stop_ongoing_compactions(sstring reason, table_state* t, std::optional<sstables::compaction_type> type_opt) noexcept {
    try {
        auto ongoing_compactions = get_compactions(t).size();
-        auto tasks = boost::copy_range<std::vector<shared_ptr<compaction_task_executor>>>(_tasks | boost::adaptors::filtered([t, type_opt] (auto& task) {
-            return (!t || task->compacting_table() == t) && (!type_opt || task->compaction_type() == *type_opt);
-        }));
+        auto tasks = _tasks
+                | std::views::filter([t, type_opt] (const auto& task) {
+                    return (!t || task.compacting_table() == t) && (!type_opt || task.compaction_type() == *type_opt);
+                })
+                | std::views::transform([] (auto& task) { return task.shared_from_this(); })
+                | std::ranges::to<std::vector<shared_ptr<compaction_task_executor>>>();
        logging::log_level level = tasks.empty() ? log_level::debug : log_level::info;
        if (cmlog.is_enabled(level)) {
            std::string scope = "";
@@ -1092,15 +1111,19 @@ future<> compaction_manager::stop_ongoing_compactions(sstring reason, table_stat
        }
        return stop_tasks(std::move(tasks), std::move(reason));
    } catch (...) {
-        return current_exception_as_future<>();
+        cmlog.error("Stopping ongoing compactions failed: {}.  Ignored", std::current_exception());
    }
+    return make_ready_future();
 }

 future<> compaction_manager::drain() {
    cmlog.info("Asked to drain");
    if (*_early_abort_subscription) {
        _state = state::disabled;
+        _compaction_submission_timer.cancel();
        co_await stop_ongoing_compactions("drain");
+        // Signal the postponed_compactions_reevaluation() fiber so that it can exit properly.
+        reevaluate_postponed_compactions();
    }
    cmlog.info("Drained");
 }
@@ -1110,17 +1133,17 @@ future<> compaction_manager::stop() {
    if (auto cm = std::exchange(_task_manager_module, nullptr)) {
        co_await cm->stop();
    }
-    if (_state != state::none) {
-        co_return co_await std::move(*_stop_future);
+    if (_stop_future) {
+        co_await std::exchange(*_stop_future, make_ready_future());
    }
 }

-future<> compaction_manager::really_do_stop() {
+future<> compaction_manager::really_do_stop() noexcept {
    cmlog.info("Asked to stop");
    // Reset the metrics registry
    _metrics.clear();
    co_await stop_ongoing_compactions("shutdown");
-    co_await coroutine::parallel_for_each(_compaction_state | boost::adaptors::map_values, [] (compaction_state& cs) -> future<> {
+    co_await coroutine::parallel_for_each(_compaction_state | std::views::values, [] (compaction_state& cs) -> future<> {
        if (!cs.gate.is_closed()) {
            co_await cs.gate.close();
        }
@@ -1150,7 +1173,14 @@ void compaction_manager::do_stop() noexcept {
 }

 inline bool compaction_manager::can_proceed(table_state* t) const {
-    return (_state == state::enabled) && _compaction_state.contains(t) && !_compaction_state.at(t).compaction_disabled();
+    if (_state != state::enabled) {
+        return false;
+    }
+    auto found = _compaction_state.find(t);
+    if (found == _compaction_state.end()) {
+        return false;
+    }
+    return !found->second.compaction_disabled();
 }

 future<> compaction_task_executor::perform() {
@@ -1500,14 +1530,14 @@ protected:

 }

-future<bool> compaction_manager::perform_offstrategy(table_state& t, std::optional<tasks::task_info> info) {
+future<bool> compaction_manager::perform_offstrategy(table_state& t, tasks::task_info info) {
    auto gh = start_compaction(t);
    if (!gh) {
        co_return false;
    }

    bool performed;
-    co_await perform_compaction<offstrategy_compaction_task_executor>(throw_if_stopping::no, info, &t, info.value_or(tasks::task_info{}).id, performed);
+    co_await perform_compaction<offstrategy_compaction_task_executor>(throw_if_stopping::no, info, &t, info.id, performed);
    co_return performed;
 }

@@ -1553,11 +1583,16 @@ protected:
        co_return stats;
    }

-    virtual sstables::compaction_descriptor make_descriptor(const sstables::shared_sstable& sst) const {
+    static sstables::compaction_descriptor
+    make_descriptor(const sstables::shared_sstable& sst, const sstables::compaction_type_options& opt, owned_ranges_ptr owned_ranges = {}) {
        auto sstable_level = sst->get_sstable_level();
        auto run_identifier = sst->run_identifier();
        return sstables::compaction_descriptor({ sst },
-            sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, _options, _owned_ranges_ptr);
+            sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, opt, owned_ranges);
+    }
+
+    virtual sstables::compaction_descriptor make_descriptor(const sstables::shared_sstable& sst) const {
+        return make_descriptor(sst, _options, _owned_ranges_ptr);
    }

    virtual future<sstables::compaction_result> rewrite_sstable(const sstables::shared_sstable sst) {
@@ -1573,9 +1608,6 @@ protected:

            setup_new_compaction(descriptor.run_identifier);

-            compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_cm._compaction_controller.backlog_of_shares(200), _cm.available_memory()));
-            _cm.register_backlog_tracker(user_initiated);
-
            std::exception_ptr ex;
            try {
                sstables::compaction_result res = co_await compact_sstables_and_update_history(std::move(descriptor), _compaction_data, on_replace, _can_purge);
@@ -1610,19 +1642,30 @@ public:
                std::move(sstables), std::move(compacting), compaction_manager::can_purge_tombstones::yes)
            , _opt(options.as<sstables::compaction_type_options::split>())
    {
+        if (utils::get_local_injector().is_enabled("split_sstable_rewrite")) {
+            _do_throw_if_stopping = throw_if_stopping::yes;
+        }
+    }
+
+    static bool sstable_needs_split(const sstables::shared_sstable& sst, const sstables::compaction_type_options::split& opt) {
+        return opt.classifier(sst->get_first_decorated_key().token()) != opt.classifier(sst->get_last_decorated_key().token());
+    }
+
+    static sstables::compaction_descriptor
+    make_descriptor(const sstables::shared_sstable& sst, const sstables::compaction_type_options::split& split_opt) {
+        auto opt = sstables::compaction_type_options::make_split(split_opt.classifier);
+        return rewrite_sstables_compaction_task_executor::make_descriptor(sst, std::move(opt));
    }
 private:
    bool sstable_needs_split(const sstables::shared_sstable& sst) const {
-        return _opt.classifier(sst->get_first_decorated_key().token()) != _opt.classifier(sst->get_last_decorated_key().token());
+        return sstable_needs_split(sst, _opt);
    }
 protected:
    sstables::compaction_descriptor make_descriptor(const sstables::shared_sstable& sst) const override {
-        auto desc = rewrite_sstables_compaction_task_executor::make_descriptor(sst);
-        desc.options = sstables::compaction_type_options::make_split(_opt.classifier);
-        return desc;
+        return make_descriptor(sst, _opt);
    }

-    future<sstables::compaction_result> rewrite_sstable(const sstables::shared_sstable sst) override {
+    future<sstables::compaction_result> do_rewrite_sstable(const sstables::shared_sstable sst) {
        if (sstable_needs_split(sst)) {
            return rewrite_sstables_compaction_task_executor::rewrite_sstable(std::move(sst));
        }
@@ -1635,6 +1678,20 @@ protected:
            return sstables::compaction_result{};
        });
    }
+
+    future<sstables::compaction_result> rewrite_sstable(const sstables::shared_sstable sst) override {
+        co_await utils::get_local_injector().inject("split_sstable_rewrite", [this] (auto& handler) -> future<> {
+            cmlog.info("split_sstable_rewrite: waiting");
+            while (!handler.poll_for_message() && !_compaction_data.is_stop_requested()) {
+                co_await sleep(std::chrono::milliseconds(5));
+            }
+            cmlog.info("split_sstable_rewrite: released");
+            if (_compaction_data.is_stop_requested()) {
+                throw make_compaction_stopped_exception();
+            }
+        }, false);
+        co_return co_await do_rewrite_sstable(std::move(sst));
+    }
 };

 }
@@ -1642,7 +1699,7 @@ protected:
 template<typename TaskType, typename... Args>
 requires std::derived_from<TaskType, compaction_task_executor> &&
         std::derived_from<TaskType, compaction_task_impl>
-future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task_on_all_files(std::optional<tasks::task_info> info, table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args) {
+future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task_on_all_files(tasks::task_info info, table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args) {
    auto gh = start_compaction(t);
    if (!gh) {
        co_return std::nullopt;
@@ -1670,12 +1727,12 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_tas
    if (sstables.empty()) {
        co_return std::nullopt;
    }
-    co_return co_await perform_compaction<TaskType>(throw_if_stopping::no, info, &t, info.value_or(tasks::task_info{}).id, std::move(options), std::move(owned_ranges_ptr), std::move(sstables), std::move(compacting), std::forward<Args>(args)...);
+    co_return co_await perform_compaction<TaskType>(throw_if_stopping::no, info, &t, info.id, std::move(options), std::move(owned_ranges_ptr), std::move(sstables), std::move(compacting), std::forward<Args>(args)...);
 }

 future<compaction_manager::compaction_stats_opt>
 compaction_manager::rewrite_sstables(table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr,
-                                     get_candidates_func get_func, std::optional<tasks::task_info> info, can_purge_tombstones can_purge,
+                                     get_candidates_func get_func, tasks::task_info info, can_purge_tombstones can_purge,
                                     sstring options_desc) {
    return perform_task_on_all_files<rewrite_sstables_compaction_task_executor>(info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_func), can_purge, std::move(options_desc));
 }
@@ -1744,14 +1801,14 @@ static std::vector<sstables::shared_sstable> get_all_sstables(table_state& t) {
    return s;
 }

-future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sstable_scrub_validate_mode(table_state& t, std::optional<tasks::task_info> info) {
+future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sstable_scrub_validate_mode(table_state& t, tasks::task_info info) {
    auto gh = start_compaction(t);
    if (!gh) {
        co_return compaction_stats_opt{};
    }
    // All sstables must be included, even the ones being compacted, such that everything in table is validated.
    auto all_sstables = get_all_sstables(t);
-    co_return co_await perform_compaction<validate_sstables_compaction_task_executor>(throw_if_stopping::no, info, &t, info.value_or(tasks::task_info{}).id, std::move(all_sstables));
+    co_return co_await perform_compaction<validate_sstables_compaction_task_executor>(throw_if_stopping::no, info, &t, info.id, std::move(all_sstables));
 }

 namespace compaction {
@@ -1816,7 +1873,7 @@ protected:
    }
 private:
    future<> run_cleanup_job(sstables::compaction_descriptor descriptor) {
-        co_await coroutine::switch_to(_cm.compaction_sg());
+        co_await coroutine::switch_to(_cm.maintenance_sg());

        // Releases reference to cleaned files such that respective used disk space can be freed.
        using update_registration = compacting_sstable_registration::update_me;
@@ -1836,9 +1893,6 @@ private:
        };
        release_exhausted on_replace{_compacting, descriptor};
        for (;;) {
-            compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_cm._compaction_controller.backlog_of_shares(200), _cm.available_memory()));
-            _cm.register_backlog_tracker(user_initiated);
-
            std::exception_ptr ex;
            try {
                setup_new_compaction(descriptor.run_identifier);
@@ -1917,7 +1971,7 @@ const std::unordered_set<sstables::shared_sstable>& compaction_manager::sstables
    return cs.sstables_requiring_cleanup;
 }

-future<> compaction_manager::perform_cleanup(owned_ranges_ptr sorted_owned_ranges, table_state& t, std::optional<tasks::task_info> info) {
+future<> compaction_manager::perform_cleanup(owned_ranges_ptr sorted_owned_ranges, table_state& t, tasks::task_info info) {
    auto gh = start_compaction(t);
    if (!gh) {
        co_return;
@@ -1962,10 +2016,10 @@ future<> compaction_manager::perform_cleanup(owned_ranges_ptr sorted_owned_range
    }
 }

-future<> compaction_manager::try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, table_state& t, std::optional<tasks::task_info> info) {
+future<> compaction_manager::try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, table_state& t, tasks::task_info info) {
    auto check_for_cleanup = [this, &t] {
        return boost::algorithm::any_of(_tasks, [&t] (auto& task) {
-            return task->compacting_table() == &t && task->compaction_type() == sstables::compaction_type::Cleanup;
+            return task.compacting_table() == &t && task.compaction_type() == sstables::compaction_type::Cleanup;
        });
    };
    if (check_for_cleanup()) {
@@ -2019,7 +2073,7 @@ future<> compaction_manager::try_perform_cleanup(owned_ranges_ptr sorted_owned_r
 }

 // Submit a table to be upgraded and wait for its termination.
-future<> compaction_manager::perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, table_state& t, bool exclude_current_version, std::optional<tasks::task_info> info) {
+future<> compaction_manager::perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, table_state& t, bool exclude_current_version, tasks::task_info info) {
    auto get_sstables = [this, &t, exclude_current_version] {
        std::vector<sstables::shared_sstable> tables;

@@ -2046,7 +2100,7 @@ future<> compaction_manager::perform_sstable_upgrade(owned_ranges_ptr sorted_own
    return rewrite_sstables(t, sstables::compaction_type_options::make_upgrade(), std::move(sorted_owned_ranges), std::move(get_sstables), info).discard_result();
 }

-future<compaction_manager::compaction_stats_opt> compaction_manager::perform_split_compaction(table_state& t, sstables::compaction_type_options::split opt, std::optional<tasks::task_info> info) {
+future<compaction_manager::compaction_stats_opt> compaction_manager::perform_split_compaction(table_state& t, sstables::compaction_type_options::split opt, tasks::task_info info) {
    auto get_sstables = [this, &t] {
        return make_ready_future<std::vector<sstables::shared_sstable>>(get_candidates(t));
    };
@@ -2056,8 +2110,33 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
    return perform_task_on_all_files<split_compaction_task_executor>(info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables));
 }

+future<std::vector<sstables::shared_sstable>>
+compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, table_state& t, sstables::compaction_type_options::split opt) {
+    if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
+        co_return std::vector<sstables::shared_sstable>{sst};
+    }
+    std::vector<sstables::shared_sstable> ret;
+
+        // FIXME: indentation.
+        auto gate = get_compaction_state(&t).gate.hold();
+        sstables::compaction_progress_monitor monitor;
+        sstables::compaction_data info = create_compaction_data();
+        sstables::compaction_descriptor desc = split_compaction_task_executor::make_descriptor(sst, opt);
+        desc.creator = [&t] (shard_id _) {
+            return t.make_sstable();
+        };
+        desc.replacer = [&] (sstables::compaction_completion_desc d) {
+            std::move(d.new_sstables.begin(), d.new_sstables.end(), std::back_inserter(ret));
+        };
+
+        co_await sstables::compact_sstables(std::move(desc), info, t, monitor);
+        co_await sst->unlink();
+
+    co_return ret;
+}
+
 // Submit a table to be scrubbed and wait for its termination.
-future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sstable_scrub(table_state& t, sstables::compaction_type_options::scrub opts, std::optional<tasks::task_info> info) {
+future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sstable_scrub(table_state& t, sstables::compaction_type_options::scrub opts, tasks::task_info info) {
    auto scrub_mode = opts.operation_mode;
    if (scrub_mode == sstables::compaction_type_options::scrub::mode::validate) {
        return perform_sstable_scrub_validate_mode(t, info);
@@ -2097,7 +2176,7 @@ void compaction_manager::add(table_state& t) {
    }
 }

-future<> compaction_manager::remove(table_state& t) noexcept {
+future<> compaction_manager::remove(table_state& t, sstring reason) noexcept {
    auto& c_state = get_compaction_state(&t);
    auto erase_state = defer([&t, &c_state, this] () noexcept {
       c_state.backlog_tracker->disable();
@@ -2113,7 +2192,7 @@ future<> compaction_manager::remove(table_state& t) noexcept {
    // and prevent new tasks from entering the gate.
    if (!c_state.gate.is_closed()) {
        auto close_gate = c_state.gate.close();
-        co_await stop_ongoing_compactions("table removal", &t);
+        co_await stop_ongoing_compactions(reason, &t);
        co_await std::move(close_gate);
    }

@@ -2121,11 +2200,11 @@ future<> compaction_manager::remove(table_state& t) noexcept {
    auto found = false;
    sstring msg;
    for (auto& task : _tasks) {
-        if (task->compacting_table() == &t) {
+        if (task.compacting_table() == &t) {
            if (!msg.empty()) {
                msg += "\n";
            }
-            msg += format("Found {} after remove", *task.get());
+            msg += format("Found {} after remove", task);
            found = true;
        }
    }
@@ -2136,30 +2215,38 @@ future<> compaction_manager::remove(table_state& t) noexcept {
 }

 const std::vector<sstables::compaction_info> compaction_manager::get_compactions(table_state* t) const {
-    auto to_info = [] (const shared_ptr<compaction_task_executor>& task) {
+    auto to_info = [] (const compaction_task_executor& task) {
        sstables::compaction_info ret;
-        ret.compaction_uuid = task->compaction_data().compaction_uuid;
-        ret.type = task->compaction_type();
-        ret.ks_name = task->compacting_table()->schema()->ks_name();
-        ret.cf_name = task->compacting_table()->schema()->cf_name();
-        ret.total_partitions = task->compaction_data().total_partitions;
-        ret.total_keys_written = task->compaction_data().total_keys_written;
+        ret.compaction_uuid = task.compaction_data().compaction_uuid;
+        ret.type = task.compaction_type();
+        ret.ks_name = task.compacting_table()->schema()->ks_name();
+        ret.cf_name = task.compacting_table()->schema()->cf_name();
+        ret.total_partitions = task.compaction_data().total_partitions;
+        ret.total_keys_written = task.compaction_data().total_keys_written;
        return ret;
    };
    using ret = std::vector<sstables::compaction_info>;
-    return boost::copy_range<ret>(_tasks | boost::adaptors::filtered([t] (const shared_ptr<compaction_task_executor>& task) {
-                return (!t || task->compacting_table() == t) && task->compaction_running();
+    return boost::copy_range<ret>(_tasks | boost::adaptors::filtered([t] (const compaction_task_executor& task) {
+                return (!t || task.compacting_table() == t) && task.compaction_running();
            }) | boost::adaptors::transformed(to_info));
 }

 bool compaction_manager::has_table_ongoing_compaction(const table_state& t) const {
-    return std::any_of(_tasks.begin(), _tasks.end(), [&t] (const shared_ptr<compaction_task_executor>& task) {
-        return task->compacting_table() == &t && task->compaction_running();
+    return std::any_of(_tasks.begin(), _tasks.end(), [&t] (const compaction_task_executor& task) {
+        return task.compacting_table() == &t && task.compaction_running();
    });
 };

 bool compaction_manager::compaction_disabled(table_state& t) const {
-    return _compaction_state.contains(&t) && _compaction_state.at(&t).compaction_disabled();
+    if (auto it = _compaction_state.find(&t); it != _compaction_state.end()) {
+        return it->second.compaction_disabled();
+    } else {
+        cmlog.debug("compaction_disabled: {}:{} not in compaction_state", t.schema()->id(), t.get_group_id());
+        // Compaction is not strictly disabled, but it is not enabled either.
+        // The callers actually care about whether it's enabled or not, not about the actual state of
+        // compaction_state::compaction_disabled()
+        return true;
+    }
 }

 future<> compaction_manager::stop_compaction(sstring type, table_state* table) {
@@ -2184,8 +2271,8 @@ future<> compaction_manager::stop_compaction(sstring type, table_state* table) {
 void compaction_manager::propagate_replacement(table_state& t,
        const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added) {
    for (auto& task : _tasks) {
-        if (task->compacting_table() == &t && task->compaction_running()) {
-            task->compaction_data().pending_replacements.push_back({ removed, added });
+        if (task.compacting_table() == &t && task.compaction_running()) {
+            task.compaction_data().pending_replacements.push_back({ removed, added });
        }
    }
 }
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -85,6 +85,7 @@ public:
        size_t available_memory = 0;
        utils::updateable_value<float> static_shares = utils::updateable_value<float>(0);
        utils::updateable_value<uint32_t> throughput_mb_per_sec = utils::updateable_value<uint32_t>(0);
+        std::chrono::seconds flush_all_tables_before_major = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::days(1));
    };

 public:
@@ -93,8 +94,13 @@ public:

 private:
    shared_ptr<compaction::task_manager_module> _task_manager_module;
+
+    using compaction_task_executor_list_type = bi::list<
+            compaction_task_executor,
+            bi::base_hook<bi::list_base_hook<bi::link_mode<bi::auto_unlink>>>,
+            bi::constant_time_size<false>>;
    // compaction manager may have N fibers to allow parallel compaction per shard.
-    std::list<shared_ptr<compaction::compaction_task_executor>> _tasks;
+    compaction_task_executor_list_type _tasks;

    // Possible states in which the compaction manager can be found.
    //
@@ -170,17 +176,15 @@ private:
    // Return nullopt if compaction cannot be started
    std::optional<gate::holder> start_compaction(table_state& t);

-    // parent_info set to std::nullopt means that task manager should not register this task executor.
-    // To create a task manager task with no parent, parent_info argument should contain empty task_info.
    template<typename TaskExecutor, typename... Args>
    requires std::is_base_of_v<compaction_task_executor, TaskExecutor> &&
            std::is_base_of_v<compaction_task_impl, TaskExecutor> &&
    requires (compaction_manager& cm, throw_if_stopping do_throw_if_stopping, Args&&... args) {
        {TaskExecutor(cm, do_throw_if_stopping, std::forward<Args>(args)...)} -> std::same_as<TaskExecutor>;
    }
-    future<compaction_manager::compaction_stats_opt> perform_compaction(throw_if_stopping do_throw_if_stopping, std::optional<tasks::task_info> parent_info, Args&&... args);
+    future<compaction_manager::compaction_stats_opt> perform_compaction(throw_if_stopping do_throw_if_stopping, tasks::task_info parent_info, Args&&... args);

-    future<> stop_tasks(std::vector<shared_ptr<compaction::compaction_task_executor>> tasks, sstring reason);
+    future<> stop_tasks(std::vector<shared_ptr<compaction::compaction_task_executor>> tasks, sstring reason) noexcept;
    future<> update_throughput(uint32_t value_mbs);

    // Return the largest fan-in of currently running compactions
@@ -229,7 +233,7 @@ private:
    // similar-sized compaction.
    void postpone_compaction_for_table(compaction::table_state* t);

-    future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<compaction_stats_opt> perform_sstable_scrub_validate_mode(compaction::table_state& t, tasks::task_info info);
    future<> update_static_shares(float shares);

    using get_candidates_func = std::function<future<std::vector<sstables::shared_sstable>>()>;
@@ -239,14 +243,14 @@ private:
    template<typename TaskType, typename... Args>
    requires std::derived_from<TaskType, compaction_task_executor> &&
            std::derived_from<TaskType, compaction_task_impl>
-    future<compaction_manager::compaction_stats_opt> perform_task_on_all_files(std::optional<tasks::task_info> info, table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args);
+    future<compaction_manager::compaction_stats_opt> perform_task_on_all_files(tasks::task_info info, table_state& t, sstables::compaction_type_options options, owned_ranges_ptr owned_ranges_ptr, get_candidates_func get_func, Args... args);

-    future<compaction_stats_opt> rewrite_sstables(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, std::optional<tasks::task_info> info,
+    future<compaction_stats_opt> rewrite_sstables(compaction::table_state& t, sstables::compaction_type_options options, owned_ranges_ptr, get_candidates_func, tasks::task_info info,
                                                  can_purge_tombstones can_purge = can_purge_tombstones::yes, sstring options_desc = "");

    // Stop all fibers, without waiting. Safe to be called multiple times.
    void do_stop() noexcept;
-    future<> really_do_stop();
+    future<> really_do_stop() noexcept;

    // Propagate replacement of sstables to all ongoing compaction of a given table
    void propagate_replacement(compaction::table_state& t, const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added);
@@ -285,6 +289,10 @@ public:
        return _cfg.throughput_mb_per_sec.get();
    }

+    std::chrono::seconds flush_all_tables_before_major() const noexcept {
+        return _cfg.flush_all_tables_before_major;
+    }
+
    void register_metrics();

    // enable the compaction manager.
@@ -314,7 +322,7 @@ public:

    // Submit a table to be off-strategy compacted.
    // Returns true iff off-strategy compaction was required and performed.
-    future<bool> perform_offstrategy(compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<bool> perform_offstrategy(compaction::table_state& t, tasks::task_info info);

    // Submit a table to be cleaned up and wait for its termination.
    //
@@ -323,9 +331,9 @@ public:
    // Cleanup is about discarding keys that are no longer relevant for a
    // given sstable, e.g. after node loses part of its token range because
    // of a newly added node.
-    future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<> perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, tasks::task_info info);
 private:
-    future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, std::optional<tasks::task_info> info);
+    future<> try_perform_cleanup(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, tasks::task_info info);

    // Add sst to or remove it from the respective compaction_state.sstables_requiring_cleanup set.
    bool update_sstable_cleanup_state(table_state& t, const sstables::shared_sstable& sst, const dht::token_range_vector& sorted_owned_ranges);
@@ -333,19 +341,24 @@ private:
    future<> on_compaction_completion(table_state& t, sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy);
 public:
    // Submit a table to be upgraded and wait for its termination.
-    future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, bool exclude_current_version, std::optional<tasks::task_info> info = std::nullopt);
+    future<> perform_sstable_upgrade(owned_ranges_ptr sorted_owned_ranges, compaction::table_state& t, bool exclude_current_version, tasks::task_info info);

    // Submit a table to be scrubbed and wait for its termination.
-    future<compaction_stats_opt> perform_sstable_scrub(compaction::table_state& t, sstables::compaction_type_options::scrub opts, std::optional<tasks::task_info> info = std::nullopt);
+    future<compaction_stats_opt> perform_sstable_scrub(compaction::table_state& t, sstables::compaction_type_options::scrub opts, tasks::task_info info);

    // Submit a table for major compaction.
-    future<> perform_major_compaction(compaction::table_state& t, std::optional<tasks::task_info> info = std::nullopt);
+    future<> perform_major_compaction(compaction::table_state& t, tasks::task_info info, bool consider_only_existing_data = false);

    // Splits a compaction group by segregating all its sstable according to the classifier[1].
    // [1]: See sstables::compaction_type_options::splitting::classifier.
    // Returns when all sstables in the main sstable set are split. The only exception is shutdown
    // or user aborted splitting using stop API.
-    future<compaction_stats_opt> perform_split_compaction(compaction::table_state& t, sstables::compaction_type_options::split opt, std::optional<tasks::task_info> info = std::nullopt);
+    future<compaction_stats_opt> perform_split_compaction(compaction::table_state& t, sstables::compaction_type_options::split opt, tasks::task_info info);
+
+    // Splits a single SSTable by segregating all its data according to the classifier.
+    // If SSTable doesn't need split, the same input SSTable is returned as output.
+    // If SSTable needs split, then output SSTables are returned and the input SSTable is deleted.
+    future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, table_state& t, sstables::compaction_type_options::split opt);

    // Run a custom job for a given table, defined by a function
    // it completes when future returned by job is ready or returns immediately
@@ -354,7 +367,7 @@ public:
    // parameter type is the compaction type the operation can most closely be
    //      associated with, use compaction_type::Compaction, if none apply.
    // parameter job is a function that will carry the operation
-    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, std::optional<tasks::task_info> info, throw_if_stopping do_throw_if_stopping);
+    future<> run_custom_job(compaction::table_state& s, sstables::compaction_type type, const char *desc, noncopyable_function<future<>(sstables::compaction_data&, sstables::compaction_progress_monitor&)> job, tasks::task_info info, throw_if_stopping do_throw_if_stopping);

    class compaction_reenabler {
        compaction_manager& _cm;
@@ -394,7 +407,7 @@ public:

    // Remove a table from the compaction manager.
    // Cancel requests on table and wait for possible ongoing compactions.
-    future<> remove(compaction::table_state& t) noexcept;
+    future<> remove(compaction::table_state& t, sstring reason = "table removal") noexcept;

    const stats& get_stats() const {
        return _stats;
@@ -462,7 +475,9 @@ public:

 namespace compaction {

-class compaction_task_executor : public enable_shared_from_this<compaction_task_executor> {
+class compaction_task_executor
+    : public enable_shared_from_this<compaction_task_executor>
+    , public boost::intrusive::list_base_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>> {
 public:
    enum class state {
        none,       // initial and final state
@@ -586,6 +601,8 @@ private:
    future<compaction_manager::compaction_stats_opt> compaction_done() noexcept {
        return _compaction_done.get_future();
    }
+
+    future<sstables::sstable_set> sstable_set_for_tombstone_gc(::compaction::table_state& t);
 public:
    bool stopping() const noexcept {
        return _compaction_data.abort.abort_requested();
@@ -603,10 +620,10 @@ public:
    requires (compaction_manager& cm, throw_if_stopping do_throw_if_stopping, Args&&... args) {
        {TaskExecutor(cm, do_throw_if_stopping, std::forward<Args>(args)...)} -> std::same_as<TaskExecutor>;
    }
-    friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_compaction(throw_if_stopping do_throw_if_stopping, std::optional<tasks::task_info> parent_info, Args&&... args);
+    friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_compaction(throw_if_stopping do_throw_if_stopping, tasks::task_info parent_info, Args&&... args);
    friend future<compaction_manager::compaction_stats_opt> compaction_manager::perform_task(shared_ptr<compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);
    friend fmt::formatter<compaction_task_executor>;
-    friend future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_executor>> tasks, sstring reason);
+    friend future<> compaction_manager::stop_tasks(std::vector<shared_ptr<compaction_task_executor>> tasks, sstring reason) noexcept;
    friend sstables::test_env_compaction_manager;
 };

--- a/compaction/leveled_manifest.hh
+++ b/compaction/leveled_manifest.hh
@@ -10,6 +10,7 @@

 #pragma once

+#include "utils/assert.hh"
 #include "sstables/sstables.hh"
 #include "size_tiered_compaction_strategy.hh"
 #include "interval.hh"
@@ -311,7 +312,7 @@ public:

    template <typename T>
    static std::vector<sstables::shared_sstable> overlapping(const schema& s, const std::vector<sstables::shared_sstable>& candidates, const T& others) {
-        assert(!candidates.empty());
+        SCYLLA_ASSERT(!candidates.empty());
        /*
         * Picking each sstable from others that overlap one of the sstable of candidates is not enough
         * because you could have the following situation:
@@ -350,7 +351,7 @@ public:
     */
    template <typename T>
    static std::vector<sstables::shared_sstable> overlapping(const schema& s, dht::token start, dht::token end, const T& sstables) {
-        assert(start <= end);
+        SCYLLA_ASSERT(start <= end);

        std::vector<sstables::shared_sstable> overlapped;
        auto range = ::wrapping_interval<dht::token>::make(start, end);
@@ -459,7 +460,7 @@ private:
     * for prior failure), will return an empty list.  Never returns null.
     */
    candidates_info get_candidates_for(int level, const std::vector<std::optional<dht::decorated_key>>& last_compacted_keys) {
-        assert(!get_level(level).empty());
+        SCYLLA_ASSERT(!get_level(level).empty());

        logger.debug("Choosing candidates for L{}", level);

@@ -517,7 +518,7 @@ public:
            new_level = 0;
        } else {
            new_level = (minimum_level == maximum_level && can_promote) ? maximum_level + 1 : maximum_level;
-            assert(new_level > 0);
+            SCYLLA_ASSERT(new_level > 0);
        }
        return new_level;
    }
--- a/compaction/size_tiered_compaction_strategy.cc
+++ b/compaction/size_tiered_compaction_strategy.cc
@@ -6,6 +6,7 @@
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

+#include "utils/assert.hh"
 #include "sstables/sstables.hh"
 #include "size_tiered_compaction_strategy.hh"
 #include "cql3/statements/property_definitions.hh"
@@ -114,7 +115,7 @@ size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vect

    for(auto& sstable : sstables) {
        auto sstable_size = sstable->data_size();
-        assert(sstable_size != 0);
+        SCYLLA_ASSERT(sstable_size != 0);

        sstable_length_pairs.emplace_back(sstable, sstable_size);
    }
--- a/compaction/table_state.hh
+++ b/compaction/table_state.hh
@@ -39,6 +39,7 @@ public:
    virtual bool compaction_enforce_min_threshold() const noexcept = 0;
    virtual const sstables::sstable_set& main_sstable_set() const = 0;
    virtual const sstables::sstable_set& maintenance_sstable_set() const = 0;
+    virtual lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc() const = 0;
    virtual std::unordered_set<sstables::shared_sstable> fully_expired_sstables(const std::vector<sstables::shared_sstable>& sstables, gc_clock::time_point compaction_time) const = 0;
    virtual const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept = 0;
    virtual sstables::compaction_strategy& get_compaction_strategy() const noexcept = 0;
@@ -48,6 +49,8 @@ public:
    virtual sstables::shared_sstable make_sstable() const = 0;
    virtual sstables::sstable_writer_config configure_writer(sstring origin) const = 0;
    virtual api::timestamp_type min_memtable_timestamp() const = 0;
+    virtual api::timestamp_type min_memtable_live_timestamp() const = 0;
+    virtual api::timestamp_type min_memtable_live_row_marker_timestamp() const = 0;
    virtual bool memtable_has_key(const dht::decorated_key& key) const = 0;
    virtual future<> on_compaction_completion(sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) = 0;
    virtual bool is_auto_compaction_disabled_by_user() const noexcept = 0;
--- a/compaction/task_manager_module.cc
+++ b/compaction/task_manager_module.cc
@@ -16,7 +16,6 @@
 #include "sstables/sstable_directory.hh"
 #include "utils/error_injection.hh"
 #include "utils/pretty_printers.hh"
-#include "db/config.hh"

 using namespace std::chrono_literals;

@@ -131,7 +130,7 @@ distribute_reshard_jobs(sstables::sstable_directory::sstable_open_info_vector so
 // A creator function must be passed that will create an SSTable object in the correct shard,
 // and an I/O priority must be specified.
 future<> reshard(sstables::sstable_directory& dir, sstables::sstable_directory::sstable_open_info_vector shared_info, replica::table& table,
-                           sstables::compaction_sstable_creator_fn creator, compaction::owned_ranges_ptr owned_ranges_ptr, std::optional<tasks::task_info> parent_info)
+                           sstables::compaction_sstable_creator_fn creator, compaction::owned_ranges_ptr owned_ranges_ptr, tasks::task_info parent_info)
 {
    // Resharding doesn't like empty sstable sets, so bail early. There is nothing
    // to reshard in this shard.
@@ -330,24 +329,32 @@ tasks::is_abortable compaction_task_impl::is_abortable() const noexcept {
    return tasks::is_abortable{!_parent_id};
 }

-static future<bool> maybe_flush_all_tables(sharded<replica::database>& db) {
-    auto interval = db.local().get_config().compaction_flush_all_tables_before_major_seconds();
-    if (interval) {
-        auto when = db_clock::now() - interval * 1s;
-        if (co_await replica::database::get_all_tables_flushed_at(db) <= when) {
-            co_await db.invoke_on_all([&] (replica::database& db) -> future<> {
-                co_await db.flush_all_tables();
-            });
-            co_return true;
+static future<bool> maybe_flush_commitlog(sharded<replica::database>& db, bool force_flush) {
+    // flush commitlog if
+    // (a) force_flush == true (or)
+    // (b) flush_all_tables_before_major > 0s and the configured seconds have elapsed since last all tables flush
+    if (!force_flush) {
+        auto interval = db.local().get_compaction_manager().flush_all_tables_before_major();
+        if (interval <= 0s) {
+            co_return false;
+        }
+
+        auto when = db_clock::now() - interval;
+        if (co_await replica::database::get_all_tables_flushed_at(db) > when) {
+            co_return false;
        }
    }
-    co_return false;
+
+    co_await db.invoke_on_all([&] (replica::database& db) -> future<> {
+        co_await db.flush_commitlog();
+    });
+    co_return true;
 }

 future<> global_major_compaction_task_impl::run() {
    bool flushed_all_tables = false;
    if (_flush_mode == flush_mode::all_tables) {
-        flushed_all_tables = co_await maybe_flush_all_tables(_db);
+        flushed_all_tables = co_await maybe_flush_commitlog(_db, _consider_only_existing_data);
    }

    std::unordered_map<sstring, std::vector<table_info>> tables_by_keyspace;
@@ -364,7 +371,7 @@ future<> global_major_compaction_task_impl::run() {
    flush_mode fm = flushed_all_tables ? flush_mode::skip : _flush_mode;
    for (auto& [ks, table_infos] : tables_by_keyspace) {
        auto task = co_await _module->make_and_start_task<major_keyspace_compaction_task_impl>(parent_info, ks, parent_info.id, _db, table_infos, fm,
-                &cv, &current_task);
+                _consider_only_existing_data, &cv, &current_task);
        keyspace_tasks.emplace_back(std::move(task), ks, std::move(table_infos));
    }
    co_await run_keyspace_tasks(_db.local(), keyspace_tasks, cv, current_task, false);
@@ -380,14 +387,14 @@ future<> major_keyspace_compaction_task_impl::run() {

    bool flushed_all_tables = false;
    if (_flush_mode == flush_mode::all_tables) {
-        flushed_all_tables = co_await maybe_flush_all_tables(_db);
+        flushed_all_tables = co_await maybe_flush_commitlog(_db, _consider_only_existing_data);
    }

    flush_mode fm = flushed_all_tables ? flush_mode::skip : _flush_mode;
    co_await _db.invoke_on_all([&] (replica::database& db) -> future<> {
        tasks::task_info parent_info{_status.id, _status.shard};
        auto& module = db.get_compaction_manager().get_task_manager_module();
-        auto task = co_await module.make_and_start_task<shard_major_keyspace_compaction_task_impl>(parent_info, _status.keyspace, _status.id, db, _table_infos, fm);
+        auto task = co_await module.make_and_start_task<shard_major_keyspace_compaction_task_impl>(parent_info, _status.keyspace, _status.id, db, _table_infos, fm, _consider_only_existing_data);
        co_await task->done();
    });
 }
@@ -398,7 +405,7 @@ future<> shard_major_keyspace_compaction_task_impl::run() {
    tasks::task_info parent_info{_status.id, _status.shard};
    std::vector<table_tasks_info> table_tasks;
    for (auto& ti : _local_tables) {
-        table_tasks.emplace_back(co_await _module->make_and_start_task<table_major_keyspace_compaction_task_impl>(parent_info, _status.keyspace, ti.name, _status.id, _db, ti, cv, current_task, _flush_mode), ti);
+        table_tasks.emplace_back(co_await _module->make_and_start_task<table_major_keyspace_compaction_task_impl>(parent_info, _status.keyspace, ti.name, _status.id, _db, ti, cv, current_task, _flush_mode, _consider_only_existing_data), ti);
    }

    co_await run_table_tasks(_db, std::move(table_tasks), cv, current_task, true);
@@ -408,8 +415,8 @@ future<> table_major_keyspace_compaction_task_impl::run() {
    co_await wait_for_your_turn(_cv, _current_task, _status.id);
    tasks::task_info info{_status.id, _status.shard};
    replica::table::do_flush do_flush(_flush_mode != flush_mode::skip);
-    co_await run_on_table("force_keyspace_compaction", _db, _status.keyspace, _ti, [info, do_flush] (replica::table& t) {
-        return t.compact_all_sstables(info, do_flush);
+    co_await run_on_table("force_keyspace_compaction", _db, _status.keyspace, _ti, [info, do_flush, consider_only_existing_data = _consider_only_existing_data] (replica::table& t) {
+        return t.compact_all_sstables(info, do_flush, consider_only_existing_data);
    });
 }

@@ -467,7 +474,16 @@ future<> shard_cleanup_keyspace_compaction_task_impl::run() {

 future<> table_cleanup_keyspace_compaction_task_impl::run() {
    co_await wait_for_your_turn(_cv, _current_task, _status.id);
-    auto owned_ranges_ptr = compaction::make_owned_ranges_ptr(_db.get_keyspace_local_ranges(_status.keyspace));
+    // Note that we do not hold an effective_replication_map_ptr throughout
+    // the cleanup operation, so the topology might change.
+    // Since cleanup is an admin operation required for vnodes,
+    // it is the responsibility of the system operator to not
+    // perform additional incompatible range movements during cleanup.
+    auto get_owned_ranges = [&] (std::string_view ks_name) -> future<owned_ranges_ptr> {
+        const auto& erm = _db.find_keyspace(ks_name).get_vnode_effective_replication_map();
+        co_return compaction::make_owned_ranges_ptr(co_await _db.get_keyspace_local_ranges(erm));
+    };
+    auto owned_ranges_ptr = co_await get_owned_ranges(_status.keyspace);
    co_await run_on_table("force_keyspace_cleanup", _db, _status.keyspace, _ti, [&] (replica::table& t) {
        // skip the flush, as cleanup_keyspace_compaction_task_impl::run should have done this.
        return t.perform_cleanup_compaction(owned_ranges_ptr, tasks::task_info{_status.id, _status.shard}, replica::table::do_flush::no);
@@ -531,8 +547,15 @@ future<> shard_upgrade_sstables_compaction_task_impl::run() {

 future<> table_upgrade_sstables_compaction_task_impl::run() {
    co_await wait_for_your_turn(_cv, _current_task, _status.id);
-    auto owned_ranges = _db.maybe_get_keyspace_local_ranges(_status.keyspace);
-    auto owned_ranges_ptr = owned_ranges ? compaction::make_owned_ranges_ptr(std::move(owned_ranges.value())) : nullptr;
+    auto get_owned_ranges = [&] (std::string_view keyspace_name) -> future<owned_ranges_ptr> {
+        const auto& ks = _db.find_keyspace(keyspace_name);
+        if (ks.get_replication_strategy().is_per_table()) {
+            co_return nullptr;
+        }
+        const auto& erm = ks.get_vnode_effective_replication_map();
+        co_return compaction::make_owned_ranges_ptr(co_await _db.get_keyspace_local_ranges(erm));
+    };
+    auto owned_ranges_ptr = co_await get_owned_ranges(_status.keyspace);
    tasks::task_info info{_status.id, _status.shard};
    co_await run_on_table("upgrade_sstables", _db, _status.keyspace, _ti, [&] (replica::table& t) -> future<> {
        return t.parallel_foreach_table_state([&] (compaction::table_state& ts) -> future<> {
--- a/compaction/task_manager_module.hh
+++ b/compaction/task_manager_module.hh
@@ -62,9 +62,11 @@ public:
            std::string table,
            std::string entity,
            tasks::task_id parent_id,
-            flush_mode fm = flush_mode::compacted_tables) noexcept
+            flush_mode fm = flush_mode::compacted_tables,
+            bool consider_only_existing_data = false) noexcept
        : compaction_task_impl(module, id, sequence_number, std::move(scope), std::move(keyspace), std::move(table), std::move(entity), parent_id)
        , _flush_mode(fm)
+        , _consider_only_existing_data(consider_only_existing_data)
    {
        // FIXME: add progress units
    }
@@ -75,6 +77,7 @@ public:

 protected:
    flush_mode _flush_mode;
+    bool _consider_only_existing_data;

    virtual future<> run() override = 0;
 };
@@ -85,9 +88,10 @@ private:
 public:
    global_major_compaction_task_impl(tasks::task_manager::module_ptr module,
            sharded<replica::database>& db,
-            std::optional<flush_mode> fm = std::nullopt) noexcept
+            std::optional<flush_mode> fm = std::nullopt,
+            bool consider_only_existing_data = false) noexcept
        : major_compaction_task_impl(module, tasks::task_id::create_random_id(), module->new_sequence_number(), "global", "", "", "", tasks::task_id::create_null_id(),
-                fm.value_or(flush_mode::all_tables))
+                fm.value_or(flush_mode::all_tables), consider_only_existing_data)
        , _db(db)
    {}
 protected:
@@ -109,12 +113,13 @@ public:
            sharded<replica::database>& db,
            std::vector<table_info> table_infos,
            std::optional<flush_mode> fm = std::nullopt,
+            bool consider_only_existing_data = false,
            seastar::condition_variable* cv = nullptr,
            tasks::task_manager::task_ptr* current_task = nullptr) noexcept
        : major_compaction_task_impl(module, tasks::task_id::create_random_id(),
                parent_id ? 0 : module->new_sequence_number(),
                "keyspace", std::move(keyspace), "", "", parent_id,
-                fm.value_or(flush_mode::all_tables))
+                fm.value_or(flush_mode::all_tables), consider_only_existing_data)
        , _db(db)
        , _table_infos(std::move(table_infos))
        , _cv(cv)
@@ -134,8 +139,9 @@ public:
            tasks::task_id parent_id,
            replica::database& db,
            std::vector<table_info> local_tables,
-            flush_mode fm) noexcept
-        : major_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "shard", std::move(keyspace), "", "", parent_id, fm)
+            flush_mode fm,
+            bool consider_only_existing_data) noexcept
+        : major_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "shard", std::move(keyspace), "", "", parent_id, fm, consider_only_existing_data)
        , _db(db)
        , _local_tables(std::move(local_tables))
    {}
@@ -158,8 +164,9 @@ public:
            table_info ti,
            seastar::condition_variable& cv,
            tasks::task_manager::task_ptr& current_task,
-            flush_mode fm) noexcept
-        : major_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "table", std::move(keyspace), std::move(table), "", parent_id, fm)
+            flush_mode fm,
+            bool consider_only_existing_data) noexcept
+        : major_compaction_task_impl(module, tasks::task_id::create_random_id(), 0, "table", std::move(keyspace), std::move(table), "", parent_id, fm, consider_only_existing_data)
        , _db(db)
        , _ti(std::move(ti))
        , _cv(cv)
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -17,7 +17,8 @@
 #include <boost/range/algorithm/remove_if.hpp>
 #include <boost/range/algorithm/min_element.hpp>
 #include <boost/range/algorithm/partial_sort.hpp>
-#include <boost/range/adaptor/reversed.hpp>
+
+#include <ranges>

 namespace sstables {

@@ -295,7 +296,8 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
            // When trimming, let's keep sstables with overlapping time window, so as to reduce write amplification.
            // For example, if there are N sstables spanning window W, where N <= 32, then we can produce all data for W
            // in a single compaction round, removing the need to later compact W to reduce its number of files.
-            boost::partial_sort(multi_window, multi_window.begin() + max_sstables, [](const shared_sstable &a, const shared_sstable &b) {
+            auto sort_size = std::min(max_sstables, multi_window.size());
+            boost::partial_sort(multi_window, multi_window.begin() + sort_size, [](const shared_sstable &a, const shared_sstable &b) {
                return a->get_stats_metadata().max_timestamp < b->get_stats_metadata().max_timestamp;
            });
            maybe_trim_job(multi_window, job_size, disjoint);
@@ -309,10 +311,9 @@ time_window_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
    auto all_disjoint = !single_window.empty() && is_disjoint(single_window);
    auto all_buckets = get_buckets(single_window, _options);
    single_window.clear();
-    for (auto& pair : all_buckets.first) {
-        auto ssts = std::move(pair.second);
+    for (auto& [bucket, ssts] : all_buckets.first) {
        if (ssts.size() >= offstrategy_threshold) {
-            clogger.debug("time_window_compaction_strategy::get_reshaping_job: bucket={} bucket_size={}", pair.first, ssts.size());
+            clogger.debug("time_window_compaction_strategy::get_reshaping_job: bucket={} bucket_size={}", bucket, ssts.size());
            if (all_disjoint) {
                std::copy(ssts.begin(), ssts.end(), std::back_inserter(single_window));
                continue;
@@ -417,13 +418,13 @@ time_window_compaction_strategy::get_next_non_expired_sstables(table_state& tabl
 std::vector<shared_sstable>
 time_window_compaction_strategy::get_compaction_candidates(table_state& table_s, strategy_control& control, std::vector<shared_sstable> candidate_sstables) {
    auto& state = get_state(table_s);
-    auto p = get_buckets(std::move(candidate_sstables), _options);
+    auto [buckets, max_timestamp] = get_buckets(std::move(candidate_sstables), _options);
    // Update the highest window seen, if necessary
-    state.highest_window_seen = std::max(state.highest_window_seen, p.second);
+    state.highest_window_seen = std::max(state.highest_window_seen, max_timestamp);

-    update_estimated_compaction_by_tasks(state, p.first, table_s.min_compaction_threshold(), table_s.schema()->max_compaction_threshold());
+    update_estimated_compaction_by_tasks(state, buckets, table_s.min_compaction_threshold(), table_s.schema()->max_compaction_threshold());

-    return newest_bucket(table_s, control, std::move(p.first), table_s.min_compaction_threshold(), table_s.schema()->max_compaction_threshold(),
+    return newest_bucket(table_s, control, std::move(buckets), table_s.min_compaction_threshold(), table_s.schema()->max_compaction_threshold(),
        state.highest_window_seen);
 }

@@ -463,7 +464,7 @@ struct fmt::formatter<std::map<sstables::timestamp_type, std::vector<sstables::s
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    auto format(const std::map<sstables::timestamp_type, std::vector<sstables::shared_sstable>>& buckets, fmt::format_context& ctx) const {
        auto out = fmt::format_to(ctx.out(), "  buckets = {{\n");
-        for (auto& [timestamp, sstables] : buckets | boost::adaptors::reversed) {
+        for (auto& [timestamp, sstables] : buckets | std::views::reverse) {
            out = fmt::format_to(out, "    key={}, size={}\n", timestamp, sstables.size());
        }
        return fmt::format_to(out, "  }}\n");
@@ -478,10 +479,7 @@ time_window_compaction_strategy::newest_bucket(table_state& table_s, strategy_co
    auto& state = get_state(table_s);
    clogger.debug("time_window_compaction_strategy::newest_bucket:\n  now {}\n{}", now, buckets);

-    for (auto&& key_bucket : buckets | boost::adaptors::reversed) {
-        auto key = key_bucket.first;
-        auto& bucket = key_bucket.second;
-
+    for (auto&& [key, bucket] : buckets | std::views::reverse) {
        bool last_active_bucket = is_last_active_bucket(key, now);
        if (last_active_bucket) {
            state.recent_active_windows.insert(key);
@@ -536,10 +534,7 @@ void time_window_compaction_strategy::update_estimated_compaction_by_tasks(time_
    int64_t n = 0;
    timestamp_type now = state.highest_window_seen;

-    for (auto& task : tasks) {
-        const bucket_t& bucket = task.second;
-        timestamp_type bucket_key = task.first;
-
+    for (auto& [bucket_key, bucket] : tasks) {
        switch (compaction_mode(state, bucket, bucket_key, now, min_threshold)) {
        case bucket_compaction_mode::size_tiered:
            n += size_tiered_compaction_strategy::estimated_pending_compactions(bucket, min_threshold, max_threshold, _stcs_options);
--- a/compound.hh
+++ b/compound.hh
@@ -14,6 +14,7 @@
 #include <span>
 #include <boost/range/iterator_range.hpp>
 #include <boost/range/adaptor/transformed.hpp>
+#include "utils/assert.hh"
 #include "utils/serialization.hh"
 #include <seastar/util/backtrace.hh>

@@ -65,15 +66,15 @@ private:
        for (auto&& val : values) {
            using val_type = std::remove_cvref_t<decltype(val)>;
            if constexpr (FragmentedView<val_type>) {
-                assert(val.size_bytes() <= std::numeric_limits<size_type>::max());
+                SCYLLA_ASSERT(val.size_bytes() <= std::numeric_limits<size_type>::max());
                write<size_type>(out, size_type(val.size_bytes()));
                write_fragmented(out, val);
            } else if constexpr (std::same_as<val_type, managed_bytes>) {
-                assert(val.size() <= std::numeric_limits<size_type>::max());
+                SCYLLA_ASSERT(val.size() <= std::numeric_limits<size_type>::max());
                write<size_type>(out, size_type(val.size()));
                write_fragmented(out, managed_bytes_view(val));
            } else {
-                assert(val.size() <= std::numeric_limits<size_type>::max());
+                SCYLLA_ASSERT(val.size() <= std::numeric_limits<size_type>::max());
                write<size_type>(out, size_type(val.size()));
                write_fragmented(out, single_fragmented_view(val));
            }
@@ -135,7 +136,7 @@ public:
        partial.reserve(values.size());
        auto i = _types.begin();
        for (auto&& component : values) {
-            assert(i != _types.end());
+            SCYLLA_ASSERT(i != _types.end());
            partial.push_back((*i++)->decompose(component));
        }
        return serialize_value(partial);
@@ -256,7 +257,7 @@ public:
    }
    // Returns true iff given prefix has no missing components
    bool is_full(managed_bytes_view v) const {
-        assert(AllowPrefixes == allow_prefixes::yes);
+        SCYLLA_ASSERT(AllowPrefixes == allow_prefixes::yes);
        return std::distance(begin(v), end(v)) == (ssize_t)_types.size();
    }
    bool is_empty(managed_bytes_view v) const {
--- a/Show More
+++ b/Show More